Loading...
arm/string/memset_pattern_Swift.s Libc-825.24 /dev/null
--- Libc/Libc-825.24/arm/string/memset_pattern_Swift.s
+++ /dev/null
@@ -1,275 +0,0 @@
-/*
- * Copyright (c) 2011 Apple Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_LICENSE_HEADER_END@
- *
- *  This file implements the following functions for the Swift micro-arch:
- *
- *  void memset_pattern4(void *b, const void *pattern4, size_t len);
- *  void memset_pattern8(void *b, const void *pattern8, size_t len);
- *  void memset_pattern16(void *b, const void *pattern16, size_t len);
- *
- *  The implementation of all three functions is fundamentally the same.
- *  memset_pattern4 is extensively commented to explain the algorithm; refer
- *  to it if you have any questions about the other two.
- */
-
-#include <arm/arch.h>
-#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
-
-.syntax unified                         //  unified ARM/Thumb assembler syntax
-.code 32                                //  assemble as ARM (32-bit) instructions
-.text                                   //  code goes in the text section
-.globl _memset_pattern4$VARIANT$Swift   //  export the three entry points below
-.globl _memset_pattern8$VARIANT$Swift
-.globl _memset_pattern16$VARIANT$Swift
-
-/******************************************************************************/
-
-.align 4
-_memset_pattern4$VARIANT$Swift:
-    push    {r7,lr}                 //  save caller's r7 and the return address
-    mov     r7,         sp          //  r7 = frame pointer
-
-//  On entry r0 = b, r1 = pattern4, r2 = len.  Load the pattern, splat it to
-//  q0, and branch to the short-buffer path if the buffer is under 64 bytes.
-    ldr     r1,        [r1]         //  r1 = the 4-byte pattern value
-    vdup.32 q0,         r1          //  q0 = pattern in all four 32-bit lanes
-    subs    r3,     r2, #64         //  r3 = len - 64
-    blo     L_short4                //  taken when len < 64
-
-//  We want to use aligned vector stores to fill the bulk of the buffer.  In
-//  order to make that work, we need to rotate the pattern as necessary to
-//  match up with aligned locations, and we also need to extract the alignment
-//  of the destination pointer mod 16.
-    lsl     ip,     r0, #3          //  ip = 8*address
-    and     lr,     r0, #0xf    //  alignment of destination pointer mod 16
-    rsb     ip,     ip, #32     //  ip = 32 - 8*address, i.e. 32 - 8*(address%4) mod 32.
-
-//  Before we start the aligned stores, we do a single unaligned store of
-//  16 bytes of the pattern to the start of the buffer.  Since the buffer is
-//  at least 64 bytes long, this store is known to lie entirely inside the
-//  buffer:
-//                              first aligned address in buffer
-//                                                            v
-//   ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
-//  ... | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f | 0 | 1 | 2 | ...
-//   ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
-//        ^
-//        unaligned store starts here:
-//      [ 0   1   2   3   0   1   2   3   0   1   2   3   0   1   2   3 ]
-    vst1.8  {q0},      [r0]!        //  writeback: r0 = start of buffer + 16
-
-//  Subsequent stores will be aligned, and will start at the first aligned
-//  address in the buffer.  We apply the rotation that we calculated before
-//  the vector store (in the low five bits of ip) to get the pattern that
-//  is to be stored starting at the aligned location.  For example, in the
-//  picture above, the buffer had alignment of 3 mod 4, so the rotation to
-//  be applied is 32 - 8*3 = 8.  Rotating the pattern right by 8 bits gives
-//  us [ 1 2 3 0 ] (remember, we're little-endian), which we see is what
-//  needs to be stored starting at the first aligned location.
-//
-//  Besides rotating the pattern, we also need to adjust the length (by
-//  subtracting 16 - (address mod 16), the bytes in front of the first aligned
-//  location), and to advance the pointer to the first aligned location.
-    ror     r1,         ip      //  Pattern to use for aligned memory
-    add     r3,         lr          //  r3 = (len - 64) + address%16
-    bic     r0,         #0xf    //  destination for first aligned store
-    subs    r3,         #16     //  r3 = (bytes from r0 to end of buffer) - 64
-    blo     1f                      //  fewer than 64 bytes left: skip the main loop
-
-//  Splat the rotated value across q1 and q2
-    vdup.32 q1,         r1
-    vmov    q2,         q1
-
-//  Main store loop.  We write the splatted aligned pattern across 64 bytes
-//  per iteration, terminating the loop when the remaining length of the
-//  buffer is 64 bytes or less.
-0:  subs    r3,         #64         //  flags from here are consumed by the bhi below
-    vst1.32 {q1,q2}, [r0,:128]!     //  first 32 aligned bytes of this iteration
-    vst1.32 {q1,q2}, [r0,:128]!     //  second 32 aligned bytes
-    bhi     0b                      //  loop while r3 was above 64 before the subs
-
-//  The remaining length of the buffer is 64 bytes or less (but the total
-//  length of the buffer is at least 64 bytes; otherwise we would have
-//  branched to the "short" path).  Thus, we can handle the entirety of the
-//  remaining buffer with two 32-byte unaligned stores.
-//
-//  Again, we need to rotate the pattern to match the alignment, this time
-//  by 8*(length%4), and we also need to back up the destination pointer
-//  so that it points to precisely 64 bytes before the end of the buffer.
-//  We accomplish this by adding r3, which contains the remaining length of
-//  the buffer minus 64.
-1:  lsl     ip,     r3, #3          //  ip = 8*(remaining length - 64)
-    ror     r1,         ip          //  r1 = pattern as seen from (end - 64)
-    vdup.32 q8,         r1          //  splat it across q8 and q9
-    vmov    q9,         q8
-    add     r0,         r3          //  r0 = end of buffer - 64
-    vst1.32 {q8,q9},   [r0]!        //  final 64 bytes; may overlap earlier stores
-    vst1.32 {q8,q9},   [r0]
-    pop     {r7,pc}                 //  restore r7 and return
-
-L_short4:
-//  If we branch here, the buffer is less than 64 bytes long.  At this point,
-//  register contents are as follows:
-//
-//      r0      pointer to the buffer
-//      r1      pattern
-//      r2      buffer length
-//      q0      splatted pattern
-//
-//  To begin, we store eight bytes at a time until the remaining length is
-//  less than eight bytes.
-    subs    r3,     r2, #8          //  r3 = len - 8
-    blo     1f                      //  len < 8: go straight to the byte loop
-0:  subs    r3,         #8          //  at loop top r3 = remaining length - 8
-    vst1.32 {d0},      [r0]!        //  store 8 bytes (two copies of the pattern)
-    bhs     0b                      //  loop while at least 8 bytes still remain
-
-//  Then we store one byte at a time, rotating the pattern to get the next
-//  byte, until we reach the end of the buffer.
-    add     r2,     r3, #8          //  r2 = leftover byte count (0..7)
-1:  subs    r2,         #1          //  borrows only when no bytes were left
-    strbhs  r1,    [r0],#1          //  store low byte of pattern if one was left
-    ror     r1,         #8          //  next pattern byte moves to the low byte
-    bhi     1b                      //  still using the flags from the subs above
-    pop     {r7,pc}                 //  restore r7 and return
-
-/******************************************************************************/
-
-.align 4
-_memset_pattern8$VARIANT$Swift:
-//  void memset_pattern8(void *b, const void *pattern8, size_t len)
-//  On entry r0 = b, r1 = pattern8, r2 = len.  Same structure as memset_pattern4
-//  (above): one unaligned 16-byte store, a 64-byte-per-iteration aligned loop,
-//  then a final unaligned 64-byte store that ends at the end of the buffer.
-//  Rotated forms of the pattern are read from a stack scratch buffer.
-    push    {r7,lr}                 //  save caller's r7 and the return address
-    mov     r7,         sp          //  r7 = frame pointer; also used to restore sp
-    vld1.8  {d0},      [r1]         //  d0 = the 8-byte pattern
-    vmov    d1,         d0          //  q0 = pattern repeated twice
-    subs    r3,     r2, #64         //  r3 = len - 64
-    blo     L_short8                //  taken when len < 64
-
-    bic     sp,         #0xf    //  Align stack to 16 bytes and write 32 bytes
-    sub     sp,         #16     //  of pattern to the stack.  We will use
-    vst1.8  {q0},    [sp,:128]  //  unaligned loads from this scratch buffer
-    sub     sp,         #16     //  to get rotated forms of the pattern.
-    vst1.8  {q0},    [sp,:128]
-    and     ip,     r0, #0x7    //  Now generate an unaligned pointer to the
-    rsb     ip,     ip, #8      //  rotated pattern that we need to use for
-    add     ip,         sp      //  aligned stores in the main loop.
-    and     lr,     r0, #0xf        //  lr = address%16
-    vst1.8  {q0},      [r0]!        //  unaligned store of the first 16 bytes
-    add     r3,         lr          //  r3 = (len - 64) + address%16
-    bic     r0,         #0xf        //  r0 = first 16-byte-aligned address past start
-    subs    r3,         #16         //  r3 = (bytes from r0 to end of buffer) - 64
-    blo     1f                      //  fewer than 64 bytes left: skip the main loop
-    vld1.8  {q1},      [ip]         //  q1 = pattern as seen from an aligned address
-    vmov    q2,         q1
-0:  subs    r3,         #64         //  flags from here are consumed by the bhi below
-    vst1.32 {q1,q2}, [r0,:128]!     //  first 32 aligned bytes of this iteration
-    vst1.32 {q1,q2}, [r0,:128]!     //  second 32 aligned bytes
-    bhi     0b                      //  loop while r3 was above 64 before the subs
-1:  and     lr,     r3, #0x7    //  Generate an unaligned pointer to the
-    add     ip,         lr      //  rotated pattern to use for cleanup.
-    vld1.8  {q8},      [ip]         //  q8 = pattern as seen from (end - 64)
-    vmov    q9,         q8
-    add     r0,         r3          //  r0 = end of buffer - 64
-    vst1.32 {q8,q9},   [r0]!        //  final 64 bytes; may overlap earlier stores
-    vst1.32 {q8,q9},   [r0]
-    mov     sp,         r7      //  Restore stack pointer
-    pop     {r7,pc}                 //  restore r7 and return
-
-L_short8:
-    subs    r2,         #8          //  r2 = len - 8 (len < 64 on this path)
-    blo     1f                      //  len < 8: no 8-byte stores
-0:  subs    r2,         #8          //  at loop top r2 = remaining length - 8
-    vst1.32 {d0},      [r0]!        //  store one full copy of the pattern
-    bhs     0b                      //  loop while at least 8 bytes still remain
-1:  adds    r2,         #8          //  r2 = leftover byte count (0..7)
-    beq     3f                      //  nothing left over
-2:  vst1.8  {d0[0]},   [r0]!    //  Store one byte from NEON
-    vext.8  d0,     d0, d0, #1  //  Use VEXT to rotate pattern
-    subs    r2,         #1          //  one fewer byte to store
-    bhi     2b                      //  loop until r2 reaches zero
-3:  pop     {r7,pc}                 //  restore r7 and return
-
-/******************************************************************************/
-
-.align 4
-_memset_pattern16$VARIANT$Swift:
-//  void memset_pattern16(void *b, const void *pattern16, size_t len)
-//  On entry r0 = b, r1 = pattern16, r2 = len.  Same structure as
-//  memset_pattern4 (above): one unaligned 16-byte store, a 64-byte-per-iteration
-//  aligned loop, then a final unaligned 64-byte store that ends at the end of
-//  the buffer.  Rotated forms of the pattern are read from a stack scratch buffer.
-    push    {r7,lr}                 //  save caller's r7 and the return address
-    mov     r7,         sp          //  r7 = frame pointer; also used to restore sp
-    vld1.8  {q0},      [r1]         //  q0 = the 16-byte pattern
-    subs    r3,     r2, #64         //  r3 = len - 64
-    blo     L_short16               //  taken when len < 64
-
-    bic     sp,         #0xf    //  Align stack to 16 bytes and write 48 bytes
-    sub     sp,         #16     //  of pattern to the stack.  We will use
-    vst1.8  {q0},    [sp,:128]  //  unaligned loads from this scratch buffer
-    sub     sp,         #16     //  to get rotated forms of the pattern.
-    vst1.8  {q0},    [sp,:128]
-    sub     sp,         #16     //  third and last 16-byte copy
-    vst1.8  {q0},    [sp,:128]
-    and     lr,     r0, #0xf    //  Now generate an unaligned pointer to the
-    rsb     ip,     lr, #16     //  rotated pattern that we need to use for
-    add     ip,         sp      //  aligned stores in the main loop.
-    vst1.8  {q0},      [r0]!        //  unaligned store of the first 16 bytes
-    add     r3,         lr          //  r3 = (len - 64) + address%16
-    bic     r0,         #0xf        //  r0 = first 16-byte-aligned address past start
-    subs    r3,         #16         //  r3 = (bytes from r0 to end of buffer) - 64
-    blo     1f                      //  fewer than 64 bytes left: skip the main loop
-    vld1.8  {q1},      [ip]         //  q1 = pattern as seen from an aligned address
-    vmov    q2,         q1
-0:  subs    r3,         #64         //  flags from here are consumed by the bhi below
-    vst1.32 {q1,q2}, [r0,:128]!     //  first 32 aligned bytes of this iteration
-    vst1.32 {q1,q2}, [r0,:128]!     //  second 32 aligned bytes
-    bhi     0b                      //  loop while r3 was above 64 before the subs
-1:  and     lr,     r3, #0xf    //  Generate an unaligned pointer to the
-    add     ip,         lr      //  rotated pattern to use for cleanup.
-    vld1.8  {q8},      [ip]         //  q8 = pattern as seen from (end - 64)
-    vmov    q9,         q8
-    add     r0,         r3          //  r0 = end of buffer - 64
-    vst1.32 {q8,q9},   [r0]!        //  final 64 bytes; may overlap earlier stores
-    vst1.32 {q8,q9},   [r0]
-    mov     sp,         r7      //  Restore stack pointer
-    pop     {r7,pc}                 //  restore r7 and return
-
-L_short16:
-    subs    r2,         #16         //  r2 = len - 16 (len < 64 on this path)
-    blo     1f                      //  len < 16: no 16-byte stores
-0:  subs    r2,         #16         //  at loop top r2 = remaining length - 16
-    vst1.32 {q0},      [r0]!        //  store one full copy of the pattern
-    bhs     0b                      //  loop while at least 16 bytes still remain
-1:  adds    r2,         #16         //  r2 = leftover byte count (0..15)
-    beq     3f                      //  nothing left over
-2:  vst1.8  {d0[0]},   [r0]!    //  Store one byte from NEON
-    vext.8  q0,     q0, q0, #1  //  Use VEXT to rotate pattern
-    subs    r2,         #1          //  one fewer byte to store
-    bhi     2b                      //  loop until r2 reaches zero
-3:  pop     {r7,pc}                 //  restore r7 and return
-
-#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD