bcopy_CortexA8.s diff - arm/string/bcopy_CortexA8.s - Libc source code Libc-763.13

arm/string/bcopy_CortexA8.s Libc-763.13 ⇄ Libc-825.24
--- Libc/Libc-763.13/arm/string/bcopy_CortexA8.s
+++ Libc/Libc-825.24/arm/string/bcopy_CortexA8.s
@@ -21,3 +21,843 @@
  * @APPLE_LICENSE_HEADER_END@
  */
 
+#include <arm/arch.h>
+#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
+
+/*****************************************************************************
+ * Cortex-A8 implementation                                                  *
+ *****************************************************************************/
+ 
+// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
+//
+// Our tests have shown that NEON is always a performance win for memcpy( ).
+// However, for the specific case of copies from a warm source to a cold
+// destination when the buffer size is between 1k and 32k, it is not enough
+// of a performance win to offset the increased power footprint, resulting
+// in an energy usage regression.  Thus, we detect that particular case, and
+// pass those copies through the ARM core registers.  All other copies larger
+// than 8 bytes are handled on NEON.
+//
+// Stephen Canon, August 2009
+
+.text
+.code 16
+.syntax unified
+
+// void bcopy(const void * source,
+//            void * destination,
+//            size_t length);
+//
+// void *memmove(void * destination,
+//               const void * source,
+//               size_t n);
+//
+// void *memcpy(void * restrict destination,
+//              const void * restrict source,
+//              size_t n);
+//
+// All three copy n successive bytes from source to destination.  memmove and
+// memcpy return destination, whereas bcopy has no return value.  Copying takes
+// place as if through a temporary buffer -- after return, destination contains
+// exactly the bytes from source, even if the buffers overlap.
+
+.thumb_func _bcopy$VARIANT$CortexA8
+.thumb_func _memmove$VARIANT$CortexA8
+.thumb_func _memcpy$VARIANT$CortexA8
+.globl _bcopy$VARIANT$CortexA8
+.globl _memmove$VARIANT$CortexA8
+.globl _memcpy$VARIANT$CortexA8
+// Register lists for the ldm/stm loops in L_useSTMDB and L_useSTMIA:
+#define SAVE_REGISTERS {r4,r5,r6,r8,r10,r11}
+#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r11}
+// COPY = 8 words per ldm/stm; SAVE is pushed around the loop (r3/r9 assumed scratch -- TODO confirm).
+/*****************************************************************************
+ *  entry points                                                             *
+ *****************************************************************************/
+
+.align 2
+_bcopy$VARIANT$CortexA8:
+
+// bcopy takes (source, destination, length): its first two arguments are in
+// the opposite order from memmove and memcpy.  We swap r0 and r1 through r3
+// and then fall through into the shared memmove/memcpy entry point.
+
+    mov         r3,     r0      // r3 = source (r3 is recomputed at the memmove entry)
+    mov         r0,     r1      // r0 = destination, as memmove expects
+    mov         r1,     r3      // r1 = source
+
+.align 2
+_memmove$VARIANT$CortexA8:
+_memcpy$VARIANT$CortexA8:
+
+// At entry to memmove/memcpy, registers contain the following values:
+//
+//  r0  pointer to the first byte of the destination buffer
+//  r1  pointer to the first byte of the source buffer
+//  r2  number of bytes to copy
+//
+// Our preference is to use a (faster and easier to understand) front-to-back
+// copy of the buffer.  However, memmove requires that copies take place as
+// though through a temporary buffer.  This means that if the buffers overlap,
+// it may be necessary to copy the buffer in reverse order.
+//
+// To properly detect such overlap, we begin by computing the offset between
+// the source and destination pointers.  If the offset happens to be zero,
+// then there is no work to be done, so we can early out.
+
+    subs    r3,     r0, r1
+    it      eq
+    bxeq    lr
+
+// r3 now contains the offset between the buffers, (destination - source).  If
+// 0 < offset < length, then the high-addressed bytes of the source alias the
+// low-addressed bytes of the destination.  Thus, if we were to perform the
+// copy in ascending address order, we would overwrite the high-addressed
+// source bytes before we had a chance to copy them, and the data would be lost.
+//
+// Thus, we can use the front-to-back copy only if offset is negative or at
+// least as large as the length.  This is the case precisely if offset compares
+// unsigned higher than or the same as length (hence bhs rather than bhi).
+
+    cmp     r3,     r2
+    bhs     L_copyFrontToBack
+                             
+/*****************************************************************************
+ *  back to front copy                                                       *
+ *****************************************************************************/
+
+// Here we have fallen through into the back-to-front copy.  We preserve the
+// original destination pointer in r0 because it is the return value for the
+// routine, and update the other registers as follows:
+//
+//  r1  one byte beyond the end of the source buffer
+//  r2  number of bytes to copy
+//  ip  one byte beyond the end of the destination buffer
+
+    mov      ip,    r0
+    add      r1,    r2
+    add      ip,    r2
+    
+// Subtract 8 from the buffer length; if this is negative, then we will use
+// only single-byte copies, and we jump directly to a scalar copy loop.
+
+    subs     r2,        $8
+    blt      L_scalarReverseCopy
+    
+// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
+// to move the data.
+    
+    tst      ip,        $7
+    beq      L_vectorReverseCopy
+    
+// Otherwise, we copy a single byte at a time, in order of descending memory
+// address, until the destination is 8 byte aligned.  Within this loop,
+// registers are used as follows:
+//
+//  r0  original destination pointer
+//  r1  pointer to one byte past the next element to be copied
+//  r2  (bytes remaining to be copied) - 8
+//  r3  temporary to hold the byte that is being copied
+//  ip  pointer one byte past the destination of the next byte to be copied
+//
+//  byte that will be copied in this iteration
+//                            |  byte that was copied in the previous iteration
+//  Source buffer:            v   v
+//  ------------------------+---+---+-------------------------
+//  bytes still to copy ... |   |   | ... bytes already copied
+//  ------------------------+---+---+-------------------------
+//                                ^
+//                               r1 holds the address of this byte
+
+0:  ldrb     r3,   [r1, $-1]!
+    sub      r2,        $1      // no flag update; r2 may go negative (at most 7 bytes copied here)
+    strb     r3,   [ip, $-1]!
+    tst      ip,        $7      // loop until the destination is 8-byte aligned
+    bne      0b
+    
+// At this point, the destination pointer is 8 byte aligned.  Check again that
+// there are at least 8 bytes remaining to copy by comparing the remaining
+// length minus 8 to zero.  If fewer than 8 bytes remain, jump to the cleanup
+// path.
+    
+    cmp      r2,    $0
+    blt      L_scalarReverseCopy
+                                    
+/*****************************************************************************
+ *  destination is 8 byte aligned                                            *
+ *****************************************************************************/
+
+L_vectorReverseCopy:
+
+// At this point, registers contain the following values:
+//
+//  r0  original destination pointer
+//  r1  pointer to one byte past the next element to be copied
+//  r2  (bytes remaining to copy) - 8
+//  ip  pointer one byte past the destination of the next byte to be copied
+//
+// Furthermore, it is known that ip is 8 byte aligned and that r2 >= 0.  NEON
+// has really excellent alignment handling in hardware, so we would like to use
+// that to handle cases where the source is not similarly aligned to the
+// destination (it supports even single-byte misalignment at speed).  However,
+// on some SoC designs, not all of the DMA busses support such access.  Thus,
+// we must unfortunately use a software workaround in those cases.
+//
+// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
+// we only need to handle the different possible source alignments modulo 4.
+// Here we have a dispatch table to jump to the correct copy implementation
+// for the given source alignment.
+//
+// The tbh instruction loads a halfword from the table that immediately follows
+// it, doubles it, and adds it to the pc; each entry is therefore stored as
+// (target - table base)/2, and r3 = (source address mod 4) selects the entry.
+
+    ands     r3,    r1, $3
+    tbh     [pc, r3, lsl $1]
+0:  
+.short (L_reverseAligned0-0b)/2
+.short (L_reverseAligned1-0b)/2
+.short (L_reverseAligned2-0b)/2
+.short (L_reverseAligned3-0b)/2
+
+/*****************************************************************************
+ *  source is also at least word aligned                                     *
+ *****************************************************************************/
+    
+L_reverseAligned0:
+
+// Subtract 56 from r2, so that it contains the number of bytes remaining to
+// copy minus 64.  If this result is negative, then we jump into a loop that
+// copies 8 bytes at a time.
+
+    subs     r2,        $0x38
+    blt      L_reverseVectorCleanup
+    
+// Check if the destination pointer is 64-byte aligned (ip is already 8-byte
+// aligned, so only bits 3-5 need testing).  If so, use the cacheline loop.
+
+    tst      ip,        $0x38
+    beq      L_reverseCachelineAligned
+    
+// Otherwise, we copy 8 bytes at a time, in order of descending memory
+// address, until the destination is 64 byte aligned.  Within this loop,
+// registers are used as follows:
+//
+//  r0  original destination pointer
+//  r1  pointer to one byte past the next element to be copied
+//  r2  (bytes remaining to be copied) - 64
+//  ip  pointer one byte past the destination of the next byte to be copied
+//  d0  temporary storage for copy
+//
+//  bytes that will be copied after this iteration
+//        |         8 byte block that will be copied in this iteration
+//        v         v
+//  --------------+-------------------------------+---------------------
+//                | 0   1   2   3   4   5   6   7 | bytes already copied
+//  --------------+-------------------------------+---------------------
+//                                                  ^
+//                                                  r1 points here
+    
+0:  sub      r1,        $8
+    vld1.32 {d0},  [r1]
+    sub      ip,        $8
+    sub      r2,        $8      // no flag update; r2 may go negative, checked after the loop
+    tst      ip,        $0x38
+    vst1.64 {d0},  [ip,:64]     // the store leaves the tst flags intact for the bne
+    bne      0b
+    
+// At this point, the destination pointer is 64 byte aligned.  Check again that
+// there are at least 64 bytes remaining to copy by comparing the remaining
+// length minus 64 to zero.  If fewer than 64 bytes remain, skip over the main
+// copy loop.
+
+    cmp      r2,        $0
+    blt      L_reverseVectorCleanup
+    
+/*****************************************************************************
+ *  destination is cacheline aligned                                         *
+ *****************************************************************************/
+
+L_reverseCachelineAligned:
+
+// In the special case that between 1k and 32k bytes remain to be copied here,
+// we do not use a NEON copy for the main loop.  This is because if we happen
+// to be doing a copy from a source in cache to a destination that is not in
+// cache, this will result in an increase in energy usage.  In all other cases,
+// NEON gives superior energy conservation.
+
+    sub      r3,    r2, $0x3c0      // r3 = (remaining - 64) - 960 = remaining - 1024
+    cmp      r3,        $0x7c00     // unsigned test: lower only if 1024 <= remaining < 32768
+    blo      L_useSTMDB
+    
+// Pre-decrement the source (r1) and destination (ip) pointers so that they
+// point to the first byte of the trailing 32-byte window of each buffer.
+// Additionally, load the address increment of -32 into r3.
+
+    sub      r1,        $32
+    sub      ip,        $32
+    mov      r3,        $-32
+    
+// The destination pointer is known to be 64-byte aligned, so we can use the
+// maximal alignment hint (:256) for our vector stores.  Detect if the source
+// is also at least 32-byte aligned and jump to a loop that uses maximal
+// alignment hints for the loads as well if possible.
+    
+    tst      r1,        $0x1f
+    beq      L_reverseSourceAligned
+    
+// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
+// 64-byte aligned destination, in order of descending memory address.  Within
+// this loop, registers are used as follows:
+//
+//  r0      original destination pointer (unmodified)
+//  r1      pointer to the next 32-byte block to load
+//  r2      (number of bytes remaining to copy) - 64
+//  r3      address increment of -32.
+//  ip      pointer to which the next 32-byte block is to be stored
+//  q0-q3   temporary registers used for copies
+//
+// Note that the loop is arranged in such a way that a single cleanup store is
+// necessary after the final loop iteration.  This occurs at label (1), and is
+// shared between the unaligned and aligned loops.
+    
+    vld1.32 {q2,q3}, [r1],      r3
+    vld1.32 {q0,q1}, [r1],      r3
+    subs     r2,         $64
+    vst1.64 {q2,q3}, [ip,:256], r3
+    blt      1f
+.align 3
+0:  vld1.32 {q2,q3}, [r1],      r3
+    vst1.64 {q0,q1}, [ip,:256], r3
+    vld1.32 {q0,q1}, [r1],      r3
+    subs     r2,         $64
+    vst1.64 {q2,q3}, [ip,:256], r3 
+    bge      0b
+    b        1f
+    
+L_reverseSourceAligned:
+
+// This loop is identical to the immediately preceding loop, except that it
+// uses the additional alignment hint that the source pointer (r1) is 32-byte
+// aligned.  The two loops share cleanup code for the final iteration.
+
+    vld1.64 {q2,q3}, [r1,:256], r3
+    vld1.64 {q0,q1}, [r1,:256], r3
+    subs     r2,         $64
+    vst1.64 {q2,q3}, [ip,:256], r3
+    blt      1f
+.align 3
+0:  vld1.64 {q2,q3}, [r1,:256], r3
+    vst1.64 {q0,q1}, [ip,:256], r3
+    vld1.64 {q0,q1}, [r1,:256], r3
+    subs     r2,         $64
+    vst1.64 {q2,q3}, [ip,:256], r3
+    bge      0b
+    
+// Final vector store for both of the above loops.
+
+1:  vst1.64 {q0,q1}, [ip,:256], r3
+
+// Undo the 32-byte pre-decrement applied before the loops, so that r1 and ip
+// once again point one byte past the next byte to be copied, which is the
+// convention that the cleanup code below expects.
+
+    add      r1,         $32
+    add      ip,         $32
+    
+L_reverseVectorCleanup:
+
+// Add 56 to r2, so that it contains the number of bytes remaining to copy
+// minus 8.  A comparison of this value with zero tells us if any more whole
+// 8-byte blocks need to be copied.
+
+    adds     r2,    r2, $0x38
+    blt      L_scalarReverseCopy
+
+// This loop copies 8 bytes at a time in order of descending memory address,
+// until fewer than 8 bytes remain to be copied.  Within this loop, registers
+// are used as follows:
+//
+//  r0  original destination pointer
+//  r1  pointer to one byte past the next element to be copied
+//  r2  (bytes remaining to be copied) - 8
+//  ip  pointer one byte past the destination of the next byte to be copied
+//  d0  temporary storage for copy
+
+0:  sub      r1,        $8
+    vld1.32 {d0},  [r1]
+    sub      ip,        $8
+    subs     r2,        $8
+    vst1.64 {d0},  [ip,:64]     // the store leaves the subs flags intact for the bge
+    bge      0b
+
+/*****************************************************************************
+ *  sub-doubleword cleanup copies                                            *
+ *****************************************************************************/
+
+L_scalarReverseCopy:
+
+// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
+// return to the calling routine if zero bytes remain.
+
+    adds     r2,        $8
+    it       eq
+    bxeq     lr
+
+// Copy one byte at a time in descending address order until we reach the front
+// of the buffer.  Within this loop, registers are used as follows:
+//
+//  r0  original destination pointer
+//  r1  pointer to one byte past the next element to be copied
+//  r2  bytes remaining to be copied (non-zero on entry to the loop)
+//  r3  temporary to hold the byte that is being copied
+//  ip  pointer one byte past the destination of the next byte to be copied
+         
+0:  ldrb     r3,   [r1, $-1]!
+    subs     r2,        $1
+    strb     r3,   [ip, $-1]!   // the store leaves the subs flags intact for the bne
+    bne      0b
+    bx       lr
+         
+/*****************************************************************************
+ *  STMDB loop for 1k-32k buffers                                            *
+ *****************************************************************************/
+
+// This loop copies 64 bytes each iteration in order of descending memory
+// address, using the GPRs instead of NEON.
+//
+//  r0  original destination pointer
+//  r1  pointer to one byte past the next element to be copied
+//  r2  (bytes remaining to be copied) - 64
+//  r3-6,r8-11 (COPY_REGISTERS) temporary registers used for moving data
+//  ip  pointer to one byte past the next location to store to
+
+L_useSTMDB:
+    push    SAVE_REGISTERS              // the loop overwrites these; restored by the pop below
+.align 3
+0:  ldmdb   r1!,    COPY_REGISTERS      // first 32 bytes of this 64-byte step
+    subs    r2,     r2,  $64            // sets the flags consumed by the bge below
+    stmdb   ip!,    COPY_REGISTERS
+    ldmdb   r1!,    COPY_REGISTERS      // second 32 bytes
+    pld     [r1, $-64]                  // prefetch hint 64 bytes below the read pointer
+    stmdb   ip!,    COPY_REGISTERS
+    bge     0b                          // ldm/stm/pld do not modify the flags from subs
+    pop     SAVE_REGISTERS
+    b       L_reverseVectorCleanup
+    
+/*****************************************************************************
+ *  Misaligned reverse vld1 loop                                             *
+ *****************************************************************************/
+
+// Software alignment fixup to handle source and dest that are relatively
+// misaligned mod 4 bytes.
+//
+// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
+// which we combine with the 8 bytes loaded in the previous iteration to get a
+// 16 byte field; the next 8 bytes to be stored to the destination buffer are
+// somewhere in that field, and we get them using the VEXT instruction:
+//
+//     |  8 bytes from this iteration  |  8 bytes from last iteration  |
+//     +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+//     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
+//     +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+//         ^8 bytes to store this iteration^           |
+//                                                   could be a page boundary
+//
+// We need to be a little bit careful, however.  Because the loads only have 4
+// byte alignment, the very first load could slop over into a page that is not
+// mapped readable.  In order to prevent this scenario, we copy eight bytes
+// using byte-by-byte before beginning the main loop.
+//
+// At the beginning of each iteration through the vector loop (the second
+// label 0 in the macro), registers are used as follows:
+//
+//  r0  original destination pointer
+//  r1  pointer to the next block of 8 bytes to load
+//  r2  (bytes not yet stored) - 16; the store at label 1 copies 8 more
+//  ip  pointer to the next block of 8 bytes to store
+//  d0  scratch; receives the 8 bytes extracted by vext for the next store
+//  d2  most recently loaded 8 bytes (low half of the 16 byte field)
+//  d3  8 bytes loaded one load earlier (high half of the 16 byte field)
+
+#define RCOPY_UNALIGNED(offset)      \
+0:  ldrb      r3,     [r1,$-1]!     ;\
+    strb      r3,     [ip,$-1]!     ;\
+    subs      r2,         $1        ;\
+    blt       L_scalarReverseCopy   ;\
+    tst       ip,         $7        ;\
+    bne       0b                    ;\
+    bic       r1,         $3        ;\
+    sub       r1,         $8        ;\
+    sub       ip,         $8        ;\
+    mov       r3,         $-8       ;\
+    vld1.32  {d2,d3}, [r1], r3      ;\
+    subs      r2,         $8        ;\
+    blt       1f                    ;\
+0:  vext.8    d0,  d2, d3, $(offset);\
+    vmov      d3,      d2           ;\
+    vld1.32  {d2},    [r1], r3      ;\
+    subs      r2,          $8       ;\
+    vst1.64  {d0},    [ip, :64], r3 ;\
+    bge       0b                    ;\
+1:  vext.8    d0,  d2, d3, $(offset);\
+    add       r1,          $8       ;\
+    vst1.64  {d0},    [ip, :64]     ;\
+2:  add       r1,          $(offset);\
+    b         L_scalarReverseCopy
+
+L_reverseAligned1:
+    RCOPY_UNALIGNED(1)
+L_reverseAligned2:
+    RCOPY_UNALIGNED(2)
+L_reverseAligned3:
+    RCOPY_UNALIGNED(3)
+
+/*****************************************************************************
+ *  front to back copy                                                       *
+ *****************************************************************************/
+
+L_copyFrontToBack:
+
+// Here the pointers are laid out such that we can use our preferred
+// front-to-back copy.  We preserve the original destination pointer in r0
+// because it is the return value for the routine, and copy it to ip to use as
+// the working store pointer in this routine.
+
+    mov      ip,    r0
+    
+// Subtract 8 from the buffer length; if this is negative, then we will use
+// only single-byte copies, and we jump directly to a scalar copy loop.
+    
+    subs     r2,        $8
+    blt      L_scalarCopy
+    
+// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
+// to move the data.
+
+    tst      ip,        $7
+    beq      L_vectorCopy
+    
+// Otherwise, we copy a single byte at a time, in order of ascending memory
+// address, until the destination is 8 byte aligned.  Within this loop,
+// registers are used as follows:
+//
+//  r0  original destination pointer
+//  r1  pointer to the next byte to copy
+//  r2  (bytes remaining to be copied) - 8
+//  r3  temporary to hold the byte that is being copied
+//  ip  pointer to the next byte to store to
+
+0:  ldrb     r3,  [r1], $1
+    sub      r2,        $1      // no flag update; r2 may go negative (at most 7 bytes copied here)
+    strb     r3,  [ip], $1
+    tst      ip,        $7      // loop until the destination is 8-byte aligned
+    bne      0b
+    
+// At this point, the destination pointer is 8 byte aligned.  Check again that
+// there are at least 8 bytes remaining to copy by comparing the remaining
+// length minus 8 to zero.  If fewer than 8 bytes remain, jump to the cleanup
+// path.
+
+    cmp      r2,        $0
+    blt      L_scalarCopy
+    
+/*****************************************************************************
+ *  destination is doubleword aligned                                        *
+ *****************************************************************************/
+
+L_vectorCopy:
+
+// At this point, registers contain the following values:
+//
+//  r0  original destination pointer
+//  r1  pointer to the next element to be copied
+//  r2  (bytes remaining to copy) - 8
+//  ip  pointer to the destination of the next byte to be copied
+//
+// Furthermore, it is known that ip is 8 byte aligned and that r2 >= 0.  NEON
+// has really excellent alignment handling in hardware, so we would like to use
+// that to handle cases where the source is not similarly aligned to the
+// destination (it supports even single-byte misalignment at speed).  However,
+// on some SoC designs, not all of the DMA busses support such access.  Thus,
+// we must unfortunately use a software workaround in those cases.
+//
+// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
+// we only need to handle the different possible source alignments modulo 4.
+// Here we have a dispatch table to jump to the correct copy implementation
+// for the given source alignment.
+//
+// The tbh instruction loads a halfword from the table that immediately follows
+// it, doubles it, and adds it to the pc; each entry is therefore stored as
+// (target - table base)/2, and r3 = (source address mod 4) selects the entry.
+
+    ands     r3,    r1, $3
+    bic      r1,        $3      // round r1 down to 4 bytes; misaligned paths add the offset back
+    tbh     [pc, r3, lsl $1]
+0:  
+.short (L_sourceAligned0-0b)/2
+.short (L_sourceAligned1-0b)/2
+.short (L_sourceAligned2-0b)/2
+.short (L_sourceAligned3-0b)/2
+
+/*****************************************************************************
+ *  source is also at least word aligned                                     *
+ *****************************************************************************/
+    
+L_sourceAligned0:
+
+// Subtract 56 from r2, so that it contains the number of bytes remaining to
+// copy minus 64.  If this result is negative, then we jump into a loop that
+// copies 8 bytes at a time.
+
+    subs     r2,        $0x38
+    blt      L_vectorCleanup
+    
+// Check if the destination pointer is 64-byte aligned (ip is already 8-byte
+// aligned, so only bits 3-5 need testing).  If so, use the cacheline loop.
+
+    tst      ip,        $0x38
+    beq      L_cachelineAligned
+        
+// Otherwise, we copy 8 bytes at a time, in order of ascending memory
+// address, until the destination is 64 byte aligned.  Within this loop,
+// registers are used as follows:
+//
+//  r0  original destination pointer
+//  r1  pointer to the next element to be copied
+//  r2  (bytes remaining to be copied) - 64
+//  ip  pointer to the destination of the next byte to be copied
+//  d0  temporary storage for copy
+
+0:  vld1.32 {d0},  [r1]!
+    sub      r2,        $8      // no flag update; r2 may go negative, checked after the loop
+    vst1.64 {d0},  [ip,:64]!
+    tst      ip,        $0x38
+    bne      0b
+    
+// At this point, the destination pointer is 64 byte aligned.  Check again that
+// there are at least 64 bytes remaining to copy by comparing the remaining
+// length minus 64 to zero.  If fewer than 64 bytes remain, skip over the main
+// copy loop.
+
+    cmp      r2,        $0
+    blt      L_vectorCleanup
+    
+/*****************************************************************************
+ *  destination is cacheline aligned                                         *
+ *****************************************************************************/
+
+// In the special case that between 1k and 32k bytes remain to be copied here,
+// we do not use a NEON copy for the main loop.  This is because if we happen
+// to be doing a copy from a source in cache to a destination that is not in
+// cache, this will result in an increase in energy usage.  In all other cases,
+// NEON gives superior energy conservation.
+
+L_cachelineAligned:
+    sub      r3,    r2, $0x3c0      // r3 = (remaining - 64) - 960 = remaining - 1024
+    cmp      r3,        $0x7c00     // unsigned test: lower only if 1024 <= remaining < 32768
+    blo      L_useSTMIA
+    
+// The destination pointer is known to be 64-byte aligned, so we can use the
+// maximal alignment hint (:256) for our vector stores.  Detect if the source
+// is also at least 32-byte aligned and jump to a loop that uses maximal
+// alignment hints for the loads as well if possible.
+
+    tst      r1,        $0x1f
+    beq      L_sourceAligned32
+    
+// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
+// 64-byte aligned destination, in order of ascending memory address.  Within
+// this loop, registers are used as follows:
+//
+//  r0      original destination pointer (unmodified)
+//  r1      pointer to the next 32-byte block to load
+//  r2      (number of bytes remaining to copy) - 64
+//  ip      pointer to which the next 32-byte block is to be stored
+//  q0-q3   temporary registers used for copies
+//
+// Note that the loop is arranged in such a way that a single cleanup store is
+// necessary after the final loop iteration.  This occurs at label (1), and is
+// shared between the unaligned and aligned loops.
+
+    vld1.32 {q2,q3}, [r1]!
+    vld1.32 {q0,q1}, [r1]!
+    subs     r2,         $64
+    vst1.64 {q2,q3}, [ip,:256]!
+    blt      1f
+.align 3
+0:  vld1.32 {q2,q3}, [r1]!
+    vst1.64 {q0,q1}, [ip,:256]!
+    vld1.32 {q0,q1}, [r1]!
+    subs     r2,         $64
+    vst1.64 {q2,q3}, [ip,:256]!
+    bge      0b
+    b        1f
+    
+L_sourceAligned32:
+
+// This loop is identical to the immediately preceding loop, except that it
+// uses the additional alignment hint that the source pointer (r1) is 32-byte
+// aligned.  The two loops share cleanup code for the final iteration.
+
+    vld1.64 {q2,q3}, [r1,:256]!
+    vld1.64 {q0,q1}, [r1,:256]!
+    subs     r2,         $64
+    vst1.64 {q2,q3}, [ip,:256]!
+    blt      1f
+.align 3
+0:  vld1.64 {q2,q3}, [r1,:256]!
+    vst1.64 {q0,q1}, [ip,:256]!
+    vld1.64 {q0,q1}, [r1,:256]!
+    subs     r2,         $64
+    vst1.64 {q2,q3}, [ip,:256]!
+    bge      0b
+    
+// Final vector store for both of the above loops.
+
+1:  vst1.64 {q0,q1}, [ip,:256]!
+
+L_vectorCleanup:
+
+// Add 56 to r2, so that it contains the number of bytes remaining to copy
+// minus 8.  A comparison of this value with zero tells us if any more whole
+// 8-byte blocks need to be copied.
+
+    adds     r2,        $0x38
+    blt      L_scalarCopy
+
+// This loop copies 8 bytes at a time in order of ascending memory address,
+// until fewer than 8 bytes remain to be copied.  Within this loop, registers
+// are used as follows:
+//
+//  r0  original destination pointer
+//  r1  pointer to the next element to be copied
+//  r2  (bytes remaining to be copied) - 8
+//  ip  pointer to the destination of the next byte to be copied
+//  d0  temporary storage for copy
+    
+0:  vld1.32 {d0},   [r1]!
+    subs     r2,        $8
+    vst1.64 {d0},   [ip,:64]!   // the store leaves the subs flags intact for the bge
+    bge      0b
+
+/*****************************************************************************
+ *  sub-doubleword cleanup copies                                            *
+ *****************************************************************************/
+
+L_scalarCopy:
+
+// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
+// return to the calling routine if zero bytes remain.
+
+    adds     r2,        $8
+    it       eq
+    bxeq     lr
+         
+// Copy one byte at a time in ascending address order until we reach the end
+// of the buffer.  Within this loop, registers are used as follows:
+//
+//  r0  original destination pointer
+//  r1  pointer to the next byte to copy
+//  r2  bytes remaining to be copied (non-zero on entry to the loop)
+//  r3  temporary to hold the byte that is being copied
+//  ip  pointer to the next byte to store to
+
+0:  ldrb     r3,    [r1], $1
+    strb     r3,    [ip], $1
+    subs     r2,          $1
+    bne      0b
+    bx       lr
+    
+/*****************************************************************************
+ *  STMIA loop for 1k-32k buffers                                            *
+ *****************************************************************************/
+    
+// This loop copies 64 bytes each iteration in order of ascending memory
+// address, using the GPRs instead of NEON.
+//
+//  r0  original destination pointer
+//  r1  pointer to the next element to be copied
+//  r2  (bytes remaining to be copied) - 64
+//  r3-6,r8-11 (COPY_REGISTERS) temporary registers used for moving data
+//  ip  pointer to the next location to store to
+
+L_useSTMIA:
+    push     SAVE_REGISTERS             // the loop overwrites these; restored by the pop below
+.align 3
+0:  ldmia   r1!,    COPY_REGISTERS      // first 32 bytes of this 64-byte step
+    subs    r2,     r2,  $64            // sets the flags consumed by the bge below
+    stmia   ip!,    COPY_REGISTERS
+    ldmia   r1!,    COPY_REGISTERS      // second 32 bytes
+    pld     [r1, $64]                   // prefetch hint 64 bytes above the read pointer
+    stmia   ip!,    COPY_REGISTERS
+    bge     0b                          // ldm/stm/pld do not modify the flags from subs
+    pop     SAVE_REGISTERS
+    b       L_vectorCleanup
+    
+/*****************************************************************************
+ *  Misaligned forward vld1 loop                                             *
+ *****************************************************************************/
+
+// Software alignment fixup to handle source and dest that are relatively
+// misaligned mod 4 bytes.
+//
+// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
+// which we combine with the 8 bytes loaded in the previous iteration to get a
+// 16 byte field; the next 8 bytes to be stored to the destination buffer are
+// somewhere in that field, and we get them using the VEXT instruction:
+//
+//     |  8 bytes from last iteration  |  8 bytes from this iteration  |
+//     +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+//     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
+//     +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+//         ^8 bytes to store this iteration^           |
+//                                                   could be a page boundary
+//
+// We need to be a little bit careful, however.  Because the loads only have 4
+// byte alignment, if we used this approach all the way to the end of the
+// buffer, the very last 8 byte load might slop over onto a new page by 4
+// bytes, and that new page might not be mapped into our process.  Thus, we
+// stop using this vector path once fewer than 12 bytes remain to be copied,
+// instead of the more natural-seeming termination condition of "8 bytes
+// remaining" (the illustration above shows the worst case and demonstrates
+// why 12 is a sufficiently safe condition).
+//
+// At the beginning of each iteration through the vector loop (label 0 in the
+// macro), registers are used as follows:
+//
+//  r0  original destination pointer
+//  r1  pointer to the next block of 8 bytes to load
+//  r2  (bytes not yet stored) - 20; the store at label 1 copies 8 more
+//  ip  pointer to the next block of 8 bytes to store
+//  d0  scratch; receives the 8 bytes extracted by vext for the next store
+//  d2  8 bytes loaded one load earlier (low half of the 16 byte field)
+//  d3  most recently loaded 8 bytes (high half of the 16 byte field)
+
+#define COPY_UNALIGNED(offset)       \
+    subs      r2,          $4       ;\
+    blt       2f                    ;\
+    vld1.32  {d2,d3}, [r1]!         ;\
+    subs      r2,          $8       ;\
+    blt       1f                    ;\
+0:  vext.8    d0,  d2, d3, $(offset);\
+    vmov      d2,      d3           ;\
+    vld1.32  {d3},    [r1]!         ;\
+    subs      r2,          $8       ;\
+    vst1.64  {d0},    [ip, :64]!    ;\
+    bge       0b                    ;\
+1:  vext.8    d0,  d2, d3, $(offset);\
+    sub       r1,          $8       ;\
+    vst1.64  {d0},    [ip, :64]!    ;\
+2:  add       r1,          $(offset);\
+    add       r2,          $4       ;\
+    b         L_scalarCopy
+
+L_sourceAligned1:
+    COPY_UNALIGNED(1)
+L_sourceAligned2:
+    COPY_UNALIGNED(2)
+L_sourceAligned3:
+    COPY_UNALIGNED(3)
+
+#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD