bcopy_CortexA8.s diff - arm/string/bcopy_CortexA8.s - Libc source code Libc-825.24

arm/string/bcopy_CortexA8.s Libc-825.24 ⇄ /dev/null
--- Libc/Libc-825.24/arm/string/bcopy_CortexA8.s
+++ /dev/null
@@ -1,863 +0,0 @@
-/*
- * Copyright (c) 2009 Apple Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_LICENSE_HEADER_END@
- */
-
-#include <arm/arch.h>
-#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
-
-/*****************************************************************************
- * Cortex-A8 implementation                                                  *
- *****************************************************************************/
- 
-// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
-//
-// Our tests have shown that NEON is always a performance win for memcpy( ).
-// However, for the specific case of copies from a warm source to a cold
-// destination when the buffer size is between 1k and 32k, it is not enough
-// of a performance win to offset the increased power footprint, resulting
-// in an energy usage regression.  Thus, we detect that particular case, and
-// pass those copies through the ARM core registers.  All other copies larger
-// than 8 bytes are handled on NEON.
-//
-// Stephen Canon, August 2009
-
-.text
-.code 16
-.syntax unified
-
-// void bcopy(const void * source,
-//            void * destination,
-//            size_t length);
-//
-// void *memmove(void * destination,
-//               const void * source,
-//               size_t n);
-//
-// void *memcpy(void * restrict destination,
-//              const void * restrict source,
-//              size_t n);
-//
-// all copy n successive bytes from source to destination. memmove and memcpy
-// returns destination, whereas bcopy has no return value. copying takes place
-// as if it were through a temporary buffer -- after return destination contains
-// exactly the bytes from source, even if the buffers overlap.
-
-.thumb_func _bcopy$VARIANT$CortexA8
-.thumb_func _memmove$VARIANT$CortexA8
-.thumb_func _memcpy$VARIANT$CortexA8
-.globl _bcopy$VARIANT$CortexA8
-.globl _memmove$VARIANT$CortexA8
-.globl _memcpy$VARIANT$CortexA8
-
-#define SAVE_REGISTERS {r4,r5,r6,r8,r10,r11}
-#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r11}
-               
-/*****************************************************************************
- *  entry points                                                             *
- *****************************************************************************/
-
-.align 2
-_bcopy$VARIANT$CortexA8:
-
-// bcopy takes (source, destination, length), the reverse of the argument
-// order used by memmove and memcpy.  Swap r0 and r1, using r3 as scratch,
-// and then fall through into memmove.
-
-    mov         r3,     r0
-    mov         r0,     r1
-    mov         r1,     r3
-
-.align 2
-_memmove$VARIANT$CortexA8:
-_memcpy$VARIANT$CortexA8:
-
-// At entry to memmove/memcpy, registers contain the following values:
-//
-//  r0  pointer to the first byte of the destination buffer
-//  r1  pointer to the first byte of the source buffer
-//  r2  number of bytes to copy
-//
-// Our preference is to use a (faster and easier to understand) front-to-back
-// copy of the buffer.  However, memmove requires that copies take place as
-// though through a temporary buffer.  This means that if the buffers overlap,
-// it may be necessary to copy the buffer in reverse order.
-//
-// To properly detect such overlap, we begin by computing the offset between
-// the source and destination pointers.  If the offset happens to be zero,
-// then there is no work to be done, so we can return immediately.
-
-    subs    r3,     r0, r1
-    it      eq
-    bxeq    lr
-
-// r3 now contains the offset between the buffers, (destination - source).  If
-// 0 < offset < length, then the high-addressed bytes of the source alias the
-// low-addressed bytes of the destination.  Thus, if we were to perform the
-// copy in ascending address order, we would overwrite the high-addressed
-// source bytes before we had a chance to copy them, and the data would be lost.
-//
-// Thus, we can use the front-to-back copy only if offset is negative or at
-// least as large as the length.  This is the case precisely if offset
-// compares unsigned higher than or equal to length (the bhs condition).
-
-    cmp     r3,     r2
-    bhs     L_copyFrontToBack
-                             
-/*****************************************************************************
- *  back to front copy                                                       *
- *****************************************************************************/
-
-// Here we have fallen through into the back-to-front copy.  We preserve the
-// original destination pointer in r0 because it is the return value for the
-// routine, and update the other registers as follows:
-//
-//  r1  one byte beyond the end of the source buffer
-//  r2  number of bytes to copy
-//  ip  one byte beyond the end of the destination buffer
-
-    mov      ip,    r0
-    add      r1,    r2
-    add      ip,    r2
-    
-// Subtract 8 from the buffer length; if this is negative, then we will use
-// only single-byte copies, and we jump directly to a scalar copy loop.
-
-    subs     r2,        $8
-    blt      L_scalarReverseCopy
-    
-// If the destination end pointer (ip) is 8-byte aligned we can use 8-byte
-// NEON copies to move the data.
-    
-    tst      ip,        $7
-    beq      L_vectorReverseCopy
-    
-// Otherwise, we copy a single byte at a time, in order of descending memory
-// address, until the destination is 8 byte aligned.  Within this loop,
-// registers are used as follows:
-//
-//  r0  original destination pointer
-//  r1  pointer to one byte past the next element to be copied
-//  r2  (bytes remaining to be copied) - 8
-//  r3  temporary to hold the byte that is being copied
-//  ip  pointer one byte past the destination of the next byte to be copied
-//
-//  byte that will be copied in this iteration
-//                            |  byte that was copied in the previous iteration
-//  Source buffer:            v   v
-//  ------------------------+---+---+-------------------------
-//  bytes still to copy ... |   |   | ... bytes already copied
-//  ------------------------+---+---+-------------------------
-//                                ^
-//                               r1 holds the address of this byte
-
-0:  ldrb     r3,   [r1, $-1]!
-    sub      r2,        $1
-    strb     r3,   [ip, $-1]!
-    tst      ip,        $7
-    bne      0b
-    
-// At this point, the destination pointer is 8 byte aligned.  Check again that
-// there are at least 8 bytes remaining to copy by comparing the remaining
-// length minus 8 (r2) to zero.  If fewer than 8 bytes remain, jump to the
-// cleanup path.
-    
-    cmp      r2,    $0
-    blt      L_scalarReverseCopy
-                                    
-/*****************************************************************************
- *  destination is 8 byte aligned                                            *
- *****************************************************************************/
-
-L_vectorReverseCopy:
-
-// At this point, registers contain the following values:
-//
-//  r0  original destination pointer
-//  r1  pointer to one byte past the next element to be copied
-//  r2  (bytes remaining to copy) - 8
-//  ip  pointer one byte past the destination of the next byte to be copied
-//
-// Furthermore, it is known that ip is 8 byte aligned, and that r2 is not
-// negative.  NEON has really excellent alignment handling in hardware, so we
-// would like to use that to handle cases where the source is not similarly
-// aligned to the destination (it supports even single-byte misalignment at
-// speed).  However, on some SoC designs, not all of the DMA busses support
-// such access.  Thus, we must unfortunately use a software workaround.
-//
-// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
-// we only need to handle the different possible source alignments modulo 4.
-// Here we have a dispatch table to jump to the correct copy implementation
-// for the given source alignment.
-//
-// The tbh instruction loads a halfword offset from the data table that
-// immediately follows it (indexed by r3), doubles it, and adds it to the pc
-// to jump to the correct implementation.
-
-    ands     r3,    r1, $3
-    tbh     [pc, r3, lsl $1]
-0:  
-.short (L_reverseAligned0-0b)/2
-.short (L_reverseAligned1-0b)/2
-.short (L_reverseAligned2-0b)/2
-.short (L_reverseAligned3-0b)/2
-
-/*****************************************************************************
- *  source is also at least word aligned                                     *
- *****************************************************************************/
-    
-L_reverseAligned0:
-
-// Subtract 56 from r2, so that it contains the number of bytes remaining to
-// copy minus 64.  If this result is negative, then we jump into a loop that
-// copies 8 bytes at a time.
-
-    subs     r2,        $0x38
-    blt      L_reverseVectorCleanup
-    
-// Check if the destination pointer is 64-byte aligned (it is already known to
-// be 8-byte aligned).  If so, jump to a loop that copies whole cachelines.
-
-    tst      ip,        $0x38
-    beq      L_reverseCachelineAligned
-    
-// Otherwise, we copy 8 bytes at a time, in order of descending memory
-// address, until the destination is 64 byte aligned.  Within this loop,
-// registers are used as follows:
-//
-//  r0  original destination pointer
-//  r1  pointer to one byte past the next element to be copied
-//  r2  (bytes remaining to be copied) - 64
-//  ip  pointer one byte past the destination of the next byte to be copied
-//  d0  temporary storage for copy
-//
-//  bytes that will be copied after this iteration
-//        |         8 byte block that will be copied in this iteration
-//        v         v
-//  --------------+-------------------------------+---------------------
-//                | 0   1   2   3   4   5   6   7 | bytes already copied
-//  --------------+-------------------------------+---------------------
-//                                                  ^
-//                                                  r1 points here
-    
-0:  sub      r1,        $8
-    vld1.32 {d0},  [r1]
-    sub      ip,        $8
-    sub      r2,        $8
-    tst      ip,        $0x38
-    vst1.64 {d0},  [ip,:64]
-    bne      0b
-    
-// At this point, the destination pointer is 64 byte aligned.  Check again that
-// there are at least 64 bytes remaining to copy by comparing the remaining
-// length minus 64 (r2) to zero.  If fewer than 64 bytes remain, skip over the
-// main copy loop.
-
-    cmp      r2,        $0
-    blt      L_reverseVectorCleanup
-    
-/*****************************************************************************
- *  destination is cacheline aligned                                         *
- *****************************************************************************/
-
-L_reverseCachelineAligned:
-
-// If between 1k and 32k bytes remain to be copied, we do not use a NEON copy
-// for the main loop: for a copy from a source in cache to a destination that
-// is not in cache, NEON would increase energy usage.  In all other cases NEON
-// gives superior energy conservation.  Since r2 = (bytes remaining) - 64, the
-// single unsigned compare below tests 1k <= (bytes remaining) < 32k.
-
-    sub      r3,    r2, $0x3c0
-    cmp      r3,        $0x7c00
-    blo      L_useSTMDB
-    
-// Pre-decrement the source (r1) and destination (ip) pointers so that they
-// point to the first byte of the trailing 32-byte window of each buffer.
-// Additionally, load the address increment of -32 into r3.
-
-    sub      r1,        $32
-    sub      ip,        $32
-    mov      r3,        $-32
-    
-// The destination pointer is known to be 64-byte aligned, so we can use the
-// maximal alignment hint (:256) for our vector stores.  Detect if the source
-// is also at least 32-byte aligned and jump to a loop that uses maximal
-// alignment hints for the loads as well if possible.
-    
-    tst      r1,        $0x1f
-    beq      L_reverseSourceAligned
-    
-// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
-// 64-byte aligned destination, in order of descending memory address.  Within
-// this loop, registers are used as follows:
-//
-//  r0      original destination pointer (unmodified)
-//  r1      pointer to the next 32-byte block to load
-//  r2      (number of bytes remaining to copy) - 64
-//  r3      address increment of -32.
-//  ip      pointer to which the next 32-byte block is to be stored
-//  q0-q3   temporary registers used for copies
-//
-// Note that the loop is arranged in such a way that a single cleanup store is
-// necessary after the final loop iteration.  This occurs at label (1), and is
-// shared between the unaligned and aligned loops.
-    
-    vld1.32 {q2,q3}, [r1],      r3
-    vld1.32 {q0,q1}, [r1],      r3
-    subs     r2,         $64
-    vst1.64 {q2,q3}, [ip,:256], r3
-    blt      1f
-.align 3
-0:  vld1.32 {q2,q3}, [r1],      r3
-    vst1.64 {q0,q1}, [ip,:256], r3
-    vld1.32 {q0,q1}, [r1],      r3
-    subs     r2,         $64
-    vst1.64 {q2,q3}, [ip,:256], r3 
-    bge      0b
-    b        1f
-    
-L_reverseSourceAligned:
-
-// This loop is identical to the immediately preceding loop, except that it
-// uses the additional alignment hint that the source pointer (r1) is 32-byte
-// aligned.  The two loops share cleanup code for the final iteration.
-
-    vld1.64 {q2,q3}, [r1,:256], r3
-    vld1.64 {q0,q1}, [r1,:256], r3
-    subs     r2,         $64
-    vst1.64 {q2,q3}, [ip,:256], r3
-    blt      1f
-.align 3
-0:  vld1.64 {q2,q3}, [r1,:256], r3
-    vst1.64 {q0,q1}, [ip,:256], r3
-    vld1.64 {q0,q1}, [r1,:256], r3
-    subs     r2,         $64
-    vst1.64 {q2,q3}, [ip,:256], r3
-    bge      0b
-    
-// Final vector store (of the block held in q0,q1) for both of the above loops.
-
-1:  vst1.64 {q0,q1}, [ip,:256], r3
-
-// Undo the 32-byte pre-decrement of the source and destination pointers so
-// that they once again point one byte past the next byte to be copied, which
-// is the convention the cleanup code below expects.
-
-    add      r1,         $32
-    add      ip,         $32
-    
-L_reverseVectorCleanup:
-
-// Add 56 to r2, so that it contains the number of bytes remaining to copy
-// minus 8.  A comparison of this value with zero tells us if any more whole
-// 8-byte blocks need to be copied.
-
-    adds     r2,    r2, $0x38
-    blt      L_scalarReverseCopy
-
-// This loop copies 8 bytes at a time in order of descending memory address,
-// until fewer than 8 bytes remain to be copied.  Within this loop, registers
-// are used as follows:
-//
-//  r0  original destination pointer
-//  r1  pointer to one byte past the next element to be copied
-//  r2  (bytes remaining to be copied) - 8
-//  ip  pointer one byte past the destination of the next byte to be copied
-//  d0  temporary storage for copy
-
-0:  sub      r1,        $8
-    vld1.32 {d0},  [r1]
-    sub      ip,        $8
-    subs     r2,        $8
-    vst1.64 {d0},  [ip,:64]
-    bge      0b
-
-/*****************************************************************************
- *  sub-doubleword cleanup copies                                            *
- *****************************************************************************/
-
-L_scalarReverseCopy:
-
-// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
-// return to the calling routine if zero bytes remain.
-
-    adds     r2,        $8
-    it       eq
-    bxeq     lr
-
-// Copy one byte at a time in descending address order until we reach the front
-// of the buffer.  Within this loop, registers are used as follows:
-//
-//  r0  original destination pointer
-//  r1  pointer to one byte past the next element to be copied
-//  r2  bytes remaining to be copied (the loop ends when this reaches zero)
-//  r3  temporary to hold the byte that is being copied
-//  ip  pointer one byte past the destination of the next byte to be copied
-         
-0:  ldrb     r3,   [r1, $-1]!
-    subs     r2,        $1
-    strb     r3,   [ip, $-1]!
-    bne      0b
-    bx       lr
-         
-/*****************************************************************************
- *  STMDB loop for 1k-32k buffers                                            *
- *****************************************************************************/
-
-// This loop copies 64 bytes each iteration in order of descending memory
-// address, using the GPRs instead of NEON.  SAVE_REGISTERS is pushed before
-// the loop and popped after it; control then resumes at the 8-byte cleanup.
-//  r0  original destination pointer
-//  r1  pointer to one byte past the next element to be copied
-//  r2  (bytes remaining to be copied) - 64
-//  r3-6,r8-11 (COPY_REGISTERS) temporary registers used for moving data
-//  ip  pointer to one byte past the next location to store to
-
-L_useSTMDB:
-    push    SAVE_REGISTERS
-.align 3
-0:  ldmdb   r1!,    COPY_REGISTERS
-    subs    r2,     r2,  $64
-    stmdb   ip!,    COPY_REGISTERS
-    ldmdb   r1!,    COPY_REGISTERS
-    pld     [r1, $-64]
-    stmdb   ip!,    COPY_REGISTERS
-    bge     0b
-    pop     SAVE_REGISTERS
-    b       L_reverseVectorCleanup
-    
-/*****************************************************************************
- *  Misaligned reverse vld1 loop                                             *
- *****************************************************************************/
-
-// Software alignment fixup to handle source and dest that are relatively
-// misaligned mod 4 bytes.
-//
-// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
-// which we combine with the 8 bytes loaded in the previous iteration to get a
-// 16 byte field; the next 8 bytes to be stored to the destination buffer are
-// somewhere in that field, and we get them using the VEXT instruction:
-//
-//     |  8 bytes from this iteration  |  8 bytes from last iteration  |
-//     +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
-//     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
-//     +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
-//         ^8 bytes to store this iteration^           |
-//                                                   could be a page boundary
-//
-// We need to be a little bit careful, however.  Because the loads only have 4
-// byte alignment, the very first load could slop over into a page that is not
-// mapped readable.  In order to prevent this scenario, we copy eight bytes
-// one byte at a time before beginning the main loop.
-//
-// At the beginning of each iteration through the main (vext) loop, registers
-// are used as follows:
-//
-//  r0  original destination pointer
-//  r1  pointer to the next block of 8 bytes to load
-//  r2  (bytes remaining to copy) - 16
-//  ip  pointer to the next block of 8 bytes to store
-//  d0  next 8 bytes to store
-//  d2  8 bytes loaded in the previous iteration
-//  d3  8 bytes loaded two iterations ago
-
-#define RCOPY_UNALIGNED(offset)      \
-0:  ldrb      r3,     [r1,$-1]!     ;\
-    strb      r3,     [ip,$-1]!     ;\
-    subs      r2,         $1        ;\
-    blt       L_scalarReverseCopy   ;\
-    tst       ip,         $7        ;\
-    bne       0b                    ;\
-    bic       r1,         $3        ;\
-    sub       r1,         $8        ;\
-    sub       ip,         $8        ;\
-    mov       r3,         $-8       ;\
-    vld1.32  {d2,d3}, [r1], r3      ;\
-    subs      r2,         $8        ;\
-    blt       1f                    ;\
-0:  vext.8    d0,  d2, d3, $(offset);\
-    vmov      d3,      d2           ;\
-    vld1.32  {d2},    [r1], r3      ;\
-    subs      r2,          $8       ;\
-    vst1.64  {d0},    [ip, :64], r3 ;\
-    bge       0b                    ;\
-1:  vext.8    d0,  d2, d3, $(offset);\
-    add       r1,          $8       ;\
-    vst1.64  {d0},    [ip, :64]     ;\
-2:  add       r1,          $(offset);\
-    b         L_scalarReverseCopy
-
-L_reverseAligned1:
-    RCOPY_UNALIGNED(1)
-L_reverseAligned2:
-    RCOPY_UNALIGNED(2)
-L_reverseAligned3:
-    RCOPY_UNALIGNED(3)
-
-/*****************************************************************************
- *  front to back copy                                                       *
- *****************************************************************************/
-
-L_copyFrontToBack:
-
-// Here the pointers are laid out such that we can use our preferred
-// front-to-back copy.  We preserve the original destination pointer in r0
-// because it is the return value for the routine, and copy it to ip to use as
-// the working destination pointer in this routine.
-
-    mov      ip,    r0
-    
-// Subtract 8 from the buffer length; if this is negative, then we will use
-// only single-byte copies, and we jump directly to a scalar copy loop.
-    
-    subs     r2,        $8
-    blt      L_scalarCopy
-    
-// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
-// to move the data.
-
-    tst      ip,        $7
-    beq      L_vectorCopy
-    
-// Otherwise, we copy a single byte at a time, in order of ascending memory
-// address, until the destination is 8 byte aligned.  Within this loop,
-// registers are used as follows:
-//
-//  r0  original destination pointer
-//  r1  pointer to the next byte to copy
-//  r2  (bytes remaining to be copied) - 8
-//  r3  temporary to hold the byte that is being copied
-//  ip  pointer to the next byte to store to
-
-0:  ldrb     r3,  [r1], $1
-    sub      r2,        $1
-    strb     r3,  [ip], $1
-    tst      ip,        $7
-    bne      0b
-    
-// At this point, the destination pointer is 8 byte aligned.  Check again that
-// there are at least 8 bytes remaining to copy by comparing the remaining
-// length minus 8 (r2) to zero.  If fewer than 8 bytes remain, jump to the
-// cleanup path.
-
-    cmp      r2,        $0
-    blt      L_scalarCopy
-    
-/*****************************************************************************
- *  destination is doubleword aligned                                        *
- *****************************************************************************/
-
-L_vectorCopy:
-
-// At this point, registers contain the following values:
-//
-//  r0  original destination pointer
-//  r1  pointer to the next element to be copied
-//  r2  (bytes remaining to copy) - 8
-//  ip  pointer to the destination of the next byte to be copied
-//
-// Furthermore, it is known that ip is 8 byte aligned, and that r2 is not
-// negative.  NEON has really excellent alignment handling in hardware, so we
-// would like to use that to handle cases where the source is not similarly
-// aligned to the destination (it supports even single-byte misalignment at
-// speed).  However, on some SoC designs, not all of the DMA busses support
-// such access.  Thus, we must unfortunately use a software workaround.
-//
-// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
-// we only need to handle the different possible source alignments modulo 4.
-// Here we round r1 down to a 4-byte boundary (the misaligned implementations
-// add the offset back when they finish) and use a dispatch table to jump to
-// the correct copy implementation for the given source alignment.  The tbh
-// instruction loads a halfword offset from the data table that immediately
-// follows it (indexed by r3), doubles it, and adds it to the pc to jump to
-// the correct implementation.
-
-    ands     r3,    r1, $3
-    bic      r1,        $3
-    tbh     [pc, r3, lsl $1]
-0:  
-.short (L_sourceAligned0-0b)/2
-.short (L_sourceAligned1-0b)/2
-.short (L_sourceAligned2-0b)/2
-.short (L_sourceAligned3-0b)/2
-
-/*****************************************************************************
- *  source is also at least word aligned                                     *
- *****************************************************************************/
-    
-L_sourceAligned0:
-
-// Subtract 56 from r2, so that it contains the number of bytes remaining to
-// copy minus 64.  If this result is negative, then we jump into a loop that
-// copies 8 bytes at a time.
-
-    subs     r2,        $0x38
-    blt      L_vectorCleanup
-    
-// Check if the destination pointer is 64-byte aligned (it is already known to
-// be 8-byte aligned).  If so, jump to a loop that copies whole cachelines.
-
-    tst      ip,        $0x38
-    beq      L_cachelineAligned
-        
-// Otherwise, we copy 8 bytes at a time, in order of ascending memory
-// address, until the destination is 64 byte aligned.  Within this loop,
-// registers are used as follows:
-//
-//  r0  original destination pointer
-//  r1  pointer to the next element to be copied
-//  r2  (bytes remaining to be copied) - 64
-//  ip  pointer to the destination of the next byte to be copied
-//  d0  temporary storage for copy
-
-0:  vld1.32 {d0},  [r1]!
-    sub      r2,        $8
-    vst1.64 {d0},  [ip,:64]!
-    tst      ip,        $0x38
-    bne      0b
-    
-// At this point, the destination pointer is 64 byte aligned.  Check again that
-// there are at least 64 bytes remaining to copy by comparing the remaining
-// length minus 64 (r2) to zero.  If fewer than 64 bytes remain, skip over the
-// main copy loop.
-
-    cmp      r2,        $0
-    blt      L_vectorCleanup
-    
-/*****************************************************************************
- *  destination is cacheline aligned                                         *
- *****************************************************************************/
-
-// If between 1k and 32k bytes remain to be copied, we do not use a NEON copy
-// for the main loop: for a copy from a source in cache to a destination that
-// is not in cache, NEON would increase energy usage.  In all other cases NEON
-// gives superior energy conservation.  Since r2 = (bytes remaining) - 64, the
-// single unsigned compare below tests 1k <= (bytes remaining) < 32k.
-
-L_cachelineAligned:
-    sub      r3,    r2, $0x3c0
-    cmp      r3,        $0x7c00
-    blo      L_useSTMIA
-    
-// The destination pointer is known to be 64-byte aligned, so we can use the
-// maximal alignment hint (:256) for our vector stores.  Detect if the source
-// is also at least 32-byte aligned and jump to a loop that uses maximal
-// alignment hints for the loads as well if possible.
-
-    tst      r1,        $0x1f
-    beq      L_sourceAligned32
-    
-// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
-// 64-byte aligned destination, in order of ascending memory address.  Within
-// this loop, registers are used as follows:
-//
-//  r0      original destination pointer (unmodified)
-//  r1      pointer to the next 32-byte block to load
-//  r2      (number of bytes remaining to copy) - 64
-//  ip      pointer to which the next 32-byte block is to be stored
-//  q0-q3   temporary registers used for copies
-//
-// Note that the loop is arranged in such a way that a single cleanup store is
-// necessary after the final loop iteration.  This occurs at label (1), and is
-// shared between the unaligned and aligned loops.
-
-    vld1.32 {q2,q3}, [r1]!
-    vld1.32 {q0,q1}, [r1]!
-    subs     r2,         $64
-    vst1.64 {q2,q3}, [ip,:256]!
-    blt      1f
-.align 3
-0:  vld1.32 {q2,q3}, [r1]!
-    vst1.64 {q0,q1}, [ip,:256]!
-    vld1.32 {q0,q1}, [r1]!
-    subs     r2,         $64
-    vst1.64 {q2,q3}, [ip,:256]!
-    bge      0b
-    b        1f
-    
-L_sourceAligned32:
-
-// This loop is identical to the immediately preceding loop, except that it
-// uses the additional alignment hint that the source pointer (r1) is 32-byte
-// aligned.  The two loops share cleanup code for the final iteration.
-
-    vld1.64 {q2,q3}, [r1,:256]!
-    vld1.64 {q0,q1}, [r1,:256]!
-    subs     r2,         $64
-    vst1.64 {q2,q3}, [ip,:256]!
-    blt      1f
-.align 3
-0:  vld1.64 {q2,q3}, [r1,:256]!
-    vst1.64 {q0,q1}, [ip,:256]!
-    vld1.64 {q0,q1}, [r1,:256]!
-    subs     r2,         $64
-    vst1.64 {q2,q3}, [ip,:256]!
-    bge      0b
-    
-// Final vector store (of the block held in q0,q1) for both of the above loops.
-
-1:  vst1.64 {q0,q1}, [ip,:256]!
-
-L_vectorCleanup:
-
-// Add 56 to r2, so that it contains the number of bytes remaining to copy
-// minus 8.  A comparison of this value with zero tells us if any more whole
-// 8-byte blocks need to be copied.
-
-    adds     r2,        $0x38
-    blt      L_scalarCopy
-
-// This loop copies 8 bytes at a time in order of ascending memory address,
-// until fewer than 8 bytes remain to be copied.  Within this loop, registers
-// are used as follows:
-//
-//  r0  original destination pointer
-//  r1  pointer to the next element to be copied
-//  r2  (bytes remaining to be copied) - 8
-//  ip  pointer to the destination of the next byte to be copied
-//  d0  temporary storage for copy
-    
-0:  vld1.32 {d0},   [r1]!
-    subs     r2,        $8
-    vst1.64 {d0},   [ip,:64]!
-    bge      0b
-
-/*****************************************************************************
- *  sub-doubleword cleanup copies                                            *
- *****************************************************************************/
-
-L_scalarCopy:
-
-// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
-// return to the calling routine if zero bytes remain.
-
-    adds     r2,        $8
-    it       eq
-    bxeq     lr
-         
-// Copy one byte at a time in ascending address order until we reach the end
-// of the buffer.  Within this loop, registers are used as follows:
-//
-//  r0  original destination pointer
-//  r1  pointer to the next byte to copy
-//  r2  bytes remaining to be copied (the loop ends when this reaches zero)
-//  r3  temporary to hold the byte that is being copied
-//  ip  pointer to the next byte to store to
-
-0:  ldrb     r3,    [r1], $1
-    strb     r3,    [ip], $1
-    subs     r2,          $1
-    bne      0b
-    bx       lr
-    
-/*****************************************************************************
- *  STMIA loop for 1k-32k buffers                                            *
- *****************************************************************************/
-    
-// This loop copies 64 bytes each iteration in order of ascending memory
-// address, using the GPRs instead of NEON.  SAVE_REGISTERS is pushed before
-// the loop and popped after it; control then resumes at the 8-byte cleanup.
-//  r0  original destination pointer
-//  r1  pointer to the next element to be copied
-//  r2  (bytes remaining to be copied) - 64
-//  r3-6,r8-11 (COPY_REGISTERS) temporary registers used for moving data
-//  ip  pointer to the next location to store to
-
-L_useSTMIA:
-    push     SAVE_REGISTERS
-.align 3
-0:  ldmia   r1!,    COPY_REGISTERS
-    subs    r2,     r2,  $64
-    stmia   ip!,    COPY_REGISTERS
-    ldmia   r1!,    COPY_REGISTERS
-    pld     [r1, $64]
-    stmia   ip!,    COPY_REGISTERS
-    bge     0b
-    pop     SAVE_REGISTERS
-    b       L_vectorCleanup
-    
-/*****************************************************************************
- *  Misaligned forward vld1 loop                                             *
- *****************************************************************************/
-
-// Software alignment fixup to handle source and dest that are relatively
-// misaligned mod 4 bytes.
-//
-// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
-// which we combine with the 8 bytes loaded in the previous iteration to get a
-// 16 byte field; the next 8 bytes to be stored to the destination buffer are
-// somewhere in that field, and we get them using the VEXT instruction:
-//
-//     |  8 bytes from last iteration  |  8 bytes from this iteration  |
-//     +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
-//     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
-//     +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
-//         ^8 bytes to store this iteration^           |
-//                                                   could be a page boundary
-//
-// We need to be a little bit careful, however.  Because the loads only have 4
-// byte alignment, if we used this approach all the way to the end of the
-// buffer, the very last 8 byte load might slop over onto a new page by 4
-// bytes, and that new page might not be mapped into our process.  Thus, we
-// terminate this copy loop when fewer than 12 bytes remain to be copied,
-// instead of the more natural-seeming termination condition of "8 bytes
-// remaining" (the illustration above shows the worst case and demonstrates
-// why 12 is a sufficiently safe condition).
-//
-// At the beginning of each iteration through the main (vext) loop, registers
-// are used as follows:
-//
-//  r0  original destination pointer
-//  r1  pointer to the next block of 8 bytes to load
-//  r2  (bytes remaining to copy) - 20; 8 are stored before the loop test
-//  ip  pointer to the next block of 8 bytes to store
-//  d0  next 8 bytes to store
-//  d2  8 bytes loaded in the previous iteration
-//  d3  8 bytes loaded two iterations ago
-
-#define COPY_UNALIGNED(offset)       \
-    subs      r2,          $4       ;\
-    blt       2f                    ;\
-    vld1.32  {d2,d3}, [r1]!         ;\
-    subs      r2,          $8       ;\
-    blt       1f                    ;\
-0:  vext.8    d0,  d2, d3, $(offset);\
-    vmov      d2,      d3           ;\
-    vld1.32  {d3},    [r1]!         ;\
-    subs      r2,          $8       ;\
-    vst1.64  {d0},    [ip, :64]!    ;\
-    bge       0b                    ;\
-1:  vext.8    d0,  d2, d3, $(offset);\
-    sub       r1,          $8       ;\
-    vst1.64  {d0},    [ip, :64]!    ;\
-2:  add       r1,          $(offset);\
-    add       r2,          $4       ;\
-    b         L_scalarCopy
-
-L_sourceAligned1:
-    COPY_UNALIGNED(1)
-L_sourceAligned2:
-    COPY_UNALIGNED(2)
-L_sourceAligned3:
-    COPY_UNALIGNED(3)
-
-#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD