Loading...
--- Libc/Libc-763.13/arm/string/bcopy_CortexA8.s
+++ Libc/Libc-825.24/arm/string/bcopy_CortexA8.s
@@ -21,3 +21,843 @@
* @APPLE_LICENSE_HEADER_END@
*/
+#include <arm/arch.h>
+#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
+
+/*****************************************************************************
+ * Cortex-A8 implementation *
+ *****************************************************************************/
+
+// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
+//
+// Our tests have shown that NEON is always a performance win for memcpy( ).
+// However, for the specific case of copies from a warm source to a cold
+// destination when the buffer size is between 1k and 32k, it is not enough
+// of a performance win to offset the increased power footprint, resulting
+// in an energy usage regression. Thus, we detect that particular case, and
+// pass those copies through the ARM core registers. All other copies larger
+// than 8 bytes are handled on NEON.
+//
+// Stephen Canon, August 2009
+
+.text
+.code 16
+.syntax unified
+
+// void bcopy(const void * source,
+// void * destination,
+// size_t length);
+//
+// void *memmove(void * destination,
+// const void * source,
+// size_t n);
+//
+// void *memcpy(void * restrict destination,
+// const void * restrict source,
+// size_t n);
+//
+// All copy n successive bytes from source to destination. memmove and memcpy
+// return destination, whereas bcopy has no return value. Copying takes place
+// as if it were through a temporary buffer -- after return destination contains
+// exactly the bytes from source, even if the buffers overlap.
+
+.thumb_func _bcopy$VARIANT$CortexA8
+.thumb_func _memmove$VARIANT$CortexA8
+.thumb_func _memcpy$VARIANT$CortexA8
+.globl _bcopy$VARIANT$CortexA8
+.globl _memmove$VARIANT$CortexA8
+.globl _memcpy$VARIANT$CortexA8
+
+#define SAVE_REGISTERS {r4,r5,r6,r8,r10,r11} // COPY_REGISTERS minus r3 and r9; pushed and popped around the LDM/STM loops
+#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r11} // eight registers, i.e. 32 bytes per LDM/STM
+
+/*****************************************************************************
+ * entry points *
+ *****************************************************************************/
+
+.align 2
+_bcopy$VARIANT$CortexA8:
+
+// bcopy takes its first and second arguments in the opposite order from the C
+// library functions memmove and memcpy. If bcopy is called, we swap these two
+// arguments (using r3 as scratch) and then fall through into memmove.
+
+ mov r3, r0 // r3 = bcopy source
+ mov r0, r1 // r0 = destination, as memmove expects
+ mov r1, r3 // r1 = source, as memmove expects
+
+.align 2
+_memmove$VARIANT$CortexA8:
+_memcpy$VARIANT$CortexA8:
+
+// At entry to memmove/memcpy (one shared body), registers contain:
+//
+// r0 pointer to the first byte of the destination buffer
+// r1 pointer to the first byte of the source buffer
+// r2 number of bytes to copy
+//
+// Our preference is to use a (faster and easier to understand) front-to-back
+// copy of the buffer. However, memmove requires that copies take place as
+// though through a temporary buffer. This means that if the buffers overlap,
+// it may be necessary to copy the buffer in reverse order.
+//
+// To properly detect such overlap, we begin by computing the offset between
+// the source and destination pointers. If the offset happens to be zero,
+// then there is no work to be done, so we can early out.
+
+ subs r3, r0, r1 // r3 = destination - source
+ it eq
+ bxeq lr
+
+// r3 now contains the offset between the buffers, (destination - source). If
+// 0 < offset < length, then the high-addressed bytes of the source alias the
+// low-addressed bytes of the destination. Thus, if we were to perform the
+// copy in ascending address order, we would overwrite the high-addressed
+// source bytes before we had a chance to copy them, and the data would be lost.
+//
+// Thus, we can use the front-to-back copy only if offset is negative or at
+// least the length. This is the case precisely if offset compares unsigned
+// higher than or the same as length, which is the condition tested by bhs.
+
+ cmp r3, r2
+ bhs L_copyFrontToBack
+
+/*****************************************************************************
+ * back to front copy *
+ *****************************************************************************/
+
+// Here we have fallen through into the back-to-front copy. We preserve the
+// original destination pointer in r0 because it is the return value for the
+// routine, and update the other registers as follows:
+//
+// r1 one byte beyond the end of the source buffer
+// r2 number of bytes to copy
+// ip one byte beyond the end of the destination buffer
+
+ mov ip, r0
+ add r1, r2
+ add ip, r2
+
+// Subtract 8 from the buffer length; if this is negative, then we will use
+// only single-byte copies, and we jump directly to a scalar copy loop.
+
+ subs r2, $8
+ blt L_scalarReverseCopy
+
+// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
+// to move the data.
+
+ tst ip, $7
+ beq L_vectorReverseCopy
+
+// Otherwise, we copy a single byte at a time, in order of descending memory
+// address, until the destination is 8 byte aligned. Within this loop,
+// registers are used as follows:
+//
+// r0 original destination pointer
+// r1 pointer to one byte past the next element to be copied
+// r2 (bytes remaining to be copied) - 8
+// r3 temporary to hold the byte that is being copied
+// ip pointer one byte past the destination of the next byte to be copied
+//
+// byte that will be copied in this iteration
+// | byte that was copied in the previous iteration
+// Source buffer: v v
+// ------------------------+---+---+-------------------------
+// bytes still to copy ... | | | ... bytes already copied
+// ------------------------+---+---+-------------------------
+// ^
+// r1 holds the address of this byte
+
+0: ldrb r3, [r1, $-1]!
+ sub r2, $1 // no flag update; the tst below drives the loop
+ strb r3, [ip, $-1]!
+ tst ip, $7
+ bne 0b
+
+// At this point, the destination pointer is 8 byte aligned. Check again that
+// there are at least 8 bytes remaining to copy by comparing the remaining
+// length minus 8 to zero. If fewer than 8 bytes remain, jump to the cleanup
+// path.
+
+ cmp r2, $0
+ blt L_scalarReverseCopy
+
+/*****************************************************************************
+ * destination is 8 byte aligned *
+ *****************************************************************************/
+
+L_vectorReverseCopy:
+
+// At this point, registers contain the following values:
+//
+// r0 original destination pointer
+// r1 pointer to one byte past the next element to be copied
+// r2 (bytes remaining to copy) - 8
+// ip pointer one byte past the destination of the next byte to be copied
+//
+// Furthermore, it is known that ip is 8 byte aligned, and r2 is non-negative.
+// NEON has really excellent alignment handling in hardware, so we would like
+// to use that to handle cases where the source is not similarly aligned to the
+// destination (it supports even single-byte misalignment at speed). However,
+// on some SoC designs, not all of the DMA busses support such access. Thus,
+// we must unfortunately use a software workaround in those cases.
+//
+// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
+// we only need to handle the different possible source alignments modulo 4.
+// Here we have a dispatch table to jump to the correct copy implementation
+// for the given source alignment.
+//
+// The tbh instruction loads a halfword from the data table that immediately
+// follows it (indexed by r3), doubles it, and adds the result to the pc to
+// jump to the correct branch; hence the table entries are offsets divided by 2.
+
+ ands r3, r1, $3 // r3 = source address modulo 4 (table index)
+ tbh [pc, r3, lsl $1]
+0:
+.short (L_reverseAligned0-0b)/2
+.short (L_reverseAligned1-0b)/2
+.short (L_reverseAligned2-0b)/2
+.short (L_reverseAligned3-0b)/2
+
+/*****************************************************************************
+ * source is also at least word aligned *
+ *****************************************************************************/
+
+L_reverseAligned0:
+
+// Subtract 56 from r2, so that it contains the number of bytes remaining to
+// copy minus 64. If this result is negative, then we jump into a loop that
+// copies 8 bytes at a time.
+
+ subs r2, $0x38
+ blt L_reverseVectorCleanup
+
+// Check if the destination pointer is 64-byte aligned. If so, jump to a loop
+// that copies whole cachelines.
+
+ tst ip, $0x38 // ip is already 8-byte aligned, so bits 3-5 clear means 64-byte aligned
+ beq L_reverseCachelineAligned
+
+// Otherwise, we copy 8 bytes at a time, in order of descending memory
+// address, until the destination is 64 byte aligned. Within this loop,
+// registers are used as follows:
+//
+// r0 original destination pointer
+// r1 pointer to one byte past the next element to be copied
+// r2 (bytes remaining to be copied) - 64
+// ip pointer one byte past the destination of the next byte to be copied
+// d0 temporary storage for copy
+//
+// bytes that will be copied after this iteration
+// | 8 byte block that will be copied in this iteration
+// v v
+// --------------+-------------------------------+---------------------
+// | 0 1 2 3 4 5 6 7 | bytes already copied
+// --------------+-------------------------------+---------------------
+// ^
+// r1 points here
+
+0: sub r1, $8
+ vld1.32 {d0}, [r1]
+ sub ip, $8
+ sub r2, $8 // no flag update; the tst below drives the loop
+ tst ip, $0x38
+ vst1.64 {d0}, [ip,:64]
+ bne 0b
+
+// At this point, the destination pointer is 64 byte aligned. Check again that
+// there are at least 64 bytes remaining to copy by comparing the remaining
+// length minus 64 to zero. If fewer than 64 bytes remain, skip over the main
+// copy loop.
+
+ cmp r2, $0
+ blt L_reverseVectorCleanup
+
+/*****************************************************************************
+ * destination is cacheline aligned *
+ *****************************************************************************/
+
+L_reverseCachelineAligned:
+
+// In the special case that we are copying a buffer of between 1k and 32k bytes
+// we do not use a NEON copy for the main loop. This is because if we happen
+// to be doing a copy from a source in cache to a destination that is not in
+// cache, this will result in an increase in energy usage. In all other cases,
+// NEON gives superior energy conservation.
+
+ sub r3, r2, $0x3c0 // r3 = (bytes remaining) - 1024, since r2 = remaining - 64
+ cmp r3, $0x7c00 // unsigned compare: lower iff 1024 <= remaining < 32768
+ blo L_useSTMDB
+
+// Pre-decrement the source (r1) and destination (ip) pointers so that they
+// point to the first byte of the trailing 32-byte window of each buffer.
+// Additionally, load the address increment of -32 into r3.
+
+ sub r1, $32
+ sub ip, $32
+ mov r3, $-32
+
+// The destination pointer is known to be 64-byte aligned, so we can use the
+// maximal alignment hint (:256) for our vector stores. Detect if the source
+// is also at least 32-byte aligned and jump to a loop that uses maximal
+// alignment hints for the loads as well if possible.
+
+ tst r1, $0x1f
+ beq L_reverseSourceAligned
+
+// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
+// 64-byte aligned destination, in order of descending memory address. Within
+// this loop, registers are used as follows:
+//
+// r0 original destination pointer (unmodified)
+// r1 pointer to the next 32-byte block to load
+// r2 (number of bytes remaining to copy) - 64
+// r3 address increment of -32.
+// ip pointer to which the next 32-byte block is to be stored
+// q0-q3 temporary registers used for copies
+//
+// Note that the loop is arranged in such a way that a single cleanup store is
+// necessary after the final loop iteration. This occurs at label (1), and is
+// shared between the unaligned and aligned loops.
+
+ vld1.32 {q2,q3}, [r1], r3
+ vld1.32 {q0,q1}, [r1], r3
+ subs r2, $64
+ vst1.64 {q2,q3}, [ip,:256], r3
+ blt 1f
+.align 3
+0: vld1.32 {q2,q3}, [r1], r3
+ vst1.64 {q0,q1}, [ip,:256], r3
+ vld1.32 {q0,q1}, [r1], r3
+ subs r2, $64
+ vst1.64 {q2,q3}, [ip,:256], r3
+ bge 0b
+ b 1f
+
+L_reverseSourceAligned:
+
+// This loop is identical to the immediately preceding loop, except that it
+// uses the additional alignment hint that the source pointer (r1) is 32-byte
+// aligned. The two loops share cleanup code for the final iteration.
+
+ vld1.64 {q2,q3}, [r1,:256], r3
+ vld1.64 {q0,q1}, [r1,:256], r3
+ subs r2, $64
+ vst1.64 {q2,q3}, [ip,:256], r3
+ blt 1f
+.align 3
+0: vld1.64 {q2,q3}, [r1,:256], r3
+ vst1.64 {q0,q1}, [ip,:256], r3
+ vld1.64 {q0,q1}, [r1,:256], r3
+ subs r2, $64
+ vst1.64 {q2,q3}, [ip,:256], r3
+ bge 0b
+
+// Final vector store for both of the above loops.
+
+1: vst1.64 {q0,q1}, [ip,:256], r3
+
+// Undo the initial 32-byte pre-decrement of the source and destination
+// pointers, so that each once again points one byte past the next byte to be
+// copied, which is what the cleanup code below expects.
+
+ add r1, $32
+ add ip, $32
+
+L_reverseVectorCleanup:
+
+// Add 56 to r2, so that it contains the number of bytes remaining to copy
+// minus 8. A comparison of this value with zero tells us if any more whole
+// 8-byte blocks need to be copied.
+
+ adds r2, r2, $0x38
+ blt L_scalarReverseCopy
+
+// This loop copies 8 bytes at a time in order of descending memory address,
+// until fewer than 8 bytes remain to be copied. Within this loop, registers
+// are used as follows:
+//
+// r0 original destination pointer
+// r1 pointer to one byte past the next element to be copied
+// r2 (bytes remaining to be copied) - 8
+// ip pointer one byte past the destination of the next byte to be copied
+// d0 temporary storage for copy
+
+0: sub r1, $8
+ vld1.32 {d0}, [r1]
+ sub ip, $8
+ subs r2, $8
+ vst1.64 {d0}, [ip,:64]
+ bge 0b
+
+/*****************************************************************************
+ * sub-doubleword cleanup copies *
+ *****************************************************************************/
+
+L_scalarReverseCopy:
+
+// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
+// return to the calling routine if zero bytes remain.
+
+ adds r2, $8
+ it eq
+ bxeq lr
+
+// Copy one byte at a time in descending address order until we reach the front
+// of the buffer. Within this loop, registers are used as follows:
+//
+// r0 original destination pointer
+// r1 pointer to one byte past the next element to be copied
+// r2 bytes remaining to be copied (non-zero on entry to the loop)
+// r3 temporary to hold the byte that is being copied
+// ip pointer one byte past the destination of the next byte to be copied
+
+0: ldrb r3, [r1, $-1]!
+ subs r2, $1
+ strb r3, [ip, $-1]!
+ bne 0b
+ bx lr
+
+/*****************************************************************************
+ * STMDB loop for 1k-32k buffers *
+ *****************************************************************************/
+
+// This loop copies 64 bytes each iteration in order of descending memory
+// address, using the GPRs instead of NEON.
+//
+// r0 original destination pointer
+// r1 pointer to one byte past the next element to be copied
+// r2 (bytes remaining to be copied) - 64
+// r3-r6,r8-r11 (COPY_REGISTERS) temporary registers used for moving data
+// ip pointer to one byte past the next location to store to
+
+L_useSTMDB:
+ push SAVE_REGISTERS
+.align 3
+0: ldmdb r1!, COPY_REGISTERS
+ subs r2, r2, $64 // sets the flags consumed by the bge at the end of the loop
+ stmdb ip!, COPY_REGISTERS
+ ldmdb r1!, COPY_REGISTERS
+ pld [r1, $-64] // preload hint for the source 64 bytes below r1
+ stmdb ip!, COPY_REGISTERS
+ bge 0b
+ pop SAVE_REGISTERS
+ b L_reverseVectorCleanup
+
+/*****************************************************************************
+ * Misaligned reverse vld1 loop *
+ *****************************************************************************/
+
+// Software alignment fixup to handle source and dest that are relatively
+// misaligned mod 4 bytes.
+//
+// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
+// which we combine with the 8 bytes loaded in the previous iteration to get a
+// 16 byte field; the next 8 bytes to be stored to the destination buffer are
+// somewhere in that field, and we get them using the VEXT instruction:
+//
+// | 8 bytes from this iteration | 8 bytes from last iteration |
+// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+// | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
+// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+// ^8 bytes to store this iteration^ |
+// could be a page boundary
+//
+// We need to be a little bit careful, however. Because the loads only have 4
+// byte alignment, the very first load could slop over into a page that is not
+// mapped readable. In order to prevent this scenario, we copy eight bytes one
+// byte at a time before beginning the main loop.
+//
+// At the beginning of each iteration through the main (vext) loop, registers
+// are used as follows:
+//
+// r0 original destination pointer
+// r1 pointer to the next block of 8 bytes to load
+// r2 (bytes remaining to store) - 16
+// ip pointer to the next block of 8 bytes to store
+// d0 scratch; receives the next 8 bytes to store (vext of d2:d3)
+// d2 newer (lower-addressed) 8 bytes of the 16-byte window
+// d3 older (higher-addressed) 8 bytes of the 16-byte window
+
+#define RCOPY_UNALIGNED(offset) \
+0: ldrb r3, [r1,$-1]! ;\
+ strb r3, [ip,$-1]! ;\
+ subs r2, $1 ;\
+ blt L_scalarReverseCopy ;\
+ tst ip, $7 ;\
+ bne 0b ;\
+ bic r1, $3 ;\
+ sub r1, $8 ;\
+ sub ip, $8 ;\
+ mov r3, $-8 ;\
+ vld1.32 {d2,d3}, [r1], r3 ;\
+ subs r2, $8 ;\
+ blt 1f ;\
+0: vext.8 d0, d2, d3, $(offset);\
+ vmov d3, d2 ;\
+ vld1.32 {d2}, [r1], r3 ;\
+ subs r2, $8 ;\
+ vst1.64 {d0}, [ip, :64], r3 ;\
+ bge 0b ;\
+1: vext.8 d0, d2, d3, $(offset);\
+ add r1, $8 ;\
+ vst1.64 {d0}, [ip, :64] ;\
+2: add r1, $(offset);\
+ b L_scalarReverseCopy
+
+L_reverseAligned1:
+ RCOPY_UNALIGNED(1)
+L_reverseAligned2:
+ RCOPY_UNALIGNED(2)
+L_reverseAligned3:
+ RCOPY_UNALIGNED(3)
+
+/*****************************************************************************
+ * front to back copy *
+ *****************************************************************************/
+
+L_copyFrontToBack:
+
+// Here the pointers are laid out such that we can use our preferred
+// front-to-back copy. We preserve the original destination pointer in r0
+// because it is the return value for the routine, and copy it to ip to use as
+// the working destination pointer in this routine.
+
+ mov ip, r0
+
+// Subtract 8 from the buffer length; if this is negative, then we will use
+// only single-byte copies, and we jump directly to a scalar copy loop.
+
+ subs r2, $8
+ blt L_scalarCopy
+
+// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
+// to move the data.
+
+ tst ip, $7
+ beq L_vectorCopy
+
+// Otherwise, we copy a single byte at a time, in order of ascending memory
+// address, until the destination is 8 byte aligned. Within this loop,
+// registers are used as follows:
+//
+// r0 original destination pointer
+// r1 pointer to the next byte to copy
+// r2 (bytes remaining to be copied) - 8
+// r3 temporary to hold the byte that is being copied
+// ip pointer to the next byte to store to
+
+0: ldrb r3, [r1], $1
+ sub r2, $1 // no flag update; the tst below drives the loop
+ strb r3, [ip], $1
+ tst ip, $7
+ bne 0b
+
+// At this point, the destination pointer is 8 byte aligned. Check again that
+// there are at least 8 bytes remaining to copy by comparing the remaining
+// length minus 8 to zero. If fewer than 8 bytes remain, jump to the cleanup
+// path.
+
+ cmp r2, $0
+ blt L_scalarCopy
+
+/*****************************************************************************
+ * destination is doubleword aligned *
+ *****************************************************************************/
+
+L_vectorCopy:
+
+// At this point, registers contain the following values:
+//
+// r0 original destination pointer
+// r1 pointer to the next element to be copied
+// r2 (bytes remaining to copy) - 8
+// ip pointer to the destination of the next byte to be copied
+//
+// Furthermore, it is known that ip is 8 byte aligned, and r2 is non-negative.
+// NEON has really excellent alignment handling in hardware, so we would like
+// to use that to handle cases where the source is not similarly aligned to the
+// destination (it supports even single-byte misalignment at speed). However,
+// on some SoC designs, not all of the DMA busses support such access. Thus,
+// we must unfortunately use a software workaround in those cases.
+//
+// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
+// we only need to handle the different possible source alignments modulo 4.
+// Here we have a dispatch table to jump to the correct copy implementation
+// for the given source alignment.
+//
+// The tbh instruction loads a halfword from the data table that immediately
+// follows it (indexed by r3), doubles it, and adds the result to the pc to
+// jump to the correct branch; hence the table entries are offsets divided by 2.
+
+ ands r3, r1, $3 // r3 = source address modulo 4 (table index)
+ bic r1, $3 // round r1 down to a 4-byte boundary; the misaligned paths add the offset back
+ tbh [pc, r3, lsl $1]
+0:
+.short (L_sourceAligned0-0b)/2
+.short (L_sourceAligned1-0b)/2
+.short (L_sourceAligned2-0b)/2
+.short (L_sourceAligned3-0b)/2
+
+/*****************************************************************************
+ * source is also at least word aligned *
+ *****************************************************************************/
+
+L_sourceAligned0:
+
+// Subtract 56 from r2, so that it contains the number of bytes remaining to
+// copy minus 64. If this result is negative, then we jump into a loop that
+// copies 8 bytes at a time.
+
+ subs r2, $0x38
+ blt L_vectorCleanup
+
+// Check if the destination pointer is 64-byte aligned. If so, jump to a loop
+// that copies whole cachelines.
+
+ tst ip, $0x38 // ip is already 8-byte aligned, so bits 3-5 clear means 64-byte aligned
+ beq L_cachelineAligned
+
+// Otherwise, we copy 8 bytes at a time, in order of ascending memory
+// address, until the destination is 64 byte aligned. Within this loop,
+// registers are used as follows:
+//
+// r0 original destination pointer
+// r1 pointer to the next element to be copied
+// r2 (bytes remaining to be copied) - 64
+// ip pointer to the destination of the next byte to be copied
+// d0 temporary storage for copy
+
+0: vld1.32 {d0}, [r1]!
+ sub r2, $8 // no flag update; the tst below drives the loop
+ vst1.64 {d0}, [ip,:64]!
+ tst ip, $0x38
+ bne 0b
+
+// At this point, the destination pointer is 64 byte aligned. Check again that
+// there are at least 64 bytes remaining to copy by comparing the remaining
+// length minus 64 to zero. If fewer than 64 bytes remain, skip over the main
+// copy loop.
+
+ cmp r2, $0
+ blt L_vectorCleanup
+
+/*****************************************************************************
+ * destination is cacheline aligned *
+ *****************************************************************************/
+
+// In the special case that we are copying a buffer of between 1k and 32k bytes
+// we do not use a NEON copy for the main loop. This is because if we happen
+// to be doing a copy from a source in cache to a destination that is not in
+// cache, this will result in an increase in energy usage. In all other cases,
+// NEON gives superior energy conservation.
+
+L_cachelineAligned:
+ sub r3, r2, $0x3c0 // r3 = (bytes remaining) - 1024, since r2 = remaining - 64
+ cmp r3, $0x7c00 // unsigned compare: lower iff 1024 <= remaining < 32768
+ blo L_useSTMIA
+
+// The destination pointer is known to be 64-byte aligned, so we can use the
+// maximal alignment hint (:256) for our vector stores. Detect if the source
+// is also at least 32-byte aligned and jump to a loop that uses maximal
+// alignment hints for the loads as well if possible.
+
+ tst r1, $0x1f
+ beq L_sourceAligned32
+
+// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
+// 64-byte aligned destination, in order of ascending memory address. Within
+// this loop, registers are used as follows:
+//
+// r0 original destination pointer (unmodified)
+// r1 pointer to the next 32-byte block to load
+// r2 (number of bytes remaining to copy) - 64
+// ip pointer to which the next 32-byte block is to be stored
+// q0-q3 temporary registers used for copies
+//
+// Note that the loop is arranged in such a way that a single cleanup store is
+// necessary after the final loop iteration. This occurs at label (1), and is
+// shared between the unaligned and aligned loops.
+
+ vld1.32 {q2,q3}, [r1]!
+ vld1.32 {q0,q1}, [r1]!
+ subs r2, $64
+ vst1.64 {q2,q3}, [ip,:256]!
+ blt 1f
+.align 3
+0: vld1.32 {q2,q3}, [r1]!
+ vst1.64 {q0,q1}, [ip,:256]!
+ vld1.32 {q0,q1}, [r1]!
+ subs r2, $64
+ vst1.64 {q2,q3}, [ip,:256]!
+ bge 0b
+ b 1f
+
+L_sourceAligned32:
+
+// This loop is identical to the immediately preceding loop, except that it
+// uses the additional alignment hint that the source pointer (r1) is 32-byte
+// aligned. The two loops share cleanup code for the final iteration.
+
+ vld1.64 {q2,q3}, [r1,:256]!
+ vld1.64 {q0,q1}, [r1,:256]!
+ subs r2, $64
+ vst1.64 {q2,q3}, [ip,:256]!
+ blt 1f
+.align 3
+0: vld1.64 {q2,q3}, [r1,:256]!
+ vst1.64 {q0,q1}, [ip,:256]!
+ vld1.64 {q0,q1}, [r1,:256]!
+ subs r2, $64
+ vst1.64 {q2,q3}, [ip,:256]!
+ bge 0b
+
+// Final vector store for both of the above loops.
+
+1: vst1.64 {q0,q1}, [ip,:256]!
+
+L_vectorCleanup:
+
+// Add 56 to r2, so that it contains the number of bytes remaining to copy
+// minus 8. A comparison of this value with zero tells us if any more whole
+// 8-byte blocks need to be copied.
+
+ adds r2, $0x38
+ blt L_scalarCopy
+
+// This loop copies 8 bytes at a time in order of ascending memory address,
+// until fewer than 8 bytes remain to be copied. Within this loop, registers
+// are used as follows:
+//
+// r0 original destination pointer
+// r1 pointer to the next element to be copied
+// r2 (bytes remaining to be copied) - 8
+// ip pointer to the destination of the next byte to be copied
+// d0 temporary storage for copy
+
+0: vld1.32 {d0}, [r1]!
+ subs r2, $8
+ vst1.64 {d0}, [ip,:64]!
+ bge 0b
+
+/*****************************************************************************
+ * sub-doubleword cleanup copies *
+ *****************************************************************************/
+
+L_scalarCopy:
+
+// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
+// return to the calling routine if zero bytes remain.
+
+ adds r2, $8
+ it eq
+ bxeq lr
+
+// Copy one byte at a time in ascending address order until we reach the end
+// of the buffer. Within this loop, registers are used as follows:
+//
+// r0 original destination pointer
+// r1 pointer to the next byte to be copied
+// r2 bytes remaining to be copied (non-zero on entry to the loop)
+// r3 temporary to hold the byte that is being copied
+// ip pointer to the destination of the next byte to be copied
+
+0: ldrb r3, [r1], $1
+ strb r3, [ip], $1
+ subs r2, $1
+ bne 0b
+ bx lr
+
+/*****************************************************************************
+ * STMIA loop for 1k-32k buffers *
+ *****************************************************************************/
+
+// This loop copies 64 bytes each iteration in order of ascending memory
+// address, using the GPRs instead of NEON.
+//
+// r0 original destination pointer
+// r1 pointer to the next element to be copied
+// r2 (bytes remaining to be copied) - 64
+// r3-r6,r8-r11 (COPY_REGISTERS) temporary registers used for moving data
+// ip pointer to the next location to store to
+
+L_useSTMIA:
+ push SAVE_REGISTERS
+.align 3
+0: ldmia r1!, COPY_REGISTERS
+ subs r2, r2, $64 // sets the flags consumed by the bge at the end of the loop
+ stmia ip!, COPY_REGISTERS
+ ldmia r1!, COPY_REGISTERS
+ pld [r1, $64] // preload hint for the source 64 bytes above r1
+ stmia ip!, COPY_REGISTERS
+ bge 0b
+ pop SAVE_REGISTERS
+ b L_vectorCleanup
+
+/*****************************************************************************
+ * Misaligned forward vld1 loop *
+ *****************************************************************************/
+
+// Software alignment fixup to handle source and dest that are relatively
+// misaligned mod 4 bytes.
+//
+// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
+// which we combine with the 8 bytes loaded in the previous iteration to get a
+// 16 byte field; the next 8 bytes to be stored to the destination buffer are
+// somewhere in that field, and we get them using the VEXT instruction:
+//
+// | 8 bytes from last iteration | 8 bytes from this iteration |
+// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+// | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
+// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+// ^8 bytes to store this iteration^ |
+// could be a page boundary
+//
+// We need to be a little bit careful, however. Because the loads only have 4
+// byte alignment, if we used this approach all the way to the end of the
+// buffer, the very last 8 byte load might slop over onto a new page by 4
+// bytes, and that new page might not be mapped into our process. Thus, we
+// terminate this copy loop when fewer than 12 bytes remain to be copied,
+// instead of the more natural-seeming termination condition of "8 bytes
+// remaining" (the illustration above shows the worst case and demonstrates
+// why 12 is a sufficiently safe condition).
+//
+// At the beginning of each iteration through the main (vext) loop, registers
+// are used as follows:
+//
+// r0 original destination pointer
+// r1 pointer to the next block of 8 bytes to load
+// r2 (bytes remaining to store) - 20
+// ip pointer to the next block of 8 bytes to store
+// d0 scratch; receives the next 8 bytes to store (vext of d2:d3)
+// d2 older (lower-addressed) 8 bytes of the 16-byte window
+// d3 newer (higher-addressed) 8 bytes of the 16-byte window
+
+#define COPY_UNALIGNED(offset) \
+ subs r2, $4 ;\
+ blt 2f ;\
+ vld1.32 {d2,d3}, [r1]! ;\
+ subs r2, $8 ;\
+ blt 1f ;\
+0: vext.8 d0, d2, d3, $(offset);\
+ vmov d2, d3 ;\
+ vld1.32 {d3}, [r1]! ;\
+ subs r2, $8 ;\
+ vst1.64 {d0}, [ip, :64]! ;\
+ bge 0b ;\
+1: vext.8 d0, d2, d3, $(offset);\
+ sub r1, $8 ;\
+ vst1.64 {d0}, [ip, :64]! ;\
+2: add r1, $(offset);\
+ add r2, $4 ;\
+ b L_scalarCopy
+
+L_sourceAligned1:
+ COPY_UNALIGNED(1)
+L_sourceAligned2:
+ COPY_UNALIGNED(2)
+L_sourceAligned3:
+ COPY_UNALIGNED(3)
+
+#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD