Loading...
--- Libc/Libc-825.24/arm/string/bcopy_CortexA8.s
+++ /dev/null
@@ -1,863 +0,0 @@
-/*
- * Copyright (c) 2009 Apple Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_LICENSE_HEADER_END@
- */
-
-#include <arm/arch.h>
-#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
-
-/*****************************************************************************
- * Cortex-A8 implementation *
- *****************************************************************************/
-
-// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
-//
-// Our tests have shown that NEON is always a performance win for memcpy( ).
-// However, for the specific case of copies from a warm source to a cold
-// destination when the buffer size is between 1k and 32k, it is not enough
-// of a performance win to offset the increased power footprint, resulting
-// in an energy usage regression. Thus, we detect that particular case, and
-// pass those copies through the ARM core registers. All other copies larger
-// than 8 bytes are handled on NEON.
-//
-// Stephen Canon, August 2009
-
-.text
-.code 16
-.syntax unified
-
-// void bcopy(const void * source,
-// void * destination,
-// size_t length);
-//
-// void *memmove(void * destination,
-// const void * source,
-// size_t n);
-//
-// void *memcpy(void * restrict destination,
-// const void * restrict source,
-// size_t n);
-//
-// all copy n successive bytes from source to destination. memmove and memcpy
-// returns destination, whereas bcopy has no return value. copying takes place
-// as if it were through a temporary buffer -- after return destination contains
-// exactly the bytes from source, even if the buffers overlap.
-
-.thumb_func _bcopy$VARIANT$CortexA8
-.thumb_func _memmove$VARIANT$CortexA8
-.thumb_func _memcpy$VARIANT$CortexA8
-.globl _bcopy$VARIANT$CortexA8
-.globl _memmove$VARIANT$CortexA8
-.globl _memcpy$VARIANT$CortexA8
-
-#define SAVE_REGISTERS {r4,r5,r6,r8,r10,r11}
-#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r11}
-
-/*****************************************************************************
- * entry points *
- *****************************************************************************/
-
-.align 2
-_bcopy$VARIANT$CortexA8:
-
-// bcopy has the first and second arguments in the opposite order as the C
-// library functions memmove and memcpy. If bcopy is called, we swap these
-// two arguments and then fall into memmove.
-
- mov r3, r0
- mov r0, r1
- mov r1, r3
-
-.align 2
-_memmove$VARIANT$CortexA8:
-_memcpy$VARIANT$CortexA8:
-
-// At entry to memmove/memcpy, registers contain the following values:
-//
-// r0 pointer to the first byte of the destination buffer
-// r1 pointer to the first byte of the source buffer
-// r2 number of bytes to copy
-//
-// Our preference is to use a (faster and easier to understand) front-to-back
-// copy of the buffer. However, memmove requires that copies take place as
-// though through a temporary buffer. This means that if the buffers overlap,
-// it may be necessary to copy the buffer in reverse order.
-//
-// To properly detect such overlap, we begin by computing the offset between
-// the source and destination pointers. If the offset happens to be zero,
-// then there is no work to be done, so we can early out.
-
- subs r3, r0, r1
- it eq
- bxeq lr
-
-// r3 now contains the offset between the buffers, (destination - source). If
-// 0 < offset < length, then the high-addressed bits of the source alias the
-// low addressed bytes of the destination. Thus, if we were to perform the
-// copy in ascending address order, we would overwrite the high-addressed
-// source bytes before we had a chance to copy them, and the data would be lost.
-//
-// Thus, we can use the front-to-back copy only if offset is negative or
-// greater than the length. This is the case precisely if offset compares
-// unsigned higher than length.
-
- cmp r3, r2
- bhs L_copyFrontToBack
-
-/*****************************************************************************
- * back to front copy *
- *****************************************************************************/
-
-// Here we have fallen through into the back-to-front copy. We preserve the
-// original destination pointer in r0 because it is the return value for the
-// routine, and update the other registers as follows:
-//
-// r1 one byte beyond the end of the destination buffer
-// r2 number of bytes to copy
-// ip one byte beyond the end of the destination buffer
-
- mov ip, r0
- add r1, r2
- add ip, r2
-
-// Subtract 8 from the buffer length; if this is negative, then we will use
-// only single-byte copies, and we jump directly to a scalar copy loop.
-
- subs r2, $8
- blt L_scalarReverseCopy
-
-// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
-// to move the data.
-
- tst ip, $7
- beq L_vectorReverseCopy
-
-// Otherwise, we copy a single byte at a time, in order of descending memory
-// address, until the destination is 8 byte aligned. Within this loop,
-// registers are used as follows:
-//
-// r0 original destination pointer
-// r1 pointer to one byte past the next element to be copied
-// r2 (bytes remaining to be copied) - 8
-// r3 temporary to hold the byte that is being copied
-// ip pointer one byte past the destination of the next byte to be copied
-//
-// byte that will be copied in this iteration
-// | byte that was copied in the previous iteration
-// Source buffer: v v
-// ------------------------+---+---+-------------------------
-// bytes still to copy ... | | | ... bytes already copied
-// ------------------------+---+---+-------------------------
-// ^
-// r1 holds the address of this byte
-
-0: ldrb r3, [r1, $-1]!
- sub r2, $1
- strb r3, [ip, $-1]!
- tst ip, $7
- bne 0b
-
-// At this point, the destination pointer is 8 byte aligned. Check again that
-// there are at least 8 bytes remaining to copy by comparing the remaining
-// length minus 8 to zero. If fewer than 8 bytes remain, jump to the cleanup
-// path.
-
- cmp r2, $0
- blt L_scalarReverseCopy
-
-/*****************************************************************************
- * destination is 8 byte aligned *
- *****************************************************************************/
-
-L_vectorReverseCopy:
-
-// At this point, registers contain the following values:
-//
-// r0 original destination pointer
-// r1 pointer to one byte past the next element to be copied
-// r2 (bytes remaining to copy) - 8
-// ip pointer one byte past the destination of the next byte to be copied
-//
-// Furthermore, it is known that ip is 8 byte aligned, and that r2 is positive.
-// NEON has really excellent alignment handling in hardware, so we would like
-// to use that to handle cases where the source is not similarly aligned to the
-// destination (it supports even single-byte misalignment at speed). However,
-// on some SoC designs, not all of the DMA busses support such access. Thus,
-// we must unfortunately use a software workaround in those cases.
-//
-// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
-// we only need to handle the different possible source alignments modulo 4.
-// Here we have a dispatch table to jump to the correct copy implementation
-// for the given source alignment.
-//
-// The tbh instruction loads the address offset of the correct implementation
-// from the data table that immediately follows it and adds it to the pc to
-// jump to the correct branch.
-
- ands r3, r1, $3
- tbh [pc, r3, lsl $1]
-0:
-.short (L_reverseAligned0-0b)/2
-.short (L_reverseAligned1-0b)/2
-.short (L_reverseAligned2-0b)/2
-.short (L_reverseAligned3-0b)/2
-
-/*****************************************************************************
- * source is also at least word aligned *
- *****************************************************************************/
-
-L_reverseAligned0:
-
-// Subtract 56 from r2, so that it contains the number of bytes remaining to
-// copy minus 64. If this result is negative, then we jump into a loop that
-// copies 8 bytes at a time.
-
- subs r2, $0x38
- blt L_reverseVectorCleanup
-
-// Check if the destination pointer is 64-byte aligned. If so, jump to a loop
-// that copies whole cachelines.
-
- tst ip, $0x38
- beq L_reverseCachelineAligned
-
-// Otherwise, we copy a 8 bytes at a time, in order of descending memory
-// address, until the destination is 64 byte aligned. Within this loop,
-// registers are used as follows:
-//
-// r0 original destination pointer
-// r1 pointer to one byte past the next element to be copied
-// r2 (bytes remaining to be copied) - 64
-// ip pointer one byte past the destination of the next byte to be copied
-// d0 temporary storage for copy
-//
-// bytes that will be copied after this iteration
-// | 8 byte block that will be copied in this iteration
-// v v
-// --------------+-------------------------------+---------------------
-// | 0 1 2 3 4 5 6 7 | bytes already copied
-// --------------+-------------------------------+---------------------
-// ^
-// r1 points here
-
-0: sub r1, $8
- vld1.32 {d0}, [r1]
- sub ip, $8
- sub r2, $8
- tst ip, $0x38
- vst1.64 {d0}, [ip,:64]
- bne 0b
-
-// At this point, the destination pointer is 64 byte aligned. Check again that
-// there are at least 64 bytes remaining to copy by comparing the remaining
-// length minus 64 to zero. If fewer than 64 bytes remain, skip over the main
-// copy loop.
-
- cmp r2, $0
- blt L_reverseVectorCleanup
-
-/*****************************************************************************
- * destination is cacheline aligned *
- *****************************************************************************/
-
-L_reverseCachelineAligned:
-
-// In the special case that we are copying a buffer of between 1k and 32k bytes
-// we do not use a NEON copy for the main loop. This is because if we happen
-// to be doing a copy from a source in cache to a destination that is not in
-// cache, this will result in an increase in energy usage. In all other cases,
-// NEON gives superior energy conservation.
-
- sub r3, r2, $0x3c0
- cmp r3, $0x7c00
- blo L_useSTMDB
-
-// Pre-decrement the source (r1) and destination (ip) pointers so that they
-// point to the first byte of the trailing 32-byte window of each buffer.
-// Additionally, load the address increment of -32 into r3.
-
- sub r1, $32
- sub ip, $32
- mov r3, $-32
-
-// The destination pointer is known to be 64-byte aligned, so we can use the
-// maximal alignment hint (:256) for our vector stores. Detect if the source
-// is also at least 32-byte aligned and jump to a loop that uses maximal
-// alignment hints for the loads as well if possible.
-
- tst r1, $0x1f
- beq L_reverseSourceAligned
-
-// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
-// 64-byte aligned destination, in order of descending memory address. Within
-// this loop, registers are used as follows:
-//
-// r0 original destination pointer (unmodified)
-// r1 pointer to the next 32-byte block to load
-// r2 (number of bytes remaining to copy) - 64
-// r3 address increment of -32.
-// ip pointer to which the next 32-byte block is to be stored
-// q0-q3 temporary registers used for copies
-//
-// Note that the loop is arrange in such a way that a single cleanup store is
-// necessary after the final loop iteration. This occurs at label (1), and is
-// shared between the unaligned and aligned loops.
-
- vld1.32 {q2,q3}, [r1], r3
- vld1.32 {q0,q1}, [r1], r3
- subs r2, $64
- vst1.64 {q2,q3}, [ip,:256], r3
- blt 1f
-.align 3
-0: vld1.32 {q2,q3}, [r1], r3
- vst1.64 {q0,q1}, [ip,:256], r3
- vld1.32 {q0,q1}, [r1], r3
- subs r2, $64
- vst1.64 {q2,q3}, [ip,:256], r3
- bge 0b
- b 1f
-
-L_reverseSourceAligned:
-
-// This loop is identical to the immediately preceeding loop, except that it
-// uses the additional alignment hint that the source pointer (r1) is 32-byte
-// aligned. The two loops share cleanup code for the final iteration.
-
- vld1.64 {q2,q3}, [r1,:256], r3
- vld1.64 {q0,q1}, [r1,:256], r3
- subs r2, $64
- vst1.64 {q2,q3}, [ip,:256], r3
- blt 1f
-.align 3
-0: vld1.64 {q2,q3}, [r1,:256], r3
- vst1.64 {q0,q1}, [ip,:256], r3
- vld1.64 {q0,q1}, [r1,:256], r3
- subs r2, $64
- vst1.64 {q2,q3}, [ip,:256], r3
- bge 0b
-
-// Final vector store for both of the above loops.
-
-1: vst1.64 {q0,q1}, [ip,:256], r3
-
-// Adjust the source and destination pointers so that they once again point to
-// the last byte that we used (which is one byte higher than the address that
-// we will use next for any required cleanup).
-
- add r1, $32
- add ip, $32
-
-L_reverseVectorCleanup:
-
-// Add 56 to r2, so that it contains the number of bytes remaing to copy minus
-// 8. A comparison of this value with zero tells us if any more whole 8-byte
-// blocks need to be copied.
-
- adds r2, r2, $0x38
- blt L_scalarReverseCopy
-
-// This loop copies 8 bytes at a time in order of descending memory address,
-// until fewer than 8 bytes remain to be copied. Within this loop, registers
-// are used as follows:
-//
-// r0 original destination pointer
-// r1 pointer to one byte past the next element to be copied
-// r2 (bytes remaining to be copied) - 64
-// ip pointer one byte past the destination of the next byte to be copied
-// d0 temporary storage for copy
-
-0: sub r1, $8
- vld1.32 {d0}, [r1]
- sub ip, $8
- subs r2, $8
- vst1.64 {d0}, [ip,:64]
- bge 0b
-
-/*****************************************************************************
- * sub-doubleword cleanup copies *
- *****************************************************************************/
-
-L_scalarReverseCopy:
-
-// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
-// return to the calling routine if zero bytes remain.
-
- adds r2, $8
- it eq
- bxeq lr
-
-// Copy one byte at a time in descending address order until we reach the front
-// of the buffer. Within this loop, registers are used as follows:
-//
-// r0 original destination pointer
-// r1 pointer to one byte past the next element to be copied
-// r2 (bytes remaining to be copied) - 8
-// r3 temporary to hold the byte that is being copied
-// ip pointer one byte past the destination of the next byte to be copied
-
-0: ldrb r3, [r1, $-1]!
- subs r2, $1
- strb r3, [ip, $-1]!
- bne 0b
- bx lr
-
-/*****************************************************************************
- * STMDB loop for 1k-32k buffers *
- *****************************************************************************/
-
-// This loop copies 64 bytes each iteration in order of descending memory
-// address, using the GPRs instead of NEON.
-//
-// r0 original destination pointer
-// r1 pointer to one byte past the next element to be copied
-// r2 (bytes remaining to be copied) - 64
-// r3-6,r8-11 (COPY_REGISTERS) temporary registers used for moving data
-// ip pointer to one byte past the next location to store to
-
-L_useSTMDB:
- push SAVE_REGISTERS
-.align 3
-0: ldmdb r1!, COPY_REGISTERS
- subs r2, r2, $64
- stmdb ip!, COPY_REGISTERS
- ldmdb r1!, COPY_REGISTERS
- pld [r1, $-64]
- stmdb ip!, COPY_REGISTERS
- bge 0b
- pop SAVE_REGISTERS
- b L_reverseVectorCleanup
-
-/*****************************************************************************
- * Misaligned reverse vld1 loop *
- *****************************************************************************/
-
-// Software alignment fixup to handle source and dest that are relatively
-// misaligned mod 4 bytes.
-//
-// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
-// which we combine with the 8 bytes loaded in the previous iteration to get a
-// 16 byte field; the next 8 bytes to be stored to the destination buffer are
-// somewhere in that field, and we get them using the VEXT instruction:
-//
-// | 8 bytes from this iteration | 8 bytes from last iteration |
-// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
-// | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
-// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
-// ^8 bytes to store this iteration^ |
-// could be a page boundary
-//
-// We need to be a little bit careful, however. Because the loads only have 4
-// byte alignment, the very first load could slop over into a page that is not
-// mapped readable. In order to prevent this scenario, we copy eight bytes
-// using byte-by-byte before beginning the main loop.
-//
-// At the beginning of each iteration through this loop, registers are used
-// as follows:
-//
-// r0 original destination pointer
-// r1 pointer to the next block of 8 bytes to load
-// r2 (bytes remaining to copy) - 8
-// ip pointer to the next block of 8 bytes to store
-// d0 next 8 bytes to store
-// d2 8 bytes loaded in the previous iteration
-// d3 8 bytes loaded two iterations ago
-
-#define RCOPY_UNALIGNED(offset) \
-0: ldrb r3, [r1,$-1]! ;\
- strb r3, [ip,$-1]! ;\
- subs r2, $1 ;\
- blt L_scalarReverseCopy ;\
- tst ip, $7 ;\
- bne 0b ;\
- bic r1, $3 ;\
- sub r1, $8 ;\
- sub ip, $8 ;\
- mov r3, $-8 ;\
- vld1.32 {d2,d3}, [r1], r3 ;\
- subs r2, $8 ;\
- blt 1f ;\
-0: vext.8 d0, d2, d3, $(offset);\
- vmov d3, d2 ;\
- vld1.32 {d2}, [r1], r3 ;\
- subs r2, $8 ;\
- vst1.64 {d0}, [ip, :64], r3 ;\
- bge 0b ;\
-1: vext.8 d0, d2, d3, $(offset);\
- add r1, $8 ;\
- vst1.64 {d0}, [ip, :64] ;\
-2: add r1, $(offset);\
- b L_scalarReverseCopy
-
-L_reverseAligned1:
- RCOPY_UNALIGNED(1)
-L_reverseAligned2:
- RCOPY_UNALIGNED(2)
-L_reverseAligned3:
- RCOPY_UNALIGNED(3)
-
-/*****************************************************************************
- * front to back copy *
- *****************************************************************************/
-
-L_copyFrontToBack:
-
-// Here the pointers are laid out such that we can use our preferred
-// front-to-back copy. We preserve original destination pointer in r0 because
-// it is the return value for the routine, and copy it to ip to use in this
-// routine.
-
- mov ip, r0
-
-// Subtract 8 from the buffer length; if this is negative, then we will use
-// only single-byte copies, and we jump directly to a scalar copy loop.
-
- subs r2, $8
- blt L_scalarCopy
-
-// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
-// to move the data.
-
- tst ip, $7
- beq L_vectorCopy
-
-// Otherwise, we copy a single byte at a time, in order of ascending memory
-// address, until the destination is 8 byte aligned. Within this loop,
-// registers are used as follows:
-//
-// r0 original destination pointer
-// r1 pointer to the next byte to copy
-// r2 (bytes remaining to be copied) - 8
-// r3 temporary to hold the byte that is being copied
-// ip pointer to the next byte to store to
-
-0: ldrb r3, [r1], $1
- sub r2, $1
- strb r3, [ip], $1
- tst ip, $7
- bne 0b
-
-// At this point, the destination pointer is 8 byte aligned. Check again that
-// there are at least 8 bytes remaining to copy by comparing the remaining
-// length minus 8 to zero. If fewer than 8 bytes remain, jump to the cleanup
-// path.
-
- cmp r2, $0
- blt L_scalarCopy
-
-/*****************************************************************************
- * destination is doubleword aligned *
- *****************************************************************************/
-
-L_vectorCopy:
-
-// At this point, registers contain the following values:
-//
-// r0 original destination pointer
-// r1 pointer to the next element to be copied
-// r2 (bytes remaining to copy) - 8
-// ip pointer to the destination of the next byte to be copied
-//
-// Furthermore, it is known that ip is 8 byte aligned, and that r2 is positive.
-// NEON has really excellent alignment handling in hardware, so we would like
-// to use that to handle cases where the source is not similarly aligned to the
-// destination (it supports even single-byte misalignment at speed). However,
-// on some SoC designs, not all of the DMA busses support such access. Thus,
-// we must unfortunately use a software workaround in those cases.
-//
-// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
-// we only need to handle the different possible source alignments modulo 4.
-// Here we have a dispatch table to jump to the correct copy implementation
-// for the given source alignment.
-//
-// The tbh instruction loads the address offset of the correct implementation
-// from the data table that immediately follows it and adds it to the pc to
-// jump to the correct branch.
-
- ands r3, r1, $3
- bic r1, $3
- tbh [pc, r3, lsl $1]
-0:
-.short (L_sourceAligned0-0b)/2
-.short (L_sourceAligned1-0b)/2
-.short (L_sourceAligned2-0b)/2
-.short (L_sourceAligned3-0b)/2
-
-/*****************************************************************************
- * source is also at least word aligned *
- *****************************************************************************/
-
-L_sourceAligned0:
-
-// Subtract 56 from r2, so that it contains the number of bytes remaining to
-// copy minus 64. If this result is negative, then we jump into a loop that
-// copies 8 bytes at a time.
-
- subs r2, $0x38
- blt L_vectorCleanup
-
-// Check if the destination pointer is 64-byte aligned. If so, jump to a loop
-// that copies whole cachelines.
-
- tst ip, $0x38
- beq L_cachelineAligned
-
-// Otherwise, we copy a 8 bytes at a time, in order of ascending memory
-// address, until the destination is 64 byte aligned. Within this loop,
-// registers are used as follows:
-//
-// r0 original destination pointer
-// r1 pointer to the next element to be copied
-// r2 (bytes remaining to be copied) - 64
-// ip pointer to the destination of the next byte to be copied
-// d0 temporary storage for copy
-
-0: vld1.32 {d0}, [r1]!
- sub r2, $8
- vst1.64 {d0}, [ip,:64]!
- tst ip, $0x38
- bne 0b
-
-// At this point, the destination pointer is 64 byte aligned. Check again that
-// there are at least 64 bytes remaining to copy by comparing the remaining
-// length minus 64 to zero. If fewer than 64 bytes remain, skip over the main
-// copy loop.
-
- cmp r2, $0
- blt L_vectorCleanup
-
-/*****************************************************************************
- * destination is cacheline aligned *
- *****************************************************************************/
-
-// In the special case that we are copying a buffer of between 1k and 32k bytes
-// we do not use a NEON copy for the main loop. This is because if we happen
-// to be doing a copy from a source in cache to a destination that is not in
-// cache, this will result in an increase in energy usage. In all other cases,
-// NEON gives superior energy conservation.
-
-L_cachelineAligned:
- sub r3, r2, $0x3c0
- cmp r3, $0x7c00
- blo L_useSTMIA
-
-// The destination pointer is known to be 64-byte aligned, so we can use the
-// maximal alignment hint (:256) for our vector stores. Detect if the source
-// is also at least 32-byte aligned and jump to a loop that uses maximal
-// alignment hints for the loads as well if possible.
-
- tst r1, $0x1f
- beq L_sourceAligned32
-
-// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
-// 64-byte aligned destination, in order of ascending memory address. Within
-// this loop, registers are used as follows:
-//
-// r0 original destination pointer (unmodified)
-// r1 pointer to the next 32-byte block to load
-// r2 (number of bytes remaining to copy) - 64
-// ip pointer to which the next 32-byte block is to be stored
-// q0-q3 temporary registers used for copies
-//
-// Note that the loop is arrange in such a way that a single cleanup store is
-// necessary after the final loop iteration. This occurs at label (1), and is
-// shared between the unaligned and aligned loops.
-
- vld1.32 {q2,q3}, [r1]!
- vld1.32 {q0,q1}, [r1]!
- subs r2, $64
- vst1.64 {q2,q3}, [ip,:256]!
- blt 1f
-.align 3
-0: vld1.32 {q2,q3}, [r1]!
- vst1.64 {q0,q1}, [ip,:256]!
- vld1.32 {q0,q1}, [r1]!
- subs r2, $64
- vst1.64 {q2,q3}, [ip,:256]!
- bge 0b
- b 1f
-
-L_sourceAligned32:
-
-// This loop is identical to the immediately preceeding loop, except that it
-// uses the additional alignment hint that the source pointer (r1) is 32-byte
-// aligned. The two loops share cleanup code for the final iteration.
-
- vld1.64 {q2,q3}, [r1,:256]!
- vld1.64 {q0,q1}, [r1,:256]!
- subs r2, $64
- vst1.64 {q2,q3}, [ip,:256]!
- blt 1f
-.align 3
-0: vld1.64 {q2,q3}, [r1,:256]!
- vst1.64 {q0,q1}, [ip,:256]!
- vld1.64 {q0,q1}, [r1,:256]!
- subs r2, $64
- vst1.64 {q2,q3}, [ip,:256]!
- bge 0b
-
-// Final vector store for both of the above loops.
-
-1: vst1.64 {q0,q1}, [ip,:256]!
-
-L_vectorCleanup:
-
-// Add 56 to r2, so that it contains the number of bytes remaing to copy minus
-// 8. A comparison of this value with zero tells us if any more whole 8-byte
-// blocks need to be copied.
-
- adds r2, $0x38
- blt L_scalarCopy
-
-// This loop copies 8 bytes at a time in order of descending memory address,
-// until fewer than 8 bytes remain to be copied. Within this loop, registers
-// are used as follows:
-//
-// r0 original destination pointer
-// r1 pointer to the next element to be copied
-// r2 (bytes remaining to be copied) - 64
-// ip pointer to the destination of the next byte to be copied
-// d0 temporary storage for copy
-
-0: vld1.32 {d0}, [r1]!
- subs r2, $8
- vst1.64 {d0}, [ip,:64]!
- bge 0b
-
-/*****************************************************************************
- * sub-doubleword cleanup copies *
- *****************************************************************************/
-
-L_scalarCopy:
-
-// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
-// return to the calling routine if zero bytes remain.
-
- adds r2, $8
- it eq
- bxeq lr
-
-// Copy one byte at a time in descending address order until we reach the front
-// of the buffer. Within this loop, registers are used as follows:
-//
-// r0 original destination pointer
-// r1 pointer to one byte past the next element to be copied
-// r2 (bytes remaining to be copied) - 8
-// r3 temporary to hold the byte that is being copied
-// ip pointer one byte past the destination of the next byte to be copied
-
-0: ldrb r3, [r1], $1
- strb r3, [ip], $1
- subs r2, $1
- bne 0b
- bx lr
-
-/*****************************************************************************
- * STMIA loop for 1k-32k buffers *
- *****************************************************************************/
-
-// This loop copies 64 bytes each iteration in order of ascending memory
-// address, using the GPRs instead of NEON.
-//
-// r0 original destination pointer
-// r1 pointer to the next element to be copied
-// r2 (bytes remaining to be copied) - 64
-// r3-6,r8-11 (COPY_REGISTERS) temporary registers used for moving data
-// ip pointer to the next location to store to
-
-L_useSTMIA:
- push SAVE_REGISTERS
-.align 3
-0: ldmia r1!, COPY_REGISTERS
- subs r2, r2, $64
- stmia ip!, COPY_REGISTERS
- ldmia r1!, COPY_REGISTERS
- pld [r1, $64]
- stmia ip!, COPY_REGISTERS
- bge 0b
- pop SAVE_REGISTERS
- b L_vectorCleanup
-
-/*****************************************************************************
- * Misaligned forward vld1 loop *
- *****************************************************************************/
-
-// Software alignment fixup to handle source and dest that are relatively
-// misaligned mod 4 bytes.
-//
-// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
-// which we combine with the 8 bytes loaded in the previous iteration to get a
-// 16 byte field; the next 8 bytes to be stored to the destination buffer are
-// somewhere in that field, and we get them using the VEXT instruction:
-//
-// | 8 bytes from last iteration | 8 bytes from this iteration |
-// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
-// | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
-// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
-// ^8 bytes to store this iteration^ |
-// could be a page boundary
-//
-// We need to be a little bit careful, however. Because the loads only have 4
-// byte alignment, if we used this approach all the way to the end of the
-// buffer, the very last 8 byte load might slop over onto a new page by 4
-// bytes, and that new page might not be mapped into our process. Thus, we
-// terminate this copy loop when fewer than 12 bytes remain to be copied,
-// instead of the more natural-seeming termination condition of "8 bytes
-// remaining" (the illustration above shows the worst case and demonstrates
-// why 12 is a sufficiently safe condition).
-//
-// At the beginning of each iteration through this loop, registers are used
-// as follows:
-//
-// r0 original destination pointer
-// r1 pointer to the next block of 8 bytes to load
-// r2 (bytes remaining to copy) - 12
-// ip pointer to the next block of 8 bytes to store
-// d0 next 8 bytes to store
-// d2 8 bytes loaded in the previous iteration
-// d3 8 bytes loaded two iterations ago
-
-#define COPY_UNALIGNED(offset) \
- subs r2, $4 ;\
- blt 2f ;\
- vld1.32 {d2,d3}, [r1]! ;\
- subs r2, $8 ;\
- blt 1f ;\
-0: vext.8 d0, d2, d3, $(offset);\
- vmov d2, d3 ;\
- vld1.32 {d3}, [r1]! ;\
- subs r2, $8 ;\
- vst1.64 {d0}, [ip, :64]! ;\
- bge 0b ;\
-1: vext.8 d0, d2, d3, $(offset);\
- sub r1, $8 ;\
- vst1.64 {d0}, [ip, :64]! ;\
-2: add r1, $(offset);\
- add r2, $4 ;\
- b L_scalarCopy
-
-L_sourceAligned1:
- COPY_UNALIGNED(1)
-L_sourceAligned2:
- COPY_UNALIGNED(2)
-L_sourceAligned3:
- COPY_UNALIGNED(3)
-
-#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD