Loading...
--- Libc/Libc-825.25/arm/string/bcopy_CortexA9.s
+++ /dev/null
@@ -1,444 +0,0 @@
-/*
- * Copyright (c) 2010 Apple Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_LICENSE_HEADER_END@
- *
- * This file implements the following functions for the Cortex-A9 processor:
- *
- * void bcopy(const void * source,
- * void * destination,
- * size_t length);
- *
- * void *memmove(void * destination,
- * const void * source,
- * size_t n);
- *
- * void *memcpy(void * restrict destination,
- * const void * restrict source,
- * size_t n);
- *
- * All copy n successive bytes from source to destination. Memmove and memcpy
- * return destination, whereas bcopy has no return value. Copying takes place
- * as if it were through a temporary buffer -- after return destination
- * contains exactly the bytes from source, even if the buffers overlap (this is
- * not required of memcpy by the C standard; its behavior is undefined if the
- * buffers overlap, but we are holding ourselves to the historical behavior of
- * this function on OS X and iOS).
- */
-
-#include <arm/arch.h>
-#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
-
-/*****************************************************************************
- * Macros *
- *****************************************************************************/
-// A9_ENTRY(name): apply .align 2, then declare and define the global label _name$VARIANT$CortexA9.
-#define A9_ENTRY(name) \
- .align 2;\
- .globl _ ## name ## $VARIANT$CortexA9;\
- _ ## name ## $VARIANT$CortexA9:
-// ESTABLISH_FRAME: save r0, r4, r7 and lr, then point r7 at the saved-r7 stack slot (sp + 8).
-#define ESTABLISH_FRAME \
- push {r0,r4,r7,lr};\
- add r7, sp, #8
-// CLEAR_FRAME_AND_RETURN: reload the r0, r4, r7 saved by ESTABLISH_FRAME and return by popping the saved lr into pc.
-#define CLEAR_FRAME_AND_RETURN \
- pop {r0,r4,r7,pc}
-// Members of COPY_REGISTERS that the 64-byte main loops push before use and pop afterwards.
-#define ADDITIONAL_CALLEE_SAVE_REGISTERS {r5,r6,r8,r10}
-// The eight registers (32 bytes) transferred by each ldm/stm in the 64-byte main loops.
-#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r12}
-
-/*****************************************************************************
- * entry points *
- *****************************************************************************/
-
-.text
-.syntax unified
-.code 32
-
-A9_ENTRY(bcopy)
-// bcopy receives (source, destination, length) while the code that follows
-// expects (destination, source, length): exchange r0 and r1, then fall through.
- mov ip, r1
- mov r1, r0
- mov r0, ip
-
-A9_ENTRY(memcpy)
-A9_ENTRY(memmove)
-// Our preference is to copy the data in ascending address order, but if the
-// buffers overlap such that the beginning of the destination buffer aliases
-// the end of the source buffer, we need to copy in descending address order
-// instead to preserve the memmove semantics. We detect this case with the
-// test:
-//
-// destination - source < length (unsigned compare)
-//
-// If the address of the source buffer is higher than the address of the
-// destination buffer, this arithmetic can overflow, but the overflowed value
-// can only be smaller than length if the buffers do not overlap, so we don't
-// need to worry about false positives due to the overflow (they happen, but
-// only in cases where copying in either order is correct).
- subs r3, r0, r1 // r3 = destination - source; Z is set when the two are equal
- bxeq lr // identical buffers: nothing to move, r0 already holds destination
- ESTABLISH_FRAME // the pushed r0 (destination) is what the return paths pop back
- cmp r3, r2
- blo L_descendingCopy // destination - source < length (unsigned): copy downwards
-
-/*****************************************************************************
- * ascending copy *
- *****************************************************************************/
-
-// The layout of the two buffers is such that we can use our preferred
-// (ascending address order) copy implementation. Throughout this copy,
-// registers are used as follows:
-//
-// r0 lowest unwritten address in the destination buffer.
-// r1 lowest unread address in the source buffer.
-// r2 number of bytes remaining to copy less an offset that varies
-// with the size of the copies that are being made.
-// r3, r4, r5, r6, r8, r9, r10, r12
-// temporary registers used to hold the data during copies.
-// r12 also used as a scratch register for alignment / length calculations
-
-L_ascendingCopy:
-// We begin by checking if less than four bytes are to be copied; if so, we
-// branch directly to a small-buffer copy and return. Otherwise, we copy up
-// to three bytes if needed to make the destination pointer have word (four
-// byte) alignment.
- subs r2, #4 // r2 = length - 4; a borrow means length < 4
- blo L_ascendingLengthLessThanFour
- ands ip, r0, #0x3 // ip = destination & 3; 4 - ip bytes reach word alignment
- beq L_ascendingDestinationWordAligned
- ldrb r3, [r1],#1 // first alignment byte is always needed when ip != 0
- cmp ip, #2 // ls (ip <= 2): a second byte; lo (ip == 1): a third byte
- ldrbls r4, [r1],#1
- strb r3, [r0],#1
- ldrblo r3, [r1],#1
- add r2, ip // no flag update, so the cmp result still guards the stores below
- strbls r4, [r0],#1
- strblo r3, [r0],#1
- subs r2, #4 // r2 = (bytes remaining after alignment) - 4
- bhs L_ascendingDestinationWordAligned
-
-L_ascendingLengthLessThanFour:
-// Copies the final zero to three bytes, assuming no alignment. Entered when
-// length < 4, or by fall-through when < 4 bytes remain after the alignment copy.
- lsls ip, r2, #31 // C = bit 1 of r2 (copy two bytes), N = bit 0 (copy one byte)
- ldrbcs r3, [r1],#1
- ldrbcs ip, [r1],#1
- ldrbmi r4, [r1]
- strbcs r3, [r0],#1
- strbcs ip, [r0],#1
- strbmi r4, [r0]
- CLEAR_FRAME_AND_RETURN
-
-L_ascendingDestinationWordAligned:
-// The destination has word alignment and r2 holds (bytes remaining - 4). If
-// the source is not similarly aligned, jump to an unaligned copy loop.
- tst r1, #0x3
- bne L_ascendingUnalignedCopy
-
-/*****************************************************************************
- * ascending copy, both buffers have word alignment *
- *****************************************************************************/
-
-// If less than sixty-four bytes remain to be copied, jump directly to the
-// word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
-// to make the destination pointer have cacheline (32-byte) alignment.
- subs r2, r2, #0x3c // rebias: r2 = (bytes remaining) - 64
- blo L_ascendingLengthLessThanSixtyFour
-0: tst r0, #0x1c // address bits 2-4 all clear => 32-byte aligned
- beq L_ascendingDestinationCachelineAligned
- ldr r3, [r1],#4
- subs r2, #4 // flags are consumed by the bhs after the store
- str r3, [r0],#4
- bhs 0b
- b L_ascendingLengthLessThanSixtyFour
-
-L_ascendingDestinationCachelineAligned:
-// Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
-// Empirical testing suggests that 0x60 is the optimal lookahead for preload,
-// though anything between 0x40 and 0x100 seems to be "acceptable".
- push ADDITIONAL_CALLEE_SAVE_REGISTERS
-0: ldm r1!, COPY_REGISTERS
- subs r2, r2, #0x40 // flags are consumed by the bhs at the bottom of the loop
- stm r0!, COPY_REGISTERS
- pld [r1, #0x60]
- ldm r1!, COPY_REGISTERS
- pld [r1, #0x60]
- stm r0!, COPY_REGISTERS
- bhs 0b
- pop ADDITIONAL_CALLEE_SAVE_REGISTERS
-
-L_ascendingLengthLessThanSixtyFour:
-// Cleanup copy of up to 63 bytes. We can assume that both the source and
-// destination addresses have word alignment here. r2 is still biased by -64,
- tst r2, #0x30 // so its low six bits equal the byte count left; bits 4-5 = 16-byte chunks
- beq 1f
-0: ldm r1!, {r3,r4,r9,ip}
- sub r2, r2, #0x10
- stm r0!, {r3,r4,r9,ip}
- tst r2, #0x30
- bne 0b
-1: tst r2, #0xf
- beq 2f
- lsls ip, r2, #29 // C = bit 3 (copy eight bytes), N = bit 2 (copy four bytes)
- ldmcs r1!, {r3,ip}
- stmcs r0!, {r3,ip}
- ldrmi r3, [r1],#4
- strmi r3, [r0],#4
- lsls ip, r2, #31 // C = bit 1 (copy two bytes), N = bit 0 (copy one byte)
- ldrhcs r3, [r1],#2
- strhcs r3, [r0],#2
- ldrbmi ip, [r1]
- strbmi ip, [r0]
-2: CLEAR_FRAME_AND_RETURN
-
-/*****************************************************************************
- * ascending copy, source buffer is not word aligned *
- *****************************************************************************/
-
-L_ascendingUnalignedCopy:
-// Destination buffer is word aligned, but source buffer is not. Copy
-// byte-by-byte until the destination buffer has eightbyte alignment.
- subs r2, #4 // r2 was (remaining - 4); now r2 = (bytes remaining) - 8
- blo L_ascendingUnalignedByteCleanup
-0: tst r0, #0x7
- beq L_ascendingUnalignedVectorCopy
- ldrb r3, [r1],#1
- subs r2, #1 // flags are consumed by the bhs after the store
- strb r3, [r0],#1
- bhs 0b
-L_ascendingUnalignedByteCleanup:
- adds r2, #8 // remove the -8 bias: r2 = bytes remaining (zero => done)
- beq 1f
-0: ldrb r3, [r1],#1
- subs r2, #1
- strb r3, [r0],#1
- bne 0b
-1: CLEAR_FRAME_AND_RETURN
-
-L_ascendingUnalignedVectorCopy:
-// Destination buffer is eightbyte aligned. Source buffer has unknown
-// alignment. Use NEON to handle the misaligned copies. We begin by copying
-// up to 24 bytes to get cacheline alignment of the destination buffer.
- subs r2, #0x18 // r2 was (remaining - 8); now r2 = (bytes remaining) - 32
- blo L_ascendingUnalignedVectorCleanup
-0: tst r0, #0x18 // address bits 3-4 clear (bits 0-2 already are) => 32-byte aligned
- beq L_ascendingUnalignedCachelineCopy
- vld1.8 {d0}, [r1]!
- subs r2, #8 // flags are consumed by the bhs after the store
- vst1.8 {d0}, [r0,:64]!
- bhs 0b
-L_ascendingUnalignedVectorCleanup:
- adds r2, #0x18 // r2 = (bytes remaining) - 8; carry clear => fewer than eight left
- blo L_ascendingUnalignedByteCleanup
-0: vld1.8 {d0}, [r1]!
- subs r2, #8
- vst1.8 {d0}, [r0,:64]!
- bhs 0b
- b L_ascendingUnalignedByteCleanup
-
-L_ascendingUnalignedCachelineCopy:
-// Main copy loop; moves 32 bytes per iteration. Requires only byte alignment
-// of the source address.
- vld1.8 {q0,q1},[r1]!
- pld [r1, #0x60]
- vst1.8 {q0,q1},[r0,:256]!
- subs r2, #0x20 // r2 stays biased by -32; loop while a full 32 bytes remain
- bhs L_ascendingUnalignedCachelineCopy
- b L_ascendingUnalignedVectorCleanup
-
-/*****************************************************************************
- * descending copy *
- *****************************************************************************/
-
-// The layout of the two buffers is such that we must copy in descending-
-// address order. Throughout this copy, registers are used as follows:
-//
-// r0 lowest address in the destination buffer that has been written to.
-// r1 lowest address in the source buffer that has been read from.
-// r2 number of bytes remaining to copy less an offset that varies
-// with the size of the copies that are being made.
-// r3, r4, r5, r6, r8, r9, r10, r12
-// temporary registers used to hold the data during copies.
-// r12 also used as a scratch register for alignment / length calculations
-
-L_descendingCopy:
-// We begin by checking if less than four bytes are to be copied; if so, we
-// branch directly to a small-buffer copy and return. Otherwise, we copy up
-// to three bytes if needed to make the destination pointer have word (four
-// byte) alignment.
- add r1, r2 // r1 = one past the last source byte
- add r0, r2 // r0 = one past the last destination byte
- subs r2, #4 // r2 = length - 4; a borrow means length < 4
- blo L_descendingLengthLessThanFour
- ands ip, r0, #0x3 // ip = destination end & 3 = bytes needed to reach word alignment
- beq L_descendingDestinationWordAligned
- ldrb r3, [r1, #-1]! // first alignment byte is always needed when ip != 0
- cmp ip, #2 // hs (ip >= 2): a second byte; hi (ip == 3): a third byte
- ldrbhs r4, [r1, #-1]!
- strb r3, [r0, #-1]!
- ldrbhi r3, [r1, #-1]!
- strbhs r4, [r0, #-1]!
- strbhi r3, [r0, #-1]!
- subs r2, ip // r2 = (bytes remaining after alignment) - 4
- bhs L_descendingDestinationWordAligned
-
-L_descendingLengthLessThanFour:
-// Copies the final zero to three bytes, assuming no alignment. Entered when
-// length < 4, or by fall-through when < 4 bytes remain after the alignment copy.
- lsls ip, r2, #31 // C = bit 1 of r2 (copy two bytes), N = bit 0 (copy one byte)
- ldrbcs r3, [r1, #-1]!
- ldrbcs ip, [r1, #-1]!
- ldrbmi r4, [r1, #-1]
- strbcs r3, [r0, #-1]!
- strbcs ip, [r0, #-1]!
- strbmi r4, [r0, #-1]
- CLEAR_FRAME_AND_RETURN
-
-L_descendingDestinationWordAligned:
-// The destination has word alignment and r2 holds (bytes remaining - 4). If
-// the source is not similarly aligned, jump to an unaligned copy loop.
- tst r1, #0x3
- bne L_descendingUnalignedCopy
-
-/*****************************************************************************
- * descending copy, both buffers have word alignment *
- *****************************************************************************/
-
-// If less than sixty-four bytes remain to be copied, jump directly to the
-// word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
-// to make the destination pointer have cacheline (32-byte) alignment.
- subs r2, r2, #0x3c // rebias: r2 = (bytes remaining) - 64
- blo L_descendingLengthLessThanSixtyFour
-0: tst r0, #0x1c // address bits 2-4 all clear => 32-byte aligned
- beq L_descendingDestinationCachelineAligned
- ldr r3, [r1, #-4]!
- subs r2, #4 // flags are consumed by the bhs after the store
- str r3, [r0, #-4]!
- bhs 0b
- b L_descendingLengthLessThanSixtyFour
-
-L_descendingDestinationCachelineAligned:
-// Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
-// Empirical testing suggests that -0x80 is the optimal lookahead for preload,
-// though anything between -0x40 and -0x100 seems to be "acceptable".
- push ADDITIONAL_CALLEE_SAVE_REGISTERS
-0: ldmdb r1!, COPY_REGISTERS
- subs r2, r2, #0x40 // flags are consumed by the bhs at the bottom of the loop
- stmdb r0!, COPY_REGISTERS
- pld [r1, #-0x80]
- ldmdb r1!, COPY_REGISTERS
- pld [r1, #-0x80]
- stmdb r0!, COPY_REGISTERS
- bhs 0b
- pop ADDITIONAL_CALLEE_SAVE_REGISTERS
-
-L_descendingLengthLessThanSixtyFour:
-// Cleanup copy of up to 63 bytes. We can assume that both the source and
-// destination addresses have word alignment here. r2 is still biased by -64,
- tst r2, #0x30 // so its low six bits equal the byte count left; bits 4-5 = 16-byte chunks
- beq 1f
-0: ldmdb r1!, {r3,r4,r9,ip}
- sub r2, r2, #0x10
- stmdb r0!, {r3,r4,r9,ip}
- tst r2, #0x30
- bne 0b
-1: tst r2, #0xf
- beq 2f
- lsls ip, r2, #29 // C = bit 3 (copy eight bytes), N = bit 2 (copy four bytes)
- ldmdbcs r1!, {r3,ip}
- stmdbcs r0!, {r3,ip}
- ldrmi r3, [r1, #-4]!
- strmi r3, [r0, #-4]!
- lsls ip, r2, #31 // C = bit 1 (copy two bytes), N = bit 0 (copy one byte)
- ldrhcs r3, [r1, #-2]!
- strhcs r3, [r0, #-2]!
- ldrbmi ip, [r1, #-1]
- strbmi ip, [r0, #-1]
-2: CLEAR_FRAME_AND_RETURN
-
-/*****************************************************************************
- * descending copy, source buffer is not word aligned *
- *****************************************************************************/
-
-L_descendingUnalignedCopy:
-// Destination buffer is word aligned, but source buffer is not. Copy
-// byte-by-byte until the destination buffer has eightbyte alignment.
- subs r2, #4 // r2 was (remaining - 4); now r2 = (bytes remaining) - 8
- blo L_descendingUnalignedByteCleanup
-0: tst r0, #0x7
- beq L_descendingUnalignedVectorCopy
- ldrb r3, [r1, #-1]!
- subs r2, #1 // flags are consumed by the bhs after the store
- strb r3, [r0, #-1]!
- bhs 0b
-L_descendingUnalignedByteCleanup:
- adds r2, #8 // remove the -8 bias: r2 = bytes remaining (zero => done)
- beq 1f
-0: ldrb r3, [r1, #-1]!
- subs r2, #1
- strb r3, [r0, #-1]!
- bne 0b
-1: CLEAR_FRAME_AND_RETURN
-
-L_descendingUnalignedVectorCopy:
-// Destination buffer is eightbyte aligned. Source buffer has unknown
-// alignment. Use NEON to handle the misaligned copies. We begin by copying
-// up to 24 bytes to get cacheline alignment of the destination buffer.
- subs r2, #0x18 // r2 was (remaining - 8); now r2 = (bytes remaining) - 32
- blo L_descendingUnalignedVectorCleanup
-0: tst r0, #0x18 // address bits 3-4 clear (bits 0-2 already are) => 32-byte aligned
- beq L_descendingUnalignedCachelineCopy
- sub r1, #8 // step both pointers down by hand before each 8-byte access
- vld1.8 {d0}, [r1]
- sub r0, #8
- vst1.8 {d0}, [r0,:64]
- subs r2, #8
- bhs 0b
-L_descendingUnalignedVectorCleanup:
- adds r2, #0x18 // r2 = (bytes remaining) - 8; carry clear => fewer than eight left
- blo L_descendingUnalignedByteCleanup
-0: sub r1, #8
- vld1.8 {d0}, [r1]
- sub r0, #8
- vst1.8 {d0}, [r0,:64]
- subs r2, #8
- bhs 0b
- b L_descendingUnalignedByteCleanup
-
-L_descendingUnalignedCachelineCopy:
-// Main copy loop; moves 32 bytes per iteration. Requires only byte alignment
-// of the source address.
- sub r1, #32 // bias both pointers to the base of the next 32-byte chunk
- sub r0, #32
- mov r4, #-32 // post-index step for the loop; r4 was saved by ESTABLISH_FRAME
-0: vld1.8 {q0,q1},[r1], r4
- pld [r1, #-0x60]
- vst1.8 {q0,q1},[r0,:256], r4
- subs r2, #0x20 // r2 stays biased by -32; loop while a full 32 bytes remain
- bhs 0b
- add r1, #32 // drop the bias: r1/r0 again mark the lowest byte copied so far
- add r0, #32
- b L_descendingUnalignedVectorCleanup
-
-#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD