bcopy_Generic.s diff - arm/string/bcopy_Generic.s - Libc source code Libc-763.13

arm/string/bcopy_Generic.s Libc-763.13 ⇄ Libc-825.25
--- Libc/Libc-763.13/arm/string/bcopy_Generic.s
+++ Libc/Libc-825.25/arm/string/bcopy_Generic.s
@@ -21,3 +21,387 @@
  * @APPLE_LICENSE_HEADER_END@
  */
 
+/*****************************************************************************
+ * ARMv5 and ARMv6 implementation, also used in dyld on later archs          *
+ *****************************************************************************/
+ 
+#include <arm/arch.h>
+#if !defined _ARM_ARCH_7 || defined VARIANT_DYLD
+
+.text
+.align 2
+	
+	.globl _memcpy
+	.globl _bcopy
+	.globl _memmove
+
+_bcopy:		/* void bcopy(const void *src, void *dest, size_t len); swaps r0/r1 into memcpy argument order, then falls through */
+	mov		r3, r0			/* r3 = src (temporary) */
+	mov		r0, r1			/* r0 = dest */
+	mov		r1, r3			/* r1 = src */
+
+_memcpy:		/* void *memcpy(void *dest, const void *src, size_t len); same body as memmove, so overlap is handled too */
+_memmove: 	/* void *memmove(void *dest, const void *src, size_t len); r0 = dest, r1 = src, r2 = len */
+	/* return at once if len == 0 or dest == src; r0 still holds dest, which is the return value */
+	cmp		r2, #0
+	cmpne	r0, r1			/* only executed when len != 0; its flags are reused by subhs/sublo below */
+	bxeq	lr
+
+	/* save r0 (return value), r4 and r5 (scratch), plus r7 and lr; Lexit pops the same list */
+	stmfd	sp!, { r0, r4, r5, r7, lr }
+	add	r7, sp, #12			/* r7 = address of the saved r7 slot (r0, r4, r5 are stored below it) */
+	
+	/* check for overlap. r3 <- |dest - src|, selected by the unsigned outcome of the cmpne above */
+	subhs	r3, r0, r1
+	sublo	r3, r1, r0
+	cmp		r3, r2			/* if distance(src, dest) < len, we have overlap */
+	blo		Loverlap
+
+Lnormalforwardcopy:
+	/* are src and dest dissimilarly word aligned? (compares the low two address bits of each) */
+	mov		r12, r0, lsl #30
+	cmp		r12, r1, lsl #30
+	bne		Lnonwordaligned_forward
+
+	/* if len < 64, do a quick forward copy */
+	cmp		r2, #64
+	blt		Lsmallforwardcopy
+
+	/* is dest 16 byte aligned? only r0 is tested; src is known to share the low two address bits of dest */
+	tst		r0, #0xf
+	bne		Lsimilarlyunaligned
+
+	/* check for 32 byte dest unalignment */
+	tst		r0, #(1<<4)
+	bne		Lunaligned_32
+
+Lmorethan64_aligned:
+	/* entered with len >= 64; save some more registers to use in the copy (restored right after the loop) */
+	stmfd	sp!, { r6, r8, r10, r11 }
+
+	/* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
+	sub		r2, r2, #64
+
+L64loop:
+	/* copy 64 bytes per iteration, as two 32 byte (8 register) load/store-multiple pairs */
+	ldmia	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
+#ifdef _ARM_ARCH_6
+	pld		[r1, #32]
+#endif
+	stmia	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
+	ldmia	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
+	subs	r2, r2, #64		/* sets the flags tested by the bge at the bottom of the loop */
+#ifdef _ARM_ARCH_6
+	pld		[r1, #32]
+#endif
+	stmia	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
+	bge		L64loop			/* another full 64 bytes are still available */
+
+	/* restore the scratch registers we just saved */
+	ldmfd	sp!, { r6, r8, r10, r11 }
+
+	/* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
+	adds	r2, r2, #64
+	beq		Lexit
+
+Llessthan64_aligned:
+	/* copy 16 bytes at a time until we have < 16 bytes; every step is conditional on r2 >= 16 */
+	cmp		r2, #16
+	ldmgeia	r1!, { r3, r4, r5, r12 }
+	stmgeia	r0!, { r3, r4, r5, r12 }
+	subges	r2, r2, #16
+	bgt		Llessthan64_aligned	/* taken only after a 16 byte step that left bytes remaining */
+	beq		Lexit			/* a 16 byte step consumed exactly the last bytes */
+	
+Llessthan16_aligned:
+	/* copy the final 0..15 bytes: the low 4 bits of len become the N, Z, C, V flags, one conditional copy per bit */
+	mov		r2, r2, lsl #28		/* only the low 4 bits of r2 survive, in bits 31..28 */
+	msr		cpsr_f, r2		/* N = 8s bit, Z = 4s bit, C = 2s bit, V = 1s bit of the remaining length */
+
+	ldmmiia	r1!, { r2, r3 }		/* N: 8 bytes */
+	ldreq	r4, [r1], #4		/* Z: 4 bytes */
+	ldrcsh	r5, [r1], #2		/* C: 2 bytes */
+	ldrvsb	r12, [r1], #1		/* V: 1 byte */
+
+	stmmiia	r0!, { r2, r3 }
+	streq	r4, [r0], #4
+	strcsh	r5, [r0], #2
+	strvsb	r12, [r0], #1
+	b		Lexit
+
+Lsimilarlyunaligned:
+	/* src and dest share word alignment but dest is not 16 byte aligned: copy 1/2/4/8 bytes as needed to bring dest to a 16 byte boundary */
+	mov		r12, r0, lsl #28	/* low 4 bits of dest moved into bits 31..28 */
+	rsb		r12, r12, #0		/* negate: top nibble = 16 - (dest & 15), the byte count needed to align */
+	msr		cpsr_f, r12		/* V = 1 byte, C = 2 bytes, Z = 4 bytes, N = 8 bytes */
+
+	ldrvsb	r3, [r1], #1
+	ldrcsh	r4, [r1], #2
+	ldreq	r5, [r1], #4
+
+	strvsb	r3, [r0], #1
+	strcsh	r4, [r0], #2
+	streq	r5, [r0], #4
+
+	ldmmiia	r1!, { r3, r4 }
+	stmmiia	r0!, { r3, r4 }
+
+	subs	r2, r2, r12, lsr #28	/* len -= number of alignment bytes just copied */
+	beq		Lexit
+
+Lunaligned_32:
+	/* bring up to dest 32 byte alignment: copy one 16 byte group if bit 4 of dest is set */
+	tst		r0, #(1 << 4)
+	ldmneia	r1!, { r3, r4, r5, r12 }
+	stmneia	r0!, { r3, r4, r5, r12 }
+	subne	r2, r2, #16
+
+	/* we should now be aligned, see what copy method we should use */
+	cmp		r2, #64
+	bge		Lmorethan64_aligned
+	b		Llessthan64_aligned
+	
+Lbytewise2:
+	/* copy 2 bytes at a time; the second byte is skipped when only one was left. NOTE(review): r2 == 0 on entry would still copy one byte; the branches to this label visible in this file arrive with r2 != 0 */
+	subs	r2, r2, #2		/* these flags drive ldrplb/strplb and the bhi below */
+
+	ldrb	r3, [r1], #1
+	ldrplb	r4, [r1], #1		/* second byte only if the subtraction result is not negative */
+
+	strb	r3, [r0], #1
+	strplb	r4, [r0], #1
+
+	bhi		Lbytewise2		/* loop while the subs result was non-zero with no borrow */
+	b		Lexit
+
+Lbytewise:
+	/* simple bytewise forward copy; the count is tested after the copy, so r2 must be non-zero on entry */
+	ldrb	r3, [r1], #1
+	subs	r2, r2, #1
+	strb	r3, [r0], #1
+	bne		Lbytewise
+	b		Lexit
+
+Lsmallforwardcopy:
+	/* src and dest are word aligned similarly, less than 64 bytes to copy */
+	cmp		r2, #4
+	blt		Lbytewise2
+
+	/* bytewise copy until word aligned (at most 3 bytes; every step is skipped once src is aligned) */
+	tst		r1, #3
+Lwordalignloop:
+	ldrneb	r3, [r1], #1
+	strneb	r3, [r0], #1
+	subne	r2, r2, #1
+	tstne	r1, #3			/* re-test the advanced src pointer */
+	bne		Lwordalignloop
+
+	cmp		r2, #16
+	bge		Llessthan64_aligned
+	blt		Llessthan16_aligned	/* always taken when the bge above is not */
+
+Loverlap:
+	/* src and dest overlap in some way, len > 0; r3 = distance between them, with 0 < r3 < len */
+	cmp		r0, r1				/* if dest > src */
+	bhi		Loverlap_srclower
+
+Loverlap_destlower:
+	/* dest < src, see if we can still do a fast forward copy or fallback to slow forward copy */
+	cmp		r3, #64
+	bge		Lnormalforwardcopy 	/* src and dest are at least 64 bytes (one block copy stride) apart, use normal copy */
+
+	cmp		r3, #2
+	bge		Lbytewise2			/* distance >= 2: two bytes per iteration */
+	b		Lbytewise			/* distance == 1: one byte per iteration */
+
+	/* the following routines deal with having to copy in the reverse direction */
+Loverlap_srclower:
+	/* src < dest, with overlap */
+
+	/* src += len; dest += len; both now point one past the end, and every copy below pre-decrements */
+	add		r0, r0, r2
+	add		r1, r1, r2
+
+	/* we have to copy in reverse no matter what, test if we can we use a large block reverse copy. NOTE(review): when len == 64 exactly, cmpgt is skipped and blt is not taken, so the distance is not tested on that path */
+	cmp		r2, #64				/* less than 64 bytes to copy? */
+	cmpgt	r3, #64				/* only when len > 64: is distance(src, dest) < 64? */
+	blt		Lbytewise_reverse
+
+	/* test if src and dest are nonword aligned differently (r3, the distance, is no longer needed and becomes scratch) */
+	mov		r3, r0, lsl #30
+	cmp		r3, r1, lsl #30
+	bne		Lbytewise_reverse
+
+	/* test if src and dest are non word aligned or dest is non 16 byte aligned */
+	tst		r0, #0xf
+	bne		Lunaligned_reverse_similarly
+
+	/* test for dest 32 byte alignment */
+	tst		r0, #(1<<4)
+	bne		Lunaligned_32_reverse_similarly
+
+	/* 64 byte reverse block copy, src and dest aligned */
+Lmorethan64_aligned_reverse:
+	/* save some more registers to use in the copy */
+	stmfd	sp!, { r6, r8, r10, r11 }
+
+	/* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
+	sub		r2, r2, #64
+
+L64loop_reverse:
+	/* copy 64 bytes per iteration, moving downwards, as two 32 byte decrement-before load/store pairs */
+	ldmdb	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
+#ifdef _ARM_ARCH_6
+	pld		[r1, #-32]
+#endif
+	stmdb	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }	
+	ldmdb	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }	
+	subs	r2, r2, #64		/* sets the flags tested by the bge at the bottom of the loop */
+#ifdef _ARM_ARCH_6
+	pld		[r1, #-32]
+#endif
+	stmdb	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }	
+	bge		L64loop_reverse
+
+	/* restore the scratch registers we just saved */
+	ldmfd	sp!, { r6, r8, r10, r11 }
+
+	/* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
+	adds	r2, r2, #64
+	beq		Lexit
+
+Lbytewise_reverse:
+	/* one byte per iteration with pre-decrement addressing; the count is tested after the copy, so r2 must be non-zero on entry */
+	ldrb	r3, [r1, #-1]!
+	strb	r3, [r0, #-1]!
+	subs	r2, r2, #1
+	bne		Lbytewise_reverse
+	b		Lexit
+
+Lunaligned_reverse_similarly:
+	/* src and dest share word alignment but the dest end pointer is not 16 byte aligned: copy 1/2/4/8 bytes downwards to reach a 16 byte dest boundary */
+	mov		r12, r0, lsl #28	/* top nibble = dest & 15, the number of bytes above the boundary */
+	msr		cpsr_f, r12		/* V = 1 byte, C = 2 bytes, Z = 4 bytes, N = 8 bytes */
+
+	ldrvsb	r3, [r1, #-1]!
+	ldrcsh	r4, [r1, #-2]!
+	ldreq	r5, [r1, #-4]!
+
+	strvsb	r3, [r0, #-1]!
+	strcsh	r4, [r0, #-2]!
+	streq	r5, [r0, #-4]!
+
+	ldmmidb	r1!, { r3, r4 }
+	stmmidb	r0!, { r3, r4 }
+
+	subs	r2, r2, r12, lsr #28	/* len -= number of alignment bytes just copied */
+	beq		Lexit
+
+Lunaligned_32_reverse_similarly:
+	/* bring up to dest 32 byte alignment: copy one 16 byte group downwards if bit 4 of dest is set */
+	tst		r0, #(1 << 4)
+	ldmnedb	r1!, { r3, r4, r5, r12 }
+	stmnedb	r0!, { r3, r4, r5, r12 }
+	subne	r2, r2, #16
+
+	/* we should now be aligned, see what copy method we should use */
+	cmp		r2, #64
+	bge		Lmorethan64_aligned_reverse
+	b		Lbytewise_reverse		/* fewer than 64 bytes left: finish one byte at a time */
+
+	/* the following routines deal with non word aligned copies */
+Lnonwordaligned_forward:
+	cmp		r2, #8
+	blt		Lbytewise2			/* not worth the effort with less than 8 bytes total */
+
+	/* bytewise copy until src word aligned (at most 3 bytes, so at least 5 bytes remain afterwards) */
+	tst		r1, #3
+Lwordalignloop2:
+	ldrneb	r3, [r1], #1
+	strneb	r3, [r0], #1
+	subne	r2, r2, #1
+	tstne	r1, #3
+	bne		Lwordalignloop2
+
+	/* figure out how the src and dest are unaligned: r3 = dest & 3, non-zero because the two alignments differ */
+	and		r3, r0, #3
+	cmp		r3, #2
+	blt		Lalign1_forward		/* r3 == 1 */
+	beq		Lalign2_forward		/* r3 == 2 */
+	bgt		Lalign3_forward		/* r3 == 3 */
+
+Lalign1_forward:
+	/* dest is 1 byte past a word boundary, src is word aligned. NOTE(review): the bit-lane comments below read as little-endian byte order - confirm */
+	mov		r12, r2, lsr #2		/* number of words we should copy (at least 1 here) */
+	sub		r0, r0, #1		/* round dest down to the word boundary */
+
+	/* prime the copy with the existing byte below dest, so the first word store writes it back unchanged */
+	ldrb	r4, [r0]			/* load D[7:0] */
+
+Lalign1_forward_loop:
+	ldr		r3, [r1], #4		/* load S */
+	orr		r4, r4, r3, lsl #8	/* D[31:8] = S[23:0] */
+	str		r4, [r0], #4		/* save D */
+	mov		r4, r3, lsr #24		/* D[7:0] = S[31:24], carried into the next word */
+	subs	r12, r12, #1
+	bne		Lalign1_forward_loop
+
+	/* finish the copy off: store the one carried byte */
+	strb	r4, [r0], #1		/* save D[7:0] */
+
+	ands	r2, r2, #3		/* 0..3 bytes left after the whole words */
+	beq		Lexit
+	b		Lbytewise2
+
+Lalign2_forward:
+	/* dest is 2 bytes past a word boundary, src is word aligned */
+	mov		r12, r2, lsr #2		/* number of words we should copy (at least 1 here) */
+	sub		r0, r0, #2		/* round dest down to the word boundary */
+
+	/* prime the copy with the existing halfword below dest, so the first word store writes it back unchanged */
+	ldrh	r4, [r0]			/* load D[15:0] */
+
+Lalign2_forward_loop:
+	ldr		r3, [r1], #4		/* load S */
+	orr		r4, r4, r3, lsl #16	/* D[31:16] = S[15:0] */
+	str		r4, [r0], #4		/* save D */
+	mov		r4, r3, lsr #16		/* D[15:0] = S[31:16], carried into the next word */
+	subs	r12, r12, #1
+	bne		Lalign2_forward_loop
+
+	/* finish the copy off: store the two carried bytes */
+	strh	r4, [r0], #2		/* save D[15:0] */
+
+	ands	r2, r2, #3		/* 0..3 bytes left after the whole words */
+	beq		Lexit
+	b		Lbytewise2
+
+Lalign3_forward:
+	/* dest is 3 bytes past a word boundary, src is word aligned */
+	mov		r12, r2, lsr #2		/* number of words we should copy (at least 1 here) */
+	sub		r0, r0, #3		/* round dest down to the word boundary */
+
+	/* prime the copy with the three existing bytes below dest, so the first word store writes them back unchanged */
+	ldr		r4, [r0]
+	and		r4, r4, #0x00ffffff	/* load D[23:0] */
+
+Lalign3_forward_loop:
+	ldr		r3, [r1], #4		/* load S */
+	orr		r4, r4, r3, lsl #24	/* D[31:24] = S[7:0] */
+	str		r4, [r0], #4		/* save D */
+	mov		r4, r3, lsr #8		/* D[23:0] = S[31:8], carried into the next word */
+	subs	r12, r12, #1
+	bne		Lalign3_forward_loop
+
+	/* finish the copy off: store the three carried bytes as a halfword plus a byte */
+	strh	r4, [r0], #2		/* save D[15:0] */
+	mov		r4, r4, lsr #16
+	strb	r4, [r0], #1		/* save D[23:16] */
+
+	ands	r2, r2, #3		/* 0..3 bytes left after the whole words */
+	beq		Lexit
+	b		Lbytewise2
+
+Lexit:		/* common epilogue for every copy path */
+	ldmfd	sp!, {r0, r4, r5, r7, pc}	/* r0 = dest saved at entry (return value); restore r4, r5, r7; return by loading the saved lr into pc */
+
+#endif // !defined _ARM_ARCH_7 || defined VARIANT_DYLD
+