Loading...
ppc/string/memcmp.s Libc-320.1.3 Libc-391
--- Libc/Libc-320.1.3/ppc/string/memcmp.s
+++ Libc/Libc-391/ppc/string/memcmp.s
@@ -24,6 +24,13 @@
 #include <mach/ppc/asm.h>
 #undef	ASSEMBLER
 
+/* We use mode-independent "g" opcodes such as "srgi".  These expand
+ * into word operations when targeting __ppc__, and into doubleword
+ * operations when targeting __ppc64__.
+ */
+#include <architecture/ppc/mode_independent_asm.h>
+
+
 // ***************     ***********
 // * M E M C M P * and * B C M P *
 // ***************     ***********
@@ -39,31 +46,33 @@
 // finding a difference, we might get a spurious page fault by
 // reading bytes past the difference.  To avoid this, we never do a "lwz"
 // that crosses a page boundary.
+//
+// In 64-bit mode, this routine is doubleword parallel.
 
         .text
         .globl EXT(memcmp)
         .globl EXT(bcmp)
 
         .align 	5
-LEXT(memcmp)							// int memcmp(const char *s1,const char *s2,size_t len);
+LEXT(memcmp)                        // int memcmp(const char *s1,const char *s2,size_t len);
 LEXT(bcmp)							// int   bcmp(const char *s1,const char *s2,size_t len);
-        cmplwi	cr1,r5,8			// is buffer too short to bother with word compares?
-        andi.	r0,r3,3				// is LHS word aligned?
+        cmplgi	cr1,r5,2*GPR_BYTES  // is buffer too short to bother with parallel compares?
+        andi.	r0,r3,GPR_BYTES-1   // is LHS aligned?
         blt		cr1,Lshort			// short buffer, so just compare byte-by-byte
         beq		Laligned			// skip if aligned
-        subfic	r0,r0,4				// r0 <- #bytes to word align LHS
+        subfic	r0,r0,GPR_BYTES     // r0 <- #bytes to align LHS
         mtctr	r0					// set up for byte loop
         b		Lbyteloop
         
 // Handle short buffer or end-of-buffer.
 //		r3 = LHS ptr (unaligned)
 //		r4 = RHS ptr (unaligned)
-//		r5 = length remaining in buffer (0..7)
+//		r5 = length remaining in buffer (0..2*GPR_BYTES-1)
 
 Lshort:
-        cmpwi	r5,0				// null buffer?
+        cmpgi	r5,0				// null buffer?
         mtctr	r5					// assume not null, and set up for loop
-        bne		Lshortloop			// buffer not null
+        bne     Lshortloop			// buffer not null
         li		r3,0				// say "equal"
         blr
         
@@ -79,14 +88,14 @@
         sub		r3,r7,r8			// generate return value
         blr 
 
-// We're at a RHS page boundary.  Compare 4 bytes in order to cross the
-// page but still keep the LHS ptr word-aligned.
+// We're at a RHS page boundary.  Compare GPR_BYTES bytes in order to cross the
+// page but still keep the LHS ptr aligned.
 
 Lcrosspage:
-        cmplwi	r5,8				// enough bytes left to use word compares?
-        li		r0,4				// get #bytes to cross RHS page
+        cmplgi	r5,2*GPR_BYTES      // enough bytes left to use parallel compares?
+        li		r0,GPR_BYTES        // get #bytes to cross RHS page
         blt		Lshort				// buffer is about to end
-        mtctr	r0					// set up to compare 4 bytes
+        mtctr	r0
         b		Lbyteloop
         
 // Compare byte-by-byte.
@@ -105,14 +114,14 @@
         cmpw	r7,r8				// compare the bytes
         bdnzt	eq,Lbyteloop		// loop if more to go and bytes are equal
         
-        bne		Ldifferent			// done if we found differing bytes
+        bne     Ldifferent			// done if we found differing bytes
                 
-// LHS is now word aligned.  Loop over words until end of RHS page or buffer.
-// When we get to the end of the page, we compare 4 bytes, so that we keep
-// the LHS word aligned.
+// LHS is now aligned.  Loop over words/doublewords until end of RHS page or buffer.
+// When we get to the end of the page, we compare 4/8 bytes, so that we keep
+// the LHS aligned.
 //		r3 = LHS ptr (aligned)
 //		r4 = RHS ptr (unaligned)
-//		r5 = length remaining in buffer (>= 4 bytes)
+//		r5 = length remaining in buffer (>= GPR_BYTES bytes)
 
 Laligned:
         rlwinm	r9,r4,0,0xFFF		// get RHS offset in page
@@ -121,40 +130,45 @@
         subfe	r8,r5,r5			// * r9 <- min(r0,r5),
         and		r7,r7,r8			// * using algorithm in Compiler Writer's Guide
         add		r9,r0,r7			// ***
-        srwi.	r8,r9,2				// get #words we can compare
-        rlwinm	r9,r9,0,0,29		// get #bytes we will compare word-parallel
+        srgi.	r8,r9,LOG2_GPR_BYTES// get #words/doublewords we can compare
+        clrrgi  r9,r9,LOG2_GPR_BYTES// get #bytes we will compare word-parallel
         beq--	Lcrosspage			// we're at a RHS page boundary
         mtctr	r8					// set up loop count
         sub		r5,r5,r9			// decrement length remaining
         b		Lwordloop
         
-// Compare a word at a time, until one of two conditions:
+// Compare a word or doubleword at a time, until one of two conditions:
 //		- a difference is found
 //		- end of count (ie, end of buffer or RHS page, whichever is first)
 // At this point, registers are as follows:
 //		r3 = LHS ptr (aligned)
 //		r4 = RHS ptr (unaligned)
 //		r5 = length remaining in buffer (may be 0)
-//     ctr = count of words until end of buffer or RHS page
+//     ctr = count of word/doublewords until end of buffer or RHS page
 
-        .align	5					// align inner loop, which is 8 words long
+        .align	5					// align inner loop
 Lwordloop:
-        lwz		r7,0(r3)			// r7 <- next 4 LHS bytes
-        addi	r3,r3,4
-        lwz		r8,0(r4)			// r8 <- next 4 RHS bytes
-        addi	r4,r4,4
-        xor.	r11,r7,r8			// compare the words
+        lg		r7,0(r3)			// r7 <- next aligned LHS word or doubleword
+        addi	r3,r3,GPR_BYTES
+        lg		r8,0(r4)			// r8 <- next unaligned RHS word or doubleword
+        addi	r4,r4,GPR_BYTES
+        xor.	r11,r7,r8			// compare them
         bdnzt	eq,Lwordloop		// loop if ctr!=0 and cr0_eq
         
-        beq--	Lcrosspage			// skip if buffer or page end reached
+        beq     Lcrosspage			// skip if buffer or page end reached wo difference
         
 // Found differing bytes.
 
-        cntlzw	r0,r11				// find 1st difference (r0 = 0..31)
-        rlwinm	r9,r0,0,0x18		// byte align bit offset (r9 = 0,8,16, or 24)
-        addi	r0,r9,8				// now, r0 = 8, 16, 24, or 32
+        cntlzg	r0,r11				// find 1st difference (r0 = 0..31 or 63)
+        rlwinm	r9,r0,0,0x38		// byte align bit offset (r9 = 0,8,16, or 24 etc)
+        addi	r0,r9,8				// now, r0 = 8, 16, 24, or 32 etc
+#if defined(__ppc__)
         rlwnm	r7,r7,r0,24,31		// right justify differing bytes and mask off rest
         rlwnm	r8,r8,r0,24,31
+#else
+        rldcl   r7,r7,r0,56         // right justify differing bytes and mask off rest
+        rldcl   r8,r8,r0,56
+#endif
 
 Ldifferent:							// bytes in r7 and r8 differ
         sub		r3,r7,r8			// compute return value