Loading...
/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */


// ***************
// * S T R C M P *
// ***************
//
// int strcmp(const char *s1, const char *s2);
//
// We optimize the compare by doing it in parallel, using SSE.  This introduces
// a complication: if we blindly did vector loads from both sides until
// finding a difference (or 0), we might get a spurious page fault by
// reading bytes past the difference.  To avoid this, we never do a load
// that crosses a page boundary.
// _strcmp -- int strcmp(const char *s1, const char *s2), using SSE vector compares.
//
// Walks both strings in parallel and returns, in %eax, (LHS byte - RHS byte)
// at the first position where the bytes differ or the LHS byte is 0.  Both
// bytes are zero-extended before the subtraction, so they compare as unsigned
// values and identical strings yield 0.
//
// Arguments are read from the stack: s1 at 4(%esp) and s2 at 8(%esp) on entry
// (12 and 16 after the two pushes below).
// %esi and %edi are saved and restored; %eax, %ecx, %edx and %xmm0-%xmm2 are
// overwritten without being preserved.

        .text
        .globl _strcmp

        .align  4
_strcmp:                                // int strcmp(const char *s1,const char *s2);
        pushl   %esi                    // save the two registers used as string pointers
        pushl   %edi
        movl    12(%esp),%esi           // get LHS ptr (s1; offset includes the 8 bytes just pushed)
        movl    16(%esp),%edi           // get RHS ptr (s2)


// In order to avoid spurious page faults, we loop over:
//
//      min( bytes_in_LHS_page, bytes_in_RHS_page) >> 4
//
// 16-byte chunks.  When we near a page end, we have to revert to a byte-by-byte
// comparison until reaching the next page, then resume the vector comparison.
// The minimum of the remaining byte counts is obtained below as
// 4096 - max(LHS page offset, RHS page offset).
//      %esi = LHS ptr
//      %edi = RHS ptr

LNextChunk:
        movl    %esi,%eax               // copy ptrs
        movl    %edi,%edx
        andl    $4095,%eax              // mask down to page offsets (4096-byte pages)
        andl    $4095,%edx
        cmpl    %eax,%edx               // which is bigger? (unsigned compare of RHS offset against LHS offset)
        cmova   %edx,%eax               // %eax = max(LHS offset, RHS offset);
        movl    $4096,%edx
        subl    %eax,%edx               // get #bytes to next page crossing (1..4096)
        movl    %edx,%eax               // keep the byte count in case there are no whole chunks
        shrl    $4,%edx                 // get #chunks till end of operand or page
        jnz     LLoopOverChunks         // enter vector loop (flags come from the shrl: count != 0)
        movl    %eax,%edx               // no chunks...
        jmp     LLoopOverBytes          // ...so loop over bytes until page end


// Loop over bytes.
//      %esi = LHS ptr
//      %edi = RHS ptr
//      %edx = byte count
// Exits to LExit0 when the LHS byte is 0 and to LExit when the bytes differ;
// an RHS terminator opposite a nonzero LHS byte is caught by the difference test.

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverBytes:
        movzb   (%esi),%eax             // get LHS byte (zero-extended)
        movzb   (%edi),%ecx             // get RHS byte (zero-extended)
        inc     %esi
        inc     %edi
        testl   %eax,%eax               // 0?
        jz      LExit0                  // yes, we're done
        subl    %ecx,%eax               // compare them; %eax becomes the return value if nonzero
        jnz     LExit                   // done if not equal
        dec     %edx                    // more to go?
        jnz     LLoopOverBytes

        jmp     LNextChunk              // we've come to end of page


// Loop over 16-byte chunks.
//      %esi = LHS ptr
//      %edi = RHS ptr
//      %edx = chunk count
// Each pass loads 16 bytes from each string with unaligned loads and builds a
// 16-bit mask in %eax with bit n set when byte n differs or the LHS byte n is 0.

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverChunks:
        movdqu  (%esi),%xmm1            // get LHS
        movdqu  (%edi),%xmm2            // get RHS
        pxor    %xmm0,%xmm0             // get some 0s in the shadow of the loads
        addl    $16,%esi
        pcmpeqb %xmm1,%xmm2             // compare LHS to RHS (equal bytes become 0xFF in %xmm2)
        pcmpeqb %xmm1,%xmm0             // compare LHS to 0s (zero LHS bytes become 0xFF in %xmm0)
        addl    $16,%edi
        pmovmskb %xmm2,%eax             // get result mask for comparison of LHS and RHS
        pmovmskb %xmm0,%ecx             // get result mask for 0 check
        xorl    $0xFFFF,%eax            // complement compare mask so 1 means "not equal"
        orl     %ecx,%eax               // combine the masks and check for 1-bits
        jnz     LFoundDiffOr0           // we found differing bytes or a 0-byte
        dec     %edx                    // more to go?
        jnz     LLoopOverChunks

        jmp     LNextChunk              // compare up to next page boundary


// Found a zero and/or a difference in vector compare.
//      %esi = LHS ptr, already advanced by 16
//      %edi = RHS ptr, already advanced by 16
//      %eax = bit n set if bytes n differed or were 0
// The lowest set bit identifies the first such byte; the two bytes at that
// index are reloaded and subtracted to form the return value.

LFoundDiffOr0:
        bsf     %eax,%edx               // which byte differed or was 0?
        subl    $16,%esi                // point to start of vectors while we wait for bit scan
        subl    $16,%edi
        movzb   (%esi,%edx),%eax        // get LHS byte
        movzb   (%edi,%edx),%ecx        // get RHS byte
        subl    %ecx,%eax               // compute difference (ie, return value)
        popl    %edi                    // restore saved registers in reverse push order
        popl    %esi
        ret


// Found a zero and/or difference in byte loop.
//      %eax = LHS byte
//      %ecx = RHS byte
// LExit0 is entered with the LHS byte equal to 0, before the subtraction has
// been done; LExit is entered with the nonzero difference already computed.

LExit0:
        subl    %ecx,%eax               // compute difference (ie, return value)
LExit:                                  // here with difference already in %eax
        popl    %edi                    // restore saved registers in reverse push order
        popl    %esi
        ret