/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <ppc/asm.h>
#include <ppc/exception.h>
#include <assym.s>

        .text
        .align  2
        .globl  _memset
        .globl  _bzero
        .globl  _bzero_nc
        .globl  _bzero_phys


// ***********************
// * B Z E R O _ P H Y S *
// ***********************
//
//	void bzero_phys(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5.  We leave cache on.

        .align  5
LEXT(bzero_phys)
        mflr    r12                         // save return address
        rlwinm  r3,r3,0,1,0                 // coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi  r3,r4,0,0,31
        mr      r4,r5                       // put length where bzero() expects it
        bl      EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero)                  // use normal bzero() routine
        mtlr    r12                         // restore return address
        b       EXT(ml_restore)             // restore MSR, turning DR on and SF off


// *******************
// * B Z E R O _ N C *
// *******************
//
//	void bzero_nc(char *addr, unsigned int length);
//
// For use with uncached memory.  Doesn't seem to be used at all, so probably not
// performance critical.  NB: we must avoid unaligned stores, because some
// machines (eg, 970) take alignment exceptions on _any_ unaligned op to uncached
// memory.  Of course, we must also avoid dcbz.
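//
// As a rough guide to the code below, here is a hedged C-level sketch of the
// same strategy (illustrative only; "bzero_nc_sketch" is a made-up name, and
// the tail handling is simplified -- the real code joins bzero_tail):
//
//	#include <stdint.h>
//	void bzero_nc_sketch(char *addr, unsigned int len) {
//	    if (len < 20) {                     // too short to bother aligning (bznc0)
//	        while (len--) *addr++ = 0;      // plain byte loop
//	        return;
//	    }
//	    while ((uintptr_t)addr & 7) {       // byte loop until doubleword aligned (bznc2)
//	        *addr++ = 0;
//	        len--;
//	    }
//	    while (len >= 8) {                  // naturally aligned 8-byte stores
//	        *(uint64_t *)addr = 0;          // (bzero_tail really uses 16/8/4/2/1-byte stores)
//	        addr += 8;
//	        len -= 8;
//	    }
//	    while (len--) *addr++ = 0;          // finish remaining bytes
//	}
//
// Once the alignment loop has run, every store is naturally aligned, which is
// what keeps uncached mappings from taking alignment exceptions; dcbz is never
// used on this path.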
LEXT(bzero_nc)
        cmplwi  cr1,r4,20               // too short to bother with 16-byte loops?
        cmplwi  cr7,r4,0                // check for (len==0)
        li      r6,0                    // get a 0
        bge     cr1,bznc1               // skip if length >=20
        mtctr   r4                      // set up byte loop
        beqlr-- cr7                     // done if len=0

// Short operands, loop over bytes.

bznc0:
        stb     r6,0(r3)
        addi    r3,r3,1
        bdnz    bznc0
        blr

// Handle operands long enough to do doubleword stores; we must doubleword
// align, to avoid alignment exceptions.

bznc1:
        neg     r7,r3                   // start to compute #bytes to align
        mfsprg  r10,2                   // get feature flags
        andi.   r0,r7,7                 // get #bytes to doubleword align
        mr      r5,r3                   // make copy of operand ptr as bcopy expects
        mtcrf   0x02,r10                // put pf64Bitb etc in cr6
        beq     bzero_tail              // already doubleword aligned
        sub     r4,r4,r0                // adjust count
        mtctr   r0                      // set up loop
bznc2:                                  // zero bytes until doubleword aligned
        stb     r6,0(r5)
        addi    r5,r5,1
        bdnz    bznc2
        b       bzero_tail              // join bzero, now that r5 is aligned


// *************     ***************
// * B Z E R O * and * M E M S E T *
// *************     ***************
//
//	void *	memset(void *b, int c, size_t len);
//	void	bzero(void *b, size_t len);
//
// These routines support G3, G4, and the 970, and run in both 32 and
// 64-bit mode.  Lengths (size_t) are always 32 bits.
//
// Register use:
//	r0 = temp
//	r2 = temp
//	r3 = original ptr, not changed since memset returns it
//	r4 = count of bytes to set
//	r5 = working operand ptr ("rp")
//	r6 = value to store (usually 0)
//   r7-r9 = temps
//	r10 = feature flags
//	r11 = old MSR (if bzero_phys)
//	r12 = return address (if bzero_phys)
//	cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)

        .align  5
LEXT(memset)                            // void *memset(void *b, int c, size_t len);
        andi.   r6,r4,0xFF              // copy value to working register, test for 0
        mr      r4,r5                   // move length to working register
        bne--   memset1                 // skip if nonzero
LEXT(bzero)                             // void bzero(void *b, size_t len);
        dcbtst  0,r3                    // touch in 1st cache block
        mfsprg  r10,2                   // get features
        li      r6,0                    // get a 0
        neg     r7,r3                   // start to compute #bytes to align
        andi.   r0,r10,pf128Byte+pf32Byte // get cache line size
        mtcrf   0x02,r10                // put pf128Byte etc in cr6
        cmplw   r4,r0                   // operand length >= cache line size?
        mr      r5,r3                   // make copy of operand ptr (can't change r3)
        blt     bzero_tail              // too short for dcbz (or dcbz128)
        rlwinm  r0,r7,0,0x1F            // get #bytes to 32-byte align
        rlwinm  r9,r7,0,0x7F            // get #bytes to 128-byte align
        bt++    pf128Byteb,bzero_128    // skip if 128-byte processor

// Operand length >=32 and cache line size is 32.
//	r0 = #bytes to 32-byte align
//	r4 = length
//	r5 = ptr to operand
//	r6 = 0

        sub     r2,r4,r0                // adjust length
        cmpwi   cr1,r0,0                // already 32-byte aligned?
        srwi.   r8,r2,5                 // get #32-byte chunks
        beq     bzero_tail              // not long enough to dcbz
        mtctr   r8                      // set up loop count
        rlwinm  r4,r2,0,27,31           // mask down to leftover byte count
        beq     cr1,bz_dcbz32           // skip if already 32-byte aligned

// 32-byte align.  We just store 32 0s, rather than test and use conditional
// branches.  This is usually faster, because there are no mispredicts.

        stw     r6,0(r5)                // zero next 32 bytes
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        stw     r6,16(r5)
        stw     r6,20(r5)
        stw     r6,24(r5)
        stw     r6,28(r5)
        add     r5,r5,r0                // now r5 is 32-byte aligned
        b       bz_dcbz32

// Loop doing 32-byte version of DCBZ instruction.

        .align  4                       // align the inner loop
bz_dcbz32:
        dcbz    0,r5                    // zero another 32 bytes
        addi    r5,r5,32
        bdnz    bz_dcbz32
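// Taken together, the 32-byte-line path above amounts to roughly the following
// C sketch (hedged and illustrative only; store_32_zero_bytes() and dcbz32()
// are made-up stand-ins for the unconditional stw block and the dcbz
// instruction, and the leftover bytes are really handled by bzero_tail):
//
//	void bzero32_sketch(char *p, size_t len) {
//	    size_t pre    = (0 - (uintptr_t)p) & 31;  // bytes needed to 32-byte align
//	    size_t chunks = (len - pre) >> 5;         // whole 32-byte lines to zero
//	    if (chunks == 0)                          // too short: bzero_tail does it all
//	        return;
//	    if (pre) {                                // align by storing 32 zeros outright,
//	        store_32_zero_bytes(p);               // avoiding data-dependent branches
//	        p += pre;
//	    }
//	    while (chunks--) {                        // dcbz zeroes a whole cache line
//	        dcbz32(p);                            // without reading memory first
//	        p += 32;
//	    }
//	    /* the (len - pre) & 31 leftover bytes fall through to bzero_tail */
//	}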
// Store trailing bytes.  This routine is used both by bzero and memset.
//	r4 = #bytes to store (may be large if memset)
//	r5 = address
//	r6 = value to store (in all 8 bytes)
//	cr6 = pf64Bit etc flags

bzero_tail:
        srwi.   r0,r4,4                 // get #(16-byte-chunks)
        mtcrf   0x01,r4                 // remaining byte count to cr7
        beq     bzt3                    // no 16-byte chunks
        mtctr   r0                      // set up loop count
        bt++    pf64Bitb,bzt2           // skip if 64-bit processor
        b       bzt1

        .align  5
bzt1:                                   // loop over 16-byte chunks on 32-bit processor
        stw     r6,0(r5)
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        addi    r5,r5,16
        bdnz    bzt1
        b       bzt3

        .align  5
bzt2:                                   // loop over 16-byte chunks on 64-bit processor
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bdnz    bzt2
        bf      28,bzt4                 // 8-byte chunk?
        std     r6,0(r5)
        addi    r5,r5,8
        b       bzt4
bzt3:
        bf      28,bzt4                 // 8-byte chunk?
        stw     r6,0(r5)
        stw     r6,4(r5)
        addi    r5,r5,8
bzt4:
        bf      29,bzt5                 // word?
        stw     r6,0(r5)
        addi    r5,r5,4
bzt5:
        bf      30,bzt6                 // halfword?
        sth     r6,0(r5)
        addi    r5,r5,2
bzt6:
        bflr    31                      // byte?
        stb     r6,0(r5)
        blr

// Operand length is >=128 and cache line size is 128.  We assume that
// because the linesize is 128 bytes, this is a 64-bit processor.
//	r4 = length
//	r5 = ptr to operand
//	r6 = 0
//	r7 = neg(r5)
//	r9 = #bytes to 128-byte align

        .align  5
bzero_128:
        sub     r2,r4,r9                // r2 <- length remaining after cache-line aligning
        rlwinm  r0,r7,0,0xF             // r0 <- #bytes to 16-byte align
        srwi.   r8,r2,7                 // r8 <- number of cache lines to 0
        std     r6,0(r5)                // always store 16 bytes to 16-byte align...
        std     r6,8(r5)                // ...even if too short for dcbz128
        add     r5,r5,r0                // 16-byte align ptr
        sub     r4,r4,r0                // adjust count
        beq     bzero_tail              // r8==0, not long enough to dcbz128
        sub.    r7,r9,r0                // get #bytes remaining to 128-byte align
        rlwinm  r4,r2,0,0x7F            // r4 <- length remaining after dcbz128'ing
        mtctr   r8                      // set up dcbz128 loop
        beq     bz_dcbz128              // already 128-byte aligned
        b       bz_align                // enter loop over 16-byte chunks

// 128-byte align by looping over 16-byte chunks.

        .align  5
bz_align:                               // loop over 16-byte chunks
        subic.  r7,r7,16                // more to go?
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bgt     bz_align
        b       bz_dcbz128              // enter dcbz128 loop

// Loop over 128-byte cache lines.
//	r4 = length remaining after cache lines (0..127)
//	r5 = ptr (128-byte aligned)
//	r6 = 0
//	ctr = count of cache lines to 0

        .align  5
bz_dcbz128:
        dcbz128 0,r5                    // zero a 128-byte cache line
        addi    r5,r5,128
        bdnz    bz_dcbz128
        b       bzero_tail              // handle leftovers

// Handle memset() for nonzero values.  This case is relatively infrequent;
// the large majority of memset() calls are for 0.
//	r3 = ptr
//	r4 = count
//	r6 = value in lower byte (nonzero)

memset1:
        cmplwi  r4,16                   // too short to bother aligning?
        rlwimi  r6,r6,8,16,23           // replicate value to low 2 bytes
        mr      r5,r3                   // make working copy of operand ptr
        rlwimi  r6,r6,16,0,15           // value now in all 4 bytes
        blt     bzero_tail              // length<16, we won't be using "std"
        mfsprg  r10,2                   // get feature flags
        neg     r7,r5                   // start to compute #bytes to align
        rlwinm  r6,r6,0,1,0             // value now in all 8 bytes (if 64-bit)
        andi.   r0,r7,7                 // r0 <- #bytes to doubleword align
        stw     r6,0(r5)                // store 8 bytes to avoid a loop
        stw     r6,4(r5)
        mtcrf   0x02,r10                // get pf64Bit flag etc in cr6
        sub     r4,r4,r0                // adjust count
        add     r5,r5,r0                // doubleword align ptr
        b       bzero_tail
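// For reference, the byte-replication done by memset1 above is equivalent to
// this small C helper (a hedged sketch only; "replicate_byte" is a made-up
// name, and the asm keeps the value in r6 rather than returning it):
//
//	#include <stdint.h>
//	static uint64_t replicate_byte(int c) {
//	    uint32_t v = (uint32_t)c & 0xFF;        // andi.  r6,r4,0xFF
//	    v |= v << 8;                            // rlwimi r6,r6,8,16,23   (low 2 bytes)
//	    v |= v << 16;                           // rlwimi r6,r6,16,0,15   (all 4 bytes)
//	    return ((uint64_t)v << 32) | v;         // rlwinm r6,r6,0,1,0     (all 8 bytes)
//	}
//
// Once the value fills all 4 (or 8) bytes of r6, the trailing-store logic in
// bzero_tail works unchanged for memset, which is why memset1 simply rejoins it.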