/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 *  extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *      uint32_t len, uint32_t sum0);
 *
 *  Copies len bytes from src to dst while accumulating the 16-bit
 *  one's-complement (Internet checksum style) sum of the copied bytes.
 *
 *  target : 32-bit ARM, Thumb-2 encoding, NEON, unified assembler syntax,
 *           little-endian (see BYTE_ORDER below)
 *
 *  input :
 *      src (r0) : source starting address (any alignment)
 *      dst (r1) : destination starting address
 *      len (r2) : byte stream length
 *      sum0 (r3): initial 32-bit sum
 *
 *  output :
 *      the source byte stream is copied into the destination buffer
 *
 *      r0 = sum0 plus the sum of the byte stream, accumulated in 32 bits
 *      and then folded to 16 bits (two folds are performed at the end of
 *      this routine), WITHOUT the final 1's complement; the caller is
 *      responsible for performing the 1's complement if applicable
 *      (folding the returned value again is harmless)
 *
 *  registers :
 *      r4, r5, r7 and lr are saved on entry and restored on exit
 *      r0-r3 and r12 are used as scratch
 *      q0-q3 and q8-q15 are used as scratch; they are saved and restored
 *      only when KERNEL is defined; q4-q7 are never touched
 *
 *  NOTE(review): the vector accumulators are 32-bit lanes and are reduced
 *  with plain 32-bit adds (no end-around carry), so the sum of all 16-bit
 *  words must fit in 32 bits; that holds for len up to roughly 128 KiB.
 *  len is also compared with signed branches (blt/bge), so it must be
 *  below 2^31.  Confirm that callers respect these limits.
 */

/*
 *  the following definitions default the implementation to little-endian
 *  architectures
 */
#define LITTLE_ENDIAN	1
#define BYTE_ORDER	LITTLE_ENDIAN

/*
 *  renaming registers to ease code porting from arm64
 */
#define v0	q0
#define v1	q1
#define v2	q2
#define v3	q3
#define v8	q8
#define v9	q9
#define v10	q10
#define v11	q11
#define v12	q12
#define v13	q13
#define v14	q14
#define v15	q15

	.text
	.syntax	unified
	.align	2
	.code	16
	.thumb_func _os_cpu_copy_in_cksum
	.globl	_os_cpu_copy_in_cksum
_os_cpu_copy_in_cksum:

#define src		r0
#define dst		r1
#define len		r2
#define sum		r3
#define need_swap	r4
#define partial		r5
#define t		r12

	push	{r4,r5,r7,lr}
	add	r7, sp, #8	/* set up base pointer for debug tracing */

	mov	partial, #0	/* partial = 0; */
	mov	need_swap, #0	/* need_swap = 0; */

	/* nothing to copy: only fold sum0 (L_len_0 is out of cbz range) */
	cbnz	len, 0f
	b	L_len_0
0:
	/*
	 * Deal with an odd-addressed first byte: copy it on its own and
	 * deposit it in the high byte of the 16-bit word kept in partial.
	 * need_swap records that the accumulated sum is byte-rotated and has
	 * to be rotated back before the final fold.
	 *
	 *	if ((uintptr_t)src & 1) {
	 *		partial = *src << 8;
	 *		*dst++ = *src++;
	 *		--len;
	 *		need_swap = 1;
	 *	}
	 */
	tst	src, #1
	beq	1f
	ldrb	partial, [src]
	add	src, src, #1
	strb	partial, [dst], #1
#if BYTE_ORDER == LITTLE_ENDIAN
	lsl	partial, partial, #8
#endif
	subs	len, len, #1
	/* mov (no S suffix) leaves the flags from subs intact for the beq */
	mov	need_swap, #1
	beq	L_len_0		/* that single byte was the whole buffer */

1:
#ifdef KERNEL
	/* save the NEON registers used below (restored before L_len_0) */
	vpush	{v8-v15}
	vpush	{v0-v3}
#endif

	/*
	 * v0,v1 hold the running sums from here until the final reduction;
	 * v2,v3 are additional accumulators used only by the 128-byte path.
	 * If fewer than 8*16 bytes remain, try 4*16 bytes next.
	 */
	veor	v0, v0, v0
	veor	v1, v1, v1
	cmp	len, #8*16
	vmov	s0, partial	/* seed the lowest 32-bit lane of v0 with partial */
	blt	L64_bytes

	/*
	 * preload the first 8*16 bytes into q8-q15 and clear v2,v3.
	 * len is pre-decremented by 2*8*16: once for the block just loaded and
	 * once for the block the loop would load next, so a negative result
	 * means there is no further full block and we only have to finish up.
	 */
	vld1.8	{q8,q9}, [src]!
	veor	v2, v2, v2
	vld1.8	{q10,q11}, [src]!
	veor	v3, v3, v3
	vld1.8	{q12,q13}, [src]!
	subs	len, len, #2*8*16
	vld1.8	{q14,q15}, [src]!
	blt	L128_finishup

	/*
	 * per iteration: pairwise-add the 64 halfwords already held in
	 * q8-q15 into the sixteen 32-bit lanes of v0-v3, store those 8*16
	 * bytes to dst, and load the next 8*16 bytes from src.  Loads, stores
	 * and accumulates are interleaved on purpose; keep the order.
	 */
L128_loop:
	vpadal.u16	v0, v8
	vst1.8	{q8,q9}, [dst]!
	vpadal.u16	v1, v9
	vld1.8	{q8,q9}, [src]!

	vpadal.u16	v2, v10
	vst1.8	{q10,q11}, [dst]!
	vpadal.u16	v3, v11
	vld1.8	{q10,q11}, [src]!

	vpadal.u16	v0, v12
	vst1.8	{q12,q13}, [dst]!
	vpadal.u16	v1, v13
	vld1.8	{q12,q13}, [src]!

	vpadal.u16	v2, v14
	vst1.8	{q14,q15}, [dst]!
	vpadal.u16	v3, v15
	vld1.8	{q14,q15}, [src]!

	subs	len, len, #8*16
	bge	L128_loop

	/* accumulate and store the last loaded 8*16 bytes, no further load */
L128_finishup:
	vpadal.u16	v0, v8
	vst1.8	{q8,q9}, [dst]!
	vpadal.u16	v1, v9

	vpadal.u16	v2, v10
	vst1.8	{q10,q11}, [dst]!
	vpadal.u16	v3, v11

	vpadal.u16	v0, v12
	vst1.8	{q12,q13}, [dst]!
	vpadal.u16	v1, v13

	vpadal.u16	v2, v14
	vst1.8	{q14,q15}, [dst]!
	vpadal.u16	v3, v15

	add	len, len, #8*16	/* undo the extra pre-decrement: len = bytes left */

	/* merge the four accumulators down to v0,v1 */
	vadd.i32	v0, v0, v2
	vadd.i32	v1, v1, v3

	/* 4*16 bytes, if available */
L64_bytes:
	cmp	len, #4*16
	blt	L32_bytes

	vld1.8	{q8,q9}, [src]!
	vld1.8	{q10,q11}, [src]!

	vpadal.u16	v0, v8
	vst1.8	{q8,q9}, [dst]!
	vpadal.u16	v1, v9

	vpadal.u16	v0, v10
	vst1.8	{q10,q11}, [dst]!
	vpadal.u16	v1, v11

	sub	len, len, #4*16

	/* 2*16 bytes, if available */
L32_bytes:
	cmp	len, #2*16
	blt	L16_bytes

	vld1.8	{q8,q9}, [src]!

	vpadal.u16	v0, v8
	vst1.8	{q8,q9}, [dst]!
	vpadal.u16	v1, v9

	sub	len, len, #2*16

	/* merge v1 into v0, then 16 bytes, if available */
L16_bytes:
	vadd.i32	v0, v0, v1

	cmp	len, #16
	blt	L8_bytes

	vld1.8	{q8}, [src]!
	vpadal.u16	v0, v8
	vst1.8	{q8}, [dst]!

	sub	len, len, #16

	/* 8 bytes, if available: d2 is the low half of v1, high half stays 0 */
L8_bytes:
	veor	v1, v1, v1
	tst	len, #8
	beq	L4_bytes

	vld1.8	{d2}, [src]!
	vst1.8	{d2}, [dst]!
	vpadal.u16	v0, v1

	/*
	 * reduce the four 32-bit lanes of v0 to a single word in partial and
	 * leave 0..7 bytes for the scalar tail.  The NEON instructions and the
	 * vpop sequence below do not modify the condition flags, so the Z flag
	 * set by ands is still valid at the beq.
	 */
L4_bytes:
	ands	len, len, #7
	vpadd.i32	d0, d0, d1
	vpadd.i32	d0, d0, d1
	vmov	partial, s0

#ifdef KERNEL
	/* restore in the reverse order of the vpush sequence above */
	vpop	{q0-q1}
	vpop	{q2-q3}
	vpop	{q8-q9}
	vpop	{q10-q11}
	vpop	{q12-q13}
	vpop	{q14-q15}
#endif

	beq	L_len_0		/* no tail bytes left */

	/* copy and add the remaining halfwords; len ends at -2 or -1 */
	subs	len, len, #2
	blt	L_trailing_bytes

L2_bytes:
	ldrh	t, [src], #2
	strh	t, [dst], #2
	add	partial, partial, t
	subs	len, len, #2
	bge	L2_bytes

	/* len is -1 (odd) when exactly one byte is left, -2 (even) otherwise */
L_trailing_bytes:
	tst	len, #1
	beq	L_len_0
	ldrb	t, [src], #1
	strb	t, [dst], #1
#if BYTE_ORDER != LITTLE_ENDIAN
	lsl	t, t, #8
#endif
	add	partial, partial, t

L_len_0:
	/*
	 * undo the byte rotation introduced by an odd starting address
	 *
	 *	if (need_swap)
	 *		partial = (partial << 8) + (partial >> 24);
	 */
	cbz	need_swap, 1f
	lsl	t, partial, #8
	add	partial, t, partial, lsr #24

1:
	movw	lr, #0xffff	/* lr was saved by the push; reuse it as the mask */

	/* final_acc = (sum0 >> 16) + (sum0 & 0xffff); */
	and	r0, sum, lr
	add	r0, r0, sum, lsr #16

	/* final_acc += (partial >> 16) + (partial & 0xffff); */
	add	r0, r0, partial, lsr #16
	and	partial, partial, lr
	add	r0, r0, partial

	/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
	and	t, r0, lr
	add	r0, t, r0, lsr #16

	/*
	 * One final fold in case of carry from the previous one.
	 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	 */
	and	t, r0, lr
	add	r0, t, r0, lsr #16

	/*
	 * The 1's complement is intentionally left to the caller:
	 *
	 *	return (~final_acc & 0xffff);
	 *
	 *	mvn	r0, r0
	 *	and	r0, r0, lr
	 */

	pop	{r4,r5,r7,pc}