/*
 * Copyright (c) 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */

/*****************************************************************************
 * Cortex-A8 implementation                                                  *
 *****************************************************************************/
 
// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
//
// Our tests have shown that NEON is always a performance win for memcpy( ).
// However, for the specific case of copies from a warm source to a cold
// destination when the buffer size is between 1k and 32k, it is not enough
// of a performance win to offset the increased power footprint, resulting
// in an energy usage regression.  Thus, we detect that particular case, and
// pass those copies through the ARM core registers.  All other copies larger
// than 8 bytes are handled on NEON.
//
// Stephen Canon, August 2009
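//
// A rough roadmap of the control flow below (an added editorial sketch, not
// part of the original commentary; sizes are approximate):
//
//     choose direction:  copy back-to-front iff 0 < (dest - source) < length
//     if length < 8, copy byte-by-byte and return
//     copy single bytes until dest is 8-byte aligned
//     dispatch on (source & 3):
//         nonzero:                        NEON loop with a vext.8 software
//                                         alignment fixup
//         zero, length roughly 1k..32k:   LDM/STM core-register loop
//         zero, otherwise:                NEON loop, 64 bytes per iteration
//     finish any sub-8-byte tail byte-by-byte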

.text
.code 16
.syntax unified

// void bcopy(const void * source,
//            void * destination,
//            size_t length);
//
// void *memmove(void * destination,
//               const void * source,
//               size_t n);
//
// void *memcpy(void * restrict destination,
//              const void * restrict source,
//              size_t n);
//
// all copy n successive bytes from source to destination. memmove and memcpy
// return destination, whereas bcopy has no return value. copying takes place
// as if it were through a temporary buffer -- after return destination contains
// exactly the bytes from source, even if the buffers overlap.
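//
// For reference only (an added sketch, not the implementation below), the
// semantics correspond to this C model; bcopy simply swaps its first two
// arguments into memmove.  The pointer subtraction assumes a flat address
// space, which is what the assembly below relies on as well:
//
//     void *memmove(void *dst, const void *src, size_t n) {
//         unsigned char *d = dst;
//         const unsigned char *s = src;
//         if ((size_t)(d - s) >= n) {         // dest below src, or no overlap
//             while (n--) *d++ = *s++;        // copy front to back
//         } else {
//             d += n; s += n;
//             while (n--) *--d = *--s;        // copy back to front
//         }
//         return dst;
//     }
//
//     void bcopy(const void *src, void *dst, size_t n) { memmove(dst, src, n); }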

.thumb_func _bcopy
.globl _bcopy    
.thumb_func _memmove
.globl _memmove
.thumb_func _memcpy
.globl _memcpy

.align 2
_bcopy:
    mov       r3,      r0           // swap the first and second arguments
    mov       r0,      r1           // and fall through into memmove
    mov       r1,      r3           //

.align 2
_memmove:
_memcpy:
    subs      r3,      r0,  r1      // offset = destination addr - source addr
    it        eq
    bxeq      lr                    // if source == destination, early out

//  Our preference is for using a (faster) front-to-back copy.  However, if
//  0 < offset < length, it is necessary to copy back-to-front for correctness.
//  We have already ruled out offset == 0, so we can use an unsigned compare
//  with length -- if offset is higher or the same, then offset is either at
//  least as large as length or negative, and a front-to-back copy is safe.
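//  (Added note: this is the classic `(size_t)(dest - source) < length` overlap
//  test in C terms -- a negative offset wraps around to a huge unsigned value,
//  so the single unsigned compare also covers "destination precedes source".)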

    cmp       r3,      r2
    bhs       L_copyFrontToBack
                             
/*****************************************************************************
 *  back to front copy                                                       *
 *****************************************************************************/

    mov       ip,      r0           // copy destination pointer.
    add       r1,           r2      // move source pointer to end of source array
    add       ip,           r2      // move destination pointer to end of dest array
    
    subs      r2,           $8      // if length - 8 is negative (i.e. length
    blt       L_scalarReverseCopy   // is less than 8), jump to cleanup path.
    tst       ip,           $7      // if (destination + length) is doubleword
    beq       L_vectorReverseCopy   // aligned, jump to fast path.
    
0:  ldrb      r3,     [r1, $-1]!    // load byte
    sub       r2,           $1      // decrement length
    strb      r3,     [ip, $-1]!    // store byte
    tst       ip,           $7      // test alignment
    bne       0b
    
    cmp       r2,           $0      // if length - 8 is negative,
    blt       L_scalarReverseCopy   // jump to the cleanup code
                                    
/*****************************************************************************
 *  destination is doubleword aligned                                        *
 *****************************************************************************/

L_vectorReverseCopy:
    ands      r3,      r1,  $3      // Extract the alignment of the source
    bic       r1,           $3
    tbh      [pc, r3, lsl $1]       // Dispatch table on source alignment
0:  
.short (L_reverseAligned0-0b)/2     // The NEON alignment hardware does not work
.short (L_reverseAligned1-0b)/2     // properly with sub 4-byte alignment and
.short (L_reverseAligned2-0b)/2     // buffers that are uncacheable, so we need
.short (L_reverseAligned3-0b)/2     // to have a software workaround.
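// (Added note: tbh reads the halfword at pc + 2*r3 and branches forward by
// twice its value; the table base is label 0 immediately after the tbh
// instruction, so each entry above is (target - table base)/2.)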

/*****************************************************************************
 *  source is also at least word aligned                                     *
 *****************************************************************************/
    
L_reverseAligned0:
    subs      r2,           $0x38   // if length - 64 is negative, jump to
    blt       L_reverseVectorCleanup// the cleanup path.
    tst       ip,           $0x38   // if (destination + length) is cacheline
    beq       L_reverseCachelineAligned // aligned, jump to the fast path.
    
0:  sub       r1,           $8      // copy eight bytes at a time until the
    vld1.32  {d0},    [r1]          // destination is 8 byte aligned.
    sub       ip,           $8      //
    sub       r2,           $8      //
    tst       ip,           $0x38   //
    vst1.64  {d0},    [ip, :64]     //
    bne       0b                    //
    
    cmp       r2,           $0      // if length - 64 is negative,
    blt       L_reverseVectorCleanup// jump to the cleanup code
    
L_reverseCachelineAligned:
    sub       r3,      r2,  $0x3c0  // If 1024 < length < 32768, use core
    cmp       r3,          $0x7c00  // register copies instead of NEON to
    blo       L_useSTMDB            // control energy usage.
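    // (Added note: r2 holds the remaining length minus 64 at this point, so
    //  r3 = remaining length - 1024, and the unsigned compare against 0x7c00
    //  (31744) is a single range check for 1024 <= remaining length < 32768.)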
    
    sub       r1,           $32     // decrement source
    sub       ip,           $32     // decrement destination
    mov       r3,           $-32    // load address increment
    tst       r1,           $0x1f   // if source shares 32 byte alignment
    beq       L_reverseSourceAligned// jump to loop with more alignment hints
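    // (Added note: both loops below are software pipelined -- each iteration
    //  loads the next 32 bytes into one q-register pair while storing the 32
    //  bytes loaded in the previous half-iteration from the other pair, so
    //  loads and stores overlap rather than serialize.)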
    
    vld1.32  {q2,q3}, [r1], r3      // This loop handles 4-byte aligned copies
    vld1.32  {q0,q1}, [r1], r3      // as generally as possible.
    subs      r2,           $64     // 
    vst1.64  {q2,q3}, [ip,:256], r3 // The Cortex-A8 NEON unit does not always
    blt       1f                    // properly handle misalignment in vld1
.align 3                            // with an element size of 8 or 16, so
0:  vld1.32  {q2,q3}, [r1], r3      // this is the best we can do without
    vst1.64  {q0,q1}, [ip,:256], r3 // handling alignment in software.
    vld1.32   {q0,q1}, [r1], r3     // 
    subs      r2,           $64     // 
    vst1.64  {q2,q3}, [ip,:256], r3 // 
    bge       0b                    // 
    b         1f                    // 
    
L_reverseSourceAligned:
    vld1.64  {q2,q3}, [r1,:256], r3 // Identical to loop above except for
    vld1.64  {q0,q1}, [r1,:256], r3 // additional alignment information; this
    subs      r2,           $64     // gets an additional .5 bytes per cycle
    vst1.64  {q2,q3}, [ip,:256], r3 // on Cortex-A8.
    blt       1f                    // 
.align 3                            // 
0:  vld1.64  {q2,q3}, [r1,:256], r3 //
    vst1.64  {q0,q1}, [ip,:256], r3 //
    vld1.64  {q0,q1}, [r1,:256], r3 //
    subs      r2,           $64     //
    vst1.64  {q2,q3}, [ip,:256], r3 //
    bge       0b                    //
1:  vst1.64  {q0,q1}, [ip,:256], r3 // loop cleanup: final 32 byte store
    add       r1,           $32     // point source at last element stored
    add       ip,           $32     // point destination at last element stored
    
L_reverseVectorCleanup:
    adds      r2,           $0x38   // If (length - 8) < 0, goto scalar cleanup
    blt       L_scalarReverseCopy   //

0:  sub       r1,           $8      // copy eight bytes at a time until
    vld1.32  {d0},    [r1]          // (length - 8) < 0.
    sub       ip,           $8      //
    subs      r2,           $8      //
    vst1.64  {d0},    [ip, :64]     //
    bge       0b                    //

/*****************************************************************************
 *  sub-doubleword cleanup copies                                            *
 *****************************************************************************/

L_scalarReverseCopy:
    adds      r2,           #0x8    // restore length
    it        eq                    // if this is zero
    bxeq      lr                    // early out
         
0:  ldrb      r3,     [r1, #-1]!    // load a byte from source
    strb      r3,     [ip, #-1]!    // store to destination
    subs      r2,           #0x1    // subtract one from length
    bne       0b                    // if non-zero, repeat
    bx        lr                    // return
         
/*****************************************************************************
 *  STMDB loop for 1k-32k buffers                                            *
 *****************************************************************************/
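// (Added note: each iteration moves 64 bytes through the eight core registers
//  r3-r8, r10, r11 as two 32-byte LDM/STM pairs, with pld prefetching the next
//  cache line behind the descending source pointer.)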

L_useSTMDB:
    push     {r4-r8,r10,r11}
.align 3
0:  ldmdb     r1!,  {r3-r8,r10,r11}
    subs      r2,           #0x40
    stmdb     ip!,  {r3-r8,r10,r11}
    ldmdb     r1!,  {r3-r8,r10,r11}
    pld      [r1, #-0x40]
    stmdb     ip!,  {r3-r8,r10,r11}
    bge       0b
    pop      {r4-r8,r10,r11}
    b         L_reverseVectorCleanup
    
/*****************************************************************************
 *  Misaligned reverse vld1 loop                                             *
 *****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.  Load two 4-byte aligned double words from source, 
// use vext.8 to extract a double word to store, and perform an 8-byte aligned
// store to destination.
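//
// In rough terms (an added sketch): d2 and d3 hold 16 consecutive 4-byte-
// aligned source bytes (d2 the lower-addressed eight, d3 the upper eight).
// vext.8 d0, d2, d3, #offset picks out the eight bytes beginning `offset`
// bytes into that window -- exactly the doubleword that belongs at the next
// aligned destination slot -- and the window then slides eight bytes toward
// lower addresses (d3 = d2, reload d2), so every store stays 8-byte aligned
// even though the source is not.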

#define RCOPY_UNALIGNED(offset)      \
    subs      r2,          $8       ;\
    blt       2f                    ;\
    sub       r1,          $8       ;\
    sub       ip,          $8       ;\
    mov       r3,          $-8      ;\
    vld1.32  {d2,d3}, [r1], r3      ;\
    subs      r2,          $8       ;\
    blt       1f                    ;\
0:  vext.8    d0,  d2, d3, $(offset);\
    vmov      d3,      d2           ;\
    vld1.32  {d2},    [r1], r3      ;\
    subs      r2,          $8       ;\
    vst1.64  {d0},    [ip, :64], r3 ;\
    bge       0b                    ;\
1:  vext.8    d0,  d2, d3, $(offset);\
    add       r1,          $8       ;\
    vst1.64  {d0},    [ip, :64]     ;\
2:  add       r2,          $8       ;\
    add       r1,          $(offset);\
    b         L_scalarReverseCopy

L_reverseAligned1:
    RCOPY_UNALIGNED(1)
L_reverseAligned2:
    RCOPY_UNALIGNED(2)
L_reverseAligned3:
    RCOPY_UNALIGNED(3)

/*****************************************************************************
 *  front to back copy                                                       *
 *****************************************************************************/

L_copyFrontToBack:
    mov       ip,      r0           // copy destination pointer.
    subs      r2,           $8      // if length - 8 is negative (i.e. length
    blt       L_scalarCopy          // is less than 8), jump to cleanup path.
    tst       ip,           $7      // if the destination is doubleword
    beq       L_vectorCopy          // aligned, jump to fast path.
    
0:  ldrb      r3,     [r1], $1      // load byte
    sub       r2,           $1      // decrement length
    strb      r3,     [ip], $1      // store byte
    tst       ip,           $7      // test alignment
    bne       0b
    
    cmp       r2,           $0      // if length - 8 is negative,
    blt       L_scalarCopy          // jump to the cleanup code
    
/*****************************************************************************
 *  destination is doubleword aligned                                        *
 *****************************************************************************/

L_vectorCopy:
    ands      r3,      r1,  $3      // Extract the alignment of the source
    bic       r1,           $3
    tbh      [pc, r3, lsl $1]       // Dispatch table on source alignment
0:  
.short (L_sourceAligned0-0b)/2      // The NEON alignment hardware does not work
.short (L_sourceAligned1-0b)/2      // properly with sub 4-byte alignment and
.short (L_sourceAligned2-0b)/2      // buffers that are uncacheable, so we need
.short (L_sourceAligned3-0b)/2      // to have a software workaround.

/*****************************************************************************
 *  source is also at least word aligned                                     *
 *****************************************************************************/
    
L_sourceAligned0:
    subs      r2,           $0x38   // If (length - 64) < 0
    blt       L_vectorCleanup       //   jump to cleanup code
    tst       ip,           $0x38   // If destination is 64 byte aligned
    beq       L_cachelineAligned    //   jump to main loop
    
0:  vld1.32  {d0},    [r1]!         // Copy one double word at a time until
    sub       r2,           $8      // the destination is 64-byte aligned.
    vst1.64  {d0},    [ip, :64]!    //
    tst       ip,           $0x38   //
    bne       0b                    //
    
    cmp       r2,           $0      // If (length - 64) < 0, goto cleanup
    blt       L_vectorCleanup       //
    
L_cachelineAligned:
    sub       r3,      r2,  $0x3c0  // If 1024 < length < 32768, use core
    cmp       r3,          $0x7c00  // register copies instead of NEON to
    blo       L_useSTMIA            // control energy usage.
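    // (Added note: same length-in-[1024, 32768) range-check idiom as in the
    //  reverse-copy path above.)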
    tst       r1,           $0x1f   // If source has 32-byte alignment, use
    beq       L_sourceAligned32     // an optimized loop.
    
    vld1.32  {q2,q3}, [r1]!         // This is the most common path for small
    vld1.32  {q0,q1}, [r1]!         // copies, which are alarmingly frequent.
    subs      r2,           #0x40   // It requires 4-byte alignment on the
    vst1.64  {q2,q3}, [ip, :256]!   // source.  For ordinary malloc'd buffers,
    blt       1f                    // this path could handle only single-byte
.align 3                            // alignment at speed by using vld1.8
0:  vld1.32  {q2,q3}, [r1]!         // instead of vld1.32; however, the NEON
    vst1.64  {q0,q1}, [ip, :256]!   // alignment handler misbehaves for some
    vld1.32  {q0,q1}, [r1]!         // special copies if the element size is
    subs      r2,           #0x40   // 8 or 16, so we need to work around
    vst1.64  {q2,q3}, [ip, :256]!   // sub 4-byte alignment in software, in
    bge       0b                    // another code path.
    b         1f
    
L_sourceAligned32:
    vld1.64  {q2,q3}, [r1, :256]!   // When the source shares 32-byte alignment
    vld1.64  {q0,q1}, [r1, :256]!   // with the destination, we use this loop
    subs      r2,           #0x40   // instead, which specifies the maximum
    vst1.64  {q2,q3}, [ip, :256]!   // :256 alignment on all loads and stores.
    blt       1f                    // 
.align 3                            // This gets an additional .5 bytes per
0:  vld1.64  {q2,q3}, [r1, :256]!   // cycle for in-cache copies, which is not
    vst1.64  {q0,q1}, [ip, :256]!   // insignificant for this (rather common)
    vld1.64  {q0,q1}, [r1, :256]!   // case.
    subs      r2,           #0x40   // 
    vst1.64  {q2,q3}, [ip, :256]!   // This is identical to the above loop,
    bge       0b                    // except for the additional alignment.
1:  vst1.64  {q0,q1}, [ip, :256]!   // 

L_vectorCleanup:
    adds      r2,           $0x38   // If (length - 8) < 0, goto scalar cleanup
    blt       L_scalarCopy          //
    
0:  vld1.32  {d0},    [r1]!         // Copy one doubleword at a time until
    subs      r2,           $8      // (length - 8) < 0.
    vst1.64  {d0},    [ip, :64]!    //
    bge       0b                    //

/*****************************************************************************
 *  sub-doubleword cleanup copies                                            *
 *****************************************************************************/

L_scalarCopy:
    adds      r2,           #0x8    // restore length
    it        eq                    // if this is zero
    bxeq      lr                    // early out
         
0:  ldrb      r3,     [r1], #1      // load a byte from source
    strb      r3,     [ip], #1      // store to destination
    subs      r2,           #1      // subtract one from length
    bne       0b                    // if non-zero, repeat
    bx        lr                    // return
    
/*****************************************************************************
 *  STMIA loop for 1k-32k buffers                                            *
 *****************************************************************************/
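// (Added note: forward analogue of the STMDB loop above -- 64 bytes per
//  iteration through r3-r8, r10, r11, with pld prefetching ahead of the
//  ascending source pointer.)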

L_useSTMIA:
    push     {r4-r8,r10,r11}
.align 3
0:  ldmia     r1!,  {r3-r8,r10,r11}
    subs      r2,      r2,  #64
    stmia     ip!,  {r3-r8,r10,r11}
    ldmia     r1!,  {r3-r8,r10,r11}
    pld      [r1, #64]
    stmia     ip!,  {r3-r8,r10,r11}
    bge       0b
    pop      {r4-r8,r10,r11}
    b         L_vectorCleanup
    
/*****************************************************************************
 *  Misaligned vld1 loop                                                     *
 *****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.  Load two 4-byte aligned double words from source, 
// use vext.8 to extract a double word to store, and perform an 8-byte aligned
// store to destination.
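// (Added note: forward-copy analogue of RCOPY_UNALIGNED above -- the 16-byte
//  window slides toward higher addresses instead, and the tail finishes in
//  L_scalarCopy.)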

#define COPY_UNALIGNED(offset)       \
    subs      r2,          $8       ;\
    blt       2f                    ;\
    vld1.32  {d2,d3}, [r1]!         ;\
    subs      r2,          $8       ;\
    blt       1f                    ;\
0:  vext.8    d0,  d2, d3, $(offset);\
    vmov      d2,      d3           ;\
    vld1.32  {d3},    [r1]!         ;\
    subs      r2,          $8       ;\
    vst1.64  {d0},    [ip, :64]!    ;\
    bge       0b                    ;\
1:  vext.8    d0,  d2, d3, $(offset);\
    sub       r1,          $8       ;\
    vst1.64  {d0},    [ip, :64]!    ;\
2:  add       r1,          $(offset);\
    add       r2,          $8       ;\
    b         L_scalarCopy

L_sourceAligned1:
    COPY_UNALIGNED(1)
L_sourceAligned2:
    COPY_UNALIGNED(2)
L_sourceAligned3:
    COPY_UNALIGNED(3)