/*
 * Copyright (c) 2010 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 *
 *  This file implements the following functions for the Cortex-A9 processor:
 *
 *  void bcopy(const void * source,
 *             void * destination,
 *             size_t length);
 *
 *  void *memmove(void * destination,
 *                const void * source,
 *                size_t n);
 *
 *  void *memcpy(void * restrict destination,
 *               const void * restrict source,
 *               size_t n);
 *
 * All copy n successive bytes from source to destination.  Memmove and memcpy
 * return destination, whereas bcopy has no return value.  Copying takes place
 * as if it were through a temporary buffer -- after return destination
 * contains exactly the bytes from source, even if the buffers overlap (this is
 * not required of memcpy by the C standard; its behavior is undefined if the
 * buffers overlap, but we are holding ourselves to the historical behavior of
 * this function on OS X and iOS).
 */

#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

/*****************************************************************************
 * Macros                                                                    *
 *****************************************************************************/

//  Emits the four-byte-aligned, global entry point _<name>$VARIANT$CortexA9
//  for one of the exported functions; the $VARIANT$CortexA9 suffix is used by
//  Libc's per-CPU function-variant dispatch.
#define A9_ENTRY(name) \
	.align 2;\
	.globl _ ## name ## $VARIANT$CortexA9;\
	_ ## name ## $VARIANT$CortexA9:

//  Saves r0 (the destination pointer, which memcpy/memmove must return), the
//  scratch register r4, r7, and lr, then points r7 at the saved {r7,lr} pair
//  per the ARM frame-chain convention.
#define ESTABLISH_FRAME \
	push   {r0,r4,r7,lr};\
	add     r7,     sp, #8

//  Counterpart to ESTABLISH_FRAME: popping the saved r0 makes the original
//  destination pointer the return value, and popping the saved lr into pc
//  returns to the caller.
#define CLEAR_FRAME_AND_RETURN \
	pop    {r0,r4,r7,pc}

//  Extra callee-save registers pushed only around the unrolled ldm/stm main
//  loops, which need eight data registers (see COPY_REGISTERS).
#define ADDITIONAL_CALLEE_SAVE_REGISTERS {r5,r6,r8,r10}

//  Eight registers -- 32 bytes -- moved by each ldm/stm in the main loops.
#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r12}

/*****************************************************************************
 *  entry points                                                             *
 *****************************************************************************/

.text
.syntax unified
.code 32

A9_ENTRY(bcopy)
//  Translate bcopy calls into memcpy calls by swapping the first and second
//  arguments: bcopy is (source, destination, length) whereas memcpy is
//  (destination, source, length).  Control then falls through into memcpy.
	mov     r3,     r0
	mov     r0,     r1
	mov     r1,     r3

A9_ENTRY(memcpy)
A9_ENTRY(memmove)
//  Our preference is to copy the data in ascending address order, but if the
//  buffers overlap such that the beginning of the destination buffer aliases
//  the end of the source buffer, we need to copy in descending address order
//  instead to preserve the memmove semantics.  We detect this case with the
//  test:
//
//      destination - source < length    (unsigned compare)
//
//  If the address of the source buffer is higher than the address of the
//  destination buffer, this arithmetic can overflow, but the overflowed value
//  can only be smaller than length if the buffers do not overlap, so we don't
//  need to worry about false positives due to the overflow (they happen, but
//  only in cases where copying in either order is correct).
	subs    r3,     r0, r1
//  If source == destination the copy is a no-op; return immediately.  r0
//  already holds the destination, which is also the correct return value.
	bxeq    lr
	ESTABLISH_FRAME
	cmp     r3,     r2
	blo     L_descendingCopy

/*****************************************************************************
 *  ascending copy                                                           *
 *****************************************************************************/

//  The layout of the two buffers is such that we can use our preferred
//  (ascending address order) copy implementation.  Throughout this copy,
//  registers are used as follows:
//
//      r0  lowest unwritten address in the destination buffer.
//      r1  lowest unread address in the source buffer.
//      r2  number of bytes remaining to copy less an offset that varies
//          with the size of the copies that are being made.
//      r3, r4, r5, r6, r8, r9, r10, r12
//          temporary registers used to hold the data during copies.
//      r12 also used as a scratch register for alignment / length calculations

L_ascendingCopy:
//  We begin by checking if less than four bytes are to be copied; if so, we
//  branch directly to a small-buffer copy and return.  Otherwise, we copy up
//  to three bytes if needed to make the destination pointer have word (four
//  byte) alignment.
	subs    r2,         #4
	blo     L_ascendingLengthLessThanFour
	ands    ip,     r0, #0x3
	beq     L_ascendingDestinationWordAligned
//  ip = destination & 3, so 4-ip bytes must be copied to reach word
//  alignment: one byte unconditionally, a second iff ip <= 2 (LS after the
//  cmp), and a third iff ip < 2, i.e. ip == 1 (LO).  The "add r2, ip"
//  followed by "subs r2, #4" nets r2 -= (4 - ip), accounting for the bytes
//  consumed while keeping the running offset of -4 in r2.
	ldrb    r3,    [r1],#1
	cmp     ip,         #2
	ldrbls  r4,    [r1],#1
	strb    r3,    [r0],#1
	ldrblo  r3,    [r1],#1
	add     r2,         ip
	strbls  r4,    [r0],#1
	strblo  r3,    [r0],#1
	subs    r2,         #4
	bhs     L_ascendingDestinationWordAligned

L_ascendingLengthLessThanFour:
//  Conditionally copies up to three bytes, assuming no alignment.  This is
//  only used if the original length of the buffer is smaller than four.
//  The shift moves bit 1 of the remaining length into the carry flag and
//  bit 0 into the sign flag, so the CS load/store pair copies two bytes and
//  the MI pair copies the final byte.
	lsls    ip,     r2, #31
	ldrbcs  r3,    [r1],#1
	ldrbcs  ip,    [r1],#1
	ldrbmi  r4,    [r1]
	strbcs  r3,    [r0],#1
	strbcs  ip,    [r0],#1
	strbmi  r4,    [r0]
	CLEAR_FRAME_AND_RETURN
    
L_ascendingDestinationWordAligned:
//  We know that the destination has word alignment.  If the source is not
//  similarly aligned, jump to an unaligned copy loop.
	tst     r1,         #0x3
	bne		L_ascendingUnalignedCopy

/*****************************************************************************
 *  ascending copy, both buffers have word alignment                         *
 *****************************************************************************/

//  If less than sixty-four bytes remain to be copied, jump directly to the
//  word-aligned cleanup path.  Otherwise, we copy up to 28 bytes as needed
//  to make the destination pointer have cacheline alignment.
//  (On entry r2 = length - 4; after this subtraction r2 = length - 64.)
	subs    r2,     r2, #0x3c
	blo     L_ascendingLengthLessThanSixtyFour
0:  tst     r0,         #0x1c
	beq     L_ascendingDestinationCachelineAligned
	ldr     r3,    [r1],#4
	subs    r2,         #4
	str     r3,    [r0],#4
	bhs     0b
	b       L_ascendingLengthLessThanSixtyFour

L_ascendingDestinationCachelineAligned:
//  Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
//  Empirical testing suggests that 0x60 is the optimal lookahead for preload,
//  though anything between 0x40 and 0x100 seems to be "acceptable".
	push    ADDITIONAL_CALLEE_SAVE_REGISTERS
0:	ldm     r1!,    COPY_REGISTERS
	subs    r2,     r2, #0x40
	stm     r0!,    COPY_REGISTERS
	pld    [r1, #0x60]
	ldm     r1!,    COPY_REGISTERS
	pld    [r1, #0x60]
	stm     r0!,    COPY_REGISTERS
	bhs     0b
	pop     ADDITIONAL_CALLEE_SAVE_REGISTERS

L_ascendingLengthLessThanSixtyFour:
//  Cleanup copy of up to 63 bytes.  We can assume that both the source and
//  destination addresses have word alignment here.  r2 holds the remaining
//  length minus 64, so only its low six bits are meaningful below.
    tst     r2,         #0x30
    beq     1f
0:  ldm     r1!,   {r3,r4,r9,ip}
    sub     r2,     r2, #0x10
    stm     r0!,   {r3,r4,r9,ip}
    tst     r2,         #0x30
    bne     0b
1:  tst     r2,         #0xf
    beq     2f
//  First shift: bit 3 of the length goes to carry (copy 8 bytes) and bit 2
//  to the sign flag (copy 4 bytes).  Second shift: bit 1 to carry (copy 2
//  bytes) and bit 0 to the sign flag (copy the final byte).
    lsls    ip,     r2, #29
    ldmcs   r1!,   {r3,ip}
    stmcs   r0!,   {r3,ip}
    ldrmi   r3,    [r1],#4
    strmi   r3,    [r0],#4
	lsls    ip,     r2, #31
	ldrhcs  r3,    [r1],#2
	strhcs  r3,    [r0],#2
	ldrbmi  ip,    [r1]
	strbmi  ip,    [r0]
2:  CLEAR_FRAME_AND_RETURN

/*****************************************************************************
 *  ascending copy, source buffer is not word aligned                        *
 *****************************************************************************/

L_ascendingUnalignedCopy:
//  Destination buffer is word aligned, but source buffer is not.  Copy
//  byte-by-byte until the destination buffer has eightbyte alignment.
//  (On entry r2 = length - 4; the extra subtraction makes the running offset
//  -8, which the byte cleanup below undoes with "adds r2, #8".)
    subs    r2,         #4
    blo     L_ascendingUnalignedByteCleanup
0:  tst     r0,         #0x7
    beq     L_ascendingUnalignedVectorCopy
    ldrb    r3,    [r1],#1
    subs    r2,         #1
    strb    r3,    [r0],#1
    bhs     0b
L_ascendingUnalignedByteCleanup:
//  Copies the final zero to seven bytes one at a time.
    adds    r2,         #8
    beq     1f
0:  ldrb    r3,    [r1],#1
    subs    r2,         #1
    strb    r3,    [r0],#1
    bne     0b
1:  CLEAR_FRAME_AND_RETURN

L_ascendingUnalignedVectorCopy:
//  Destination buffer is eightbyte aligned.  Source buffer has unknown
//  alignment.  Use NEON to handle the misaligned copies.  We begin by copying
//  up to 24 bytes to get cacheline alignment of the destination buffer.
//  The ":64" / ":256" qualifiers on the stores are NEON address-alignment
//  hints, valid because the destination's alignment is known at each point.
    subs    r2,         #0x18
    blo     L_ascendingUnalignedVectorCleanup
0:  tst     r0,         #0x18
    beq     L_ascendingUnalignedCachelineCopy
    vld1.8 {d0},   [r1]!
    subs    r2,         #8
    vst1.8 {d0},   [r0,:64]!
    bhs     0b
L_ascendingUnalignedVectorCleanup:
//  Copies trailing eight-byte chunks with NEON, then finishes in the byte
//  cleanup loop above.
    adds    r2,         #0x18
    blo     L_ascendingUnalignedByteCleanup
0:  vld1.8 {d0},   [r1]!
    subs    r2,         #8
    vst1.8 {d0},   [r0,:64]!
    bhs     0b
    b       L_ascendingUnalignedByteCleanup

L_ascendingUnalignedCachelineCopy:
//  Main copy loop; moves 32 bytes per iteration.  Requires only byte alignment
//  of the source address.
    vld1.8 {q0,q1},[r1]!
    pld    [r1, #0x60]
    vst1.8 {q0,q1},[r0,:256]!
    subs    r2,         #0x20
    bhs     L_ascendingUnalignedCachelineCopy
    b       L_ascendingUnalignedVectorCleanup

/*****************************************************************************
 *  descending copy                                                          *
 *****************************************************************************/

//  The layout of the two buffers is such that we must copy in descending-
//  address order.  Throughout this copy, registers are used as follows:
//
//      r0  lowest address in the destination buffer that has been written to.
//      r1  lowest address in the source buffer that has been read from.
//      r2  number of bytes remaining to copy less an offset that varies
//          with the size of the copies that are being made.
//      r3, r4, r5, r6, r8, r9, r10, r12
//          temporary registers used to hold the data during copies.
//      r12 also used as a scratch register for alignment / length calculations

L_descendingCopy:
//  We begin by checking if less than four bytes are to be copied; if so, we
//  branch directly to a small-buffer copy and return.  Otherwise, we copy up
//  to three bytes if needed to make the destination pointer have word (four
//  byte) alignment.
//  First advance both pointers to one past the end of their buffers; all
//  subsequent accesses use pre-decrement ([rN, #-k]!) addressing.
    add     r1,     r2
    add     r0,     r2
    subs    r2,         #4
	blo     L_descendingLengthLessThanFour
	ands    ip,     r0, #0x3
	beq     L_descendingDestinationWordAligned
//  ip = end-of-destination & 3 is exactly the number of bytes to copy to
//  reach word alignment going downward: one byte unconditionally, a second
//  iff ip >= 2 (HS after the cmp), and a third iff ip > 2 (HI).
	ldrb    r3,    [r1, #-1]!
	cmp     ip,         #2
	ldrbhs  r4,    [r1, #-1]!
	strb    r3,    [r0, #-1]!
	ldrbhi  r3,    [r1, #-1]!
	strbhs  r4,    [r0, #-1]!
	strbhi  r3,    [r0, #-1]!
	subs    r2,         ip
	bhs     L_descendingDestinationWordAligned

L_descendingLengthLessThanFour:
//  Conditionally copies up to three bytes, assuming no alignment.  This is
//  only used if the original length of the buffer is smaller than four.
//  As in the ascending case, the shift puts bit 1 of the length in carry
//  (copy two bytes) and bit 0 in the sign flag (copy the final byte).
	lsls    ip,     r2, #31
	ldrbcs  r3,    [r1, #-1]!
	ldrbcs  ip,    [r1, #-1]!
	ldrbmi  r4,    [r1, #-1]
	strbcs  r3,    [r0, #-1]!
	strbcs  ip,    [r0, #-1]!
	strbmi  r4,    [r0, #-1]
	CLEAR_FRAME_AND_RETURN
    
L_descendingDestinationWordAligned:
//  We know that the destination has word alignment.  If the source is not
//  similarly aligned, jump to an unaligned copy loop.
	tst     r1,         #0x3
	bne		L_descendingUnalignedCopy

/*****************************************************************************
 *  descending copy, both buffers have word alignment                        *
 *****************************************************************************/

//  If less than sixty-four bytes remain to be copied, jump directly to the
//  word-aligned cleanup path.  Otherwise, we copy up to 28 bytes as needed
//  to make the destination pointer have cacheline alignment.
//  (On entry r2 = length - 4; after this subtraction r2 = length - 64.)
	subs    r2,     r2, #0x3c
	blo     L_descendingLengthLessThanSixtyFour
0:  tst     r0,         #0x1c
	beq     L_descendingDestinationCachelineAligned
	ldr     r3,    [r1, #-4]!
	subs    r2,         #4
	str     r3,    [r0, #-4]!
	bhs     0b
	b       L_descendingLengthLessThanSixtyFour

L_descendingDestinationCachelineAligned:
//  Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
//  Empirical testing suggests that -0x80 is the optimal lookahead for preload,
//  though anything between -0x40 and -0x100 seems to be "acceptable".
	push    ADDITIONAL_CALLEE_SAVE_REGISTERS
0:	ldmdb   r1!,    COPY_REGISTERS
	subs    r2,     r2, #0x40
	stmdb   r0!,    COPY_REGISTERS
	pld    [r1, #-0x80]
	ldmdb   r1!,    COPY_REGISTERS
	pld    [r1, #-0x80]
	stmdb   r0!,    COPY_REGISTERS
	bhs     0b
	pop     ADDITIONAL_CALLEE_SAVE_REGISTERS

L_descendingLengthLessThanSixtyFour:
//  Cleanup copy of up to 63 bytes.  We can assume that both the source and
//  destination addresses have word alignment here.  r2 holds the remaining
//  length minus 64, so only its low six bits are meaningful below.
    tst     r2,         #0x30
    beq     1f
0:  ldmdb   r1!,   {r3,r4,r9,ip}
    sub     r2,     r2, #0x10
    stmdb   r0!,   {r3,r4,r9,ip}
    tst     r2,         #0x30
    bne     0b
1:  tst     r2,         #0xf
    beq     2f
//  First shift: bit 3 of the length goes to carry (copy 8 bytes) and bit 2
//  to the sign flag (copy 4 bytes).  Second shift: bit 1 to carry (copy 2
//  bytes) and bit 0 to the sign flag (copy the final byte).
    lsls    ip,     r2, #29
    ldmdbcs r1!,   {r3,ip}
    stmdbcs r0!,   {r3,ip}
    ldrmi   r3,    [r1, #-4]!
    strmi   r3,    [r0, #-4]!
	lsls    ip,     r2, #31
	ldrhcs  r3,    [r1, #-2]!
	strhcs  r3,    [r0, #-2]!
	ldrbmi  ip,    [r1, #-1]
	strbmi  ip,    [r0, #-1]
2:  CLEAR_FRAME_AND_RETURN

/*****************************************************************************
 *  descending copy, source buffer is not word aligned                       *
 *****************************************************************************/

L_descendingUnalignedCopy:
//  Destination buffer is word aligned, but source buffer is not.  Copy
//  byte-by-byte until the destination buffer has eightbyte alignment.
//  (On entry r2 = length - 4; the extra subtraction makes the running offset
//  -8, which the byte cleanup below undoes with "adds r2, #8".)
    subs    r2,         #4
    blo     L_descendingUnalignedByteCleanup
0:  tst     r0,         #0x7
    beq     L_descendingUnalignedVectorCopy
    ldrb    r3,    [r1, #-1]!
    subs    r2,         #1
    strb    r3,    [r0, #-1]!
    bhs     0b
L_descendingUnalignedByteCleanup:
//  Copies the final zero to seven bytes one at a time, working downward.
    adds    r2,         #8
    beq     1f
0:  ldrb    r3,    [r1, #-1]!
    subs    r2,         #1
    strb    r3,    [r0, #-1]!
    bne     0b
1:  CLEAR_FRAME_AND_RETURN

L_descendingUnalignedVectorCopy:
//  Destination buffer is eightbyte aligned.  Source buffer has unknown
//  alignment.  Use NEON to handle the misaligned copies.  We begin by copying
//  up to 24 bytes to get cacheline alignment of the destination buffer.
//  NEON loads/stores have no pre-decrement form, so each pointer is stepped
//  back explicitly before the access; ":64" / ":256" are alignment hints.
    subs    r2,         #0x18
    blo     L_descendingUnalignedVectorCleanup
0:  tst     r0,         #0x18
    beq     L_descendingUnalignedCachelineCopy
    sub     r1,         #8
    vld1.8 {d0},   [r1]
    sub     r0,         #8
    vst1.8 {d0},   [r0,:64]
    subs    r2,         #8
    bhs     0b
L_descendingUnalignedVectorCleanup:
//  Copies trailing eight-byte chunks with NEON, then finishes in the byte
//  cleanup loop above.
    adds    r2,         #0x18
    blo     L_descendingUnalignedByteCleanup
0:  sub     r1,         #8
    vld1.8 {d0},   [r1]
    sub     r0,         #8
    vst1.8 {d0},   [r0,:64]
    subs    r2,         #8
    bhs     0b
    b       L_descendingUnalignedByteCleanup

L_descendingUnalignedCachelineCopy:
//  Main copy loop; moves 32 bytes per iteration.  Requires only byte alignment
//  of the source address.  r4 (saved/restored by the frame macros) holds the
//  -32 byte post-index stride applied after each access.
    sub     r1,         #32
    sub     r0,         #32
    mov     r4,         #-32
0:  vld1.8 {q0,q1},[r1], r4
    pld    [r1, #-0x60]
    vst1.8 {q0,q1},[r0,:256], r4
    subs    r2,         #0x20
    bhs     0b
    add     r1,         #32
    add     r0,         #32
    b       L_descendingUnalignedVectorCleanup

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD