/*
 * Copyright (c) 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */

#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

/*****************************************************************************
 * Cortex-A8 implementation                                                  *
 *****************************************************************************/
 
// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
//
// Our tests have shown that NEON is always a performance win for memcpy( ).
// However, for the specific case of copies from a warm source to a cold
// destination when the buffer size is between 1k and 32k, it is not enough
// of a performance win to offset the increased power footprint, resulting
// in an energy usage regression.  Thus, we detect that particular case, and
// pass those copies through the ARM core registers.  All other copies larger
// than 8 bytes are handled on NEON.
//
// Stephen Canon, August 2009

.text
.code 16
.syntax unified

// void bcopy(const void * source,
//            void * destination,
//            size_t length);
//
// void *memmove(void * destination,
//               const void * source,
//               size_t n);
//
// void *memcpy(void * restrict destination,
//              const void * restrict source,
//              size_t n);
//
// All three copy n successive bytes from source to destination.  memmove and
// memcpy return destination, whereas bcopy has no return value.  Copying takes
// place as if through a temporary buffer -- after return, destination contains
// exactly the bytes from source, even if the buffers overlap.
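//
// As a hedged illustration (not part of this file), the required "as if
// through a temporary buffer" semantics can be modeled in C; the name
// memmove_model is hypothetical:
//
//     #include <stdlib.h>
//     #include <string.h>
//
//     void *memmove_model(void *dst, const void *src, size_t n) {
//         unsigned char *tmp = malloc(n); // scratch; error handling omitted
//         memcpy(tmp, src, n);            // read the whole source first...
//         memcpy(dst, tmp, n);            // ...then write the destination
//         free(tmp);
//         return dst;
//     }
//
// The routines below achieve the same observable result without a temporary
// buffer by choosing the copy direction based on how the buffers overlap.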

.thumb_func _bcopy$VARIANT$CortexA8
.thumb_func _memmove$VARIANT$CortexA8
.thumb_func _memcpy$VARIANT$CortexA8
.globl _bcopy$VARIANT$CortexA8
.globl _memmove$VARIANT$CortexA8
.globl _memcpy$VARIANT$CortexA8

#define SAVE_REGISTERS {r4,r5,r6,r8,r10,r11}
#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r11}
               
/*****************************************************************************
 *  entry points                                                             *
 *****************************************************************************/

.align 2
_bcopy$VARIANT$CortexA8:

// bcopy takes its first and second arguments in the opposite order from the C
// library functions memmove and memcpy.  If bcopy is called, we swap these
// two arguments and then fall into memmove.

    mov         r3,     r0
    mov         r0,     r1
    mov         r1,     r3

.align 2
_memmove$VARIANT$CortexA8:
_memcpy$VARIANT$CortexA8:

// At entry to memmove/memcpy, registers contain the following values:
//
//  r0  pointer to the first byte of the destination buffer
//  r1  pointer to the first byte of the source buffer
//  r2  number of bytes to copy
//
// Our preference is to use a (faster and easier to understand) front-to-back
// copy of the buffer.  However, memmove requires that copies take place as
// though through a temporary buffer.  This means that if the buffers overlap,
// it may be necessary to copy the buffer in reverse order.
//
// To properly detect such overlap, we begin by computing the offset between
// the source and destination pointers.  If the offset happens to be zero,
// then there is no work to be done, so we can early out.

    subs    r3,     r0, r1
    it      eq
    bxeq    lr

// r3 now contains the offset between the buffers, (destination - source).  If
// 0 < offset < length, then the high-addressed bytes of the source alias the
// low-addressed bytes of the destination.  Thus, if we were to perform the
// copy in ascending address order, we would overwrite the high-addressed
// source bytes before we had a chance to copy them, and the data would be lost.
//
// Thus, we can use the front-to-back copy only if offset is negative or
// greater than the length.  This is the case precisely if offset compares
// unsigned higher than length.
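//
// In C terms (an illustrative sketch, with dst, src, and n standing in for
// r0, r1, and r2, and hypothetical function names for the two paths):
//
//     #include <stdint.h>
//
//     if ((uintptr_t)dst - (uintptr_t)src >= n)
//         copy_front_to_back();  // no harmful overlap
//     else
//         copy_back_to_front();  // destination aliases the end of source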

    cmp     r3,     r2
    bhs     L_copyFrontToBack
                             
/*****************************************************************************
 *  back to front copy                                                       *
 *****************************************************************************/

// Here we have fallen through into the back-to-front copy.  We preserve the
// original destination pointer in r0 because it is the return value for the
// routine, and update the other registers as follows:
//
//  r1  one byte beyond the end of the source buffer
//  r2  number of bytes to copy
//  ip  one byte beyond the end of the destination buffer

    mov      ip,    r0
    add      r1,    r2
    add      ip,    r2
    
// Subtract 8 from the buffer length; if this is negative, then we will use
// only single-byte copies, and we jump directly to a scalar copy loop.

    subs     r2,        $8
    blt      L_scalarReverseCopy
    
// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
// to move the data.
    
    tst      ip,        $7
    beq      L_vectorReverseCopy
    
// Otherwise, we copy a single byte at a time, in order of descending memory
// address, until the destination is 8 byte aligned.  Within this loop,
// registers are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to one byte past the next element to be copied
//  r2  (bytes remaining to be copied) - 8
//  r3  temporary to hold the byte that is being copied
//  ip  pointer one byte past the destination of the next byte to be copied
//
//  byte that will be copied in this iteration
//                            |  byte that was copied in the previous iteration                           
//  Source buffer:            v   v                                
//  ------------------------+---+---+-------------------------
//  bytes still to copy ... |   |   | ... bytes already copied
//  ------------------------+---+---+-------------------------
//                                ^
//                               r1 holds the address of this byte

0:  ldrb     r3,   [r1, $-1]!
    sub      r2,        $1
    strb     r3,   [ip, $-1]!
    tst      ip,        $7
    bne      0b
    
// At this point, the destination pointer is 8 byte aligned.  Check again that
// there are at least 8 bytes remaining to copy by comparing the remaining
// length minus 8 to zero.  If fewer than 8 bytes remain, jump to the cleanup
// path.
    
    cmp      r2,    $0
    blt      L_scalarReverseCopy
                                    
/*****************************************************************************
 *  destination is 8 byte aligned                                            *
 *****************************************************************************/

L_vectorReverseCopy:

// At this point, registers contain the following values:
//
//  r0  original destination pointer
//  r1  pointer to one byte past the next element to be copied
//  r2  (bytes remaining to copy) - 8
//  ip  pointer one byte past the destination of the next byte to be copied
//
// Furthermore, it is known that ip is 8 byte aligned, and that r2 is non-negative.
// NEON has really excellent alignment handling in hardware, so we would like
// to use that to handle cases where the source is not similarly aligned to the
// destination (it supports even single-byte misalignment at speed).  However,
// on some SoC designs, not all of the DMA busses support such access.  Thus,
// we must unfortunately use a software workaround in those cases.
//
// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
// we only need to handle the different possible source alignments modulo 4.
// Here we have a dispatch table to jump to the correct copy implementation
// for the given source alignment.
//
// The tbh instruction loads the address offset of the correct implementation
// from the data table that immediately follows it and adds it to the pc to 
// jump to the correct branch.
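//
// In C terms (sketch only; the case bodies stand for the L_reverseAligned0-3
// labels that follow):
//
//     switch ((uintptr_t)src & 3) {
//     case 0: /* source word aligned: L_reverseAligned0 */ break;
//     case 1: /* L_reverseAligned1 */ break;
//     case 2: /* L_reverseAligned2 */ break;
//     case 3: /* L_reverseAligned3 */ break;
//     }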

    ands     r3,    r1, $3
    tbh     [pc, r3, lsl $1]
0:  
.short (L_reverseAligned0-0b)/2
.short (L_reverseAligned1-0b)/2
.short (L_reverseAligned2-0b)/2
.short (L_reverseAligned3-0b)/2

/*****************************************************************************
 *  source is also at least word aligned                                     *
 *****************************************************************************/
    
L_reverseAligned0:

// Subtract 56 from r2, so that it contains the number of bytes remaining to
// copy minus 64.  If this result is negative, then we jump into a loop that
// copies 8 bytes at a time.

    subs     r2,        $0x38
    blt      L_reverseVectorCleanup
    
// Check if the destination pointer is 64-byte aligned.  If so, jump to a loop
// that copies whole cachelines.

    tst      ip,        $0x38
    beq      L_reverseCachelineAligned
    
// Otherwise, we copy 8 bytes at a time, in order of descending memory
// address, until the destination is 64 byte aligned.  Within this loop,
// registers are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to one byte past the next element to be copied
//  r2  (bytes remaining to be copied) - 64
//  ip  pointer one byte past the destination of the next byte to be copied
//  d0  temporary storage for copy
//
//  bytes that will be copied after this iteration
//        |         8 byte block that will be copied in this iteration                            
//        v         v
//  --------------+-------------------------------+---------------------
//                | 0   1   2   3   4   5   6   7 | bytes already copied
//  --------------+-------------------------------+---------------------
//                                                  ^
//                                                  r1 points here
    
0:  sub      r1,        $8
    vld1.32 {d0},  [r1]
    sub      ip,        $8
    sub      r2,        $8
    tst      ip,        $0x38
    vst1.64 {d0},  [ip,:64]
    bne      0b
    
// At this point, the destination pointer is 64 byte aligned.  Check again that
// there are at least 64 bytes remaining to copy by comparing the remaining
// length minus 64 to zero.  If fewer than 64 bytes remain, skip over the main
// copy loop.

    cmp      r2,        $0
    blt      L_reverseVectorCleanup
    
/*****************************************************************************
 *  destination is cacheline aligned                                         *
 *****************************************************************************/

L_reverseCachelineAligned:

// In the special case that we are copying a buffer of between 1k and 32k bytes
// we do not use a NEON copy for the main loop.  This is because if we happen
// to be doing a copy from a source in cache to a destination that is not in
// cache, this will result in an increase in energy usage.  In all other cases,
// NEON gives superior energy conservation.
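//
// Because r2 holds (bytes remaining) - 64 here, subtracting 0x3c0 yields
// (bytes remaining) - 0x400, and one unsigned compare implements the whole
// range test.  A C sketch (n is the number of bytes remaining; the function
// names are illustrative):
//
//     if ((size_t)(n - 0x400) < 0x7c00)  // i.e. 1024 <= n < 32768
//         use_stm_loop();                // copy through core registers
//     else
//         use_neon_loop();               // copy through NEON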

    sub      r3,    r2, $0x3c0
    cmp      r3,        $0x7c00
    blo      L_useSTMDB
    
// Pre-decrement the source (r1) and destination (ip) pointers so that they
// point to the first byte of the trailing 32-byte window of each buffer.
// Additionally, load the address increment of -32 into r3.

    sub      r1,        $32
    sub      ip,        $32
    mov      r3,        $-32
    
// The destination pointer is known to be 64-byte aligned, so we can use the
// maximal alignment hint (:256) for our vector stores.  Detect if the source
// is also at least 32-byte aligned and jump to a loop that uses maximal
// alignment hints for the loads as well if possible.
    
    tst      r1,        $0x1f
    beq      L_reverseSourceAligned
    
// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
// 64-byte aligned destination, in order of descending memory address.  Within
// this loop, registers are used as follows:
//
//  r0      original destination pointer (unmodified)
//  r1      pointer to the next 32-byte block to load
//  r2      (number of bytes remaining to copy) - 64
//  r3      address increment of -32.
//  ip      pointer to which the next 32-byte block is to be stored
//  q0-q3   temporary registers used for copies
//
// Note that the loop is arranged in such a way that a single cleanup store is
// necessary after the final loop iteration.  This occurs at label (1), and is
// shared between the unaligned and aligned loops.
    
    vld1.32 {q2,q3}, [r1],      r3
    vld1.32 {q0,q1}, [r1],      r3
    subs     r2,         $64
    vst1.64 {q2,q3}, [ip,:256], r3
    blt      1f
.align 3
0:  vld1.32 {q2,q3}, [r1],      r3
    vst1.64 {q0,q1}, [ip,:256], r3
    vld1.32 {q0,q1}, [r1],      r3
    subs     r2,         $64
    vst1.64 {q2,q3}, [ip,:256], r3 
    bge      0b
    b        1f
    
L_reverseSourceAligned:

// This loop is identical to the immediately preceding loop, except that it
// uses the additional alignment hint that the source pointer (r1) is 32-byte
// aligned.  The two loops share cleanup code for the final iteration.

    vld1.64 {q2,q3}, [r1,:256], r3
    vld1.64 {q0,q1}, [r1,:256], r3
    subs     r2,         $64
    vst1.64 {q2,q3}, [ip,:256], r3
    blt      1f
.align 3
0:  vld1.64 {q2,q3}, [r1,:256], r3
    vst1.64 {q0,q1}, [ip,:256], r3
    vld1.64 {q0,q1}, [r1,:256], r3
    subs     r2,         $64
    vst1.64 {q2,q3}, [ip,:256], r3
    bge      0b
    
// Final vector store for both of the above loops.

1:  vst1.64 {q0,q1}, [ip,:256], r3

// Adjust the source and destination pointers so that they once again point to
// the last byte that we used (which is one byte higher than the address that
// we will use next for any required cleanup).

    add      r1,         $32
    add      ip,         $32
    
L_reverseVectorCleanup:

// Add 56 to r2, so that it contains the number of bytes remaining to copy minus
// 8.  A comparison of this value with zero tells us if any more whole 8-byte
// blocks need to be copied.

    adds     r2,    r2, $0x38
    blt      L_scalarReverseCopy

// This loop copies 8 bytes at a time in order of descending memory address,
// until fewer than 8 bytes remain to be copied.  Within this loop, registers
// are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to one byte past the next element to be copied
//  r2  (bytes remaining to be copied) - 8
//  ip  pointer one byte past the destination of the next byte to be copied
//  d0  temporary storage for copy

0:  sub      r1,        $8
    vld1.32 {d0},  [r1]
    sub      ip,        $8
    subs     r2,        $8
    vst1.64 {d0},  [ip,:64]
    bge      0b

/*****************************************************************************
 *  sub-doubleword cleanup copies                                            *
 *****************************************************************************/

L_scalarReverseCopy:

// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
// return to the calling routine if zero bytes remain.

    adds     r2,        $8
    it       eq
    bxeq     lr

// Copy one byte at a time in descending address order until we reach the front
// of the buffer.  Within this loop, registers are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to one byte past the next element to be copied
//  r2  number of bytes remaining to be copied
//  r3  temporary to hold the byte that is being copied
//  ip  pointer one byte past the destination of the next byte to be copied
         
0:  ldrb     r3,   [r1, $-1]!
    subs     r2,        $1
    strb     r3,   [ip, $-1]!
    bne      0b
    bx       lr
         
/*****************************************************************************
 *  STMDB loop for 1k-32k buffers                                            *
 *****************************************************************************/

// This loop copies 64 bytes each iteration in order of descending memory
// address, using the GPRs instead of NEON.
//
//  r0  original destination pointer
//  r1  pointer to one byte past the next element to be copied
//  r2  (bytes remaining to be copied) - 64
//  r3-6,r8-11 (COPY_REGISTERS) temporary registers used for moving data
//  ip  pointer to one byte past the next location to store to
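//
// Conceptually (a C sketch, not part of this file; src and dst stand in for
// r1 and ip), each ldmdb/stmdb pair moves 32 bytes through the eight
// COPY_REGISTERS:
//
//     uint32_t t[8];
//     src -= 32; memcpy(t, src, 32);     // ldmdb r1!, COPY_REGISTERS
//     dst -= 32; memcpy(dst, t, 32);     // stmdb ip!, COPY_REGISTERS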

L_useSTMDB:
    push    SAVE_REGISTERS
.align 3
0:  ldmdb   r1!,    COPY_REGISTERS
    subs    r2,     r2,  $64
    stmdb   ip!,    COPY_REGISTERS
    ldmdb   r1!,    COPY_REGISTERS
    pld     [r1, $-64]
    stmdb   ip!,    COPY_REGISTERS
    bge     0b
    pop     SAVE_REGISTERS
    b       L_reverseVectorCleanup
    
/*****************************************************************************
 *  Misaligned reverse vld1 loop                                             *
 *****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.  
//
// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
// which we combine with the 8 bytes loaded in the previous iteration to get a
// 16 byte field; the next 8 bytes to be stored to the destination buffer are
// somewhere in that field, and we get them using the VEXT instruction:
//      
//     |  8 bytes from this iteration  |  8 bytes from last iteration  |
//     +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
//     +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//         ^8 bytes to store this iteration^           |
//                                                   could be a page boundary 
//
// We need to be a little bit careful, however.  Because the loads only have 4
// byte alignment, the very first load could slop over into a page that is not
// mapped readable.  In order to prevent this scenario, we copy eight bytes
// one byte at a time before beginning the main loop.
//
// At the beginning of each iteration through this loop, registers are used
// as follows:
//
//  r0  original destination pointer
//  r1  pointer to the next block of 8 bytes to load
//  r2  (bytes remaining to copy) - 8
//  ip  pointer to the next block of 8 bytes to store
//  d0  next 8 bytes to store
//  d2  8 bytes loaded in the previous iteration
//  d3  8 bytes loaded two iterations ago
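//
// In C terms (an illustrative sketch; cur, prev, out, aligned_src, and
// offset name the roles of d2, d3, d0, r1, and the misalignment, and are
// not identifiers in this file), one iteration's VEXT amounts to:
//
//     uint8_t prev[8], cur[8], out[8];
//     memcpy(cur, aligned_src, 8);              // vld1.32 {d2}, [r1], r3
//     memcpy(out, cur + offset, 8 - offset);    // low bytes: this load
//     memcpy(out + (8 - offset), prev, offset); // high bytes: last load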

#define RCOPY_UNALIGNED(offset)      \
0:  ldrb      r3,     [r1,$-1]!     ;\
    strb      r3,     [ip,$-1]!     ;\
    subs      r2,         $1        ;\
    blt       L_scalarReverseCopy   ;\
    tst       ip,         $7        ;\
    bne       0b                    ;\
    bic       r1,         $3        ;\
    sub       r1,         $8        ;\
    sub       ip,         $8        ;\
    mov       r3,         $-8       ;\
    vld1.32  {d2,d3}, [r1], r3      ;\
    subs      r2,         $8        ;\
    blt       1f                    ;\
0:  vext.8    d0,  d2, d3, $(offset);\
    vmov      d3,      d2           ;\
    vld1.32  {d2},    [r1], r3      ;\
    subs      r2,          $8       ;\
    vst1.64  {d0},    [ip, :64], r3 ;\
    bge       0b                    ;\
1:  vext.8    d0,  d2, d3, $(offset);\
    add       r1,          $8       ;\
    vst1.64  {d0},    [ip, :64]     ;\
2:  add       r1,          $(offset);\
    b         L_scalarReverseCopy

L_reverseAligned1:
    RCOPY_UNALIGNED(1)
L_reverseAligned2:
    RCOPY_UNALIGNED(2)
L_reverseAligned3:
    RCOPY_UNALIGNED(3)

/*****************************************************************************
 *  front to back copy                                                       *
 *****************************************************************************/

L_copyFrontToBack:

// Here the pointers are laid out such that we can use our preferred
// front-to-back copy.  We preserve original destination pointer in r0 because
// it is the return value for the routine, and copy it to ip to use in this
// routine.

    mov      ip,    r0
    
// Subtract 8 from the buffer length; if this is negative, then we will use
// only single-byte copies, and we jump directly to a scalar copy loop.
    
    subs     r2,        $8
    blt      L_scalarCopy
    
// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
// to move the data.

    tst      ip,        $7
    beq      L_vectorCopy
    
// Otherwise, we copy a single byte at a time, in order of ascending memory
// address, until the destination is 8 byte aligned.  Within this loop,
// registers are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to the next byte to copy
//  r2  (bytes remaining to be copied) - 8
//  r3  temporary to hold the byte that is being copied
//  ip  pointer to the next byte to store to

0:  ldrb     r3,  [r1], $1
    sub      r2,        $1
    strb     r3,  [ip], $1
    tst      ip,        $7
    bne      0b
    
// At this point, the destination pointer is 8 byte aligned.  Check again that
// there are at least 8 bytes remaining to copy by comparing the remaining
// length minus 8 to zero.  If fewer than 8 bytes remain, jump to the cleanup
// path.

    cmp      r2,        $0
    blt      L_scalarCopy
    
/*****************************************************************************
 *  destination is doubleword aligned                                        *
 *****************************************************************************/

L_vectorCopy:

// At this point, registers contain the following values:
//
//  r0  original destination pointer
//  r1  pointer to the next element to be copied
//  r2  (bytes remaining to copy) - 8
//  ip  pointer to the destination of the next byte to be copied
//
// Furthermore, it is known that ip is 8 byte aligned, and that r2 is non-negative.
// NEON has really excellent alignment handling in hardware, so we would like
// to use that to handle cases where the source is not similarly aligned to the
// destination (it supports even single-byte misalignment at speed).  However,
// on some SoC designs, not all of the DMA busses support such access.  Thus,
// we must unfortunately use a software workaround in those cases.
//
// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
// we only need to handle the different possible source alignments modulo 4.
// Here we have a dispatch table to jump to the correct copy implementation
// for the given source alignment.
//
// The tbh instruction loads the address offset of the correct implementation
// from the data table that immediately follows it and adds it to the pc to 
// jump to the correct branch.

    ands     r3,    r1, $3
    bic      r1,        $3
    tbh     [pc, r3, lsl $1]
0:  
.short (L_sourceAligned0-0b)/2
.short (L_sourceAligned1-0b)/2
.short (L_sourceAligned2-0b)/2
.short (L_sourceAligned3-0b)/2

/*****************************************************************************
 *  source is also at least word aligned                                     *
 *****************************************************************************/
    
L_sourceAligned0:

// Subtract 56 from r2, so that it contains the number of bytes remaining to
// copy minus 64.  If this result is negative, then we jump into a loop that
// copies 8 bytes at a time.

    subs     r2,        $0x38
    blt      L_vectorCleanup
    
// Check if the destination pointer is 64-byte aligned.  If so, jump to a loop
// that copies whole cachelines.

    tst      ip,        $0x38
    beq      L_cachelineAligned
        
// Otherwise, we copy 8 bytes at a time, in order of ascending memory
// address, until the destination is 64 byte aligned.  Within this loop,
// registers are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to the next element to be copied
//  r2  (bytes remaining to be copied) - 64
//  ip  pointer to the destination of the next byte to be copied
//  d0  temporary storage for copy

0:  vld1.32 {d0},  [r1]!
    sub      r2,        $8
    vst1.64 {d0},  [ip,:64]!
    tst      ip,        $0x38
    bne      0b
    
// At this point, the destination pointer is 64 byte aligned.  Check again that
// there are at least 64 bytes remaining to copy by comparing the remaining
// length minus 64 to zero.  If fewer than 64 bytes remain, skip over the main
// copy loop.

    cmp      r2,        $0
    blt      L_vectorCleanup
    
/*****************************************************************************
 *  destination is cacheline aligned                                         *
 *****************************************************************************/

// In the special case that we are copying a buffer of between 1k and 32k bytes
// we do not use a NEON copy for the main loop.  This is because if we happen
// to be doing a copy from a source in cache to a destination that is not in
// cache, this will result in an increase in energy usage.  In all other cases,
// NEON gives superior energy conservation.

L_cachelineAligned:
    sub      r3,    r2, $0x3c0
    cmp      r3,        $0x7c00
    blo      L_useSTMIA
    
// The destination pointer is known to be 64-byte aligned, so we can use the
// maximal alignment hint (:256) for our vector stores.  Detect if the source
// is also at least 32-byte aligned and jump to a loop that uses maximal
// alignment hints for the loads as well if possible.

    tst      r1,        $0x1f
    beq      L_sourceAligned32
    
// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
// 64-byte aligned destination, in order of ascending memory address.  Within
// this loop, registers are used as follows:
//
//  r0      original destination pointer (unmodified)
//  r1      pointer to the next 32-byte block to load
//  r2      (number of bytes remaining to copy) - 64
//  ip      pointer to which the next 32-byte block is to be stored
//  q0-q3   temporary registers used for copies
//
// Note that the loop is arranged in such a way that a single cleanup store is
// necessary after the final loop iteration.  This occurs at label (1), and is
// shared between the unaligned and aligned loops.

    vld1.32 {q2,q3}, [r1]!
    vld1.32 {q0,q1}, [r1]!
    subs     r2,         $64
    vst1.64 {q2,q3}, [ip,:256]!
    blt      1f
.align 3
0:  vld1.32 {q2,q3}, [r1]!
    vst1.64 {q0,q1}, [ip,:256]!
    vld1.32 {q0,q1}, [r1]!
    subs     r2,         $64
    vst1.64 {q2,q3}, [ip,:256]!
    bge      0b
    b        1f
    
L_sourceAligned32:

// This loop is identical to the immediately preceding loop, except that it
// uses the additional alignment hint that the source pointer (r1) is 32-byte
// aligned.  The two loops share cleanup code for the final iteration.

    vld1.64 {q2,q3}, [r1,:256]!
    vld1.64 {q0,q1}, [r1,:256]!
    subs     r2,         $64
    vst1.64 {q2,q3}, [ip,:256]!
    blt      1f
.align 3
0:  vld1.64 {q2,q3}, [r1,:256]!
    vst1.64 {q0,q1}, [ip,:256]!
    vld1.64 {q0,q1}, [r1,:256]!
    subs     r2,         $64
    vst1.64 {q2,q3}, [ip,:256]!
    bge      0b
    
// Final vector store for both of the above loops.

1:  vst1.64 {q0,q1}, [ip,:256]!

L_vectorCleanup:

// Add 56 to r2, so that it contains the number of bytes remaining to copy minus
// 8.  A comparison of this value with zero tells us if any more whole 8-byte
// blocks need to be copied.

    adds     r2,        $0x38
    blt      L_scalarCopy

// This loop copies 8 bytes at a time in order of ascending memory address,
// until fewer than 8 bytes remain to be copied.  Within this loop, registers
// are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to the next element to be copied
//  r2  (bytes remaining to be copied) - 8
//  ip  pointer to the destination of the next byte to be copied
//  d0  temporary storage for copy
    
0:  vld1.32 {d0},   [r1]!
    subs     r2,        $8
    vst1.64 {d0},   [ip,:64]!
    bge      0b

/*****************************************************************************
 *  sub-doubleword cleanup copies                                            *
 *****************************************************************************/

L_scalarCopy:

// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
// return to the calling routine if zero bytes remain.

    adds     r2,        $8
    it       eq
    bxeq     lr
         
// Copy one byte at a time in ascending address order until we reach the end
// of the buffer.  Within this loop, registers are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to the next byte to be copied
//  r2  number of bytes remaining to be copied
//  r3  temporary to hold the byte that is being copied
//  ip  pointer to the destination of the next byte to be copied

0:  ldrb     r3,    [r1], $1
    strb     r3,    [ip], $1
    subs     r2,          $1
    bne      0b
    bx       lr
    
/*****************************************************************************
 *  STMIA loop for 1k-32k buffers                                            *
 *****************************************************************************/
    
// This loop copies 64 bytes each iteration in order of ascending memory
// address, using the GPRs instead of NEON.
//
//  r0  original destination pointer
//  r1  pointer to the next element to be copied
//  r2  (bytes remaining to be copied) - 64
//  r3-6,r8-11 (COPY_REGISTERS) temporary registers used for moving data
//  ip  pointer to the next location to store to

L_useSTMIA:
    push     SAVE_REGISTERS
.align 3
0:  ldmia   r1!,    COPY_REGISTERS
    subs    r2,     r2,  $64
    stmia   ip!,    COPY_REGISTERS
    ldmia   r1!,    COPY_REGISTERS
    pld     [r1, $64]
    stmia   ip!,    COPY_REGISTERS
    bge     0b
    pop     SAVE_REGISTERS
    b       L_vectorCleanup
    
/*****************************************************************************
 *  Misaligned forward vld1 loop                                             *
 *****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.  
//
// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
// which we combine with the 8 bytes loaded in the previous iteration to get a
// 16 byte field; the next 8 bytes to be stored to the destination buffer are
// somewhere in that field, and we get them using the VEXT instruction:
//      
//     |  8 bytes from last iteration  |  8 bytes from this iteration  |
//     +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
//     +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//         ^8 bytes to store this iteration^           |
//                                                   could be a page boundary 
//
// We need to be a little bit careful, however.  Because the loads only have 4
// byte alignment, if we used this approach all the way to the end of the
// buffer, the very last 8 byte load might slop over onto a new page by 4
// bytes, and that new page might not be mapped into our process.  Thus, we
// terminate this copy loop when fewer than 12 bytes remain to be copied,
// instead of the more natural-seeming termination condition of "8 bytes
// remaining" (the illustration above shows the worst case and demonstrates
// why 12 is a sufficiently safe condition).
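//
// A sketch of this guard in C (illustrative only):
//
//     while (remaining >= 12) {  // 8 bytes consumed + 4-byte cushion
//         /* one vector iteration: aligned loads, VEXT, aligned store */
//         remaining -= 8;
//     }
//     /* finish the tail one byte at a time */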
//
// At the beginning of each iteration through this loop, registers are used
// as follows:
//
//  r0  original destination pointer
//  r1  pointer to the next block of 8 bytes to load
//  r2  (bytes remaining to copy) - 12
//  ip  pointer to the next block of 8 bytes to store
//  d0  next 8 bytes to store
//  d2  8 bytes loaded in the previous iteration
//  d3  8 bytes loaded two iterations ago

#define COPY_UNALIGNED(offset)       \
    subs      r2,          $4       ;\
    blt       2f                    ;\
    vld1.32  {d2,d3}, [r1]!         ;\
    subs      r2,          $8       ;\
    blt       1f                    ;\
0:  vext.8    d0,  d2, d3, $(offset);\
    vmov      d2,      d3           ;\
    vld1.32  {d3},    [r1]!         ;\
    subs      r2,          $8       ;\
    vst1.64  {d0},    [ip, :64]!    ;\
    bge       0b                    ;\
1:  vext.8    d0,  d2, d3, $(offset);\
    sub       r1,          $8       ;\
    vst1.64  {d0},    [ip, :64]!    ;\
2:  add       r1,          $(offset);\
    add       r2,          $4       ;\
    b         L_scalarCopy

L_sourceAligned1:
    COPY_UNALIGNED(1)
L_sourceAligned2:
    COPY_UNALIGNED(2)
L_sourceAligned3:
    COPY_UNALIGNED(3)

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD