OSDN Git Service

Merge "Add read bandwidth and small refactor."
[android-x86/system-extras.git] / tests / memtest / bandwidth.h
1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifndef __BANDWIDTH_H__
18 #define __BANDWIDTH_H__
19
20 #include "memtest.h"
21
22 // Bandwidth Class definitions.
23 class BandwidthBenchmark {
24 public:
25     BandwidthBenchmark()
26         : _size(0),
27           _num_warm_loops(DEFAULT_NUM_WARM_LOOPS),
28           _num_loops(DEFAULT_NUM_LOOPS) {}
29     virtual ~BandwidthBenchmark() {}
30
31     bool run() {
32         if (_size == 0) {
33             return false;
34         }
35         if (!canRun()) {
36             return false;
37         }
38
39         bench(_num_warm_loops);
40
41         nsecs_t t = system_time();
42         bench(_num_loops);
43         t = system_time() - t;
44
45         _mb_per_sec = (_size*(_num_loops/_BYTES_PER_MB))/(t/_NUM_NS_PER_SEC);
46
47         return true;
48     }
49
50     bool canRun() { return !usesNeon() || isNeonSupported(); }
51
52     virtual bool setSize(size_t size) = 0;
53
54     virtual const char *getName() = 0;
55
56     virtual bool verify() = 0;
57
58     virtual bool usesNeon() { return false; }
59
60     bool isNeonSupported() {
61 #if defined(__ARM_NEON__)
62         return true;
63 #else
64         return false;
65 #endif
66     }
67
68     // Accessors/mutators.
69     double mb_per_sec() { return _mb_per_sec; }
70     size_t num_warm_loops() { return _num_warm_loops; }
71     size_t num_loops() { return _num_loops; }
72     size_t size() { return _size; }
73
74     void set_num_warm_loops(size_t num_warm_loops) {
75         _num_warm_loops = num_warm_loops;
76     }
77     void set_num_loops(size_t num_loops) { _num_loops = num_loops; }
78
79     // Static constants
80     static const unsigned int DEFAULT_NUM_WARM_LOOPS = 1000000;
81     static const unsigned int DEFAULT_NUM_LOOPS = 20000000;
82
83 protected:
84     virtual void bench(size_t num_loops) = 0;
85
86     double _mb_per_sec;
87     size_t _size;
88     size_t _num_warm_loops;
89     size_t _num_loops;
90
91 private:
92     // Static constants
93     static const double _NUM_NS_PER_SEC = 1000000000.0;
94     static const double _BYTES_PER_MB = 1024.0* 1024.0;
95 };
96
97 class CopyBandwidthBenchmark : public BandwidthBenchmark {
98 public:
99     CopyBandwidthBenchmark() : BandwidthBenchmark(), _src(NULL), _dst(NULL) { }
100
101     bool setSize(size_t size) {
102         if (_src) {
103            free(_src);
104         }
105         if (_dst) {
106             free(_dst);
107         }
108
109         if (size == 0) {
110             _size = DEFAULT_COPY_SIZE;
111         } else {
112             _size = size;
113         }
114
115         _src = reinterpret_cast<char*>(memalign(64, _size));
116         if (!_src) {
117             perror("Failed to allocate memory for test.");
118             return false;
119         }
120         _dst = reinterpret_cast<char*>(memalign(64, _size));
121         if (!_dst) {
122             perror("Failed to allocate memory for test.");
123             return false;
124         }
125
126         return true;
127     }
128     virtual ~CopyBandwidthBenchmark() {
129         if (_src) {
130             free(_src);
131             _src = NULL;
132         }
133         if (_dst) {
134             free(_dst);
135             _dst = NULL;
136         }
137     }
138
139     bool verify() {
140         memset(_src, 0x23, _size);
141         memset(_dst, 0, _size);
142         bench(1);
143         if (memcmp(_src, _dst, _size) != 0) {
144             printf("Strings failed to compare after one loop.\n");
145             return false;
146         }
147
148         memset(_src, 0x23, _size);
149         memset(_dst, 0, _size);
150         _num_loops = 2;
151         bench(2);
152         if (memcmp(_src, _dst, _size) != 0) {
153             printf("Strings failed to compare after two loops.\n");
154             return false;
155         }
156
157         return true;
158     }
159
160 protected:
161     char *_src;
162     char *_dst;
163
164     static const unsigned int DEFAULT_COPY_SIZE = 8000;
165 };
166
167 class CopyLdrdStrdBenchmark : public CopyBandwidthBenchmark {
168 public:
169     CopyLdrdStrdBenchmark() : CopyBandwidthBenchmark() { }
170     virtual ~CopyLdrdStrdBenchmark() {}
171
172     const char *getName() { return "ldrd/strd"; }
173
174 protected:
175     // Copy using ldrd/strd instructions.
176     void bench(size_t num_loops) {
177         asm volatile(
178             "stmfd sp!, {r0,r1,r2,r3,r4,r6,r7}\n"
179
180             "mov r0, %0\n"
181             "mov r1, %1\n"
182             "mov r2, %2\n"
183             "mov r3, %3\n"
184
185             "0:\n"
186             "mov r4, r2, lsr #6\n"
187
188             "1:\n"
189             "ldrd r6, r7, [r0]\n"
190             "strd r6, r7, [r1]\n"
191             "ldrd r6, r7, [r0, #8]\n"
192             "strd r6, r7, [r1, #8]\n"
193             "ldrd r6, r7, [r0, #16]\n"
194             "strd r6, r7, [r1, #16]\n"
195             "ldrd r6, r7, [r0, #24]\n"
196             "strd r6, r7, [r1, #24]\n"
197             "ldrd r6, r7, [r0, #32]\n"
198             "strd r6, r7, [r1, #32]\n"
199             "ldrd r6, r7, [r0, #40]\n"
200             "strd r6, r7, [r1, #40]\n"
201             "ldrd r6, r7, [r0, #48]\n"
202             "strd r6, r7, [r1, #48]\n"
203             "ldrd r6, r7, [r0, #56]\n"
204             "strd r6, r7, [r1, #56]\n"
205
206             "add  r0, r0, #64\n"
207             "add  r1, r1, #64\n"
208             "subs r4, r4, #1\n"
209             "bgt 1b\n"
210
211             "sub r0, r0, r2\n"
212             "sub r1, r1, r2\n"
213             "subs r3, r3, #1\n"
214             "bgt 0b\n"
215
216             "ldmfd sp!, {r0,r1,r2,r3,r4,r6,r7}\n"
217         :: "r" (_src), "r" (_dst), "r" (_size), "r" (num_loops) : "r0", "r1", "r2", "r3");
218     }
219 };
220
221 class CopyLdmiaStmiaBenchmark : public CopyBandwidthBenchmark {
222 public:
223     CopyLdmiaStmiaBenchmark() : CopyBandwidthBenchmark() { }
224     virtual ~CopyLdmiaStmiaBenchmark() {}
225
226     const char *getName() { return "ldmia/stmia"; }
227
228 protected:
229     // Copy using ldmia/stmia instructions.
230     void bench(size_t num_loops) {
231         asm volatile(
232             "stmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12}\n"
233
234             "mov r0, %0\n"
235             "mov r1, %1\n"
236             "mov r2, %2\n"
237             "mov r3, %3\n"
238
239             "0:\n"
240             "mov r4, r2, lsr #6\n"
241
242             "1:\n"
243             "ldmia r0!, {r5, r6, r7, r8, r9, r10, r11, r12}\n"
244             "stmia r1!, {r5, r6, r7, r8, r9, r10, r11, r12}\n"
245             "subs r4, r4, #1\n"
246             "ldmia r0!, {r5, r6, r7, r8, r9, r10, r11, r12}\n"
247             "stmia r1!, {r5, r6, r7, r8, r9, r10, r11, r12}\n"
248             "bgt 1b\n"
249
250             "sub r0, r0, r2\n"
251             "sub r1, r1, r2\n"
252             "subs r3, r3, #1\n"
253             "bgt 0b\n"
254
255             "ldmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12}\n"
256         :: "r" (_src), "r" (_dst), "r" (_size), "r" (num_loops) : "r0", "r1", "r2", "r3");
257     }
258 };
259
260 class CopyVldVstBenchmark : public CopyBandwidthBenchmark {
261 public:
262     CopyVldVstBenchmark() : CopyBandwidthBenchmark() { }
263     virtual ~CopyVldVstBenchmark() {}
264
265     const char *getName() { return "vld/vst"; }
266
267 protected:
268     // Copy using vld/vst instructions.
269     void bench(size_t num_loops) {
270         asm volatile(
271             "stmfd sp!, {r0,r1,r2,r3,r4}\n"
272
273             "mov r0, %0\n"
274             "mov r1, %1\n"
275             "mov r2, %2\n"
276             "mov r3, %3\n"
277
278             "0:\n"
279             "mov r4, r2, lsr #6\n"
280
281             "1:\n"
282             "vld1.8 {d0-d3}, [r0]!\n"
283             "vld1.8 {d4-d7}, [r0]!\n"
284             "subs r4, r4, #1\n"
285             "vst1.8 {d0-d3}, [r1:128]!\n"
286             "vst1.8 {d4-d7}, [r1:128]!\n"
287             "bgt 1b\n"
288
289             "sub r0, r0, r2\n"
290             "sub r1, r1, r2\n"
291             "subs r3, r3, #1\n"
292             "bgt 0b\n"
293
294             "ldmfd sp!, {r0,r1,r2,r3,r4}\n"
295         :: "r" (_src), "r" (_dst), "r" (_size), "r" (num_loops) : "r0", "r1", "r2", "r3");
296     }
297 };
298
299 class CopyVldmiaVstmiaBenchmark : public CopyBandwidthBenchmark {
300 public:
301     CopyVldmiaVstmiaBenchmark() : CopyBandwidthBenchmark() { }
302     virtual ~CopyVldmiaVstmiaBenchmark() {}
303
304     const char *getName() { return "vldmia/vstmia"; }
305
306 protected:
307     // Copy using vld/vst instructions.
308     void bench(size_t num_loops) {
309         asm volatile(
310             "stmfd sp!, {r0,r1,r2,r3,r4}\n"
311
312             "mov r0, %0\n"
313             "mov r1, %1\n"
314             "mov r2, %2\n"
315             "mov r3, %3\n"
316
317             "0:\n"
318             "mov r4, r2, lsr #6\n"
319
320             "1:\n"
321             "vldmia r0!, {d0-d7}\n"
322             "subs r4, r4, #1\n"
323             "vstmia r1!, {d0-d7}\n"
324             "bgt 1b\n"
325
326             "sub r0, r0, r2\n"
327             "sub r1, r1, r2\n"
328             "subs r3, r3, #1\n"
329             "bgt 0b\n"
330
331             "ldmfd sp!, {r0,r1,r2,r3,r4}\n"
332         :: "r" (_src), "r" (_dst), "r" (_size), "r" (num_loops) : "r0", "r1", "r2", "r3");
333     }
334 };
335
336 class MemcpyBenchmark : public CopyBandwidthBenchmark {
337 public:
338     MemcpyBenchmark() : CopyBandwidthBenchmark() { }
339     virtual ~MemcpyBenchmark() {}
340
341     const char *getName() { return "memcpy"; }
342
343 protected:
344     void bench(size_t num_loops) {
345         for (size_t i = 0; i < num_loops; i++) {
346             memcpy(_dst, _src, _size);
347         }
348     }
349 };
350
351 class SingleBufferBandwidthBenchmark : public BandwidthBenchmark {
352 public:
353     SingleBufferBandwidthBenchmark() : BandwidthBenchmark(), _buffer(NULL) { }
354     virtual ~SingleBufferBandwidthBenchmark() {
355         if (_buffer) {
356             free(_buffer);
357             _buffer = NULL;
358         }
359     }
360
361     bool setSize(size_t size) {
362         if (_buffer) {
363             free(_buffer);
364             _buffer = NULL;
365         }
366
367         if (_size == 0) {
368             _size = DEFAULT_SINGLE_BUFFER_SIZE;
369         } else {
370             _size = size;
371         }
372
373         _buffer = reinterpret_cast<char*>(memalign(64, _size));
374         if (!_buffer) {
375             perror("Failed to allocate memory for test.");
376             return false;
377         }
378         memset(_buffer, 0, _size);
379
380         return true;
381     }
382
383     bool verify() { return true; }
384
385 protected:
386     char *_buffer;
387
388     static const unsigned int DEFAULT_SINGLE_BUFFER_SIZE = 16000;
389 };
390
391 class WriteBandwidthBenchmark : public SingleBufferBandwidthBenchmark {
392 public:
393     WriteBandwidthBenchmark() : SingleBufferBandwidthBenchmark() { }
394     virtual ~WriteBandwidthBenchmark() { }
395
396     bool verify() {
397         memset(_buffer, 0, _size);
398         bench(1);
399         for (size_t i = 0; i < _size; i++) {
400             if (_buffer[i] != 1) {
401                 printf("Strings failed to compare after one loop.\n");
402                 return false;
403             }
404         }
405
406         memset(_buffer, 0, _size);
407         bench(2);
408         for (size_t i = 0; i < _size; i++) {
409             if (_buffer[i] != 2) {
410                 printf("Strings failed to compare after two loops.\n");
411                 return false;
412             }
413         }
414
415         return true;
416     }
417 };
418
419 class WriteStrdBenchmark : public WriteBandwidthBenchmark {
420 public:
421     WriteStrdBenchmark() : WriteBandwidthBenchmark() { }
422     virtual ~WriteStrdBenchmark() {}
423
424     const char *getName() { return "strd"; }
425
426 protected:
427     // Write a given value using strd.
428     void bench(size_t num_loops) {
429         asm volatile(
430             "stmfd sp!, {r0,r1,r2,r3,r4,r5}\n"
431
432             "mov r0, %0\n"
433             "mov r1, %1\n"
434             "mov r2, %2\n"
435
436             "mov r4, #0\n"
437             "mov r5, #0\n"
438
439             "0:\n"
440             "mov r3, r1, lsr #5\n"
441
442             "add r4, r4, #0x01010101\n"
443             "mov r5, r4\n"
444
445             "1:\n"
446             "subs r3, r3, #1\n"
447             "strd r4, r5, [r0]\n"
448             "strd r4, r5, [r0, #8]\n"
449             "strd r4, r5, [r0, #16]\n"
450             "strd r4, r5, [r0, #24]\n"
451             "add  r0, r0, #32\n"
452             "bgt 1b\n"
453
454             "sub r0, r0, r1\n"
455             "subs r2, r2, #1\n"
456             "bgt 0b\n"
457
458             "ldmfd sp!, {r0,r1,r2,r3,r4,r5}\n"
459           :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
460     }
461 };
462
463 class WriteStmiaBenchmark : public WriteBandwidthBenchmark {
464 public:
465     WriteStmiaBenchmark() : WriteBandwidthBenchmark() { }
466     virtual ~WriteStmiaBenchmark() {}
467
468     const char *getName() { return "stmia"; }
469
470 protected:
471       // Write a given value using stmia.
472       void bench(size_t num_loops) {
473           asm volatile(
474               "stmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11}\n"
475
476               "mov r0, %0\n"
477               "mov r1, %1\n"
478               "mov r2, %2\n"
479
480               "mov r4, #0\n"
481
482               "0:\n"
483               "mov r3, r1, lsr #5\n"
484
485               "add r4, r4, #0x01010101\n"
486               "mov r5, r4\n"
487               "mov r6, r4\n"
488               "mov r7, r4\n"
489               "mov r8, r4\n"
490               "mov r9, r4\n"
491               "mov r10, r4\n"
492               "mov r11, r4\n"
493
494               "1:\n"
495               "subs r3, r3, #1\n"
496               "stmia r0!, {r4, r5, r6, r7, r8, r9, r10, r11}\n"
497               "bgt 1b\n"
498
499               "sub r0, r0, r1\n"
500               "subs r2, r2, #1\n"
501               "bgt 0b\n"
502
503               "ldmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11}\n"
504         :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
505     }
506 };
507
508 class WriteVstBenchmark : public WriteBandwidthBenchmark {
509 public:
510     WriteVstBenchmark() : WriteBandwidthBenchmark() { }
511     virtual ~WriteVstBenchmark() {}
512
513     const char *getName() { return "vst"; }
514
515     bool usesNeon() { return true; }
516
517 protected:
518     // Write a given value using vst.
519     void bench(size_t num_loops) {
520 #if defined(__ARM_NEON__)
521         asm volatile(
522             "stmfd sp!, {r0,r1,r2,r3,r4}\n"
523
524             "mov r0, %0\n"
525             "mov r1, %1\n"
526             "mov r2, %2\n"
527             "mov r4, #0\n"
528
529             "0:\n"
530             "mov r3, r1, lsr #5\n"
531
532             "add r4, r4, #1\n"
533             "vdup.8 d0, r4\n"
534             "vmov d1, d0\n"
535             "vmov d2, d0\n"
536             "vmov d3, d0\n"
537
538             "1:\n"
539             "subs r3, r3, #1\n"
540             "vst1.8 {d0-d3}, [r0:128]!\n"
541             "bgt 1b\n"
542
543             "sub r0, r0, r1\n"
544             "subs r2, r2, #1\n"
545             "bgt 0b\n"
546
547             "ldmfd sp!, {r0,r1,r2,r3,r4}\n"
548         :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
549 #endif
550     }
551 };
552
553 class WriteVstmiaBenchmark : public WriteBandwidthBenchmark {
554 public:
555     WriteVstmiaBenchmark() : WriteBandwidthBenchmark() { }
556     virtual ~WriteVstmiaBenchmark() {}
557
558     const char *getName() { return "vstmia"; }
559
560     bool usesNeon() { return true; }
561
562 protected:
563     // Write a given value using vstmia.
564     void bench(size_t num_loops) {
565 #if defined(__ARM_NEON__)
566         asm volatile(
567             "stmfd sp!, {r0,r1,r2,r3,r4}\n"
568
569             "mov r0, %0\n"
570             "mov r1, %1\n"
571             "mov r2, %2\n"
572             "mov r4, #0\n"
573
574             "0:\n"
575             "mov r3, r1, lsr #5\n"
576
577             "add r4, r4, #1\n"
578             "vdup.8 d0, r4\n"
579             "vmov d1, d0\n"
580             "vmov d2, d0\n"
581             "vmov d3, d0\n"
582
583             "1:\n"
584             "subs r3, r3, #1\n"
585             "vstmia r0!, {d0-d3}\n"
586             "bgt 1b\n"
587
588             "sub r0, r0, r1\n"
589             "subs r2, r2, #1\n"
590             "bgt 0b\n"
591
592             "ldmfd sp!, {r0,r1,r2,r3,r4}\n"
593         :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
594 #endif
595     }
596 };
597
598 class MemsetBenchmark : public WriteBandwidthBenchmark {
599 public:
600     MemsetBenchmark() : WriteBandwidthBenchmark() { }
601     virtual ~MemsetBenchmark() {}
602
603     const char *getName() { return "memset"; }
604
605 protected:
606     void bench(size_t num_loops) {
607         for (size_t i = 0; i < num_loops; i++) {
608             memset(_buffer, (i % 255) + 1, _size);
609         }
610     }
611 };
612
613 class ReadLdrdBenchmark : public SingleBufferBandwidthBenchmark {
614 public:
615     ReadLdrdBenchmark() : SingleBufferBandwidthBenchmark() { }
616     virtual ~ReadLdrdBenchmark() {}
617
618     const char *getName() { return "ldrd"; }
619
620 protected:
621     // Write a given value using strd.
622     void bench(size_t num_loops) {
623         asm volatile(
624             "stmfd sp!, {r0,r1,r2,r3,r4,r5}\n"
625
626             "mov r0, %0\n"
627             "mov r1, %1\n"
628             "mov r2, %2\n"
629
630             "0:\n"
631             "mov r3, r1, lsr #5\n"
632
633             "1:\n"
634             "subs r3, r3, #1\n"
635             "ldrd r4, r5, [r0]\n"
636             "ldrd r4, r5, [r0, #8]\n"
637             "ldrd r4, r5, [r0, #16]\n"
638             "ldrd r4, r5, [r0, #24]\n"
639             "add  r0, r0, #32\n"
640             "bgt 1b\n"
641
642             "sub r0, r0, r1\n"
643             "subs r2, r2, #1\n"
644             "bgt 0b\n"
645
646             "ldmfd sp!, {r0,r1,r2,r3,r4,r5}\n"
647           :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
648     }
649 };
650
651 class ReadLdmiaBenchmark : public SingleBufferBandwidthBenchmark {
652 public:
653     ReadLdmiaBenchmark() : SingleBufferBandwidthBenchmark() { }
654     virtual ~ReadLdmiaBenchmark() {}
655
656     const char *getName() { return "ldmia"; }
657
658 protected:
659       // Write a given value using stmia.
660       void bench(size_t num_loops) {
661           asm volatile(
662               "stmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11}\n"
663
664               "mov r0, %0\n"
665               "mov r1, %1\n"
666               "mov r2, %2\n"
667
668               "0:\n"
669               "mov r3, r1, lsr #5\n"
670
671               "1:\n"
672               "subs r3, r3, #1\n"
673               "ldmia r0!, {r4, r5, r6, r7, r8, r9, r10, r11}\n"
674               "bgt 1b\n"
675
676               "sub r0, r0, r1\n"
677               "subs r2, r2, #1\n"
678               "bgt 0b\n"
679
680               "ldmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11}\n"
681         :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
682     }
683 };
684
685 class ReadVldBenchmark : public SingleBufferBandwidthBenchmark {
686 public:
687     ReadVldBenchmark() : SingleBufferBandwidthBenchmark() { }
688     virtual ~ReadVldBenchmark() {}
689
690     const char *getName() { return "vld"; }
691
692     bool usesNeon() { return true; }
693
694 protected:
695     // Write a given value using vst.
696     void bench(size_t num_loops) {
697 #if defined(__ARM_NEON__)
698         asm volatile(
699             "stmfd sp!, {r0,r1,r2,r3}\n"
700
701             "mov r0, %0\n"
702             "mov r1, %1\n"
703             "mov r2, %2\n"
704
705             "0:\n"
706             "mov r3, r1, lsr #5\n"
707
708             "1:\n"
709             "subs r3, r3, #1\n"
710             "vld1.8 {d0-d3}, [r0:128]!\n"
711             "bgt 1b\n"
712
713             "sub r0, r0, r1\n"
714             "subs r2, r2, #1\n"
715             "bgt 0b\n"
716
717             "ldmfd sp!, {r0,r1,r2,r3}\n"
718         :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
719 #endif
720     }
721 };
722
723 class ReadVldmiaBenchmark : public SingleBufferBandwidthBenchmark {
724 public:
725     ReadVldmiaBenchmark() : SingleBufferBandwidthBenchmark() { }
726     virtual ~ReadVldmiaBenchmark() {}
727
728     const char *getName() { return "vldmia"; }
729
730     bool usesNeon() { return true; }
731
732 protected:
733     // Write a given value using vstmia.
734     void bench(size_t num_loops) {
735 #if defined(__ARM_NEON__)
736         asm volatile(
737             "stmfd sp!, {r0,r1,r2,r3}\n"
738
739             "mov r0, %0\n"
740             "mov r1, %1\n"
741             "mov r2, %2\n"
742
743             "0:\n"
744             "mov r3, r1, lsr #5\n"
745
746             "1:\n"
747             "subs r3, r3, #1\n"
748             "vldmia r0!, {d0-d3}\n"
749             "bgt 1b\n"
750
751             "sub r0, r0, r1\n"
752             "subs r2, r2, #1\n"
753             "bgt 0b\n"
754
755             "ldmfd sp!, {r0,r1,r2,r3}\n"
756         :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
757 #endif
758     }
759 };
760
761 #endif  // __BANDWIDTH_H__