// Size in bytes (128 KiB) of the fixed-size block that the benchmark loops
// pass as the length of the repeated small memset/memcpy calls.
// The replacement list is parenthesized so the macro expands correctly inside
// larger expressions (e.g. `CACHE_HIT_SIZE + 1` or `CACHE_HIT_SIZE / 2`),
// where the bare `1 << 17` would be re-associated by operator precedence.
#define CACHE_HIT_SIZE (1 << 17)
// Sweep configuration. The defaults below can be overridden from the command
// line (--start / --end / --samples).

// Smallest transfer size, in bytes, at which the sweep begins.
size_t size_start{64};
// Largest transfer size, in bytes (16 MiB); also the size of the src/dst
// buffers that get allocated.
size_t size_end{16ull << 20};
// Number of steps the logarithmic size range is divided into.
size_t samples{2048};
// Byte budget per measured size (64 MiB): the iteration count for a given
// size is this value divided by that size.
size_t size_per_test{64ull << 20};
// Benchmark kernels. They are declared noinline so that each call made from
// the timing loops remains a real function call. The definitions are not part
// of this section of the file.

// Copy kernel taking a destination, a source and a byte count.
// NOTE(review): presumably forwards to memcpy -- confirm against the definition.
__attribute__((noinline))
void memcpy_noinline(void *dst, void *src, size_t size);

// Fill kernel taking a destination, a fill value and a byte count.
// NOTE(review): presumably forwards to memset -- confirm against the definition.
__attribute__((noinline))
void memset_noinline(void *dst, int value, size_t size);

// Read kernel taking a volatile source buffer and a byte count; returns a
// 64-bit result that the caller accumulates.
// NOTE(review): presumably the sum of the buffer contents -- confirm.
__attribute__((noinline))
uint64_t sum(volatile void *src, size_t size);
35 static void usage(char* p) {
36 printf("Usage: %s <test> <options>\n"
37 "<test> is one of the following:\n"
41 "<options> are optional and apply to all tests:\n"
43 " Simulates cpu-only load of a test. Guaranteed to use L2\n"
44 " instead. Not supported on --sum test.\n"
45 " --delay DELAY_DIVISOR\n"
46 " --start START_SIZE_MB\n"
47 " --end END_SIZE_MB (requires start, optional)\n"
48 " --samples NUM_SAMPLES\n"
52 int main(int argc, char *argv[])
54 BenchType type = MemcpyBench;
59 for (int i = 1; i < argc; i++) {
60 if (string(argv[i]) == string("--memcpy")) {
62 } else if (string(argv[i]) == string("--memset")) {
64 } else if (string(argv[i]) == string("--sum")) {
66 } else if (string(argv[i]) == string("--dummy")) {
68 } else if (i + 1 < argc) {
69 if (string(argv[i]) == string("--delay")) {
70 delay = atoi(argv[++i]);
71 } else if (string(argv[i]) == string("--start")) {
72 size_start = atoi(argv[++i]) * (1ull << 20);
73 size_end = size_start;
74 } else if (string(argv[i]) == string("--end")) {
75 size_t end = atoi(argv[++i]) * (1ull << 20);
76 if (end > size_start && i > 3
77 && string(argv[i-3]) == string("--start")) {
80 printf("Cannot specify --end without --start.\n");
83 } else if (string(argv[i]) == string("--samples")) {
84 samples = atoi(argv[++i]);
86 printf("Unknown argument %s\n", argv[i]);
90 printf("The %s option requires a single argument.\n", argv[i]);
95 unique_ptr<uint8_t[]> src(new uint8_t[size_end]);
96 unique_ptr<uint8_t[]> dst(new uint8_t[size_end]);
97 memset(src.get(), 1, size_end);
99 double start_pow = log10(size_start);
100 double end_pow = log10(size_end);
101 double pow_inc = (end_pow - start_pow) / samples;
103 //cout << "src: " << (uintptr_t)src.get() << endl;
104 //cout << "dst: " << (uintptr_t)dst.get() << endl;
106 for (double cur_pow = start_pow; cur_pow <= end_pow && samples > 0;
107 cur_pow += pow_inc) {
108 chrono::time_point<chrono::high_resolution_clock>
109 copy_start, copy_end, pre_wait;
111 size_t cur_size = (size_t)pow(10.0, cur_pow);
112 size_t iter_per_size = size_per_test / cur_size;
117 memcpy_noinline(src.get(), dst.get(), cur_size);
118 memset_noinline(dst.get(), 0xdeadbeef, cur_size);
119 size_t hit_size = CACHE_HIT_SIZE;
120 copy_start = chrono::high_resolution_clock::now();
121 for (int i = 0; i < iter_per_size; i++) {
123 memset_noinline(dst.get(), 0xdeadbeef, cur_size);
125 while (hit_size < cur_size) {
127 (dst.get(), 0xdeadbeef, CACHE_HIT_SIZE);
132 this_thread::sleep_for(chrono
133 ::nanoseconds(size_per_test / delay));
135 copy_end = chrono::high_resolution_clock::now();
139 memcpy_noinline(dst.get(), src.get(), cur_size);
140 memcpy_noinline(src.get(), dst.get(), cur_size);
141 size_t hit_size = CACHE_HIT_SIZE;
142 copy_start = chrono::high_resolution_clock::now();
143 for (int i = 0; i < iter_per_size; i++) {
145 memcpy_noinline(dst.get(), src.get(), cur_size);
147 while (hit_size < cur_size) {
149 (dst.get(), src.get(), CACHE_HIT_SIZE);
150 hit_size += CACHE_HIT_SIZE;
154 this_thread::sleep_for(chrono
155 ::nanoseconds(size_per_test / delay));
157 copy_end = chrono::high_resolution_clock::now();
162 s += sum(src.get(), cur_size);
163 copy_start = chrono::high_resolution_clock::now();
164 for (int i = 0; i < iter_per_size; i++) {
165 s += sum(src.get(), cur_size);
167 this_thread::sleep_for(chrono
168 ::nanoseconds(size_per_test / delay));
170 copy_end = chrono::high_resolution_clock::now();
177 double ns_per_copy = chrono::duration_cast<chrono::nanoseconds>(copy_end - copy_start).count() / double(iter_per_size);
178 double gb_per_sec = ((double)cur_size / (1ull<<30)) / (ns_per_copy / 1.0E9);
179 if (type == MemcpyBench)
181 double percent_waiting = 0;
183 percent_waiting = (size_per_test / delay) / ns_per_copy * 100;
185 cout << "size: " << cur_size << ", perf: " << gb_per_sec
186 << "GB/s, iter: " << iter_per_size << ", \% time spent waiting: "
187 << percent_waiting << endl;