6 #include "sha3-allInOne.h"
\r
12 #include <x86intrin.h>
\r
// 32-bit FNV prime used by the FNV() mixing step below.
const int FNV_PRIME = 0x01000193;

// One FNV-style mixing step: multiply v1 by FNV_PRIME, XOR with v2, and
// convert the result to int32_t. Both arguments are parenthesised so that
// compound expressions can be passed safely.
#define FNV(v1,v2) int32_t( ((v1)*FNV_PRIME) ^ (v2) )
\r
18 struct Mat256x256i8 {
\r
21 void toIdentityMatrix() {
\r
22 for(int i = 0; i < 256; i++) {
\r
23 for(int j = 0; j < 256; j++) {
\r
24 d[i][j] = (i==j)?1:0; // diagonal
\r
29 void copyFrom(const Mat256x256i8& other) {
\r
30 for(int i = 0; i < 256; i++) {
\r
31 for(int j = 0; j < 256; j++) {
\r
32 this->d[j][i] = other.d[j][i];
\r
38 // this->toIdentityMatrix();
\r
41 Mat256x256i8(const Mat256x256i8& other) {
\r
42 this->copyFrom(other);
\r
45 void copyFrom_helper(LTCMemory& ltcMem, int offset) {
\r
46 for(int i = 0; i < 256; i++) {
\r
47 const Words32& lo=ltcMem.get(i*4 + offset);
\r
48 const Words32& hi=ltcMem.get(i*4 + 2 + offset);
\r
49 for(int j = 0; j < 64; j++) {
\r
50 uint32_t i32 = j>=32?hi.get(j-32):lo.get(j);
\r
51 d[j*4+0][i] = (i32>> 0) & 0xFF;
\r
52 d[j*4+1][i] = (i32>> 8) & 0xFF;
\r
53 d[j*4+2][i] = (i32>>16) & 0xFF;
\r
54 d[j*4+3][i] = (i32>>24) & 0xFF;
\r
59 void copyFromEven(LTCMemory& ltcMem) {
\r
60 copyFrom_helper(ltcMem, 0);
\r
63 void copyFromOdd(LTCMemory& ltcMem) {
\r
64 copyFrom_helper(ltcMem, 1);
\r
67 void add(Mat256x256i8& a, Mat256x256i8& b) {
\r
68 for(int i = 0; i < 256; i++) {
\r
69 for(int j = 0; j < 256; j++) {
\r
70 int tmp = int(a.d[i][j]) + int(b.d[i][j]);
\r
71 this->d[i][j] = (tmp & 0xFF);
\r
77 struct Mat256x256i16 {
\r
78 int16_t d[256][256];
\r
80 void toIdentityMatrix() {
\r
81 for(int i = 0; i < 256; i++) {
\r
82 for(int j = 0; j < 256; j++) {
\r
83 d[i][j] = (i==j?1:0); // diagonal
\r
88 void copyFrom(const Mat256x256i8& other) {
\r
89 for(int i = 0; i < 256; i++) {
\r
90 for(int j = 0; j < 256; j++) {
\r
91 this->d[j][i] = int16_t(other.d[j][i]);
\r
92 assert(this->d[j][i] == other.d[j][i]);
\r
97 void copyFrom(const Mat256x256i16& other) {
\r
98 for(int i = 0; i < 256; i++) {
\r
99 for(int j = 0; j < 256; j++) {
\r
100 this->d[j][i] = other.d[j][i];
\r
106 // this->toIdentityMatrix();
\r
109 Mat256x256i16(const Mat256x256i16& other) {
\r
110 this->copyFrom(other);
\r
113 void copyFrom_helper(LTCMemory& ltcMem, int offset) {
\r
114 for(int i = 0; i < 256; i++) {
\r
115 const Words32& lo = ltcMem.get(i*4 + offset);
\r
116 const Words32& hi = ltcMem.get(i*4 + 2 + offset);
\r
117 for(int j = 0; j < 64; j++) {
\r
118 uint32_t i32 = j>=32?hi.get(j-32):lo.get(j);
\r
119 d[j*4+0][i] = int8_t((i32>> 0) & 0xFF);
\r
120 d[j*4+1][i] = int8_t((i32>> 8) & 0xFF);
\r
121 d[j*4+2][i] = int8_t((i32>>16) & 0xFF);
\r
122 d[j*4+3][i] = int8_t((i32>>24) & 0xFF);
\r
127 void copyFromEven(LTCMemory& ltcMem) {
\r
128 copyFrom_helper(ltcMem, 0);
\r
131 void copyFromOdd(LTCMemory& ltcMem) {
\r
132 copyFrom_helper(ltcMem, 1);
\r
135 void mul(const Mat256x256i16& a, const Mat256x256i16& b) {
\r
136 for(int i = 0; i < 256; i += 16) {
\r
137 for(int j = 0; j < 256; j += 16) {
\r
138 for(int ii = i; ii < i+16; ii += 8) {
\r
139 __m256i r[8],s,t[8],u[8],m[8];
\r
140 r[0] = _mm256_set1_epi16(0);
\r
141 r[1] = _mm256_set1_epi16(0);
\r
142 r[2] = _mm256_set1_epi16(0);
\r
143 r[3] = _mm256_set1_epi16(0);
\r
144 r[4] = _mm256_set1_epi16(0);
\r
145 r[5] = _mm256_set1_epi16(0);
\r
146 r[6] = _mm256_set1_epi16(0);
\r
147 r[7] = _mm256_set1_epi16(0);
\r
148 for(int k = 0; k < 256; k++) {
\r
149 s = *((__m256i*)(&(b.d[k][j])));
\r
150 u[0] = _mm256_set1_epi16(a.d[ii+0][k]);
\r
151 u[1] = _mm256_set1_epi16(a.d[ii+1][k]);
\r
152 u[2] = _mm256_set1_epi16(a.d[ii+2][k]);
\r
153 u[3] = _mm256_set1_epi16(a.d[ii+3][k]);
\r
154 u[4] = _mm256_set1_epi16(a.d[ii+4][k]);
\r
155 u[5] = _mm256_set1_epi16(a.d[ii+5][k]);
\r
156 u[6] = _mm256_set1_epi16(a.d[ii+6][k]);
\r
157 u[7] = _mm256_set1_epi16(a.d[ii+7][k]);
\r
158 m[0] = _mm256_mullo_epi16(u[0],s);
\r
159 m[1] = _mm256_mullo_epi16(u[1],s);
\r
160 m[2] = _mm256_mullo_epi16(u[2],s);
\r
161 m[3] = _mm256_mullo_epi16(u[3],s);
\r
162 m[4] = _mm256_mullo_epi16(u[4],s);
\r
163 m[5] = _mm256_mullo_epi16(u[5],s);
\r
164 m[6] = _mm256_mullo_epi16(u[6],s);
\r
165 m[7] = _mm256_mullo_epi16(u[7],s);
\r
166 r[0] = _mm256_add_epi16(r[0],m[0]);
\r
167 r[1] = _mm256_add_epi16(r[1],m[1]);
\r
168 r[2] = _mm256_add_epi16(r[2],m[2]);
\r
169 r[3] = _mm256_add_epi16(r[3],m[3]);
\r
170 r[4] = _mm256_add_epi16(r[4],m[4]);
\r
171 r[5] = _mm256_add_epi16(r[5],m[5]);
\r
172 r[6] = _mm256_add_epi16(r[6],m[6]);
\r
173 r[7] = _mm256_add_epi16(r[7],m[7]);
\r
175 t[0] = _mm256_slli_epi16(r[0],8);
\r
176 t[1] = _mm256_slli_epi16(r[1],8);
\r
177 t[2] = _mm256_slli_epi16(r[2],8);
\r
178 t[3] = _mm256_slli_epi16(r[3],8);
\r
179 t[4] = _mm256_slli_epi16(r[4],8);
\r
180 t[5] = _mm256_slli_epi16(r[5],8);
\r
181 t[6] = _mm256_slli_epi16(r[6],8);
\r
182 t[7] = _mm256_slli_epi16(r[7],8);
\r
183 t[0] = _mm256_add_epi16(r[0],t[0]);
\r
184 t[1] = _mm256_add_epi16(r[1],t[1]);
\r
185 t[2] = _mm256_add_epi16(r[2],t[2]);
\r
186 t[3] = _mm256_add_epi16(r[3],t[3]);
\r
187 t[4] = _mm256_add_epi16(r[4],t[4]);
\r
188 t[5] = _mm256_add_epi16(r[5],t[5]);
\r
189 t[6] = _mm256_add_epi16(r[6],t[6]);
\r
190 t[7] = _mm256_add_epi16(r[7],t[7]);
\r
191 for(int x = 0; x < 8; x++) {
\r
192 this->d[ii+x][j+0 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*0 +1)));
\r
193 this->d[ii+x][j+1 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*1 +1)));
\r
194 this->d[ii+x][j+2 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*2 +1)));
\r
195 this->d[ii+x][j+3 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*3 +1)));
\r
196 this->d[ii+x][j+4 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*4 +1)));
\r
197 this->d[ii+x][j+5 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*5 +1)));
\r
198 this->d[ii+x][j+6 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*6 +1)));
\r
199 this->d[ii+x][j+7 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*7 +1)));
\r
200 this->d[ii+x][j+8 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*8 +1)));
\r
201 this->d[ii+x][j+9 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*9 +1)));
\r
202 this->d[ii+x][j+10] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*10+1)));
\r
203 this->d[ii+x][j+11] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*11+1)));
\r
204 this->d[ii+x][j+12] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*12+1)));
\r
205 this->d[ii+x][j+13] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*13+1)));
\r
206 this->d[ii+x][j+14] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*14+1)));
\r
207 this->d[ii+x][j+15] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*15+1)));
\r
214 void add(Mat256x256i16& a, Mat256x256i16& b) {
\r
215 for(int i = 0; i < 256; i++) {
\r
216 for(int j = 0; j < 256; j++) {
\r
217 int tmp = int(a.d[i][j]) + int(b.d[i][j]);
\r
218 this->d[i][j] = (tmp & 0xFF);
\r
223 void toMatI8(Mat256x256i8& other) {
\r
224 for(int i = 0; i < 256; i++) {
\r
225 for(int j = 0; j < 256; j++) {
\r
226 other.d[j][i] = (this->d[j][i]) & 0xFF;
\r
231 void topup(Mat256x256i8& other) {
\r
232 for(int i = 0; i < 256; i++) {
\r
233 for(int j = 0; j < 256; j++) {
\r
234 other.d[j][i] += (this->d[j][i]) & 0xFF;
\r
241 struct Arr256x64i32 {
\r
242 uint32_t d[256][64];
\r
244 uint8_t* d0RawPtr() {
\r
245 return (uint8_t*)(d[0]);
\r
248 Arr256x64i32(const Mat256x256i8& mat) {
\r
249 for(int j = 0; j < 256; j++) {
\r
250 for(int i = 0; i < 64; i++) {
\r
251 d[j][i] = ((uint32_t(uint8_t(mat.d[j][i + 192]))) << 24) |
\r
252 ((uint32_t(uint8_t(mat.d[j][i + 128]))) << 16) |
\r
253 ((uint32_t(uint8_t(mat.d[j][i + 64]))) << 8) |
\r
254 ((uint32_t(uint8_t(mat.d[j][i]))) << 0);
\r
260 for(int k = 256; k > 1; k = k/2) {
\r
261 for(int j = 0; j < k/2; j++) {
\r
262 for(int i = 0; i < 64; i++) {
\r
263 d[j][i] = FNV(d[j][i], d[j + k/2][i]);
\r
270 // struct BytomMatList8 {
\r
271 // std::vector<Mat256x256i8*> matVec;
\r
273 // Mat256x256i8 at(int i) {
\r
274 // return *(matVec[i]);
\r
277 // BytomMatList8() {
\r
278 // for(int i=0; i<256; i++) {
\r
279 // Mat256x256i8* ptr = new Mat256x256i8;
\r
280 // assert(ptr!=NULL);
\r
281 // matVec.push_back(ptr);
\r
285 // ~BytomMatList8() {
\r
286 // for(int i=0; i<256; i++) {
\r
287 // delete matVec[i];
\r
291 // void init(const Words32& X_in) {
\r
292 // Words32 X = X_in;
\r
293 // LTCMemory ltcMem;
\r
294 // for(int i=0; i<128; i++) {
\r
295 // ltcMem.scrypt(X);
\r
296 // matVec[2*i]->copyFromEven(ltcMem);
\r
297 // matVec[2*i+1]->copyFromOdd(ltcMem);
\r
302 struct BytomMatList16 {
\r
303 std::vector<Mat256x256i16*> matVec;
\r
305 Mat256x256i16 at(int i) {
\r
306 return *(matVec[i]);
\r
310 for(int i = 0; i < 256; i++) {
\r
311 Mat256x256i16* ptr = new Mat256x256i16;
\r
312 assert(ptr != NULL);
\r
313 matVec.push_back(ptr);
\r
317 ~BytomMatList16() {
\r
318 for(int i = 0; i < 256; i++)
\r
322 void init(const Words32& X_in) {
\r
325 for(int i = 0; i < 128; i++) {
\r
327 matVec[2*i]->copyFromEven(ltcMem);
\r
328 matVec[2*i + 1]->copyFromOdd(ltcMem);
\r
332 // void copyFrom(BytomMatList8& other) {
\r
333 // for(int i=0; i<256; i++) {
\r
334 // matVec[i]->copyFrom(*other.matVec[i]);
\r
338 // void copyFrom(BytomMatList16& other) {
\r
339 // for(int i=0; i<256; i++) {
\r
340 // matVec[i]->copyFrom(*other.matVec[i]);
\r
345 // extern BytomMatList8* matList_int8;
\r
346 extern BytomMatList16* matList_int16;
\r
348 inline void iter_mineBytom(const uint8_t *fixedMessage,
\r
350 // uint8_t nonce[8],
\r
351 uint8_t result[32]) {
\r
352 Mat256x256i8 *resArr8 = new Mat256x256i8[4];
\r
354 clock_t start, end;
\r
356 // Itz faster using single thread ...
\r
357 #pragma omp parallel for simd
\r
358 for(int k = 0; k < 4; k++) { // The k-loop
\r
359 sha3_ctx *ctx = new sha3_ctx;
\r
360 Mat256x256i16 *mat16 = new Mat256x256i16;
\r
361 Mat256x256i16 *tmp16 = new Mat256x256i16;
\r
362 uint8_t sequence[32];
\r
363 rhash_sha3_256_init(ctx);
\r
364 rhash_sha3_update(ctx, fixedMessage + (len*k/4), len/4);//分四轮消耗掉fixedMessage
\r
365 rhash_sha3_final(ctx, sequence);
\r
366 tmp16->toIdentityMatrix();
\r
368 for(int j = 0; j < 2; j++) {
\r
369 // equivalent as tmp=tmp*matlist, i+=1
\r
370 for(int i = 0; i < 32; i += 2) {
\r
371 // "mc = ma dot mb.T" in GoLang code
\r
372 mat16->mul(*tmp16, matList_int16->at(sequence[i]));
\r
373 // "ma = mc" in GoLang code
\r
374 tmp16->mul(*mat16, matList_int16->at(sequence[i+1]));
\r
377 // "res[k] = mc" in GoLang code
\r
378 tmp16->toMatI8(resArr8[k]); // 0.00018s
\r
385 Mat256x256i8 *res8 = new Mat256x256i8;
\r
386 res8->add(resArr8[0], resArr8[1]);
\r
387 res8->add(*res8, resArr8[2]);
\r
388 res8->add(*res8, resArr8[3]);
\r
391 // std::cout << "\tTime for getting MulMatix: "
\r
392 // << (double)(end - start) / CLOCKS_PER_SEC * 1000 << "ms"
\r
395 Arr256x64i32 arr(*res8);
\r
397 sha3_ctx *ctx = new sha3_ctx;
\r
398 rhash_sha3_256_init(ctx);
\r
399 rhash_sha3_update(ctx, arr.d0RawPtr(), 256);
\r
400 rhash_sha3_final(ctx, result);
\r
/**
 * Increments the 8-byte nonce by one, treating nonce[0] as the least
 * significant byte. A byte equal to 255 wraps to 0 and the carry moves on to
 * the next byte; a nonce of all 255s wraps around to all zeros.
 *
 * NOTE(review): only the loop header and the `!= 255` test are visible in
 * this excerpt; the increment/carry statements are reconstructed — confirm
 * against the full file.
 */
inline void incrNonce(uint8_t nonce[8]) {
    for (int i = 0; i < 8; i++) {
        if (nonce[i] != 255) {
            nonce[i]++;
            break;      // no carry: done
        }
        nonce[i] = 0;   // wrapped: carry into the next byte
    }
}
\r
/**
 * Counts the leading zero bits of a 32-byte value whose most significant
 * byte is result[31].
 *
 * Bytes are scanned from index 31 down to 0. Every zero byte contributes 8;
 * the first non-zero byte contributes its own leading zero bits and ends the
 * scan. This includes bytes >= 128, which contribute 0 and must still stop
 * the scan so that zero bytes further down are not counted.
 *
 * NOTE(review): only the comparison chain is visible in this excerpt; the
 * per-branch counts and the return statement are reconstructed — confirm
 * against the full file.
 *
 * @return number of leading zero bits, 0..256
 */
inline int countLeadingZero(uint8_t result[32]) {
    int count = 0;
    for (int i = 31; i >= 0; i--) { // NOTE: reverse
        const unsigned value = result[i];
        if (value == 0) {
            count += 8;
            continue;
        }
        // First non-zero byte: count zero bits above its highest set bit.
        for (unsigned mask = 0x80; (value & mask) == 0; mask >>= 1) {
            count++;
        }
        break;
    }
    return count;
}
\r
449 // inline int test_mineBytom(
\r
450 // const uint8_t *fixedMessage,
\r
452 // uint8_t nonce[32],
\r
454 // int leadingZeroThres)
\r
456 // assert(len%4==0);
\r
458 // for(step=0; step<count; step++) {
\r
459 // uint8_t result[32];
\r
460 // //std::cerr<<"Mine step "<<step<<std::endl;
\r
461 // iter_mineBytom(fixedMessage,100,nonce,result);
\r
462 // std::cerr<<"Mine step "<<step<<std::endl;
\r
463 // for (int i = 0; i < 32; i++) {
\r
464 // printf("%02x ", result[i]);
\r
468 // if (countLeadingZero(result) > leadingZeroThres)
\r
470 // incrNonce(nonce);
\r