mining/tensority/cgo_algorithm/lib/BytomPoW.h

   1 /* BytomPoW.h */
   2 #ifndef BYTOMPOW_H
   3 #define BYTOMPOW_H
   4
   5 #include "scrypt.h"
   6 #include "sha3-allInOne.h"
   7 #include <iostream>
   8 #include <vector>
   9 #include <time.h>
  10 #include <assert.h>
  11 #include <stdint.h>
  12 #include <x86intrin.h>
  13 #include <omp.h>
  14
  // FNV(v1, v2): one FNV-style mixing step, (v1 * FNV_PRIME) XOR v2, truncated
  // to 32 bits and returned as int32_t.
  // The arithmetic is carried out on uint32_t so the product wraps modulo 2^32
  // even when the arguments are signed, instead of overflowing a signed int
  // (which would be undefined behaviour). For uint32_t arguments the result is
  // unchanged.
  // NOTE: this is a macro; arguments must not contain top-level commas.
  #define FNV(v1,v2) int32_t( (uint32_t(v1)*uint32_t(FNV_PRIME)) ^ uint32_t(v2) )
  // Multiplier used by FNV() above (0x01000193 == 16777619).
  const int FNV_PRIME = 0x01000193;
  17
  18 struct Mat256x256i8 {
  19     int8_t d[256][256];
  20
  21     void toIdentityMatrix() {
  22         for(int i = 0; i < 256; i++) {
  23             for(int j = 0; j < 256; j++) {
  24                 d[i][j] = (i==j)?1:0; // diagonal
  25             }
  26         }
  27     }
  28
  29     void copyFrom(const Mat256x256i8& other) {
  30         for(int i = 0; i < 256; i++) {
  31             for(int j = 0; j < 256; j++) {
  32                 this->d[j][i] = other.d[j][i];
  33             }
  34         }
  35     }
  36
  37     Mat256x256i8() {
  38 //        this->toIdentityMatrix();
  39     }
  40
  41     Mat256x256i8(const Mat256x256i8& other) {
  42         this->copyFrom(other);
  43     }
  44
  45     void copyFrom_helper(LTCMemory& ltcMem, int offset) {
  46         for(int i = 0; i < 256; i++) {
  47             const Words32& lo=ltcMem.get(i*4 + offset);
  48             const Words32& hi=ltcMem.get(i*4 + 2 + offset);
  49             for(int j = 0; j < 64; j++) {
  50                 uint32_t i32 = j>=32?hi.get(j-32):lo.get(j);
  51                 d[j*4+0][i] = (i32>> 0) & 0xFF;
  52                 d[j*4+1][i] = (i32>> 8) & 0xFF;
  53                 d[j*4+2][i] = (i32>>16) & 0xFF;
  54                 d[j*4+3][i] = (i32>>24) & 0xFF;
  55             }
  56         }
  57     }
  58
  59     void copyFromEven(LTCMemory& ltcMem) {
  60         copyFrom_helper(ltcMem, 0);
  61     }
  62
  63     void copyFromOdd(LTCMemory& ltcMem) {
  64         copyFrom_helper(ltcMem, 1);
  65     }
  66
  67     void add(Mat256x256i8& a, Mat256x256i8& b) {
  68         for(int i = 0; i < 256; i++) {
  69             for(int j = 0; j < 256; j++) {
  70                 int tmp = int(a.d[i][j]) + int(b.d[i][j]);
  71                 this->d[i][j] = (tmp & 0xFF);
  72             }
  73         }
  74     }
  75 };
  76
  77 struct Mat256x256i16 {
  78     int16_t d[256][256];
  79
  80     void toIdentityMatrix() {
  81         for(int i = 0; i < 256; i++) {
  82             for(int j = 0; j < 256; j++) {
  83                 d[i][j] = (i==j?1:0); // diagonal
  84             }
  85         }
  86     }
  87
  88     void copyFrom(const Mat256x256i8& other) {
  89         for(int i = 0; i < 256; i++) {
  90             for(int j = 0; j < 256; j++) {
  91                 this->d[j][i] = int16_t(other.d[j][i]);
  92                 assert(this->d[j][i] == other.d[j][i]);
  93             }
  94         }
  95     }
  96
  97     void copyFrom(const Mat256x256i16& other) {
  98         for(int i = 0; i < 256; i++) {
  99             for(int j = 0; j < 256; j++) {
 100                 this->d[j][i] = other.d[j][i];
 101             }
 102         }
 103     }
 104
 105     Mat256x256i16() {
 106 //        this->toIdentityMatrix();
 107     }
 108
 109     Mat256x256i16(const Mat256x256i16& other) {
 110         this->copyFrom(other);
 111     }
 112
 113     void copyFrom_helper(LTCMemory& ltcMem, int offset) {
 114         for(int i = 0; i < 256; i++) {
 115             const Words32& lo = ltcMem.get(i*4 + offset);
 116             const Words32& hi = ltcMem.get(i*4 + 2 + offset);
 117             for(int j = 0; j < 64; j++) {
 118                 uint32_t i32 = j>=32?hi.get(j-32):lo.get(j);
 119                 d[j*4+0][i] = int8_t((i32>> 0) & 0xFF);
 120                 d[j*4+1][i] = int8_t((i32>> 8) & 0xFF);
 121                 d[j*4+2][i] = int8_t((i32>>16) & 0xFF);
 122                 d[j*4+3][i] = int8_t((i32>>24) & 0xFF);
 123             }
 124         }
 125     }
 126
 127     void copyFromEven(LTCMemory& ltcMem) {
 128         copyFrom_helper(ltcMem, 0);
 129     }
 130
 131     void copyFromOdd(LTCMemory& ltcMem) {
 132         copyFrom_helper(ltcMem, 1);
 133     }
 134
 135     void mul(const Mat256x256i16& a, const Mat256x256i16& b) {
 136         for(int i = 0; i < 256; i += 16) {
 137             for(int j = 0; j < 256; j += 16) {
 138                 for(int ii = i; ii < i+16; ii += 8) {
 139                     __m256i r[8],s,t[8],u[8],m[8];
 140                     r[0] = _mm256_set1_epi16(0);
 141                     r[1] = _mm256_set1_epi16(0);
 142                     r[2] = _mm256_set1_epi16(0);
 143                     r[3] = _mm256_set1_epi16(0);
 144                     r[4] = _mm256_set1_epi16(0);
 145                     r[5] = _mm256_set1_epi16(0);
 146                     r[6] = _mm256_set1_epi16(0);
 147                     r[7] = _mm256_set1_epi16(0);
 148                     for(int k = 0; k < 256; k++) {
 149                         s = *((__m256i*)(&(b.d[k][j])));
 150                         u[0] = _mm256_set1_epi16(a.d[ii+0][k]);
 151                         u[1] = _mm256_set1_epi16(a.d[ii+1][k]);
 152                         u[2] = _mm256_set1_epi16(a.d[ii+2][k]);
 153                         u[3] = _mm256_set1_epi16(a.d[ii+3][k]);
 154                         u[4] = _mm256_set1_epi16(a.d[ii+4][k]);
 155                         u[5] = _mm256_set1_epi16(a.d[ii+5][k]);
 156                         u[6] = _mm256_set1_epi16(a.d[ii+6][k]);
 157                         u[7] = _mm256_set1_epi16(a.d[ii+7][k]);
 158                         m[0] = _mm256_mullo_epi16(u[0],s);
 159                         m[1] = _mm256_mullo_epi16(u[1],s);
 160                         m[2] = _mm256_mullo_epi16(u[2],s);
 161                         m[3] = _mm256_mullo_epi16(u[3],s);
 162                         m[4] = _mm256_mullo_epi16(u[4],s);
 163                         m[5] = _mm256_mullo_epi16(u[5],s);
 164                         m[6] = _mm256_mullo_epi16(u[6],s);
 165                         m[7] = _mm256_mullo_epi16(u[7],s);
 166                         r[0] = _mm256_add_epi16(r[0],m[0]);
 167                         r[1] = _mm256_add_epi16(r[1],m[1]);
 168                         r[2] = _mm256_add_epi16(r[2],m[2]);
 169                         r[3] = _mm256_add_epi16(r[3],m[3]);
 170                         r[4] = _mm256_add_epi16(r[4],m[4]);
 171                         r[5] = _mm256_add_epi16(r[5],m[5]);
 172                         r[6] = _mm256_add_epi16(r[6],m[6]);
 173                         r[7] = _mm256_add_epi16(r[7],m[7]);
 174                     }
 175                     t[0] = _mm256_slli_epi16(r[0],8);
 176                     t[1] = _mm256_slli_epi16(r[1],8);
 177                     t[2] = _mm256_slli_epi16(r[2],8);
 178                     t[3] = _mm256_slli_epi16(r[3],8);
 179                     t[4] = _mm256_slli_epi16(r[4],8);
 180                     t[5] = _mm256_slli_epi16(r[5],8);
 181                     t[6] = _mm256_slli_epi16(r[6],8);
 182                     t[7] = _mm256_slli_epi16(r[7],8);
 183                     t[0] = _mm256_add_epi16(r[0],t[0]);
 184                     t[1] = _mm256_add_epi16(r[1],t[1]);
 185                     t[2] = _mm256_add_epi16(r[2],t[2]);
 186                     t[3] = _mm256_add_epi16(r[3],t[3]);
 187                     t[4] = _mm256_add_epi16(r[4],t[4]);
 188                     t[5] = _mm256_add_epi16(r[5],t[5]);
 189                     t[6] = _mm256_add_epi16(r[6],t[6]);
 190                     t[7] = _mm256_add_epi16(r[7],t[7]);
 191                     for(int x = 0; x < 8; x++) {
 192                         this->d[ii+x][j+0 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*0 +1)));
 193                         this->d[ii+x][j+1 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*1 +1)));
 194                         this->d[ii+x][j+2 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*2 +1)));
 195                         this->d[ii+x][j+3 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*3 +1)));
 196                         this->d[ii+x][j+4 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*4 +1)));
 197                         this->d[ii+x][j+5 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*5 +1)));
 198                         this->d[ii+x][j+6 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*6 +1)));
 199                         this->d[ii+x][j+7 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*7 +1)));
 200                         this->d[ii+x][j+8 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*8 +1)));
 201                         this->d[ii+x][j+9 ] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*9 +1)));
 202                         this->d[ii+x][j+10] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*10+1)));
 203                         this->d[ii+x][j+11] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*11+1)));
 204                         this->d[ii+x][j+12] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*12+1)));
 205                         this->d[ii+x][j+13] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*13+1)));
 206                         this->d[ii+x][j+14] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*14+1)));
 207                         this->d[ii+x][j+15] = int16_t(int8_t(_mm256_extract_epi8(t[x],2*15+1)));
 208                     }
 209                 }
 210             }
 211         }
 212     }
 213
 214     void add(Mat256x256i16& a, Mat256x256i16& b) {
 215         for(int i = 0; i < 256; i++) {
 216             for(int j = 0; j < 256; j++) {
 217                 int tmp = int(a.d[i][j]) + int(b.d[i][j]);
 218                 this->d[i][j] = (tmp & 0xFF);
 219             }
 220         }
 221     }
 222
 223     void toMatI8(Mat256x256i8& other) {
 224         for(int i = 0; i < 256; i++) {
 225             for(int j = 0; j < 256; j++) {
 226                 other.d[j][i] = (this->d[j][i]) & 0xFF;
 227             }
 228         }
 229     }
 230
 231     void topup(Mat256x256i8& other) {
 232         for(int i = 0; i < 256; i++) {
 233             for(int j = 0; j < 256; j++) {
 234                 other.d[j][i] += (this->d[j][i]) & 0xFF;
 235             }
 236         }
 237     }
 238 };
 239
 240
 241 struct Arr256x64i32 {
 242     uint32_t d[256][64];
 243
 244     uint8_t* d0RawPtr() {
 245         return (uint8_t*)(d[0]);
 246     }
 247
 248     Arr256x64i32(const Mat256x256i8& mat) {
 249         for(int j = 0; j < 256; j++) {
 250             for(int i = 0; i < 64; i++) {
 251                 d[j][i] = ((uint32_t(uint8_t(mat.d[j][i + 192]))) << 24) |
 252                           ((uint32_t(uint8_t(mat.d[j][i + 128]))) << 16) |
 253                           ((uint32_t(uint8_t(mat.d[j][i +  64]))) <<  8) |
 254                           ((uint32_t(uint8_t(mat.d[j][i]))) << 0);
 255             }
 256         }
 257     }
 258
 259     void reduceFNV() {
 260         for(int k = 256; k > 1; k = k/2) {
 261             for(int j = 0; j < k/2; j++) {
 262                 for(int i = 0; i < 64; i++) {
 263                     d[j][i] = FNV(d[j][i], d[j + k/2][i]);
 264                 }
 265             }
 266         }
 267     }
 268 };
 269
 270 // struct BytomMatList8 {
 271 //     std::vector<Mat256x256i8*> matVec;
 272
 273 //     Mat256x256i8 at(int i) {
 274 //         return *(matVec[i]);
 275 //     }
 276
 277 //     BytomMatList8() {
 278 //         for(int i=0; i<256; i++) {
 279 //             Mat256x256i8* ptr = new Mat256x256i8;
 280 //             assert(ptr!=NULL);
 281 //             matVec.push_back(ptr);
 282 //         }
 283 //     }
 284
 285 //     ~BytomMatList8() {
 286 //         for(int i=0; i<256; i++) {
 287 //             delete matVec[i];
 288 //         }
 289 //     }
 290
 291 //     void init(const Words32& X_in) {
 292 //         Words32 X = X_in;
 293 //         LTCMemory ltcMem;
 294 //         for(int i=0; i<128; i++) {
 295 //             ltcMem.scrypt(X);
 296 //             matVec[2*i]->copyFromEven(ltcMem);
 297 //             matVec[2*i+1]->copyFromOdd(ltcMem);
 298 //         }
 299 //     }
 300 // };
 301
 302 struct BytomMatList16 {
 303     std::vector<Mat256x256i16*> matVec;
 304
 305     Mat256x256i16 at(int i) {
 306         return *(matVec[i]);
 307     }
 308
 309     BytomMatList16() {
 310         for(int i = 0; i < 256; i++) {
 311             Mat256x256i16* ptr = new Mat256x256i16;
 312             assert(ptr != NULL);
 313             matVec.push_back(ptr);
 314         }
 315     }
 316
 317     ~BytomMatList16() {
 318         for(int i = 0; i < 256; i++)
 319             delete matVec[i];
 320     }
 321
 322     void init(const Words32& X_in) {
 323         Words32 X = X_in;
 324         LTCMemory ltcMem;
 325         for(int i = 0; i < 128; i++) {
 326             ltcMem.scrypt(X);
 327             matVec[2*i]->copyFromEven(ltcMem);
 328             matVec[2*i + 1]->copyFromOdd(ltcMem);
 329         }
 330     }
 331
 332     // void copyFrom(BytomMatList8& other) {
 333     //     for(int i=0; i<256; i++) {
 334     //         matVec[i]->copyFrom(*other.matVec[i]);
 335     //     }
 336     // }
 337
 338     // void copyFrom(BytomMatList16& other) {
 339     //     for(int i=0; i<256; i++) {
 340     //         matVec[i]->copyFrom(*other.matVec[i]);
 341     //     }
 342     // }
 343 };
 344
 345 // extern BytomMatList8* matList_int8;
 346 extern BytomMatList16* matList_int16;
 347
 348 inline void iter_mineBytom(const uint8_t *fixedMessage,
 349                             uint32_t len,
 350                             // uint8_t nonce[8],
 351                             uint8_t result[32]) {
 352     Mat256x256i8 *resArr8 = new Mat256x256i8[4];
 353
 354     clock_t start, end;
 355     start = clock();
 356     // Itz faster using single thread ...
 357     #pragma omp parallel for simd
 358     for(int k = 0; k < 4; k++) { // The k-loop
 359         sha3_ctx *ctx = new sha3_ctx;
 360         Mat256x256i16 *mat16 = new Mat256x256i16;
 361         Mat256x256i16 *tmp16 = new Mat256x256i16;
 362         uint8_t sequence[32];
 363         rhash_sha3_256_init(ctx);
 364         rhash_sha3_update(ctx, fixedMessage + (len*k/4), len/4);//分四轮消耗掉fixedMessage
 365         rhash_sha3_final(ctx, sequence);
 366         tmp16->toIdentityMatrix();
 367
 368         for(int j = 0; j < 2; j++) {
 369             // equivalent as tmp=tmp*matlist, i+=1
 370             for(int i = 0; i < 32; i += 2) {
 371                 // "mc = ma dot mb.T" in GoLang code
 372                 mat16->mul(*tmp16, matList_int16->at(sequence[i]));
 373                 // "ma = mc" in GoLang code
 374                 tmp16->mul(*mat16, matList_int16->at(sequence[i+1]));
 375             }
 376         }
 377         // "res[k] = mc" in GoLang code
 378         tmp16->toMatI8(resArr8[k]); // 0.00018s
 379         delete mat16;
 380         delete tmp16;
 381         delete ctx;
 382     }
 383
 384     // 3.7e-05s
 385     Mat256x256i8 *res8 = new Mat256x256i8;
 386     res8->add(resArr8[0], resArr8[1]);
 387     res8->add(*res8, resArr8[2]);
 388     res8->add(*res8, resArr8[3]);
 389
 390     end = clock();
 391     // std::cout << "\tTime for getting MulMatix: "
 392     //           << (double)(end - start) / CLOCKS_PER_SEC * 1000 << "ms"
 393     //           << std::endl;
 394
 395     Arr256x64i32 arr(*res8);
 396     arr.reduceFNV();
 397     sha3_ctx *ctx = new sha3_ctx;
 398     rhash_sha3_256_init(ctx);
 399     rhash_sha3_update(ctx, arr.d0RawPtr(), 256);
 400     rhash_sha3_final(ctx, result);
 401
 402     delete res8;
 403     delete[] resArr8;
 404     delete ctx;
 405 }
 406
 // Increments the 8-byte nonce in place, treating nonce[0] as the least
 // significant byte. Bytes equal to 255 roll over to 0 and the carry moves to
 // the next byte; a nonce of all 255s wraps around to all zeros.
 inline void incrNonce(uint8_t nonce[8]) {
     int pos = 0;
     // Clear every leading 0xFF byte: each of them overflows and carries.
     while(pos < 8 && nonce[pos] == 255) {
         nonce[pos] = 0;
         ++pos;
     }
     // The first byte that is not 0xFF absorbs the carry (if there is one).
     if(pos < 8) {
         ++nonce[pos];
     }
 }
 417
 // Counts the leading zero bits of the 256-bit value in result[32], where
 // result[31] is the most significant byte (the array is scanned in reverse).
 // Returns a value in 0..256; 256 means every byte is zero.
 //
 // Counting stops at the first non-zero byte. This includes bytes whose top
 // bit is set (>= 128): they contribute zero bits and must still end the
 // scan, otherwise zero bytes further down would be counted as "leading".
 inline int countLeadingZero(uint8_t result[32]) {
     int count = 0;
     for(int i = 31; i >= 0; i--) { // NOTE: reverse
         uint8_t byte = result[i];
         if(byte == 0) {
             count += 8; // whole byte is zero, keep scanning
             continue;
         }
         // Count the zero bits above the highest set bit of this byte.
         while((byte & 0x80) == 0) {
             count++;
             byte = uint8_t(byte << 1);
         }
         break;
     }
     return count;
 }
 448
 449 // inline int test_mineBytom(
 450 //     const uint8_t *fixedMessage,
 451 //     uint32_t len,
 452 //     uint8_t nonce[32],
 453 //     int count,
 454 //     int leadingZeroThres)
 455 // {
 456 //   assert(len%4==0);
 457 //   int step;
 458 //   for(step=0; step<count; step++) {
 459 //     uint8_t result[32];
 460 //     //std::cerr<<"Mine step "<<step<<std::endl;
 461 //     iter_mineBytom(fixedMessage,100,nonce,result);
 462 //     std::cerr<<"Mine step "<<step<<std::endl;
 463 //     for (int i = 0; i < 32; i++) {
 464 //       printf("%02x ", result[i]);
 465 //       if (i % 8 == 7)
 466 //         printf("\n");
 467 //     }
 468 //     if (countLeadingZero(result) > leadingZeroThres)
 469 //       return step;
 470 //     incrNonce(nonce);
 471 //   }
 472 //   return step;
 473 // }
 474
 475
 476 #endif
 477