int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { const uint32_t Htarg = ptarget[7]; int Nfactor = GetNfactor(bswap_32x4(pdata[17])); if (Nfactor > scrypt_maxN) { scrypt_fatal_error("scrypt: N out of range"); } N = (1 << (Nfactor + 1)); parallel = 0; int throughput = cuda_throughput(thr_id); uint32_t *data[2] = { new uint32_t[20*throughput], new uint32_t[20*throughput] }; uint32_t *hash = new uint32_t[8*throughput]; uint32_t n = pdata[19] - 1; // int i; #if !defined(SCRYPT_TEST) static int power_on_self_test = 0; if (!power_on_self_test) { power_on_self_test = 1; if (!scrypt_power_on_self_test()) scrypt_fatal_error("scrypt: power on self test failed"); } #endif /* byte swap pdata into data[0]/[1] arrays */ for (int k=0; k<2; ++k) { for(int z=0;z<20;z++) data[k][z] = bswap_32x4(pdata[z]); for(int i=1;i<throughput;++i) memcpy(&data[k][20*i], &data[k][0], 20*sizeof(uint32_t)); } scrypt_aligned_alloc Xbuf[2] = { scrypt_alloc(128 * throughput), scrypt_alloc(128 * throughput) }; scrypt_aligned_alloc Vbuf = scrypt_alloc((uint64_t)N * 128); scrypt_aligned_alloc Ybuf = scrypt_alloc(128); uint32_t nonce[2]; uint32_t* cuda_X[2] = { cuda_transferbuffer(thr_id,0), cuda_transferbuffer(thr_id,1) }; #if !defined(SCRYPT_CHOOSE_COMPILETIME) scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); #endif int cur = 0, nxt = 1; nonce[cur] = n+1; for(int i=0;i<throughput;++i) { uint32_t tmp_nonce = ++n; data[cur][20*i + 19] = bswap_32x4(tmp_nonce); } /* 1: X = PBKDF2(password, salt) */ for(int i=0;i<throughput;++i) scrypt_pbkdf2_1((unsigned char *)&data[cur][20*i], 80, (unsigned char *)&data[cur][20*i], 80, Xbuf[cur].ptr + 128 * i, 128); /* 2: X = ROMix(X) in CUDA */ memcpy(cuda_X[cur], Xbuf[cur].ptr, 128 * throughput); cuda_scrypt_HtoD(thr_id, cuda_X[cur], cur); cuda_scrypt_serialize(thr_id, cur); cuda_scrypt_core(thr_id, cur, N); cuda_scrypt_done(thr_id, cur); cuda_scrypt_DtoH(thr_id, cuda_X[cur], cur); cuda_scrypt_flush(thr_id, cur); do { nonce[nxt] = n+1; for(int i=0;i<throughput;++i) { uint32_t tmp_nonce = ++n; data[nxt][20*i + 19] = bswap_32x4(tmp_nonce); } /* 1: X = PBKDF2(password, salt) */ for(int i=0;i<throughput;++i) scrypt_pbkdf2_1((unsigned char *)&data[nxt][20*i], 80, (unsigned char *)&data[nxt][20*i], 80, Xbuf[nxt].ptr + 128 * i, 128); /* 2: X = ROMix(X) in CUDA */ memcpy(cuda_X[nxt], Xbuf[nxt].ptr, 128 * throughput); cuda_scrypt_HtoD(thr_id, cuda_X[nxt], nxt); cuda_scrypt_serialize(thr_id, nxt); cuda_scrypt_core(thr_id, nxt, N); cuda_scrypt_done(thr_id, nxt); cuda_scrypt_DtoH(thr_id, cuda_X[nxt], nxt); cuda_scrypt_flush(thr_id, nxt); cuda_scrypt_sync(thr_id, cur); #define VERIFY_ALL 0 #if VERIFY_ALL { /* 2: X = ROMix(X) */ for(int i=0;i<throughput;++i) scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)Ybuf.ptr, (scrypt_mix_word_t *)Vbuf.ptr, N); unsigned int err = 0; for(int i=0;i<throughput;++i) { unsigned char *ref = (Xbuf[cur].ptr + 128 * i); unsigned char *dat = (unsigned char*)(cuda_X[cur] + 32 * i); if (memcmp(ref, dat, 128) != 0) { err++; #if 0 uint32_t *ref32 = (uint32_t*) ref; uint32_t *dat32 = (uint32_t*) dat; for (int j=0; j<32; ++j) { if (ref32[j] != dat32[j]) fprintf(stderr, "ref32[i=%d][j=%d] = $%08x / $%08x\n", i, j, ref32[j], dat32[j]); } #endif } } if (err > 0) fprintf(stderr, "%d out of %d hashes differ.\n", err, throughput); } #endif memcpy(Xbuf[cur].ptr, cuda_X[cur], 128 * throughput); /* 3: Out = PBKDF2(password, X) */ for(int i=0;i<throughput;++i) scrypt_pbkdf2_1((unsigned char *)&data[cur][20*i], 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)&hash[8*i], 32); for(int i=0;i<throughput;++i) { volatile unsigned char *hashc = (unsigned char *)&hash[8*i]; if (hash[8*i+7] <= Htarg && fulltest(&hash[8*i], ptarget)) { uint32_t tmp_nonce = nonce[cur]+i; uint32_t thash[8], tdata[20]; for(int z=0;z<20;z++) tdata[z] = bswap_32x4(pdata[z]); tdata[19] = bswap_32x4(tmp_nonce); scrypt_pbkdf2_1((unsigned char *)tdata, 80, (unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128); scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)(Ybuf.ptr), (scrypt_mix_word_t *)(Vbuf.ptr), N); scrypt_pbkdf2_1((unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)thash, 32); if (memcmp(thash, &hash[8*i], 32) == 0) { *hashes_done = (n-throughput) - pdata[19] + 1; pdata[19] = tmp_nonce; scrypt_free(&Vbuf); scrypt_free(&Ybuf); scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]); delete[] data[0]; delete[] data[1]; delete[] hash; return 1; } else { applog(LOG_INFO, "GPU #%d: %s result does not validate on CPU (i=%d, s=%d)!", device_map[thr_id], device_name[thr_id], i, cur); } } } cur = (cur+1)&1; nxt = (nxt+1)&1; } while ((n-throughput) < max_nonce && !work_restart[thr_id].restart); scrypt_free(&Vbuf); scrypt_free(&Ybuf); scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]); delete[] data[0]; delete[] data[1]; delete[] hash; *hashes_done = (n-throughput) - pdata[19] + 1; pdata[19] = (n-throughput); return 0; }
int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, struct timeval *tv_start, struct timeval *tv_end, unsigned long *hashes_done) { const uint32_t Htarg = ptarget[7]; static int s_Nfactor = 0; if (s_Nfactor == 0 && strlen(jane_params) > 0) applog(LOG_INFO, "Given scrypt-jane parameters: %s", jane_params); int Nfactor = GetNfactor(bswap_32x4(pdata[17])); if (Nfactor > scrypt_maxN) { scrypt_fatal_error("scrypt: N out of range"); } if (Nfactor != s_Nfactor) { // all of this isn't very thread-safe... N = (1 << (Nfactor + 1)); applog(LOG_INFO, "Nfactor is %d (N=%d)!", Nfactor, N); if (s_Nfactor != 0) { // handle N-factor increase at runtime // by adjusting the lookup_gap by factor 2 if (s_Nfactor == Nfactor-1) for (int i=0; i < 8; ++i) device_lookup_gap[i] *= 2; } s_Nfactor = Nfactor; } int throughput = cuda_throughput(thr_id); if(throughput == 0) return -1; gettimeofday(tv_start, NULL); uint32_t *data[2] = { new uint32_t[20*throughput], new uint32_t[20*throughput] }; uint32_t* hash[2] = { cuda_hashbuffer(thr_id,0), cuda_hashbuffer(thr_id,1) }; uint32_t n = pdata[19]; /* byte swap pdata into data[0]/[1] arrays */ for (int k=0; k<2; ++k) { for(int z=0;z<20;z++) data[k][z] = bswap_32x4(pdata[z]); for(int i=1;i<throughput;++i) memcpy(&data[k][20*i], &data[k][0], 20*sizeof(uint32_t)); } if (parallel == 2) prepare_keccak512(thr_id, pdata); scrypt_aligned_alloc Xbuf[2] = { scrypt_alloc(128 * throughput), scrypt_alloc(128 * throughput) }; scrypt_aligned_alloc Vbuf = scrypt_alloc((uint64_t)N * 128); scrypt_aligned_alloc Ybuf = scrypt_alloc(128); uint32_t nonce[2]; uint32_t* cuda_X[2] = { cuda_transferbuffer(thr_id,0), cuda_transferbuffer(thr_id,1) }; #if !defined(SCRYPT_CHOOSE_COMPILETIME) scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); #endif int cur = 0, nxt = 1; int iteration = 0; do { nonce[nxt] = n; if (parallel < 2) { for(int i=0;i<throughput;++i) { uint32_t tmp_nonce = n++; data[nxt][20*i + 19] = bswap_32x4(tmp_nonce); } for(int i=0;i<throughput;++i) scrypt_pbkdf2_1((unsigned char *)&data[nxt][20*i], 80, (unsigned char *)&data[nxt][20*i], 80, Xbuf[nxt].ptr + 128 * i, 128); memcpy(cuda_X[nxt], Xbuf[nxt].ptr, 128 * throughput); cuda_scrypt_serialize(thr_id, nxt); cuda_scrypt_HtoD(thr_id, cuda_X[nxt], nxt); cuda_scrypt_core(thr_id, nxt, N); cuda_scrypt_done(thr_id, nxt); cuda_scrypt_DtoH(thr_id, cuda_X[nxt], nxt, false); cuda_scrypt_flush(thr_id, nxt); if(!cuda_scrypt_sync(thr_id, cur)) { return -1; } memcpy(Xbuf[cur].ptr, cuda_X[cur], 128 * throughput); for(int i=0;i<throughput;++i) scrypt_pbkdf2_1((unsigned char *)&data[cur][20*i], 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)(&hash[cur][8*i]), 32); #define VERIFY_ALL 0 #if VERIFY_ALL { /* 2: X = ROMix(X) */ for(int i=0;i<throughput;++i) scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)Ybuf.ptr, (scrypt_mix_word_t *)Vbuf.ptr, N); unsigned int err = 0; for(int i=0;i<throughput;++i) { unsigned char *ref = (Xbuf[cur].ptr + 128 * i); unsigned char *dat = (unsigned char*)(cuda_X[cur] + 32 * i); if (memcmp(ref, dat, 128) != 0) { err++; #if 0 uint32_t *ref32 = (uint32_t*) ref; uint32_t *dat32 = (uint32_t*) dat; for (int j=0; j<32; ++j) { if (ref32[j] != dat32[j]) fprintf(stderr, "ref32[i=%d][j=%d] = $%08x / $%08x\n", i, j, ref32[j], dat32[j]); } #endif } } if (err > 0) fprintf(stderr, "%d out of %d hashes differ.\n", err, throughput); } #endif } else { n += throughput; cuda_scrypt_serialize(thr_id, nxt); pre_keccak512(thr_id, nxt, nonce[nxt], throughput); cuda_scrypt_core(thr_id, nxt, N); cuda_scrypt_flush(thr_id, nxt); post_keccak512(thr_id, nxt, nonce[nxt], throughput); cuda_scrypt_done(thr_id, nxt); cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true); if(!cuda_scrypt_sync(thr_id, cur)) { return -1; } } if(iteration > 0) { for(int i=0;i<throughput;++i) { volatile unsigned char *hashc = (unsigned char *)(&hash[cur][8*i]); if (hash[cur][8*i+7] <= Htarg && fulltest(&hash[cur][8*i], ptarget)) { uint32_t tmp_nonce = nonce[cur]+i; uint32_t thash[8], tdata[20]; for(int z=0;z<20;z++) tdata[z] = bswap_32x4(pdata[z]); tdata[19] = bswap_32x4(tmp_nonce); scrypt_pbkdf2_1((unsigned char *)tdata, 80, (unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128); scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)(Ybuf.ptr), (scrypt_mix_word_t *)(Vbuf.ptr), N); scrypt_pbkdf2_1((unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)thash, 32); if (memcmp(thash, &hash[cur][8*i], 32) == 0) { //applog(LOG_INFO, "GPU #%d: %s result validates on CPU.", device_map[thr_id], device_name[thr_id]); *hashes_done = n - pdata[19]; pdata[19] = tmp_nonce; scrypt_free(&Vbuf); scrypt_free(&Ybuf); scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]); delete[] data[0]; delete[] data[1]; gettimeofday(tv_end, NULL); return 1; } else { applog(LOG_INFO, "GPU #%d: %s result does not validate on CPU (i=%d, s=%d)!", device_map[thr_id], device_name[thr_id], i, cur); } } } } cur = (cur+1)&1; nxt = (nxt+1)&1; ++iteration; } while (n <= max_nonce && !work_restart[thr_id].restart); scrypt_free(&Vbuf); scrypt_free(&Ybuf); scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]); delete[] data[0]; delete[] data[1]; *hashes_done = n - pdata[19]; pdata[19] = n; gettimeofday(tv_end, NULL); return 0; }
int scanhash_keccak(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, struct timeval *tv_start, struct timeval *tv_end, unsigned long *hashes_done) { int throughput = cuda_throughput(thr_id); gettimeofday(tv_start, NULL); uint32_t n = pdata[19] - 1; // TESTING ONLY // ((uint32_t*)ptarget)[7] = 0x0000000f; const uint32_t Htarg = ptarget[7]; uint32_t endiandata[20]; for (int kk=0; kk < 20; kk++) be32enc(&endiandata[kk], pdata[kk]); cuda_prepare_keccak256(thr_id, endiandata, ptarget); uint32_t *cuda_hash64[2] = { (uint32_t *)cuda_hashbuffer(thr_id, 0), (uint32_t *)cuda_hashbuffer(thr_id, 1) }; memset(cuda_hash64[0], 0xff, throughput * 8 * sizeof(uint32_t)); memset(cuda_hash64[1], 0xff, throughput * 8 * sizeof(uint32_t)); bool validate = false; uint32_t nonce[2]; int cur = 0, nxt = 1; // begin work on first CUDA stream nonce[cur] = n+1; n += throughput; cuda_do_keccak256(thr_id, 0, cuda_hash64[cur], nonce[cur], throughput, validate); do { nonce[nxt] = n+1; n += throughput; if ((n-throughput) < max_nonce && !work_restart[thr_id].restart) { // begin work on next CUDA stream cuda_do_keccak256(thr_id, 0, cuda_hash64[nxt], nonce[nxt], throughput, validate); } // synchronize current stream and get the "winning" nonce index, if any cuda_scrypt_sync(thr_id, cur); uint32_t result = *cuda_hash64[cur]; // optional full CPU based validation (see validate flag) if (validate) { for (int i=0; i < throughput; ++i) { uint32_t hash64[8]; be32enc(&endiandata[19], nonce[cur]+i); crypto_hash( (unsigned char*)hash64, (unsigned char*)&endiandata[0], 80 ); if (memcmp(hash64, &cuda_hash64[8*i], 32)) fprintf(stderr, "CPU and CUDA hashes (i=%d) differ!\n", i); } } else if (result != 0xffffffff && result > pdata[19]) { uint32_t hash64[8]; be32enc(&endiandata[19], result); crypto_hash( (unsigned char*)hash64, (unsigned char*)&endiandata[0], 80 ); if (result >= nonce[cur] && result < nonce[cur]+throughput && hash64[7] <= Htarg && fulltest(hash64, ptarget)) { *hashes_done = n-throughput - pdata[19] + 1; pdata[19] = result; gettimeofday(tv_end, NULL); return true; } else { applog(LOG_INFO, "GPU #%d: %s result for nonce $%08x does not validate on CPU!", device_map[thr_id], device_name[thr_id], result); } } cur = (cur + 1) % 2; nxt = (nxt + 1) % 2; } while ((n-throughput) < max_nonce && !work_restart[thr_id].restart); *hashes_done = n-throughput - pdata[19] + 1; if (n-throughput > pdata[19]) // CB: don't report values bigger max_nonce pdata[19] = max_nonce > n-throughput ? n-throughput : max_nonce; else pdata[19] = 0xffffffffU; // CB: prevent nonce space overflow. gettimeofday(tv_end, NULL); return 0; }