void scrypt_N_1_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint32_t N, uint8_t *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V) { uint32_t chunk_bytes, i; const uint32_t r = SCRYPT_R; const uint32_t p = SCRYPT_P; #if !defined(SCRYPT_CHOOSE_COMPILETIME) scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); #endif chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; /* 1: X = PBKDF2(password, salt) */ scrypt_pbkdf2_1(password, password_len, salt, salt_len, X, chunk_bytes * p); /* 2: X = ROMix(X) */ for (i = 0; i < p; i++) scrypt_ROMix_1((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V, N); /* 3: Out = PBKDF2(password, X) */ scrypt_pbkdf2_1(password, password_len, X, chunk_bytes * p, out, bytes); #ifdef SCRYPT_PREVENT_STATE_LEAK /* This is an unnecessary security feature - mikaelh */ scrypt_ensure_zero(Y, (p + 1) * chunk_bytes); #endif }
int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, struct timeval *tv_start, struct timeval *tv_end, unsigned long *hashes_done) { const uint32_t Htarg = ptarget[7]; static int s_Nfactor = 0; if (s_Nfactor == 0 && strlen(jane_params) > 0) applog(LOG_INFO, "Given scrypt-jane parameters: %s", jane_params); int Nfactor = GetNfactor(bswap_32x4(pdata[17])); if (Nfactor > scrypt_maxN) { scrypt_fatal_error("scrypt: N out of range"); } if (Nfactor != s_Nfactor) { // all of this isn't very thread-safe... N = (1 << (Nfactor + 1)); applog(LOG_INFO, "Nfactor is %d (N=%d)!", Nfactor, N); if (s_Nfactor != 0) { // handle N-factor increase at runtime // by adjusting the lookup_gap by factor 2 if (s_Nfactor == Nfactor-1) for (int i=0; i < 8; ++i) device_lookup_gap[i] *= 2; } s_Nfactor = Nfactor; } int throughput = cuda_throughput(thr_id); if(throughput == 0) return -1; gettimeofday(tv_start, NULL); uint32_t *data[2] = { new uint32_t[20*throughput], new uint32_t[20*throughput] }; uint32_t* hash[2] = { cuda_hashbuffer(thr_id,0), cuda_hashbuffer(thr_id,1) }; uint32_t n = pdata[19]; /* byte swap pdata into data[0]/[1] arrays */ for (int k=0; k<2; ++k) { for(int z=0;z<20;z++) data[k][z] = bswap_32x4(pdata[z]); for(int i=1;i<throughput;++i) memcpy(&data[k][20*i], &data[k][0], 20*sizeof(uint32_t)); } if (parallel == 2) prepare_keccak512(thr_id, pdata); scrypt_aligned_alloc Xbuf[2] = { scrypt_alloc(128 * throughput), scrypt_alloc(128 * throughput) }; scrypt_aligned_alloc Vbuf = scrypt_alloc((uint64_t)N * 128); scrypt_aligned_alloc Ybuf = scrypt_alloc(128); uint32_t nonce[2]; uint32_t* cuda_X[2] = { cuda_transferbuffer(thr_id,0), cuda_transferbuffer(thr_id,1) }; #if !defined(SCRYPT_CHOOSE_COMPILETIME) scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); #endif int cur = 0, nxt = 1; int iteration = 0; do { nonce[nxt] = n; if (parallel < 2) { for(int i=0;i<throughput;++i) { uint32_t tmp_nonce = n++; data[nxt][20*i + 19] = bswap_32x4(tmp_nonce); } 
for(int i=0;i<throughput;++i) scrypt_pbkdf2_1((unsigned char *)&data[nxt][20*i], 80, (unsigned char *)&data[nxt][20*i], 80, Xbuf[nxt].ptr + 128 * i, 128); memcpy(cuda_X[nxt], Xbuf[nxt].ptr, 128 * throughput); cuda_scrypt_serialize(thr_id, nxt); cuda_scrypt_HtoD(thr_id, cuda_X[nxt], nxt); cuda_scrypt_core(thr_id, nxt, N); cuda_scrypt_done(thr_id, nxt); cuda_scrypt_DtoH(thr_id, cuda_X[nxt], nxt, false); cuda_scrypt_flush(thr_id, nxt); if(!cuda_scrypt_sync(thr_id, cur)) { return -1; } memcpy(Xbuf[cur].ptr, cuda_X[cur], 128 * throughput); for(int i=0;i<throughput;++i) scrypt_pbkdf2_1((unsigned char *)&data[cur][20*i], 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)(&hash[cur][8*i]), 32); #define VERIFY_ALL 0 #if VERIFY_ALL { /* 2: X = ROMix(X) */ for(int i=0;i<throughput;++i) scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)Ybuf.ptr, (scrypt_mix_word_t *)Vbuf.ptr, N); unsigned int err = 0; for(int i=0;i<throughput;++i) { unsigned char *ref = (Xbuf[cur].ptr + 128 * i); unsigned char *dat = (unsigned char*)(cuda_X[cur] + 32 * i); if (memcmp(ref, dat, 128) != 0) { err++; #if 0 uint32_t *ref32 = (uint32_t*) ref; uint32_t *dat32 = (uint32_t*) dat; for (int j=0; j<32; ++j) { if (ref32[j] != dat32[j]) fprintf(stderr, "ref32[i=%d][j=%d] = $%08x / $%08x\n", i, j, ref32[j], dat32[j]); } #endif } } if (err > 0) fprintf(stderr, "%d out of %d hashes differ.\n", err, throughput); } #endif } else { n += throughput; cuda_scrypt_serialize(thr_id, nxt); pre_keccak512(thr_id, nxt, nonce[nxt], throughput); cuda_scrypt_core(thr_id, nxt, N); cuda_scrypt_flush(thr_id, nxt); post_keccak512(thr_id, nxt, nonce[nxt], throughput); cuda_scrypt_done(thr_id, nxt); cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true); if(!cuda_scrypt_sync(thr_id, cur)) { return -1; } } if(iteration > 0) { for(int i=0;i<throughput;++i) { volatile unsigned char *hashc = (unsigned char *)(&hash[cur][8*i]); if (hash[cur][8*i+7] <= Htarg && fulltest(&hash[cur][8*i], ptarget)) { uint32_t 
tmp_nonce = nonce[cur]+i; uint32_t thash[8], tdata[20]; for(int z=0;z<20;z++) tdata[z] = bswap_32x4(pdata[z]); tdata[19] = bswap_32x4(tmp_nonce); scrypt_pbkdf2_1((unsigned char *)tdata, 80, (unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128); scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)(Ybuf.ptr), (scrypt_mix_word_t *)(Vbuf.ptr), N); scrypt_pbkdf2_1((unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)thash, 32); if (memcmp(thash, &hash[cur][8*i], 32) == 0) { //applog(LOG_INFO, "GPU #%d: %s result validates on CPU.", device_map[thr_id], device_name[thr_id]); *hashes_done = n - pdata[19]; pdata[19] = tmp_nonce; scrypt_free(&Vbuf); scrypt_free(&Ybuf); scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]); delete[] data[0]; delete[] data[1]; gettimeofday(tv_end, NULL); return 1; } else { applog(LOG_INFO, "GPU #%d: %s result does not validate on CPU (i=%d, s=%d)!", device_map[thr_id], device_name[thr_id], i, cur); } } } } cur = (cur+1)&1; nxt = (nxt+1)&1; ++iteration; } while (n <= max_nonce && !work_restart[thr_id].restart); scrypt_free(&Vbuf); scrypt_free(&Ybuf); scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]); delete[] data[0]; delete[] data[1]; *hashes_done = n - pdata[19]; pdata[19] = n; gettimeofday(tv_end, NULL); return 0; }
/*
 * Legacy GPU scanhash for scrypt-jane (non-keccak, CPU-PBKDF2 variant).
 *
 * Differs from the pipelined overload: no timing output, forces
 * parallel = 0, keeps its own host-side hash buffer, and counts nonces
 * with an off-by-one-avoiding "n = start - 1, ++n" scheme. Because two
 * batches are in flight, the batch examined each iteration (cur) started
 * at nonce[cur], hence the (n - throughput) bookkeeping at the end.
 *
 * Returns 1 on a CPU-validated share (pdata[19] = winning nonce) or 0 when
 * the nonce range is exhausted / work restarts; *hashes_done is updated on
 * both paths. Note there is no -1 error path in this variant.
 */
int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
{
	const uint32_t Htarg = ptarget[7];

	/* N-factor is derived from a header word (timestamp) */
	int Nfactor = GetNfactor(bswap_32x4(pdata[17]));
	if (Nfactor > scrypt_maxN) {
		scrypt_fatal_error("scrypt: N out of range");
	}
	N = (1 << (Nfactor + 1));

	/* this path always does PBKDF2 on the CPU */
	parallel = 0;

	int throughput = cuda_throughput(thr_id);

	/* host-side header copies (one per stream) and result hashes */
	uint32_t *data[2] = { new uint32_t[20*throughput], new uint32_t[20*throughput] };
	uint32_t *hash = new uint32_t[8*throughput];

	/* start one below pdata[19] because nonces are assigned with ++n */
	uint32_t n = pdata[19] - 1;

#if !defined(SCRYPT_TEST)
	/* one-time CPU self-test of the scrypt implementation */
	static int power_on_self_test = 0;
	if (!power_on_self_test) {
		power_on_self_test = 1;

		if (!scrypt_power_on_self_test())
			scrypt_fatal_error("scrypt: power on self test failed");
	}
#endif

	/* byte swap pdata into data[0]/[1] arrays */
	for (int k=0; k<2; ++k) {
		for(int z=0;z<20;z++) data[k][z] = bswap_32x4(pdata[z]);
		for(int i=1;i<throughput;++i) memcpy(&data[k][20*i], &data[k][0], 20*sizeof(uint32_t));
	}

	/* per-lane 128-byte X chunks plus Y/V scratch for CPU validation */
	scrypt_aligned_alloc Xbuf[2] = { scrypt_alloc(128 * throughput), scrypt_alloc(128 * throughput) };
	scrypt_aligned_alloc Vbuf = scrypt_alloc((uint64_t)N * 128);
	scrypt_aligned_alloc Ybuf = scrypt_alloc(128);

	uint32_t nonce[2];
	uint32_t* cuda_X[2] = { cuda_transferbuffer(thr_id,0), cuda_transferbuffer(thr_id,1) };

#if !defined(SCRYPT_CHOOSE_COMPILETIME)
	scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
#endif

	int cur = 0, nxt = 1;

	/* prime the pipeline: queue the first batch on stream `cur` */
	nonce[cur] = n+1;

	for(int i=0;i<throughput;++i) {
		uint32_t tmp_nonce = ++n;
		data[cur][20*i + 19] = bswap_32x4(tmp_nonce);
	}

	/* 1: X = PBKDF2(password, salt) */
	for(int i=0;i<throughput;++i)
		scrypt_pbkdf2_1((unsigned char *)&data[cur][20*i], 80, (unsigned char *)&data[cur][20*i], 80, Xbuf[cur].ptr + 128 * i, 128);

	/* 2: X = ROMix(X) in CUDA */
	memcpy(cuda_X[cur], Xbuf[cur].ptr, 128 * throughput);
	cuda_scrypt_HtoD(thr_id, cuda_X[cur], cur);
	cuda_scrypt_serialize(thr_id, cur);
	cuda_scrypt_core(thr_id, cur, N);
	cuda_scrypt_done(thr_id, cur);
	cuda_scrypt_DtoH(thr_id, cuda_X[cur], cur);
	cuda_scrypt_flush(thr_id, cur);

	do {
		/* queue the next batch on stream `nxt` while `cur` finishes */
		nonce[nxt] = n+1;
		for(int i=0;i<throughput;++i) {
			uint32_t tmp_nonce = ++n;
			data[nxt][20*i + 19] = bswap_32x4(tmp_nonce);
		}

		/* 1: X = PBKDF2(password, salt) */
		for(int i=0;i<throughput;++i)
			scrypt_pbkdf2_1((unsigned char *)&data[nxt][20*i], 80, (unsigned char *)&data[nxt][20*i], 80, Xbuf[nxt].ptr + 128 * i, 128);

		/* 2: X = ROMix(X) in CUDA */
		memcpy(cuda_X[nxt], Xbuf[nxt].ptr, 128 * throughput);
		cuda_scrypt_HtoD(thr_id, cuda_X[nxt], nxt);
		cuda_scrypt_serialize(thr_id, nxt);
		cuda_scrypt_core(thr_id, nxt, N);
		cuda_scrypt_done(thr_id, nxt);
		cuda_scrypt_DtoH(thr_id, cuda_X[nxt], nxt);
		cuda_scrypt_flush(thr_id, nxt);

		/* NOTE(review): return value ignored here, unlike the pipelined
		 * variant which treats a false result as a fatal GPU error */
		cuda_scrypt_sync(thr_id, cur);

#define VERIFY_ALL 0
#if VERIFY_ALL
		{
			/* debug aid: recompute every lane's ROMix on the CPU and
			 * compare against what the GPU produced */
			/* 2: X = ROMix(X) */
			for(int i=0;i<throughput;++i)
				scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)Ybuf.ptr, (scrypt_mix_word_t *)Vbuf.ptr, N);

			unsigned int err = 0;
			for(int i=0;i<throughput;++i) {
				unsigned char *ref = (Xbuf[cur].ptr + 128 * i);
				unsigned char *dat = (unsigned char*)(cuda_X[cur] + 32 * i);
				if (memcmp(ref, dat, 128) != 0) {
					err++;
#if 0
					uint32_t *ref32 = (uint32_t*) ref;
					uint32_t *dat32 = (uint32_t*) dat;
					for (int j=0; j<32; ++j) {
						if (ref32[j] != dat32[j])
							fprintf(stderr, "ref32[i=%d][j=%d] = $%08x / $%08x\n", i, j, ref32[j], dat32[j]);
					}
#endif
				}
			}
			if (err > 0) fprintf(stderr, "%d out of %d hashes differ.\n", err, throughput);
		}
#endif

		memcpy(Xbuf[cur].ptr, cuda_X[cur], 128 * throughput);

		/* 3: Out = PBKDF2(password, X) */
		for(int i=0;i<throughput;++i)
			scrypt_pbkdf2_1((unsigned char *)&data[cur][20*i], 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)&hash[8*i], 32);

		for(int i=0;i<throughput;++i) {
			/* NOTE(review): hashc is never read — looks like leftover
			 * debug scaffolding; confirm before removing */
			volatile unsigned char *hashc = (unsigned char *)&hash[8*i];

			/* cheap high-word pre-check before the full target test */
			if (hash[8*i+7] <= Htarg && fulltest(&hash[8*i], ptarget)) {
				uint32_t tmp_nonce = nonce[cur]+i;
				uint32_t thash[8], tdata[20];

				/* re-run the full scrypt on the CPU to validate the
				 * candidate before reporting it */
				for(int z=0;z<20;z++) tdata[z] = bswap_32x4(pdata[z]);
				tdata[19] = bswap_32x4(tmp_nonce);
				scrypt_pbkdf2_1((unsigned char *)tdata, 80, (unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128);
				scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)(Ybuf.ptr), (scrypt_mix_word_t *)(Vbuf.ptr), N);
				scrypt_pbkdf2_1((unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)thash, 32);

				if (memcmp(thash, &hash[8*i], 32) == 0) {
					/* share found: `nxt` batch is still in flight, so the
					 * completed count excludes the last `throughput` nonces */
					*hashes_done = (n-throughput) - pdata[19] + 1;
					pdata[19] = tmp_nonce;
					scrypt_free(&Vbuf); scrypt_free(&Ybuf);
					scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]);
					delete[] data[0]; delete[] data[1];
					delete[] hash;
					return 1;
				} else {
					applog(LOG_INFO, "GPU #%d: %s result does not validate on CPU (i=%d, s=%d)!", device_map[thr_id], device_name[thr_id], i, cur);
				}
			}
		}

		/* swap the double buffers */
		cur = (cur+1)&1;
		nxt = (nxt+1)&1;
	} while ((n-throughput) < max_nonce && !work_restart[thr_id].restart);

	scrypt_free(&Vbuf); scrypt_free(&Ybuf);
	scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]);
	delete[] data[0]; delete[] data[1];
	delete[] hash;

	/* only completed (synced) batches count toward the hash total */
	*hashes_done = (n-throughput) - pdata[19] + 1;
	pdata[19] = (n-throughput);
	return 0;
}