int scanhash_scrypt_jane(int thr_id, uint32_t *pdata,
	const uint32_t *ptarget,
	uint32_t max_nonce, struct timeval *tv_start, struct timeval *tv_end, unsigned long *hashes_done)
{
	const uint32_t Htarg = ptarget[7];
	static int s_Nfactor = 0;

	if (s_Nfactor == 0 && strlen(jane_params) > 0)
		applog(LOG_INFO, "Given scrypt-jane parameters: %s", jane_params);
	
	int Nfactor = GetNfactor(bswap_32x4(pdata[17]));
	if (Nfactor > scrypt_maxN) {
		scrypt_fatal_error("scrypt: N out of range");
	}
	
	if (Nfactor != s_Nfactor)
	{
		// all of this isn't very thread-safe...
		N = (1 << (Nfactor + 1));

		applog(LOG_INFO, "Nfactor is %d (N=%d)!", Nfactor, N);

		if (s_Nfactor != 0) {
			// handle N-factor increase at runtime
			// by adjusting the lookup_gap by factor 2
			if (s_Nfactor == Nfactor-1)
				for (int i=0; i < 8; ++i)
					device_lookup_gap[i] *= 2;
		}
		s_Nfactor = Nfactor;
	}

	int throughput = cuda_throughput(thr_id);
	
    if(throughput == 0)
        return -1;

	gettimeofday(tv_start, NULL);

	uint32_t *data[2] = { new uint32_t[20*throughput], new uint32_t[20*throughput] };
	uint32_t* hash[2]   = { cuda_hashbuffer(thr_id,0), cuda_hashbuffer(thr_id,1) };

	uint32_t n = pdata[19];
	
	/* byte swap pdata into data[0]/[1] arrays */
	for (int k=0; k<2; ++k) {
		for(int z=0;z<20;z++) data[k][z] = bswap_32x4(pdata[z]);
		for(int i=1;i<throughput;++i) memcpy(&data[k][20*i], &data[k][0], 20*sizeof(uint32_t));
	}
	if (parallel == 2) prepare_keccak512(thr_id, pdata);

	scrypt_aligned_alloc Xbuf[2] = { scrypt_alloc(128 * throughput), scrypt_alloc(128 * throughput) };
	scrypt_aligned_alloc Vbuf = scrypt_alloc((uint64_t)N * 128);
	scrypt_aligned_alloc Ybuf = scrypt_alloc(128);

	uint32_t nonce[2];
	uint32_t* cuda_X[2]      = { cuda_transferbuffer(thr_id,0), cuda_transferbuffer(thr_id,1) };

#if !defined(SCRYPT_CHOOSE_COMPILETIME)
	scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
#endif

	int cur = 0, nxt = 1;
    int iteration = 0;

	do {
		nonce[nxt] = n;

		if (parallel < 2) 
        {
		    for(int i=0;i<throughput;++i) {
			    uint32_t tmp_nonce = n++;
			    data[nxt][20*i + 19] = bswap_32x4(tmp_nonce);
		    }

			for(int i=0;i<throughput;++i)
				scrypt_pbkdf2_1((unsigned char *)&data[nxt][20*i], 80, (unsigned char *)&data[nxt][20*i], 80, Xbuf[nxt].ptr + 128 * i, 128);
            
			memcpy(cuda_X[nxt], Xbuf[nxt].ptr, 128 * throughput);
		    cuda_scrypt_serialize(thr_id, nxt);
			cuda_scrypt_HtoD(thr_id, cuda_X[nxt], nxt);
            cuda_scrypt_core(thr_id, nxt, N);
		    cuda_scrypt_done(thr_id, nxt);

			cuda_scrypt_DtoH(thr_id, cuda_X[nxt], nxt, false);
            
		    cuda_scrypt_flush(thr_id, nxt);

            if(!cuda_scrypt_sync(thr_id, cur))
            {
                return -1;
            }

			memcpy(Xbuf[cur].ptr, cuda_X[cur], 128 * throughput);
			for(int i=0;i<throughput;++i)
				scrypt_pbkdf2_1((unsigned char *)&data[cur][20*i], 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)(&hash[cur][8*i]), 32);
  
#define VERIFY_ALL 0
#if VERIFY_ALL
		    {
			    /* 2: X = ROMix(X) */
			    for(int i=0;i<throughput;++i)
				    scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)Ybuf.ptr, (scrypt_mix_word_t *)Vbuf.ptr, N);

			    unsigned int err = 0;
			    for(int i=0;i<throughput;++i) {
				    unsigned char *ref = (Xbuf[cur].ptr + 128 * i);
				    unsigned char *dat = (unsigned char*)(cuda_X[cur] + 32 * i);
				    if (memcmp(ref, dat, 128) != 0)
				    {
					    err++;
#if 0
					    uint32_t *ref32 = (uint32_t*) ref;
					    uint32_t *dat32 = (uint32_t*) dat;
					    for (int j=0; j<32; ++j) {
						    if (ref32[j] != dat32[j])
						    fprintf(stderr, "ref32[i=%d][j=%d] = $%08x / $%08x\n", i, j, ref32[j], dat32[j]);
					    }
#endif
				    }
			    }
			    if (err > 0) fprintf(stderr, "%d out of %d hashes differ.\n", err, throughput);
		    }
#endif
		} else {
            n += throughput;

		    cuda_scrypt_serialize(thr_id, nxt);
			pre_keccak512(thr_id, nxt, nonce[nxt], throughput);
            cuda_scrypt_core(thr_id, nxt, N);

            cuda_scrypt_flush(thr_id, nxt);
		    
			post_keccak512(thr_id, nxt, nonce[nxt], throughput);
    	    cuda_scrypt_done(thr_id, nxt);

			cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true);
    	    
            if(!cuda_scrypt_sync(thr_id, cur))
            {
                return -1;
            }
		}

        if(iteration > 0)
        {
		    for(int i=0;i<throughput;++i) {
			    volatile unsigned char *hashc = (unsigned char *)(&hash[cur][8*i]);

			    if (hash[cur][8*i+7] <= Htarg && fulltest(&hash[cur][8*i], ptarget)) {

				    uint32_t tmp_nonce = nonce[cur]+i;
					
				    uint32_t thash[8], tdata[20];
				    for(int z=0;z<20;z++) tdata[z] = bswap_32x4(pdata[z]);
				    tdata[19] = bswap_32x4(tmp_nonce);
				    scrypt_pbkdf2_1((unsigned char *)tdata, 80, (unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128);
				    scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)(Ybuf.ptr), (scrypt_mix_word_t *)(Vbuf.ptr), N);
				    scrypt_pbkdf2_1((unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)thash, 32);
				    if (memcmp(thash, &hash[cur][8*i], 32) == 0)
				    {
					    //applog(LOG_INFO, "GPU #%d: %s result validates on CPU.", device_map[thr_id], device_name[thr_id]);

					    *hashes_done = n - pdata[19];
					    pdata[19] = tmp_nonce;
					    scrypt_free(&Vbuf);
					    scrypt_free(&Ybuf);
					    scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]);
					    delete[] data[0]; delete[] data[1];
					    gettimeofday(tv_end, NULL);
					    return 1;
				    }
				    else
				    {
					    applog(LOG_INFO, "GPU #%d: %s result does not validate on CPU (i=%d, s=%d)!", device_map[thr_id], device_name[thr_id], i, cur);
				    }
			    }
		    }
        }

		cur = (cur+1)&1; 
        nxt = (nxt+1)&1;
        ++iteration;
	} while (n <= max_nonce && !work_restart[thr_id].restart);
	
	scrypt_free(&Vbuf);
	scrypt_free(&Ybuf);
	scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]);
	delete[] data[0]; delete[] data[1];
	
	*hashes_done = n - pdata[19];
	pdata[19] = n;
	gettimeofday(tv_end, NULL);
	return 0;
}
// Example #2
// 0
/*
 * Scans a nonce range for a Keccak-256 hash that meets the target, using the
 * CUDA device bound to thr_id.
 *
 * Two host hash buffers are used alternately: while the batch in `cur` is
 * synchronised and inspected, the batch for `nxt` has already been queued.
 * With validate == false the first word of a buffer is read back as the
 * candidate nonce, with 0xffffffff meaning "none".
 *
 * Parameters:
 *   thr_id      - mining thread index, passed to all cuda_* helpers.
 *   pdata       - 20-word block header; pdata[19] is the first nonce to test
 *                 and is updated on return.
 *   ptarget     - 8-word target; ptarget[7] is used as a quick pre-filter.
 *   max_nonce   - upper bound of the nonce range.
 *   tv_start    - receives the time at which scanning started.
 *   tv_end      - receives the time at which scanning finished (not written
 *                 on the -1 error paths).
 *   hashes_done - receives the number of nonces covered (not written on the
 *                 -1 error paths).
 *
 * Returns 1 when a candidate nonce was confirmed on the CPU, 0 when the range
 * was exhausted or a work restart was requested, and -1 when the device
 * reports zero throughput or cuda_scrypt_sync() fails.
 */
int scanhash_keccak(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
	uint32_t max_nonce, struct timeval *tv_start, struct timeval *tv_end, unsigned long *hashes_done)
{
	int throughput = cuda_throughput(thr_id);

	// With zero throughput n would never advance and the loop below could not
	// terminate on the nonce bound; bail out like scanhash_scrypt_jane does.
	if (throughput == 0)
		return -1;

	gettimeofday(tv_start, NULL);

	// n trails the next nonce to be issued by one (next nonce is n+1).
	uint32_t n = pdata[19] - 1;

	// TESTING ONLY
//	((uint32_t*)ptarget)[7] = 0x0000000f;

	const uint32_t Htarg = ptarget[7];

	// Big-endian encoded copy of the header, used for device setup and for
	// CPU re-hashing.
	uint32_t endiandata[20];
	for (int kk=0; kk < 20; kk++)
		be32enc(&endiandata[kk], pdata[kk]);

	cuda_prepare_keccak256(thr_id, endiandata, ptarget);

	// Pre-fill both result buffers with 0xff so that an untouched first word
	// reads as 0xffffffff ("no result").
	uint32_t *cuda_hash64[2] = { (uint32_t *)cuda_hashbuffer(thr_id, 0), (uint32_t *)cuda_hashbuffer(thr_id, 1) };
	memset(cuda_hash64[0], 0xff, throughput * 8 * sizeof(uint32_t));
	memset(cuda_hash64[1], 0xff, throughput * 8 * sizeof(uint32_t));

	bool validate = false;
	uint32_t nonce[2];
	int cur = 0, nxt = 1;

	// begin work on first CUDA stream
	// NOTE(review): both launches pass 0 as the second argument although the
	// comments speak of alternating streams -- confirm this is intended.
	nonce[cur] = n+1; n += throughput;
	cuda_do_keccak256(thr_id, 0, cuda_hash64[cur], nonce[cur], throughput, validate);

	do {

		nonce[nxt] = n+1; n += throughput;
		if ((n-throughput) < max_nonce && !work_restart[thr_id].restart)
		{
			// begin work on next CUDA stream
			cuda_do_keccak256(thr_id, 0, cuda_hash64[nxt], nonce[nxt], throughput, validate);
		}

		// synchronize current stream and get the "winning" nonce index, if any
		// A failed sync means the host buffer contents cannot be trusted.
		if (!cuda_scrypt_sync(thr_id, cur))
			return -1;
		uint32_t result =  *cuda_hash64[cur];

		// optional full CPU based validation (see validate flag)
		if (validate)
		{
			for (int i=0; i < throughput; ++i)
			{
				uint32_t hash64[8];
				be32enc(&endiandata[19], nonce[cur]+i);
				crypto_hash( (unsigned char*)hash64, (unsigned char*)&endiandata[0], 80 );

				// Compare against the i-th 8-word hash of the current buffer.
				if (memcmp(hash64, &cuda_hash64[cur][8*i], 32))
					fprintf(stderr, "CPU and CUDA hashes (i=%d) differ!\n", i);
			}
		}
		else if (result != 0xffffffff && result > pdata[19])
		{
			// Re-hash the candidate on the CPU and make sure it belongs to the
			// batch that was just synchronised before reporting it.
			uint32_t hash64[8];
			be32enc(&endiandata[19], result);
			crypto_hash( (unsigned char*)hash64, (unsigned char*)&endiandata[0], 80 );
			if (result >= nonce[cur] && result < nonce[cur]+throughput && hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
				*hashes_done = n-throughput - pdata[19] + 1;
				pdata[19] = result;
				gettimeofday(tv_end, NULL);
				return 1;
			} else {
				applog(LOG_INFO, "GPU #%d: %s result for nonce $%08x does not validate on CPU!", device_map[thr_id], device_name[thr_id], result);
			}
		}
		// Swap the roles of the two buffers.
		cur = (cur + 1) % 2;
		nxt = (nxt + 1) % 2;
	} while ((n-throughput) < max_nonce && !work_restart[thr_id].restart);

	*hashes_done = n-throughput - pdata[19] + 1;
	if (n-throughput > pdata[19])
		// CB: don't report values bigger max_nonce
		pdata[19] = max_nonce > n-throughput ? n-throughput : max_nonce;
	else
		pdata[19] = 0xffffffffU; // CB: prevent nonce space overflow.
	gettimeofday(tv_end, NULL);
	return 0;
}