static void calc_checksum(char *name) { long64 orig_size; if (!work) work = alloc_work(total_work); data = map_file(name, 0, &size); orig_size = size; if ((size & 0x3f) == 0x10) { size &= ~0x3fULL; memcpy(checksum1, data + size, 16); checksum_found = 1; } else { if (size & 0x3f) { printf("Size of %s is not a multiple of 64.\n", name); exit(1); } checksum_found = 0; } int chunks = (size + CHUNK - 1) / CHUNK; results = (uint64 *)malloc(32 * chunks); fill_work(total_work, chunks, 0, work); run_threaded(checksum_worker, work, 0); CityHashCrc128((char *)results, 32 * chunks, checksum2); unmap_file(data, orig_size); free(results); if (checksum_found) checksum_match = (checksum1[0] == checksum2[0] && checksum1[1] == checksum2[1]); }
/**
 * This proof-of-work is computationally difficult even for a single hash,
 * but must be so to prevent optimizations to the required memory footprint.
 *
 * The maximum level of parallelism achievable per GB of RAM is 8, and the highest
 * end GPUs now have 4 GB of RAM, which means they could in theory support 32
 * parallel executions of this proof-of-work.
 *
 * On GPUs you only tend to get 1 instruction per 4 clock cycles in a single
 * thread context. Modern super-scalar CPUs can get more than 1 instruction
 * per clock, and CityHash is specifically optimized to take advantage of this.
 * In addition to getting more done per cycle, CPUs have close to 4x the clock
 * frequency.
 *
 * Based upon these characteristics alone, I estimate that a CPU can execute the
 * serial portions of this algorithm at least 16x faster than a GPU, which means
 * that an 8-core CPU should easily compete with a 128-core GPU. Fortunately,
 * a 128-core GPU would require 16 GB of RAM. Note also that most GPUs have
 * fewer than 128 'real' cores that are able to handle conditionals.
 *
 * Furthermore, GPUs are not well suited for branch misprediction and code
 * must be optimized to avoid branches as much as possible.
 *
 * Lastly, this algorithm takes advantage of a hardware instruction that is
 * unlikely to be included in GPUs (Intel CRC32). The lack of this hardware
 * instruction alone is likely to give the CPU an order of magnitude advantage
 * over the GPUs.
 *
 * @param in           256-bit seed; its 32 bytes are consumed as 8 x uint32_t
 *                     PRNG seeds, one per worker thread.
 * @param buffer_128m  caller-provided scratch buffer of MB128 bytes, filled
 *                     and then hashed; contents on entry are irrelevant.
 * @return sha1 of the 128-bit CityHashCrc128 of the filled buffer.
 */
fc::sha1 proof_of_work( const fc::sha256& in, unsigned char* buffer_128m )
{
   // I use 8 threads to reduce latency in the calculation; it may be possible to
   // get higher throughput by going to a single thread and performing multiple
   // proofs in parallel. Latency is important in some cases (verification) while
   // throughput is important in mining. Two variations of this method may
   // eventually need to be created.
   static fc::thread _threads[8];

   // Reinterpret the 32-byte sha256 as 8 thread seeds.
   // NOTE(review): this was corrupted in the source as "(uint32_t*)∈" — an
   // HTML-entity mangling of "&in;". Restored, and made const-correct since
   // the seeds are only read.
   const uint32_t* _seeds = (const uint32_t*)&in;

   fc::future<void> ready[8];
   for( uint32_t i = 0; i < 8; ++i )
   {
      ready[i] = _threads[i].async( [=]()
      {
          // This generator does not optimize well on GPU, but is fast on CPU.
          boost::random::mt11213b gen( _seeds[i] );

          // Each thread owns one eighth of the 128 MB buffer, walked as uint32s.
          uint32_t* start = (uint32_t*)(buffer_128m + i * MB128/8);
          uint32_t* end   = start + MB128/8/4;

          for( uint32_t* pos = start; pos < end; ++pos )
          {
             *pos = gen();
             if( *pos % 17 > 15 ) // hardware branch misprediction, foils GPU data-parallel instructions
             {
                if( pos > (start + 32/4) )
                   *pos = CityHashCrc128( (char*)(pos - 32/4), 32 ).first; // CRC is fast on Intel
             }
             else if( *pos > gen() ) // unpredictable branch foils GPU and CPU
             {
                if( (*pos % 2) && pos > (start + 256/4) )
                   *pos = CityHashCrc128( (char*)(pos - 256/4), 32 ).first; // CRC is fast on Intel
             }
          }
      });
   }
   for( uint32_t i = 0; i < 8; ++i )
   {
      ready[i].wait();
   }

   // Require the full 128 MB to complete the sequential step.
   auto out = CityHashCrc128( (char*)buffer_128m, MB128 );
   return fc::sha1::hash( (char*)&out, sizeof(out) );
}