/**
 * Sets up the V3 momentum solver on one OpenCL device.
 *
 * Loads the program from the three OpenCL source files, fetches the
 * "kernel_sha512" and "kernel_clean_hash_table" kernels, creates the single
 * command queue used by this instance, allocates the device buffers and
 * binds the kernel arguments.
 *
 * @param _HASH_BITS  log2 of the number of 32-bit slots in the device hash table.
 * @param _device_num index handed to OpenCLMain::getDevice().
 */
OpenCLMomentumV3::OpenCLMomentumV3(int _HASH_BITS, int _device_num) {
	max_threads = 1<<30; // very big; clamped to the device limit below
	HASH_BITS = _HASH_BITS;
	device_num = _device_num;

	OpenCLMain& main = OpenCLMain::getInstance();

	// checks if device exists (a negative index can never be valid)
	if (device_num < 0 || main.getNumDevices() <= device_num) {
		printf("ERROR: DEVICE %d does not exist. Please limit your threads to one per device.\n", device_num);
		assert(false);
	}

	// compiles
	fprintf(stdout, "Starting OpenCLMomentum V3\n");
	fprintf(stdout, "Device %02d: %s\n", device_num, main.getDevice(device_num)->getName().c_str());
	cl_ulong maxWorkGroupSize = main.getDevice(device_num)->getMaxWorkGroupSize();
	// cl_ulong is not "unsigned long long" on every platform, so cast to
	// match the %llu conversion exactly.
	fprintf(stdout, "Max work group size: %llu\n", static_cast<unsigned long long>(maxWorkGroupSize));

	if (maxWorkGroupSize < max_threads) max_threads = maxWorkGroupSize;

	OpenCLContext *context = main.getDevice(device_num)->getContext();
	std::vector<std::string> program_filenames;
	program_filenames.push_back("opencl/opencl_cryptsha512.h");
	program_filenames.push_back("opencl/cryptsha512_kernel.cl");
	program_filenames.push_back("opencl/OpenCLMomentumV3.cl");
	OpenCLProgram *program = context->loadProgramFromFiles(program_filenames);

	// prealloc kernels
	OpenCLKernel *kernel = program->getKernel("kernel_sha512");
	OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

	// only one queue, helps with memory leaking
	queue = context->createCommandQueue(main.getDevice(device_num));

	// Number of hash-table slots, computed once in an unsigned type so the
	// shift and the byte-size multiplication below cannot go through a
	// signed int.
	const size_t ht_slots = static_cast<size_t>(1) << HASH_BITS;

	// allocate internal structure
	cl_message = context->createBuffer(sizeof(uint8_t)*32, CL_MEM_READ_ONLY, NULL);
	internal_hash_table = context->createBuffer(sizeof(uint32_t)*ht_slots, CL_MEM_READ_WRITE, NULL);
	temp_collisions = context->createBuffer(sizeof(collision_struct)*getCollisionCeiling(), CL_MEM_WRITE_ONLY, NULL);
	temp_collisions_count = context->createBuffer(sizeof(size_t), CL_MEM_READ_WRITE, NULL);

	// sets args
	kernel_cleanup->resetArgs();
	kernel_cleanup->addGlobalArg(internal_hash_table);

	// Argument order: message, hash table, table size, collision output,
	// collision counter.
	kernel->resetArgs();
	kernel->addGlobalArg(cl_message);
	kernel->addGlobalArg(internal_hash_table);
	uint32_t ht_size = static_cast<uint32_t>(ht_slots);
	kernel->addScalarUInt(ht_size);
	kernel->addGlobalArg(temp_collisions);
	kernel->addGlobalArg(temp_collisions_count);

}
Example #2
0
/**
 * Runs one collision search for `message` on this instance's device.
 *
 * The host first feeds a 4-byte zero prefix plus the 32-byte message through
 * SHA512_Init / SHA512_Update_Simple / SHA512_PreFinal, patches the resulting
 * context buffer, and uploads SHA512_BLOCK_SIZE bytes of it to `cl_message`.
 * It then enqueues the table-cleanup kernel and the "kernel_sha512" kernel
 * and reads the results back.
 *
 * @param message         32 bytes of input (exactly 32 bytes are read).
 * @param collisions      output array; getCollisionCeiling() entries are
 *                        always copied into it, so it must be at least that large.
 * @param collision_count output counter; zeroed here, uploaded to the device,
 *                        then overwritten with the value read back.
 */
void OpenCLMomentumV9::find_collisions(uint8_t* message, collision_struct* collisions, size_t* collision_count) {

    // The counter is uploaded to the device below, so it has to start at zero.
    *collision_count = 0;

    SHA512_Context c512_avxsse;
    SHA512_Init(&c512_avxsse);

    // Input block: four zero bytes followed by the message. The prefix is
    // cleared with memset instead of a store through a uint32_t* cast, which
    // would violate aliasing/alignment rules for a byte array.
    uint8_t midhash[32+4];
    memset(midhash, 0, 4);
    memcpy(midhash+4, message, 32);
    SHA512_Update_Simple(&c512_avxsse, midhash, 32+4);
    SHA512_PreFinal(&c512_avxsse);

    // Patch the prepared block in place: zero its first four bytes and run
    // SWAP64 over 64-bit words 1..4 (presumably an endianness swap for the
    // kernel -- confirm against SWAP64's definition). Words are moved with
    // memcpy so the byte buffer is never accessed through a uint64_t lvalue.
    unsigned char *block = reinterpret_cast<unsigned char*>(&c512_avxsse.buffer.bytes[0]);
    memset(block, 0, sizeof(uint32_t));
    for (int i = 1; i < 5; i++) {
        uint64_t word;
        memcpy(&word, block + i*sizeof(uint64_t), sizeof(uint64_t));
        word = SWAP64(word);
        memcpy(block + i*sizeof(uint64_t), &word, sizeof(uint64_t));
    }

    OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext();
    OpenCLProgram *program = context->getProgram(0);

    OpenCLKernel *kernel = program->getKernel("kernel_sha512");
    OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

    // Both kernels are dereferenced below, so check both.
    assert(kernel != NULL);
    assert(kernel_cleanup != NULL);

    //size_t BLOCKSIZE = main.getPlatform(0)->getDevice(0)->getMaxWorkGroupSize();
    size_t BLOCKSIZE = kernel->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num));
    //has to be a power of 2
    BLOCKSIZE = 1<<log2(BLOCKSIZE);
    size_t BLOCKSIZE_CLEAN = kernel_cleanup->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num));
    BLOCKSIZE_CLEAN = 1<<log2(BLOCKSIZE_CLEAN);

//	printf("BLOCKSIZE = %ld\n", BLOCKSIZE);
//	printf("BLOCKSIZE_CLEAN = %ld\n", BLOCKSIZE_CLEAN);

    // cleans up the hash table (one work item per slot)
    queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, BLOCKSIZE_CLEAN);

    // upload the prepared block and the zeroed counter
    queue->enqueueWriteBuffer(cl_message, c512_avxsse.buffer.bytes, sizeof(uint8_t)*SHA512_BLOCK_SIZE);
    queue->enqueueWriteBuffer(temp_collisions_count, collision_count, sizeof(size_t));

    // search, then fetch the counter and the full collision buffer
    queue->enqueueKernel1D(kernel, MAX_MOMENTUM_NONCE/8, BLOCKSIZE);
    queue->enqueueReadBuffer(temp_collisions_count, collision_count, sizeof(size_t));
    queue->enqueueReadBuffer(temp_collisions, collisions, sizeof(collision_struct)*getCollisionCeiling());
    queue->finish();


}
void OpenCLMomentumV3::find_collisions(uint8_t* message, collision_struct* collisions, size_t* collision_count) {

	// temp storage
	*collision_count = 0;

	OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext();
	OpenCLProgram *program = context->getProgram(0);

	OpenCLKernel *kernel = program->getKernel("kernel_sha512");
	OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

	assert(kernel != NULL);

	//size_t BLOCKSIZE = main.getPlatform(0)->getDevice(0)->getMaxWorkGroupSize();
	size_t BLOCKSIZE = kernel->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num));
	//has to be a power of 2
	BLOCKSIZE = 1<<log2(BLOCKSIZE);
	size_t BLOCKSIZE_CLEAN = kernel_cleanup->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num));
	BLOCKSIZE_CLEAN = 1<<log2(BLOCKSIZE_CLEAN);

//	printf("BLOCKSIZE = %ld\n", BLOCKSIZE);
//	printf("BLOCKSIZE_CLEAN = %ld\n", BLOCKSIZE_CLEAN);

	// cleans up the hash table
	queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, BLOCKSIZE_CLEAN);

	queue->enqueueWriteBuffer(cl_message, message, sizeof(uint8_t)*32);
	queue->enqueueWriteBuffer(temp_collisions_count, collision_count, sizeof(size_t));

	queue->enqueueKernel1D(kernel, MAX_MOMENTUM_NONCE/8, BLOCKSIZE);
	queue->enqueueReadBuffer(temp_collisions_count, collision_count, sizeof(size_t));
	queue->enqueueReadBuffer(temp_collisions, collisions, sizeof(collision_struct)*getCollisionCeiling());
	queue->finish();


}
/**
 * Runs one collision search for `message` on this instance's device using
 * the three-kernel V8 pipeline.
 *
 * The host first feeds a 4-byte zero prefix plus the 32-byte message through
 * SHA512_Init / SHA512_Update_Simple / SHA512_PreFinal, patches the resulting
 * context buffer, and uploads SHA512_BLOCK_SIZE bytes of it to `cl_message`.
 * The device work is then enqueued in this order: "kernel_clean_hash_table",
 * "calculate_all_hashes", "fill_table", "find_collisions", followed by the
 * result read-back and a queue finish.
 *
 * @param message   32 bytes of input (exactly 32 bytes are read).
 * @param out_buff  output array; getCollisionCeiling() entries are always
 *                  copied into it, so it must be at least that large.
 * @param out_count output counter; zeroed here, uploaded to the device, then
 *                  overwritten with the value read back.
 */
void OpenCLMomentumV8::find_collisions(uint8_t* message, collision_struct* out_buff, size_t* out_count) {

	// The counter is uploaded to the device below, so it has to start at zero.
	*out_count = 0;

	SHA512_Context c512_avxsse;
	SHA512_Init(&c512_avxsse);

	// Input block: four zero bytes followed by the message. The prefix is
	// cleared with memset instead of a store through a uint32_t* cast, which
	// would violate aliasing/alignment rules for a byte array.
	uint8_t midhash[32+4];
	memset(midhash, 0, 4);
	memcpy(midhash+4, message, 32);
	SHA512_Update_Simple(&c512_avxsse, midhash, 32+4);
	SHA512_PreFinal(&c512_avxsse);

	// Patch the prepared block in place: zero its first four bytes and run
	// SWAP64 over 64-bit words 1..4 (presumably an endianness swap for the
	// kernel -- confirm against SWAP64's definition). Words are moved with
	// memcpy so the byte buffer is never accessed through a uint64_t lvalue.
	unsigned char *block = reinterpret_cast<unsigned char*>(&c512_avxsse.buffer.bytes[0]);
	memset(block, 0, sizeof(uint32_t));
	for (int i = 1; i < 5; i++) {
		uint64_t word;
		memcpy(&word, block + i*sizeof(uint64_t), sizeof(uint64_t));
		word = SWAP64(word);
		memcpy(block + i*sizeof(uint64_t), &word, sizeof(uint64_t));
	}

	OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext();
	OpenCLProgram *program = context->getProgram(0);

	OpenCLKernel *kernel_calculate_all_hashes = program->getKernel("calculate_all_hashes");
	OpenCLKernel *kernel_fill_table = program->getKernel("fill_table");
	OpenCLKernel *kernel_find_collisions = program->getKernel("find_collisions");
	OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

	OpenCLDevice * device = OpenCLMain::getInstance().getDevice(device_num);

	// Every launch below uses the kernel's own work-group size, reduced to a
	// power of two via 1 << log2(size).

	// cleans up the hash table
	size_t kc_wgsize = kernel_cleanup->getWorkGroupSize(device);
	kc_wgsize = 1<<log2(kc_wgsize);
	queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, kc_wgsize);

//	printf("Cleaning the HT\n");
//	queue->finish();

	queue->enqueueWriteBuffer(cl_message, c512_avxsse.buffer.bytes, sizeof(uint8_t)*SHA512_BLOCK_SIZE);
	// step 1, calculate hashes
	size_t kcah_wgsize = kernel_calculate_all_hashes->getWorkGroupSize(device);
	kcah_wgsize = 1<<log2(kcah_wgsize);
	queue->enqueueKernel1D(kernel_calculate_all_hashes, MAX_MOMENTUM_NONCE/8,
			kcah_wgsize);

	// Disabled debugging aid: reads the hashes back and compares them with a
	// host-side sph_sha512 computation.
//	uint64_t * apa = new uint64_t[MAX_MOMENTUM_NONCE];
//	queue->enqueueReadBuffer(hashes, apa, sizeof(uint64_t)*MAX_MOMENTUM_NONCE);
//	queue->finish();
//
//	printf("testing hashes\n");
//	uint64_t count = 0;
//	for (int i = 0; i < MAX_MOMENTUM_NONCE; i++) {
//		if (apa[i] == 0) {
//			count++;
//			printf("BAD HASH AT: %d %X\n", i, apa[i]);
//		}
//	}
//	printf("counted %X bad hashes\n", count);
//	printf("NOW REALLY TEST THEM hashes\n");
//	count = 0;
//	for (uint32_t i = 0; i < MAX_MOMENTUM_NONCE/8; i+=8) {
//		sph_sha512_context c512_sph; //SPH
//		sph_sha512_init(&c512_sph);
//		sph_sha512(&c512_sph, &i, 4);
//		sph_sha512(&c512_sph, message, 32);
//		uint64_t out[8];
//		sph_sha512_close(&c512_sph, out);
//		for (int j =0; j < 8; j++) {
//			if (apa[i+j] != out[j]) {
//				count++;
//				uint64_t xxx = apa[i+j];
//				printf("BAD HASH AT: %d => %X != %X\n", i, apa[i+j], out[j]);
//			}
//		}
//	}
//	printf("counted %X bad hashes\n", count);

	// step 2, populate hashtable
	size_t kft_wgsize = kernel_fill_table->getWorkGroupSize(device);
	kft_wgsize = 1<<log2(kft_wgsize);
	queue->enqueueKernel1D(kernel_fill_table, MAX_MOMENTUM_NONCE,
							kft_wgsize);

//	printf("step 2, populate hashtable\n");
//	queue->finish();

	// upload the zeroed counter before the kernel that increments it
	queue->enqueueWriteBuffer(collisions_count, out_count, sizeof(size_t));
	// step 3, find collisions
	size_t kfc_wgsize = kernel_find_collisions->getWorkGroupSize(device);
	kfc_wgsize = 1<<log2(kfc_wgsize);
	queue->enqueueKernel1D(kernel_find_collisions, MAX_MOMENTUM_NONCE,
							kfc_wgsize);

//	printf("step 3, find collisions\n");
//	queue->finish();

	// step 4, copy output: the counter plus the full collision buffer
	queue->enqueueReadBuffer(collisions_count, out_count, sizeof(size_t));
	queue->enqueueReadBuffer(collisions, out_buff, sizeof(collision_struct)*getCollisionCeiling());

//	printf("step 4, copy output\n");
	queue->finish();


#ifdef DEBUG
	// size_t does not match %d; cast so the argument matches %llu exactly.
	printf("Collision Count = %llu\n", static_cast<unsigned long long>(*out_count));
#endif

}
/**
 * Sets up the Protoshare miner on one OpenCL device.
 *
 * Steps, in order:
 *   1. look up the device and dump its information;
 *   2. take the work-group size, bucket count, vector type, bucket size and
 *      memory target from `commandlineInput`, deriving the bucket size from
 *      the memory target when one is set (or when neither is set);
 *   3. check the configuration against the device's per-allocation and
 *      global memory limits, terminating the process with a failure status
 *      if it cannot work;
 *   4. build the OpenCL program with the chosen parameters as -D defines;
 *   5. fetch the two kernels and create the device buffers and command queue.
 *
 * @param _device_num index handed to OpenCLMain::getDevice().
 */
ProtoshareOpenCL::ProtoshareOpenCL(int _device_num)
{
    this->device_num = _device_num;

    printf("Initializing GPU %d\n", device_num);
    OpenCLMain &main = OpenCLMain::getInstance();
    this->device = main.getDevice(device_num);


    printf("======================================================================\n");
    printf("Device information for: %s\n", device->getName().c_str());
    device->dumpDeviceInfo(); // Makes troubleshooting easier
    printf("======================================================================\n");
    printf("\n");

    // Sanitize input parameters: a work-group size of 0 means "use the device maximum"
    if (commandlineInput.wgs == 0) {
        this->wgs = device->getMaxWorkGroupSize();
    } else {
        this->wgs = commandlineInput.wgs;
    }

    this->buckets_log2 = commandlineInput.buckets_log2;
    this->vect_type = commandlineInput.vect_type;
    this->bucket_size = commandlineInput.bucket_size;
    this->target_mem = commandlineInput.target_mem;

    // If bucket size unset and target memory unset, use maximum usable memory (in MB)
    if (target_mem == 0 && bucket_size == 0) {
        target_mem = device->getGlobalMemSize() / 1024 / 1024;
    }

    // If set, convert target memory into a usable value for bucket_size
    if (target_mem > 0) {
        // Convert the target from MB to bytes. The product is kept in 64 bits:
        // a target of 4096 MB or more does not fit in 32 bits and would wrap
        // (4096 MB wraps to exactly 0, which made every configuration fail).
        const cl_ulong target_mem_bytes = static_cast<cl_ulong>(target_mem) * 1024 * 1024;

        // Lazy calculation, assume large bucket_size, scale back from there
        // until the total usage no longer exceeds the target.
        bucket_size = 1024;

        while (bucket_size > 0 && calc_total_mem_usage(buckets_log2, bucket_size) > target_mem_bytes) { bucket_size--; }

        // Make sure the parameter configuration is sane:
        if (bucket_size < 1) {
            printf("ERROR: Memory target of %d MB cannot be attained with 2^%d buckets!\n", target_mem, buckets_log2);
            printf("       Please lower the value of \"-b\" or increase the value of \"-m\".\n");
            exit(EXIT_FAILURE); // configuration error: report failure to the caller
        }
    }


    // Make sure we can allocate hash_list (cannot violate CL_DEVICE_MAX_MEM_ALLOC_SIZE)
    // cl_ulong values are cast to unsigned long long so they match %llu exactly.
    cl_ulong required_mem = calc_hash_mem_usage(buckets_log2, bucket_size);
    cl_ulong available_mem = device->getMaxMemAllocSize();
    if (required_mem > available_mem) {
        printf("ERROR: Device %d cannot allocate 2^%d buckets of %d elements!\n", device_num, buckets_log2, bucket_size);
        printf("       CL_DEVICE_MAX_MEM_ALLOC_SIZE is %llu MB, this configuration requires %llu MB\n",
               static_cast<unsigned long long>(available_mem / 1024 / 1024),
               static_cast<unsigned long long>(required_mem / 1024 / 1024));
        printf("       Please lower the value of \"-b\" or \"-s\" or increase the value of \"-m\".\n");
        exit(EXIT_FAILURE);
    }


    // Make sure we can allocate nonce_map (cannot violate CL_DEVICE_MAX_MEM_ALLOC_SIZE)
    required_mem = calc_index_mem_usage(buckets_log2, bucket_size);
    available_mem = device->getMaxMemAllocSize();
    if (required_mem > available_mem) {
        printf("ERROR: Device %d cannot allocate index of 2^%d elements!\n", device_num, buckets_log2);
        printf("       CL_DEVICE_MAX_MEM_ALLOC_SIZE is %llu MB, this configuration requires %llu MB\n",
               static_cast<unsigned long long>(available_mem / 1024 / 1024),
               static_cast<unsigned long long>(required_mem / 1024 / 1024));
        printf("       Please lower the value of \"-b\" or increase the value of \"-m\".\n");
        exit(EXIT_FAILURE);
    }


    // Make sure the whole thing fits in memory
    required_mem = calc_total_mem_usage(buckets_log2, bucket_size);
    available_mem = device->getGlobalMemSize();
    if (required_mem > available_mem) {
        printf("ERROR: Device %d cannot store 2^%d buckets of %d elements!\n", device_num, buckets_log2, bucket_size);
        printf("       CL_DEVICE_GLOBAL_MEM_SIZE is %llu MB, this configuration requires %llu MB\n",
               static_cast<unsigned long long>(available_mem / 1024 / 1024),
               static_cast<unsigned long long>(required_mem / 1024 / 1024));
        printf("       Please lower the value of \"-b\" or \"-s\" or increase the value of \"-m\".\n");
        exit(EXIT_FAILURE);
    }


    // All clear, show the running parameters!
    // (required_mem still holds the total usage computed just above.)
    printf("Using %d work group size\n", wgs);
    printf("Using vector size %d\n", vect_type);
    printf("Using 2^%d buckets\n", buckets_log2);
    printf("Using %d elements per bucket\n", bucket_size);
    printf("Using %llu MB of memory\n", static_cast<unsigned long long>(required_mem / 1024 / 1024));
    printf("Estimated drop percentage: %5.2f%%\n", 100 * poisson_estimate((1 << buckets_log2), MAX_MOMENTUM_NONCE, bucket_size));
    printf("\n");


    // Compile the OpenCL code
    printf("Compiling OpenCL code... this may take 3-5 minutes\n");


    bool isGPU = device->isGPU();
    if (!isGPU) { gpu_watchdog_max_wait *= 6; } // Effectively disable the watchdog

    // Build options: the chosen configuration is passed to the kernels as -D defines.
    std::stringstream params;
    params << " -I ./opencl/";
    params << " -D DEVICE_GPU=" << (isGPU ? 1 : 0);
    params << " -D VECT_TYPE=" << vect_type;
    params << " -D LOCAL_WGS=" << wgs;
    params << " -D NUM_BUCKETS_LOG2=" << buckets_log2;
    params << " -D BUCKET_SIZE=" << bucket_size;

    // The program, the buffers and the queue all come from the device's context.
    OpenCLContext *context = device->getContext();

#ifdef USE_SOURCE
    // Load the kernel source from disk.
    std::vector<std::string> file_list;
    file_list.push_back("opencl/momentum.cl");
    OpenCLProgram* program = context->loadProgramFromFiles(file_list, params.str());
#else
    // Use the source string returned by getMomentumOpenCL().
    std::vector<std::string> input_src;
    input_src.push_back(getMomentumOpenCL());
    OpenCLProgram* program = context->loadProgramFromStrings(input_src, params.str());
#endif

    kernel_hash   = program->getKernel("hash_step");
    kernel_reset  = program->getKernel("reset_and_seek");

    mid_hash = context->createBuffer(32 * sizeof(cl_uint), CL_MEM_READ_ONLY, NULL);

    // Sizes match the per-allocation checks performed above.
    hash_list  = context->createBuffer(calc_hash_mem_usage(buckets_log2, bucket_size), CL_MEM_READ_WRITE, NULL);
    index_list = context->createBuffer(calc_index_mem_usage(buckets_log2, bucket_size), CL_MEM_READ_WRITE, NULL);

    // Result buffers: 256 entries each for the two nonce lists, plus a single counter.
    nonce_a = context->createBuffer(256 * sizeof(cl_uint), CL_MEM_WRITE_ONLY, NULL);
    nonce_b = context->createBuffer(256 * sizeof(cl_uint), CL_MEM_WRITE_ONLY, NULL);
    nonce_qty = context->createBuffer(sizeof(cl_uint), CL_MEM_READ_WRITE, NULL);

    q = context->createCommandQueue(device);
}