Ejemplo n.º 1
0
/**
 * Prepares the V3 momentum solver for one OpenCL device.
 *
 * Checks that the device index exists, builds the program from the three
 * source files under opencl/, creates the command queue and the device
 * buffers, and binds the arguments of the cleanup and SHA-512 kernels.
 *
 * @param _HASH_BITS   log2 of the number of uint32_t slots in the hash table buffer.
 * @param _device_num  index of the device as known to OpenCLMain.
 */
OpenCLMomentumV3::OpenCLMomentumV3(int _HASH_BITS, int _device_num) {
	max_threads = 1<<30; // very big
	HASH_BITS = _HASH_BITS;
	device_num = _device_num;

	OpenCLMain& main = OpenCLMain::getInstance();

	// checks if device exists (a negative index can never be valid)
	if (_device_num < 0 || main.getNumDevices() <= device_num) {
		printf("ERROR: DEVICE %d does not exist. Please limit your threads to one per device.\n", device_num);
		// NOTE: assert() is compiled out when NDEBUG is defined, so release
		// builds continue past this point.
		assert(false);
	}

	// compiles
	fprintf(stdout, "Starting OpenCLMomentum V3\n");
	fprintf(stdout, "Device %02d: %s\n", device_num, main.getDevice(device_num)->getName().c_str());
	cl_ulong maxWorkGroupSize = main.getDevice(device_num)->getMaxWorkGroupSize();
	// cast so the argument always matches %llu, whatever cl_ulong is typedef'd to
	fprintf(stdout, "Max work group size: %llu\n", static_cast<unsigned long long>(maxWorkGroupSize));

	if (maxWorkGroupSize < max_threads) max_threads = maxWorkGroupSize;

	OpenCLContext *context = main.getDevice(device_num)->getContext();
	std::vector<std::string> program_filenames;
	program_filenames.push_back("opencl/opencl_cryptsha512.h");
	program_filenames.push_back("opencl/cryptsha512_kernel.cl");
	program_filenames.push_back("opencl/OpenCLMomentumV3.cl");
	OpenCLProgram *program = context->loadProgramFromFiles(program_filenames);

	// prealoc kernels
	OpenCLKernel *kernel = program->getKernel("kernel_sha512");
	OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

	// only one queue, helps with memory leaking
	queue = context->createCommandQueue(main.getDevice(device_num));

	// allocate internal structure
	// The slot count is shifted in size_t so that large HASH_BITS values do
	// not overflow a signed int before the multiplication.
	const size_t hash_table_slots = static_cast<size_t>(1) << HASH_BITS;
	cl_message = context->createBuffer(sizeof(uint8_t)*32, CL_MEM_READ_ONLY, NULL);
	internal_hash_table = context->createBuffer(sizeof(uint32_t)*hash_table_slots, CL_MEM_READ_WRITE, NULL);
	temp_collisions = context->createBuffer(sizeof(collision_struct)*getCollisionCeiling(), CL_MEM_WRITE_ONLY, NULL);
	temp_collisions_count = context->createBuffer(sizeof(size_t), CL_MEM_READ_WRITE, NULL);

	// sets args
	kernel_cleanup->resetArgs();
	kernel_cleanup->addGlobalArg(internal_hash_table);

	kernel->resetArgs();
	kernel->addGlobalArg(cl_message);
	kernel->addGlobalArg(internal_hash_table);
	const uint32_t ht_size = static_cast<uint32_t>(1) << HASH_BITS;
	kernel->addScalarUInt(ht_size);
	kernel->addGlobalArg(temp_collisions);
	kernel->addGlobalArg(temp_collisions_count);

}
Ejemplo n.º 2
0
/**
 * Runs one collision search for `message` on the device and copies the
 * results back to the host.
 *
 * The host pre-hashes 4 zero bytes followed by the 32-byte message with
 * SHA-512, uploads the prepared block, clears the hash table, runs
 * kernel_sha512 and then reads back the collision count and the collision
 * buffer. The call returns after queue->finish().
 *
 * @param message          32 bytes of input (exactly 32 bytes are read).
 * @param collisions       host buffer receiving getCollisionCeiling()
 *                         collision_struct entries; must be at least that large.
 * @param collision_count  receives the counter read back from the device.
 */
void OpenCLMomentumV9::find_collisions(uint8_t* message, collision_struct* collisions, size_t* collision_count) {

    // This value is uploaded as the initial device-side counter further down.
    *collision_count = 0;
    SHA512_Context c512_avxsse;

    SHA512_Init(&c512_avxsse);
    // midhash = 4 zero bytes followed by the message. The prefix is cleared
    // with memset instead of a store through a casted uint32_t pointer, which
    // is not valid on a byte array with no alignment guarantee.
    uint8_t midhash[32+4];
    memset(midhash, 0, 4);
    memcpy(midhash+4, message, 32);
    SHA512_Update_Simple(&c512_avxsse, midhash, 32+4);
    SHA512_PreFinal(&c512_avxsse);

    // Zero the first 32-bit word of the prepared block, then byte-swap the
    // 64-bit words 1..4 before the block is uploaded.
    memset(&c512_avxsse.buffer.bytes[0], 0, sizeof(uint32_t));
    uint64_t * swap_helper = (uint64_t*)(&c512_avxsse.buffer.bytes[0]);
    for (int i = 1; i < 5; i++) {
        swap_helper[i] = SWAP64(swap_helper[i]);
    }

    OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext();
    OpenCLProgram *program = context->getProgram(0);

    OpenCLKernel *kernel = program->getKernel("kernel_sha512");
    OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

    // both kernels are dereferenced below
    assert(kernel != NULL);
    assert(kernel_cleanup != NULL);

    //size_t BLOCKSIZE = main.getPlatform(0)->getDevice(0)->getMaxWorkGroupSize();
    size_t BLOCKSIZE = kernel->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num));
    //has to be a power of 2
    BLOCKSIZE = 1<<log2(BLOCKSIZE);
    size_t BLOCKSIZE_CLEAN = kernel_cleanup->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num));
    BLOCKSIZE_CLEAN = 1<<log2(BLOCKSIZE_CLEAN);

//	printf("BLOCKSIZE = %ld\n", BLOCKSIZE);
//	printf("BLOCKSIZE_CLEAN = %ld\n", BLOCKSIZE_CLEAN);

    // cleans up the hash table
    queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, BLOCKSIZE_CLEAN);

    // upload the prepared SHA-512 block and the zeroed counter
    queue->enqueueWriteBuffer(cl_message, c512_avxsse.buffer.bytes, sizeof(uint8_t)*SHA512_BLOCK_SIZE);
    queue->enqueueWriteBuffer(temp_collisions_count, collision_count, sizeof(size_t));

    queue->enqueueKernel1D(kernel, MAX_MOMENTUM_NONCE/8, BLOCKSIZE);
    queue->enqueueReadBuffer(temp_collisions_count, collision_count, sizeof(size_t));
    queue->enqueueReadBuffer(temp_collisions, collisions, sizeof(collision_struct)*getCollisionCeiling());
    queue->finish();


}
Ejemplo n.º 3
0
	virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
	                          vector<OpenCLProgram*> &programs)
	{
		path_trace_program.add_kernel(ustring("path_trace"));
		programs.push_back(&path_trace_program);
		return true;
	}
Ejemplo n.º 4
0
	/* Destructor: stops the task pool, releases the data-init program and
	 * destroys the split kernel object held in split_kernel. */
	~OpenCLDeviceSplitKernel()
	{
		/* Stop the task pool before anything it may use is released
		 * (presumably so no task is still running -- TODO confirm). */
		task_pool.stop();

		/* Release kernels */
		program_data_init.release();

		/* split_kernel is a raw pointer deleted here, so this object is
		 * treated as its owner. */
		delete split_kernel;
	}
Ejemplo n.º 5
0
	/* Builds the two programs owned by this device (data init and state
	 * buffer size), registers one kernel on each and appends both to
	 * `programs`.
	 *
	 * When OpenCLInfo::use_single_program() is true both programs use the
	 * name "split" and the source file "kernel_split.cl"; otherwise each one
	 * gets its own name and source file.
	 *
	 * Returns whatever split_kernel->load_kernels() returns. */
	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
	                          vector<OpenCLDeviceBase::OpenCLProgram*> &programs)
	{
		const bool use_single = OpenCLInfo::use_single_program();

		/* Data init program. */
		const char *data_init_name = use_single ? "split" : "split_data_init";
		const char *data_init_file = use_single ? "kernel_split.cl" : "kernel_data_init.cl";
		program_data_init = OpenCLDeviceBase::OpenCLProgram(this,
		                                                    data_init_name,
		                                                    data_init_file,
		                                                    get_build_options(this, requested_features));
		program_data_init.add_kernel(ustring("path_trace_data_init"));
		programs.push_back(&program_data_init);

		/* State buffer size program. */
		const char *state_size_name = use_single ? "split" : "split_state_buffer_size";
		const char *state_size_file = use_single ? "kernel_split.cl" : "kernel_state_buffer_size.cl";
		program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this,
		                                                            state_size_name,
		                                                            state_size_file,
		                                                            get_build_options(this, requested_features));
		program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
		programs.push_back(&program_state_buffer_size);

		return split_kernel->load_kernels(requested_features);
	}
Ejemplo n.º 6
0
void OpenCLMomentumV3::find_collisions(uint8_t* message, collision_struct* collisions, size_t* collision_count) {

	// temp storage
	*collision_count = 0;

	OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext();
	OpenCLProgram *program = context->getProgram(0);

	OpenCLKernel *kernel = program->getKernel("kernel_sha512");
	OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

	assert(kernel != NULL);

	//size_t BLOCKSIZE = main.getPlatform(0)->getDevice(0)->getMaxWorkGroupSize();
	size_t BLOCKSIZE = kernel->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num));
	//has to be a power of 2
	BLOCKSIZE = 1<<log2(BLOCKSIZE);
	size_t BLOCKSIZE_CLEAN = kernel_cleanup->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num));
	BLOCKSIZE_CLEAN = 1<<log2(BLOCKSIZE_CLEAN);

//	printf("BLOCKSIZE = %ld\n", BLOCKSIZE);
//	printf("BLOCKSIZE_CLEAN = %ld\n", BLOCKSIZE_CLEAN);

	// cleans up the hash table
	queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, BLOCKSIZE_CLEAN);

	queue->enqueueWriteBuffer(cl_message, message, sizeof(uint8_t)*32);
	queue->enqueueWriteBuffer(temp_collisions_count, collision_count, sizeof(size_t));

	queue->enqueueKernel1D(kernel, MAX_MOMENTUM_NONCE/8, BLOCKSIZE);
	queue->enqueueReadBuffer(temp_collisions_count, collision_count, sizeof(size_t));
	queue->enqueueReadBuffer(temp_collisions, collisions, sizeof(collision_struct)*getCollisionCeiling());
	queue->finish();


}
Ejemplo n.º 7
0
/**
 * Runs one V8 collision search on the device in three kernel passes.
 *
 * The host pre-hashes 4 zero bytes followed by the 32-byte message with
 * SHA-512 and uploads the prepared block. The device then clears the hash
 * table and runs calculate_all_hashes, fill_table and find_collisions in that
 * order. The collision count and the collision buffer are read back, and the
 * call returns after queue->finish().
 *
 * @param message    32 bytes of input (exactly 32 bytes are read).
 * @param out_buff   host buffer receiving getCollisionCeiling()
 *                   collision_struct entries; must be at least that large.
 * @param out_count  receives the counter read back from the device.
 */
void OpenCLMomentumV8::find_collisions(uint8_t* message, collision_struct* out_buff, size_t* out_count) {

	// This value is uploaded as the initial device-side counter before step 3.
	*out_count = 0;
	SHA512_Context c512_avxsse;

	SHA512_Init(&c512_avxsse);
	// midhash = 4 zero bytes followed by the message. The prefix is cleared
	// with memset instead of a store through a casted uint32_t pointer, which
	// is not valid on a byte array with no alignment guarantee.
	uint8_t midhash[32+4];
	memset(midhash, 0, 4);
	memcpy(midhash+4, message, 32);
	SHA512_Update_Simple(&c512_avxsse, midhash, 32+4);
	SHA512_PreFinal(&c512_avxsse);

	// Zero the first 32-bit word of the prepared block, then byte-swap the
	// 64-bit words 1..4 before the block is uploaded.
	memset(&c512_avxsse.buffer.bytes[0], 0, sizeof(uint32_t));
	uint64_t * swap_helper = (uint64_t*)(&c512_avxsse.buffer.bytes[0]);
	for (int i = 1; i < 5; i++) {
		swap_helper[i] = SWAP64(swap_helper[i]);
	}

	OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext();
	OpenCLProgram *program = context->getProgram(0);

	OpenCLKernel *kernel_calculate_all_hashes = program->getKernel("calculate_all_hashes");
	OpenCLKernel *kernel_fill_table = program->getKernel("fill_table");
	OpenCLKernel *kernel_find_collisions = program->getKernel("find_collisions");
	OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

	OpenCLDevice * device = OpenCLMain::getInstance().getDevice(device_num);

	// cleans up the hash table
	size_t kc_wgsize = kernel_cleanup->getWorkGroupSize(device);
	kc_wgsize = 1<<log2(kc_wgsize);
	queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, kc_wgsize);

//	printf("Cleaning the HT\n");
//	queue->finish();

	queue->enqueueWriteBuffer(cl_message, c512_avxsse.buffer.bytes, sizeof(uint8_t)*SHA512_BLOCK_SIZE);
	// step 1, calculate hashes
	size_t kcah_wgsize = kernel_calculate_all_hashes->getWorkGroupSize(device);
	kcah_wgsize = 1<<log2(kcah_wgsize);
	queue->enqueueKernel1D(kernel_calculate_all_hashes, MAX_MOMENTUM_NONCE/8,
			kcah_wgsize);

	// Disabled debugging aid: reads the hashes back and compares them with a
	// host-side SHA-512. Kept for reference only.
//	uint64_t * apa = new uint64_t[MAX_MOMENTUM_NONCE];
//	queue->enqueueReadBuffer(hashes, apa, sizeof(uint64_t)*MAX_MOMENTUM_NONCE);
//	queue->finish();
//
//	printf("testing hashes\n");
//	uint64_t count = 0;
//	for (int i = 0; i < MAX_MOMENTUM_NONCE; i++) {
//		if (apa[i] == 0) {
//			count++;
//			printf("BAD HASH AT: %d %X\n", i, apa[i]);
//		}
//	}
//	printf("counted %X bad hashes\n", count);
//	printf("NOW REALLY TEST THEM hashes\n");
//	count = 0;
//	for (uint32_t i = 0; i < MAX_MOMENTUM_NONCE/8; i+=8) {
//		sph_sha512_context c512_sph; //SPH
//		sph_sha512_init(&c512_sph);
//		sph_sha512(&c512_sph, &i, 4);
//		sph_sha512(&c512_sph, message, 32);
//		uint64_t out[8];
//		sph_sha512_close(&c512_sph, out);
//		for (int j =0; j < 8; j++) {
//			if (apa[i+j] != out[j]) {
//				count++;
//				uint64_t xxx = apa[i+j];
//				printf("BAD HASH AT: %d => %X != %X\n", i, apa[i+j], out[j]);
//			}
//		}
//	}
//	printf("counted %X bad hashes\n", count);

	// step 2, populate hashtable
	size_t kft_wgsize = kernel_fill_table->getWorkGroupSize(device);
	kft_wgsize = 1<<log2(kft_wgsize);
	queue->enqueueKernel1D(kernel_fill_table, MAX_MOMENTUM_NONCE,
							kft_wgsize);

//	printf("step 2, populate hashtable\n");
//	queue->finish();

	queue->enqueueWriteBuffer(collisions_count, out_count, sizeof(size_t));
	// step 3, find collisions
	size_t kfc_wgsize = kernel_find_collisions->getWorkGroupSize(device);
	kfc_wgsize = 1<<log2(kfc_wgsize);
	queue->enqueueKernel1D(kernel_find_collisions, MAX_MOMENTUM_NONCE,
							kfc_wgsize);

//	printf("step 3, find collisions\n");
//	queue->finish();

	queue->enqueueReadBuffer(collisions_count, out_count, sizeof(size_t));
	queue->enqueueReadBuffer(collisions, out_buff, sizeof(collision_struct)*getCollisionCeiling());

//	printf("step 4, copy output\n");
	queue->finish();


#ifdef DEBUG
	// *out_count is a size_t, so it needs %zu rather than %d
	printf("Collision Count = %zu\n", (*out_count));
#endif

}
Ejemplo n.º 8
0
/// Sample entry point: initializes the OpenCL runtime, parses the Mandelbrot
/// program, prints its AST and builds it with the OpenCL compiler.
///
/// Returns -1 if the OpenCL runtime cannot be initialized, 0 otherwise.
/// The section under `#if 0` is the disabled OpenGL version of the sample
/// and is not compiled.
int main()
{
    if(OpenCLRuntime::Initialize() != SICKL_SUCCESS)
    {
        printf("Could not create OpenCL Context\n");
        return -1;
    }

    Mandelbrot mbrot;

    mbrot.Parse();

    /// Prints the AST generated from the Mandelbrot source
    mbrot.GetRoot().Print();

    OpenCLProgram program;
    OpenCLCompiler::Build(mbrot, program);

#if 0

	// init GLEW/GLUT and other gl setup
    if(!OpenGLRuntime::Initialize())
    {
        printf("Could not create OpenGL Context\n");
        return -1;
    }

	OpenGLCompiler comp;

	Mandelbrot mbrot;
	mbrot.Parse();
	/// Prints the AST generated from the Mandelbrot source

	mbrot.GetRoot().Print();

	/// Compile our OpenGL program
	OpenGLProgram* program = comp.Build(mbrot);

	/// Print the generated GLSL source
	printf("%s\n", program->GetSource().c_str());

	
	const uint32_t width = 350 * 5;
	const uint32_t height = 200 * 5;
	const uint32_t colors = mbrot.max_iterations;

	/// Generate the color table (a nice gold)
	float* color_map_data = new float[3 * colors];
	for(uint32_t i = 0; i < colors; i++)
	{
		float x = i/(float)colors;
		color_map_data[3 * i + 0] = 191.0f / 255.0f * (1.0f - x);
		color_map_data[3 * i + 1] = 125.0f / 255.0f * (1.0f - x);
		color_map_data[3 * i + 2] = 37.0f / 255.0f * (1.0f - x);
	}

	/// put it int a 1d buffer
	OpenGLBuffer1D color_map(colors, ReturnType::Float3, color_map_data);

	/// our output buffer
	OpenGLBuffer2D result(width, height, ReturnType::Float3, nullptr);
	OpenGLBuffer2D copy(width, height, ReturnType::Float3, nullptr);

	/// initialize our program
	program->Initialize(width, height);

	/// get our binding locations for each of the program input and outputs
	input_t min_loc = program->GetInputHandle("min");
	input_t max_loc = program->GetInputHandle("max");
	input_t color_map_loc = program->GetInputHandle("color_map");

	output_t output_loc = program->GetOutputHandle("output");

	/// sets min values
	program->SetInput(min_loc, -2.5f, -1.0f);
	/// sets max values
	program->SetInput(max_loc, 1.0f, 1.0f);
	/// set the scaler
	program->SetInput(color_map_loc, color_map);

	/// sets the render location
	program->BindOutput(output_loc, result);

	/// Runs the program
	program->Run();

    /// We can copy our data to the second buffer
    copy.SetData(result);

    float* result_buffer = nullptr;
    /// We can either read result back from the texture
    copy.GetData(result_buffer);

	/// Or from the framebuffer (which is faster on nvidia hardware at least)
    program->GetOutput(output_loc, result_buffer);

	/// Finally, dump the image to a Bitmap to view
	BMP image;
	image.SetSize(width, height);

	for(uint32_t i = 0; i < height; i++)
	{
		for(uint32_t j = 0; j < width; j++)
		{
			float red = result_buffer[i * width * 3 + j * 3 + 0];
			float green = result_buffer[i * width * 3 + j * 3 + 1];
			float blue = result_buffer[i * width * 3 + j * 3 + 2];

			auto pixel = image(j,i);
			pixel->Red = (uint8_t)(red * 255);
			pixel->Green = (uint8_t)(green * 255);
			pixel->Blue = (uint8_t)(blue * 255);
		}
	}

	image.WriteToFile("result.bmp");

	/// Cleanup

	free(result_buffer);
	delete program;

    OpenGLRuntime::Finalize();
#endif

    return 0;
}
Ejemplo n.º 9
0
/**
 * Configures and builds the Protoshare OpenCL miner for one device.
 *
 * Steps, in order:
 *  1. Looks up the device and dumps its information.
 *  2. Takes work group size, bucket count, vector type, bucket size and
 *     memory target from commandlineInput. If neither bucket size nor memory
 *     target is set, the device's global memory size is used as the target.
 *  3. If a memory target is in effect, picks the largest bucket_size
 *     (starting from 1024) whose total memory usage does not exceed it.
 *  4. Checks the configuration against the device's maximum allocation size
 *     and global memory size.
 *  5. Compiles the program and creates the kernels, buffers and command queue.
 *
 * Any configuration error is reported on stdout and terminates the process
 * with a non-zero exit status.
 *
 * @param _device_num  index of the device as known to OpenCLMain.
 */
ProtoshareOpenCL::ProtoshareOpenCL(int _device_num)
{
    this->device_num = _device_num;

    printf("Initializing GPU %d\n", device_num);
    OpenCLMain &main = OpenCLMain::getInstance();
    this->device = main.getDevice(device_num);


    printf("======================================================================\n");
    printf("Device information for: %s\n", device->getName().c_str());
    device->dumpDeviceInfo(); // Makes troubleshooting easier
    printf("======================================================================\n");
    printf("\n");

    // Sanitize input parameters
    if (commandlineInput.wgs == 0) {
        this->wgs = device->getMaxWorkGroupSize();
    } else {
        this->wgs = commandlineInput.wgs;
    }

    this->buckets_log2 = commandlineInput.buckets_log2;
    this->vect_type = commandlineInput.vect_type;
    this->bucket_size = commandlineInput.bucket_size;
    this->target_mem = commandlineInput.target_mem;

    // If bucket size unset and target memory unset, use maximum usable memory
    if (target_mem == 0 && bucket_size == 0) {
        target_mem = device->getGlobalMemSize() / 1024 / 1024;
    }

    // If set, convert target memory into a usable value for bucket_size
    if (target_mem > 0) {
        // Convert the target from MB to bytes. This is done in 64 bits because
        // a 32-bit product wraps to 0 for targets of 4096 MB and above, which
        // would make every configuration look too large.
        const cl_ulong target_mem_bytes = static_cast<cl_ulong>(target_mem) * 1024 * 1024;

        // Lazy calculation, assume large bucket_size, scale back from there
        bucket_size = 1024;

        while (bucket_size > 0 && calc_total_mem_usage(buckets_log2, bucket_size) > target_mem_bytes) { bucket_size--; }

        // Make sure the parameter configuration is sane:
        if (bucket_size < 1) {
            printf("ERROR: Memory target of %d MB cannot be attained with 2^%d buckets!\n", target_mem, buckets_log2);
            printf("       Please lower the value of \"-b\" or increase the value of \"-m\".\n");
            exit(1);
        }
    }


    // Make sure we can allocate hash_list (cannot violate CL_DEVICE_MAX_MEM_ALLOC_SIZE)
    // The cl_ulong sizes below are cast so that they match the %llu conversions.
    cl_ulong required_mem = calc_hash_mem_usage(buckets_log2, bucket_size);
    cl_ulong available_mem = device->getMaxMemAllocSize();
    if (required_mem > available_mem) {
        printf("ERROR: Device %d cannot allocate 2^%d buckets of %d elements!\n", device_num, buckets_log2, bucket_size);
        printf("       CL_DEVICE_MAX_MEM_ALLOC_SIZE is %llu MB, this configuration requires %llu MB\n",
               static_cast<unsigned long long>(available_mem / 1024 / 1024),
               static_cast<unsigned long long>(required_mem / 1024 / 1024));
        printf("       Please lower the value of \"-b\" or \"-s\" or increase the value of \"-m\".\n");
        exit(1);
    }


    // Make sure we can allocate nonce_map (cannot violate CL_DEVICE_MAX_MEM_ALLOC_SIZE)
    required_mem = calc_index_mem_usage(buckets_log2, bucket_size);
    available_mem = device->getMaxMemAllocSize();
    if (required_mem > available_mem) {
        printf("ERROR: Device %d cannot allocate index of 2^%d elements!\n", device_num, buckets_log2);
        printf("       CL_DEVICE_MAX_MEM_ALLOC_SIZE is %llu MB, this configuration requires %llu MB\n",
               static_cast<unsigned long long>(available_mem / 1024 / 1024),
               static_cast<unsigned long long>(required_mem / 1024 / 1024));
        printf("       Please lower the value of \"-b\" or increase the value of \"-m\".\n");
        exit(1);
    }


    // Make sure the whole thing fits in memory
    required_mem = calc_total_mem_usage(buckets_log2, bucket_size);
    available_mem = device->getGlobalMemSize();
    if (required_mem > available_mem) {
        printf("ERROR: Device %d cannot store 2^%d buckets of %d elements!\n", device_num, buckets_log2, bucket_size);
        printf("       CL_DEVICE_GLOBAL_MEM_SIZE is %llu MB, this configuration requires %llu MB\n",
               static_cast<unsigned long long>(available_mem / 1024 / 1024),
               static_cast<unsigned long long>(required_mem / 1024 / 1024));
        printf("       Please lower the value of \"-b\" or \"-s\" or increase the value of \"-m\".\n");
        exit(1);
    }


    // All clear, show the running parameters!
    printf("Using %d work group size\n", wgs);
    printf("Using vector size %d\n", vect_type);
    printf("Using 2^%d buckets\n", buckets_log2);
    printf("Using %d elements per bucket\n", bucket_size);
    printf("Using %llu MB of memory\n", static_cast<unsigned long long>(required_mem / 1024 / 1024));
    printf("Estimated drop percentage: %5.2f%%\n", 100 * poisson_estimate((1 << buckets_log2), MAX_MOMENTUM_NONCE, bucket_size));
    printf("\n");


    // Compile the OpenCL code
    printf("Compiling OpenCL code... this may take 3-5 minutes\n");


    bool isGPU = device->isGPU();
    if (!isGPU) { gpu_watchdog_max_wait *= 6; } // Effectively disable the watchdog

    // Build options: the chosen configuration is passed to the kernels as macros
    std::stringstream params;
    params << " -I ./opencl/";
    params << " -D DEVICE_GPU=" << (isGPU ? 1 : 0);
    params << " -D VECT_TYPE=" << vect_type;
    params << " -D LOCAL_WGS=" << wgs;
    params << " -D NUM_BUCKETS_LOG2=" << buckets_log2;
    params << " -D BUCKET_SIZE=" << bucket_size;

#ifdef USE_SOURCE
    // Load the kernel source from disk
    std::vector<std::string> file_list;
    file_list.push_back("opencl/momentum.cl");
    OpenCLProgram* program = device->getContext()->loadProgramFromFiles(file_list, params.str());
#else
    // Use the kernel source returned by getMomentumOpenCL()
    std::vector<std::string> input_src;
    input_src.push_back(getMomentumOpenCL());
    OpenCLProgram* program = device->getContext()->loadProgramFromStrings(input_src, params.str());
#endif

    kernel_hash   = program->getKernel("hash_step");
    kernel_reset  = program->getKernel("reset_and_seek");

    mid_hash = device->getContext()->createBuffer(32 * sizeof(cl_uint), CL_MEM_READ_ONLY, NULL);

    hash_list  = device->getContext()->createBuffer(calc_hash_mem_usage(buckets_log2, bucket_size), CL_MEM_READ_WRITE, NULL);
    index_list = device->getContext()->createBuffer(calc_index_mem_usage(buckets_log2, bucket_size), CL_MEM_READ_WRITE, NULL);

    nonce_a = device->getContext()->createBuffer(256 * sizeof(cl_uint), CL_MEM_WRITE_ONLY, NULL);
    nonce_b = device->getContext()->createBuffer(256 * sizeof(cl_uint), CL_MEM_WRITE_ONLY, NULL);
    nonce_qty = device->getContext()->createBuffer(sizeof(cl_uint), CL_MEM_READ_WRITE, NULL);

    q = device->getContext()->createCommandQueue(device);
}
Ejemplo n.º 10
0
	/* Destructor: stops the task pool, then releases the path trace program. */
	~OpenCLDeviceMegaKernel()
	{
		/* Stop the task pool before the program is released (presumably so
		 * no task is still using it -- TODO confirm). */
		task_pool.stop();
		path_trace_program.release();
	}
Ejemplo n.º 11
0
	/* Destructor: stops the task pool, releases every split kernel program,
	 * releases the device memory objects owned by this device and finally
	 * frees the host-side ray state array. */
	~OpenCLDeviceSplitKernel()
	{
		/* Stop the task pool before any resource is released (presumably so
		 * no task is still using them -- TODO confirm). */
		task_pool.stop();

		/* Release kernels */
		program_data_init.release();
		program_scene_intersect.release();
		program_lamp_emission.release();
		program_queue_enqueue.release();
		program_background_buffer_update.release();
		program_shader_eval.release();
		program_holdout_emission_blurring_pathtermination_ao.release();
		program_direct_lighting.release();
		program_shadow_blocked.release();
		program_next_iteration_setup.release();
		program_sum_all_radiance.release();

		/* Release global memory */
		/* NOTE(review): release_mem_object_safe is defined elsewhere; the
		 * name suggests it tolerates objects that were never allocated --
		 * confirm. */
		release_mem_object_safe(rng_coop);
		release_mem_object_safe(throughput_coop);
		release_mem_object_safe(L_transparent_coop);
		release_mem_object_safe(PathRadiance_coop);
		release_mem_object_safe(Ray_coop);
		release_mem_object_safe(PathState_coop);
		release_mem_object_safe(Intersection_coop);
		release_mem_object_safe(kgbuffer);
		release_mem_object_safe(sd);
		release_mem_object_safe(sd_DL_shadow);
		release_mem_object_safe(ray_state);
		release_mem_object_safe(AOAlpha_coop);
		release_mem_object_safe(AOBSDF_coop);
		release_mem_object_safe(AOLightRay_coop);
		release_mem_object_safe(BSDFEval_coop);
		release_mem_object_safe(ISLamp_coop);
		release_mem_object_safe(LightRay_coop);
		release_mem_object_safe(Intersection_coop_shadow);
#ifdef WITH_CYCLES_DEBUG
		/* Only released in builds with WITH_CYCLES_DEBUG defined. */
		release_mem_object_safe(debugdata_coop);
#endif
		release_mem_object_safe(use_queues_flag);
		release_mem_object_safe(Queue_data);
		release_mem_object_safe(Queue_index);
		release_mem_object_safe(work_array);
#ifdef __WORK_STEALING__
		/* Only released in builds with __WORK_STEALING__ defined. */
		release_mem_object_safe(work_pool_wgs);
#endif
		release_mem_object_safe(per_sample_output_buffers);

		/* Host-side buffer released with free(), so it is presumably
		 * allocated with the malloc family -- verify at the allocation site. */
		if(hostRayStateArray != NULL) {
			free(hostRayStateArray);
		}
	}