コード例 #1
0
ファイル: al_OpenCLImage2D.cpp プロジェクト: LuaAV/LuaAV
void OpenCLImage2D :: create(
	OpenCLContext &ctx, 
	cl_mem_flags usage, 
	AlloArray *array
) {
	destroy();
	detach();

	usage = OpenCLMemoryBuffer::check_memory_flags(usage, array->data.ptr);
	
	bool at_least_2d = array->header.dimcount >= 2;
	size_t width = array->header.dim[0];
	size_t height = at_least_2d ? array->header.dim[1] : 1;
	size_t rowstride = at_least_2d ? array->header.stride[1] : allo_array_size(array);
	cl_image_format format = OpenCLImageFormat::format_from_array(array);
	
	cl_int res = CL_SUCCESS;
	cl_mem mem = clCreateImage2D(
		ctx.get_context(),
		usage,
		&format,
		width,
		height,
		rowstride,
		array->data.ptr,
		&res
	);
	
	if(opencl_error(res, "clCreateImage2D error creating buffer")) {
		return;
	}
	
	mMem = mem;
	ctx.attach_resource(this);
}
コード例 #2
0
ファイル: al_OpenCLImage2D.cpp プロジェクト: LuaAV/LuaAV
void OpenCLImage2D :: create(
	OpenCLContext &ctx, 
	cl_mem_flags usage, 
	const cl_image_format *format, 
	size_t width, 
	size_t height, 
	size_t rowstride, 
	void *ptr
) {
	destroy();
	detach();

	usage = OpenCLMemoryBuffer::check_memory_flags(usage, ptr);
	
	cl_int res = CL_SUCCESS;
	cl_mem mem = clCreateImage2D(
		ctx.get_context(),
		usage,
		format,
		width,
		height,
		rowstride,
		ptr,
		&res
	);
	
	if(opencl_error(res, "clCreateImage2D error creating buffer")) {
		return;
	}
	
	mMem = mem;
	ctx.attach_resource(this);
}
コード例 #3
0
ファイル: fftclfft.cpp プロジェクト: aveminus/freq
void FftClFft::
        compute( Tfr::ChunkData::Ptr input, Tfr::ChunkData::Ptr output, FftDirection direction )
{
    TIME_STFT TaskTimer tt("Fft ClFft");

    unsigned n = input->getNumberOfElements().width;
    unsigned N = output->getNumberOfElements().width;

    if (-1 != direction)
        EXCEPTION_ASSERT( n == N );

    {
        TIME_STFT TaskTimer tt("Computing fft(N=%u, n=%u, direction=%d)", N, n, direction);
        OpenCLContext *opencl = &OpenCLContext::Singleton();
        cl_int fft_error;

        clFFT_Plan plan = CLFFTKernelBuffer::Singleton().getPlan(opencl->getContext(), n, fft_error);
        if (fft_error != CL_SUCCESS)
            throw std::runtime_error("Could not create clFFT compute plan.");

        // Run the fft in OpenCL :)
        // fft kernel needs to have read/write access to output data
        fft_error |= clFFT_ExecuteInterleaved(
                opencl->getCommandQueue(),
                plan, 1, (clFFT_Direction)direction,
                OpenClMemoryStorage::ReadOnly<1>( input ).ptr(),
                OpenClMemoryStorage::ReadWrite<1>( output ).ptr(),
                0, NULL, NULL );

        if (fft_error != CL_SUCCESS)
            throw std::runtime_error("Bad stuff happened during FFT computation.");
    }
}
コード例 #4
0
ファイル: fftclfft.cpp プロジェクト: aveminus/freq
void FftClFft::
        compute( Tfr::ChunkData::Ptr input, Tfr::ChunkData::Ptr output, DataStorageSize n, FftDirection direction )
{
    TaskTimer tt("Stft::computeWithClFft( matrix[%d, %d], %s )",
                 input->size().width,
                 input->size().height,
                 direction==FftDirection_Forward?"forward":"backward");

    EXCEPTION_ASSERT( output->numberOfBytes() == input->numberOfBytes() );

    const int batchSize = n.height;

    OpenCLContext *opencl = &OpenCLContext::Singleton();
    cl_int fft_error;

    clFFT_Plan plan = CLFFTKernelBuffer::Singleton().getPlan(opencl->getContext(), n.width, fft_error);
    if(fft_error != CL_SUCCESS)
        throw std::runtime_error("Could not create clFFT compute plan.");

    {
        TaskTimer tt("Calculating batches");

        // Run the fft in OpenCL :)
        fft_error |= clFFT_ExecuteInterleaved(
                opencl->getCommandQueue(),
                plan, batchSize, direction==FftDirection_Forward?clFFT_Forward:clFFT_Inverse,
                OpenClMemoryStorage::ReadOnly<1>( input ).ptr(),
                OpenClMemoryStorage::ReadWrite<1>( output ).ptr(),
                0, NULL, NULL );
        if(fft_error != CL_SUCCESS)
            throw std::runtime_error("Bad stuff happened during FFT computation.");
    }
}
コード例 #5
0
ファイル: al_OpenCLCommandQueue.cpp プロジェクト: LuaAV/LuaAV
void OpenCLCommandQueue :: create(OpenCLContext &ctx, const OpenCLDevice &dev, bool ordered, bool profiling) {
	destroy();
	detach();
	
	cl_command_queue_properties properties = 0;
	if(profiling) {
		properties |= CL_QUEUE_PROFILING_ENABLE;
	}
	if(! ordered) {
		if(GET_FLAG(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, dev.get_queue_properties())) {
			properties |= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
		}
		else {
			opencl_error(USER_OPENCL_ERROR, "Device doesn't support out of order execution ... disabling");
		}
	}
	
	cl_int res = CL_SUCCESS;
	cl_command_queue command_queue = clCreateCommandQueue(
		ctx.get_context(),
		dev.get_device(),
		properties,
		&res
	);
	
	if(opencl_error(res, "clCreateCommandQueue error creating command queue")) {
		return;
	}
	
	mCommandQueue = command_queue;
	ctx.attach_resource(this);
}
コード例 #6
0
ファイル: OpenCLSort.cpp プロジェクト: hainm/openmm
OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
            dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
    // Create kernels.

    std::map<std::string, std::string> replacements;
    replacements["DATA_TYPE"] = trait->getDataType();
    replacements["KEY_TYPE"] =  trait->getKeyType();
    replacements["SORT_KEY"] = trait->getSortKey();
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
    replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
    shortListKernel = cl::Kernel(program, "sortShortList");
    computeRangeKernel = cl::Kernel(program, "computeRange");
    assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
    computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
    copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets");
    sortBucketsKernel = cl::Kernel(program, "sortBuckets");

    // Work out the work group sizes for various kernels.

    unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
    int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
    unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
    // On Qualcomm's OpenCL, it's essential to check against maxShortListSize.  Otherwise you get a crash.
    // But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
    // maximum, so including the check hurts performance.  For the moment I'm going to just comment it out.
    // If we officially support Qualcomm in the future, we'll need to do something better.
    isShortList = (length <= maxLocalBuffer/* && length < maxShortListSize*/);
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);
    sortKernelSize = (isShortList ? rangeKernelSize : rangeKernelSize/2);
    if (rangeKernelSize > length)
        rangeKernelSize = length;
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;
    unsigned int targetBucketSize = sortKernelSize/2;
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets < 1)
        numBuckets = 1;
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.

    if (!isShortList) {
        dataRange = new OpenCLArray(context, 2, trait->getKeySize(), "sortDataRange");
        bucketOffset = OpenCLArray::create<cl_uint>(context, numBuckets, "bucketOffset");
        bucketOfElement = OpenCLArray::create<cl_uint>(context, length, "bucketOfElement");
        offsetInBucket = OpenCLArray::create<cl_uint>(context, length, "offsetInBucket");
        buckets = new OpenCLArray(context, length, trait->getDataSize(), "buckets");
    }
}
コード例 #7
0
void OpenCLMomentumV9::find_collisions(uint8_t* message, collision_struct* collisions, size_t* collision_count) {

    // temp storage
    *collision_count = 0;
    uint32_t ht_size = 1<<HASH_BITS;
    SHA512_Context c512_avxsse;

    SHA512_Init(&c512_avxsse);
    uint8_t midhash[32+4];
    memcpy(midhash+4, message, 32);
    *((uint32_t*)midhash) = 0;
    SHA512_Update_Simple(&c512_avxsse, midhash, 32+4);
    SHA512_PreFinal(&c512_avxsse);

    *(uint32_t *)(&c512_avxsse.buffer.bytes[0]) = 0;
    uint64_t * swap_helper = (uint64_t*)(&c512_avxsse.buffer.bytes[0]);
    for (int i = 1; i < 5; i++) {
        swap_helper[i] = SWAP64(swap_helper[i]);
    }

    OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext();
    OpenCLProgram *program = context->getProgram(0);

    OpenCLKernel *kernel = program->getKernel("kernel_sha512");
    OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

    assert(kernel != NULL);

    //size_t BLOCKSIZE = main.getPlatform(0)->getDevice(0)->getMaxWorkGroupSize();
    size_t BLOCKSIZE = kernel->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num));
    //has to be a power of 2
    BLOCKSIZE = 1<<log2(BLOCKSIZE);
    size_t BLOCKSIZE_CLEAN = kernel_cleanup->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num));
    BLOCKSIZE_CLEAN = 1<<log2(BLOCKSIZE_CLEAN);

//	printf("BLOCKSIZE = %ld\n", BLOCKSIZE);
//	printf("BLOCKSIZE_CLEAN = %ld\n", BLOCKSIZE_CLEAN);

    // cleans up the hash table
    queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, BLOCKSIZE_CLEAN);

    queue->enqueueWriteBuffer(cl_message, c512_avxsse.buffer.bytes, sizeof(uint8_t)*SHA512_BLOCK_SIZE);
    queue->enqueueWriteBuffer(temp_collisions_count, collision_count, sizeof(size_t));

    queue->enqueueKernel1D(kernel, MAX_MOMENTUM_NONCE/8, BLOCKSIZE);
    queue->enqueueReadBuffer(temp_collisions_count, collision_count, sizeof(size_t));
    queue->enqueueReadBuffer(temp_collisions, collisions, sizeof(collision_struct)*getCollisionCeiling());
    queue->finish();


}
コード例 #8
0
ファイル: OpenCLSort.cpp プロジェクト: MrBitKoin/openmm
OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
            dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
    // Create kernels.

    std::map<std::string, std::string> replacements;
    replacements["DATA_TYPE"] = trait->getDataType();
    replacements["KEY_TYPE"] =  trait->getKeyType();
    replacements["SORT_KEY"] = trait->getSortKey();
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
    replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
    shortListKernel = cl::Kernel(program, "sortShortList");
    computeRangeKernel = cl::Kernel(program, "computeRange");
    assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
    computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
    copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets");
    sortBucketsKernel = cl::Kernel(program, "sortBuckets");

    // Work out the work group sizes for various kernels.

    unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
    int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
    isShortList = (length <= maxLocalBuffer);
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = rangeKernelSize;
    sortKernelSize = (isShortList ? rangeKernelSize : rangeKernelSize/2);
    if (rangeKernelSize > length)
        rangeKernelSize = length;
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;
    unsigned int targetBucketSize = sortKernelSize/2;
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets < 1)
        numBuckets = 1;
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.

    if (!isShortList) {
        dataRange = new OpenCLArray(context, 2, trait->getKeySize(), "sortDataRange");
        bucketOffset = OpenCLArray::create<cl_uint>(context, numBuckets, "bucketOffset");
        bucketOfElement = OpenCLArray::create<cl_uint>(context, length, "bucketOfElement");
        offsetInBucket = OpenCLArray::create<cl_uint>(context, length, "offsetInBucket");
        buckets = new OpenCLArray(context, length, trait->getDataSize(), "buckets");
    }
}
コード例 #9
0
OpenCLArray::OpenCLArray(OpenCLContext& context, int size, int elementSize, const std::string& name, cl_int flags) :
        context(context), size(size), elementSize(elementSize), name(name), ownsBuffer(true) {
    try {
        buffer = new cl::Buffer(context.getContext(), flags, size*elementSize);
    }
    catch (cl::Error err) {
        std::stringstream str;
        str<<"Error creating array "<<name<<": "<<err.what()<<" ("<<err.err()<<")";
        throw OpenMMException(str.str());
    }
}
コード例 #10
0
OpenCLMomentumV3::OpenCLMomentumV3(int _HASH_BITS, int _device_num) {
	max_threads = 1<<30; // very big
	HASH_BITS = _HASH_BITS;
	device_num = _device_num;

	OpenCLMain& main = OpenCLMain::getInstance();

	// checks if device exists
	if (main.getInstance().getNumDevices() <= device_num) {
		printf("ERROR: DEVICE %d does not exist. Please limit your threads to one per device.\n", device_num);
		assert(false);
	}

	// compiles
	fprintf(stdout, "Starting OpenCLMomentum V3\n");
	fprintf(stdout, "Device %02d: %s\n", device_num, main.getDevice(device_num)->getName().c_str());
	cl_ulong maxWorkGroupSize = main.getDevice(device_num)->getMaxWorkGroupSize();
	fprintf(stdout, "Max work group size: %llu\n", maxWorkGroupSize);

	if (maxWorkGroupSize < max_threads) max_threads = maxWorkGroupSize;

	OpenCLContext *context = main.getDevice(device_num)->getContext();
	std::vector<std::string> program_filenames;
	program_filenames.push_back("opencl/opencl_cryptsha512.h");
	program_filenames.push_back("opencl/cryptsha512_kernel.cl");
	program_filenames.push_back("opencl/OpenCLMomentumV3.cl");
	OpenCLProgram *program = context->loadProgramFromFiles(program_filenames);

	// prealoc kernels
	OpenCLKernel *kernel = program->getKernel("kernel_sha512");
	OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

	// only one queue, helps with memory leaking
	queue = context->createCommandQueue(main.getDevice(device_num));

	size_t BLOCKSIZE = max_threads;
	// allocate internal structure
	cl_message = context->createBuffer(sizeof(uint8_t)*32, CL_MEM_READ_ONLY, NULL);
	internal_hash_table = context->createBuffer(sizeof(uint32_t)*(1<<HASH_BITS), CL_MEM_READ_WRITE, NULL);
	temp_collisions = context->createBuffer(sizeof(collision_struct)*getCollisionCeiling(), CL_MEM_WRITE_ONLY, NULL);
	temp_collisions_count = context->createBuffer(sizeof(size_t), CL_MEM_READ_WRITE, NULL);

	// sets args
	kernel_cleanup->resetArgs();
	kernel_cleanup->addGlobalArg(internal_hash_table);

	kernel->resetArgs();
	kernel->addGlobalArg(cl_message);
	kernel->addGlobalArg(internal_hash_table);
	uint32_t ht_size = 1<<HASH_BITS;
	kernel->addScalarUInt(ht_size);
	kernel->addGlobalArg(temp_collisions);
	kernel->addGlobalArg(temp_collisions_count);

}
コード例 #11
0
void OpenCLMomentumV3::find_collisions(uint8_t* message, collision_struct* collisions, size_t* collision_count) {

	// temp storage
	*collision_count = 0;

	OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext();
	OpenCLProgram *program = context->getProgram(0);

	OpenCLKernel *kernel = program->getKernel("kernel_sha512");
	OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

	assert(kernel != NULL);

	//size_t BLOCKSIZE = main.getPlatform(0)->getDevice(0)->getMaxWorkGroupSize();
	size_t BLOCKSIZE = kernel->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num));
	//has to be a power of 2
	BLOCKSIZE = 1<<log2(BLOCKSIZE);
	size_t BLOCKSIZE_CLEAN = kernel_cleanup->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num));
	BLOCKSIZE_CLEAN = 1<<log2(BLOCKSIZE_CLEAN);

//	printf("BLOCKSIZE = %ld\n", BLOCKSIZE);
//	printf("BLOCKSIZE_CLEAN = %ld\n", BLOCKSIZE_CLEAN);

	// cleans up the hash table
	queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, BLOCKSIZE_CLEAN);

	queue->enqueueWriteBuffer(cl_message, message, sizeof(uint8_t)*32);
	queue->enqueueWriteBuffer(temp_collisions_count, collision_count, sizeof(size_t));

	queue->enqueueKernel1D(kernel, MAX_MOMENTUM_NONCE/8, BLOCKSIZE);
	queue->enqueueReadBuffer(temp_collisions_count, collision_count, sizeof(size_t));
	queue->enqueueReadBuffer(temp_collisions, collisions, sizeof(collision_struct)*getCollisionCeiling());
	queue->finish();


}
コード例 #12
0
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false), usePadding(true),
        numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0) {
    // Decide how many thread blocks and force buffers to use.

    deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
    if (deviceIsCpu) {
        numForceThreadBlocks = context.getNumThreadBlocks();
        forceThreadBlockSize = 1;
        numForceBuffers = numForceThreadBlocks;
    }
    else if (context.getSIMDWidth() == 32) {
        if (context.getSupports64BitGlobalAtomics()) {
            numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
            forceThreadBlockSize = 256;
            // Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
            numForceBuffers = 1;
        }
        else {
            numForceThreadBlocks = 3*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
            forceThreadBlockSize = 256;
            numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize;
        }
    }
    else {
        numForceThreadBlocks = context.getNumThreadBlocks();
        forceThreadBlockSize = (context.getSIMDWidth() >= 32 ? OpenCLContext::ThreadBlockSize : 32);
        if (context.getSupports64BitGlobalAtomics()) {
            // Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
            numForceBuffers = 1;
        }
        else {
            numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize;
        }
    }
}
コード例 #13
0
void OpenCLMomentumV8::find_collisions(uint8_t* message, collision_struct* out_buff, size_t* out_count) {

	// temp storage
	*out_count = 0;
	uint32_t ht_size = 1<<HASH_BITS;
	SHA512_Context c512_avxsse;

	SHA512_Init(&c512_avxsse);
	uint8_t midhash[32+4];
	memcpy(midhash+4, message, 32);
	*((uint32_t*)midhash) = 0;
	SHA512_Update_Simple(&c512_avxsse, midhash, 32+4);
	SHA512_PreFinal(&c512_avxsse);

	*(uint32_t *)(&c512_avxsse.buffer.bytes[0]) = 0;
	uint64_t * swap_helper = (uint64_t*)(&c512_avxsse.buffer.bytes[0]);
	for (int i = 1; i < 5; i++) {
		swap_helper[i] = SWAP64(swap_helper[i]);
	}

	OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext();
	OpenCLProgram *program = context->getProgram(0);

	OpenCLKernel *kernel_calculate_all_hashes = program->getKernel("calculate_all_hashes");
	OpenCLKernel *kernel_fill_table = program->getKernel("fill_table");
	OpenCLKernel *kernel_find_collisions = program->getKernel("find_collisions");
	OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

	OpenCLDevice * device = OpenCLMain::getInstance().getDevice(device_num);

	// cleans up the hash table
	size_t kc_wgsize = kernel_cleanup->getWorkGroupSize(device);
	kc_wgsize = 1<<log2(kc_wgsize);
	queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, kc_wgsize);

//	printf("Cleaning the HT\n");
//	queue->finish();

	queue->enqueueWriteBuffer(cl_message, c512_avxsse.buffer.bytes, sizeof(uint8_t)*SHA512_BLOCK_SIZE);
	// step 1, calculate hashes
	size_t kcah_wgsize = kernel_calculate_all_hashes->getWorkGroupSize(device);
	kcah_wgsize = 1<<log2(kcah_wgsize);
	queue->enqueueKernel1D(kernel_calculate_all_hashes, MAX_MOMENTUM_NONCE/8,
			kcah_wgsize);

//	uint64_t * apa = new uint64_t[MAX_MOMENTUM_NONCE];
//	queue->enqueueReadBuffer(hashes, apa, sizeof(uint64_t)*MAX_MOMENTUM_NONCE);
//	queue->finish();
//
//	printf("testing hashes\n");
//	uint64_t count = 0;
//	for (int i = 0; i < MAX_MOMENTUM_NONCE; i++) {
//		if (apa[i] == 0) {
//			count++;
//			printf("BAD HASH AT: %d %X\n", i, apa[i]);
//		}
//	}
//	printf("counted %X bad hashes\n", count);
//	printf("NOW REALLY TEST THEM hashes\n");
//	count = 0;
//	for (uint32_t i = 0; i < MAX_MOMENTUM_NONCE/8; i+=8) {
//		sph_sha512_context c512_sph; //SPH
//		sph_sha512_init(&c512_sph);
//		sph_sha512(&c512_sph, &i, 4);
//		sph_sha512(&c512_sph, message, 32);
//		uint64_t out[8];
//		sph_sha512_close(&c512_sph, out);
//		for (int j =0; j < 8; j++) {
//			if (apa[i+j] != out[j]) {
//				count++;
//				uint64_t xxx = apa[i+j];
//				printf("BAD HASH AT: %d => %X != %X\n", i, apa[i+j], out[j]);
//			}
//		}
//	}
//	printf("counted %X bad hashes\n", count);

	// step 2, populate hashtable
	size_t kft_wgsize = kernel_fill_table->getWorkGroupSize(device);
	kft_wgsize = 1<<log2(kft_wgsize);
	queue->enqueueKernel1D(kernel_fill_table, MAX_MOMENTUM_NONCE,
							kft_wgsize);

//	printf("step 2, populate hashtable\n");
//	queue->finish();

	queue->enqueueWriteBuffer(collisions_count, out_count, sizeof(size_t));
	// step 3, find collisions
	size_t kfc_wgsize = kernel_find_collisions->getWorkGroupSize(device);
	kfc_wgsize = 1<<log2(kfc_wgsize);
	queue->enqueueKernel1D(kernel_find_collisions, MAX_MOMENTUM_NONCE,
							kfc_wgsize);

//	printf("step 3, find collisions\n");
//	queue->finish();

	queue->enqueueReadBuffer(collisions_count, out_count, sizeof(size_t));
	queue->enqueueReadBuffer(collisions, out_buff, sizeof(collision_struct)*getCollisionCeiling());

//	printf("step 4, copy output\n");
	queue->finish();


#ifdef DEBUG
	printf("Collision Count = %d\n", (*out_count));
#endif

}
コード例 #14
0
static void setInvPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
    if (cl.getUseDoublePrecision())
        kernel.setArg<mm_double4>(index, cl.getInvPeriodicBoxSizeDouble());
    else
        kernel.setArg<mm_float4>(index, cl.getInvPeriodicBoxSize());
}
コード例 #15
0
OpenCLCompact::OpenCLCompact(OpenCLContext& context) : context(context), dgBlockCounts(NULL) {
    dgBlockCounts = OpenCLArray::create<cl_uint>(context, context.getNumThreadBlocks(), "dgBlockCounts");
    cl::Program program = context.createProgram(OpenCLKernelSources::compact);
    countKernel = cl::Kernel(program, "countElts");
    moveValidKernel = cl::Kernel(program, "moveValidElementsStaged");
}
コード例 #16
0
static void setPeriodicBoxArgs(OpenCLContext& cl, cl::Kernel& kernel, int index) {
    if (cl.getUseDoublePrecision()) {
        kernel.setArg<mm_double4>(index++, cl.getPeriodicBoxSizeDouble());
        kernel.setArg<mm_double4>(index++, cl.getInvPeriodicBoxSizeDouble());
        kernel.setArg<mm_double4>(index++, cl.getPeriodicBoxVecXDouble());
        kernel.setArg<mm_double4>(index++, cl.getPeriodicBoxVecYDouble());
        kernel.setArg<mm_double4>(index, cl.getPeriodicBoxVecZDouble());
    }
    else {
        kernel.setArg<mm_float4>(index++, cl.getPeriodicBoxSize());
        kernel.setArg<mm_float4>(index++, cl.getInvPeriodicBoxSize());
        kernel.setArg<mm_float4>(index++, cl.getPeriodicBoxVecX());
        kernel.setArg<mm_float4>(index++, cl.getPeriodicBoxVecY());
        kernel.setArg<mm_float4>(index, cl.getPeriodicBoxVecZ());
    }
}