Esempio n. 1
0
/**
 * \brief ocl::Image::write Transfers data from host memory to this Image.
 *
 * You can be sure that the data is write.
 * \param ptr_to_host_data must point to a memory location whith region bytes available.
 * \param region is the 3D region of the data. It is given with {image_width, image_height, image_depth}.
 */
void ocl::Image::write(const void *ptr_to_host_data, const size_t *region, const EventList &list) const
{
    TRUE_ASSERT(ptr_to_host_data != NULL, "data == 0");
    std::vector<size_t> origin = {0, 0, 0};
    OPENCL_SAFE_CALL( clEnqueueWriteImage(this->activeQueue().id(), this->id(), CL_TRUE, origin.data(), region, 0, 0, ptr_to_host_data, list.size(), list.events().data(), NULL) );
    OPENCL_SAFE_CALL( clFinish(this->activeQueue().id()) );
}
Esempio n. 2
0
/**
 * \brief ocl::Image::writeAsync Transfers data from host memory to this Image.
 *
 * Waits until the event list is completed.
 * \param origin is the 3D offset in bytes from which the Image is read.
 * \param ptr_to_host_data must point to a memory location whith region bytes available.
 * \param region is the 3D region of the data. It is given with {image_width, image_height, image_depth}.
 * \param list contains all events for which this command has to wait.
 * \return an event which can be further put into an event list for synchronization.
 */
ocl::Event ocl::Image::writeAsync(size_t *origin, const void *ptr_to_host_data, const size_t *region, const EventList &list) const
{
    TRUE_ASSERT(ptr_to_host_data != NULL, "data == 0");
    cl_event event_id;
    OPENCL_SAFE_CALL( clEnqueueWriteImage(this->activeQueue().id(), this->id(), CL_FALSE, origin, region, 0, 0, ptr_to_host_data, list.size(), list.events().data(), &event_id) );
    return ocl::Event(event_id, this->context());
}
	void OpenCLImage::write(void *dataPtr, bool blockingWrite, size_t *pOrigin, size_t *pRegion, size_t rowPitch, size_t slicePitch) {
		if(pOrigin == NULL) pOrigin = origin;
		if(pRegion == NULL) pRegion = region;
		
		cl_int err = clEnqueueWriteImage(pOpenCL->getQueue(), clMemObject, blockingWrite, pOrigin, pRegion, rowPitch, slicePitch, dataPtr, 0, NULL, NULL);
		assert(err == CL_SUCCESS);
	}
Esempio n. 4
0
    /// Enqueues a command to write data from host memory to \p image.
    ///
    /// \see_opencl_ref{clEnqueueWriteImage}
    void enqueue_write_image(const image3d &image,
                             const size_t origin[3],
                             const size_t region[3],
                             size_t input_row_pitch,
                             size_t input_slice_pitch,
                             const void *host_ptr,
                             const wait_list &events = wait_list())
    {
        BOOST_ASSERT(m_queue != 0);
        BOOST_ASSERT(image.get_context() == this->get_context());

        cl_int ret = clEnqueueWriteImage(
            m_queue,
            image.get(),
            CL_TRUE,
            origin,
            region,
            input_row_pitch,
            input_slice_pitch,
            host_ptr,
            events.size(),
            events.get_event_ptr(),
            0
        );

        if(ret != CL_SUCCESS){
            BOOST_THROW_EXCEPTION(opencl_error(ret));
        }
    }
void timedImageCLWrite( cl_command_queue queue,
                      cl_mem image,
                      void *ptr )
{
    CPerfCounter t1;
    cl_int ret;
    cl_event ev;

    t1.Start();

    ret = clEnqueueWriteImage( queue,
                               image,
                               CL_FALSE,
                               imageOrigin, imageRegion,
                               0,0,
                               ptr,
                               0, NULL,
                               &ev );
    ASSERT_CL_RETURN( ret );

    clFlush( queue );
    spinForEventsComplete( 1, &ev );

    t1.Stop();
    
    tlog->Timer( "%32s  %lf s   %8.2lf GB/s\n", "clEnqueueWriteImage():", t1.GetElapsedTime(), nBytesRegion, 1 );
}
Esempio n. 6
0
result plane::CopyToAsynch(
	const	unsigned char	&host_buffer, 			
	const	int				&host_cols,	 			
	const	int				&host_rows,	 			
	const	int				&host_pitch,
			cl_event		*event) {
	
	if (!valid_) return FILTER_INVALID_PLANE_BUFFER_STATE;

	valid_ = false;

	cl_int cl_status = CL_SUCCESS;

	size_t zero_offset[] = {0, 0, 0}; 
	size_t copy_region[] = {ByPowerOf2(host_cols, 2) >> 2, host_rows, 1};
	cl_status = clEnqueueWriteImage(cq_,
									mem_,
									CL_FALSE,
									zero_offset,
									copy_region,
									host_pitch,
									0,
									&host_buffer,
									0,
									NULL,
									event);
	if (cl_status != CL_SUCCESS) {  
		g_last_cl_error = cl_status;
		return FILTER_COPYING_TO_PLANE_FAILED;
	}

	valid_ = true;
	return FILTER_OK;
}
Esempio n. 7
0
void transfer(Image * I, int dest)
{
	if(I->locality != dest)

	{
		size_t origin[3];
		origin[0] = 0;
		origin[1] = 0;
		origin[2] = 0;
		size_t region[3];
		region[0] = I->width;
		region[1] = I->height;
		region[2] = 1;		
		if(dest == 1)
		{
			//if(DEBUG) printf("moving to gpu\n");
			check(clEnqueueWriteImage(queue,I->GPU_Image,CL_FALSE,origin, region,0,0,I->CPU_Image,0,NULL,NULL));	
			I->locality = 1;
			//if(DEBUG) printf("done\n");
		}
		else
		{
			//if(DEBUG) printf("moving to cpu\n");
			check(clEnqueueReadImage(queue,I->GPU_Image,CL_FALSE,origin, region, 0,0,I->CPU_Image,0,NULL,NULL));		
			I->locality = 0;
		}
	}
}
Esempio n. 8
0
/**
 * \brief ocl::Image::write Transfers data from host memory to this Image.
 *
 * You can be sure that the data is read. Be sure that the queue
 * and this Image are in the same context.
 * \param queue is a command queue on which the command is executed.
 * \param origin is the 3D offset in bytes from which the Image is read.
 * \param ptr_to_host_data must point to a memory location whith region bytes available.
 * \param region is the 3D region of the data. It is given with {image_width, image_height, image_depth}.
 */
void ocl::Image::write(const Queue& queue, size_t *origin, const void *ptr_to_host_data, const size_t *region, const EventList &list) const
{
    TRUE_ASSERT(ptr_to_host_data != NULL, "data == 0");
    TRUE_ASSERT(queue.context() == *this->context(), "Context of queue and this must be equal");
    OPENCL_SAFE_CALL( clEnqueueWriteImage(queue.id(), this->id(), CL_TRUE, origin, region, 0, 0, ptr_to_host_data, list.size(), list.events().data(), NULL) );
    OPENCL_SAFE_CALL( clFinish(queue.id()) );
}
Esempio n. 9
0
bool Sobel::runOpenCL(Image input, Image output, const Params& params)
{
    if (!initCL(params, sobel_kernel, "-cl-fast-relaxed-math"))
    {
        return false;
    }

    cl_int err;
    cl_kernel kernel;
    cl_mem d_input, d_output;
    cl_image_format format = {CL_RGBA, CL_UNORM_INT8};

    kernel = clCreateKernel(m_program, "sobel", &err);
    CHECK_ERROR_OCL(err, "creating kernel", return false);

    d_input = clCreateImage2D(
                  m_context, CL_MEM_READ_ONLY, &format,
                  input.width, input.height, 0, NULL, &err);
    CHECK_ERROR_OCL(err, "creating input image", return false);

    d_output = clCreateImage2D(
                   m_context, CL_MEM_WRITE_ONLY, &format,
                   input.width, input.height, 0, NULL, &err);
    CHECK_ERROR_OCL(err, "creating output image", return false);

    size_t origin[3] = {0, 0, 0};
    size_t region[3] = {input.width, input.height, 1};
    err = clEnqueueWriteImage(
              m_queue, d_input, CL_TRUE,
              origin, region, 0, 0, input.data, 0, NULL, NULL);
    CHECK_ERROR_OCL(err, "writing image data", return false);

    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_output);
    CHECK_ERROR_OCL(err, "setting kernel arguments", return false);

    reportStatus("Running OpenCL kernel");

    const size_t global[2] = {output.width, output.height};
    const size_t *local = NULL;
    if (params.wgsize[0] && params.wgsize[1])
    {
        local = params.wgsize;
    }

    // Timed runs
    for (int i = 0; i < params.iterations + 1; i++)
    {
        err = clEnqueueNDRangeKernel(
                  m_queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
        CHECK_ERROR_OCL(err, "enqueuing kernel", return false);

        // Start timing after warm-up run
        if (i == 0)
        {
            err = clFinish(m_queue);
            CHECK_ERROR_OCL(err, "running kernel", return false);
            startTiming();
        }
    }
Esempio n. 10
0
/**
 * \brief ocl::Image::writeAsync Transfers data from host memory to this Image.
 *
 * Waits until the event list is completed. Be sure that the queue
 * and this Image are in the same context.
 * \param queue is a command queue on which the command is executed.
 * \param origin is the 3D offset in bytes from which the Image is read.
 * \param ptr_to_host_data must point to a memory location whith region bytes available.
 * \param region is the 3D region of the data. It is given with {image_width, image_height, image_depth}.
 * \param list contains all events for which this command has to wait.
 * \return an event which can be further put into an event list for synchronization.
 */
ocl::Event ocl::Image::writeAsync(const Queue &queue, size_t *origin, const void *ptr_to_host_data, const size_t *region, const EventList &list) const
{
    TRUE_ASSERT(ptr_to_host_data != NULL, "data == 0");
    TRUE_ASSERT(queue.context() == *this->context(), "Context of queue and this must be equal");
    cl_event event_id;
    OPENCL_SAFE_CALL( clEnqueueWriteImage(queue.id(), this->id(), CL_FALSE, origin, region, 0, 0, ptr_to_host_data, list.size(), list.events().data(), &event_id) );
    return ocl::Event(event_id, this->context());
}
Esempio n. 11
0
cl_int WINAPI wine_clEnqueueWriteImage(cl_command_queue command_queue, cl_mem image, cl_bool blocking_write,
                                       const size_t * origin, const size_t * region,
                                       size_t input_row_pitch, size_t input_slice_pitch, const void * ptr,
                                       cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event)
{
    cl_int ret;
    TRACE("\n");
    ret = clEnqueueWriteImage(command_queue, image, blocking_write, origin, region, input_row_pitch, input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event);
    return ret;
}
Esempio n. 12
0
void cl_copyTextureToDevice(cl_mem dst, void* src, int width, int height)
{
	cl_int status;
	const size_t szTexOrigin[3] = {0, 0, 0};
	const size_t szTexRegion[3] = {height, width, 1};
	status = clEnqueueWriteImage(clCommandQueue, dst, CL_TRUE, szTexOrigin,
		szTexRegion, 0, 0, src, 0, NULL, NULL);
	if(cl_errChk(status, "write buffer texture")) {
		exit(1);
	}
}
/**
 * \related cl_Mem_Object_t
 *
 * This function read data from Host-accessible memory region, defined by argument
 * 'source' & write that data into OpenCL Image memory object, definded by
 * argument'self'
 * @param[in,out] self  pointer to structure, in which 'Write' function pointer
 * is defined to point on this function.
 * @param[in] blocking_flag flag, that denotes, should operation be blocking or not.
 * @param[in] source pointer to Host-accessible memory region, that
 * contain data to be write in OpenCL memory object.
 * @param[in] time_mode enumeration, that denotes how time measurement should be
 * performed.
 * @param[out] evt_to_generate pointer to OpenCL event that will be generated
 * at the end of operation.
 *
 * @return CL_SUCCESS in case of success, error code of type 'ret_code' otherwise.
 *
 * @see cl_err_codes.h for detailed error description.
 * @see 'cl_Error_t' structure for error handling.
 */
static ret_code Image_Send_To_Device(
    scow_Mem_Object     *self,
    cl_bool             blocking_flag, 
    void                *source, 
    TIME_STUDY_MODE     time_mode,
    cl_event            *evt_to_generate, 
    cl_command_queue    explicit_queue)
{
    cl_int ret = CL_SUCCESS;

    cl_event write_ready, *p_write_ready = (cl_event*) 0x0;

    OCL_CHECK_EXISTENCE(self, INVALID_BUFFER_GIVEN);
    OCL_CHECK_EXISTENCE(source, INVALID_BUFFER_GIVEN);

    const size_t origin[3] =
    { 0, 0, 0 }, region[3] =
    { self->width, self->height, 1 };

    (evt_to_generate != NULL) ?
            (p_write_ready = evt_to_generate) : (p_write_ready = &write_ready);

    cl_command_queue q =
            (explicit_queue == NULL) ?
                    (self->parent_thread->q_data_htod) : (explicit_queue);

    ret = clEnqueueWriteImage(q, self->cl_mem_object, blocking_flag, origin,
            region, self->row_pitch, 0, source, 0, NULL, p_write_ready);

    OCL_DIE_ON_ERROR(ret, CL_SUCCESS, NULL, ret);

    switch (time_mode)
    {
    case MEASURE:
        self->timer->current_time_device = Gather_Time_uS(p_write_ready);
        self->timer->total_time_device += self->timer->current_time_device;
        break;

    case DONT_MEASURE:
        break;

    default:
        break;
    }

    if (p_write_ready != evt_to_generate){
        clReleaseEvent(*p_write_ready);
    }

    return ret;
}
Esempio n. 14
0
bool
piglit_cl_write_image(cl_command_queue command_queue, cl_mem image,
                      const size_t *origin, const size_t *region,
                      const void *ptr)
{
	cl_int errNo;

	errNo = clEnqueueWriteImage(command_queue, image, CL_TRUE, origin, region,
	                            0, 0, ptr, 0, NULL, NULL);
	if(!piglit_cl_check_error(errNo, CL_SUCCESS)) {
		fprintf(stderr,
		        "Could not enqueue image write: %s\n",
		        piglit_cl_get_error_name(errNo));
		return false;
	}

	return true;
}
Esempio n. 15
0
/** Thread that receives image from client.
 * @param data struct dataTransfer casted variable.
 * @return NULL
 */
void *asyncDataRecvImage_thread(void *data)
{
    struct dataSend* _data = (struct dataSend*)data;
    // Provide a server for the data transfer
    int fd = accept(_data->fd, (struct sockaddr*)NULL, NULL);
    if(fd < 0){
        // we can't work, disconnect the client
        printf("ERROR: Can't listen on binded port.\n"); fflush(stdout);
        shutdown(_data->fd, 2);
        return CL_SUCCESS;
    }
    // We may wait manually for the events generated by ocland,
    // and then we can wait for the OpenCL generated ones.
    if(_data->num_events_in_wait_list){
        oclandWaitForEvents(_data->num_events_in_wait_list, _data->event_wait_list);
    }
    // Receive the data
    Recv(&fd, _data->ptr, _data->cb, MSG_WAITALL);
    // Writre it into the buffer
    clEnqueueWriteImage(_data->command_queue,_data->mem,CL_FALSE,
                        _data->buffer_origin,_data->region,
                        _data->buffer_row_pitch,_data->buffer_slice_pitch,
                        _data->ptr,0,NULL,&(_data->event->event));
    // Wait until the data is copied before start cleaning up
    clWaitForEvents(1,&(_data->event->event));
    // Clean up
    free(_data->ptr); _data->ptr = NULL;
    if(_data->event){
        _data->event->status = CL_COMPLETE;
    }
    if(_data->want_event != CL_TRUE){
        free(_data->event); _data->event = NULL;
    }
    if(_data->event_wait_list) free(_data->event_wait_list); _data->event_wait_list=NULL;
    // shutdown(fd, 2);
    // shutdown(_data->fd, 2); // Destroy the server to free the port
    close(fd);
    close(_data->fd);
    free(_data); _data=NULL;
    pthread_exit(NULL);
    return NULL;
}
Esempio n. 16
0
bool CL_Image3D::WriteData(const CL_CommandQueue* pCommandQueue, size_t uOriginX, size_t uOriginY, size_t uOriginZ, size_t uWidth, size_t uHeight, size_t uDepth, const void* pImgInput, bool bIsBlocking, CL_Event* pNewEvent, const CL_EventPool* pWaitList)
{
	CL_CPP_CONDITIONAL_RETURN_FALSE(!m_Image);
	CL_CPP_CONDITIONAL_RETURN_FALSE(!pCommandQueue);

	const size_t uOrigin[3] =
	{
		uOriginX,
		uOriginY,
		uOriginZ
	};

	const size_t uRegion[3] =
	{
		uWidth,
		uHeight,
		uDepth
	};

	cl_uint uNumWaitEvents = pWaitList ? pWaitList->GetNumEvents() : 0;
	const cl_event* pWaitEvents = pWaitList ? pWaitList->GetEventPool() : NULL;
	cl_event NewEvent = NULL;

	//	Write some data to the buffer object.
	const cl_command_queue CommandQueue = pCommandQueue->GetCommandQueue();
	cl_int iErrorCode = clEnqueueWriteImage(CommandQueue, m_Image, (bIsBlocking) ? CL_TRUE : CL_FALSE, uOrigin, uRegion, m_uRowPitch, m_uSlicePitch, pImgInput, uNumWaitEvents, pWaitEvents, &NewEvent);

	CL_CPP_CATCH_ERROR(iErrorCode);
	CL_CPP_ON_ERROR_RETURN_FALSE(iErrorCode);

	if(NewEvent)
	{
		if(pNewEvent)
			pNewEvent->SetEvent(NewEvent);

		clReleaseEvent(NewEvent);
	}

	return true;
}
Esempio n. 17
0
cl_mem * push_image(clinfo_t *clinfo, image_t * image, cl_event * event)
{
        cl_mem * image_buffer = malloc(sizeof(cl_mem));
        cl_int err;
        *image_buffer = clCreateImage2D(clinfo->context, CL_MEM_READ_ONLY
                                      , image->image_fmt, image->size[0]
                                      , image->size[1], 0, 0, &err);
        if(err != CL_SUCCESS) {
                fprintf(stderr, "Failed to create image buffer, %i\n", err);
                return NULL;
        }
        size_t origin[] = {0,0,0};
        size_t region[] = {image->size[0], image->size[1], 1};
        err = clEnqueueWriteImage(clinfo->command_queue, *image_buffer
                                  , CL_FALSE, origin, region
                                  , 0, 0, *image->pixels, 0, NULL, event);
        if(err != CL_SUCCESS) {
                fprintf(stderr, "Failed to write image to memory %i\n", err);
                return NULL;
        }
        return image_buffer;
}
Esempio n. 18
0
/*!
    \param dst Valid device pointer
    \param src Host pointer that contains the data
    \param height Height of the image
    \param width Width of the image
*/
void cl_copyImageToDevice(cl_mem dst, void* src, size_t height, size_t width)
{
    static int eventCnt = 0;

    cl_event* eventPtr = NULL, event;

    if(eventsEnabled) {
        eventPtr = &event;
    }

    cl_int status;
    size_t origin[3] = {0, 0, 0};
    size_t region[3] = {width, height, 1};

    status = clEnqueueWriteImage(commandQueue, dst, CL_TRUE, origin,
        region, 0, 0, src, 0, NULL, eventPtr);
    cl_errChk(status, "Writing image", true);

    if(eventsEnabled) {
        char* eventStr = catStringWithInt("copyImageToDevice", eventCnt++);
        events->newIOEvent(*eventPtr, eventStr);
    }
}
int main()
{
	int i,j,k;
	// nb of operations:
	const int dsize = 512;
	int nthreads = 1;
	int nbOfAverages = 1e2;
	int opsMAC = 2; // operations per MAC
	cl_short4 *in, *out;
	cl_half *ck;
	double tops; //total ops

#define NQUEUES 1
	cl_int err;
	cl_platform_id platform = 0;
	cl_device_id device = 0;
	cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
	cl_context ctx = 0;
	cl_command_queue queues[NQUEUES];
	cl_mem bufin, bufck, bufout;
	cl_event event = NULL;
	cl_program program;
	cl_kernel kernel;
	size_t global[2], local[2];
	size_t param[5];
	char version[300];
  
	// allocate matrices
	
	in = (cl_short4 *) calloc(dsize*dsize, sizeof(*in));
	out = (cl_short4 *) calloc(dsize*dsize, sizeof(*out));
	ck = (cl_half *) calloc(9*9, sizeof(*ck));
	in[0].x = 0x3c00;
	in[1].x = 0x4000;
	in[dsize].x = 0x4100;
	ck[0] = 0x3c00;
	ck[1] = 0x4000;
	ck[9] = 0x3000;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs( 1, &platform, NULL );
    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
    for(i = 0; i < NQUEUES; i++)
    	queues[i] = clCreateCommandQueue( ctx, device, 0, &err );

	// Print some info about the system
	clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL);
	printf("CL_DEVICE_VERSION=%s\n", version);
	clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(version), version, NULL);
	printf("CL_DRIVER_VERSION=%s\n", version);
	program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err);
	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_LOCAL_MEM_SIZE=%d\n", (int)param[0]);
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_MAX_WORK_GROUP_SIZE=%d\n", (int)param[0]);
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS=%d\n", (int)param[0]);
	j = param[0];
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(param[0])*j, param, NULL);
	printf("CL_DEVICE_MAX_WORK_ITEM_SIZES=");
	for(i = 0; i < j; i++)
		printf("%d ", (int)param[i]);
	printf("\n");
        clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(param[0]), param, NULL);
        printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE=%d\n", (int)param[0]);
		
		
	program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err);
	if(!program)
	{
		printf("Error creating program\n");
		return -1;
	}
	err = clBuildProgram(program, 0, 0, 0, 0, 0);
	if(err != CL_SUCCESS)
	{
		char buffer[20000];
		size_t len;
		
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		puts(buffer);
		return -1;
	}
	kernel = clCreateKernel(program, "conv9x9", &err);
	if(!kernel || err != CL_SUCCESS)
	{
		printf("Error creating kernel\n");
		return -1;
	}

    /* Prepare OpenCL memory objects and place matrices inside them. */
	cl_image_format fmt = {CL_RGBA, CL_HALF_FLOAT};
	cl_int rc;
	bufin = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &fmt, dsize, dsize, 0, 0, &rc);
	bufout = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, dsize, dsize, 0, 0, &rc);
    bufck = clCreateBuffer( ctx, CL_MEM_READ_ONLY, 9 * 9 * sizeof(*ck),
                          NULL, &err );

	size_t origin[3] = {0,0,0};
	size_t region[3] = {dsize, dsize, 1};
    err = clEnqueueWriteImage(queues[0], bufin, CL_TRUE, origin, region, dsize * sizeof(*in), 0, in, 0, NULL, NULL );
    err = clEnqueueWriteBuffer( queues[0], bufck, CL_TRUE, 0, 9 * 9 * sizeof( *ck ), ck, 0, NULL, NULL );
	clSetKernelArg(kernel, 0, sizeof(int), &dsize);
	clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufin);
	clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufck);
	clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufout);
	local[0] = 8;
	local[1] = 8;
	global[0] = global[1] = dsize-32;
    usleep(100000);

	struct timeval start,end;
	gettimeofday(&start, NULL);

	for (k=0; k<nthreads; k++) {
		//printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads());
		for(i=0;i<nbOfAverages;i++) {
		// do the 2D convolution
			err = clEnqueueNDRangeKernel(queues[0], kernel, 2, NULL, global, local, 0, NULL, NULL);
			if(err != CL_SUCCESS)
			{
				printf("clEnqueueNDRangeKernel error %d\n", err);
				return -1;
			}
		}
	}

	clFinish(queues[0]);
	gettimeofday(&end, NULL);
	double t = ((double) (end.tv_sec - start.tv_sec))
	+ ((double) (end.tv_usec - start.tv_usec)) / 1e6; //reports time in [s] - verified!

    /* Wait for calculations to be finished. */

    /* Fetch results of calculations from GPU memory. */
    err = clEnqueueReadImage(queues[0], bufout, CL_TRUE, origin, region, dsize * sizeof(*out), 0, out, 0, NULL, NULL );
	clFinish(queues[0]);
	
	printf("%x %x %x %x\n", out[0].x, out[1].x, out[dsize].x, out[dsize+1].x);

    /* Release OpenCL memory objects. */
    clReleaseMemObject( bufin );
    clReleaseMemObject( bufck );
    clReleaseMemObject( bufout );

    /* Release OpenCL working objects. */
    for(i = 0; i < NQUEUES; i++)
    	clReleaseCommandQueue( queues[i] );
    clReleaseContext( ctx );
	
	// report performance:
	tops = 4 * nthreads * opsMAC * (dsize-32)*(dsize-32)*9*9; // total ops
	printf("Total M ops = %.0lf, # of threads = %d", nbOfAverages*tops*1e-6, nthreads);
	printf("\nTime in s: %lf:", t);
	printf("\nTest performance [G OP/s] %lf:", tops*nbOfAverages/t*1e-9);
	printf("\n");
	return(0);
}
Esempio n. 20
0
int main(int argc, char *argv[])
{

#ifdef DEBUG
printf("Argument count = [%d]\n", argc);
#endif

if(argc!=2)
{
printf("Expecting one argument!\n");
exit(1);
}
if(argv[1]==NULL)
{
printf("Expecting one non-null argument!\n");
exit(1);
}

char *progName = argv[1];
char fileName[100];
sprintf(fileName, "./target/%s.cl",progName);
printf("Using kernel file [%s], with kernel name [%s]\n", fileName, progName);

cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;

float *result;
int i;
cl_mem image, out;
cl_bool support;
cl_image_format fmt;
int num_out = 9;

FILE *fp;
char *source_str;
size_t source_size, r_size;
int mem_size = sizeof(cl_float4) * num_out;

/*load the source code containing the kernel*/
fp = fopen (fileName, "r");
if (!fp) {
fprintf(stderr, "failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);

/*Get platform and device info*/
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
printf("ret_num_platforms = %d\n", ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1 ,&device_id, &ret_num_devices);
printf("ret_num_platforms = %d\n", ret_num_platforms);

context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

result = (float*) malloc(mem_size);
//check image support
clGetDeviceInfo(device_id, CL_DEVICE_IMAGE_SUPPORT, sizeof(support), &support, &r_size);
if (support != CL_TRUE) {
	puts("image not supported");
	return 1;
}

command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
printf("queue ret = %d\n", ret);

out = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size, NULL, &ret);
printf("create buffer ret = %d\n", ret);

fmt.image_channel_order = CL_R;
fmt.image_channel_data_type = CL_FLOAT;

image = clCreateImage2D(context, CL_MEM_READ_ONLY, &fmt, 4, 4, 0, 0, NULL);

size_t origin[] = {0,0,0};
size_t region[] = {4,4,1};
float data[] = {
	10,20,30,40,
	10,20,30,40,
	10,20,30,40,
	10,20,30,40
};

clEnqueueWriteImage(command_queue, image, CL_TRUE, origin, region, 4*sizeof(float), 0, data, 0, NULL, NULL);

program = clCreateProgramWithSource(context, 1, (const char**) &source_str, (const size_t*) &source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
printf("build program ret = %d\n", ret);
kernel = clCreateKernel(program, progName, &ret);
printf("create kernel ret = %d\n", ret);

//How to set int arguments?
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*) &image);
printf("arg 0 ret = %d\n", ret);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &out);
printf("arg 1 ret = %d\n", ret);

cl_event ev;
ret = clEnqueueTask(command_queue, kernel, 0, NULL, &ev);

//How to read a int?
ret = clEnqueueReadBuffer(command_queue, out, CL_TRUE, 0, mem_size, result, 0, NULL, NULL);
for(int i=0; i < num_out; i++) {
	printf("%f,%f,%f,%f\n", result[i*4+0], result[i*4+1], result[i*4+2], result[i*4+3]);
}

ret=clFlush(command_queue);
ret=clFinish(command_queue);
ret=clReleaseKernel(kernel);
ret=clReleaseProgram(program);

ret=clReleaseMemObject(out);
ret=clReleaseMemObject(image);

ret=clReleaseCommandQueue(command_queue);
ret=clReleaseContext(context);

free(source_str);

printf("\n");

return 0;
}
Esempio n. 21
0
void spmv_csr_ocl(csr_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, double& optflop, int& optmethod, char* oclfilename, cl_device_type deviceType, int ntimes, double* floptable, int groupnum)
{
    cl_device_id* devices = NULL;
    cl_context context = NULL;
    cl_command_queue cmdQueue = NULL;
    cl_program program = NULL;

    assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1);

    cl_int errorCode = CL_SUCCESS;

    //Create device memory objects
    cl_mem devRowPtr;
    cl_mem devColId;
    cl_mem devData;
    cl_mem devVec;
    cl_mem devTexVec;
    cl_mem devRes;

    //Initialize values
    int nnz = mat->matinfo.nnz;
    int vecsize = mat->matinfo.width;
    int rownum = mat->matinfo.height;
    int rowptrsize = rownum + 1;
    ALLOCATE_GPU_READ(devRowPtr, mat->csr_row_ptr, sizeof(int)*rowptrsize);
    ALLOCATE_GPU_READ(devColId, mat->csr_col_id, sizeof(int)*nnz);
    ALLOCATE_GPU_READ(devData, mat->csr_data, sizeof(float)*nnz);
    ALLOCATE_GPU_READ(devVec, vec, sizeof(float)*vecsize);
    int paddedres = findPaddedSize(rownum, 16);
    devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR;
    //errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR;

	const cl_image_format floatFormat =
	{
	    CL_R,
	    CL_FLOAT,
	};

	int width = VEC2DWIDTH;
	int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH;
	float* image2dVec = (float*)malloc(sizeof(float)*width*height);
	memset(image2dVec, 0, sizeof(float)*width*height);
	for (int i = 0; i < vecsize; i++)
	{
	    image2dVec[i] = vec[i];
	}
	size_t origin[] = {0, 0, 0};
	size_t vectorSize[] = {width, height, 1};
	devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height, 0, NULL, &errorCode); CHECKERROR;
	errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR;
	clFinish(cmdQueue);


    opttime = 10000.0f;
    optmethod = 0;
    int dim2 = dim2Size;



    {
	int methodid = 0;
	cl_mem devRowPtrPad;
	int padrowsize = findPaddedSize(rownum, CSR_VEC_GROUP_SIZE/WARPSIZE);
	int* rowptrpad = (int*)malloc(sizeof(int)*(padrowsize+1));
	memset(rowptrpad, 0, sizeof(int)*(padrowsize+1));
	for (int i = 0; i <= mat->matinfo.height; i++)
	    rowptrpad[i] = mat->csr_row_ptr[i];
	ALLOCATE_GPU_READ(devRowPtrPad, rowptrpad, sizeof(int)*(padrowsize+1));
	clFinish(cmdQueue);

	printf("\nRow Num %d padded size %d\n", rownum, padrowsize);
	cl_uint work_dim = 2;
	//int dim2 = 16;
	size_t blocksize[] = {CSR_VEC_GROUP_SIZE, 1};

	cl_kernel csrKernel = NULL;
	csrKernel = clCreateKernel(program, "gpu_csr_ve_slm_pm_fs", &errorCode); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devRowPtrPad); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColId); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &rownum); CHECKERROR;


	{
	    size_t globalsize[] = {groupnum * CSR_VEC_GROUP_SIZE, dim2};

	    for (int k = 0; k < 3; k++)
	    {
		errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	    }
	    clFinish(cmdQueue);

	    double teststart = timestamp();
	    for (int i = 0; i < ntimes; i++)
	    {
		errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	    }
	    clFinish(cmdQueue);
	    double testend = timestamp();
	    double time_in_sec = (testend - teststart)/(double)dim2;
	    double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9;
	    printf("\nCSR vector SLM row ptr padded mat strided rows fixed size:%d cpu time %lf ms GFLOPS %lf code %d \n\n", groupnum * CSR_VEC_GROUP_SIZE,   time_in_sec / (double) ntimes * 1000, gflops, methodid);

	    double onetime = time_in_sec / (double) ntimes;
	    floptable[methodid] = gflops;
	    if (onetime < opttime)
	    {
		opttime = onetime;
		optmethod = methodid;
		optflop = gflops;
	    }
	}

	if (devRowPtrPad)
	    clReleaseMemObject(devRowPtrPad);
	if (csrKernel)
	    clReleaseKernel(csrKernel);
	free(rowptrpad);


    }





    {
	int methodid = 1;
	cl_mem devRowPtrPad;
	int padrowsize = findPaddedSize(rownum, CSR_VEC_GROUP_SIZE/WARPSIZE);
	int* rowptrpad = (int*)malloc(sizeof(int)*(padrowsize+1));
	memset(rowptrpad, 0, sizeof(int)*(padrowsize+1));
	for (int i = 0; i <= mat->matinfo.height; i++)
	    rowptrpad[i] = mat->csr_row_ptr[i];
	ALLOCATE_GPU_READ(devRowPtrPad, rowptrpad, sizeof(int)*(padrowsize+1));
	clFinish(cmdQueue);

	printf("\nRow Num %d padded size %d\n", rownum, padrowsize);
	cl_uint work_dim = 2;
	//int dim2 = 16;
	size_t blocksize[] = {CSR_VEC_GROUP_SIZE, 1};

	cl_kernel csrKernel = NULL;
	csrKernel = clCreateKernel(program, "gpu_csr_ve_reduction_fs", &errorCode); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devRowPtrPad); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColId); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &rownum); CHECKERROR;


	{
	    size_t globalsize[] = {groupnum * CSR_VEC_GROUP_SIZE, dim2};

	    for (int k = 0; k < 3; k++)
	    {
		errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	    }
	    clFinish(cmdQueue);

	    double teststart = timestamp();
	    for (int i = 0; i < ntimes; i++)
	    {
		errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	    }
	    clFinish(cmdQueue);
	    double testend = timestamp();
	    double time_in_sec = (testend - teststart)/(double)dim2;
	    double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9;
	    printf("\nCSR vector SLM row ptr padded mat strided rows fixed size:%d cpu time %lf ms GFLOPS %lf code %d \n\n", groupnum * CSR_VEC_GROUP_SIZE,   time_in_sec / (double) ntimes * 1000, gflops, methodid);

	    double onetime = time_in_sec / (double) ntimes;
	    floptable[methodid] = gflops;
	    if (onetime < opttime)
	    {
		opttime = onetime;
		optmethod = methodid;
		optflop = gflops;
	    }
	}

	if (devRowPtrPad)
	    clReleaseMemObject(devRowPtrPad);
	if (csrKernel)
	    clReleaseKernel(csrKernel);
	free(rowptrpad);


    }


    //Clean up
    if (image2dVec)
	free(image2dVec);

    if (devRowPtr)
	clReleaseMemObject(devRowPtr);
    if (devColId)
	clReleaseMemObject(devColId);
    if (devData)
	clReleaseMemObject(devData);
    if (devVec)
	clReleaseMemObject(devVec);
    if (devTexVec)
	clReleaseMemObject(devTexVec);
    if (devRes)
	clReleaseMemObject(devRes);

    freeObjects(devices, &context, &cmdQueue, &program);

}
Esempio n. 22
0
int main(int argc, const char** argv)
{
	size_t x = 512, y = 250000; //y has to be a multiple of ciDeviceCount!
	struct svm_node* px = (struct svm_node*)malloc((x+1)*sizeof(struct svm_node));
	gen_data(px, x, 1, 3);
	struct svm_node* py = (struct svm_node*)malloc((x+1)*y*sizeof(struct svm_node));
	for(size_t i = 0; i < y; ++i) {
		struct svm_node* tmp = py+i*(x+1);
		gen_data(tmp, x, 3,2);
	}
	dtype* result = (dtype*)malloc(y*sizeof(dtype));
	int* pyLength = (int*)malloc(y*sizeof(int));
	
	for(size_t i = 0; i < y; ++i)
	{
		for(size_t j = 0; py[i*(x+1)+j].index >= 0; ++j)
			pyLength[i] = py[i*(x+1)+j].index;
		++pyLength[i];
	}
	
	cl_int err = CL_SUCCESS;
//	cl_platform_id platform = NULL;
//	cl_uint ciDeviceCount = 0;
//	cl_device_id *device = NULL;

	// retrieve devices
	cl_platform_id platform;
	err = clGetPlatformIDs(1, &platform, NULL);
	cl_device_id device;
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);

	size_t localDim  = 256l;
	size_t globalDim = localDim*y;
/*	
	device = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, ciDeviceCount, device, NULL);
	if (err != CL_SUCCESS)
	{
		printf("Failed to get devices:\n%s\n", oclErrorString(err));
		return -1;
	}
	*/
	//Create the context
	cl_context context1 = clCreateContext(0, 1, &device, NULL, NULL, &err);
	if(err != CL_SUCCESS)
	{
		printf("Context creation failed:\n%d\n", err);
		return -1;
	}										 

	// create a command queue for first device the context reported
	cl_command_queue queue = clCreateCommandQueue(context1, device, 0, 0);
	
	// load program from disk
	char *tmp = strdup(argv[0]);
	char* my_dir = dirname(tmp);

//	size_t program_length;
	char path[256];
  	snprintf(path, PATH_MAX - 1, "%s/vecops.cl", my_dir);
 
	cl_program vecops = load_kernel(path, context1);

	if(err != CL_SUCCESS)
	{
		printf("Program creation failed:\n%d\n", (err));
		return -1;
	}
 
	err = clBuildProgram(vecops, 0, NULL, "-I.", NULL, NULL);
	if(err != CL_SUCCESS)
	{
			err = clGetProgramBuildInfo(vecops, device, CL_PROGRAM_BUILD_LOG, 8192, buffer, NULL);
			if(err != CL_SUCCESS)
				printf("Cannot get build info: %d\n", (err));

			printf("Build log:\n%s\n", buffer);
	}
	
	// create kernel
	cl_kernel sparsedot_kernel;
	
#if version == 1
	sparsedot_kernel = clCreateKernel(vecops, "sparsedot1_kernel", &err);
#endif
#if version == 2
	sparsedot_kernel = clCreateKernel(vecops, "sparsedot4_kernel", &err);
#endif
#if version == 3
	sparsedot_kernel = clCreateKernel(vecops, "sparsedot3_kernel", &err);
#endif
	if (err != CL_SUCCESS)
	{
		printf("Kernel creation failed:\n%d\n", (err));
		return -1;
	}
	
	 
	// allocate memory on the devices
	cl_mem px_d, py_d, result_d, pyLength_d;
	
#if version == 1
	px_d = clCreateBuffer(context1,
							 CL_MEM_READ_ONLY,
							 (x+1) * sizeof(struct svm_node),
							 0, &err);
#endif
#if version == 2 || version == 3
	//unpack px
	int size = px[x-1].index+1;

	for(size_t i = 0; i < y; ++i)
		size = size > pyLength[i] ? size : pyLength[i];

	dtype* px_u = (dtype*)calloc(size, sizeof(dtype));

	unpack(px, px_u);
	printf("px size: %d\n", size);
#endif
#if version == 3
	size_t height, width;
	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &height, 0);
	clGetDeviceInfo(Device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &width, 0);

	size_t region[3];
	region[2] = 1;

	region[0] = min(4, size);
	region[1] = (size+2-1) / 4;
		

	cl_image_format px_format;
	px_format.image_channel_order = CL_R;
	px_format.image_channel_data_type = CL_FLOAT;
#endif
#if version == 2
	px_d = clCreateBuffer(context1,
				 CL_MEM_READ_ONLY,
				 size * sizeof(dtype),
				 0, &err);
#endif
#if version == 3
	 px_d = clCreateImage2D(context1, CL_MEM_READ_ONLY, &px_format,
				  region[0], region[1], 0, 0, &err);

#endif
	if(err != CL_SUCCESS)
	{
		printf("Failed to allocate px:\n%d\n", (err));
		return -1;
	}
	py_d = clCreateBuffer(context1,
		 CL_MEM_READ_ONLY,
		 (x+1) * y * sizeof(struct svm_node),
		 0, &err);
	if(err != CL_SUCCESS)
	{
		printf("Failed to allocate px:\n%d\n", (err));
		return -1;
	}
	result_d = clCreateBuffer(context1,
		CL_MEM_WRITE_ONLY,
		y * sizeof(dtype),
		0, 0);
	pyLength_d = clCreateBuffer(context1,
		CL_MEM_READ_ONLY,
		y * sizeof(int),
		0, 0);
	

#if bench
	//start time measurement
	start_timer(0);
#endif

	// copy host vectors to device
	err = CL_SUCCESS;
   
    err |= clEnqueueWriteBuffer(queue, py_d, CL_FALSE, 0, 
								(x+1) * y * sizeof(struct svm_node), py, 0, NULL, NULL);
									

	err |= clEnqueueWriteBuffer(queue, pyLength_d, CL_FALSE, 0, 
								y * sizeof(int), pyLength, 0, NULL, NULL);

#if version == 1
	err |= clEnqueueWriteBuffer(queue, px_d, CL_FALSE, 0, 
								(x+1) * sizeof(struct svm_node), px, 0, NULL, NULL);
#endif
#if version == 2
	err |= clEnqueueWriteBuffer(queue, px_d, CL_FALSE, 0, 
								size * sizeof(dtype), px_u, 0, NULL, NULL);
#endif
#if version == 3
	size_t offset[] = {0,0,0};
	err |= clEnqueueWriteImage(queue, px_d, CL_TRUE, offset, region, sizeof(dtype), 0, 
							   px_u, 0, 0, NULL);
#endif
	clFinish(queue);

	 
	if(err != CL_SUCCESS)
	{
		printf("Data transfer to GPU failed:\n%d\n", (err));
		return -1;
	}

#if bench
	stop_timer(0);
	start_timer(1);
#endif
	// set kernel arguments

	clSetKernelArg(sparsedot_kernel, 0, sizeof(cl_mem), (void *) &px_d);
	clSetKernelArg(sparsedot_kernel, 1, sizeof(cl_mem), (void *) &py_d);
	clSetKernelArg(sparsedot_kernel, 2, sizeof(cl_mem), (void *) &result_d);
	clSetKernelArg(sparsedot_kernel, 3, sizeof(cl_mem), (void *) &pyLength_d);
	clSetKernelArg(sparsedot_kernel, 4, sizeof(cl_ulong), (void *) &x);
	clSetKernelArg(sparsedot_kernel, 5, sizeof(cl_ulong), (void *) &y);
//	clSetKernelArg(sparsedot_kernel, 6, sizeof(cl_float8)*localDim, 0);
#if version == 3
		clSetKernelArg(sparsedot_kernel, 7, sizeof(cl_long), (void *) &region[1]) ;		
		clSetKernelArg(sparsedot_kernel, 8, sizeof(cl_long), (void *) &region[0]) ;		
#endif
	clFlush(queue);

	// start kernel
	err = clEnqueueNDRangeKernel(queue, sparsedot_kernel, 1, 0, &globalDim, &localDim,
					   0, NULL, 0);

	if(err != CL_SUCCESS)
	{
		printf("Kernel launch failed:\n%d\n", (err));
		return -1;
	}

	clFinish(queue);
	
#if bench	
	stop_timer(1);
	start_timer(2);
#endif

	cl_event result_gather;
	 
	// Non-blocking copy of result from device to host
	err = clEnqueueReadBuffer(queue, result_d, CL_FALSE, 0, y * sizeof(dtype), 
						result, 0, NULL, &result_gather);
	
	if(err != CL_SUCCESS)
	{
		printf("Reading result failed:\n%d\n", (err));
		return -1;
	}

	// CPU sync with GPU
	clWaitForEvents(1, &result_gather);

#if bench	
	// stop GPU time measurement
	stop_timer(2);
#endif
	//check result
/*	for(size_t i = 0; i < y; ++i)
	{
		printf("%f ", result[i]);
	}
	printf("\n");
  */  

#if bench
	start_timer(3);
#endif
	bool correct = validate(px, py, result, x, y);
#if bench
	stop_timer(3);
	printf("v%i; x: %lu, y: %lu\n", version, x, y);
	printf("CPU: %f, upcpy: %f DeviceCalc: %f, downcpy: %f\n", 
		   get_secs(3), get_secs(0), get_secs(1), get_secs(2));
#endif
	
	if(correct)
		printf("SUCCESS!\n");
		
	//cleenup

	clReleaseKernel(sparsedot_kernel);
	clReleaseCommandQueue(queue);
	clReleaseEvent(result_gather);
	clReleaseMemObject(px_d);
	clReleaseMemObject(py_d);
	clReleaseMemObject(result_d);
	clReleaseMemObject(pyLength_d);
//	clReleaseDevice(device);

	free(px);
#if version == 2 || version == 3
	free(px_u);
#endif
	free(py);
	free(result);

	return 0;
}
Esempio n. 23
0
void spmv_coo_ocl(coo_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, double& optflop, int& optmethod, char* oclfilename, cl_device_type deviceType, int ntimes, double* floptable, int maxgroupnum)
{

    for (int i = 0; i < mat->matinfo.height; i++)
	result[i] = 0.0f;
    cl_device_id* devices = NULL;
    cl_context context = NULL;
    cl_command_queue cmdQueue = NULL;
    cl_program program = NULL;

    assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1);

    cl_int errorCode = CL_SUCCESS;

    //Create device memory objects
    cl_mem devRowid;
    cl_mem devColid;
    cl_mem devData;
    cl_mem devVec;
    cl_mem devRes;
    cl_mem devTexVec;
    cl_mem devTmpRow;
    cl_mem devTmpData;

    //Initialize values
    int nnz = mat->matinfo.nnz;
    int rownum = mat->matinfo.height;
    int vecsize = mat->matinfo.width;
    int num_units = nnz / COO_GROUP_SIZE;
    if (nnz % COO_GROUP_SIZE != 0)
	num_units++;
    int group_num = (num_units < maxgroupnum) ? num_units : maxgroupnum;
    int work_size = group_num * COO_GROUP_SIZE;
    int num_iters = nnz / work_size;
    if (nnz % work_size != 0)
	num_iters++;
    int process_size = num_iters * COO_GROUP_SIZE;
    int active_warp = num_units / num_iters;
    if (num_units % num_iters != 0)
	active_warp++;
    int paddedNNZ = findPaddedSize(nnz, COO_ALIGNMENT);
    int* paddedRow = (int*)malloc(sizeof(int)*paddedNNZ);
    int* paddedCol = (int*)malloc(sizeof(int)*paddedNNZ);
    float* paddedData = (float*)malloc(sizeof(float)*paddedNNZ);
    memcpy(paddedRow, mat->coo_row_id, sizeof(int)*nnz);
    memcpy(paddedCol, mat->coo_col_id, sizeof(int)*nnz);
    memcpy(paddedData, mat->coo_data, sizeof(float)*nnz);
    for (int i = nnz; i < paddedNNZ; i++)
    {
	paddedRow[i] = mat->coo_row_id[nnz - 1];
	paddedCol[i] = mat->coo_col_id[nnz - 1];
	paddedData[i] = 0.0f;
    }


    ALLOCATE_GPU_READ(devRowid, paddedRow, sizeof(int)*paddedNNZ);
    ALLOCATE_GPU_READ(devColid, paddedCol, sizeof(int)*paddedNNZ);
    ALLOCATE_GPU_READ(devData, paddedData, sizeof(float)*paddedNNZ);
    ALLOCATE_GPU_READ(devVec, vec, sizeof(float)*vecsize);
    int paddedres = findPaddedSize(rownum, 512);
    devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR;
    errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR;
    devTmpRow = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*maxgroupnum, NULL, &errorCode); CHECKERROR;
    devTmpData = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*maxgroupnum, NULL, &errorCode); CHECKERROR;

    const cl_image_format floatFormat =
    {
	CL_R,
	CL_FLOAT,
    };


    int width = VEC2DWIDTH;
    int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH;
    float* image2dVec = (float*)malloc(sizeof(float)*width*height);
    memset(image2dVec, 0, sizeof(float)*width*height);
    for (int i = 0; i < vecsize; i++)
    {
	image2dVec[i] = vec[i];
    }
    size_t origin[] = {0, 0, 0};
    size_t vectorSize[] = {width, height, 1};
    devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height, 0, NULL, &errorCode); CHECKERROR;
    errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR;
    clFinish(cmdQueue);


    opttime = 10000.0f;
    optmethod = 0;
    int dim2 = dim2Size;

    {
	int methodid = 0;
	cl_uint work_dim = 2;
	size_t blocksize[] = {COO_GROUP_SIZE, 1};
	int gsize = group_num * COO_GROUP_SIZE;
	size_t globalsize[] = {gsize, dim2};

	cl_kernel csrKernel = NULL;
	csrKernel = clCreateKernel(program, "gpu_coo_s1", &errorCode); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devRowid); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColid); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 3, sizeof(int),    &process_size); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 4, sizeof(int),    &paddedNNZ); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devVec); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 7, sizeof(cl_mem), &devTmpRow); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 8, sizeof(cl_mem), &devTmpData); CHECKERROR;

	printf("process size %d nnz %d gsize %d active_warp %d\n", process_size, paddedNNZ, gsize, active_warp);

	size_t blocksize2[] = {COO_GROUP_SIZE * 2, 1};
	size_t globalsize2[] = {COO_GROUP_SIZE * 2, dim2};


	cl_kernel csrKernel2 = NULL;
	csrKernel2 = clCreateKernel(program, "gpu_coo_s2", &errorCode); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel2, 0, sizeof(cl_mem), &devTmpRow); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel2, 1, sizeof(cl_mem), &devTmpData); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel2, 2, sizeof(int), &active_warp); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel2, 3, sizeof(cl_mem), &devRes); CHECKERROR;

	for (int k = 0; k < 3; k++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);


	//int* tmpRow = (int*)malloc(sizeof(int)*maxgroupnum);
	//float* tmpData = (float*)malloc(sizeof(float)*maxgroupnum);


	double teststart = timestamp();
	for (int i = 0; i < ntimes; i++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel2, work_dim, NULL, globalsize2, blocksize2, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);

	double testend = timestamp();
	double time_in_sec = (testend - teststart)/(double)dim2;
	double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9;
	printf("\nCOO cpu time %lf ms GFLOPS %lf code %d \n\n",   time_in_sec / (double) ntimes * 1000, gflops, methodid);

	if (csrKernel)
	    clReleaseKernel(csrKernel);
	if (csrKernel2)
	    clReleaseKernel(csrKernel2);

	double onetime = time_in_sec / (double) ntimes;
	floptable[methodid] = gflops;
	if (onetime < opttime)
	{
	    opttime = onetime;
	    optmethod = methodid;
	    optflop = gflops;
	}
	//for (int i = 0; i < active_warp; i++)
	//printf("Row %d Data %f\n", tmpRow[i], tmpData[i]);
    }


    //Clean up
    if (paddedRow)
	free(paddedRow);
    if (paddedCol)
	free(paddedCol);
    if (paddedData)
	free(paddedData);
    if (image2dVec)
	free(image2dVec);

    if (devRowid)
	clReleaseMemObject(devRowid);
    if (devColid)
	clReleaseMemObject(devColid);
    if (devData)
	clReleaseMemObject(devData);
    if (devVec)
	clReleaseMemObject(devVec);
    if (devTexVec)
	clReleaseMemObject(devTexVec);
    if (devRes)
	clReleaseMemObject(devRes);
    if (devTmpRow)
	clReleaseMemObject(devTmpRow);
    if (devTmpData)
	clReleaseMemObject(devTmpData);

    freeObjects(devices, &context, &cmdQueue, &program);
}
Esempio n. 24
0
/*! \brief Calls an OpenCL kernel from OpenVX Graph.
 * Steps:
 * \arg Find the target
 * \arg Get the vxcl context
 * \arg Find the kernel (to get cl kernel information)
 * \arg for each input parameter that is an object, enqueue write
 * \arg wait for finish
 * \arg for each parameter, SetKernelArg
 * \arg call kernel
 * \arg wait for finish
 * \arg for each output parameter that is an object, enqueue read
 * \arg wait for finish
 * \note This implementation will attempt to use the External API as much as possible,
 * but will cast to internal representation when needed (due to lack of API or
 * need for secret information). This is not an optimal OpenCL invocation.
 */
vx_status vxclCallOpenCLKernel(vx_node node, const vx_reference *parameters, vx_uint32 num)
{
    vx_status status = VX_FAILURE;
    vx_context context = node->base.context;
    vx_target target = (vx_target_t *)&node->base.context->targets[node->affinity];
    vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration);
    vx_uint32 pidx, pln, didx, plidx, argidx;
    cl_int err = 0;
    size_t off_dim[3] = {0,0,0};
    size_t work_dim[3];
    //size_t local_dim[3];
    cl_event writeEvents[VX_INT_MAX_PARAMS];
    cl_event readEvents[VX_INT_MAX_PARAMS];
    cl_int we = 0, re = 0;

    vxSemWait(&target->base.lock);

    // determine which platform to use
    plidx = 0;

    // determine which device to use
    didx = 0;

    /* for each input/bi data object, enqueue it and set the kernel parameters */
    for (argidx = 0, pidx = 0; pidx < num; pidx++)
    {
        vx_reference ref = node->parameters[pidx];
        vx_enum dir = node->kernel->signature.directions[pidx];
        vx_enum type = node->kernel->signature.types[pidx];
        vx_memory_t *memory = NULL;

        switch (type)
        {
            case VX_TYPE_ARRAY:
                memory = &((vx_array)ref)->memory;
                break;
            case VX_TYPE_CONVOLUTION:
                memory = &((vx_convolution)ref)->base.memory;
                break;
            case VX_TYPE_DISTRIBUTION:
                memory = &((vx_distribution)ref)->memory;
                break;
            case VX_TYPE_IMAGE:
                memory = &((vx_image)ref)->memory;
                break;
            case VX_TYPE_LUT:
                memory = &((vx_lut_t*)ref)->memory;
                break;
            case VX_TYPE_MATRIX:
                memory = &((vx_matrix)ref)->memory;
                break;
            //case VX_TYPE_PYRAMID:
            //    break;
            case VX_TYPE_REMAP:
                memory = &((vx_remap)ref)->memory;
                break;
            //case VX_TYPE_SCALAR:
            //case VX_TYPE_THRESHOLD:
            //    break;
        }
        if (memory) {
            for (pln = 0; pln < memory->nptrs; pln++) {
                if (memory->cl_type == CL_MEM_OBJECT_BUFFER) {
                    if (type == VX_TYPE_IMAGE) {

                        /* set the work dimensions */
                        work_dim[0] = memory->dims[pln][VX_DIM_X];
                        work_dim[1] = memory->dims[pln][VX_DIM_Y];

                        // width, height, stride_x, stride_y
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->dims[pln][VX_DIM_X]);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->dims[pln][VX_DIM_Y]);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]);
                        VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 4 parameters\n");
                    } else if (type == VX_TYPE_ARRAY || type == VX_TYPE_LUT) {
                        vx_array arr = (vx_array)ref;
                        // sizeof item, active count, capacity
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->item_size);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->num_items); // this is output?
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->capacity);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &arr->memory.strides[VX_DIM_X]);
                        VX_PRINT(VX_ZONE_INFO, "Setting vx_buffer as Buffer with 4 parameters\n");
                    } else if (type == VX_TYPE_MATRIX) {
                        vx_matrix mat = (vx_matrix)ref;
                        // columns, rows
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&mat->columns);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&mat->rows);
                        VX_PRINT(VX_ZONE_INFO, "Setting vx_matrix as Buffer with 2 parameters\n");
                    } else if (type == VX_TYPE_DISTRIBUTION) {
                        vx_distribution dist = (vx_distribution)ref;
                        // num, range, offset, winsize
                        vx_uint32 range = dist->memory.dims[0][VX_DIM_X] * dist->window_x;
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->memory.dims[VX_DIM_X]);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&range);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->offset_x);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->window_x);
                    } else if (type == VX_TYPE_CONVOLUTION) {
                        vx_convolution conv = (vx_convolution)ref;
                        // columns, rows, scale
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->base.columns);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->base.rows);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->scale);
                    }
                    err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(cl_mem), &memory->hdls[pln]);
                    CL_ERROR_MSG(err, "clSetKernelArg");
                    if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL)
                    {
                        err = clEnqueueWriteBuffer(context->queues[plidx][didx],
                                                   memory->hdls[pln],
                                                   CL_TRUE,
                                                   0,
                                                   vxComputeMemorySize(memory, pln),
                                                   memory->ptrs[pln],
                                                   0,
                                                   NULL,
                                                   &ref->event);
                    }
                } else if (memory->cl_type == CL_MEM_OBJECT_IMAGE2D) {
                    vx_rectangle_t rect = {0};
                    vx_image image = (vx_image)ref;
                    vxGetValidRegionImage(image, &rect);
                    size_t origin[3] = {rect.start_x, rect.start_y, 0};
                    size_t region[3] = {rect.end_x-rect.start_x, rect.end_y-rect.start_y, 1};
                    /* set the work dimensions */
                    work_dim[0] = rect.end_x-rect.start_x;
                    work_dim[1] = rect.end_y-rect.start_y;
                    VX_PRINT(VX_ZONE_INFO, "Setting vx_image as image2d_t wd={%zu,%zu} arg:%u\n",work_dim[0], work_dim[1], argidx);
                    err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(cl_mem), &memory->hdls[pln]);
                    CL_ERROR_MSG(err, "clSetKernelArg");
                    if (err != CL_SUCCESS) {
                        VX_PRINT(VX_ZONE_ERROR, "Error Calling Kernel %s, parameter %u\n", node->kernel->name, pidx);
                    }
                    if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL)
                    {
                        err = clEnqueueWriteImage(context->queues[plidx][didx],
                                                  memory->hdls[pln],
                                                  CL_TRUE,
                                                  origin, region,
                                                  memory->strides[pln][VX_DIM_Y],
                                                  0,
                                                  memory->ptrs[pln],
                                                  0, NULL,
                                                  &ref->event);
                        CL_ERROR_MSG(err, "clEnqueueWriteImage");
                    }
                }
            }
        } else {
            if (type == VX_TYPE_SCALAR) {
                vx_value_t value; // largest platform atomic
                vx_size size = 0ul;
                vx_scalar sc = (vx_scalar)ref;
                vx_enum stype = VX_TYPE_INVALID;
                vxReadScalarValue(sc, &value);
                vxQueryScalar(sc, VX_SCALAR_ATTRIBUTE_TYPE, &stype, sizeof(stype));
                size = vxSizeOfType(stype);
                err = clSetKernelArg(vxclk->kernels[plidx], argidx++, size, &value);
            }
            else if (type == VX_TYPE_THRESHOLD) {
                vx_enum ttype = 0;
                vx_threshold th = (vx_threshold)ref;
                vxQueryThreshold(th, VX_THRESHOLD_ATTRIBUTE_TYPE, &ttype, sizeof(ttype));
                if (ttype == VX_THRESHOLD_TYPE_BINARY) {
                    err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->value);
                } else if (ttype == VX_THRESHOLD_TYPE_RANGE) {
                    err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->lower);
                    err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->upper);
                }
            }
        }
    }
    we = 0;
    for (pidx = 0; pidx < num; pidx++) {
        vx_reference ref = node->parameters[pidx];
        vx_enum dir = node->kernel->signature.directions[pidx];
        if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) {
            memcpy(&writeEvents[we++],&ref->event, sizeof(cl_event));
        }
    }
    //local_dim[0] = 1;
    //local_dim[1] = 1;
    err = clEnqueueNDRangeKernel(context->queues[plidx][didx],
                                 vxclk->kernels[plidx],
                                 2,
                                 off_dim,
                                 work_dim,
                                 NULL,
                                 we, writeEvents, &node->base.event);

    CL_ERROR_MSG(err, "clEnqueueNDRangeKernel");
    /* enqueue a read on all output data */
    for (pidx = 0; pidx < num; pidx++)
    {
        vx_reference ref = node->parameters[pidx];
        vx_enum dir = node->kernel->signature.directions[pidx];
        vx_enum type = node->kernel->signature.types[pidx];

        if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL)
        {
            vx_memory_t *memory = NULL;

            switch (type)
            {
                case VX_TYPE_ARRAY:
                    memory = &((vx_array)ref)->memory;
                    break;
                case VX_TYPE_CONVOLUTION:
                    memory = &((vx_convolution)ref)->base.memory;
                    break;
                case VX_TYPE_DISTRIBUTION:
                    memory = &((vx_distribution)ref)->memory;
                    break;
                case VX_TYPE_IMAGE:
                    memory = &((vx_image)ref)->memory;
                    break;
                case VX_TYPE_LUT:
                    memory = &((vx_lut_t*)ref)->memory;
                    break;
                case VX_TYPE_MATRIX:
                    memory = &((vx_matrix)ref)->memory;
                    break;
                //case VX_TYPE_PYRAMID:
                //    break;
                case VX_TYPE_REMAP:
                    memory = &((vx_remap)ref)->memory;
                    break;
                //case VX_TYPE_SCALAR:
                //case VX_TYPE_THRESHOLD:
                //    break;
            }
            if (memory) {
                for (pln = 0; pln < memory->nptrs; pln++) {
                    if (memory->cl_type == CL_MEM_OBJECT_BUFFER) {
                        err = clEnqueueReadBuffer(context->queues[plidx][didx],
                            memory->hdls[pln],
                            CL_TRUE, 0, vxComputeMemorySize(memory, pln),
                            memory->ptrs[pln],
                            1, &node->base.event, &ref->event);
                        CL_ERROR_MSG(err, "clEnqueueReadBuffer");
                    } else if (memory->cl_type == CL_MEM_OBJECT_IMAGE2D) {
                        vx_rectangle_t rect = {0};
                        vx_image image = (vx_image)ref;
                        vxGetValidRegionImage(image, &rect);
                        size_t origin[3] = {rect.start_x, rect.start_y, 0};
                        size_t region[3] = {rect.end_x-rect.start_x, rect.end_y-rect.start_y, 1};
                        /* set the work dimensions */
                        work_dim[0] = rect.end_x-rect.start_x;
                        work_dim[1] = rect.end_y-rect.start_y;
                        err = clEnqueueReadImage(context->queues[plidx][didx],
                                                  memory->hdls[pln],
                                                  CL_TRUE,
                                                  origin, region,
                                                  memory->strides[pln][VX_DIM_Y],
                                                  0,
                                                  memory->ptrs[pln],
                                                  1, &node->base.event,
                                                  &ref->event);
                        CL_ERROR_MSG(err, "clEnqueueReadImage");
                        VX_PRINT(VX_ZONE_INFO, "Reading Image wd={%zu,%zu}\n", work_dim[0], work_dim[1]);
                    }
                }
            }
        }
    }
    re = 0;
    for (pidx = 0; pidx < num; pidx++) {
        vx_reference ref = node->parameters[pidx];
        vx_enum dir = node->kernel->signature.directions[pidx];
        if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) {
            memcpy(&readEvents[re++],&ref->event, sizeof(cl_event));
        }
    }
    err = clFlush(context->queues[plidx][didx]);
    CL_ERROR_MSG(err, "Flush");
    VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n");
    clWaitForEvents(re, readEvents);
    if (err == CL_SUCCESS)
        status = VX_SUCCESS;
//exit:
    VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status);
    vxSemPost(&target->base.lock);
    return status;
}
Esempio n. 25
0
void spmv_b4ell_ocl(b4ell_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, int& optmethod, char* oclfilename, cl_device_type deviceType, float* coores, int ntimes, int bw, int bh)
{
    cl_device_id* devices = NULL;
    cl_context context = NULL;
    cl_command_queue cmdQueue = NULL;
    cl_program program = NULL;

    assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1);

    cl_int errorCode = CL_SUCCESS;

    //Create device memory objects
    cl_mem devColid;
    cl_mem devData;
    cl_mem devVec;
    cl_mem devRes;
    cl_mem devTexVec;

    //Initialize values
    int col_align = mat->b4ell_height_aligned;
    int data_align = mat->b4ell_float4_aligned;
    int nnz = mat->matinfo.nnz;
    int rownum = mat->matinfo.height;
    int blockrownum = mat->b4ell_row_num;
    int vecsize = mat->matinfo.width;
    int b4ellnum = mat->b4ell_block_num;
    int bwidth = mat->b4ell_bwidth;
    int bheight = mat->b4ell_bheight;
    int width4num = bwidth / 4;
    int padveclen = findPaddedSize(vecsize, 8);
    float* paddedvec = (float*)malloc(sizeof(float)*padveclen);
    memset(paddedvec, 0, sizeof(float)*padveclen);
    memcpy(paddedvec, vec, sizeof(float)*vecsize);
    ALLOCATE_GPU_READ(devColid, mat->b4ell_col_id, sizeof(int)*col_align*b4ellnum);
    ALLOCATE_GPU_READ(devData, mat->b4ell_data, sizeof(float)*data_align*bheight*width4num*b4ellnum);
    ALLOCATE_GPU_READ(devVec, paddedvec, sizeof(float)*padveclen);
    int paddedres = findPaddedSize(rownum, 512);
    devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR;
    errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR;
    const cl_image_format floatFormat =
    {
	CL_RGBA,
	CL_FLOAT,
    };


    int width = VEC2DWIDTH;
    int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH;
    if (height % 4 != 0)
	height += (4 - (height % 4));
    float* image2dVec = (float*)malloc(sizeof(float)*width*height);
    memset(image2dVec, 0, sizeof(float)*width*height);
    for (int i = 0; i < vecsize; i++)
    {
	image2dVec[i] = vec[i];
    }
    size_t origin[] = {0, 0, 0};
    size_t vectorSize[] = {width, height/4, 1};
    devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height/4, 0, NULL, &errorCode); CHECKERROR;
    errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR;
    clFinish(cmdQueue);

    //printf("\nvec length %d padded length %d", mat->matinfo.width, padveclength);

    opttime = 10000.0f;
    optmethod = 0;
    int dim2 = dim2Size;
    {
	int methodid = 0;
	cl_uint work_dim = 2;
	size_t blocksize[] = {BELL_GROUP_SIZE, 1};
	int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE;
	size_t globalsize[] = {gsize, dim2};
	int data_align4 = data_align / 4;
	char kernelname[100] = "gpu_bell00";
	kernelname[8] += bh;
	kernelname[9] += bw;

	cl_kernel csrKernel = NULL;
	csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 2, sizeof(int),    &data_align4); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 3, sizeof(int),    &col_align); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 4, sizeof(int),    &b4ellnum); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devVec); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 7, sizeof(int),    &blockrownum); CHECKERROR;

	errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR;
	clFinish(cmdQueue);
	errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	clFinish(cmdQueue);
	float* tmpresult = (float*)malloc(sizeof(float)*rownum);
	errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR;
	clFinish(cmdQueue);
	two_vec_compare(coores, tmpresult, rownum);
	free(tmpresult);

	for (int k = 0; k < 3; k++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);

	double teststart = timestamp();
	for (int i = 0; i < ntimes; i++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);
	double testend = timestamp();
	double time_in_sec = (testend - teststart)/(double)dim2;
	double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9;
	printf("\nBELL %dx%d block cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw,  time_in_sec / (double) ntimes * 1000, gflops, methodid);

	if (csrKernel)
	    clReleaseKernel(csrKernel);

	double onetime = time_in_sec / (double) ntimes;
	if (onetime < opttime)
	{
	    opttime = onetime;
	    optmethod = methodid;
	}

    }
    {
	int methodid = 1;
	cl_uint work_dim = 2;
	size_t blocksize[] = {BELL_GROUP_SIZE, 1};
	int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE;
	size_t globalsize[] = {gsize, dim2};
	int data_align4 = data_align / 4;
	char kernelname[100] = "gpu_bell00_mad";
	kernelname[8] += bh;
	kernelname[9] += bw;

	cl_kernel csrKernel = NULL;
	csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 2, sizeof(int),    &data_align4); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 3, sizeof(int),    &col_align); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 4, sizeof(int),    &b4ellnum); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devVec); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 7, sizeof(int),    &blockrownum); CHECKERROR;

	errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR;
	clFinish(cmdQueue);
	errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	clFinish(cmdQueue);
	float* tmpresult = (float*)malloc(sizeof(float)*rownum);
	errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR;
	clFinish(cmdQueue);
	two_vec_compare(coores, tmpresult, rownum);
	free(tmpresult);

	for (int k = 0; k < 3; k++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);

	double teststart = timestamp();
	for (int i = 0; i < ntimes; i++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);
	double testend = timestamp();
	double time_in_sec = (testend - teststart)/(double)dim2;
	double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9;
	printf("\nBELL %dx%d block mad cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw,  time_in_sec / (double) ntimes * 1000, gflops, methodid);

	if (csrKernel)
	    clReleaseKernel(csrKernel);

	double onetime = time_in_sec / (double) ntimes;
	if (onetime < opttime)
	{
	    opttime = onetime;
	    optmethod = methodid;
	}

    }
    {
	int methodid = 100;
	cl_uint work_dim = 2;
	size_t blocksize[] = {BELL_GROUP_SIZE, 1};
	int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE;
	size_t globalsize[] = {gsize, dim2};
	int data_align4 = data_align / 4;
	char kernelname[100] = "gpu_bell00_tx";
	kernelname[8] += bh;
	kernelname[9] += bw;

	cl_kernel csrKernel = NULL;
	csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 2, sizeof(int),    &data_align4); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 3, sizeof(int),    &col_align); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 4, sizeof(int),    &b4ellnum); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devTexVec); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 7, sizeof(int),    &blockrownum); CHECKERROR;

	errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR;
	clFinish(cmdQueue);
	errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	clFinish(cmdQueue);
	float* tmpresult = (float*)malloc(sizeof(float)*rownum);
	errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR;
	clFinish(cmdQueue);
	two_vec_compare(coores, tmpresult, rownum);
	free(tmpresult);

	for (int k = 0; k < 3; k++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);

	double teststart = timestamp();
	for (int i = 0; i < ntimes; i++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);
	double testend = timestamp();
	double time_in_sec = (testend - teststart)/(double)dim2;
	double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9;
	printf("\nBELL %dx%d block tx cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw,  time_in_sec / (double) ntimes * 1000, gflops, methodid);

	if (csrKernel)
	    clReleaseKernel(csrKernel);

	double onetime = time_in_sec / (double) ntimes;
	if (onetime < opttime)
	{
	    opttime = onetime;
	    optmethod = methodid;
	}

    }
    {
	int methodid = 101;
	cl_uint work_dim = 2;
	size_t blocksize[] = {BELL_GROUP_SIZE, 1};
	int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE;
	size_t globalsize[] = {gsize, dim2};
	int data_align4 = data_align / 4;
	char kernelname[100] = "gpu_bell00_mad_tx";
	kernelname[8] += bh;
	kernelname[9] += bw;

	cl_kernel csrKernel = NULL;
	csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 2, sizeof(int),    &data_align4); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 3, sizeof(int),    &col_align); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 4, sizeof(int),    &b4ellnum); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devTexVec); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 7, sizeof(int),    &blockrownum); CHECKERROR;

	errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR;
	clFinish(cmdQueue);
	errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	clFinish(cmdQueue);
	float* tmpresult = (float*)malloc(sizeof(float)*rownum);
	errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR;
	clFinish(cmdQueue);
	two_vec_compare(coores, tmpresult, rownum);
	free(tmpresult);

	for (int k = 0; k < 3; k++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);

	double teststart = timestamp();
	for (int i = 0; i < ntimes; i++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);
	double testend = timestamp();
	double time_in_sec = (testend - teststart)/(double)dim2;
	double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9;
	printf("\nBELL %dx%d block mad tx cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw,  time_in_sec / (double) ntimes * 1000, gflops, methodid);

	if (csrKernel)
	    clReleaseKernel(csrKernel);

	double onetime = time_in_sec / (double) ntimes;
	if (onetime < opttime)
	{
	    opttime = onetime;
	    optmethod = methodid;
	}

    }

    //Clean up
    if (image2dVec)
	free(image2dVec);

    if (devColid)
	clReleaseMemObject(devColid);
    if (devData)
	clReleaseMemObject(devData);
    if (devVec)
	clReleaseMemObject(devVec);
    if (devTexVec)
	clReleaseMemObject(devTexVec);
    if (devRes)
	clReleaseMemObject(devRes);


    freeObjects(devices, &context, &cmdQueue, &program);
}
Esempio n. 26
0
int main(int argc, char **argv) 
{
   /* Host data */
   float *hInputImage = NULL; 
   float *hOutputImage = NULL;

   /* Angle for rotation (degrees) */
   const float theta = 45.0f;

   /* Allocate space for the input image and read the
    * data from disk */
   int imageRows;
   int imageCols;
   hInputImage = readBmpFloat("../../Images/cat-face.bmp", &imageRows, &imageCols);
   const int imageElements = imageRows*imageCols;
   const size_t imageSize = imageElements*sizeof(float);

   /* Allocate space for the output image */
   hOutputImage = (float*)malloc(imageSize);
   if (!hOutputImage) { exit(-1); }

   /* Use this to check the output of each API call */
   cl_int status;

   /* Get the first platform */
   cl_platform_id platform;
   status = clGetPlatformIDs(1, &platform, NULL);
   check(status);

   /* Get the first device */
   cl_device_id device;
   status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
   check(status);

   /* Create a context and associate it with the device */
   cl_context context;
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
   check(status);

   /* Create a command queue and associate it with the device */
   cl_command_queue cmdQueue;
   cmdQueue = clCreateCommandQueue(context, device, 0, &status);
   check(status);

   /* The image descriptor describes how the data will be stored 
    * in memory. This descriptor initializes a 2D image with no pitch */
   cl_image_desc desc;
   desc.image_type = CL_MEM_OBJECT_IMAGE2D;
   desc.image_width = imageCols;
   desc.image_height = imageRows;
   desc.image_depth = 0;
   desc.image_array_size = 0;
   desc.image_row_pitch = 0;
   desc.image_slice_pitch = 0;
   desc.num_mip_levels = 0;
   desc.num_samples = 0;
   desc.buffer = NULL;

   /* The image format describes the properties of each pixel */
   cl_image_format format;
   format.image_channel_order = CL_R; // single channel
   format.image_channel_data_type = CL_FLOAT;

   /* Create the input image and initialize it using a 
    * pointer to the image data on the host. */
   cl_mem inputImage = clCreateImage(context, CL_MEM_READ_ONLY,
      &format, &desc, NULL, NULL);

   /* Create the output image */
   cl_mem outputImage = clCreateImage(context, CL_MEM_WRITE_ONLY,
      &format, &desc, NULL, NULL);

   /* Copy the host image data to the device */
   size_t origin[3] = {0, 0, 0}; // Offset within the image to copy from
   size_t region[3] = {imageCols, imageRows, 1}; // Elements to per dimension
   clEnqueueWriteImage(cmdQueue, inputImage, CL_TRUE, 
      origin, region, 0 /* row-pitch */, 0 /* slice-pitch */, 
      hInputImage, 0, NULL, NULL);

   /* Create a program with source code */
   char *programSource = readFile("image-rotation.cl");
   size_t programSourceLen = strlen(programSource);
   cl_program program = clCreateProgramWithSource(context, 1,
      (const char**)&programSource, &programSourceLen, &status);
   check(status);

   /* Build (compile) the program for the device */
   status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
   if (status != CL_SUCCESS) {
      printCompilerError(program, device);
      exit(-1);
   }

   /* Create the kernel */
   cl_kernel kernel;
   kernel = clCreateKernel(program, "rotation", &status);
   check(status);

   /* Set the kernel arguments */
   status  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
   status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
   status |= clSetKernelArg(kernel, 2, sizeof(int), &imageCols);
   status |= clSetKernelArg(kernel, 3, sizeof(int), &imageRows);
   status |= clSetKernelArg(kernel, 4, sizeof(float), &theta);
   check(status);

   /* Define the index space and work-group size */
   size_t globalWorkSize[2];
   globalWorkSize[0] = imageCols;
   globalWorkSize[1] = imageRows;

   size_t localWorkSize[2];
   localWorkSize[0] = 8;
   localWorkSize[1] = 8;

   /* Enqueue the kernel for execution */
   status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL,
      globalWorkSize, localWorkSize, 0, NULL, NULL);
   check(status);

   /* Read the output image buffer to the host */
   status = clEnqueueReadImage(cmdQueue, outputImage, CL_TRUE, 
      origin, region, 0 /* row-pitch */, 0 /* slice-pitch */, 
      hOutputImage, 0, NULL, NULL);
   check(status);

   /* Write the output image to file */
   writeBmpFloat(hOutputImage, "rotated-cat.bmp", imageRows, imageCols, 
      "../../Images/cat-face.bmp"); 

   /* Free OpenCL resources */
   clReleaseKernel(kernel);
   clReleaseProgram(program);
   clReleaseCommandQueue(cmdQueue);
   clReleaseMemObject(inputImage);
   clReleaseMemObject(outputImage);
   clReleaseContext(context);

   /* Free host resources */
   free(hInputImage);
   free(hOutputImage);
   free(programSource);

   return 0;
}
Esempio n. 27
0
void spmv_sell_ocl(sell_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, double& optflop, int& optmethod, char* oclfilename, cl_device_type deviceType, int ntimes, double* floptable)
{
    cl_device_id* devices = NULL;
    cl_context context = NULL;
    cl_command_queue cmdQueue = NULL;
    cl_program program = NULL;

    assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1);

    cl_int errorCode = CL_SUCCESS;

    //Create device memory objects
    cl_mem devSlicePtr;
    cl_mem devColid;
    cl_mem devData;
    cl_mem devVec;
    cl_mem devRes;
    cl_mem devTexVec;

    //Initialize values
    int nnz = mat->matinfo.nnz;
    int rownum = mat->matinfo.height;
    int vecsize = mat->matinfo.width;
    int sliceheight = mat->sell_slice_height;
    int slicenum = mat->sell_slice_num;
    int datasize = mat->sell_slice_ptr[slicenum];
    ALLOCATE_GPU_READ(devSlicePtr, mat->sell_slice_ptr, sizeof(int)*(slicenum + 1));
    ALLOCATE_GPU_READ(devColid, mat->sell_col_id, sizeof(int)*datasize);
    ALLOCATE_GPU_READ(devData, mat->sell_data, sizeof(float)*datasize);
    ALLOCATE_GPU_READ(devVec, vec, sizeof(float)*vecsize);
    int paddedres = findPaddedSize(rownum, 512);
    devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR;
    errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR;
    const cl_image_format floatFormat =
    {
	CL_R,
	CL_FLOAT,
    };


    int width = VEC2DWIDTH;
    int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH;
    float* image2dVec = (float*)malloc(sizeof(float)*width*height);
    memset(image2dVec, 0, sizeof(float)*width*height);
    for (int i = 0; i < vecsize; i++)
    {
	image2dVec[i] = vec[i];
    }
    size_t origin[] = {0, 0, 0};
    size_t vectorSize[] = {width, height, 1};
    devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height, 0, NULL, &errorCode); CHECKERROR;
    errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR;
    clFinish(cmdQueue);

    //printf("\nvec length %d padded length %d", mat->matinfo.width, padveclength);

    int dim2 = dim2Size;
    if (sliceheight == WARPSIZE)
    {
	int methodid = 0;
	cl_uint work_dim = 2;
	size_t blocksize[] = {SELL_GROUP_SIZE, 1};
	int gsize = ((rownum + SELL_GROUP_SIZE - 1)/SELL_GROUP_SIZE)*SELL_GROUP_SIZE;
	size_t globalsize[] = {gsize, dim2};
	//printf("gsize %d rownum %d slicenum %d sliceheight %d datasize %d nnz %d vecsize %d \n", gsize, rownum, slicenum, sliceheight, datasize, nnz, vecsize);
	//int warpnum = SELL_GROUP_SIZE / WARPSIZE;

	cl_kernel csrKernel = NULL;
	csrKernel = clCreateKernel(program, "gpu_sell_warp", &errorCode); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devSlicePtr); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColid); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 5, sizeof(int),    &slicenum); CHECKERROR;

	for (int k = 0; k < 3; k++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);

	double teststart = timestamp();
	for (int i = 0; i < ntimes; i++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);
	double testend = timestamp();
	double time_in_sec = (testend - teststart)/(double)dim2;
	double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9;
	printf("\nSELL cpu warp time %lf ms GFLOPS %lf code %d \n\n",   time_in_sec / (double) ntimes * 1000, gflops, methodid);

	if (csrKernel)
	    clReleaseKernel(csrKernel);

	double onetime = time_in_sec / (double) ntimes;
	floptable[methodid] = gflops;
	if (onetime < opttime)
	{
	    opttime = onetime;
	    optmethod = methodid;
	    optflop = gflops;
	}

    }

    if (sliceheight == SELL_GROUP_SIZE)
    {
	int methodid = 1;
	cl_uint work_dim = 2;
	size_t blocksize[] = {SELL_GROUP_SIZE, 1};
	int gsize = slicenum * SELL_GROUP_SIZE;
	size_t globalsize[] = {gsize, dim2};

	cl_kernel csrKernel = NULL;
	csrKernel = clCreateKernel(program, "gpu_sell_group", &errorCode); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devSlicePtr); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColid); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR;
	errorCode = clSetKernelArg(csrKernel, 5, sizeof(int),    &slicenum); CHECKERROR;

	for (int k = 0; k < 3; k++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);

	double teststart = timestamp();
	for (int i = 0; i < ntimes; i++)
	{
	    errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR;
	}
	clFinish(cmdQueue);
	double testend = timestamp();
	double time_in_sec = (testend - teststart)/(double)dim2;
	double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9;
	printf("\nSELL cpu group time %lf ms GFLOPS %lf code %d \n\n",   time_in_sec / (double) ntimes * 1000, gflops, methodid);

	if (csrKernel)
	    clReleaseKernel(csrKernel);

	double onetime = time_in_sec / (double) ntimes;
	floptable[methodid] = gflops;
	if (onetime < opttime)
	{
	    opttime = onetime;
	    optmethod = methodid;
	    optflop = gflops;
	}

    }

    //Clean up
    if (image2dVec)
	free(image2dVec);

    if (devSlicePtr)
	clReleaseMemObject(devSlicePtr);
    if (devColid)
	clReleaseMemObject(devColid);
    if (devData)
	clReleaseMemObject(devData);
    if (devVec)
	clReleaseMemObject(devVec);
    if (devTexVec)
	clReleaseMemObject(devTexVec);
    if (devRes)
	clReleaseMemObject(devRes);


    freeObjects(devices, &context, &cmdQueue, &program);
}
Esempio n. 28
0
/** vglClUpload branch3d

 */
void vglClUpload(VglImage* img)
{
    if (Interop && img->nChannels > 1)
    {
        vglClUploadInterop(img);
    }
    else
    {
        if (img->nChannels == 3)
        {
            fprintf(stderr, "%s: %s: Error: image with 3 channels not supported. Please convert to 4 channels.\n", __FILE__, __FUNCTION__);
            exit(1);
        }
        cl_int err;

        if (    !vglIsInContext(img, VGL_RAM_CONTEXT)   && 
                !vglIsInContext(img, VGL_BLANK_CONTEXT)    )
        {
            fprintf(stderr, "vglClUpload: Error: image context = %d not in VGL_RAM_CONTEXT or VGL_BLANK_CONTEXT\n", img->inContext);
            return;
        }

        if (img->oclPtr == NULL)
        {
            /*if (img->fbo != -1)
            {
                img->oclPtr = clCreateFromGLTexture2D(cl.context,CL_MEM_READ_WRITE,GL_TEXTURE_2D,0,img->fbo,&err);
                vglClCheckError( err, (char*) "clCreateFromGlTexture2D interop" );
                clEnqueueAcquireGLObjects(cl.commandQueue, 1, &img->oclPtr, 0,0,0);
            }
            else
            {*/

            cl_image_format format;
            if (img->nChannels == 1)
            {
                format.image_channel_order = CL_R;
            }
            else if (img->nChannels == 4)
            {
                format.image_channel_order = CL_RGBA;
            }

            if (img->depth == IPL_DEPTH_8U)
	    {
                format.image_channel_data_type = CL_UNORM_INT8;
	    }
            else if (img->depth == IPL_DEPTH_16U)
	    {
                format.image_channel_data_type = CL_UNORM_INT16;
	    }
            else if (img->depth == IPL_DEPTH_32S)
	    {
                format.image_channel_data_type = CL_SIGNED_INT32;
	    }
            else
            {
                fprintf(stderr, "%s: %s: Error: Unsupported image depth = %d.\n", __FILE__, __FUNCTION__, img->depth);
                format.image_channel_data_type = CL_UNORM_INT8;
	    }

            if (img->ndim == 2)
            {
                img->oclPtr = clCreateImage2D(cl.context, CL_MEM_READ_WRITE, &format, img->shape[VGL_WIDTH], img->shape[VGL_HEIGHT], 0, NULL, &err);
                vglClCheckError( err, (char*) "clCreateImage2D" );
            }
            else if(img->ndim == 3)
            {
                img->oclPtr = clCreateImage3D(cl.context, CL_MEM_READ_WRITE, &format, img->shape[VGL_WIDTH], img->shape[VGL_HEIGHT], img->shape[VGL_LENGTH], 0, 0, NULL, &err);
                vglClCheckError( err, (char*) "clCreateImage3D" );
            }
            else
            {
                img->oclPtr = clCreateBuffer(cl.context, CL_MEM_READ_WRITE, img->getTotalSizeInBytes(), NULL, &err);
                vglClCheckError( err, (char*) "clCreateNDImage" );
            }
            /*
            cl_image_desc desc;
            if (img->ndim == 2)
            {
                desc.image_type = CL_MEM_OBJECT_IMAGE2D;
                desc.image_width = img->shape[VGL_WIDTH];
                desc.image_height = img->shape[VGL_HEIGHT];
                desc.image_depth = 0;
                desc.image_array_size = 1;
                desc.image_row_pitch = 0;
                desc.image_slice_pitch = 0;
                desc.num_mip_levels = 0;
                desc.num_samples = 0;
                desc.buffer = NULL;
            }
            else
            {
                desc.image_type = CL_MEM_OBJECT_IMAGE3D;
                desc.image_width = img->shape[VGL_WIDTH];
                desc.image_height = img->shape[VGL_HEIGHT];
                desc.image_depth = img->shape[VGL_LENGTH];
                desc.image_array_size = 0;
                desc.image_row_pitch = 0;
                desc.image_slice_pitch = 0;
                desc.num_mip_levels = 0;
                desc.num_samples = 0;
                desc.buffer = NULL;
            }            
            img->oclPtr = clCreateImage(cl.context,CL_MEM_READ_WRITE, &format, &desc,NULL,&err);
            vglClCheckError(err, (char*) "clCreateImage");
*/
        }

        if (vglIsInContext(img, VGL_RAM_CONTEXT))
        {
            size_t Origin[3] = { 0, 0, 0};
            int nFrames = 1;
            if(img->ndim == 3)
            {
                nFrames = img->shape[VGL_LENGTH];
            }

            void* imageData = img->getImageData();
            if (!imageData)
            {
                fprintf(stderr, "%s: %s: Error: both ipl and ndarray are NULL.\n", __FILE__, __FUNCTION__);
                exit(1);
            }
   
            if ( (img->ndim == 2) || (img->ndim == 3) )
            {
                size_t Size3d[3] = {img->shape[VGL_WIDTH], img->shape[VGL_HEIGHT], nFrames};
                err = clEnqueueWriteImage( cl.commandQueue, img->oclPtr, CL_TRUE, Origin, Size3d, 0, 0, (char*)imageData, 0, NULL, NULL );
                vglClCheckError( err, (char*) "clEnqueueWriteImage" );
                clFinish(cl.commandQueue);
            }
            else
            {
                err = clEnqueueWriteBuffer(cl.commandQueue, img->oclPtr, CL_TRUE, 0, img->getTotalSizeInBytes(), imageData, 0, NULL, NULL);
                vglClCheckError( err, (char*) "clEnqueueWriteBuffer" );
                clFinish(cl.commandQueue);
            }
        }
        vglAddContext(img, VGL_CL_CONTEXT);
    }
}
Esempio n. 29
0
 int main()		
 {		
 	cl_platform_id platform_id = NULL;	
 	cl_uint ret_num_platforms;	
 	cl_device_id device_id = NULL;	
 	cl_uint ret_num_devices;	
 	cl_context context = NULL;	
 	cl_command_queue command_queue = NULL;	
 	cl_program program = NULL;	
 	cl_kernel kernel = NULL;	
 	size_t kernel_code_size;	
 	char *kernel_src_str;	
 	float *result;	
 	cl_int ret;	
 	int i;	
 	FILE *fp;	
 	size_t r_size;	
 		
 	cl_mem image, out;	
 	cl_bool support;	
 	cl_image_format fmt;	
 		
 	int num_out = 9;	
 		
 	clGetPlatformIDs(1, &platform_id, &ret_num_platforms);	
 	clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,	
 		&ret_num_devices);
 	context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);	
 	result = (float*)malloc(sizeof(cl_float4)*num_out);	
 		
 	/* Check if the device support images */	
 	clGetDeviceInfo(device_id, CL_DEVICE_IMAGE_SUPPORT, sizeof(support), &support, &r_size);	
 	if (support != CL_TRUE) {	
 		puts("image not supported");
 		return 1;
 	}	
 		
 	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);	
 	fp = fopen("image.cl", "r");	
 	kernel_src_str = (char*)malloc(MAX_SOURCE_SIZE);	
 	kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);	
 	fclose(fp);	
 		
 	/* Create output buffer */	
 	out = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4)*num_out, NULL, &ret);	
 		
 	/* Create data format for image creation */	
 	fmt.image_channel_order = CL_R;	
 	fmt.image_channel_data_type = CL_FLOAT;	
 		
 	/* Create Image Object */	
 	image = clCreateImage2D(context, CL_MEM_READ_ONLY, &fmt, 4, 4, 0, 0, NULL);	
 		
 	/* Set parameter to be used to transfer image object */	
 	size_t origin[] = {0, 0, 0}; /* Transfer target coordinate*/	
 	size_t region[] = {4, 4, 1}; /* Size of object to be transferred */	
 		
 	float data[] = { /* Transfer Data */	
 		10, 20, 30, 40,
 		10, 20, 30, 40,
 		10, 20, 30, 40,
 		10, 20, 30, 40,
 	};	
 		
 	/* Transfer to device */	
 	clEnqueueWriteImage(command_queue, image, CL_TRUE, origin, region, 4*sizeof(float), 0, data, 0, NULL, NULL);	
 		
 	/* Build program */	
 	program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,	
 	(const size_t *)&kernel_code_size, &ret);	
 	clBuildProgram(program, 1, &device_id, "", NULL, NULL);	
 	kernel = clCreateKernel(program, "image_test", &ret);	
 		
 	/* Set Kernel Arguments */	
 	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&image);	
 	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&out);	
 		
 	cl_event ev;	
 	clEnqueueTask(command_queue, kernel, 0, NULL, &ev);	
 		
 	/* Retrieve result */	
 	clEnqueueReadBuffer(command_queue, out, CL_TRUE, 0, sizeof(cl_float4)*num_out, result, 0, NULL, NULL);	
 		
 	for (i=0; i < num_out; i++) {	
 	printf("%f,%f,%f,%f\n",result[i*4+0],result[i*4+1],result[i*4+2],result[i*4+3]);	
 	}	
 		
 	clReleaseMemObject(out);	
 	clReleaseMemObject(image);	
 		
 	clReleaseKernel(kernel);	
 	clReleaseProgram(program);	
 	clReleaseCommandQueue(command_queue);	
 	clReleaseContext(context);	
 		
 	free(kernel_src_str);	
 	free(result);	
 		
 	return 0;	
 }		
Esempio n. 30
0
END_TEST

START_TEST (test_read_write_image)
{
    cl_platform_id platform = 0;
    cl_device_id device;
    cl_context ctx;
    cl_command_queue queue;
    cl_mem image2d, part2d;
    cl_int result;

    unsigned char image2d_data_24bpp[3*3*4] = {
        255, 0, 0, 0,       0, 255, 0, 0,       128, 128, 128, 0,
        0, 0, 255, 0,       255, 255, 0, 0,     0, 128, 0, 0,
        255, 128, 0, 0,     128, 0, 255, 0,     0, 0, 0, 0
    };

    unsigned char image2d_part_24bpp[2*2*4] = {
        255, 0, 0, 0,       0, 255, 0, 0,
        0, 0, 255, 0,       255, 255, 0, 0
    };

    unsigned char image2d_buffer[3*3*4];
    unsigned char image2d_part[2*2*4];

    cl_image_format fmt;

    fmt.image_channel_data_type = CL_UNORM_INT8;
    fmt.image_channel_order = CL_RGBA;

    size_t origin[3] = {0, 0, 0};
    size_t region[3] = {3, 3, 1};

    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
    fail_if(
        result != CL_SUCCESS,
        "unable to get the default device"
    );

    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
    fail_if(
        result != CL_SUCCESS || ctx == 0,
        "unable to create a valid context"
    );

    queue = clCreateCommandQueue(ctx, device, 0, &result);
    fail_if(
        result != CL_SUCCESS || queue == 0,
        "cannot create a command queue"
    );

    image2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt,
                              3, 3, 0, image2d_buffer, &result);
    fail_if(
        result != CL_SUCCESS || image2d == 0,
        "cannot create a valid 3x3 image2D"
    );

    part2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt,
                             2, 2, 0, image2d_part, &result);
    fail_if(
        result != CL_SUCCESS || image2d == 0,
        "cannot create a valid 2x2 image2D"
    );

    // Write data in buffer
    result = clEnqueueWriteImage(queue, image2d, 1, origin, region, 0, 0,
                                 image2d_data_24bpp, 0, 0, 0);
    fail_if(
        result != CL_SUCCESS,
        "cannot enqueue a blocking write image event"
    );

    // Read it back
    region[0] = 2;
    region[1] = 2;

    result = clEnqueueReadImage(queue, image2d, 1, origin, region, 0, 0,
                                image2d_part, 0, 0, 0);
    fail_if(
        result != CL_SUCCESS,
        "cannot enqueue a blocking read image event"
    );

    // Compare
    fail_if(
        std::memcmp(image2d_part, image2d_part_24bpp, sizeof(image2d_part)) != 0,
        "reading and writing images doesn't produce the correct result"
    );

    // Read it back using a buffer
    cl_event event;
    std::memset(image2d_part, 0, sizeof(image2d_part));

    result = clEnqueueCopyImage(queue, image2d, part2d, origin, origin,
                                region, 0, 0, &event);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue a copy image event"
    );

    result = clWaitForEvents(1, &event);
    fail_if(
        result != CL_SUCCESS,
        "unable to wait for events"
    );

    // Compare
    fail_if(
        std::memcmp(image2d_part, image2d_part_24bpp, sizeof(image2d_part)) != 0,
        "copying images doesn't produce the correct result"
    );

    clReleaseEvent(event);
    clReleaseMemObject(part2d);
    clReleaseMemObject(image2d);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
}