Example #1
Workers::Workers() {
  int size = matrixSize * matrixSize; 
  A = new ElementType[size];
  B = new ElementType[size];
  C = new ElementType[size]; 
  
  randomInit(A, size); 
  randomInit(B, size); 
}
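Note: the randomInit helper itself is not part of this example. A minimal sketch of what it typically does, assuming ElementType is a floating-point type and <cstdlib> is included (an assumption, not this project's actual code):

void randomInit(ElementType* data, int size) {
  // fill the buffer with pseudorandom values in [0, 1]
  for (int i = 0; i < size; ++i)
    data[i] = static_cast<ElementType>(rand()) / static_cast<ElementType>(RAND_MAX);
}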
Example #2
int main(void)
{
    randomInit();
    DDRB |= (1<<PB2) | (1<<PB3) | (1<<PB4) | (1<<PB5);
    startDemo(3, 0b00000011);
    _delay_ms(500);

    unsigned char length = 1;
    while(1)
    {
        /*unsigned char buttons = getButton();
        ledSet(buttons, 1);
        ledSet(~buttons, 0);*/
        generate(length);
        if(check(length))
        {
            gameOver(length - 1);
            length = 0;
        }
        _delay_ms(500);

        length++;
    }

    return 0;
}
Example #3
  void Initialize(arma::Mat<eT>& W, const size_t rows, const size_t cols)
  {
    RandomInitialization randomInit(-gamma, gamma);
    randomInit.Initialize(W, rows, cols);

    W = (b / (k  * rows)) * arma::sqrt(W + 1);
  }
Example #4
 void Initialize(arma::Mat<eT>& W, const size_t rows, const size_t cols)
 {
   arma::Row<eT> b = s * arma::sqrt(3 / (rows * dataSum));
   const double theta = b.min();
   RandomInitialization randomInit(-theta, theta);
   randomInit.Initialize(W, rows, cols);
 }
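Note: both snippets above delegate to a RandomInitialization policy (mlpack-style) that fills W with uniform draws from [lowerBound, upperBound]. A rough sketch of that step, assuming Armadillo and hypothetical member names lowerBound/upperBound:

template<typename eT>
void Initialize(arma::Mat<eT>& W, const size_t rows, const size_t cols)
{
  W.randu(rows, cols);                             // uniform draws in [0, 1]
  W = lowerBound + (upperBound - lowerBound) * W;  // rescale to [lowerBound, upperBound]
}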
Example #5
// -------------------------------------
// Main function
// -------------------------------------
void appMain(void)
{
    uint16_t i;

    // timers (used to get an extra interrupt context)
    alarmInit(&timer, onTimer, NULL);
    alarmSchedule(&timer, 1000);

    // radio
    radioSetReceiveHandle(radioRecvCb);
    radioOn();

    for (i = 0; i < BUFFER_SIZE; i++) {
        buffer[i] = i;
    }

    randomInit();

    // SELECT_FLASH;
    // extFlashBulkErase();
    // UNSELECT_FLASH;

    for (i = 0; ; i++) {
        uint32_t address = i * 64ul;

        SELECT_FLASH;

        if (IS_ALIGNED(address, EXT_FLASH_SECTOR_SIZE)) {
            PRINTF("erase address %lu\n", address);
            flashErase(address);
        }

        PRINTF("write address %lu\n", address);
        flashWrite(address);

        if (address > 0) {
            PRINTF("verify...\n");
            flashRead(address - 64);
        }

        UNSELECT_FLASH;

        msleep(randomInRange(400, 1000));

        PRINTF("send smth to radio...\n");
        radioSend("hello world", sizeof("hello world"));

        greenLedToggle();
    }
}
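Note: randomInit() and randomInRange() come from the platform's random module and are not shown here. A plausible sketch of randomInRange, assuming the module also exposes a raw generator (randomNumber() is a hypothetical name):

// pseudorandom value in [low, high)
uint16_t randomInRange(uint16_t low, uint16_t high)
{
    return low + randomNumber() % (high - low);
}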
Example #6
int main (int argc, char* argv[]){

  int nsteps = 2;
  int g1[WDEFAULT][HDEFAULT], g2[WDEFAULT][HDEFAULT];

  for(int i = 0; i < WDEFAULT; i++)
    for(int j = 0; j < HDEFAULT; j++){
      g1[i][j] = 0;
      g2[i][j] = 0;
    }
  
  const gsl_rng_type *T;
  gsl_rng *rand;

  gsl_rng_env_setup();
  T = gsl_rng_default;
  rand = gsl_rng_alloc(T);

  gsl_rng_set(rand, get_seed_noblock());
  randomInit(g1, 0.55, rand);

  
  printGrid(g1);

  updateGrid(g1,g2);

  printf("# updated \n");
  printGrid(g2);

  for(int i = 0; i < nsteps; i++){
    // blit g2 back into g1
    memcpy(g1, g2, sizeof(int)*WDEFAULT*HDEFAULT);
    // now update g1
    updateGrid(g1,g2);
    printGrid(g2);
  }
  

  gsl_rng_free(rand);
  return EXIT_SUCCESS;
}
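Note: here randomInit takes the grid, a fill probability, and the GSL generator. A minimal sketch consistent with that call (an assumption; the real helper is not shown):

/* set each cell to 1 with probability p, using the GSL RNG rather than libc rand() */
static void randomInit(int g[WDEFAULT][HDEFAULT], double p, gsl_rng *r)
{
  for (int i = 0; i < WDEFAULT; i++)
    for (int j = 0; j < HDEFAULT; j++)
      g[i][j] = (gsl_rng_uniform(r) < p) ? 1 : 0;
}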
Example #7
/**
 * \brief Initializes all the subsystems for this Droplet. This function MUST be called
 * by the user before using any other functions in the API.
 */ 
static void initAllSystems(void){
	cli();
	Config32MHzClock();
	
	calculateIdNumber();
	
	schedulerInit();			INIT_DEBUG_PRINT("SCHEDULER INIT\r\n");
	pcCommInit();				INIT_DEBUG_PRINT("PC COM INIT\r\n");
	rgbLEDinit();				INIT_DEBUG_PRINT("LED INIT\r\n");
	powerInit();				INIT_DEBUG_PRINT("POWER INIT\r\n");
	i2cInit();					INIT_DEBUG_PRINT("I2C INIT\r\n");
	
	enableInterrupts();	
	
	rangeAlgsInit();			INIT_DEBUG_PRINT("RANGE ALGORITHMS INIT\r\n");
	rgbSensorInit();			INIT_DEBUG_PRINT("RGB SENSE INIT\r\n");
	irLedInit();				INIT_DEBUG_PRINT("IR LED INIT\r\n");
	irSensorInit();			INIT_DEBUG_PRINT("IR SENSE INIT\r\n");
	
	#ifdef AUDIO_DROPLET
		speakerInit();			INIT_DEBUG_PRINT("SPEAKER INIT\r\n");
		micInit();				INIT_DEBUG_PRINT("MIC INIT\r\n"); //Must occur after ir_sensor_init.
	#endif
		
	motorInit();				INIT_DEBUG_PRINT("MOTOR INIT\r\n");
	randomInit();				INIT_DEBUG_PRINT("RAND INIT\r\n"); //This uses adc readings for a random seed, and so requires that the adcs have been initialized.
	localizationInit();		INIT_DEBUG_PRINT("LOCALIZATION INIT\r\n"); 
	
	#ifdef SYNCHRONIZED
		fireflySyncInit();
	#endif

	setAllirPowers(256);
	startupLightSequence();
	
	irCommInit();				INIT_DEBUG_PRINT("IR COM INIT\r\n");
	#ifdef AUDIO_DROPLET
		enableMicInterrupt();
	#endif
}
Example #8
int main(void) {
char buf[80];
system("clear");
printf("\n\n\n ### GPU ENABLED CODE\n");
printf("\n ### Host creating two input vectors and one output vector\n");
    unsigned i,  LIST_SIZE;
    FILE *fp0;

    fp0 = fopen("data.txt", "r");
    if (!fp0) {
        fprintf(stderr, "Failed to load data.txt.\n");
        exit(1);
    }
    fgets(buf, sizeof(buf),  fp0);
    sscanf(buf,"%d", &LIST_SIZE);
    printf(" ### Vector_size:%d elements, Elem_size:%lu byte\n", LIST_SIZE, sizeof(int));
    fclose(fp0);

    int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
    if (A == NULL) {
    fprintf(stderr, "failed to allocate memory.\n");
    return -1;
    }

    int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
    if (B == NULL) {
    fprintf(stderr, "failed to allocate memory.\n");
    return -1;
    }

    int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
    if (C == NULL) {
    fprintf(stderr, "failed to allocate memory.\n");
    return -1;
    }

printf("\n Host initialising input vector values\n");
    for(i = 0; i < LIST_SIZE; i++) {
        A[i] = /*randomInit()*/ i; 
        B[i] = /*randomInit()*/ LIST_SIZE-i;
        loadBar(i, LIST_SIZE, 100, 2);
    }

printf("\n Setting up GPU\n");    
    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("vec.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );

    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;   
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    printf("Check platform id %s\n", getErrorString(ret));
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1, 
            &device_id, &ret_num_devices);
printf("device_id:%u, device_count:%u\n", (int) device_id, ret_num_devices);
     printf("Check device id %s\n", getErrorString(ret));

    // Create an OpenCL context
    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
     printf("Check context %s\n", getErrorString(ret));

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
     printf("Check command queue %s\n", getErrorString(ret));

    // Create memory buffers on the device for each vector 
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, 
            LIST_SIZE * sizeof(int), NULL, &ret);
     printf("Check a_mem_obj %s\n", getErrorString(ret));
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            LIST_SIZE * sizeof(int), NULL, &ret);
     printf("Check b_mem_obj %s\n", getErrorString(ret));
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
            LIST_SIZE * sizeof(int), NULL, &ret);
     printf("Check c_mem_obj %s\n", getErrorString(ret));

// <-
    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1, 
            (const char **)&source_str, (const size_t *)&source_size, &ret);
     printf("Check program %s\n", getErrorString(ret));

    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
//    ret = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
     printf("Check build program %s, device_id:%d \n", getErrorString(ret),(int)device_id);

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
     printf("Check kernel create %s\n", getErrorString(ret));

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
     printf("Check kernel arg 0 %s\n", getErrorString(ret));
    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
     printf("Check kernel arg 1 %s\n", getErrorString(ret));
    ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
    ret |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &LIST_SIZE);
     printf("Check kernel args 0-3 %s\n", getErrorString(ret));

    // COPY EXECUTE BLOCK //////////////////////////////////////////////
    // ->Copy the lists A and B to their respective memory buffers
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
            LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
     printf("Check Write A %s\n", getErrorString(ret));
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
     printf("Check Write B %s\n", getErrorString(ret));

    // Execute the OpenCL kernel on the list
    printf("\n GPU adding the vectors \n");
    size_t global_item_size = LIST_SIZE; // Process the entire lists
    size_t local_item_size = 1; // Process one item at a time
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
            &global_item_size, &local_item_size, 0, NULL, NULL);
    printf("Check kernel EnqueueNDRange %s\n", getErrorString(ret));

    // Read the memory buffer c_mem_obj on the device to the local C
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
     printf("Check Read Buffer C %s\n", getErrorString(ret));
    ////////////////////////////////////////////////////////////////////

    // Display the first 10 result to the screen
    for(i = 0; i < 10; i++)
        printf("%d + %d = %d\n", A[i], B[i], C[i]);
    printf("\n");
    // Display the last 10 result to the screen
    for(i = LIST_SIZE-10; i < LIST_SIZE; i++)
        printf("%d + %d = %d\n", A[i], B[i], C[i]);

//// SECOND TIME ////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////
printf("\n Host initialising input vectors randomly \n");
    for(i = 0; i < LIST_SIZE; i++) {
        A[i] = randomInit(); 
        B[i] = randomInit();
        loadBar(i, LIST_SIZE, 100, 2);
    }

    // COPY EXECUTE BLOCK RE-RUN ////////////////////////////////////////
    // ->Copy the lists A and B to their respective memory buffers
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
            LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
     printf("Check Write A %s\n", getErrorString(ret));
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
     printf("Check Write B %s\n", getErrorString(ret));

    // Execute the OpenCL kernel on the list
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
            &global_item_size, &local_item_size, 0, NULL, NULL);
    printf("Check kernel EnqueueNDRange %s\n", getErrorString(ret));

    // Read the memory buffer c_mem_obj on the device to the local C
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
     printf("Check Read Buffer C %s\n", getErrorString(ret));
    ////////////////////////////////////////////////////////////////////

    // Display the first 10 result to the screen
    for(i = 0; i < 10; i++)
        printf("%d + %d = %d\n", A[i], B[i], C[i]);
    printf("\n");
    // Display the last 10 result to the screen
    for(i = LIST_SIZE-10; i < LIST_SIZE; i++)
        printf("%d + %d = %d\n", A[i], B[i], C[i]);


////// SECOND TIME ENDS/////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
printf("\n Cleaning up GPU queues\n");
    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(a_mem_obj);
    ret = clReleaseMemObject(b_mem_obj);
    ret = clReleaseMemObject(c_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

printf("\n Host freeing up the vectors\n");
    free(A);
    free(B);
    free(C);
    return 0;
}
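Note: the host code above loads vec.cl and passes four arguments (A, B, C, n) to a kernel named vector_add. A kernel consistent with those arguments would look roughly like this (a sketch; the actual vec.cl is not included):

__kernel void vector_add(__global const int *A,
                         __global const int *B,
                         __global int *C,
                         const unsigned int n)
{
    unsigned int i = get_global_id(0);
    if (i < n)                 // guard in case the global size exceeds n
        C[i] = A[i] + B[i];
}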
Example #9
int main(void) {
	int N = 100;
	int M = 20;
	// The number of classes determines the number of problems
	Problem* probs = (Problem*) malloc(M * sizeof(Problem));
	// Initialize the parameters for the SVM one class
	Parameter param;
//	Node *x_space;

	// Type of SVM
	param.svm_type = 0;
	param.kernel_type = RBF;
	param.degree = 3;
	param.gamma = 0;
	param.coef0 = 0;
//	param.nu = 0.05;
//	param.cache_size = 1000;
//	param.C = 1;
//	param.eps = 1e-3;
//	param.p = 0.1;
//	param.shrinking = 1;
//	param.probability = 0;
//	param.nr_weight = 0;
//	param.weight_label = NULL;
//	param.weight = NULL;

	// Generate random Matrix and vector
	unsigned long vector_start_time = start_timer();
	unsigned long int mem_size_X = sizeof(float) * M * N;
	unsigned long int mem_size_K = sizeof(float) * M * M;
	unsigned long int mem_size_Y = sizeof(float) * M;
	float *Y = (float *) malloc(mem_size_Y);
	float *X = (float *) malloc(mem_size_X);
	float *K = (float *) malloc(mem_size_K);
	// initialize host memory
	srand(time(NULL ));
	randomInit(X, M * N);
	srand(time(NULL ));
	randomInit(Y, M);

	Node* x_space = malloc(N * sizeof(Node));  /* allocate N Node structs, not N pointers */
	for (unsigned int i = 0; i < M; i++) {
		probs[i].idx = i;
		probs[i].y = Y[i];

		for (unsigned int j = 0; j < N; j++) {
			x_space[j].idx = j;
			x_space[j].value = X[i * N + j];
		}
		probs[i].x = x_space;

	}
	stop_timer(vector_start_time, "Vector generation");

	printf("\nUsing LINEAR\n");
	unsigned long ker_start_time = start_timer();
	//formK(K, probs, M);

	stop_timer(ker_start_time, "Time LINEAR\t");

	printf("\nUsing POLY\n");
	ker_start_time = start_timer();
	stop_timer(ker_start_time, "Time POLY\t");

	printf("\nUsing RBF\n");
	ker_start_time = start_timer();
	stop_timer(ker_start_time, "Time RBF\t");

	printf("\nUsing SIGMOID\n");
	ker_start_time = start_timer();
	stop_timer(ker_start_time, "Time SIGMOID\t");

	return EXIT_SUCCESS;
}
Example #10
int
main(int argc, char** argv)
{

	// set seed for rand()
	srand(2006);

	// 1. allocate host memory for matrices A and B
	unsigned int size_A = WA * HA;
	unsigned int mem_size_A = sizeof(float)* size_A;
	float* h_A = (float*)malloc(mem_size_A);

	unsigned int size_B = WB * HB;
	unsigned int mem_size_B = sizeof(float)* size_B;
	float* h_B = (float*)malloc(mem_size_B);

	// 2. initialize host memory
	randomInit(h_A, size_A);
	randomInit(h_B, size_B);
	int i,j,k;
	// 3. print out A and B
	//printf("\n\nMatrix A\n");
	/*for (i = 0; i < size_A; i++)
	{
		printf("%f ", h_A[i]);
		if (((i + 1) % WA) == 0)
			printf("\n");
	}

	printf("\n\nMatrix B\n");
	for (i = 0; i < size_B; i++)
	{
		printf("%f ", h_B[i]);
		if (((i + 1) % WB) == 0)
			printf("\n");
	}*/

	// 4. allocate host memory for the result C
	unsigned int size_C = WC * HC;
	unsigned int mem_size_C = sizeof(float)* size_C;
	float* h_C = (float*)malloc(mem_size_C);

	// 5. Initialize OpenCL
	// OpenCL specific variables
	cl_context clGPUContext;
	cl_command_queue clCommandQue;
	cl_program clProgram;
	cl_kernel clKernel;
	
	cl_platform_id platform_id = NULL;   //
	cl_uint ret_num_devices;  //
	cl_uint ret_num_platforms;  //

	size_t dataBytes;
	size_t kernelLength;
	cl_int errcode;
	cl_device_id device_id = NULL;  //
	 
	cl_int ret; //

	// OpenCL device memory for matrices
	cl_mem d_A;
	cl_mem d_B;
	cl_mem d_C;

	/*****************************************/
	/* Initialize OpenCL */
	/*****************************************/
	//clGPUContext = clCreateContextFromType(0,
	//	CL_DEVICE_TYPE_GPU,
	//	NULL, NULL, &errcode);
	//shrCheckError(errcode, CL_SUCCESS);

	// get the list of GPU devices associated 
	// with context
	
	ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
	ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);

	/* Create OpenCL context */
	clGPUContext = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

	/* Create Command Queue */
	clCommandQue = clCreateCommandQueue(clGPUContext, device_id, 0, &ret);




/*	errcode = clGetContextInfo(clGPUContext,
		CL_CONTEXT_DEVICES, 0, NULL,
		&dataBytes);
	cl_device_id *clDevices = (cl_device_id *)
		malloc(dataBytes);

	errcode |= clGetContextInfo(clGPUContext,
		CL_CONTEXT_DEVICES, dataBytes,
		clDevices, NULL);

*/

//	shrCheckError(errcode, CL_SUCCESS);


	//Create a command-queue
//	clCommandQue = clCreateCommandQueue(clGPUContext,
//		clDevices[0], 0, &errcode);
	

	// shrCheckError(errcode, CL_SUCCESS);

	// Setup device memory

	d_C = clCreateBuffer(clGPUContext,
		CL_MEM_READ_WRITE,
		mem_size_A, NULL, &ret);
	d_A = clCreateBuffer(clGPUContext,
		CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
		mem_size_A, h_A, &ret);
	d_B = clCreateBuffer(clGPUContext,
		CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
		mem_size_B, h_B, &ret);

	//printf("6. Load and build OpenCL kernel\n");
	// 6. Load and build OpenCL kernel

	FILE *fp;
	char fileName[] = "./kernel.cl";
	char *source_str;
	size_t source_size;

	/* Load the source code containing the kernel*/
	fp = fopen(fileName, "r");
	if (!fp) {
		printf("Failed to load kernel.\n");
		exit(1);
	}
	source_str = (char*)malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);

	clProgram = clCreateProgramWithSource(clGPUContext, 1, (const char **)&source_str,
		(const size_t *)&source_size, &ret);



	

	ret = clBuildProgram(clProgram, 0,
		NULL, NULL, NULL, NULL);
//	shrCheckError(errcode, CL_SUCCESS);

	clKernel = clCreateKernel(clProgram,
		"matrixMul", &ret);
//	shrCheckError(errcode, CL_SUCCESS);


	// 7. Launch OpenCL kernel
	size_t localWorkSize[2], globalWorkSize[2];

	int wA = WA;
	int wC = WC;
	ret = clSetKernelArg(clKernel, 0,
		sizeof(cl_mem), (void *)&d_C);
	ret |= clSetKernelArg(clKernel, 1,
		sizeof(cl_mem), (void *)&d_A);
	ret |= clSetKernelArg(clKernel, 2,
		sizeof(cl_mem), (void *)&d_B);
	ret |= clSetKernelArg(clKernel, 3,
		sizeof(int), (void *)&wA);
	ret |= clSetKernelArg(clKernel, 4,
		sizeof(int), (void *)&wC);
//	shrCheckError(errcode, CL_SUCCESS);

	localWorkSize[0] = 3;
	localWorkSize[1] = 3;
	globalWorkSize[0] = 3;
	globalWorkSize[1] = 3;

	ret = clEnqueueNDRangeKernel(clCommandQue,
		clKernel, 2, NULL, globalWorkSize,
		localWorkSize, 0, NULL, NULL);
//	shrCheckError(errcode, CL_SUCCESS);

	// 8. Retrieve result from device
	ret = clEnqueueReadBuffer(clCommandQue,
		d_C, CL_TRUE, 0, mem_size_C,
		h_C, 0, NULL, NULL);
	//shrCheckError(errcode, CL_SUCCESS);

	// 9. print out the results
	/*printf("\n\nMatrix C (Results)\n");
	for (i = 0; i < size_C; i++)
	{
		printf("%f ", h_C[i]);
		if (((i + 1) % WC) == 0)
			printf("\n");
	}
	printf("\n");*/

	// 10. clean up memory
	free(h_A);
	free(h_B);
	free(h_C);

	clReleaseMemObject(d_A);
	clReleaseMemObject(d_C);
	clReleaseMemObject(d_B);

	//free(clDevices);
	//free(clMatrixMul);
	clReleaseContext(clGPUContext);
	clReleaseKernel(clKernel);
	clReleaseProgram(clProgram);
	clReleaseCommandQueue(clCommandQue);

}
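Note: the matrix examples call randomInit(h_A, size_A) on plain float buffers. In the SDK-style samples this pattern comes from, the helper is typically implemented as follows (shown as a sketch; requires <stdlib.h>):

void randomInit(float *data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;   // uniform value in [0, 1]
}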
Example #11
int main(int argc, char *argv[]) {
  int i,j,k;
  machineInformation currentMachine;
  counterSessionInfo session;

  initializeCUDA();

  // Set machine information from CounterHomeBrew.h
  currentMachine.cpu_model = CPU_MODEL;
  currentMachine.num_sockets = NUM_SOCKETS;
  currentMachine.num_phys_cores_per_socket = NUM_PHYS_CORES_PER_SOCKET;
  currentMachine.num_cores_per_socket = NUM_CORES_PER_SOCKET;
  currentMachine.num_cores = NUM_CORES;
  currentMachine.num_cbos = NUM_PHYS_CORES_PER_SOCKET; // should multiply by NUM_SOCKETS???
  currentMachine.core_gen_counter_num_max = CORE_GEN_COUNTER_MAX;
  currentMachine.cbo_counter_num_max = CBO_COUNTER_NUM_MAX;

  // Set session events, umasks and counters used
  //  int32 core_event_numbers[] = {FP_COMP_OPS_EXE_EVTNR,SIMD_FP_256_EVTNR,0x51,0xF1,0x80};
  // int32 core_umasks[] = {FP_COMP_OPS_EXE_SCALAR_DOUBLE_UMASK,SIMD_FP_256_PACKED_DOUBLE_UMASK,0x01, 0x07,0x01};

  session.core_gen_counter_num_used = 5;
  int32 core_event_numbers[] = {0x10,0x10,0x11,0x51,0xF1};
  int32 core_umasks[] = {0x20,0x40,0x01,0x01, 0x07};

  session.cbo_counter_num_used = 1;
  int32 cbo_event_numbers[] = {0x37};
  int32 cbo_umasks[] = {0xf};
  session.cbo_filter = 0x1f;

  for (i = 0; i < session.core_gen_counter_num_used; i++) {
    session.core_event_numbers[i] = core_event_numbers[i];
    session.core_umasks[i] = core_umasks[i];
  }
  for (i = 0; i < session.cbo_counter_num_used; i++) {
    session.cbo_event_numbers[i] = cbo_event_numbers[i];
    session.cbo_umasks[i] = cbo_umasks[i];
  }

  int fd[NUM_CORES];

  // Arrays to hold counter data...
  counterData before;
  counterData after;

  // some data for doing a naive matmul to test flop counting...
  // initloop(N);
  

  // M,N,K are multiples of the block size....
  int gpuOuter = atoi(argv[1]);
  int gpuInner = atoi(argv[2]);
  int cpuInner = atoi(argv[3]);
  double minRuntime = atoi(argv[4]);
  int Md = atoi(argv[5])*block_size;
  int Nd = atoi(argv[6])*block_size;
  int Kd = atoi(argv[7])*block_size;
  int Mh = atoi(argv[8]);
  int Nh = atoi(argv[9]);
  int Kh = atoi(argv[10]);

  char *ts1,*ts2,*ts3,*ts4;
  char *ts5,*ts6,*ts7,*ts8;
  double fineTimeStamps[8];
  double gTime = 0.0;
  double cTime = 0.0;
  double seconds = 0.0;
  int num_iters;

  uint64 *coreSums;
  coreSums = (uint64*)calloc(currentMachine.num_sockets*session.core_gen_counter_num_used,sizeof(uint64));

  uint64 *sums;
  sums = (uint64*)calloc(currentMachine.num_sockets*session.cbo_counter_num_used,sizeof(uint64));

  float *Atmp = NULL;
  float *Btmp = NULL;
  float *Ctmp = NULL;
  Atmp = (float*) malloc( Mh * Nh * sizeof(float) );
  Btmp = (float*) malloc( Nh * sizeof(float) );
  Ctmp = (float*) malloc( Mh * sizeof(float) );
  randomInit(Atmp,Mh*Nh);
  randomInit(Btmp,Nh);

  for (num_iters = cpuInner; seconds < minRuntime; num_iters *=2) {
    seconds = read_timer();  /* start timestamp; elapsed time is computed after the loop */
    for (i =0; i < num_iters; i++)
      BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, Atmp,Mh, Btmp,1, 1, Ctmp,1 );
    seconds = read_timer()-seconds;
  }
  //  num_iters /= 2;

  free(Atmp);
  free(Btmp);
  free(Ctmp);

  int readyThreads = 0;
  #pragma omp parallel
  {
    int threadNum = omp_get_thread_num();
    int numThreads = omp_get_num_threads();
    assert(numThreads==2);
    if (threadNum == 0) {
      cudaError_t error;
      int memSizeA = sizeof(float)*Md*Nd;
      int memSizeB = sizeof(float)*Nd;
      int memSizeC = sizeof(float)*Md;
      
      float *Ahost,*Bhost,*Chost;
      // use pinned memory on the host for BW and asynch memory transfers..
      int flags = cudaHostAllocDefault;
      ts5 = getTimeStamp();
      fineTimeStamps[0] = read_timer();
      error = cudaHostAlloc((void**)&Ahost,memSizeA,flags);if (error != cudaSuccess){printf("cudaHostMalloc Ahost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaHostAlloc((void**)&Bhost,memSizeB,flags);if (error != cudaSuccess){printf("cudaHostMalloc Bhost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaHostAlloc((void**)&Chost,memSizeC,flags);if (error != cudaSuccess){printf("cudaHostMalloc Chost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      // set local arrays
      randomInit(Ahost,Md*Nd);
      randomInit(Bhost,Nd);

      // allocate device memory
      float *Adevice,*Bdevice,*Cdevice;
      error = cudaMalloc((void**)&Adevice,memSizeA); if (error != cudaSuccess){printf("cudaMalloc Adevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaMalloc((void**)&Bdevice,memSizeB); if (error != cudaSuccess){printf("cudaMalloc Bdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaMalloc((void**)&Cdevice,memSizeC); if (error != cudaSuccess){printf("cudaMalloc Cdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      fineTimeStamps[1] = read_timer();
      ts6 = getTimeStamp();
#pragma omp critical
      {
	readyThreads += 1;
      }
      //     fprintf(stderr,"Incremented ready GPU\n");
      while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 0: %d\n",readyThreads);};

      //#pragma omp single 
      //{
      cudaStream_t stream1;
      cudaStreamCreate ( &stream1) ;
      ts3 = getTimeStamp();
      fineTimeStamps[2] = read_timer();
      gTime = read_timer();
      for (int i = 0; i < gpuOuter; i++) 
	GPUsgemv(gpuInner,Md,Nd,Kd,Adevice,Bdevice,Cdevice,Ahost,Bhost,Chost,&stream1);
      cudaStreamSynchronize(stream1);
      gTime = read_timer() - gTime;
      fineTimeStamps[3] = read_timer();
      ts4 = getTimeStamp();
      cudaFreeHost(Ahost);
      cudaFreeHost(Bhost);
      cudaFreeHost(Chost);

    } else {
      //  uint64 min_iters = strtoull(argv[4],NULL,0);
      float *A = NULL;
      float *B = NULL;
      float *C = NULL;
      ts7 = getTimeStamp();
      fineTimeStamps[4] = read_timer();
      A = (float*) malloc( Mh * Nh * sizeof(float) );
      B = (float*) malloc( Nh * sizeof(float) );
      C = (float*) malloc( Mh * sizeof(float) );
      randomInit(A,Mh*Nh);
      randomInit(B,Nh);
      fineTimeStamps[5] = read_timer();
      ts8 = getTimeStamp();
#pragma omp critical
      {
	readyThreads += 1;
      }
      //   fprintf(stderr,"Incremented ready CPU\n");
      while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 1: %d\n",readyThreads);};
                  
      // open the msr files for each core on the machine
      for (i = 0; i < currentMachine.num_cores; i++)
	open_msr_file(i,&fd[i]);
      
      
      int socketsProgrammed = 0;
      for (i = 0; i < currentMachine.num_cores; i++) {
	int currentCoreFD = fd[i];
	
	stopCounters(i, currentCoreFD, &currentMachine, &session);
	programCoreFixedCounters(currentCoreFD);    
	programGeneralPurposeRegisters(currentCoreFD, &currentMachine, &session);
	
	/* Program the Uncore as desired...*/
	// Only program the first physical core on each socket. 
	// NOTE: Some assumptions about topology here...check /proc/cpuinfo to confirm.
	if (i % currentMachine.num_phys_cores_per_socket == 0 && socketsProgrammed < currentMachine.num_sockets) {
	  programUncoreCounters( currentCoreFD, &currentMachine, &session);
	  socketsProgrammed++;
	}
      }
      
      seconds = 0.0;
      
      // start the programmed counters...
      for (i = 0; i < currentMachine.num_cores; i++)
	startCounters( i, fd[i], &currentMachine, &session);
      
      /* READ COUNTERS BEFORE STUFF */
      readCounters(fd,&currentMachine,&session, &before);
      ts1 = getTimeStamp();
      fineTimeStamps[6] = read_timer();
      seconds = read_timer();
      
      /* DO STUFF */    
      for (i =0; i < num_iters; i++)
	BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, A,Mh, B,1, 1, C,1 );
      
      /* END DOING STUFF */
      
      seconds = read_timer()-seconds;
      fineTimeStamps[7] = read_timer();
      ts2 = getTimeStamp();
      
      /* READ COUNTERS AFTER STUFF */    
      for (i = 0; i < currentMachine.num_cores; i++)
	stopCounters(i,fd[i],&currentMachine, &session);
      
      //  printf("num_iters = %"PRIu64", runtime is %g\n",num_iters,seconds);
      
      readCounters(fd,&currentMachine,&session,&after);
      diffCounterData(&currentMachine, &session, &after, &before, &after);
      
      for (i = 0; i < currentMachine.num_sockets; i++) {
	//    printf("Socket %d\n",i);
	for (j = 0; j < currentMachine.num_cores_per_socket; j++) {
	  //   printf("%d,",j);
	  for (k = 0; k < session.core_gen_counter_num_used; k++){
	    //	printf("%"PRIu64",",after.generalCore[i*currentMachine.num_cores_per_socket + j][k]);
	    // bug in the indexing of the core sums???
	    //        coreSums[i*session.core_gen_counter_num_used + k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k];
	    coreSums[k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k];
	  }
	  //	printf("\n");
	}
      }
      
      for (i = 0; i < currentMachine.num_sockets; i++) {
	//	printf("%d,",i);
	for (j = 0; j < currentMachine.num_cbos; j++) {
	  //	  printf("%d,",j);
	  for (k = 0; k < session.cbo_counter_num_used; k++) {
	    //	    printf("%llu,",after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]);
	    // bug in the indexing of the core sums???
	    //        sums[i*session.cbo_counter_num_used + k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k];
	    sums[k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k];
	  }
	}
      }
      //      printf("\n");
            
      // Stop counters, reset PMU, close msr files
      cleanup(fd,&currentMachine,&session);
      
      
      free(A);
      free(B);
      free(C);
    }
  } // end parallel region

  printf("%s,%s,%s,%s,%s,%s,%s,%s,%d,%d,%d,%d,%d,%d,%d,%d,%d,%f,%f,%f,",ts7,ts8,ts1,ts2,ts5,ts6,ts3,ts4,Mh,Nh,Kh,Md/block_size,Nd/block_size,Kd/block_size,num_iters,gpuOuter,gpuInner,seconds,gTime,(float)(gpuOuter*(Md*Kd+Nd+Md))/16.0);
  for (int i = 0; i < 8; i++)
    printf("%f,",fineTimeStamps[i]);

  for (j = 0; j < session.core_gen_counter_num_used; j++)
    printf("%llu,",coreSums[j]);
  for (j = 0; j < session.cbo_counter_num_used; j++)
    if (j == session.cbo_counter_num_used-1)
      printf("%llu",sums[j]);
    else
      printf("%llu,",sums[j]);
  printf("\n");
  
  free(sums);
  free(coreSums);
  
  return 0;
}
Example #12
Random::Random()
{
   randomInit();
}
Example #13
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
    cudaDeviceProp deviceProp;

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    // use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    // set seed for rand()
    srand(2006);

    // allocate host memory for matrices A and B
    unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);

    // set seed for rand()
    srand(2006);

    // initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // allocate device memory
    float *d_A, *d_B, *d_C;
    unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;

    // allocate host memory for the result
    float *h_C      = (float *) malloc(mem_size_C);
    float *h_CUBLAS = (float *) malloc(mem_size_C);

    checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A));
    checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B));
    checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C));

    // setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);

    // create and start timer
    printf("Computing result using CUBLAS...");

    // execute the kernel
    int nIter = 30;

    // CUBLAS version 2.0
    {
        const float alpha = 1.0f;
        const float beta  = 0.0f;
        cublasHandle_t handle;
        cudaEvent_t start, stop;

        checkCudaErrors(cublasCreate(&handle));

        //Perform warmup operation with cublas
        checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA));

        // Allocate CUDA events that we'll use for timing
        checkCudaErrors(cudaEventCreate(&start));
        checkCudaErrors(cudaEventCreate(&stop));

        // Record the start event
        checkCudaErrors(cudaEventRecord(start, NULL));

        for (int j = 0; j < nIter; j++)
        {
            //note cublas is column primary!
            //need to transpose the order
            checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA));

        }

        printf("done.\n");

        // Record the stop event
        checkCudaErrors(cudaEventRecord(stop, NULL));

        // Wait for the stop event to complete
        checkCudaErrors(cudaEventSynchronize(stop));

        float msecTotal = 0.0f;
        checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

        // Compute and print the performance
        float msecPerMatrixMul = msecTotal / nIter;
        double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB;
        double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
        printf(
            "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
            gigaFlops,
            msecPerMatrixMul,
            flopsPerMatrixMul);

        // copy result from device to host
        checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost));

        // Destroy the handle
        checkCudaErrors(cublasDestroy(handle));
    }

    // compute reference solution
    printf("Computing result using host CPU...");
    float *reference = (float *)malloc(mem_size_C);
    matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
    printf("done.\n");

    // check result (CUBLAS)
    bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);

    if (resCUBLAS != true)
    {
        printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
    }

    printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");

    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(reference);
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (resCUBLAS == true)
    {
        return EXIT_SUCCESS;    // return value = 0
    }
    else
    {
        return EXIT_FAILURE;     // return value = 1
    }
}
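Note: the reference check above relies on matrixMulCPU, which is not shown. A straightforward host-side implementation consistent with the call matrixMulCPU(reference, h_A, h_B, uiHA, uiWA, uiWB) would be (a sketch):

// naive reference multiply: C (hA x wB) = A (hA x wA) * B (wA x wB), row-major
void matrixMulCPU(float *C, const float *A, const float *B,
                  unsigned int hA, unsigned int wA, unsigned int wB)
{
    for (unsigned int i = 0; i < hA; ++i)
        for (unsigned int j = 0; j < wB; ++j)
        {
            double sum = 0.0;                     // accumulate in double for accuracy
            for (unsigned int k = 0; k < wA; ++k)
                sum += (double)A[i * wA + k] * (double)B[k * wB + j];
            C[i * wB + j] = (float)sum;
        }
}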
Example #14
File: kqmat.c Project: b8875/gemtc
int main(int argc, char* argv[])
{
int i, num_ker=0, num_queue;
num_ker=atoi(argv[2]);
num_queue=atoi(argv[3]);

	//variables
/*#define WA 1024
#define HA 1024
#define WB 1024
#define HB WA
#define WC WB
#define HC HA
*/
struct timeval tim,ftim;
  double t1,t2,tim1,tim2;

//    gettimeofday(&tim, NULL);
  //  t1=tim.tv_sec+(tim.tv_usec/1000000.0);
    gettimeofday(&ftim, NULL);
    tim1=ftim.tv_sec+(ftim.tv_usec/1000000.0);

int m,WA,HA,WB,HB,WC,HC;
m = atoi(argv[5]);
WA=(256*m);
HA = WA;
WB = WA;
HB = WB;
WC = WA;
HC = WA;
   // set seed for rand()
   srand(2006);
 
   // 1. allocate host memory for matrices A and B
	//automate the size of the matrix
   unsigned int size_A = WA * HA;
   unsigned int mem_size_A = sizeof(int) * size_A;
   int* h_A = (int*) malloc(mem_size_A);
 
   unsigned int size_B = WB * HB;
   unsigned int mem_size_B = sizeof(int) * size_B;
   int* h_B = (int*) malloc(mem_size_B);
 
   // 2. initialize host memory
   randomInit(h_A, size_A);
   randomInit(h_B, size_B);
 
/*   // 3. print out A and B
   printf("\n\nMatrix A\n");
   for(i = 0; i < size_A; i++)
   {
      printf("%f ", h_A[i]);
      if(((i + 1) % WA) == 0)
      printf("\n");
   }
 
   printf("\n\nMatrix B\n");
   for(i = 0; i < size_B; i++)
   {
      printf("%f ", h_B[i]);
      if(((i + 1) % WB) == 0)
      printf("\n");
   }
 */
   
// 4. allocate host memory for the result C
   unsigned int size_C = WC * HC;
   unsigned int  mem_size_C = sizeof(int) * size_C;
   int* h_C = (int*) malloc(mem_size_C);
 
   // 5. Initialize OpenCL
   // OpenCL specific variables
   cl_context clGPUContext;
//   cl_command_queue* clCommandQue;
   //cl_program clProgram;
   //cl_kernel clKernel;
cl_platform_id* cpPlatform;        // OpenCL platform
cl_uint platformCount; // keeps the device count
  
   size_t dataBytes;
   size_t kernelLength;
   cl_int errcode;
 
   // OpenCL device memory for matrices
   cl_mem d_A;
   cl_mem d_B;
   cl_mem d_C;
 
   /*****************************************/
   /* Initialize OpenCL */
   /*****************************************/
//cl_platform_id* cpPlatform;        // OpenCL platform
    //cl_device_id device_id;// = (cl_device_id)malloc(sizeof(cl_device_id)); 
    // Bind to platform
// errcode = clGetPlatformIDs(1, &cpPlatform, NULL);
clGetPlatformIDs(0, NULL, &platformCount);
    cpPlatform = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
clGetPlatformIDs(platformCount, cpPlatform, NULL); // the count returned by the previous call is used here

cl_device_id device_id;
int choice =atoi(argv[1]);
if(choice ==1)
{
 // Length of vectors
    // n = 64;

    // Connect to a compute device 
// CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_ACCELERATOR, or CL_DEVICE_TYPE_ALL can be used here,
// depending on which device we are targeting; this call can be repeated as needed
    errcode = clGetDeviceIDs(cpPlatform[0],CL_DEVICE_TYPE_CPU , 1, &device_id, NULL);
    if (errcode != CL_SUCCESS)

        printf("Error: Failed to create a device group!\n");
}
else
{
 //   errcode = clGetPlatformIDs(1, &cpPlatform, NULL);
    // Get ID for the device
    errcode = clGetDeviceIDs(cpPlatform[1], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    if (errcode != CL_SUCCESS)

    {

        printf("Error: Failed to create a device group!\n");
}
}
//printf("here");
    // Create a context 
   clGPUContext = clCreateContext(0, 1, &device_id, NULL, NULL, &errcode);
    // Create a command queue
//printf("here");
   /*clGPUContext = clCreateContextFromType(NULL, 
                   CL_DEVICE_TYPE_GPU, 
                   NULL, NULL, &errcode);
   //shrCheckError(errcode, CL_SUCCESS);
 
   // get the list of GPU devices associated 
   // with context
   errcode = clGetContextInfo(clGPUContext, 
              CL_CONTEXT_DEVICES, 0, NULL, 
              &dataBytes);
   cl_device_id *clDevices = (cl_device_id *)
              malloc(dataBytes);
   errcode = clGetContextInfo(clGPUContext, 
              CL_CONTEXT_DEVICES, dataBytes, 
              clDevices, NULL);
   //shrCheckError(errcode, CL_SUCCESS);
 */
//malloc for command queue, kernel and program
cl_kernel *clKernel=(cl_kernel *)malloc(num_ker * sizeof(cl_kernel));
cl_program *clProgram=(cl_program *)malloc(num_ker * sizeof(cl_program));

cl_command_queue * clCommandQue = (cl_command_queue *)malloc(num_ker * sizeof(cl_command_queue));
   //Create a command-queue
for(i=0;i<num_queue;i++)
{
   clCommandQue[i] = clCreateCommandQueue(clGPUContext, 
                  device_id, 0, &errcode);
 }  //shrCheckError(errcode, CL_SUCCESS);
  
  /* // Setup device memory
   d_C = clCreateBuffer(clGPUContext, 
          CL_MEM_READ_WRITE, 
          mem_size_A, NULL, &errcode);
   d_A = clCreateBuffer(clGPUContext, 
printf("\nhere"); 
          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
          mem_size_A, h_A, &errcode);
   d_B = clCreateBuffer(clGPUContext, 
          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
          mem_size_B, h_B, &errcode);
 */
 	char *file="matxm.cl";
	char *KernelSource =  load_program_source(file);
 for(i=0;i<num_ker;i++)
{
   clProgram[i] = clCreateProgramWithSource(clGPUContext, 
                1, (const char **) & KernelSource, 
                NULL, &errcode);
   //shrCheckError(errcode, CL_SUCCESS);
 
   errcode = clBuildProgram(clProgram[i], 0, 
              NULL, NULL, NULL, NULL);
   //shrCheckError(errcode, CL_SUCCESS);
 
   clKernel[i] = clCreateKernel(clProgram[i], 
               "matrixMul", &errcode);
} 
  //shrCheckError(errcode, CL_SUCCESS);
  // Setup device memory; the data is copied in with clEnqueueWriteBuffer below,
  // so no host pointer may be passed here without CL_MEM_COPY_HOST_PTR
   d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE,
          mem_size_C, NULL, &errcode);
   d_A = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE,
          mem_size_A, NULL, &errcode);
   d_B = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE,
          mem_size_B, NULL, &errcode);

     // Write our data set into the input array in device memory

for(i=0;i<num_queue;i++){
    errcode = clEnqueueWriteBuffer(clCommandQue[i], d_A, CL_TRUE, 0,mem_size_A, h_A, 0, NULL, NULL);
errcode = clEnqueueWriteBuffer(clCommandQue[i], d_B, CL_TRUE, 0,mem_size_B, h_B, 0, NULL, NULL);
}
    
   // 7. Launch OpenCL kernel
   size_t localWorkSize[2], globalWorkSize[2];
 
   int wA = WA;
   int wC = WC;
for(i=0;i<num_ker;i++)
{
   errcode = clSetKernelArg(clKernel[i], 0, 
              sizeof(cl_mem), (void *)&d_C);
   errcode = clSetKernelArg(clKernel[i], 1, 
              sizeof(cl_mem), (void *)&d_A);
   errcode = clSetKernelArg(clKernel[i], 2, 
              sizeof(cl_mem), (void *)&d_B);
   errcode = clSetKernelArg(clKernel[i], 3, 
              sizeof(int), (void *)&wA);
   errcode = clSetKernelArg(clKernel[i], 4, 
              sizeof(int), (void *)&wC);
}
//   shrCheckError(errcode, CL_SUCCESS);
//struct timespec start, finish;
 
 int value;
value =atoi(argv[4]);
   localWorkSize[0] = value ;
   localWorkSize[1] = value ;
   globalWorkSize[0] = HA;
   globalWorkSize[1] = HA;
//clFinish(clCommandQue);

//timer starting
// clock_gettime(CLOCK_MONOTONIC, &start);
//struct timeval tim;
  //double t1,t2;

//    gettimeofday(&tim, NULL);
  //  t1=tim.tv_sec+(tim.tv_usec/1000000.0);
    gettimeofday(&tim, NULL);
    t1=tim.tv_sec+(tim.tv_usec/1000000.0);
//multikernels inside queues
int j=0;
for(j=0;j<num_queue;j++)
{
for(i=0;i<num_ker;i++){
   errcode = clEnqueueNDRangeKernel(clCommandQue[j], 
              clKernel[i], 2, NULL, globalWorkSize, 
              localWorkSize, 0, NULL, NULL);
}
}
for(i=0;i<num_queue;i++)
{
 clFinish(clCommandQue[i]);
}

gettimeofday(&tim, NULL);
    t2=tim.tv_sec+(tim.tv_usec/1000000.0);
printf("%.6lf\t",(t2-t1));
 
 // shrCheckError(errcode, CL_SUCCESS);
/*  clock_gettime(CLOCK_MONOTONIC, &finish);
        elapsed = (finish.tv_sec - start.tv_sec);
        elapsed += (finish.tv_nsec - start.tv_nsec)/ 1000000000.0;

printf("Work Item/threads = %d \n",value);
printf("time taken by GPU = %le\n ",elapsed);
*/
   // 8. Retrieve result from device

for(i=0;i<num_queue;i++)
{
   errcode = clEnqueueReadBuffer(clCommandQue[i], 
              d_C, CL_TRUE, 0, mem_size_C, 
              h_C, 0, NULL, NULL);
   //shrCheckError(errcode, CL_SUCCESS);
}
for(i=0;i<num_queue;i++)
{
 clFinish(clCommandQue[i]);
}
 // shrCheckError(errcode, CL_SUCCESS);
  //clock_gettime(CLOCK_MONOTONIC, &finish);
    //    elapsed = (finish.tv_sec - start.tv_sec);
      //  elapsed += (finish.tv_nsec - start.tv_nsec)/ 1000000000.0;

//printf("Work Item/threads = %d \n",value);
//printf("time taken by GPU = %le\n ",elapsed);

   // 9. print out the results
   /*printf("\n\nMatrix C (Results)\n");
   for(i = 0; i < size_C; i++)
   {
      printf("%f ", h_C[i]);
      if(((i + 1) % WC) == 0)
      printf("\n");
   }
   printf("\n");*/
 
   // 10. clean up memory
   free(h_A);
   free(h_B);
   free(h_C);
 
   clReleaseMemObject(d_A);
   clReleaseMemObject(d_C);
   clReleaseMemObject(d_B);
 
//   free(device_id);
 free(KernelSource);
   clReleaseContext(clGPUContext);
for(i=0;i<num_ker;i++)
{
   clReleaseKernel(clKernel[i]);
   clReleaseProgram(clProgram[i]);
}
for(i=0;i<num_queue;i++){
   clReleaseCommandQueue(clCommandQue[i]);
}	
gettimeofday(&ftim, NULL);
    tim2=ftim.tv_sec+(ftim.tv_usec/1000000.0);
printf("%.6lf\t",(tim2-tim1));
printf("\n");
exit(0);
}
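Note: load_program_source() is used above to read matxm.cl into a string but is not defined in this listing. A minimal sketch (an assumption) that matches its use:

// read an OpenCL source file into a NUL-terminated string; caller frees the result
char *load_program_source(const char *filename)
{
    FILE *fp = fopen(filename, "r");
    if (!fp) return NULL;
    fseek(fp, 0, SEEK_END);
    long size = ftell(fp);
    rewind(fp);
    char *src = (char *)malloc((size_t)size + 1);
    size_t nread = fread(src, 1, (size_t)size, fp);
    src[nread] = '\0';
    fclose(fp);
    return src;
}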
Example #15
int main(int argc, char** argv)
{
	srand(2006);

	unsigned int size_A = WA*HA;
	unsigned int mem_size_A = sizeof(float)*size_A;
	float* h_A = (float*) malloc(mem_size_A);

	unsigned int size_B = WB*HB;
	unsigned int mem_size_B = sizeof(float)*size_B;
	float* h_B = (float*) malloc(mem_size_B);

	randomInit(h_A, size_A);
	randomInit(h_B, size_B);

	printf("\n\nMathrix A\n");
	for (int i = 0; i < size_A; i++)
	{
		printf("%f ", h_A[i]);
		if (((i+1)%WA)==0)
		{
			printf("\n");
		}
	}

	printf("\n\nMatrix B\n");
	for (int i = 0; i < size_B; i++)
	{
		printf("%f ", h_B[i]);
		if (((i+1)%WB)==0)
		{
			printf("\n");
		}
	}

	unsigned int size_C = WC*HC;
	unsigned int mem_size_C = sizeof(float)*size_C;
	float* h_C = (float*) malloc(mem_size_C);

	// naive matrix multiply on the host: C = A * B (row-major)
	for (int i = 0; i < HA; ++i)
	{
		for (int j = 0; j < WB; ++j)
		{
			float sum = 0.0f;
			for (int k = 0; k < WA; ++k)
			{
				sum += h_A[i*WA + k] * h_B[k*WB + j];
			}
			h_C[i*WC + j] = sum;
		}
	}

	printf("\n\nMatric C (Results)\n");
	for (int i = 0; i < size_C; i++)
	{
		printf("%f ", h_C[i]);
		if (((i+1)%WC)==0)
		{
			printf("\n");
		}
	}
	printf("\n");
	free(h_A);
	free(h_B);
	free(h_C);
	return 0;
}
Example #16
Map::Map(int width, int height, int mode) : width(width),height(height),mode(mode){
	randomInit();
}
Example #17
File: hw2.c Project: hemantjp/HW2
int
main(int argc, char** argv)
{


   srand(1000);
   int i;

   unsigned int size_A = WA * HA;
   unsigned int mem_size_A = sizeof(float) * size_A;
   float* h_A = (float*) malloc(mem_size_A);

   unsigned int size_B = WB * HB;
   unsigned int mem_size_B = sizeof(float) * size_B;
   float* h_B = (float*) malloc(mem_size_B);


   randomInit(h_A, size_A);
   randomInit(h_B, size_B);


   unsigned int size_C = WC * HC;
   unsigned int mem_size_C = sizeof(float) * size_C;
   float* h_C = (float*) malloc(mem_size_C);

   cl_context clGPUContext;
   cl_command_queue clCommandQue;
   cl_program clProgram;
   cl_kernel clKernel;
   cl_event mm;

   size_t dataBytes;
   size_t kernelLength;
   cl_int errcode;


   cl_mem d_A;
   cl_mem d_B;
   cl_mem d_C;


   clGPUContext = clCreateContextFromType(0,
                   CL_DEVICE_TYPE_GPU,
                   NULL, NULL, &errcode);



   errcode = clGetContextInfo(clGPUContext,
              CL_CONTEXT_DEVICES, 0, NULL,
              &dataBytes);
   cl_device_id *clDevices = (cl_device_id *)
              malloc(dataBytes);
   errcode |= clGetContextInfo(clGPUContext,
              CL_CONTEXT_DEVICES, dataBytes,
              clDevices, NULL);



   clCommandQue = clCreateCommandQueue(clGPUContext,
                  clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode);



   d_C = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE,
          mem_size_A, NULL, &errcode);
   d_A = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
          mem_size_A, h_A, &errcode);
   d_B = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
          mem_size_B, h_B, &errcode);


   FILE* fp = fopen("hw2.cl", "r");
   fseek (fp , 0 , SEEK_END);
   const size_t lSize = ftell(fp);
   rewind(fp);
   unsigned char* buffer;
   buffer = (unsigned char*) malloc (lSize);
   fread(buffer, 1, lSize, fp);
   fclose(fp);

   // hw2.cl is read above as plain OpenCL C source, so build it from source
   clProgram = clCreateProgramWithSource(clGPUContext,
                1, (const char **)&buffer,
                &lSize, &errcode);
   errcode = clBuildProgram(clProgram, 0, NULL, NULL,
                NULL, NULL);


   clKernel = clCreateKernel(clProgram,
               "MM", &errcode);




   size_t globalWorkSize[2];

   int wA = WA;
   int wC = WC;
   errcode = clSetKernelArg(clKernel, 0,
              sizeof(cl_mem), (void *)&d_C);
   errcode |= clSetKernelArg(clKernel, 1,
              sizeof(cl_mem), (void *)&d_A);
   errcode |= clSetKernelArg(clKernel, 2,
              sizeof(cl_mem), (void *)&d_B);
   errcode |= clSetKernelArg(clKernel, 3,
              sizeof(int), (void *)&wA);
   errcode |= clSetKernelArg(clKernel, 4,
              sizeof(int), (void *)&wC);



   globalWorkSize[0] = 16;
   globalWorkSize[1] = 16;

   cl_ulong time_start, time_end, total_time = 0;

   errcode = clEnqueueNDRangeKernel(clCommandQue,
              clKernel, 2, NULL, globalWorkSize,
              NULL, 0, NULL, &mm);
   clFinish(clCommandQue);

         clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_START,
              sizeof(time_start), &time_start, NULL);
        clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_END,
               sizeof(time_end), &time_end, NULL);
         total_time += time_end - time_start;


         printf("Average time = %lu\n", total_time);
   errcode = clEnqueueReadBuffer(clCommandQue,
              d_C, CL_TRUE, 0, mem_size_C,
              h_C, 0, NULL, NULL);



   free(h_A);
   free(h_B);
   free(h_C);

   clReleaseMemObject(d_A);
   clReleaseMemObject(d_C);
   clReleaseMemObject(d_B);

   free(clDevices);

   clReleaseContext(clGPUContext);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseCommandQueue(clCommandQue);

}
Example #18
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
    cudaDeviceProp deviceProp;
    cudaError_t error;

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    // set seed for rand()
    srand(2006);

    // allocate host memory for matrices A and B
    unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);

    // set seed for rand()
    srand(2006);

    // initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // allocate device memory
    float *d_A, *d_B, *d_C;
    unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;

    // allocate host memory for the result
    float *h_C      = (float *) malloc(mem_size_C);
    float *h_CUBLAS = (float *) malloc(mem_size_C);

    error = cudaMalloc((void **) &d_A, mem_size_A);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_B, mem_size_B);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

	error = cudaMalloc((void **) &d_C, mem_size_C);

	if (error != cudaSuccess)
	{
		printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
		exit(EXIT_FAILURE);
	}

	// create and start timer
	StopWatchInterface *timerMemIn = NULL;
	sdkCreateTimer(&timerMemIn);
	// start the timer
	sdkStartTimer(&timerMemIn);

    // copy host memory to device
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy d_A h_A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy d_B h_B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

	sdkStopTimer(&timerMemIn);
	printf("\nMemory H2D Transferring time: %f (ms)\n", sdkGetTimerValue(&timerMemIn));
	sdkDeleteTimer(&timerMemIn);

    // setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);

    // create and start timer
    printf("Computing result using CUBLAS...");

    // execute the kernel
    int nIter = 30;

    // CUBLAS version 2.0
    {
        cublasHandle_t handle;

        cublasStatus_t ret;

        ret = cublasCreate(&handle);

        if (ret != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
            exit(EXIT_FAILURE);
        }

        const float alpha = 1.0f;
        const float beta  = 0.0f;
        //Perform warmup operation with cublas
        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA);

        if (ret != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
            exit(EXIT_FAILURE);
        }

        // Allocate CUDA events that we'll use for timing
        cudaEvent_t start;
        error = cudaEventCreate(&start);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        cudaEvent_t stop;
        error = cudaEventCreate(&stop);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        // Record the start event
        error = cudaEventRecord(start, NULL);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        for (int j = 0; j < nIter; j++)
        {
            //note cublas is column primary!
            //need to transpose the order
            ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA);

            if (ret != CUBLAS_STATUS_SUCCESS)
            {
                printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
                exit(EXIT_FAILURE);
            }
        }

        printf("done.\n");

        // Record the stop event
        error = cudaEventRecord(stop, NULL);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        // Wait for the stop event to complete
        error = cudaEventSynchronize(stop);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        float msecTotal = 0.0f;
        error = cudaEventElapsedTime(&msecTotal, start, stop);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        // Compute and print the performance
        float msecPerMatrixMul = msecTotal / nIter;
        double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB;
        double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
        printf(
            "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
            gigaFlops,
            msecPerMatrixMul,
            flopsPerMatrixMul);

		// create and start timer
		StopWatchInterface *timerMemOut = NULL;
		sdkCreateTimer(&timerMemOut);
		// start the timer
		sdkStartTimer(&timerMemOut);

        // copy result from device to host
        error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);

		sdkStopTimer(&timerMemOut);
		printf("\Memory D2H Transferring time: %f (ms)\n", sdkGetTimerValue(&timerMemOut));
		sdkDeleteTimer(&timerMemOut);

        if (error != cudaSuccess)
        {
            printf("cudaMemcpy h_CUBLAS d_C returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }

        checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
    }

    // compute reference solution
    printf("Computing result using host CPU...");
    float *reference = (float *)malloc(mem_size_C);

	// create and start timer
	StopWatchInterface *timer = NULL;
	sdkCreateTimer(&timer);
	// start the timer
	sdkStartTimer(&timer);

    matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);

	sdkStopTimer(&timer);
	printf("\nCPU Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

    printf("done.\n");

    // check result (CUBLAS)
    bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);

    if (resCUBLAS != true)
    {
        printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
    }

    printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(reference);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    cudaDeviceReset();

    if (resCUBLAS == true)
    {
        return EXIT_SUCCESS;    // exit code 0
    }
    else
    {
        return EXIT_FAILURE;    // exit code 1
    }
}
Ejemplo n.º 19
0
int main( int argc, char* argv[] )
{
    // Length of vectors: n = 256 * m, with m taken from the command line
    int m = atoi(argv[4]);
    unsigned int n = (256 * m);

    // Matrix variables: OpenCL device memory for the matrices
   cl_mem d_A;
   cl_mem d_B;
   cl_mem d_C;

//########################Vector Add Variables
// Host input vectors
    int *h_a;
    int *h_b;
    // Host output vector
    int *h_c;
    // Device input buffers
    cl_mem d_a;
    cl_mem d_b;
    // Device output buffer
    cl_mem d_c;
//	cl_kernel *kernel; 
    cl_platform_id* cpPlatform;        // OpenCL platform
    cl_device_id device_id;           // device ID
    cl_context context;               // context
    //cl_command_queue* queue;           // command queue
    //cl_command_queue queue;           // command queue
//    cl_program *program;               // program
cl_platform_id* platforms;    // platform IDs; each platform in the system gets its own entry
cl_uint platformCount;        // number of platforms found

    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(int);
 
    // Allocate memory for each vector on host
    h_a = (int*)malloc(bytes);
    h_b = (int*)malloc(bytes);
    h_c = (int*)malloc(bytes);
    // Initialize vectors on host
    int i;
    for( i = 0; i < n; i++ )
    {
        h_a[i] = i;
        h_b[i] = i;
//	printf("%d ",h_a[i]);
    }
 
    size_t globalSize, localSize; //similar to cuda
    cl_int err;//for errors
    int workgrp;
    int wrkitm;
    int num_ker;
    num_ker=atoi(argv[2]);
    wrkitm=atoi(argv[3]);             // work-group size taken from the command line
    // Number of work items in each local work group
    localSize = wrkitm;
    // Total number of work items - localSize must be a divisor of this
    globalSize = n;
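    // Note: on OpenCL 1.x, clEnqueueNDRangeKernel fails with
    // CL_INVALID_WORK_GROUP_SIZE unless localSize evenly divides globalSize,
    // so the work-group size passed in argv[3] is assumed to divide n = 256*m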
//################################# Done vector ###################
//#############Matrix Multiplication Variables ###############
int WA,HA,WB,HB,WC,HC;
WA = n;
HA = WA;
WB = WA;
HB = WB;
WC = WA;
HC = WA;
   // set seed for rand()
   srand(2006);

   // 1. allocate host memory for matrices A and B
        // (the matrix dimensions are derived from n above)
   unsigned int size_A = WA * HA;
   unsigned int mem_size_A = sizeof(float) * size_A;
   float* h_A = (float*) malloc(mem_size_A);

   unsigned int size_B = WB * HB;
   unsigned int mem_size_B = sizeof(float) * size_B;
   float* h_B = (float*) malloc(mem_size_B);
// 4. allocate host memory for the result C
   unsigned int size_C = WC * HC;
   unsigned int  mem_size_C = sizeof(float) * size_C;
   float* h_C = (float*) malloc(mem_size_C);
 // 2. initialize host memory
   randomInit(h_A, size_A);
   randomInit(h_B, size_B);
//######################## matrix done #######################
// Allocate one command queue, program and kernel per kernel index, so each
// kernel can be submitted on its own queue
cl_command_queue *queue = (cl_command_queue *)malloc(num_ker * sizeof(cl_command_queue));
cl_kernel *kernel = (cl_kernel *)malloc(num_ker * sizeof(cl_kernel));
cl_program *program = (cl_program *)malloc(num_ker * sizeof(cl_program));
// Discover the available platforms: query the count first, then fetch the IDs
clGetPlatformIDs(0, NULL, &platformCount);
cpPlatform = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
clGetPlatformIDs(platformCount, cpPlatform, NULL);

int choice = atoi(argv[1]);
if(choice ==1)
{
// CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_ACCELERATOR or CL_DEVICE_TYPE_ALL could be requested here instead;
// this call can be repeated as many times as needed
    err = clGetDeviceIDs(cpPlatform[0], CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
        printf("Error: Failed to create a device group!\n");
}

else
{
    // Get ID for the device
    err = clGetDeviceIDs(cpPlatform[1], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
    }
}
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
// Allocate the kernel file-name and kernel-source arrays
char **file=(char **)malloc(num_ker * sizeof(char *));
char **KernelSource=(char **)malloc(num_ker * sizeof(char *));

	for(i=0;i<num_ker;i++)
	{
		queue[i] = clCreateCommandQueue(context, device_id, 0, &err);
	}
	file[0] = "vectadd.cl";
	KernelSource[0] = load_program_source(file[0]);
	file[1] = "matxm.cl";
	KernelSource[1] = load_program_source(file[1]);
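	// Note: the two assignments above assume num_ker (argv[2]) is at least 2;
	// a smaller value would write past the malloc'd arrays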
for(i=0;i<num_ker;i++)
{
	// Create the compute program from the source buffer
    program[i] = clCreateProgramWithSource(context, 1,
                            (const char **) & KernelSource[i], NULL, &err);
    // Build the program executable
    clBuildProgram(program[i], 0, NULL, NULL, NULL, NULL);
    // Create the compute kernel in the program we wish to run
    kernel[i] = clCreateKernel(program[i], file[i], &err);
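    // NOTE: clCreateKernel expects the name of a __kernel function, not a file
    // name; passing file[i] here assumes the kernel in each .cl file is looked
    // up under that exact string - otherwise err is set to CL_INVALID_KERNEL_NAME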
 }
//Vector Start
    // Create the input and output arrays in device memory for our calculation
    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
//vector finish
//matrix start
// The host data is copied in later with clEnqueueWriteBuffer, so no host
// pointer is passed here (a non-NULL pointer without CL_MEM_USE_HOST_PTR or
// CL_MEM_COPY_HOST_PTR would make clCreateBuffer fail with CL_INVALID_HOST_PTR)
d_C = clCreateBuffer(context, CL_MEM_READ_WRITE,
          mem_size_C, NULL, &err);
   d_A = clCreateBuffer(context,
          CL_MEM_READ_WRITE,
          mem_size_A, NULL, &err);
   d_B = clCreateBuffer(context,
          CL_MEM_READ_WRITE,
          mem_size_B, NULL, &err);
//matrix finish
	// Write our data set into the input array in device memory
	for(i=0;i<num_ker;i++)
{
if(i==0)//for vectorADD
{
    err = clEnqueueWriteBuffer(queue[i], d_a, CL_TRUE, 0,bytes, h_a, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue[i], d_b, CL_TRUE, 0,bytes, h_b, 0, NULL, NULL);
  // Set the arguments to our compute kernel
    err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &d_a);
    err = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), &d_b);
    err = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &d_c);
    err = clSetKernelArg(kernel[i], 3, sizeof(unsigned int), &n);
  // Get the maximum work group size for executing the kernel on the device
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }

}
else if(i==1)
{ err = clEnqueueWriteBuffer(queue[i], d_A, CL_TRUE, 0,mem_size_A, h_A, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue[i], d_B, CL_TRUE, 0,mem_size_B, h_B, 0, NULL, NULL);
 //size_t localWorkSize[2], globalWorkSize[2];

   int wA = WA;
   int wC = WC;
   err = clSetKernelArg(kernel[i], 0,
              sizeof(cl_mem), (void *)&d_C);
   err = clSetKernelArg(kernel[i], 1,
              sizeof(cl_mem), (void *)&d_A);
   err = clSetKernelArg(kernel[i], 2,
              sizeof(cl_mem), (void *)&d_B);
   err = clSetKernelArg(kernel[i], 3,
              sizeof(int), (void *)&wA);
   err = clSetKernelArg(kernel[i], 4,
              sizeof(int), (void *)&wC);

}
}
  /*  // Set the arguments to our compute kernel
    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
    err = clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
  // Get the maximum work group size for executing the kernel on the device
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }
    */
//struct timeval tim;
  //double t1,t2;

    //gettimeofday(&tim, NULL);
    //t1=tim.tv_sec+(tim.tv_usec/1000000.0);

// TODO: the work sizes still need attention - the matrix-multiply kernel is
// launched with the same 1-D NDRange as the vector add, which only works if
// matxm.cl indexes by a single global id (a 2-D range is the more usual choice)
for(i=0;i<num_ker;i++)
{
err = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL, &globalSize, &localSize,
                                                              0, NULL, NULL);


}

//for(i=0;i<num_ker;i++)
//clFinish(queue[i]);

//gettimeofday(&tim, NULL);
  //  t2=tim.tv_sec+(tim.tv_usec/1000000.0);
//printf("GPU time %.4lf\t",(t2-t1));

for(i=0;i<num_ker;++i)
{
if(i==0)
{
clEnqueueReadBuffer(queue[i], d_c, CL_TRUE, 0,
                                bytes, h_c, 0, NULL, NULL ); 
}
else if(i==1)
{
err = clEnqueueReadBuffer(queue[i],
              d_C, CL_TRUE, 0, mem_size_C,
              h_C, 0, NULL, NULL);
   }
}  
for(i=0;i<num_ker;++i)
{
clFinish(queue[i]);
}
    // release matrix host memory
    free(h_A);
    free(h_B);
    free(h_C);

    // release OpenCL resources
    clReleaseMemObject(d_A);
    clReleaseMemObject(d_C);
    clReleaseMemObject(d_B);
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
 //   clReleaseProgram(program);
   // clReleaseKernel(kernel);
for(i=0;i<num_ker;++i)
{
    clReleaseCommandQueue(queue[i]);
    clReleaseKernel(kernel[i]);
    clReleaseProgram(program[i]);
}
    clReleaseContext(context);
 
    //release host memory
    free(h_a);
    free(h_b);
    free(h_c);
 
    return 0;
}
Ejemplo n.º 20
0
//----------------------------------------------------------
//      System initialization
//----------------------------------------------------------
static inline void initSystem(void)
{
    bool success;
    (void)success;

    // disable interrupts: disabled on msp430 by default, but other systems might need this
    DISABLE_INTS();

    // stop the watchdog: GCC disables it by default, but other compilers might not be so helpful
    watchdogStop();

    // TODO: init dynamic memory
    // platformMemInit();

    // basic, platform-specific initialization: timers, platform-specific drivers (?)
    initPlatform();

    // start energy accounting (as soon as timers are initialized)
    energyConsumerOn(ENERGY_CONSUMER_MCU);

#ifdef USE_PRINT
    // init printing to serial (makes sense only after clock has been calibrated)
    if (printInit != NULL) printInit();
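    // (printInit is checked against NULL because it is presumably only
    // provided on platforms/configurations that actually have serial output)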
#endif

    INIT_PRINTF("starting MansOS...\n");

#ifdef USE_LEDS
    INIT_PRINTF("init LED(s)...\n");
    ledsInit();
#endif
#ifdef USE_BEEPER
    beeperInit();
#endif
#ifdef RAMTEXT_START
    if ((MemoryAddress_t)&_end > RAMTEXT_START) {
        // Panic right away on RAM overflow.
        // If this happens, you might want to increase the address
        // specified by CONST_RAMTEXT_START in the config file
        assertionFailed("Overflow between .data and .ramtext sections", __FILE__, __LINE__);
    }
#endif
#ifdef USE_ADC
    if (adcInit != NULL) {
        INIT_PRINTF("init ADC...\n");
        adcInit();
    }
#endif
#ifdef USE_RANDOM
    INIT_PRINTF("init RNG...\n");
    randomInit();
#endif
#if USE_ALARMS
    INIT_PRINTF("init alarms...\n");
    initAlarms();
#endif
#ifdef USE_RADIO
    INIT_PRINTF("init radio...\n");
    radioInit();
#endif
#ifdef USE_ADDRESSING
    INIT_PRINTF("init communication stack...\n");
    networkingInit();
#endif
#ifdef USE_EXT_FLASH
    INIT_PRINTF("init external flash...\n");
    extFlashInit();
#endif
#ifdef USE_SDCARD
    INIT_PRINTF("init SD card...\n");
    sdcardInit();
#endif
#ifdef USE_EEPROM
    INIT_PRINTF("init EEPROM...\n");
    eepromInit();
#endif
#ifdef USE_ISL29003
    INIT_PRINTF("init ISL light sensor...\n");
    success = islInit();
    if (!success) {
        INIT_PRINTF("ISL init failed!\n");
    }
#endif
#ifdef USE_ADS1115
    INIT_PRINTF("init ADS111x ADC converter chip...\n");
    adsInit();
#endif
#if USE_ADS8638
    INIT_PRINTF("init ADS8638 ADC converter chip...\n");
    ads8638Init();
#endif
#if USE_ADS8328
    INIT_PRINTF("init ADS8328 ADC converter chip...\n");
    ads8328Init();
#endif
#if USE_AD5258
    INIT_PRINTF("init AD5258 digital potentiometer...\n");
    ad5258Init();
#endif
#if USE_DAC7718
    INIT_PRINTF("init DAC7718 DAC converter chip...\n");
    dac7718Init();
#endif
#if USE_ISL1219
    INIT_PRINTF("init ISL1219 real-time clock chip...\n");
    isl1219Init();
#endif
#ifdef USE_HUMIDITY
    INIT_PRINTF("init humidity sensor...\n");
    humidityInit();
#endif
#ifdef USE_ACCEL
    INIT_PRINTF("init accelerometer...\n");
    accelInit();
#endif
#ifdef USE_TIMESYNC
    INIT_PRINTF("init base station time sync...\n");
    timesyncInit();
#endif
#ifdef USE_SMP
    INIT_PRINTF("init SSMP...\n");
    smpInit();
#endif
#ifdef USE_REPROGRAMMING
    INIT_PRINTF("init reprogramming...\n");
    bootParamsInit();
#endif
#ifdef USE_DCO_RECALIBRATION
    extern void dcoRecalibrationInit(void);
    INIT_PRINTF("init DCO recalibration...\n");
    dcoRecalibrationInit();
#endif
#ifdef USE_FS
    INIT_PRINTF("init file system...\n");
    fsInit();
#endif
#ifdef USE_FATFS
    INIT_PRINTF("init FAT file system...\n");
    fatFsInit();
    INIT_PRINTF("init POSIX-like file routines...\n");
    posixStdioInit();
#endif
#ifdef USE_WMP
    INIT_PRINTF("init WMP...\n");
    wmpInit();
#endif
#ifdef USE_SEAL_NET
    INIT_PRINTF("init SEAL networking...\n");
    sealNetInit();
#endif

    INIT_PRINTF("starting the application...\n");
}