예제 #1
0
double LCPPolyDist<Dimension,FVector,DVector>::ProcessLoop (
    bool halfspaceConstructor, int& statusCode, FVector closest[2])
{
    int i;
    double* Q = new1<double>(mNumEquations);
    double** M = new2<double>(mNumEquations, mNumEquations);

    statusCode = 0;

    bool processing = true;
    if (!BuildMatrices(M, Q))
    {
        WM5_LCPPOLYDIST_FUNCTION(CloseLog());
        closest[0] = FVector::ZERO;
        closest[1] = FVector::ZERO;
        processing = false;
    }

    WM5_LCPPOLYDIST_FUNCTION(PrintMatrices(M, Q));

    int tryNumber = 0;
    double distance = -Mathd::MAX_REAL;
    while (processing)
    {
        double* zResult = new1<double>(mNumEquations);
        double* wResult = new1<double>(mNumEquations);
        int lcpStatusCode;

        LCPSolver solver(mNumEquations, M, Q, zResult, wResult, lcpStatusCode);
        WM5_UNUSED(solver);

        switch (lcpStatusCode)
        {
        case SC_FOUND_TRIVIAL_SOLUTION:
        {
            statusCode = SC_FOUND_TRIVIAL_SOLUTION;
            break;
        }
        case SC_FOUND_SOLUTION:
        {
            DVector lcpClosest[2];
            for (i = 0; i < mDimension; ++i)
            {
                lcpClosest[0][i] = (float)zResult[i];
                lcpClosest[1][i] = (float)zResult[i + mDimension];
            }

            DVector diff = lcpClosest[0] - lcpClosest[1];
            distance = diff.Length();

            statusCode = VerifySolution(lcpClosest);

            if (!halfspaceConstructor)
            {
                WM5_LCPPOLYDIST_FUNCTION(VerifyWithTestPoints(lcpClosest,
                    statusCode));
            }

            for (i = 0; i < mDimension; ++i)
            {
                closest[0][i] = (float)lcpClosest[0][i];
                closest[1][i] = (float)lcpClosest[1][i];
            }
            processing = false;
            break;
        }
        case SC_CANNOT_REMOVE_COMPLEMENTARY:
        {
            WM5_LCPPOLYDIST_FUNCTION(LogRetries(tryNumber));
            if (tryNumber == 3)
            {
                statusCode = SC_CANNOT_REMOVE_COMPLEMENTARY;
                processing = false;
                break;
            }
            MoveHalfspaces(mNumFaces1, mA1, mB1);
            WM5_LCPPOLYDIST_FUNCTION(LogHalfspaces(mNumFaces1, mA1, mB1));
            MoveHalfspaces(mNumFaces2, mA2, mB2);
            WM5_LCPPOLYDIST_FUNCTION(LogHalfspaces(mNumFaces2, mA2, mB2));
            BuildMatrices(M, Q);
            ++tryNumber;
            break;
        }
        case SC_EXCEEDED_MAX_RETRIES:
        {
            WM5_LCPPOLYDIST_FUNCTION(LCPSolverLoopLimit());
            statusCode = SC_EXCEEDED_MAX_RETRIES;
            processing = false;
            break;
        }
        }
        delete1(wResult);
        delete1(zResult);
    }

    delete2(M);
    delete1(Q);

    return distance;
}
예제 #2
0
int main(int argc, char *argv[]){

	if (MODE == 5){

		printf("---OpenCL Test Code---\n\n");


		cl_int errNum;
		cl_uint numPlatforms;
		cl_platform_id *platforms = NULL;
		cl_uint numDevices;
		cl_device_id *devices = NULL;

		//platform info fields
		char vendor[1024], name[1024], version[1024];

		//device info fields
		size_t MAX_WORK_GROUP_SIZE;
		cl_ulong GLOBAL_MEM_CACHE_SIZE, GLOBAL_MEM_SIZE, LOCAL_MEM_SIZE, GLOBAL_MEM_CACHELINE_SIZE;
		cl_uint MAX_COMPUTE_UNITS, MAX_WORK_ITEM_DIMENSIONS;
		size_t MAX_WORK_ITEM_SIZES[3];
		char DEVICE_NAME[1024], DEVICE_VENDOR[1024], DEVICE_VERSION[1024], DRIVER_VERSION[1024], EXTENSIONS[2048];
		cl_device_mem_cache_type GLOBAL_MEM_CACHE_TYPE;


		//printf("Getting number of OpenCL Platforms...\n");
		errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
		if (errNum != CL_SUCCESS)
		{
			printf("Failed to get number of OpenCL platforms.\n");
			return 0;
		}
		else
		{

			//printf("found %d.\n", numPlatforms);
		}

		//printf("Allocating space for the platform info...\n");
		platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id));

		printf("---Platform Info---\n");
		errNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
		if (errNum != CL_SUCCESS)
		{
			printf("Failed to get platform info.\n");
			return 0;
		}
		else
		{
			clGetPlatformInfo (platforms[0], CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL);
			clGetPlatformInfo (platforms[0], CL_PLATFORM_NAME, sizeof(name), name, NULL);
			clGetPlatformInfo (platforms[0], CL_PLATFORM_VERSION, sizeof(version), version, NULL);

			//printf("Got platform info.\n");
			printf("Vendor: \t%s\n", vendor);
			printf("Name:   \t%s\n", name);
			printf("Version:\t%s\n", version);
		}

		//printf("Getting number of devices...\n");
		errNum = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
		if (errNum != CL_SUCCESS)
		{
			printf("Failed to get number of devices.\n");
			return 0;
		}
		else
		{
	    	//printf("Found %d.\n", numDevices);
	    }

		//printf("Allocating space for device info...\n");
		devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));

		printf("\n---Device Info---");
		errNum = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
		if (errNum != CL_SUCCESS)
		{
			printf("Failed to get device info.\n");
			return 0;
		}
		else
		{

			int i, j = 0;
			for (i = 0; i < numDevices; i++ )
			{
				printf("\nDevice ID: %d\n", i+1);
				clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(DEVICE_NAME), DEVICE_NAME, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(DEVICE_VENDOR), DEVICE_VENDOR, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(DEVICE_VERSION), DEVICE_VERSION, NULL);
				clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(DRIVER_VERSION), DRIVER_VERSION, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_EXTENSIONS, sizeof(EXTENSIONS), EXTENSIONS, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS), &MAX_COMPUTE_UNITS, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(GLOBAL_MEM_SIZE), &GLOBAL_MEM_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(LOCAL_MEM_SIZE), &LOCAL_MEM_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(MAX_WORK_ITEM_DIMENSIONS), &MAX_WORK_ITEM_DIMENSIONS, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(MAX_WORK_ITEM_SIZES), MAX_WORK_ITEM_SIZES, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(MAX_WORK_GROUP_SIZE), &MAX_WORK_GROUP_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(GLOBAL_MEM_CACHE_SIZE), &GLOBAL_MEM_CACHE_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(GLOBAL_MEM_CACHELINE_SIZE), &GLOBAL_MEM_CACHELINE_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(GLOBAL_MEM_CACHE_TYPE), &GLOBAL_MEM_CACHE_TYPE, NULL);


				printf("Device Name:\t%s\n", DEVICE_NAME);
				printf("Device Vendor:\t%s\n", DEVICE_VENDOR);
				printf("Device Version:\t%s\n", DEVICE_VERSION);
				printf("Driver Version:\t%s\n", DRIVER_VERSION);
				printf("EXTENSIONS:\t%s\n", EXTENSIONS);
				printf("Number of CUs:\t%d\n", MAX_COMPUTE_UNITS);
				printf("GMem:\t\t%lld (Bytes)\n", (long long) GLOBAL_MEM_SIZE);
				printf("GMem $ Size:\t%lld (Bytes)\n", (long long) GLOBAL_MEM_CACHE_SIZE);
				printf("GMem $ Line:\t%lld (Bytes)\n", (long long) GLOBAL_MEM_CACHELINE_SIZE);
				if(GLOBAL_MEM_CACHE_TYPE == CL_NONE)
				{
					printf("GMem $ Type:\tCL_NONE\n");
				}
				else if(GLOBAL_MEM_CACHE_TYPE == CL_READ_ONLY_CACHE)
				{
					printf("GMem $ Type:\tCL_READ_ONLY_CACHE\n");
				}

				else if(GLOBAL_MEM_CACHE_TYPE == CL_READ_WRITE_CACHE)
				{
					printf("GMem $ Type:\tCL_READ_WRITE_CACHE\n");
				}
				printf("LMem:\t\t%lld (Bytes)\n", (long long) LOCAL_MEM_SIZE);
				printf("Work Group Size:%d (Max)\n", (int) MAX_WORK_GROUP_SIZE);
				printf("Work Item Dim:\t%d (Max)\n", MAX_WORK_ITEM_DIMENSIONS);
				printf("Work Item Size:\t");
				for(j = 0; j < MAX_WORK_ITEM_DIMENSIONS; j ++)
				{
						if (j != (MAX_WORK_ITEM_DIMENSIONS -1))
						printf("%d, ", (int) MAX_WORK_ITEM_SIZES[j]);

						if (j == (MAX_WORK_ITEM_DIMENSIONS -1))
						printf("%d ", (int) MAX_WORK_ITEM_SIZES[j]);
				}
				printf("(Max)\n");

			}

				//printf("Got device info.\n");
		}


	}

	else if (MODE == 4){
		cl_context context = 0;
	    cl_command_queue commandQueue = 0;
	    cl_program program = 0;
	    cl_device_id device = 0;

	    //Create an OpenCL context on first available platform
	    context = CreateContext();
	    if (context == NULL)
	    {
	        printf("Failed to create OpenCL context.\n");
	        return 1;
	    }

	    //Create a command-queue on the first device available on the created context
	    commandQueue = CreateCommandQueue(context, &device);
	    if (commandQueue == NULL)
	    {
	    	printf("Failed to create commandQueue.\n");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }

	    // Create OpenCL program and store the binary for future use.
	    printf("Attempting to create kernel binary from source.\n");
	    program = CreateProgram(context, device, KERNELPATHIN);
	    if (program == NULL)
	    {
	    	printf("Failed to create Program");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }

	    printf("Kernel is saved.\n");
	    if (SaveProgramBinary(program, device, KERNELPATHOUT) == false)
	    {
	        printf("Failed to write program binary.\n");
	        Cleanup(context, commandQueue, program, NULL);
	        return 1;
	     }

	    //printf("---Done---");

	    //return 1;

	}
	else if (MODE == 3){

		//todo free remaining objects not passed to cleanup

		//profiling
		int write_bytes = 0;
		int read_bytes = 0;
		/*unsigned long long start_cycles, stop_cycles;
		unsigned long long start_setup, stop_setup;
		unsigned long long start_write, stop_write;
		unsigned long long start_read, stop_read;
		unsigned long long start_finalize, stop_finalize;
		struct timespec start_time_t, stop_time_t;*/


		printf("Stream Mode\n\n");
		//clock_gettime(CLOCK_MONOTONIC, &start_time_t);
		//start_cycles = rdtsc();


		int i;
		time_t t;
		srand((unsigned) time(&t));

	    // Create the two input vectors
	    printf("\nHostside malloc(s)\n");
	    fflush(stdout);
	    int *A = (int*)malloc(sizeof(int)*(SIZE*SIZE));
	    int *B = (int*)malloc(sizeof(int)*(SIZE*SIZE));
	    int *C = (int*)malloc(sizeof(int)*(SIZE*SIZE));

	    //profile
	    //bytes += 3 * sizeof(int)*(SIZE*SIZE);

	    printf("\nHostside mat init\n");
	    fflush(stdout);
	    for(i = 0; i < (SIZE*SIZE); i++) {
	        A[i] = B[i] = rand() % 10 + 1;;
	    }


	    //print matrix
    	printf("Matrix A[%d][%d]:\n", SIZE, SIZE);
	   	for(i = 0; i < (SIZE*SIZE); i++)
	    {
	   		printf("%3d ", A[i]);
	   		if(((i + 1) % SIZE) == 0)
	   			printf("\n");
	    }

	    //print matrix
	   	printf("\nMatrix B[%d][%d]:\n", SIZE, SIZE);
	    for(i = 0; i < (SIZE*SIZE); i++)
	    {
	    	printf("%3d ", B[i]);
	        if(((i + 1) % SIZE) == 0)
	        	printf("\n");
	    }


	    //syscall(STATS_RESET);


	    //Get platform and device information
	    cl_context context = 0;
	    cl_command_queue commandQueue = 0;
	    cl_program program = 0;
	    cl_device_id device = 0;
	    cl_kernel kernel = 0;
	    cl_uint err = 0;
	    //char *filepath = NULL;

	    //Create the context
	    printf("\nCreateContext\n");
	    fflush(stdout);
	    context = CreateContext();
	    if (context == NULL)
	    {
	    	printf("Failed to create OpenCL context.\n");
	    	return 1;
	    }

	   /* printf("\nEnd CreateContext\n");
	    fflush(stdout);*/

	    //Create a command-queue on the first device available on the created context
	    printf("\nCreateCommandQueue\n");
	    fflush(stdout);
	    commandQueue = CreateCommandQueue(context, &device);
	    if (commandQueue == NULL)
	    {
	    	printf("Failed to create command queue.\n");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }

	    //create the program from the binary
	    //program = CreateProgramFromBinary(context, device, "/home/stardica/Desktop/Kernels/vector.cl.bin.GPU");
	    //strcat(KERNELPATHOUT, ".GPU")
	    printf("\nCreateProgramFromBinary\n");
	    fflush(stdout);
	    program = CreateProgramFromBinary(context, device, KERNEL);
	    if (program == NULL)
	    {
	    	printf("Failed to load kernel binary,\n");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }



	    // Create OpenCL kernel
	    printf("\nclCreateKernel\n");
	    fflush(stdout);
	    kernel = clCreateKernel(program, "Matrix", NULL);
	    if (kernel == NULL)
	    {
	    	printf("Failed to create kernel.\n");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }

	    cl_mem a_mem_obj = 0;
	    cl_mem b_mem_obj = 0;
	    cl_mem c_mem_obj = 0;

  	    //Create memory buffers on the device for each vector

	    printf("\nclCreateBuffer(s)\n");
	    fflush(stdout);
	    if(LOCALMEM == 1 && CACHEDMEM == 0)
		{
			//this creates uncached buffers in the GPU's local memory
			#if M2S_CGM_OCL_SIM
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);

			}
			#else
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
			}
			#endif
		}

		if(SYSMEM == 1 && CACHEDMEM == 0)
		{
			//this creates uncached buffers in the system memory
			#if M2S_CGM_OCL_SIM
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				b_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
			}
			#else
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				b_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
			}
			#endif
		}

		if(SYSMEM == 1 && CACHEDMEM == 1)
		{
			//this creates cached buffers in the system memory.
			#if M2S_CGM_OCL_SIM
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				b_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				c_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
			}
			#else
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				b_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				c_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
			}
			#endif
		}

	    if (a_mem_obj == NULL || b_mem_obj == NULL  || c_mem_obj == NULL)
	    {
	    	printf("Failed to create memory objects.\n");
	    	Cleanup(context, commandQueue, program, kernel);
	    	return 1;
	    }

	    //Copy the lists A and B to their respective memory buffers
	    printf("\nclEnqueueWriteBuffer(s)\n");
	    fflush(stdout);
	    write_bytes += 2 * sizeof(int)*(SIZE*SIZE);
	   // start_write = rdtsc();
	    clEnqueueWriteBuffer(commandQueue, a_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), A, 0, NULL, NULL);
	    clEnqueueWriteBuffer(commandQueue, b_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), B, 0, NULL, NULL);
	   // stop_write = rdtsc();


	    // Set the arguments of the kernel
	    int *size = (int *)SIZE;
	    printf("\nclSetKernelArg(s)\n");
	    fflush(stdout);
	    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&c_mem_obj);
	    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&a_mem_obj);
	    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&b_mem_obj);
	    err = clSetKernelArg(kernel, 3, sizeof(int), (void *)&size);
	    if (err != CL_SUCCESS)
	    {
	    	printf("Kernel args not set.\n");
	    	return 1;
	    }

	    // Execute the OpenCL kernel on the list
	    size_t GlobalWorkSize[2], LocalWorkSize[2];

	    //Rember that in OpenCL we need to express the globalWorkSize in
	    //terms of the total number of threads. The underlying OpenCL API
	    //will look at the globalWorkSize and divide by the localWorkSize
	    //to arrive at a 64 by 64 NDRange of 16 by 16 work groups.

	    GlobalWorkSize[0] = GWS_0;//SIZE*SIZE*SIZE; // Process the entire lists
	    GlobalWorkSize[1] = GWS_1;//SIZE*SIZE*SIZE; // Process the entire lists
	    LocalWorkSize[0] = LWS_0; //SIZE Divide work items into groups of 64
	    LocalWorkSize[1] = LWS_1; //SIZE Divide work items into groups of 64


	    //used null for local, lets OpenCL determine the best local size.
	    //err = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
	    printf("\nclEnqueueNDRangeKernel\n");
	    fflush(stdout);
	    err = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
	    if (err != CL_SUCCESS)
	    {
	    	printf("ND range not enqueued. Code: %d\n", err);
	    	return 1;
	    }


	    //Read the memory buffer C on the device to the local variable C
	    printf("\nclEnqueueReadBuffer\n");
	    fflush(stdout);
	    read_bytes += sizeof(int)*(SIZE*SIZE);
	    //start_read = rdtsc();
	    err = clEnqueueReadBuffer(commandQueue, c_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), C, 0, NULL, NULL);
	   // stop_read = rdtsc();
	    if (err != CL_SUCCESS)
	    {
	    	printf("Buffer not returned.\n");
	    	return 1;
	    }

	    //syscall(STATS_STOP);

	    //print matrix
	    printf("\nMatrix C[%d][%d] = A[%d][%d]*B[%d][%d]:\n", SIZE, SIZE, SIZE, SIZE, SIZE, SIZE);
	    for(i = 0; i < (SIZE*SIZE); i++)
	    {
	    	printf("%3d ", C[i]);
	        if(((i + 1) % SIZE) == 0)
	        printf("\n");
	    }

	    printf("\nHostside clean up\n");
	    fflush(stdout);
	    err = clFlush(commandQueue);
	    err = clFinish(commandQueue);
	    Cleanup(context, commandQueue, program, kernel);
	    err = clReleaseMemObject(a_mem_obj);
	    err = clReleaseMemObject(b_mem_obj);
	    err = clReleaseMemObject(c_mem_obj);
	    free(A);
	    free(B);
	    free(C);

	    //printf("---Done---");



	    /*stop_cycles = rdtsc();
	    clock_gettime(CLOCK_MONOTONIC, &stop_time_t);
	    printf("Total cycles = %llu\n", (stop_cycles - start_cycles));

	    long int time_s = stop_time_t.tv_nsec - start_time_t.tv_nsec;
	    printf("Approximate runtime (check) = %ld ms\n", (time_s/1000000));

	    printf("Bytes written %d\n", write_bytes);
	    printf("transfer cycles = %llu\n", (stop_write - start_write));
	    printf("start at = %llu\n", (start_write - start_cycles));

	    printf("Bytes read %d\n", read_bytes);
	    printf("transfer cycles = %llu\n", (stop_read - start_read));
	    printf("start at = %llu\n", (start_read - start_cycles));*/

	}
	else if (MODE == 2){

		printf("Multi Thread Mode\n");
		//cal this:
		//assignToThisCore(0);//assign to core 0,1,2,...

		unsigned long long a, b;
	    int i = 0;
	    int j = 0;
	    int k = 0;

		LoadMatrices();

		pthread_t tid[SIZE*SIZE];


		//printf("waiting\n");
		//start our threads
		a = rdtsc();
		syscall(BEGIN_PARALLEL_SECTION);

		for(i=0;i<SIZE;i++){
			for(j=0;j<SIZE;j++){
				struct RowColumnData *RCData = (struct RowColumnData *) malloc(sizeof(struct RowColumnData));
				RCData->RowNum = i;
				RCData->ColumnNum = j;
				//printf("Thread create %d Row %d Col %d\n", k, RCData->RowNum, RCData->ColumnNum);
				pthread_create(&tid[k], NULL, RowColumnMultiply, RCData);
				k++;
			}
		}

		//Join threads////////////////////////////
		for (i=0;i<NUM_THREADS;i++)
		{
			pthread_join(tid[i], NULL);
		}
		syscall(END_PARALLEL_SECTION);
		b = rdtsc();

		PrintMatrices();



		//printf("\nend clock Cycles: %llu\n", b);
		printf("\nDone. Number of clock Cycles: %llu\n", b-a);

	}
	else if (MODE == 1)
	{

		printf("Single Thread Mode\n\n");
		//unsigned long long a, b;
		//a = rdtsc();
		//time_t t;
		int i,j,k;

		//srand((unsigned) time(&t));

		LoadMatrices();

		//multiply mats/////////////////////////
		for (i=0;i<SIZE;i++){
			for(j=0;j<SIZE;j++){
				for(k=0;k<SIZE;k++){
					matC[i][j] = matC[i][j] + (matA[i][k] * matB[k][j]);
					}
			}
		}

		PrintMatrices();

		//b = rdtsc();
		//printf("\nDone. Number of clock Cycles: %llu\n", b-a);
	}
	else if (MODE == 0)
	{
		printf("---Misc Tests---\n\n");

		printf("size of long long is %d\n", (int) sizeof(long long));
		printf("size of long is %d\n", (int) sizeof(long));
		printf("size of int is %d\n", (int) sizeof(int));
		printf("size of short is %d\n", (int) sizeof(short));
		printf("size of char * %d\n", (int) sizeof(char *));
		printf("size of unsigned int (word) %d\n", (int) sizeof(unsigned int));

		char *string = "test string";
		printf("Here is the string 1: \"%s\"\n", string);

		//Using the struct
		//set string variable and point to print_me.
		object.string = strdup(string);
		object.print_me = (void (*)(void *)) print_me;

		//use of print_me
		object.print_me(object.string);

		//pointer fun
		struct Object *ptr = &object;
		printf("this is the value of the pointer to struct object: %p\n", ptr);
		object.next=&object;
		printf("this is the value of the pointer to struct object: %p\n", object.next);
		object_ptr = &object;
		object_ptr->next = &object;
		printf("this is the value of the pointer to struct object: %p\n", object_ptr->next);

		//Macro fun
		PRINT(ptr, ptr);
		PRINT(object.next, object.next);
		PRINT(object_ptr->next, object_ptr->next);

		int mmu_page_size = 1 << 12;

		printf("mmu_papge_size = %d\n", mmu_page_size);


		//setjmp and longjmp fun
		/*jmp_buf environment;
		int i;

		i = setjmp(environment);
		printf("\n\nsetjmp returned = %d\n", i);

		printf("Env 1:\n");

		int x = 0;
		for(x = 0; x < 6; x++)
		{
			printf("  %x\n", environment[x]);
		}


		if (i < 3)
		{
			longjmp(environment, 3);
		}

		printf("longjmp finished with i = %d\n", i);*/


	}
	else
	{

		printf("---Invalid Mode Set---\n\n");

	}

	printf("\n---Done---\n");
	return 1;
}