Пример #1
0
RuntimeScheduler::RuntimeScheduler() :
		npart(0), SyncBarrier(TOTAL_DEVICES + 1) {

	_gmem = new MemorySystem();
	LowLevelMemAllocator::_memSys = (MemorySystem*) _gmem;

#if defined(STATIC)
	ds = new StaticScheduler();
#endif
#if defined(DEMAND_DRIVEN)
	ds = new DemandScheduler();
#endif
	unsigned int GPU_ID = 0;

	for (unsigned dev = 0; dev < TOTAL_DEVICES; dev++) {

		npart += TOTAL_CORES_TYPES[dev];

		switch (DEVICE_TYPES[dev]) {
		case CPU_X86:
			SysDevices[dev] = new DeviceX86(dev, (MemorySystem*) _gmem, ds,
					&SyncBarrier);
			ds->setQueueDevice(dev, SysDevices[dev]->INBOX);
			break;
		case GPU_CUDA:
			SysDevices[dev] = new DeviceCuda(dev, GPU_ID++,
					(MemorySystem*) _gmem, ds, &SyncBarrier);
			ds->setQueueDevice(dev, SysDevices[dev]->INBOX);
			break;
		default:
			SysDevices[dev] = NULL;
			break;
		}
	}

	npart = roundUpToNextPowerOfTwo(npart);

	SyncBarrier.wait();
	synchronize();
}
Пример #2
0
// Main function 
// *********************************************************************
int main(int argc, char **argv)
{
	
    // time stuff:
    time_t dtimer;
    time_t htimer;
	
    // set and log Global and Local work size dimensions
    
    szLocalWorkSize = roundUpToNextPowerOfTwo(CHARACTERS);
	//  szGlobalWorkSize = NODES * SITES * CHARACTERS;
    szGlobalWorkSize = roundUpToNextPowerOfTwo(SITES) * roundUpToNextPowerOfTwo(CHARACTERS);
    localMemorySize = roundUpToNextPowerOfTwo(CHARACTERS);
    printf("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", 
           szGlobalWorkSize, szLocalWorkSize, 
           (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); 
	
    // Allocate and initialize host arrays 
    //*************************************************
    printf( "Allocate and Init Host Mem...\n"); 
	
    node_cache      = (void*)malloc (sizeof(clfp)*CHARACTERS*SITES);
    parent_cache    = (void*)malloc (sizeof(clfp)*CHARACTERS*SITES);
    scalings        = (void*)malloc (sizeof(int)*CHARACTERS*SITES);
    model           = (void*)malloc (sizeof(clfp)*CHARACTERS*CHARACTERS);
    Golden          = (void*)malloc (sizeof(clfp)*CHARACTERS*SITES);
	
    long tempindex = 0;
    // initialize the vectors
    for (tempindex = 0; tempindex < (CHARACTERS*SITES); tempindex++)
    {
        ((fpoint*)node_cache)[tempindex] = 1./CHARACTERS; // this is just dummy filler
        ((fpoint*)Golden)[tempindex] = 1.;
        ((fpoint*)parent_cache)[tempindex] = 1.;
        ((int*)scalings)[tempindex] = 0;
    }
	
    // initialize the model
    for (tempindex = 0; tempindex < (CHARACTERS*CHARACTERS); tempindex++)
    {
        ((fpoint*)model)[tempindex] = 1./CHARACTERS; // this is just dummy filler
    }
	
    //**************************************************
    dtimer = time(NULL); 
	
    //Get an OpenCL platform
    ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL);
	
    printf("clGetPlatformID...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }
	
    //Get the devices
    ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
    printf("clGetDeviceIDs...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }
    
    size_t maxWorkGroupSize;
    ciErr1 = clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, 
							 sizeof(size_t), &maxWorkGroupSize, NULL);
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Getting max work group size failed!\n");
    }
    printf("Max work group size: %lu\n", (unsigned long)maxWorkGroupSize);
    
    cl_uint extcheck;
    ciErr1 = clGetDeviceInfo(cdDevice, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, 
							 sizeof(cl_uint), &extcheck, NULL);
    if (extcheck ==0 ) 
    {
        printf("Device does not support double precision.\n");
    }
    
    size_t returned_size = 0;
    cl_char vendor_name[1024] = {0};
    cl_char device_name[1024] = {0};
    ciErr1 = clGetDeviceInfo(cdDevice, CL_DEVICE_VENDOR, sizeof(vendor_name), 
							 vendor_name, &returned_size);
    ciErr1 |= clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(device_name), 
							  device_name, &returned_size);
    assert(ciErr1 == CL_SUCCESS);
    printf("Connecting to %s %s...\n", vendor_name, device_name);
	
    //Create the context
    cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1);
    printf("clCreateContext...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }
	
    // Create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1);
    printf("clCreateCommandQueue...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }
	
    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    cmNode_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, 
								  sizeof(clfp) * CHARACTERS * SITES, NULL, &ciErr1);
    cmModel = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, 
							 sizeof(clfp) * CHARACTERS * CHARACTERS, NULL, &ciErr2);
    ciErr1 |= ciErr2;
    cmParent_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, 
									sizeof(clfp) * CHARACTERS * SITES, NULL, &ciErr2);
    ciErr1 |= ciErr2;
    cmScalings = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE,
								sizeof(cl_int) * CHARACTERS * SITES, NULL, &ciErr2);
    ciErr1 |= ciErr2;
    printf("clCreateBuffer...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }
	
	// Create the program
    
	// Read the OpenCL kernel in from source file

	const char *program_source = "\n" \
	"#pragma OPENCL EXTENSION cl_khr_fp64: enable																				\n" \
	"" FLOATPREC                                                                                                                    \
	"__kernel void FirstLoop(__global const fpoint* node_cache, __global const fpoint* model, __global fpoint* parent_cache, 	\n" \
    "    __local fpoint* nodeScratch, __local fpoint * modelScratch, int nodes, int sites, int characters,						\n" \
    "   __global int* scalings, fpoint uflowthresh, fpoint scalar)																\n" \
	"{																															\n" \
	"   int parentCharGlobal = get_global_id(0); // a unique global ID for each parentcharacter									\n" \
    "   int parentCharLocal = get_local_id(0); // a local ID unique within the site.											\n" \
	"	if ((parentCharGlobal/characters) >= sites) return;																		\n" \
	"	if (parentCharLocal >= characters) return;																				\n" \
	"   nodeScratch[parentCharLocal] = node_cache[parentCharGlobal];															\n" \
    "   modelScratch[parentCharLocal] = model[parentCharLocal * characters + parentCharLocal];									\n" \
	"	barrier(CLK_LOCAL_MEM_FENCE);																							\n" \
	"   fpoint sum = 0.;																										\n" \
    "   long myChar;																											\n" \
    "   for (myChar = 0; myChar < characters; myChar++)																			\n" \
    "   {																														\n" \
    "       sum += nodeScratch[myChar] * modelScratch[myChar];																	\n" \
    "   }																														\n" \
    "   barrier(CLK_LOCAL_MEM_FENCE);																							\n" \
    "   while (parent_cache[parentCharGlobal] < uflowthresh)																	\n" \
    "   {																														\n" \
    "       parent_cache[parentCharGlobal] *= scalar;																			\n" \
    "    	scalings[parentCharGlobal] += 1;																					\n" \
    "	}																														\n" \
	"   parent_cache[parentCharGlobal] *= sum;																					\n" \
	"}																															\n" \
	"\n";

      
//	printf("LoadProgSource (%s)...\n", cSourceFile); 
//	char *program_source = load_program_source(cSourceFile, argv[0],  &szKernelLength);
	cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char**)&program_source,
	                                         NULL, &ciErr1);
	
    printf("clCreateProgramWithSource...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }
	
    ciErr1 = clBuildProgram(cpProgram, 1, &cdDevice, NULL, NULL, NULL);
    printf("clBuildProgram...\n"); 
	
    // Shows the log
    char* build_log;
    size_t log_size;
    // First call to know the proper size
    clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    build_log = new char[log_size+1];   
    // Second call to get the log
    clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
    build_log[log_size] = '\0';
    printf(build_log);
    delete[] build_log;
	
    if (ciErr1 != CL_SUCCESS)
    {
		printf("%i\n", ciErr1); //prints "1"
		switch(ciErr1)
		{
			case   CL_INVALID_PROGRAM: printf("CL_INVALID_PROGRAM\n"); break;
			case   CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
			case   CL_INVALID_DEVICE: printf("CL_INVALID_DEVICE\n"); break;
			case   CL_INVALID_BINARY: printf("CL_INVALID_BINARY\n"); break; 
			case   CL_INVALID_BUILD_OPTIONS: printf("CL_INVALID_BUILD_OPTIONS\n"); break;
			case   CL_COMPILER_NOT_AVAILABLE: printf("CL_COMPILER_NOT_AVAILABLE\n"); break;
			case   CL_BUILD_PROGRAM_FAILURE: printf("CL_BUILD_PROGRAM_FAILURE\n"); break;
			case   CL_INVALID_OPERATION: printf("CL_INVALID_OPERATION\n"); break;
			case   CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
			default: printf("Strange error\n"); //This is printed
		}
		printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
    }
	
    // Create the kernel
    ckKernel = clCreateKernel(cpProgram, "FirstLoop", &ciErr1);
    printf("clCreateKernel (FirstLoop)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }
	
	
    int tempNodeCount = NODES;
    int tempSiteCount = SITES;
    int tempCharCount = CHARACTERS;
	
    // Set the Argument values
    ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmNode_cache);
    ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmModel);
    ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmParent_cache);
    ciErr1 |= clSetKernelArg(ckKernel, 3, localMemorySize * sizeof(fpoint), NULL);
    ciErr1 |= clSetKernelArg(ckKernel, 4, localMemorySize * sizeof(fpoint), NULL);
    ciErr1 |= clSetKernelArg(ckKernel, 5, sizeof(cl_int), (void*)&tempNodeCount);
    ciErr1 |= clSetKernelArg(ckKernel, 6, sizeof(cl_int), (void*)&tempSiteCount);
    ciErr1 |= clSetKernelArg(ckKernel, 7, sizeof(cl_int), (void*)&tempCharCount);
    ciErr1 |= clSetKernelArg(ckKernel, 8, sizeof(cl_mem), (void*)&cmScalings);
    ciErr1 |= clSetKernelArg(ckKernel, 9, sizeof(clfp), (void*)&uflowThresh);
    ciErr1 |= clSetKernelArg(ckKernel, 10, sizeof(clfp), (void*)&scalar);
    printf("clSetKernelArg 0 - 10...\n\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }
	
    // --------------------------------------------------------
    // Start Core sequence... copy input data to GPU, compute, copy results back
	
    // Asynchronous write of data to GPU device
    ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmNode_cache, CL_FALSE, 0, 
								  sizeof(clfp) * CHARACTERS * SITES, node_cache, 0, NULL, NULL);
    ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmModel, CL_FALSE, 0, 
								   sizeof(clfp) * CHARACTERS * CHARACTERS, model, 0, NULL, NULL);
    ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmParent_cache, CL_FALSE, 0, 
								   sizeof(clfp) * CHARACTERS * SITES, parent_cache, 0, NULL, NULL);
    ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmScalings, CL_FALSE, 0,
								   sizeof(cl_int) * CHARACTERS * SITES, scalings, 0, NULL, NULL);
    printf("clEnqueueWriteBuffer (node_cache, parent_cache and model)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }
	
    // Launch kernel
    
    int nodeIndex;
	
	//    clock_gettime(CLOCK_REALTIME, &begin);
    
    printf("clEnqueueNDRangeKernel (FirstLoop)...\n"); 
    for (nodeIndex = 0; nodeIndex < NODES; nodeIndex++)
    {
		
        ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, 
										&szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
        
        if (ciErr1 != CL_SUCCESS)
        {
            printf("%i\n", ciErr1); //prints "1"
            switch(ciErr1)
            {
                case   CL_INVALID_PROGRAM_EXECUTABLE: printf("CL_INVALID_PROGRAM_EXECUTABLE\n"); break;
                case   CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
                case   CL_INVALID_KERNEL: printf("CL_INVALID_KERNEL\n"); break;
                case   CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;   
                case   CL_INVALID_KERNEL_ARGS: printf("CL_INVALID_KERNEL_ARGS\n"); break;
                case   CL_INVALID_WORK_DIMENSION: printf("CL_INVALID_WORK_DIMENSION\n"); break;
                case   CL_INVALID_GLOBAL_WORK_SIZE: printf("CL_INVALID_GLOBAL_WORK_SIZE\n"); break;
                case   CL_INVALID_GLOBAL_OFFSET: printf("CL_INVALID_GLOBAL_OFFSET\n"); break;
                case   CL_INVALID_WORK_GROUP_SIZE: printf("CL_INVALID_WORK_GROUP_SIZE\n"); break;
                case   CL_INVALID_WORK_ITEM_SIZE: printf("CL_INVALID_WORK_ITEM_SIZE\n"); break;
					//          case   CL_MISALIGNED_SUB_BUFFER_OFFSET: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
                case   CL_INVALID_IMAGE_SIZE: printf("CL_INVALID_IMAGE_SIZE\n"); break;
                case   CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
                case   CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
                case   CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
                case   CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
                default: printf("Strange error\n"); //This is printed
			}
			printf("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
			Cleanup(EXIT_FAILURE);
        }
		
		//      ciErr1 = clEnqueueBarrier(cqCommandQueue);
		
    }
    
	//    clock_gettime(CLOCK_REALTIME, &end);
	
    // Synchronous/blocking read of results, and check accumulated errors
    ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmParent_cache, CL_TRUE, 0, sizeof(clfp) * CHARACTERS * SITES, parent_cache, 0, NULL, NULL);
    ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmScalings, CL_TRUE, 0, sizeof(cl_int) * CHARACTERS * SITES, scalings, 0, NULL, NULL);
    printf("clEnqueueReadBuffer...\n\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("%i\n", ciErr1); //prints "1"
        switch(ciErr1)
        {
            case   CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
            case   CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
            case   CL_INVALID_MEM_OBJECT: printf("CL_INVALID_MEM_OBJECT\n"); break;
            case   CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;   
            case   CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
				//          case   CL_MISALIGNED_SUB_BUFFER_OFFSET: printf("CL_MISALIGNED_SUB_BUFFER_OFFSET\n"); break;
				//          case   CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST\n"); break;
            case   CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
            case   CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
            case   CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
            default: printf("Strange error\n"); //This is printed
        }
        printf("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(EXIT_FAILURE);
    }
    //--------------------------------------------------------
	
    
    clFinish(cqCommandQueue);
    printf("%f seconds on device\n", difftime(time(NULL), dtimer));
	//    double timeDifference = ((double)end.tv_sec + ((double)end.tv_nsec/1000.0))-((double)begin.tv_sec + ((double)begin.tv_nsec/1000.0));
	//    printf("%f seconds on device\n", timeDifference);
    
    htimer = time(NULL);
	
	
    // Compute and compare results for golden-host and report errors and pass/fail
    printf("Comparing against Host/C++ computation...\n\n"); 
    
    FirstLoopHost ((const fpoint*)node_cache, (const fpoint*)model, (fpoint*)Golden);
	
    printf("%f seconds on host\n", difftime(time(NULL), htimer));
	
	/*
	 int goldenLoop = 0;
	 for (goldenLoop = 0; goldenLoop < SITES; goldenLoop++)
	 {
	 printf("Golden: %e\n", ((fpoint*)Golden)[goldenLoop*CHARACTERS]);
	 }
	 */
	
	
	// Unscaling
	//***************************************************************************
    int scIndex;
    for (scIndex = 0; scIndex < CHARACTERS * SITES; scIndex++)
    {
        while (((int*)scalings)[scIndex] > 0)
        {
            ((fpoint*)parent_cache)[scIndex] /= scalar;
            ((int*)scalings)[scIndex]--;
        }
    }
	
	
    bool match = true;
	//        int unmatching = 0;
	//        long firstUnmatch = -1;
	//        long lastUnmatch = -1;
    int verI;
    for (verI  = 0; verI < CHARACTERS*SITES; verI++)
    {
		if (verI%(SITES)==0)
			printf("Device: %e, Host: %e, Scalings: %i\n", ((fpoint*)parent_cache)[verI], ((fpoint*)Golden)[verI], ((int*)scalings)[verI]); 
		//                if (((fpoint*)parent_cache)[i] != ((fpoint*)Golden)[i]) match = false;
        if (((fpoint*)parent_cache)[verI] != ((fpoint*)Golden)[verI])
        {
            match = false;
			//                        unmatching++;
			//                        if (firstUnmatch == -1) firstUnmatch = i;
			//                        if (lastUnmatch < i) lastUnmatch = i;
        }
    }
	//        printf("Unmatching: %i, First: %d, Last: %d\n", unmatching, firstUnmatch, lastUnmatch);
    printf("%s\n\n", (match) ? "PASSED" : "FAILED");
	
	
    // Cleanup and leave
    Cleanup (EXIT_SUCCESS);
}