Example #1
CUresult
TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha )
{
    CUresult status;
    CUdeviceptr dptrOut = 0;
    CUdeviceptr dptrIn = 0;
    float *hostOut = 0;
    float *hostIn = 0;

    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );

    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );
    for ( size_t i = 0; i < N; i++ ) {
        hostIn[i] = (float) rand() / (float) RAND_MAX;
    }
    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float), NULL ) );

    {
        CUmodule moduleSAXPY;
        CUfunction kernelSAXPY;
        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };
        
        moduleSAXPY = chDevice->module( "saxpy.ptx" );
        if ( ! moduleSAXPY ) {
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );
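        // Fixed 1500x512 launch configuration: the saxpy kernel is presumably
        // written to grid-stride loop over all N elements rather than assume
        // one thread per element.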

        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );

    }

    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
    CUDA_CHECK( cuCtxSynchronize() );
    for ( size_t i = 0; i < N; i++ ) {
        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
            status = CUDA_ERROR_UNKNOWN;
            goto Error;
        }
    }
    status = CUDA_SUCCESS;
    printf( "Well it worked!\n" );

Error:
    cuCtxPopCurrent( NULL );
    cuMemFreeHost( hostOut );
    cuMemFreeHost( hostIn );
    cuMemFree( dptrOut );
    cuMemFree( dptrIn );
    return status;
}
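
The example assumes a CUDA_CHECK macro from the surrounding helper code that records the CUresult and bails out on failure; a minimal sketch of such a macro (the real definition may differ) is:

// Sketch only: record the CUresult in the local `status` variable and
// jump to the cleanup label on failure, mirroring how TestSAXPY uses it.
#define CUDA_CHECK( call ) \
    do { \
        status = (call); \
        if ( CUDA_SUCCESS != status ) \
            goto Error; \
    } while ( 0 )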
Example #2
void swanMemset( void *ptr, unsigned char b, size_t len ) {
	CUresult err = CUDA_SUCCESS;	/* the fill-kernel path below does not set err */
	if( len == 0 ) { return; }

	if( (0 == len % 16) && b==0 ) {
		block_config_t grid, block;
		int threads;
//		if( __CUDA_ARCH__ == 110 ) {
			threads = 128;
//		} else {
//			threads = CUDA_THREAD_MAX;
//		}

		swanDecompose( &grid, &block, len/sizeof(uint4), threads );

		if( grid.x < 65535 ) {
			k_swan_fast_fill( grid, block, 0, (uint4*) ptr, len/sizeof(uint4) );
		}
		else {
			// the region to be zeroed is too large for the simple-minded fill kernel;
			// fall back to something dumb, and let the error check below see the result
			err = cuMemsetD32( PTR_TO_CUDEVPTR(ptr), 0, len/4 );
		}
	}
	else if( 0 == len % 4 ) {
//		printf("SWAN: Warning: swanMemset using D32\n");
		// replicate the byte across all four bytes of a 32-bit word
		unsigned word = b;
		word |= word << 8;
		word |= word << 16;
		err = cuMemsetD32( PTR_TO_CUDEVPTR(ptr), word, len/4 );
	}
	else {
		printf("SWAN: Warning: swanMemset using D8\n");
		err = cuMemsetD8( PTR_TO_CUDEVPTR(ptr), b, len );
	}
	if ( err != CUDA_SUCCESS ) {
		error("swanMemset failed\n" );
	}
}
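
A quick usage sketch (swanMalloc and devbuf are illustrative names, not taken from the code above): a zero fill over a multiple of 16 bytes takes the fast fill-kernel path, anything else falls through to cuMemsetD32 or cuMemsetD8:

/* Hypothetical usage of swanMemset; assumes an allocator that returns a device pointer. */
void *devbuf = swanMalloc( 1024 * sizeof(float) );
swanMemset( devbuf, 0,    1024 * sizeof(float) );  /* zero, multiple of 16: fill kernel */
swanMemset( devbuf, 0xff, 1024 * sizeof(float) );  /* nonzero, multiple of 4: cuMemsetD32 */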
Example #3
sortStatus_t sortPass(sortEngine_t engine, sortData_t data, int numSortThreads, 
	int valuesPerThread, bool useTransList, int firstBit, int endBit, 
	int endKeyFlags, int valueCode, int* earlyExitCode, int& parity) {

	if(data->numElements > data->maxElements) return SORT_STATUS_INVALID_VALUE;

	if((firstBit < 0) || (endBit > 32) || (endBit <= firstBit) || 
		((endBit - firstBit) > 6))
		return SORT_STATUS_INVALID_VALUE;
	
	int numBits = endBit - firstBit;

	SortTerms terms = ComputeSortTerms(numSortThreads, valuesPerThread,
		useTransList, numBits, data->numElements, engine->numSMs);

	sortEngine_d::HistKernel* hist;
	sortEngine_d::SortKernel* sort;
	CUresult result;
	sortStatus_t status = LoadKernels(engine, numSortThreads, valuesPerThread,
		useTransList, valueCode, &hist, &sort);
	if(SORT_STATUS_SUCCESS != status) return status;

	status = AllocSortResources(terms, engine);
	if(SORT_STATUS_SUCCESS != status) return status;
	
	// Rebuild the rangePairs table if numHistRows has changed since the
	// last pass.
	if(terms.numHistRows != engine->lastNumHistRowsProcessed) {
		int2* pairs = &engine->rangePairsHost[0];
		int numPairs = terms.numHistBlocks * NumHistWarps;
		int pairCount = terms.numHistRows / numPairs;
		int pairSplit = terms.numHistRows % numPairs;
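		// Distribute numHistRows over numPairs ranges: every range gets
		// pairCount rows and the first pairSplit ranges take one extra.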
		pairs[0].x = 0;
		for(int i = 0; i < numPairs; ++i) {
			if(i) pairs[i].x = pairs[i - 1].y;
			pairs[i].y = pairs[i].x + pairCount + (i < pairSplit);
		}

		// Copy rangePairsHost to device memory.
		CUresult result = engine->rangePairs->FromHost(
			&engine->rangePairsHost[0], numPairs);
		if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;

		engine->lastNumHistRowsProcessed = terms.numHistRows;
	}

	// Save the trailing keys
	if((SORT_END_KEY_SAVE & endKeyFlags) && terms.numEndKeys) {
		engine->restoreSourceSize = terms.numEndKeys;
		CUdeviceptr source = AdjustPointer<uint>(data->keys[0],
			data->numElements);
		CUresult result = cuMemcpy(engine->keyRestoreBuffer->Handle(), source, 
			4 * engine->restoreSourceSize);
		if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;
	}

	// Set the trailing keys to all set bits here.
	if((SORT_END_KEY_SET & endKeyFlags) && terms.numEndKeys) {
		// (the keys overwritten below were backed up above if SORT_END_KEY_SAVE was set)
		CUdeviceptr target = AdjustPointer<uint>(data->keys[0],
			data->numElements);
		CUresult result = cuMemsetD32(target, 0xffffffff, terms.numEndKeys);
		if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;
	}

	// Run the count kernel
	if(data->earlyExit) engine->sortDetectCounters->Fill(0);

	CuCallStack callStack;
	callStack.Push(data->keys[0], firstBit, data->numElements, 
		terms.countValuesPerThread, engine->countBuffer);
	CuFunction* count = data->earlyExit ? 
		engine->count->eeFunctions[numBits - 1].get() :
		engine->count->functions[numBits - 1].get();
	result = count->Launch(terms.numCountBlocks, 1, callStack);
	if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;

	*earlyExitCode = 0;
	if(data->earlyExit) {
		uint4 detect;
		result = engine->sortDetectCounters->ToHost(&detect, 1);
		if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;

		uint radixSort = detect.x;
		uint fullCount = detect.y;
		uint radixCount = detect.z;

		if(terms.numCountBlocks == (int)fullCount) *earlyExitCode = 3;
		else if(terms.numCountBlocks == (int)radixCount) *earlyExitCode = 2;

		// If more than 5% of the sort blocks are already sorted, use the
		// slightly slower early exit sort kernel.
		else if((double)radixSort / terms.numSortBlocks > 0.05)
			*earlyExitCode = 1;
		else *earlyExitCode = 0;
	}
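	// From the detect counters: 3 = every count block reported fully
	// sorted, 2 = every block sorted with respect to this radix, 1 = more
	// than 5% of sort blocks already sorted (prefer the early-exit kernel).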

	if(*earlyExitCode <= 1) {

		// Run the three histogram kernels
		callStack.Reset();
		callStack.Push(engine->countBuffer, engine->rangePairs, 
			engine->countScan, engine->columnScan);
		result = hist->pass1[numBits - 1]->Launch(terms.numHistBlocks, 1,
			callStack);
		if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;

		callStack.Reset();
		callStack.Push(terms.numHistBlocks, engine->countScan);
		result = hist->pass2[numBits - 1]->Launch(1, 1, callStack);
		if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;

		callStack.Reset();
		callStack.Push(engine->countBuffer, engine->rangePairs, 
			engine->countScan, engine->columnScan, engine->bucketCodes,
			*earlyExitCode);
		result = hist->pass3[numBits - 1]->Launch(terms.numHistBlocks, 1,
			callStack);
		if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;


		// Run the sort kernel
		// Because the max grid size is only 65535 in any dimension, large
		// sorts require multiple kernel launches.
		int MaxGridSize = 65535;
		int numSortLaunches = DivUp(terms.numSortBlocks, MaxGridSize);

		for(int launch(0); launch < numSortLaunches; ++launch) {
			int block = MaxGridSize * launch;
			int numBlocks = std::min(MaxGridSize, terms.numSortBlocks - block);

			callStack.Reset();
			callStack.Push(data->keys[0], block, engine->bucketCodes, firstBit, 
				data->keys[1]);
			
			switch(valueCode) {
				case 1:		// VALUE_TYPE_INDEX
					callStack.Push(data->values1[1]); 
					break;
				case 2:		// VALUE_TYPE_SINGLE
					callStack.Push(data->values1[0], data->values1[1]);
					break;
				case 3:		// VALUE_TYPE_MULTI
					callStack.Push(data->valueCount,
						// Six values_global_in
						data->values1[0], data->values2[0], data->values3[0],
						data->values4[0], data->values5[0], data->values6[0],
						
						// Six values_global_out
						data->values1[1], data->values2[1], data->values3[1],
						data->values4[1], data->values5[1], data->values6[1]);
					break;
			}

			CuFunction* sortFunc = *earlyExitCode ? 
				sort->eeFunctions[numBits - 1].get() :
				sort->functions[numBits - 1].get();

			result = sortFunc->Launch(numBlocks, 1, callStack);
			if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;
		}

		// Swap the source and target buffers in the data structure.
		std::swap(data->keys[0], data->keys[1]);
		std::swap(data->values1[0], data->values1[1]);
		std::swap(data->values2[0], data->values2[1]);
		std::swap(data->values3[0], data->values3[1]);
		std::swap(data->values4[0], data->values4[1]);
		std::swap(data->values5[0], data->values5[1]);
		std::swap(data->values6[0], data->values6[1]);
		parity ^= 1;
	}

	return SORT_STATUS_SUCCESS;
}
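
The launch loop above leans on a DivUp helper for ceiling division; a minimal sketch, assuming the usual definition (the engine's own header may differ):

// Ceiling division: how many launches of up to MaxGridSize blocks
// are needed to cover n blocks in total.
static inline int DivUp(int n, int d) {
	return (n + d - 1) / d;
}

For example, terms.numSortBlocks = 100000 with MaxGridSize = 65535 gives two launches, the second covering the remaining 34465 blocks.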
Example #4
CUresult CuDeviceMem::Fill4(const void* fill) {
	CUresult result = cuMemsetD32(_deviceptr, *(uint*)fill, _size / 4);
	return result;
}
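
Fill4 reinterprets the four bytes behind fill as a uint and writes _size / 4 words, so it assumes _size is a multiple of 4. A hypothetical usage (mem is an illustrative CuDeviceMem pointer):

// Fill a device buffer with the bit pattern of 1.0f, four bytes at a time.
float one = 1.0f;
CUresult err = mem->Fill4(&one);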
Example #5
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
	VirtQueueElement elem;
	
	while(virtqueue_pop(vq, &elem)) {
		struct param *p = elem.out_sg[0].iov_base;
	
		//for all library routines: get required arguments from buffer, execute, and push results back in virtqueue
		switch (p->syscall_type) {
		case CUINIT: {
			p->result = cuInit(p->flags);
			break;
		}
		case CUDRIVERGETVERSION: {
			p->result = cuDriverGetVersion(&p->val1);
			break;
		}
		case CUDEVICEGETCOUNT: {
			p->result = cuDeviceGetCount(&p->val1);
			break;
		}
		case CUDEVICEGET: {
			p->result = cuDeviceGet(&p->device, p->val1);
			break;
		}
		case CUDEVICECOMPUTECAPABILITY: {
			p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
			break;
		}
		case CUDEVICEGETNAME: {
			p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
			break;
		}
		case CUDEVICEGETATTRIBUTE: {
			p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
			break;
		}
		case CUCTXCREATE: {
			p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
			break;
		}
		case CUCTXDESTROY: {
			p->result = cuCtxDestroy(p->ctx);
			break;
		}
		case CUCTXGETCURRENT: {
			p->result = cuCtxGetCurrent(&p->ctx);
			break;
		}
		case CUCTXGETDEVICE: {
			p->result = cuCtxGetDevice(&p->device);
			break;
		}
		case CUCTXPOPCURRENT: {
			p->result = cuCtxPopCurrent(&p->ctx);
			break;
		}
		case CUCTXSETCURRENT: {
			p->result = cuCtxSetCurrent(p->ctx);
			break;
		}
		case CUCTXSYNCHRONIZE: {
			p->result = cuCtxSynchronize();
			break;
		}
		case CUMODULELOAD: {
			//hardcoded path - needs improvement
			//all .cubin files should be stored in $QEMU_NFS_PATH - currently $QEMU_NFS_PATH is shared between host and guest with NFS
			char *binname = malloc(strlen((char *)elem.out_sg[1].iov_base)+strlen(getenv("QEMU_NFS_PATH"))+1);
			if (!binname) {
				p->result = 999;
				break;
			}
			strcpy(binname, getenv("QEMU_NFS_PATH"));
			strcat(binname, (char *)elem.out_sg[1].iov_base);
			//change current CUDA context
			//each CUDA context has its own virtual memory space - isolation is ensured by switching contexts
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				free(binname);
				break;
			}
			p->result = cuModuleLoad(&p->module, binname);
			free(binname);
			break;
		}
		case CUMODULEGETGLOBAL: {
			char *name = malloc(100*sizeof(char));
			if (!name) {
				p->result = 999;
				break;
			}
			//bounded copy: the guest-supplied name is not guaranteed to fit
			strncpy(name, (char *)elem.out_sg[1].iov_base, 99);
			name[99] = '\0';
			p->result = cuModuleGetGlobal(&p->dptr, &p->size1, p->module, (const char *)name);
			free(name);
			break;
		}
		case CUMODULEUNLOAD: {
			p->result = cuModuleUnload(p->module);
			break;			
		}
		case CUMEMALLOC: {
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			p->result = cuMemAlloc(&p->dptr, p->bytesize);
			break;
		}
		case CUMEMALLOCPITCH: {
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
			break;
		}
		//large buffers are allocated in smaller chunks in guest kernel space
		//gets each chunk separately and copies it to device memory
		case CUMEMCPYHTOD: {
			int i;
			size_t offset;
			unsigned long s, nr_pages = p->nr_pages;
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			offset = 0;
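			//out_sg pairs: entry 1+2*i holds the chunk data, entry 1+2*i+1 its length in bytes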
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.out_sg[1+2*i+1].iov_base;
				p->result = cuMemcpyHtoD(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYHTODASYNC: {
			int i;
			size_t offset;
			unsigned long s, nr_pages = p->nr_pages;
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.out_sg[1+2*i+1].iov_base;
				p->result = cuMemcpyHtoDAsync(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s, p->stream);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYDTODASYNC: {
			p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
			break;
		}
		case CUMEMCPYDTOH: {
			int i;
			unsigned long s, nr_pages = p->nr_pages;
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			size_t offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.in_sg[0+2*i+1].iov_base;
				p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYDTOHASYNC: {
			int i;
			unsigned long s, nr_pages = p->nr_pages;
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			size_t offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.in_sg[0+2*i+1].iov_base;
				p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s, p->stream);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMSETD32: {
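			//note: despite the names, p->bytecount carries the 32-bit fill value
			//and p->bytesize the count of 32-bit elements to set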
			p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
			break;
		}
		case CUMEMFREE: {
			p->result = cuMemFree(p->dptr);
			break;
		}
		case CUMODULEGETFUNCTION: {
			char *name = (char *)elem.out_sg[1].iov_base;
			name[p->length] = '\0';
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			p->result = cuModuleGetFunction(&p->function, p->module, name);
			break;	
		}
		case CULAUNCHKERNEL: {
			void **args = malloc(p->val1*sizeof(void *));
			if (!args) {
				p->result = 9999;
				break;
			}
			int i;
			for (i=0; i<p->val1; i++) {
				args[i] = elem.out_sg[1+i].iov_base;
			}
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				free(args);
				break;
			}
			p->result = cuLaunchKernel(p->function,
					p->gridDimX, p->gridDimY, p->gridDimZ,
			                p->blockDimX, p->blockDimY, p->blockDimZ,
					p->bytecount, 0, args, 0);
			free(args);
			break;
		}
		case CUEVENTCREATE: {
			p->result = cuEventCreate(&p->event1, p->flags);
			break;
		}
		case CUEVENTDESTROY: {
			p->result = cuEventDestroy(p->event1);
			break;
		}
		case CUEVENTRECORD: {
			p->result = cuEventRecord(p->event1, p->stream);
			break;
		}
		case CUEVENTSYNCHRONIZE: {
			p->result = cuEventSynchronize(p->event1);
			break;
		}
		case CUEVENTELAPSEDTIME: {
			p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
			break;
		}
		case CUSTREAMCREATE: {
			p->result = cuStreamCreate(&p->stream, 0);
			break;
		}		
		case CUSTREAMSYNCHRONIZE: {
			p->result = cuStreamSynchronize(p->stream);
			break;
		}
		case CUSTREAMQUERY: {
			p->result = cuStreamQuery(p->stream);
			break;
		}
		case CUSTREAMDESTROY: {
			p->result = cuStreamDestroy(p->stream);
			break;
		}

		default:
			printf("Unknown syscall_type\n");
		}
		virtqueue_push(vq, &elem, 0);
	}
	//notify frontend - trigger virtual interrupt
	virtio_notify(vdev, vq);
	return;
}
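
Each case reads its arguments from, and returns its CUresult through, a shared struct param delivered in the virtqueue. A sketch of that parameter block, reconstructed from the fields used above (the actual layout in the guest driver may differ):

/* Reconstructed from usage; field order and any extra members are unknown. */
struct param {
	int syscall_type;                  /* which driver-API call to run */
	CUresult result;                   /* return code handed back to the guest */
	unsigned int flags;
	int val1, val2;
	CUdevice device;
	CUdevice_attribute attrib;
	CUcontext ctx;
	CUmodule module;
	CUfunction function;
	CUdeviceptr dptr, dptr1;
	size_t size1, size2, size3;
	size_t bytesize;
	unsigned int bytecount;
	unsigned long nr_pages;            /* scatter-gather chunks per copy */
	int length;                        /* length of a function name */
	unsigned int gridDimX, gridDimY, gridDimZ;
	unsigned int blockDimX, blockDimY, blockDimZ;
	CUstream stream;
	CUevent event1, event2;
	float pMilliseconds;
};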