// Smoke test for the driver-API SAXPY kernel: allocates device and pinned
// host buffers for N floats, fills the input with uniform randoms in [0,1],
// launches "saxpy" from saxpy.ptx, and verifies hostOut[i] == alpha*hostIn[i]
// on the host (absolute tolerance 1e-5f).
//
// NOTE(review): CUDA_CHECK presumably assigns `status` and jumps to Error on
// failure — confirm against the macro definition; if it does not set `status`,
// the Error path would read it uninitialized.
// NOTE(review): the fixed 1500x512 launch configuration assumes the kernel
// uses a grid-stride loop (or that N <= 1500*512) — confirm against saxpy.ptx.
CUresult
TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha )
{
    CUresult status;
    CUdeviceptr dptrOut = 0;
    CUdeviceptr dptrIn = 0;
    float *hostOut = 0;
    float *hostIn = 0;

    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );

    // Device buffers: output is zero-initialized (N 32-bit words).
    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );

    // Pinned host buffers so the async copies below can run asynchronously.
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );

    // Random test input in [0, 1].
    for ( size_t i = 0; i < N; i++ ) {
        hostIn[i] = (float) rand() / (float) RAND_MAX;
    }

    // NULL stream: copies and the launch below are ordered on the default
    // stream, so no explicit event is needed before the kernel runs.
    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float ), NULL ) );

    {
        CUmodule moduleSAXPY;
        CUfunction kernelSAXPY;
        // Kernel parameters are passed by address, per cuLaunchKernel.
        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };

        moduleSAXPY = chDevice->module( "saxpy.ptx" );
        if ( ! moduleSAXPY ) {
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );

        // 1500 blocks of 512 threads, no shared memory, default stream.
        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );
    }

    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
    // Block until the copy-kernel-copy chain has completed before validating.
    CUDA_CHECK( cuCtxSynchronize() );

    // Host-side verification against the reference result alpha*x.
    for ( size_t i = 0; i < N; i++ ) {
        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
            status = CUDA_ERROR_UNKNOWN;
            goto Error;
        }
    }
    status = CUDA_SUCCESS;
    printf( "Well it worked!\n" );
Error:
    // Shared cleanup path; all handles were zero-initialized, so freeing
    // resources that were never allocated is safe.
    cuCtxPopCurrent( NULL );
    cuMemFreeHost( hostOut );
    cuMemFreeHost( hostIn );
    cuMemFree( dptrOut );
    cuMemFree( dptrIn );
    return status;
}
void swanMemset( void *ptr, unsigned char b, size_t len ) { CUresult err; if( len == 0 ) { return; } if( (0 == len % 16) && b==0 ) { block_config_t grid, block; int threads; // if( __CUDA_ARCH__ == 110 ) { threads = 128; // } else { // threads = CUDA_THREAD_MAX; // } swanDecompose( &grid, &block, len/(sizeof(uint4)), threads ); if( grid.x < 65535 ) { k_swan_fast_fill( grid, block, 0, (uint4*) ptr, len/(sizeof(uint4) ) ); } else { // the region to be zeroed is too large for the simple-minded fill kernel // fall-back to something dumb err = cuMemsetD32( PTR_TO_CUDEVPTR(ptr), 0, len/4 ); } return; } else if( 0 == len % 4 ) { // printf("SWAN: Warning: swanMemset using D32\n"); unsigned word = 0; word = b; word <<=8; word |= b; word = word | (word<<16); err = cuMemsetD32( PTR_TO_CUDEVPTR(ptr), word, len/4 ); } else { printf("SWAN: Warning: swanMemset using D8\n"); err = cuMemsetD8( PTR_TO_CUDEVPTR(ptr), b, len ); } if ( err != CUDA_SUCCESS ) { error("swanMemset failed\n" ); } }
// Execute one radix-sort pass over the digit [firstBit, endBit) (1..6 bits).
//
// Pipeline per pass: count kernel -> (optional early-exit detection) ->
// three histogram kernels -> sort kernel launches -> ping-pong buffer swap.
// On success *earlyExitCode is set to 0..3 (3/2 mean the pass could be
// skipped entirely, 1 selects the early-exit sort kernel) and `parity` is
// toggled whenever the key/value buffers were actually swapped.
//
// Returns SORT_STATUS_INVALID_VALUE for out-of-range arguments, and
// DEVICE/LAUNCH error codes when a CUDA driver call fails.
//
// NOTE(review): the exact meaning of detect.x/y/z (radixSort/fullCount/
// radixCount) comes from the sortDetectCounters device code, which is not
// visible here -- the interpretation in the comments below follows the
// variable names only.
sortStatus_t sortPass(sortEngine_t engine, sortData_t data, int numSortThreads,
    int valuesPerThread, bool useTransList, int firstBit, int endBit,
    int endKeyFlags, int valueCode, int* earlyExitCode, int& parity) {

    // Validate sizes and the digit range (at most 6 bits per pass).
    if(data->numElements > data->maxElements) return SORT_STATUS_INVALID_VALUE;
    if((firstBit < 0) || (endBit > 32) || (endBit <= firstBit) ||
        ((endBit - firstBit) > 6))
        return SORT_STATUS_INVALID_VALUE;

    int numBits = endBit - firstBit;
    SortTerms terms = ComputeSortTerms(numSortThreads, valuesPerThread,
        useTransList, numBits, data->numElements, engine->numSMs);

    sortEngine_d::HistKernel* hist;
    sortEngine_d::SortKernel* sort;
    CUresult result;
    sortStatus_t status = LoadKernels(engine, numSortThreads, valuesPerThread,
        useTransList, valueCode, &hist, &sort);
    if(SORT_STATUS_SUCCESS != status) return status;

    status = AllocSortResources(terms, engine);
    if(SORT_STATUS_SUCCESS != status) return status;

    // Set numHistRows into rangePairs if it hasn't already been set to this
    // size.
    if(terms.numHistRows != engine->lastNumHistRowsProcessed) {
        // Distribute numHistRows over numPairs [x, y) ranges as evenly as
        // possible: the first pairSplit ranges get one extra row.
        int2* pairs = &engine->rangePairsHost[0];
        int numPairs = terms.numHistBlocks * NumHistWarps;
        int pairCount = terms.numHistRows / numPairs;
        int pairSplit = terms.numHistRows % numPairs;
        pairs[0].x = 0;
        for(int i = 0; i < numPairs; ++i) {
            if(i) pairs[i].x = pairs[i - 1].y;
            pairs[i].y = pairs[i].x + pairCount + (i < pairSplit);
        }

        // Copy rangePairsHost to device memory.
        CUresult result = engine->rangePairs->FromHost(
            &engine->rangePairsHost[0], numPairs);
        if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;
        engine->lastNumHistRowsProcessed = terms.numHistRows;
    }

    // Save the trailing keys (the tail past numElements that pads the last
    // sort block) so they can be restored after the sort.
    if((SORT_END_KEY_SAVE & endKeyFlags) && terms.numEndKeys) {
        engine->restoreSourceSize = terms.numEndKeys;
        CUdeviceptr source = AdjustPointer<uint>(data->keys[0],
            data->numElements);
        CUresult result = cuMemcpy(engine->keyRestoreBuffer->Handle(), source,
            4 * engine->restoreSourceSize);
        if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;
    }

    // Set the trailing keys to all set bits here (0xffffffff sorts last).
    if((SORT_END_KEY_SET & endKeyFlags) && terms.numEndKeys) {
        // Back up the overwritten keys in the engine
        CUdeviceptr target = AdjustPointer<uint>(data->keys[0],
            data->numElements);
        CUresult result = cuMemsetD32(target, 0xffffffff, terms.numEndKeys);
        if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;
    }

    // Run the count kernel (early-exit variant also updates the detection
    // counters, which must be cleared first).
    if(data->earlyExit) engine->sortDetectCounters->Fill(0);
    CuCallStack callStack;
    callStack.Push(data->keys[0], firstBit, data->numElements,
        terms.countValuesPerThread, engine->countBuffer);
    CuFunction* count = data->earlyExit ?
        engine->count->eeFunctions[numBits - 1].get() :
        engine->count->functions[numBits - 1].get();
    result = count->Launch(terms.numCountBlocks, 1, callStack);
    if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;

    *earlyExitCode = 0;
    if(data->earlyExit) {
        // Read back the detection counters written by the count kernel.
        uint4 detect;
        result = engine->sortDetectCounters->ToHost(&detect, 1);
        if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;

        uint radixSort = detect.x;
        uint fullCount = detect.y;
        uint radixCount = detect.z;

        if(terms.numCountBlocks == (int)fullCount) *earlyExitCode = 3;
        else if(terms.numCountBlocks == (int)radixCount) *earlyExitCode = 2;
        // If 5% of the sort blocks are sorted, use the slightly slower early
        // exit sort kernel.
        else if((double)radixSort / terms.numSortBlocks > 0.05)
            *earlyExitCode = 1;
        else *earlyExitCode = 0;
    }

    // Codes 2 and 3 mean the data is already in order for this digit: skip
    // the histogram and sort kernels entirely (and do not swap buffers).
    if(*earlyExitCode <= 1) {
        // Run the three histogram kernels
        callStack.Reset();
        callStack.Push(engine->countBuffer, engine->rangePairs,
            engine->countScan, engine->columnScan);
        result = hist->pass1[numBits - 1]->Launch(terms.numHistBlocks, 1,
            callStack);
        if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;

        callStack.Reset();
        callStack.Push(terms.numHistBlocks, engine->countScan);
        result = hist->pass2[numBits - 1]->Launch(1, 1, callStack);
        if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;

        callStack.Reset();
        callStack.Push(engine->countBuffer, engine->rangePairs,
            engine->countScan, engine->columnScan, engine->bucketCodes,
            *earlyExitCode);
        result = hist->pass3[numBits - 1]->Launch(terms.numHistBlocks, 1,
            callStack);
        if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;

        // Run the sort kernel
        // Because the max grid size is only 65535 in any dimension, large
        // sorts require multiple kernel launches.
        int MaxGridSize = 65535;
        int numSortLaunches = DivUp(terms.numSortBlocks, MaxGridSize);

        for(int launch(0); launch < numSortLaunches; ++launch) {
            int block = MaxGridSize * launch;
            int numBlocks = std::min(MaxGridSize, terms.numSortBlocks - block);

            callStack.Reset();
            callStack.Push(data->keys[0], block, engine->bucketCodes, firstBit,
                data->keys[1]);

            // Push the value streams matching the requested value mode.
            switch(valueCode) {
                case 1: // VALUE_TYPE_INDEX
                    callStack.Push(data->values1[1]);
                    break;
                case 2: // VALUE_TYPE_SINGLE
                    callStack.Push(data->values1[0], data->values1[1]);
                    break;
                case 3: // VALUE_TYPE_MULTI
                    callStack.Push(data->valueCount,
                        // Six values_global_in
                        data->values1[0], data->values2[0], data->values3[0],
                        data->values4[0], data->values5[0], data->values6[0],
                        // Six values_global_out
                        data->values1[1], data->values2[1], data->values3[1],
                        data->values4[1], data->values5[1], data->values6[1]);
                    break;
            }

            CuFunction* sortFunc = *earlyExitCode ?
                sort->eeFunctions[numBits - 1].get() :
                sort->functions[numBits - 1].get();

            result = sortFunc->Launch(numBlocks, 1, callStack);
            if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;
        }

        // Swap the source and target buffers in the data structure.
        std::swap(data->keys[0], data->keys[1]);
        std::swap(data->values1[0], data->values1[1]);
        std::swap(data->values2[0], data->values2[1]);
        std::swap(data->values3[0], data->values3[1]);
        std::swap(data->values4[0], data->values4[1]);
        std::swap(data->values5[0], data->values5[1]);
        std::swap(data->values6[0], data->values6[1]);
        parity ^= 1;
    }
    return SORT_STATUS_SUCCESS;
}
// Fill this device allocation with the 32-bit pattern read from *fill.
// The buffer is treated as _size / 4 words; returns the driver status.
CUresult CuDeviceMem::Fill4(const void* fill) {
    const uint pattern = *static_cast<const uint*>(fill);
    return cuMemsetD32(_deviceptr, pattern, _size / 4);
}
/*
 * Virtqueue "output" handler: drains every pending element from `vq`, where
 * out_sg[0] carries a marshalled `struct param` describing one CUDA driver
 * API call.  The call is executed on the host, its status/outputs written
 * back into the (guest-visible) param buffer, the element is pushed back on
 * the queue, and the guest is notified once all elements are processed.
 *
 * BUG FIXES relative to the original:
 *  - CUMODULELOAD: the path-buffer size was computed as
 *    strlen(getenv("QEMU_NFS_PATH") + 1) -- the "+1" was inside the strlen
 *    argument, under-allocating by two bytes and overflowing the heap in
 *    strcat(); getenv() was also not checked for NULL; on malloc failure the
 *    element was virtqueue_push()ed twice (once in the case, once after the
 *    switch); binname leaked when cuCtxSetCurrent failed.
 *  - CUMODULEGETGLOBAL: unbounded strcpy of a guest-supplied string into a
 *    100-byte heap buffer (overflow), which was also never freed.
 *  - CULAUNCHKERNEL: `args` leaked when cuCtxSetCurrent failed.
 */
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtQueueElement elem;

    while (virtqueue_pop(vq, &elem)) {
        /* First out-buffer holds the call descriptor for this request. */
        struct param *p = elem.out_sg[0].iov_base;

        /* For all library routines: get required arguments from the buffer,
         * execute, and push results back into the virtqueue. */
        switch (p->syscall_type) {
        case CUINIT: {
            p->result = cuInit(p->flags);
            break;
        }
        case CUDRIVERGETVERSION: {
            p->result = cuDriverGetVersion(&p->val1);
            break;
        }
        case CUDEVICEGETCOUNT: {
            p->result = cuDeviceGetCount(&p->val1);
            break;
        }
        case CUDEVICEGET: {
            p->result = cuDeviceGet(&p->device, p->val1);
            break;
        }
        case CUDEVICECOMPUTECAPABILITY: {
            p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
            break;
        }
        case CUDEVICEGETNAME: {
            /* Device name is returned straight into the guest's in-buffer. */
            p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
            break;
        }
        case CUDEVICEGETATTRIBUTE: {
            p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
            break;
        }
        case CUCTXCREATE: {
            p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
            break;
        }
        case CUCTXDESTROY: {
            p->result = cuCtxDestroy(p->ctx);
            break;
        }
        case CUCTXGETCURRENT: {
            p->result = cuCtxGetCurrent(&p->ctx);
            break;
        }
        case CUCTXGETDEVICE: {
            p->result = cuCtxGetDevice(&p->device);
            break;
        }
        case CUCTXPOPCURRENT: {
            p->result = cuCtxPopCurrent(&p->ctx);
            break;
        }
        case CUCTXSETCURRENT: {
            p->result = cuCtxSetCurrent(p->ctx);
            break;
        }
        case CUCTXSYNCHRONIZE: {
            p->result = cuCtxSynchronize();
            break;
        }
        case CUMODULELOAD: {
            /* Hardcoded path - needs improvement.  All .cubin files should be
             * stored in $QEMU_NFS_PATH - currently $QEMU_NFS_PATH is shared
             * between host and guest with NFS. */
            const char *nfs_path = getenv("QEMU_NFS_PATH");
            char *binname;

            if (!nfs_path) {
                /* FIX: getenv() may return NULL; previously dereferenced. */
                p->result = 999;
                break;
            }
            /* FIX: allocate strlen(path) + strlen(name) + 1 bytes (the
             * original misplaced the "+1" inside strlen's argument). */
            binname = malloc(strlen(nfs_path) +
                             strlen((char *)elem.out_sg[1].iov_base) + 1);
            if (!binname) {
                p->result = 0;
                /* FIX: no virtqueue_push() here -- the push after the switch
                 * runs for every element; pushing twice corrupted the ring. */
                break;
            }
            strcpy(binname, nfs_path);
            strcat(binname, (char *)elem.out_sg[1].iov_base);

            /* Change current CUDA context: each CUDA context has its own
             * virtual memory space - isolation is ensured by switching
             * contexts. */
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(binname);   /* FIX: was leaked on this path */
                p->result = 999;
                break;
            }
            p->result = cuModuleLoad(&p->module, binname);
            free(binname);
            break;
        }
        case CUMODULEGETGLOBAL: {
            /* FIX: bound the guest-supplied symbol name to the buffer size
             * (was an unbounded strcpy into a leaked 100-byte malloc). */
            char name[100];
            snprintf(name, sizeof(name), "%s", (char *)elem.out_sg[1].iov_base);
            p->result = cuModuleGetGlobal(&p->dptr, &p->size1, p->module,
                                          (const char *)name);
            break;
        }
        case CUMODULEUNLOAD: {
            p->result = cuModuleUnload(p->module);
            break;
        }
        case CUMEMALLOC: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAlloc(&p->dptr, p->bytesize);
            break;
        }
        case CUMEMALLOCPITCH: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1,
                                        p->size2, p->bytesize);
            break;
        }
        /* Large buffers are allocated in smaller chunks in guest kernel
         * space; get each chunk separately and copy it to device memory.
         * Layout: out_sg[1+2*i] is chunk data, out_sg[1+2*i+1] its length. */
        case CUMEMCPYHTOD: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                p->result = cuMemcpyHtoD(p->dptr + offset,
                                         elem.out_sg[1+2*i].iov_base, s);
                if (p->result != 0) break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYHTODASYNC: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                p->result = cuMemcpyHtoDAsync(p->dptr + offset,
                                              elem.out_sg[1+2*i].iov_base, s,
                                              p->stream);
                if (p->result != 0) break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTODASYNC: {
            p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
            break;
        }
        case CUMEMCPYDTOH: {
            /* Device-to-host mirror of CUMEMCPYHTOD, writing into in_sg. */
            int i;
            unsigned long s, nr_pages = p->nr_pages;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            size_t offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base,
                                         p->dptr + offset, s);
                if (p->result != 0) break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTOHASYNC: {
            int i;
            unsigned long s, nr_pages = p->nr_pages;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            size_t offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base,
                                              p->dptr + offset, s, p->stream);
                if (p->result != 0) break;
                offset += s;
            }
            break;
        }
        case CUMEMSETD32: {
            /* NOTE(review): p->bytecount is used as the 32-bit fill value and
             * p->bytesize as the word count -- field names are misleading;
             * confirm against the guest-side marshalling code. */
            p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
            break;
        }
        case CUMEMFREE: {
            p->result = cuMemFree(p->dptr);
            break;
        }
        case CUMODULEGETFUNCTION: {
            char *name = (char *)elem.out_sg[1].iov_base;
            /* Guest passes the (unterminated) name length in p->length. */
            name[p->length] = '\0';
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuModuleGetFunction(&p->function, p->module, name);
            break;
        }
        case CULAUNCHKERNEL: {
            /* p->val1 kernel arguments follow in out_sg[1..val1]. */
            void **args = malloc(p->val1 * sizeof(void *));
            if (!args) {
                p->result = 9999;
                break;
            }
            int i;
            for (i = 0; i < p->val1; i++) {
                args[i] = elem.out_sg[1+i].iov_base;
            }
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(args);   /* FIX: was leaked on this path */
                p->result = 999;
                break;
            }
            p->result = cuLaunchKernel(p->function,
                                       p->gridDimX, p->gridDimY, p->gridDimZ,
                                       p->blockDimX, p->blockDimY, p->blockDimZ,
                                       p->bytecount, 0, args, 0);
            free(args);
            break;
        }
        case CUEVENTCREATE: {
            p->result = cuEventCreate(&p->event1, p->flags);
            break;
        }
        case CUEVENTDESTROY: {
            p->result = cuEventDestroy(p->event1);
            break;
        }
        case CUEVENTRECORD: {
            p->result = cuEventRecord(p->event1, p->stream);
            break;
        }
        case CUEVENTSYNCHRONIZE: {
            p->result = cuEventSynchronize(p->event1);
            break;
        }
        case CUEVENTELAPSEDTIME: {
            p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
            break;
        }
        case CUSTREAMCREATE: {
            p->result = cuStreamCreate(&p->stream, 0);
            break;
        }
        case CUSTREAMSYNCHRONIZE: {
            p->result = cuStreamSynchronize(p->stream);
            break;
        }
        case CUSTREAMQUERY: {
            p->result = cuStreamQuery(p->stream);
            break;
        }
        case CUSTREAMDESTROY: {
            p->result = cuStreamDestroy(p->stream);
            break;
        }
        default:
            printf("Unknown syscall_type\n");
        }

        /* Return the (updated) element to the guest. */
        virtqueue_push(vq, &elem, 0);
    }

    /* Notify frontend - trigger virtual interrupt. */
    virtio_notify(vdev, vq);
    return;
}