void all_distances(Param<To> dist, CParam<T> query, CParam<T> train,
                   const dim_t dist_dim, const unsigned n_dist) {
    // Host-side launcher for the all_distances kernel.
    //
    //   dist      output parameter; its raw pointer is handed to the kernel
    //   query     query samples (feature length is query.dims[dist_dim])
    //   train     training samples (count is train.dims[sample_dim])
    //   dist_dim  dimension (0 or 1) along which one feature vector runs
    //   n_dist    not referenced anywhere in this function
    //
    // The feature axis is processed in slices of THREADS elements, one kernel
    // launch per slice, so feature vectors longer than THREADS are supported.

    // Samples are laid out along whichever of the first two dims is not dist_dim.
    const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;

    const unsigned feat_len          = query.dims[dist_dim];
    const unsigned ntrain            = train.dims[sample_dim];
    const unsigned max_kern_feat_len = min(THREADS, feat_len);
    const To max_dist                = maxval<To>();

    // One thread per training sample, THREADS threads per block.
    dim3 threads(THREADS, 1);
    dim3 blocks(divup(ntrain, threads.x), 1);

    // Work out how much dynamic shared memory to request. A fixed part is
    // always needed; the extra per-thread part is only requested when the
    // total fits into what the active device offers per block.
    const size_t smem_limit = getDeviceProp(getActiveDeviceId()).sharedMemPerBlock;
    const size_t smem_fixed =
        2 * THREADS * sizeof(unsigned) + max_kern_feat_len * sizeof(T);
    const size_t smem_train = threads.x * max_kern_feat_len * sizeof(T);
    const bool use_shmem    = smem_limit >= (smem_fixed + smem_train);

    size_t smem_total = smem_fixed;
    if (use_shmem) { smem_total += smem_train; }
    const unsigned smem_sz = static_cast<unsigned>(smem_total);

    // Launch once per slice of the feature axis. The shared-memory decision is
    // loop invariant, so the kernel variant is selected once up front.
    if (use_shmem) {
        for (int feat_offset = 0; feat_offset < feat_len; feat_offset += THREADS) {
            CUDA_LAUNCH_SMEM((all_distances<T, To, dist_type, true>), blocks,
                             threads, smem_sz, dist.ptr, query, train, max_dist,
                             feat_len, max_kern_feat_len, feat_offset);
        }
    } else {
        for (int feat_offset = 0; feat_offset < feat_len; feat_offset += THREADS) {
            CUDA_LAUNCH_SMEM((all_distances<T, To, dist_type, false>), blocks,
                             threads, smem_sz, dist.ptr, query, train, max_dist,
                             feat_len, max_kern_feat_len, feat_offset);
        }
    }

    POST_LAUNCH_CHECK();
}
void nearest_neighbour(Param<uint> idx, Param<To> dist, CParam<T> query,
                       CParam<T> train, const dim_t dist_dim,
                       const unsigned n_dist) {
    // Host-side launcher for a two-stage nearest-neighbour search.
    //
    //   idx, dist  outputs handed to the select_matches kernel
    //   query      query samples (feature length is query.dims[dist_dim])
    //   train      training samples (count is train.dims[sample_dim])
    //   dist_dim   dimension (0 or 1) along which one feature vector runs
    //   n_dist     not referenced anywhere in this function
    //
    // Stage 1 launches a nearest_neighbour kernel that writes one candidate
    // (index + distance) per block and query into temporary buffers.
    // Stage 2 launches select_matches over those buffers to produce idx/dist.

    const unsigned feat_len = query.dims[dist_dim];
    const To max_dist       = maxval<To>();

    // Feature vectors longer than one block of threads are not handled here.
    if (feat_len > THREADS) { CUDA_NOT_SUPPORTED(); }

    // Samples are laid out along whichever of the first two dims is not dist_dim.
    const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;
    const unsigned nquery  = query.dims[sample_dim];
    const unsigned ntrain  = train.dims[sample_dim];

    dim3 search_threads(THREADS, 1);
    dim3 search_blocks(divup(ntrain, search_threads.x), 1);

    // Work out how much dynamic shared memory to request. A fixed part is
    // always needed; the extra per-thread part is only requested when the
    // total fits into what the active device offers per block.
    const size_t smem_limit = getDeviceProp(getActiveDeviceId()).sharedMemPerBlock;
    const size_t smem_fixed = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
    const size_t smem_train = search_threads.x * feat_len * sizeof(T);
    const bool use_shmem    = smem_limit >= (smem_fixed + smem_train);

    size_t smem_total = smem_fixed;
    if (use_shmem) { smem_total += smem_train; }
    unsigned smem_sz = static_cast<unsigned>(smem_total);

    // Per-block intermediate results: nblk candidates for each query.
    unsigned nblk   = search_blocks.x;
    auto d_blk_idx  = memAlloc<unsigned>(nblk * nquery);
    auto d_blk_dist = memAlloc<To>(nblk * nquery);

// One switch case launching the kernel variant whose feature length LEN is a
// compile-time constant; SHMEM selects the shared-memory flavour.
#define NN_UNROLLED_CASE(LEN, SHMEM)                                            \
    case LEN:                                                                   \
        CUDA_LAUNCH_SMEM(                                                       \
            (nearest_neighbour_unroll<T, To, dist_type, LEN, SHMEM>),           \
            search_blocks, search_threads, smem_sz, d_blk_idx.get(),            \
            d_blk_dist.get(), query, train, max_dist);                          \
        break

    // Stage 1: per block, find the best candidate for every query vector.
    // Lengths 1, 2, 4, ..., 64 use the unrolled kernel; any other length falls
    // back to the generic kernel, which takes feat_len at run time.
    if (use_shmem) {
        switch (feat_len) {
            NN_UNROLLED_CASE(1, true);
            NN_UNROLLED_CASE(2, true);
            NN_UNROLLED_CASE(4, true);
            NN_UNROLLED_CASE(8, true);
            NN_UNROLLED_CASE(16, true);
            NN_UNROLLED_CASE(32, true);
            NN_UNROLLED_CASE(64, true);
            default:
                CUDA_LAUNCH_SMEM((nearest_neighbour<T, To, dist_type, true>),
                                 search_blocks, search_threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train,
                                 max_dist, feat_len);
        }
    } else {
        switch (feat_len) {
            NN_UNROLLED_CASE(1, false);
            NN_UNROLLED_CASE(2, false);
            NN_UNROLLED_CASE(4, false);
            NN_UNROLLED_CASE(8, false);
            NN_UNROLLED_CASE(16, false);
            NN_UNROLLED_CASE(32, false);
            NN_UNROLLED_CASE(64, false);
            default:
                CUDA_LAUNCH_SMEM((nearest_neighbour<T, To, dist_type, false>),
                                 search_blocks, search_threads, smem_sz,
                                 d_blk_idx.get(), d_blk_dist.get(), query, train,
                                 max_dist, feat_len);
        }
    }

#undef NN_UNROLLED_CASE

    POST_LAUNCH_CHECK();

    // Stage 2: one block per query reduces the per-block candidates to the
    // final best match.
    dim3 select_threads(32, 8);
    dim3 select_blocks(nquery, 1);

    CUDA_LAUNCH(select_matches, select_blocks, select_threads, idx, dist,
                d_blk_idx.get(), d_blk_dist.get(), nquery, nblk, max_dist);
    POST_LAUNCH_CHECK();
}
//----------------------------------------------------------------------------- int main( int argc, char* argv[] ) //----------------------------------------------------------------------------- { HDMR hDMR = INVALID_ID; HDRV hDrv = INVALID_ID; HDEV hDevice = INVALID_ID; TDMR_ERROR result = DMR_NO_ERROR; CaptureParameter captureParams; unsigned int i = 0; HOBJ hPropFamily = INVALID_ID; char* pStringBuffer = NULL; UserSuppliedHeapBuffer* pUserSuppliedBuffer = 0; int requestNr = INVALID_ID; int requestUsed = INVALID_ID; int bufferSize = 0; RequestResult ReqRes; ImageBuffer* pIB = 0; const int REQUEST_TO_USE = 2; // get rid of warnings argc = argc; argv = argv; // try to initialise the library. if( ( result = DMR_Init( &hDMR ) ) != DMR_NO_ERROR ) { printf( "DMR_Init failed (code: %d)\n", result ); END_APPLICATION; } getDeviceFromUserInput( &hDevice, 0, 1 ); if( ( hPropFamily = getDeviceProp( hDevice, "Family" ) ) == INVALID_ID ) { printf( "Failed to obtain device family property for device %d.\n", i ); END_APPLICATION; } getStringValue( hPropFamily, &pStringBuffer, 0 ); free( pStringBuffer ); pStringBuffer = 0; // try to initialise this device if( ( result = DMR_OpenDevice( hDevice, &hDrv ) ) != DMR_NO_ERROR ) { printf( "DMR_OpenDevice failed (code: %d)\n", result ); printf( "DMR_Close: %d\n", DMR_Close() ); END_APPLICATION; } #ifdef USE_MV_DISPLAY_LIB // create a window to display the captured images captureParams.hDisp = mvDispWindowCreate( "CaptureToUserMemory sample(plain 'C')" ); captureParams.pDisp = mvDispWindowGetDisplayHandle( captureParams.hDisp ); mvDispWindowShow( captureParams.hDisp ); #endif // #ifdef USE_MV_DISPLAY_LIB captureParams.hDrv = hDrv; captureParams.pFirstHeapBuffer = 0; captureParams.requestCount = 0; captureParams.ppRequests = 0; // try to locate the frames per second property if( ( captureParams.hFramesPerSecond = getStatisticProp( hDrv, "FramesPerSecond" ) ) == INVALID_ID ) { printf( "Couldn't locate 'FramesPerSecond' property! 
Unable to continue!\n" ); END_APPLICATION; } if( ( captureParams.hRequestCount = getSystemSettingProp( hDrv, "RequestCount" ) ) == INVALID_ID ) { printf( "Couldn't locate 'RequestCount' property! Unable to continue!\n" ); END_APPLICATION; } if( ( captureParams.hCaptureBufferAlignment = getInfoProp( hDrv, "CaptureBufferAlignment" ) ) == INVALID_ID ) { printf( "Couldn't locate 'CaptureBufferAlignment' property! Unable to continue!\n" ); END_APPLICATION; } if( ( captureParams.hRequestControl_Mode = getRequestCtrlProp( hDrv, "Base", "Mode" ) ) == INVALID_ID ) { printf( "Couldn't locate request controls 'Mode' property! Unable to continue!\n" ); END_APPLICATION; } if( ( captureParams.hRequestControl_RequestToUse = getRequestCtrlProp( hDrv, "Base", "RequestToUse" ) ) == INVALID_ID ) { printf( "Couldn't locate request controls 'RequestToUse' property! Unable to continue!\n" ); END_APPLICATION; } allocateRequests( &captureParams ); //============================================================================= //========= Capture loop into memory managed by the driver (default) ========== //============================================================================= printf( "The device will try to capture continuously into memory automatically allocated be the device driver..\n" ); printf( "This is the default behaviour.\n" ); captureParams.boUserSuppliedMemoryUsed = 0; captureLoop( &captureParams ); //============================================================================= //========= Capture loop into memory managed by the user (advanced) =========== //============================================================================= printf( "The device will now try to capture continuously into user supplied memory.\n" ); captureParams.boUserSuppliedMemoryUsed = 1; // find out the size of the resulting buffer by requesting a dummy request setPropI( captureParams.hRequestControl_Mode, ircmTrial, 0 ); DMR_ImageRequestSingle( captureParams.hDrv, 0, 0 ); // waitFor will 
return as fast as possible. No 'real' image will be taken // but a request object that contains a dummy image with the format, dimensions // and other information will be returned, that is (apart from the pixel data) // similar to any 'real' image that would be captured with the current settings result = DMR_ImageRequestWaitFor( captureParams.hDrv, -1, 0, &requestNr ); if( result == DMR_NO_ERROR ) { // check if the request contains a valid image result = DMR_GetImageRequestResultEx( hDrv, requestNr, &ReqRes, sizeof( ReqRes ), 0, 0 ); if( ( result == DMR_NO_ERROR ) && ( ReqRes.result == rrOK ) ) { // obtain the buffer size needed in the current configuration bufferSize = getPropI( captureParams.ppRequests[requestNr]->hImageSize_, 0 ) + getPropI( captureParams.ppRequests[requestNr]->hImageFooterSize_, 0 ); /// switch back to 'normal' capture mode setPropI( captureParams.hRequestControl_Mode, ircmManual, 0 ); // unlock this request to make it usable for the driver again DMR_ImageRequestUnlock( captureParams.hDrv, requestNr ); result = createCaptureBuffers( &captureParams, bufferSize, getPropI( captureParams.hCaptureBufferAlignment, 0 ) ); if( result != 0 ) { printf( "An error occurred while setting up the user supplied buffers(error code: %s).\n", DMR_ErrorCodeToString( result ) ); END_APPLICATION; } } else { printf( "Internal error(Request result: %08x! This should not happen an is a driver fault! Unable to continue.\n", ReqRes.result ); END_APPLICATION; } } else { printf( "Internal error! This should not happen an is a driver fault! 
Unable to continue.\n" ); END_APPLICATION; } captureLoop( &captureParams ); //============================================================================= //========= unregister user supplied buffers again ============================ //============================================================================= freeCaptureBuffers( &captureParams ); //============================================================================= //========= Capture loop into memory managed by the driver again (default) ==== //============================================================================= captureParams.boUserSuppliedMemoryUsed = 0; printf( "The device will try to capture continuously into memory automatically allocated be the device driver again.\n" ); printf( "This is the default behaviour.\n" ); captureLoop( &captureParams ); //============================================================================= //========= Capture into a specific buffer managed by the user (advanced) ===== //============================================================================= // by default the driver will decide which request will be used for an acquisition // requested by the user. However sometimes it can be necessary to make sure that a // certain request object will be used... 
printf( "Now the device will try to capture one frame into a specific user supplied buffer.\n" ); pUserSuppliedBuffer = UserSuppliedHeapBuffer_Alloc( bufferSize, getPropI( captureParams.hCaptureBufferAlignment, 0 ) ); // we want to use request number 'REQUEST_TO_USE' (zero based) for this acquisition thus we have to make sure // that there are at least 'REQUEST_TO_USE + 1' requests if( getPropI( captureParams.hRequestCount, 0 ) < REQUEST_TO_USE ) { setPropI( captureParams.hRequestCount, REQUEST_TO_USE + 1, 0 ); allocateRequests( &captureParams ); } // associate a user supplied buffer with this request result = DMR_ImageRequestConfigure( captureParams.hDrv, REQUEST_TO_USE, 0, 0 ); if( result != DMR_NO_ERROR ) { printf( "An error occurred while setting request number %d in configuration mode: %s.\n", REQUEST_TO_USE, DMR_ErrorCodeToString( result ) ); printf( "Press [ENTER] to end the continuous acquisition.\n" ); getchar(); } setPropI( captureParams.ppRequests[REQUEST_TO_USE]->hImageMemoryMode_, rimmUser, 0 ); setPropP( captureParams.ppRequests[REQUEST_TO_USE]->hImageData_, pUserSuppliedBuffer->pBufAligned_, 0 ); setPropI( captureParams.ppRequests[REQUEST_TO_USE]->hImageSize_, pUserSuppliedBuffer->bufSize_, 0 ); if( ( result = DMR_ImageRequestUnlock( captureParams.hDrv, captureParams.ppRequests[REQUEST_TO_USE]->nr_ ) ) != DMR_NO_ERROR ) { printf( "An error occurred while unlocking request number %d: %s.\n", captureParams.ppRequests[REQUEST_TO_USE]->nr_, DMR_ErrorCodeToString( result ) ); freeCaptureBuffers( &captureParams ); END_APPLICATION; } // define that 'REQUEST_TO_USE' is used for the next acquisition setPropI( captureParams.hRequestControl_RequestToUse, REQUEST_TO_USE, 0 ); // and capture the image requestUsed = INVALID_ID; result = DMR_ImageRequestSingle( captureParams.hDrv, 0, &requestUsed ); if( result != DMR_NO_ERROR ) { printf( "An error occurred while requesting an image for request number %d: (%s).\n", REQUEST_TO_USE, DMR_ErrorCodeToString( result ) ); 
printf( "Press [ENTER] to end the continuous acquisition.\n" ); END_APPLICATION; } if( requestUsed != REQUEST_TO_USE ) { printf( "ERROR! An acquisition into buffer %d was requested, but the driver did use %d for this acquisition.\n", REQUEST_TO_USE, requestUsed ); } manuallyStartAcquisitionIfNeeded( hDrv ); result = DMR_ImageRequestWaitFor( captureParams.hDrv, -1, 0, &requestNr ); manuallyStopAcquisitionIfNeeded( hDrv ); if( result == DMR_NO_ERROR ) { // check if the request contains a valid image result = DMR_GetImageRequestResultEx( hDrv, requestNr, &ReqRes, sizeof( ReqRes ), 0, 0 ); if( ( result == DMR_NO_ERROR ) && ( ReqRes.result == rrOK ) ) { if( ( result = DMR_GetImageRequestBuffer( hDrv, requestNr, &pIB ) ) == DMR_NO_ERROR ) { #ifdef USE_MV_DISPLAY_LIB // display the captured image mvDispSetImageFromImageBuffer( captureParams.pDisp, pIB ); mvDispUpdate( captureParams.pDisp ); #else printf( "Frame captured into request number %d(%dx%d).\n", requestNr, pIB->iWidth, pIB->iHeight ); #endif // #ifdef USE_MV_DISPLAY_LIB } else { printf( "DMR_GetImageRequestBuffer: ERROR! Code %d\n", result ); } } else { printf( "Acquisition into a specific buffer was not successful. Request result: 0x%08x.\n", ReqRes.result ); } } else { printf( "Waiting for a frame captured into a specific buffer failed: %s.\n", DMR_ErrorCodeToString( result ) ); } #ifdef USE_MV_DISPLAY_LIB mvDispWindowDestroy( captureParams.hDisp ); #endif // #ifdef USE_MV_DISPLAY_LIB freeRequests( &captureParams ); freeCaptureBuffers( &captureParams ); printf( "DMR_ReleaseImageRequestBufferDesc: %s.\n", DMR_ErrorCodeToString( DMR_ReleaseImageRequestBufferDesc( &pIB ) ) ); printf( "DMR_CloseDevice: %s\n", DMR_ErrorCodeToString( DMR_CloseDevice( hDrv, hDevice ) ) ); printf( "DMR_Close: %s\n", DMR_ErrorCodeToString( DMR_Close() ) ); END_APPLICATION; }