Exemplo n.º 1
0
// Host-side launcher for the all_distances kernel.
//
// Works out the launch geometry and the dynamic shared memory size, then
// launches the kernel once for every THREADS-wide slice of the feature
// dimension.
//
//   dist     - output; dist.ptr is handed to the kernel
//   query    - query feature set, forwarded to the kernel
//   train    - training feature set, forwarded to the kernel
//   dist_dim - dimension along which the feature length is read; the other
//              of the first two dimensions is treated as the sample dimension
//   n_dist   - not used by this function
void all_distances(Param<To> dist,
                   CParam<T> query,
                   CParam<T> train,
                   const dim_t dist_dim,
                   const unsigned n_dist)
{
    // Samples lie along whichever of dimensions 0/1 is not dist_dim.
    const dim_t sample_dim = (dist_dim != 0) ? 0 : 1;

    const unsigned feat_len = query.dims[dist_dim];
    const unsigned ntrain   = train.dims[sample_dim];

    // A single launch covers at most THREADS features.
    const unsigned max_kern_feat_len = min(THREADS, feat_len);
    const To max_dist = maxval<To>();

    // One thread per training sample, THREADS samples per block.
    dim3 threads(THREADS, 1);
    dim3 blocks(divup(ntrain, threads.x), 1);

    // Shared memory budget. The base part is always requested; the larger
    // per-thread part (presumably a cache of training features - confirm
    // against the kernel) is only requested when the device has room for it.
    const size_t avail_smem = getDeviceProp(getActiveDeviceId()).sharedMemPerBlock;
    const size_t smem_base  = 2 * THREADS * sizeof(unsigned) + max_kern_feat_len * sizeof(T);
    const size_t smem_train = threads.x * max_kern_feat_len * sizeof(T);
    const size_t smem_full  = smem_base + smem_train;

    const bool use_shmem   = smem_full <= avail_smem;
    const unsigned smem_sz = use_shmem ? smem_full : smem_base;

    // The kernel variant is a compile-time choice, so pick it once and then
    // walk the feature dimension in steps of THREADS.
    if (use_shmem) {
        for (int feat_offset = 0; feat_offset < feat_len; feat_offset += THREADS) {
            CUDA_LAUNCH_SMEM((all_distances<T,To,dist_type,true>), blocks, threads, smem_sz,
                             dist.ptr, query, train, max_dist, feat_len, max_kern_feat_len, feat_offset);
        }
    } else {
        for (int feat_offset = 0; feat_offset < feat_len; feat_offset += THREADS) {
            CUDA_LAUNCH_SMEM((all_distances<T,To,dist_type,false>), blocks, threads, smem_sz,
                             dist.ptr, query, train, max_dist, feat_len, max_kern_feat_len, feat_offset);
        }
    }
    POST_LAUNCH_CHECK();
}
Exemplo n.º 2
0
// Host-side launcher for the two-stage nearest neighbour search.
//
// Stage 1 launches a nearest_neighbour kernel that writes per-block candidate
// indices/distances into temporary buffers (nblk * nquery entries each).
// Stage 2 launches select_matches with one block per query to reduce those
// candidates into idx/dist.
//
//   idx      - output indices, passed to select_matches
//   dist     - output distances, passed to select_matches
//   query    - query feature set
//   train    - training feature set
//   dist_dim - dimension along which the feature length is read; the other
//              of the first two dimensions is treated as the sample dimension
//   n_dist   - not used by this function
//
// Feature lengths greater than THREADS are reported through
// CUDA_NOT_SUPPORTED().
void nearest_neighbour(Param<uint> idx,
                       Param<To> dist,
                       CParam<T> query,
                       CParam<T> train,
                       const dim_t dist_dim,
                       const unsigned n_dist)
{
    const To max_dist       = maxval<To>();
    const unsigned feat_len = query.dims[dist_dim];

    if (feat_len > THREADS) {
        CUDA_NOT_SUPPORTED();
    }

    // Samples lie along whichever of dimensions 0/1 is not dist_dim.
    const dim_t sample_dim = (dist_dim != 0) ? 0 : 1;
    const unsigned nquery  = query.dims[sample_dim];
    const unsigned ntrain  = train.dims[sample_dim];

    // Stage 1 geometry: one thread per training sample.
    dim3 threads(THREADS, 1);
    dim3 blocks(divup(ntrain, threads.x), 1);
    const unsigned nblk = blocks.x;

    // Shared memory budget. The base part is always requested; the larger
    // per-thread part is only requested when the device has room for it.
    const size_t avail_smem = getDeviceProp(getActiveDeviceId()).sharedMemPerBlock;
    const size_t smem_base  = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
    const size_t smem_train = threads.x * feat_len * sizeof(T);
    const size_t smem_full  = smem_base + smem_train;

    const bool use_shmem   = smem_full <= avail_smem;
    const unsigned smem_sz = use_shmem ? smem_full : smem_base;

    // Per-block partial results consumed by the reduction stage.
    auto blk_idx  = memAlloc<unsigned>(nblk * nquery);
    auto blk_dist = memAlloc<To>(nblk * nquery);

    // Stage 1: per-block best candidate for every query. Power-of-two feature
    // lengths up to 64 use kernels with the length baked in as a template
    // argument; everything else takes the generic kernel.
    switch (feat_len) {
    case 1:
        if (use_shmem) {
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,1,true>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist);
        } else {
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,1,false>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist);
        }
        break;
    case 2:
        if (use_shmem) {
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,2,true>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist);
        } else {
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,2,false>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist);
        }
        break;
    case 4:
        if (use_shmem) {
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,4,true>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist);
        } else {
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,4,false>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist);
        }
        break;
    case 8:
        if (use_shmem) {
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,8,true>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist);
        } else {
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,8,false>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist);
        }
        break;
    case 16:
        if (use_shmem) {
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,16,true>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist);
        } else {
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,16,false>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist);
        }
        break;
    case 32:
        if (use_shmem) {
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,32,true>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist);
        } else {
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,32,false>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist);
        }
        break;
    case 64:
        if (use_shmem) {
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,64,true>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist);
        } else {
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,64,false>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist);
        }
        break;
    default:
        if (use_shmem) {
            CUDA_LAUNCH_SMEM((nearest_neighbour<T,To,dist_type,true>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist, feat_len);
        } else {
            CUDA_LAUNCH_SMEM((nearest_neighbour<T,To,dist_type,false>), blocks, threads, smem_sz,
                             blk_idx.get(), blk_dist.get(), query, train, max_dist, feat_len);
        }
    }
    POST_LAUNCH_CHECK();

    // Stage 2: one 32x8 block per query picks the best of the nblk candidates
    // and stores the final match.
    const dim3 reduce_threads(32, 8);
    const dim3 reduce_blocks(nquery, 1);

    CUDA_LAUNCH(select_matches, reduce_blocks, reduce_threads,
                idx, dist, blk_idx.get(), blk_dist.get(), nquery, nblk, max_dist);
    POST_LAUNCH_CHECK();

}
Exemplo n.º 3
0
//-----------------------------------------------------------------------------
int main( int argc, char* argv[] )
//-----------------------------------------------------------------------------
// Sample entry point. Opens a device selected by the user and then runs four
// capture phases in sequence:
//   1. continuous capture into driver-managed memory,
//   2. continuous capture into user supplied buffers,
//   3. continuous capture into driver-managed memory again,
//   4. a single capture into one specific request ('REQUEST_TO_USE') that is
//      backed by a user supplied buffer.
// Every failure path prints a message and leaves through END_APPLICATION.
{
    HDMR                       hDMR = INVALID_ID;
    HDRV                       hDrv = INVALID_ID;
    HDEV                       hDevice = INVALID_ID;
    TDMR_ERROR                 result = DMR_NO_ERROR;
    CaptureParameter           captureParams;
    unsigned int               i = 0;
    HOBJ                       hPropFamily = INVALID_ID;
    char*                      pStringBuffer = NULL;
    UserSuppliedHeapBuffer*    pUserSuppliedBuffer = 0;
    int                        requestNr = INVALID_ID;
    int                        requestUsed = INVALID_ID;
    int                        bufferSize = 0;
    RequestResult              ReqRes;
    ImageBuffer*               pIB = 0;
    const int                  REQUEST_TO_USE = 2;

    // the command line is not evaluated; silence 'unused parameter' warnings
    // without the self-assignment some compilers complain about
    ( void )argc;
    ( void )argv;

    // try to initialise the library.
    if( ( result = DMR_Init( &hDMR ) ) != DMR_NO_ERROR )
    {
        printf( "DMR_Init failed (code: %d)\n", result );
        END_APPLICATION;
    }

    getDeviceFromUserInput( &hDevice, 0, 1 );

    if( ( hPropFamily = getDeviceProp( hDevice, "Family" ) ) == INVALID_ID )
    {
        printf( "Failed to obtain device family property for device %d.\n", i );
        END_APPLICATION;
    }
    // the family string is fetched and released again without being used here
    getStringValue( hPropFamily, &pStringBuffer, 0 );
    free( pStringBuffer );
    pStringBuffer = 0;

    // try to initialise this device
    if( ( result = DMR_OpenDevice( hDevice, &hDrv ) ) != DMR_NO_ERROR )
    {
        printf( "DMR_OpenDevice failed (code: %d)\n", result );
        printf( "DMR_Close: %d\n", DMR_Close() );
        END_APPLICATION;
    }

#ifdef USE_MV_DISPLAY_LIB
    // create a window to display the captured images
    captureParams.hDisp = mvDispWindowCreate( "CaptureToUserMemory sample(plain 'C')" );
    captureParams.pDisp = mvDispWindowGetDisplayHandle( captureParams.hDisp );
    mvDispWindowShow( captureParams.hDisp );
#endif // #ifdef USE_MV_DISPLAY_LIB
    captureParams.hDrv = hDrv;
    captureParams.pFirstHeapBuffer = 0;
    captureParams.requestCount = 0;
    captureParams.ppRequests = 0;
    // try to locate the frames per second property
    if( ( captureParams.hFramesPerSecond = getStatisticProp( hDrv, "FramesPerSecond" ) ) == INVALID_ID )
    {
        printf( "Couldn't locate 'FramesPerSecond' property! Unable to continue!\n" );
        END_APPLICATION;
    }

    if( ( captureParams.hRequestCount = getSystemSettingProp( hDrv, "RequestCount" ) ) == INVALID_ID )
    {
        printf( "Couldn't locate 'RequestCount' property! Unable to continue!\n" );
        END_APPLICATION;
    }

    if( ( captureParams.hCaptureBufferAlignment = getInfoProp( hDrv, "CaptureBufferAlignment" ) ) == INVALID_ID )
    {
        printf( "Couldn't locate 'CaptureBufferAlignment' property! Unable to continue!\n" );
        END_APPLICATION;
    }

    if( ( captureParams.hRequestControl_Mode = getRequestCtrlProp( hDrv, "Base", "Mode" ) ) == INVALID_ID )
    {
        printf( "Couldn't locate request controls 'Mode' property! Unable to continue!\n" );
        END_APPLICATION;
    }

    if( ( captureParams.hRequestControl_RequestToUse = getRequestCtrlProp( hDrv, "Base", "RequestToUse" ) ) == INVALID_ID )
    {
        printf( "Couldn't locate request controls 'RequestToUse' property! Unable to continue!\n" );
        END_APPLICATION;
    }

    allocateRequests( &captureParams );
    //=============================================================================
    //========= Capture loop into memory managed by the driver (default) ==========
    //=============================================================================
    printf( "The device will try to capture continuously into memory automatically allocated be the device driver..\n" );
    printf( "This is the default behaviour.\n" );
    captureParams.boUserSuppliedMemoryUsed = 0;
    captureLoop( &captureParams );

    //=============================================================================
    //========= Capture loop into memory managed by the user (advanced) ===========
    //=============================================================================
    printf( "The device will now try to capture continuously into user supplied memory.\n" );
    captureParams.boUserSuppliedMemoryUsed = 1;
    // find out the size of the resulting buffer by requesting a dummy request
    setPropI( captureParams.hRequestControl_Mode, ircmTrial, 0 );
    DMR_ImageRequestSingle( captureParams.hDrv, 0, 0 );
    // waitFor will return as fast as possible. No 'real' image will be taken
    // but a request object that contains a dummy image with the format, dimensions
    // and other information will be returned, that is (apart from the pixel data)
    // similar to any 'real' image that would be captured with the current settings
    result = DMR_ImageRequestWaitFor( captureParams.hDrv, -1, 0, &requestNr );
    if( result == DMR_NO_ERROR )
    {
        // check if the request contains a valid image
        result = DMR_GetImageRequestResultEx( hDrv, requestNr, &ReqRes, sizeof( ReqRes ), 0, 0 );
        if( ( result == DMR_NO_ERROR ) && ( ReqRes.result == rrOK ) )
        {
            // obtain the buffer size needed in the current configuration (image data plus footer)
            bufferSize = getPropI( captureParams.ppRequests[requestNr]->hImageSize_, 0 ) + getPropI( captureParams.ppRequests[requestNr]->hImageFooterSize_, 0 );
            // switch back to 'normal' capture mode
            setPropI( captureParams.hRequestControl_Mode, ircmManual, 0 );
            // unlock this request to make it usable for the driver again
            DMR_ImageRequestUnlock( captureParams.hDrv, requestNr );
            result = createCaptureBuffers( &captureParams, bufferSize, getPropI( captureParams.hCaptureBufferAlignment, 0 ) );
            if( result != 0 )
            {
                printf( "An error occurred while setting up the user supplied buffers(error code: %s).\n", DMR_ErrorCodeToString( result ) );
                END_APPLICATION;
            }
        }
        else
        {
            printf( "Internal error(Request result: %08x! This should not happen an is a driver fault! Unable to continue.\n", ReqRes.result );
            END_APPLICATION;
        }
    }
    else
    {
        printf( "Internal error! This should not happen an is a driver fault! Unable to continue.\n" );
        END_APPLICATION;
    }

    captureLoop( &captureParams );
    //=============================================================================
    //========= unregister user supplied buffers again ============================
    //=============================================================================
    freeCaptureBuffers( &captureParams );

    //=============================================================================
    //========= Capture loop into memory managed by the driver again (default) ====
    //=============================================================================
    captureParams.boUserSuppliedMemoryUsed = 0;
    printf( "The device will try to capture continuously into memory automatically allocated be the device driver again.\n" );
    printf( "This is the default behaviour.\n" );
    captureLoop( &captureParams );

    //=============================================================================
    //========= Capture into a specific buffer managed by the user (advanced) =====
    //=============================================================================
    // by default the driver will decide which request will be used for an acquisition
    // requested by the user. However sometimes it can be necessary to make sure that a
    // certain request object will be used...
    printf( "Now the device will try to capture one frame into a specific user supplied buffer.\n" );
    pUserSuppliedBuffer = UserSuppliedHeapBuffer_Alloc( bufferSize, getPropI( captureParams.hCaptureBufferAlignment, 0 ) );
    // the buffer is dereferenced below, so bail out if the allocation did not succeed
    if( pUserSuppliedBuffer == 0 )
    {
        printf( "Failed to allocate a user supplied buffer of %d bytes! Unable to continue!\n", bufferSize );
        END_APPLICATION;
    }
    // NOTE(review): pUserSuppliedBuffer is never released in this function - confirm
    // whether a matching free routine should be called before leaving.

    // we want to use request number 'REQUEST_TO_USE' (zero based) for this acquisition thus we have to make sure
    // that there are at least 'REQUEST_TO_USE + 1' requests. Comparing against 'REQUEST_TO_USE' alone would
    // accept a request count equal to 'REQUEST_TO_USE', for which index 'REQUEST_TO_USE' is out of range.
    if( getPropI( captureParams.hRequestCount, 0 ) < REQUEST_TO_USE + 1 )
    {
        setPropI( captureParams.hRequestCount, REQUEST_TO_USE + 1, 0 );
        allocateRequests( &captureParams );
    }
    // associate a user supplied buffer with this request
    result = DMR_ImageRequestConfigure( captureParams.hDrv, REQUEST_TO_USE, 0, 0 );
    if( result != DMR_NO_ERROR )
    {
        // NOTE(review): execution continues after this message - confirm that this is intended
        printf( "An error occurred while setting request number %d in configuration mode: %s.\n", REQUEST_TO_USE, DMR_ErrorCodeToString( result ) );
        printf( "Press [ENTER] to end the continuous acquisition.\n" );
        getchar();
    }

    // switch the request to user memory and hand over the aligned buffer and its size
    setPropI( captureParams.ppRequests[REQUEST_TO_USE]->hImageMemoryMode_, rimmUser, 0 );
    setPropP( captureParams.ppRequests[REQUEST_TO_USE]->hImageData_, pUserSuppliedBuffer->pBufAligned_, 0 );
    setPropI( captureParams.ppRequests[REQUEST_TO_USE]->hImageSize_, pUserSuppliedBuffer->bufSize_, 0 );

    if( ( result = DMR_ImageRequestUnlock( captureParams.hDrv, captureParams.ppRequests[REQUEST_TO_USE]->nr_ ) ) != DMR_NO_ERROR )
    {
        printf( "An error occurred while unlocking request number %d: %s.\n", captureParams.ppRequests[REQUEST_TO_USE]->nr_, DMR_ErrorCodeToString( result ) );
        freeCaptureBuffers( &captureParams );
        END_APPLICATION;
    }

    // define that 'REQUEST_TO_USE' is used for the next acquisition
    setPropI( captureParams.hRequestControl_RequestToUse, REQUEST_TO_USE, 0 );
    // and capture the image
    requestUsed = INVALID_ID;
    result = DMR_ImageRequestSingle( captureParams.hDrv, 0, &requestUsed );
    if( result != DMR_NO_ERROR )
    {
        printf( "An error occurred while requesting an image for request number %d: (%s).\n", REQUEST_TO_USE, DMR_ErrorCodeToString( result ) );
        printf( "Press [ENTER] to end the continuous acquisition.\n" );
        END_APPLICATION;
    }
    if( requestUsed != REQUEST_TO_USE )
    {
        printf( "ERROR! An acquisition into buffer %d was requested, but the driver did use %d for this acquisition.\n", REQUEST_TO_USE, requestUsed );
    }
    manuallyStartAcquisitionIfNeeded( hDrv );
    result = DMR_ImageRequestWaitFor( captureParams.hDrv, -1, 0, &requestNr );
    manuallyStopAcquisitionIfNeeded( hDrv );
    if( result == DMR_NO_ERROR )
    {
        // check if the request contains a valid image
        result = DMR_GetImageRequestResultEx( hDrv, requestNr, &ReqRes, sizeof( ReqRes ), 0, 0 );
        if( ( result == DMR_NO_ERROR ) && ( ReqRes.result == rrOK ) )
        {
            if( ( result = DMR_GetImageRequestBuffer( hDrv, requestNr, &pIB ) ) == DMR_NO_ERROR )
            {
#ifdef USE_MV_DISPLAY_LIB
                // display the captured image
                mvDispSetImageFromImageBuffer( captureParams.pDisp, pIB );
                mvDispUpdate( captureParams.pDisp );
#else
                printf( "Frame captured into request number %d(%dx%d).\n", requestNr, pIB->iWidth, pIB->iHeight );
#endif // #ifdef USE_MV_DISPLAY_LIB
            }
            else
            {
                printf( "DMR_GetImageRequestBuffer: ERROR! Code %d\n", result );
            }
        }
        else
        {
            printf( "Acquisition into a specific buffer was not successful. Request result: 0x%08x.\n", ReqRes.result );
        }
    }
    else
    {
        printf( "Waiting for a frame captured into a specific buffer failed: %s.\n", DMR_ErrorCodeToString( result ) );
    }

    // regular shutdown: release display, requests, buffers, the buffer descriptor, the device and the library
#ifdef USE_MV_DISPLAY_LIB
    mvDispWindowDestroy( captureParams.hDisp );
#endif // #ifdef USE_MV_DISPLAY_LIB
    freeRequests( &captureParams );
    freeCaptureBuffers( &captureParams );
    printf( "DMR_ReleaseImageRequestBufferDesc: %s.\n", DMR_ErrorCodeToString( DMR_ReleaseImageRequestBufferDesc( &pIB ) ) );
    printf( "DMR_CloseDevice: %s\n", DMR_ErrorCodeToString( DMR_CloseDevice( hDrv, hDevice ) ) );
    printf( "DMR_Close: %s\n", DMR_ErrorCodeToString( DMR_Close() ) );
    END_APPLICATION;
}