Пример #1
0
/*!

*/
void util::KernelExtractorDriver::kernelLaunch(CUfunction f, int gridX, int gridY) {
	state.launch.gridDim = ir::Dim3(gridX, gridY, 1);
	FunctionNameMap::const_iterator f_it = functionNameMap.find(f);
	
	cuCtxSynchronize();	// wait for previous launches to conclude [somehow]	
	synchronizeFromDevice();
	
	if (f_it != functionNameMap.end()) {
		state.launch.moduleName = f_it->second.first;
		state.launch.kernelName = f_it->second.second;
	}
	else {
		state.launch.moduleName = "unknown_module";
		state.launch.kernelName = "unknown_kernel";
	}
	
	// serialize 'before' state
	std::string launchName = state.launch.moduleName + "-" + state.launch.kernelName;
	std::ofstream file(state.application.name + "-" + launchName + ".json");
	
	std::string app = state.application.name;
	state.application.name += "-before-" + launchName;
	state.serialize(file);
	state.application.name = app;
}
Пример #2
0
int madd_gpu(struct device_info *device_info, CUdeviceptr *a_dev, CUdeviceptr
 *b_dev, CUdeviceptr *c_dev, unsigned int rows, unsigned int cols)
{
	CUresult res;

	/* set kernel parameters */
	void *kernel_params[] = {a_dev, b_dev, c_dev, &rows, &cols};

	/* execute kernel */
	unsigned int gridWidth = cols >> X_THREADS_PER_BLOCK_SHIFT;
	unsigned int gridHeight = rows >> Y_THREADS_PER_BLOCK_SHIFT;

	unsigned int shmemBytes = 0x40; /* random value */

	if ((res = cuLaunchKernel(device_info->kernel, gridWidth, gridHeight, 1,
	 X_THREADS_PER_BLOCK, Y_THREADS_PER_BLOCK, 1, shmemBytes, 0,
	 kernel_params, 0)) != CUDA_SUCCESS) {
		printf("cuLaunchKernel failed: res = %lu\n",
		 (unsigned long)res);
		return -1;
	}

	cuCtxSynchronize();
	
	return 0;
}
Пример #3
0
WEAK void halide_release() {
    // CUcontext ignore;
    // TODO: this is for timing; bad for release-mode performance
    CHECK_CALL( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

    // Only destroy the context if we own it
    if (weak_cuda_ctx) {
        CHECK_CALL( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );    
        weak_cuda_ctx = 0;
    }

    // Destroy the events
    if (__start) {
        cuEventDestroy(__start);
        cuEventDestroy(__end);
        __start = __end = 0;
    }

    // Unload the module
    if (__mod) {
        CHECK_CALL( cuModuleUnload(__mod), "cuModuleUnload" );
        __mod = 0;
    }

    //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" );
}
Пример #4
0
// Used to generate correct timings when tracing
WEAK int halide_dev_sync(void *user_context) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_sync (user_context: %p)\n", user_context );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    CUresult err = cuCtxSynchronize();
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)",
                             _get_error_name(err));
        return err;
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Пример #5
0
WEAK void halide_release() {
    // It's possible that this is being called from the destructor of
    // a static variable, in which case the driver may already be
    // shutting down. For this reason we allow the deinitialized
    // error.
    CHECK_CALL_DEINIT_OK( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

    // Only destroy the context if we own it
    if (weak_cuda_ctx) {
        CHECK_CALL_DEINIT_OK( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );
        weak_cuda_ctx = 0;
    }

    // Destroy the events
    if (__start) {
        cuEventDestroy(__start);
        cuEventDestroy(__end);
        __start = __end = 0;
    }

    // Unload the module
    if (__mod) {
        CHECK_CALL_DEINIT_OK( cuModuleUnload(__mod), "cuModuleUnload" );
        __mod = 0;
    }

    //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" );
}
Пример #6
0
WEAK int halide_dev_run(void *user_context,
                        void *state_ptr,
                        const char* entry_name,
                        int blocksX, int blocksY, int blocksZ,
                        int threadsX, int threadsY, int threadsZ,
                        int shared_mem_bytes,
                        size_t arg_sizes[],
                        void* args[]) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_run (user_context: %p, entry: %s, blocks: %dx%dx%d, threads: %dx%dx%d, shmem: %d)\n",
                  user_context, entry_name,
                  blocksX, blocksY, blocksZ,
                  threadsX, threadsY, threadsZ,
                  shared_mem_bytes );

    CUresult err;
    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, state_ptr);
    CUmodule mod = ((module_state*)state_ptr)->module;
    halide_assert(user_context, mod);
    CUfunction f;
    err = cuModuleGetFunction(&f, mod, entry_name);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuModuleGetFunction failed (%s)",
                             _get_error_name(err));
        return err;
    }

    err = cuLaunchKernel(f,
                         blocksX,  blocksY,  blocksZ,
                         threadsX, threadsY, threadsZ,
                         shared_mem_bytes,
                         NULL, // stream
                         args,
                         NULL);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuLaunchKernel failed (%s)",
                             _get_error_name(err));
        return err;
    }

    #ifdef DEBUG
    err = cuCtxSynchronize();
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)\n",
                             _get_error_name(err));
        return err;
    }
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif
    return 0;
}
Пример #7
0
/**
 * Frees the memory space pointed to by dptr, which must have been returned 
 * by a previous call to cuMemAlloc() or cuMemAllocPitch().
 *
 * Parameters:
 * dptr 	- Pointer to memory to free
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED, 
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE 
 */
CUresult cuMemFree_v2(CUdeviceptr dptr)
{
	CUresult res;
	struct CUctx_st *ctx;
	Ghandle handle;
	uint64_t addr = dptr;
	uint64_t size;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;

	res = cuCtxGetCurrent(&ctx);
	if (res != CUDA_SUCCESS)
		return res;

	/* wait for all kernels to complete - some may be using the memory. */
	cuCtxSynchronize();

	handle = ctx->gdev_handle;

	if (!(size = gfree(handle, addr)))
		return CUDA_ERROR_INVALID_VALUE;

	return CUDA_SUCCESS;
}
Пример #8
0
WEAK void halide_release(void *user_context) {
    // Do not do any of this if there is not context set. E.g.
    // if halide_release is called and no CUDA calls have been made.
    if (cuda_ctx_ptr != NULL) {
        // It's possible that this is being called from the destructor of
        // a static variable, in which case the driver may already be
        // shutting down. For this reason we allow the deinitialized
        // error.
        CHECK_CALL_DEINIT_OK( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

        // Destroy the events
        if (__start) {
            cuEventDestroy(__start);
            cuEventDestroy(__end);
            __start = __end = 0;
        }

        // Unload the module
        if (__mod) {
            CHECK_CALL_DEINIT_OK( cuModuleUnload(__mod), "cuModuleUnload" );
            __mod = 0;
        }

        // Only destroy the context if we own it
        if (weak_cuda_ctx) {
            CHECK_CALL_DEINIT_OK( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );
            weak_cuda_ctx = 0;
        }

        cuda_ctx_ptr = NULL;
    }

    //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" );
}
Пример #9
0
void GPUInterface::ResizeStreamCount(int newStreamCount) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::ResizeStreamCount\n");
#endif
    SAFE_CUDA(cuCtxPushCurrent(cudaContext));

    SAFE_CUDA(cuCtxSynchronize());

    if (cudaStreams != NULL) {
        for(int i=0; i<numStreams; i++) {
            if (cudaStreams[i] != NULL)
                SAFE_CUDA(cuStreamDestroy(cudaStreams[i]));
        }
        free(cudaStreams);
    }

    if (cudaEvents != NULL) {
        for(int i=0; i<numStreams; i++) {
            if (cudaEvents[i] != NULL)
                SAFE_CUDA(cuEventDestroy(cudaEvents[i]));
        }
        free(cudaEvents);
    }

    if (newStreamCount == 1) {
        numStreams = 1;
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        cudaStreams[0] = NULL;
        CUevent event;
        for(int i=0; i<2; i++) {
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
    } else {
        numStreams = newStreamCount;
        if (numStreams > BEAGLE_STREAM_COUNT) {
            numStreams = BEAGLE_STREAM_COUNT;
        }
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        CUstream stream;
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        CUevent event;
        for(int i=0; i<numStreams; i++) {
            SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT));
            cudaStreams[i] = stream;
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
        SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
        cudaEvents[numStreams] = event;
    }

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::ResizeStreamCount\n");
#endif
}
Пример #10
0
CAMLprim value spoc_cuda_flush_all(value gi, value dev){
	CAMLparam2(gi, dev);

	CUDA_GET_CONTEXT;
	cuCtxSynchronize();
	CUDA_RESTORE_CONTEXT;

	CAMLreturn(Val_unit);
}
Пример #11
0
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    runBlocks
 * Signature: (I)V
 */
JNIEXPORT jint JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_runBlocks
  (JNIEnv *env, jobject this_obj, jint num_blocks, jint block_shape, jint grid_shape){

  CUresult status;
  jlong * infoSpace = (jlong *) malloc(gc_space_size);
  infoSpace[1] = heapEndPtr;
  cuMemcpyHtoD(gcInfoSpace, infoSpace, gc_space_size);
  cuMemcpyHtoD(gpuToSpace, toSpace, heapEndPtr);
  //cuMemcpyHtoD(gpuTexture, textureMemory, textureMemSize);
  cuMemcpyHtoD(gpuHandlesMemory, handlesMemory, num_blocks * sizeof(jlong));
  cuMemcpyHtoD(gpuHeapEndPtr, &heapEndPtr, sizeof(jlong));
  cuMemcpyHtoD(gpuBufferSize, &bufferSize, sizeof(jlong));
  
/*
  status = cuModuleGetTexRef(&cache, cuModule, "m_Cache");  
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuModuleGetTexRef %d\n", status);
  }

  status = cuTexRefSetAddress(0, cache, gpuTexture, textureMemSize);
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuTextRefSetAddress %d\n", status);
  }
*/

  status = cuFuncSetBlockShape(cuFunction, block_shape, 1, 1);
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuFuncSetBlockShape %d\n", status);
    return (jint) status;
  }

  status = cuLaunchGrid(cuFunction, grid_shape, 1);
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuLaunchGrid %d\n", status);
    fflush(stdout);
    return (jint) status;
  }

  status = cuCtxSynchronize();  
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuCtxSynchronize %d\n", status);
    return (jint) status;
  }

  cuMemcpyDtoH(infoSpace, gcInfoSpace, gc_space_size);
  heapEndPtr = infoSpace[1];
  cuMemcpyDtoH(toSpace, gpuToSpace, heapEndPtr);
  cuMemcpyDtoH(exceptionsMemory, gpuExceptionsMemory, num_blocks * sizeof(jlong));
  free(infoSpace);
  
  return 0;
}
Пример #12
0
CUresult
TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha )
{
    CUresult status;
    CUdeviceptr dptrOut = 0;
    CUdeviceptr dptrIn = 0;
    float *hostOut = 0;
    float *hostIn = 0;

    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );

    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );
    for ( size_t i = 0; i < N; i++ ) {
        hostIn[i] = (float) rand() / (float) RAND_MAX;
    }
    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float ), NULL ) );

    {
        CUmodule moduleSAXPY;
        CUfunction kernelSAXPY;
        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };
        
        moduleSAXPY = chDevice->module( "saxpy.ptx" );
        if ( ! moduleSAXPY ) {
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );

        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );

    }

    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
    CUDA_CHECK( cuCtxSynchronize() );
    for ( size_t i = 0; i < N; i++ ) {
        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
            status = CUDA_ERROR_UNKNOWN;
            goto Error;
        }
    }
    status = CUDA_SUCCESS;
    printf( "Well it worked!\n" );

Error:
    cuCtxPopCurrent( NULL );
    cuMemFreeHost( hostOut );
    cuMemFreeHost( hostIn );
    cuMemFree( dptrOut );
    cuMemFree( dptrIn );
    return status;
}
Пример #13
0
void swanSynchronize( void ) {
	CUresult err =cuCtxSynchronize();

	if ( err != CUDA_SUCCESS ) {
		error("swanSynchronize failed\n" );
	}
	if( state.debug ) {
		printf("# swanSynchronize()\n");
	}
}
Пример #14
0
void CudaModule::sync(bool yield)
{
  if (!s_inited) {
    return;
  }

  if (!yield || !s_endEvent) {
    checkError("cuCtxSynchronize", cuCtxSynchronize());
    return;
  }
}
Пример #15
0
void GPUInterface::SynchronizeHost() {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::SynchronizeHost\n");
#endif

    SAFE_CUPP(cuCtxSynchronize());

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::SynchronizeHost\n");
#endif
}
Пример #16
0
SEXP R_cuCtxSynchronize()
{
    SEXP r_ans = R_NilValue;
    
    CUresult ans;
    ans = cuCtxSynchronize();
    
    r_ans = Renum_convert_CUresult(ans) ;
    
    return(r_ans);
}
Пример #17
0
void util::KernelExtractorDriver::kernelReturn(CUresult result) {
	cuCtxSynchronize();
	
	synchronizeFromDevice();
	
	
	std::string launchName = state.launch.moduleName + "-" + state.launch.kernelName;
	std::ofstream file(state.application.name + "-" + launchName + ".json", std::ios_base::app);
	
	std::string app = state.application.name;
	state.application.name += "-after-" + state.launch.moduleName + "-" + state.launch.kernelName;
	state.serialize(file);
	state.application.name = app;
}
Пример #18
0
void swanRunKernel( const char *kernel,  block_config_t grid , block_config_t block, size_t shmem, int flags, void *ptrs[], int *types  ) {
	CUresult  err;
	swanRunKernelAsync( kernel, grid, block, shmem, flags, ptrs, types );

 if(state.debug) {
	err =cuCtxSynchronize();
 }

//	if( err != CUDA_SUCCESS ) {
//		fprintf( stderr , "SWAN : FATAL : Failure executing kernel sync [%s] [%d]\n", kernel, err );
//	assert(0);
//		exit(-99);
//	}

}
Пример #19
0
Файл: ov.c Проект: CPFL/gtraffic
/*
 * execute kernel function
 */
void
execute_cuda(void){

  
  res = cuLaunchGrid(function, 1, block_num);
  if(res != CUDA_SUCCESS){
    printf("cuLaunchGrid() failed: res = %s\n", conv(res));
    exit(1);
  }

  
  res = cuCtxSynchronize();
  if(res != CUDA_SUCCESS){
    printf("cuCtxSynchronize() failed: res = %s\n", conv(res));
    exit(1);
  }
  
}
Пример #20
0
WEAK void halide_release(void *user_context) {
    DEBUG_PRINTF( user_context, "CUDA: halide_release (user_context: %p)\n", user_context );

    int err;
    CUcontext ctx;
    err = halide_acquire_cuda_context(user_context, &ctx);
    if (err != CUDA_SUCCESS || !ctx) {
        return;
    }

    // It's possible that this is being called from the destructor of
    // a static variable, in which case the driver may already be
    // shutting down.
    err = cuCtxSynchronize();
    halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);

    // Unload the modules attached to this context. Note that the list
    // nodes themselves are not freed, only the module objects are
    // released. Subsequent calls to halide_init_kernels might re-create
    // the program object using the same list node to store the module
    // object.
    module_state *state = state_list;
    while (state) {
        if (state->module) {
            DEBUG_PRINTF(user_context, "    cuModuleUnload %p\n", state->module);
            err = cuModuleUnload(state->module);
            halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
            state->module = 0;
        }
        state = state->next;
    }

    // Only destroy the context if we own it
    if (ctx == weak_cuda_ctx) {
        DEBUG_PRINTF(user_context, "    cuCtxDestroy %p\n", weak_cuda_ctx);
        err = cuCtxDestroy(weak_cuda_ctx);
        halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
        weak_cuda_ctx = NULL;
    }

    halide_release_cuda_context(user_context);
}
Пример #21
0
bool VideoDecoderCUDAPrivate::processDecodedData(CUVIDPARSERDISPINFO *cuviddisp, VideoFrame* outFrame) {
    int num_fields = cuviddisp->progressive_frame ? 1 : 2+cuviddisp->repeat_first_field;

    for (int active_field = 0; active_field < num_fields; ++active_field) {
        CUVIDPROCPARAMS proc_params;
        memset(&proc_params, 0, sizeof(CUVIDPROCPARAMS));
        proc_params.progressive_frame = cuviddisp->progressive_frame; //check user config
        proc_params.second_field = active_field == 1; //check user config
        proc_params.top_field_first = cuviddisp->top_field_first;
        proc_params.unpaired_field = cuviddisp->progressive_frame == 1;

        CUdeviceptr devptr;
        unsigned int pitch;
        cuvidCtxLock(vid_ctx_lock, 0);
        CUresult cuStatus = cuvidMapVideoFrame(dec, cuviddisp->picture_index, &devptr, &pitch, &proc_params);
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuvidMapVideoFrame failed on index %d (%#x, %s)", cuviddisp->picture_index, cuStatus, _cudaGetErrorEnum(cuStatus));
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
#define PAD_ALIGN(x,mask) ( (x + mask) & ~mask )
        //uint w = dec_create_info.ulWidth;//PAD_ALIGN(dec_create_info.ulWidth, 0x3F);
        uint h = dec_create_info.ulHeight;//PAD_ALIGN(dec_create_info.ulHeight, 0x0F); //?
#undef PAD_ALIGN
        int size = pitch*h*3/2;
        if (size > host_data_size && host_data) {
            cuMemFreeHost(host_data);
            host_data = 0;
            host_data_size = 0;
        }
        if (!host_data) {
            cuStatus = cuMemAllocHost((void**)&host_data, size);
            if (cuStatus != CUDA_SUCCESS) {
                qWarning("cuMemAllocHost failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
                cuvidUnmapVideoFrame(dec, devptr);
                cuvidCtxUnlock(vid_ctx_lock, 0);
                return false;
            }
            host_data_size = size;
        }
        if (!host_data) {
            qWarning("No valid staging memory!");
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        cuStatus = cuMemcpyDtoHAsync(host_data, devptr, size, stream);
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuMemcpyDtoHAsync failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        cuStatus = cuCtxSynchronize();
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuCtxSynchronize failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
        }
        cuvidUnmapVideoFrame(dec, devptr);
        cuvidCtxUnlock(vid_ctx_lock, 0);
        //qDebug("mark not in use pic_index: %d", cuviddisp->picture_index);
        surface_in_use[cuviddisp->picture_index] = false;

        uchar *planes[] = {
            host_data,
            host_data + pitch * h
        };
        int pitches[] = { (int)pitch, (int)pitch };
        VideoFrame frame(codec_ctx->width, codec_ctx->height, VideoFormat::Format_NV12);
        frame.setBits(planes);
        frame.setBytesPerLine(pitches);
        //TODO: is clone required? may crash on clone, I should review clone()
        //frame = frame.clone();
        if (outFrame) {
            *outFrame = frame.clone();
        }
#if COPY_ON_DECODE
        frame_queue.put(frame.clone());
#endif
        //qDebug("frame queue size: %d", frame_queue.size());
    }
    return true;
}
Пример #22
0
static CUT_THREADPROC dt_thread_func(void *p)
{
	dt_partition *pt = (dt_partition *)p;
	struct timeval tv;
	CUresult res;
	int thread_num_x=0, thread_num_y=0;
	int block_num_x=0, block_num_y=0;

	res = cuCtxSetCurrent(ctx[pt->pid]);
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSetCurrent(ctx[%d]) failed: res = %s\n", pt->pid, cuda_response_to_string(res));
		exit(1);
	}

	/* allocate GPU memory */

	//printf("part_error_array_num = %d\n",part_error_array_num);


	if(pt->pid == 0){
		gettimeofday(&tv_memcpy_start, NULL);
	}

	res = cuMemcpyHtoD(part_C_dev[pt->pid], dst_C, SUM_SIZE_C);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(part_C_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(part_error_array_dev[pt->pid], part_error_array, part_error_array_num*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(part_error_array_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(pm_size_array_dev[pt->pid], &pt->size_array[0][0], pt->NoP*2*pt->L_MAX*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(pm_size_array_dev) falied: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(def_array_dev[pt->pid], pt->def, sum_size_def_array);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(def_array_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(numpart_dev[pt->pid], pt->numpart, pt->NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(cuMemcpyHtoD(numpart_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(PIDX_array_dev[pt->pid], pt->dst_PIDX, pt->tmp_array_size);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(PIDX_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(DID_4_array_dev[pt->pid], pt->dst_DID_4, pt->tmp_array_size);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(DID_4__array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}


	if(pt->pid == 0){
		gettimeofday(&tv_memcpy_end, NULL);
		tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
		time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}

	int sharedMemBytes = 0;

	/* get max thread num per block */
	int max_threads_num = 0;
	res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[pt->pid]);
	if(res != CUDA_SUCCESS){
		printf("\ncuDeviceGetAttribute() failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	/* prepare for launch inverse_Q */
	void* kernel_args_inverse[] = {
		&part_C_dev[pt->pid],
		&pm_size_array_dev[pt->pid],
		&part_error_array_dev[pt->pid],
		&part_error_array_num,
		(void*)&(pt->NoP),
		&PIDX_array_dev[pt->pid],
		&numpart_dev[pt->pid],
		(void*)&(pt->NoC),
		(void*)&(pt->max_numpart),
		(void*)&(pt->interval),
		(void*)&(pt->L_MAX),
		(void*)&(pt->pid),
		(void*)&(device_num)
	};

	/* define CUDA block shape */
	int upper_limit_th_num_x = max_threads_num/(pt->max_numpart*pt->NoC);
	int upper_limit_th_num_y = max_threads_num/upper_limit_th_num_x;
	if(upper_limit_th_num_x < 1) upper_limit_th_num_x++;
	if(upper_limit_th_num_y < 1) upper_limit_th_num_y++;

	thread_num_x = (pt->max_dim0*pt->max_dim1 < upper_limit_th_num_x) ? (pt->max_dim0*pt->max_dim1) : upper_limit_th_num_x;
	thread_num_y = (pt->max_numpart < upper_limit_th_num_y) ? pt->max_numpart : upper_limit_th_num_y;

	block_num_x = (pt->max_dim0*pt->max_dim1) / thread_num_x;
	block_num_y = (pt->max_numpart) / thread_num_y;
	if((pt->max_dim0*pt->max_dim1) % thread_num_x != 0) block_num_x++;
	if(pt->max_numpart % thread_num_y != 0) block_num_y++;


	int blockDimY = thread_num_y / device_num;
	if(thread_num_y%device_num != 0){
		blockDimY++;
	}

	/* launch iverse_Q */
	if(pt->pid == 0){
		gettimeofday(&tv_kernel_start, NULL);
	}
	res = cuLaunchKernel(
		func_inverse_Q[pt->pid],      // call function
		block_num_x,         // gridDimX
		block_num_y,         // gridDimY
		pt->L_MAX-pt->interval,      // gridDimZ
		thread_num_x,        // blockDimX
		blockDimY,        // blockDimY
		pt->NoC,                 // blockDimZ
		sharedMemBytes,      // sharedMemBytes
		NULL,                // hStream
		kernel_args_inverse, // kernelParams
		NULL                 // extra
		);
	if(res != CUDA_SUCCESS) {
		printf("block_num_x %d, block_num_y %d, thread_num_x %d, thread_num_y %d\n", block_num_x, block_num_y, thread_num_x, thread_num_y);
		printf("cuLaunchKernel(inverse_Q) failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(inverse_Q) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	if(pt->pid == 0){
		gettimeofday(&tv_kernel_end, NULL);
		tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
		time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}


	/* prepare for launch dt1d_x */
	void* kernel_args_x[] = {
		&part_C_dev[pt->pid],                  // FLOAT *src_start
		&tmpM_dev[pt->pid],                    // FLOTA *dst
		&tmpIy_dev[pt->pid],                   // int *ptr
		&DID_4_array_dev[pt->pid],             // int *DID_4_array,
		&def_array_dev[pt->pid],               // FLOAT *def_array,
		&pm_size_array_dev[pt->pid],           // int *size_array
		(void*)&(pt->NoP),                  // int NoP
		&PIDX_array_dev[pt->pid],              // int *PIDX_array
		&part_error_array_dev[pt->pid],        // int *error_array
		(void*)&(part_error_array_num), // int error_array_num
		&numpart_dev[pt->pid],                 // int *numpart
		(void*)&(pt->NoC),                  // int NoC
		(void*)&(pt->max_numpart),          // int max_numpart
		(void*)&(pt->interval),             // int interval
		(void*)&(pt->L_MAX),                 // int L_MAX
		(void*)&(pt->pid),                   // int pid
		(void*)&(device_num)                 // int device_num
	};


	max_threads_num = 64/pt->NoC;
	if(max_threads_num < 1) max_threads_num++;

	thread_num_x = (pt->max_dim1 < max_threads_num) ? pt->max_dim1 : max_threads_num;
	thread_num_y = (pt->max_numpart < max_threads_num) ? pt->max_numpart : max_threads_num;

	block_num_x = pt->max_dim1 / thread_num_x;
	block_num_y = pt->max_numpart / thread_num_y;
	if(pt->max_dim1 % thread_num_x != 0) block_num_x++;
	if(pt->max_numpart % thread_num_y != 0) block_num_y++;

	blockDimY = thread_num_y / device_num;
	if(thread_num_y%device_num != 0){
		blockDimY++;
	}

	/* launch dt1d_x */
	if(pt->pid == 0){
		gettimeofday(&tv_kernel_start, NULL);
	}

	res = cuLaunchKernel(
		func_dt1d_x[pt->pid],    // call function
		block_num_x,    // gridDimX
		block_num_y,    // gridDimY
		pt->L_MAX-pt->interval, // gridDimZ
		thread_num_x,   // blockDimX
		blockDimY,   // blockDimY
		pt->NoC,            // blockDimZ
		sharedMemBytes, // sharedMemBytes
		NULL,           // hStream
		kernel_args_x,  // kernelParams
		NULL            // extra
		);
	if(res != CUDA_SUCCESS) {

		printf("block_num_x %d, block_num_y %d, thread_num_x %d, thread_num_y %d\n", block_num_x, block_num_y, thread_num_x, thread_num_y);

		printf("cuLaunchKernel(dt1d_x) failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(dt1d_x) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	if(pt->pid == 0){
		gettimeofday(&tv_kernel_end, NULL);
		tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
		time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}


	/* prepare for launch dt1d_y */
	void* kernel_args_y[] = {
		&tmpM_dev[pt->pid],                    // FLOAT *src_start
		&M_dev[pt->pid],                       // FLOAT *dst_start
		&tmpIx_dev[pt->pid],                   // int *ptr_start
		&DID_4_array_dev[pt->pid],             // int *DID_4_array,
		&def_array_dev[pt->pid],               // FLOAT *def_array,
		(void*)&(pt->NoP),                  // int NoP
		&pm_size_array_dev[pt->pid],           // int *size_array
		&numpart_dev[pt->pid],                 // int *numpart,
		&PIDX_array_dev[pt->pid],              // int *PIDX_array,
		(void*)&(pt->NoC),                  // int NoC
		(void*)&(pt->max_numpart),          // int max_numpart
		(void*)&(pt->interval),             // int interval
		(void*)&(pt->L_MAX),                // int L_MAX
		&part_error_array_dev[pt->pid],        // int *error_array
		(void*)&(part_error_array_num), // int error_array_num
		(void*)&(pt->pid),                   // int pid
		(void*)&(device_num)                 // int device_num
	};


	thread_num_x = (pt->max_dim0 < max_threads_num) ? pt->max_dim0 : max_threads_num;
	thread_num_y = (pt->max_numpart < max_threads_num) ? pt->max_numpart : max_threads_num;

	block_num_x = pt->max_dim0 / thread_num_x;
	block_num_y = pt->max_numpart / thread_num_y;
	if(pt->max_dim0 % thread_num_x != 0) block_num_x++;
	if(pt->max_numpart % thread_num_y != 0) block_num_y++;

	blockDimY = thread_num_y / device_num;
	if(thread_num_y%device_num != 0){
		blockDimY++;
	}

	/* prepare for launch dt1d_y */
	if(pt->pid == 0){
		gettimeofday(&tv_kernel_start, NULL);
	}

	res = cuLaunchKernel(
		func_dt1d_y[pt->pid],    // call functions
		block_num_x,    // gridDimX
		block_num_y,    // gridDimY
		pt->L_MAX-pt->interval, // gridDimZ
		thread_num_x,   // blockDimX
		blockDimY,   // blockDimY
		pt->NoC,            // blockDimZ
		sharedMemBytes, // sharedMemBytes
		NULL,           // hStream
		kernel_args_y,  // kernelParams
		NULL            // extra
		);
	if(res != CUDA_SUCCESS) {
		printf("cuLaunchKernel(dt1d_y failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}


	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(dt1d_y) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	if(pt->pid == 0){
		gettimeofday(&tv_kernel_end, NULL);
		tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
		time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}



	/* downloads datas from GPU */

	/* downloads M from GPU */

	int sum_part_size = 0;
	int sum_pointer_size = 0;
	int sum_move_size = 0;
	int part_size = 0;
	int pointer_size = 0;
	int part_y = 0;
	int move_size = 0;
	int start_kk = 0;
	int end_kk = 0;
	int part_end_kk = 0;
	unsigned long long int pointer_dst_M = (unsigned long long int)pt->dst_M;
	unsigned long long int pointer_M_dev = (unsigned long long int)M_dev[pt->pid];

	for(int L=0; L<(pt->L_MAX-pt->interval); L++) {

		/**************************************************************************/
		/* loop condition */
		if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
		{
			continue;
		}
		/* loop conditon */
		/**************************************************************************/


		for(int jj=0; jj<pt->NoC; jj++) {

			part_y = pt->numpart[jj] / device_num;
			if(pt->numpart[jj]%device_num != 0){
				part_y++;
			}


			start_kk = part_y * pt->pid;
			end_kk = part_y * (pt->pid + 1);

			if(end_kk > pt->numpart[jj]){
				end_kk = pt->numpart[jj];
			}

			if(pt->pid > 0){
				part_end_kk = part_y * pt->pid;
			}

			for(int kk=0; kk<pt->numpart[jj]; kk++) {

				int PIDX = pt->PIDX_array[L][jj][kk];
				int dims0 = pt->size_array[L][PIDX*2];
				int dims1 = pt->size_array[L][PIDX*2+1];
				if(start_kk <= kk && kk < end_kk){
					part_size += dims0 * dims1;
				}
				//if(pt->pid > 0 && part_start_kk <= kk && kk < part_end_kk){
				if(pt->pid > 0 && 0 <= kk && kk < part_end_kk){
					pointer_size += dims0 * dims1;
				}
				move_size += dims0 * dims1;
			}

			sum_part_size += part_size;
			sum_pointer_size += pointer_size;
			sum_move_size += move_size;

			// error pt->pid == 2 && L == 24 && jj == 1

			if(pt->pid*part_y < pt->numpart[jj]){

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_start, NULL);
				}


				res = cuMemcpyDtoH((void *)(pointer_dst_M+(unsigned long long int)(pointer_size*sizeof(FLOAT))), (CUdeviceptr)(pointer_M_dev+(unsigned long long int)(pointer_size*sizeof(FLOAT))), part_size*sizeof(FLOAT));
				if(res != CUDA_SUCCESS) {
					printf("error pid = %d\n",pt->pid);
					printf("cuMemcpyDtoH(dst_M) failed: res = %s\n", cuda_response_to_string(res));
					exit(1);
				}

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_end, NULL);
					tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
					time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
				}

			}

			pointer_dst_M += (unsigned long long int)(move_size * sizeof(FLOAT));
			pointer_M_dev += (unsigned long long int)(move_size * sizeof(FLOAT));

			part_size = 0;
			pointer_size = 0;
			move_size = 0;

		}

	}


	/* downloads tmpIx from GPU */

	sum_part_size = 0;
	sum_pointer_size = 0;
	part_size = 0;
	pointer_size = 0;
	part_y = 0;
	move_size = 0;
	start_kk = 0;
	end_kk = 0;
	part_end_kk = 0;
	unsigned long long int pointer_dst_tmpIx = (unsigned long long int)pt->dst_tmpIx;
	unsigned long long int pointer_tmpIx_dev = (unsigned long long int)tmpIx_dev[pt->pid];


	for(int L=0; L<(pt->L_MAX-pt->interval); L++) {

		/**************************************************************************/
		/* loop condition */
		if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
		{
			continue;
		}
		/* loop conditon */
		/**************************************************************************/


		for(int jj=0; jj<pt->NoC; jj++) {

			part_y = pt->numpart[jj] / device_num;
			if(pt->numpart[jj]%device_num != 0){
				part_y++;
			}

			start_kk = part_y * pt->pid;
			end_kk = part_y * (pt->pid + 1);

			if(end_kk > pt->numpart[jj]){
				end_kk = pt->numpart[jj];
			}

			if(pt->pid > 0){
				part_end_kk = part_y * pt->pid;
			}

			for(int kk=0; kk<pt->numpart[jj]; kk++) {

				int PIDX = pt->PIDX_array[L][jj][kk];
				int dims0 = pt->size_array[L][PIDX*2];
				int dims1 = pt->size_array[L][PIDX*2+1];
				if(start_kk <= kk && kk < end_kk){
					part_size += dims0 * dims1;
				}
				if(pt->pid > 0){
					if(0 <= kk && kk < part_end_kk){
						pointer_size += dims0 * dims1;
					}
				}
				move_size += dims0 * dims1;
			}

			sum_part_size += part_size;
			sum_pointer_size += pointer_size;


			if(pt->pid*part_y < pt->numpart[jj]){

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_start, NULL);
				}


				res = cuMemcpyDtoH((void *)(pointer_dst_tmpIx+(unsigned long long int)(pointer_size*sizeof(int))), (CUdeviceptr)(pointer_tmpIx_dev+(unsigned long long int)(pointer_size*sizeof(int))), part_size*sizeof(int));
				if(res != CUDA_SUCCESS) {
					printf("error pid = %d\n",pt->pid);
					printf("cuMemcpyDtoH(tmpIx) failed: res = %s\n", cuda_response_to_string(res));
					exit(1);
				}

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_end, NULL);
					tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
					time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
				}

			}

			pointer_dst_tmpIx += (unsigned long long int)(move_size * sizeof(int));
			pointer_tmpIx_dev += (unsigned long long int)(move_size * sizeof(int));

			part_size = 0;
			pointer_size = 0;
			move_size = 0;

		}

	}


	/* downloads tmpIy from GPU */

	sum_part_size = 0;
	sum_pointer_size = 0;
	part_size = 0;
	pointer_size = 0;
	part_y = 0;
	move_size = 0;
	start_kk = 0;
	end_kk = 0;
	part_end_kk = 0;
	unsigned long long int pointer_dst_tmpIy = (unsigned long long int)pt->dst_tmpIy;
	unsigned long long int pointer_tmpIy_dev = (unsigned long long int)tmpIy_dev[pt->pid];


	for(int L=0; L<(pt->L_MAX-pt->interval); L++) {

		/**************************************************************************/
		/* loop condition */
		if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
		{
			continue;
		}
		/* loop conditon */
		/**************************************************************************/


		for(int jj=0; jj<pt->NoC; jj++) {

			part_y = pt->numpart[jj] / device_num;
			if(pt->numpart[jj]%device_num != 0){
				part_y++;
			}

			start_kk = part_y * pt->pid;
			end_kk = part_y * (pt->pid + 1);

			if(end_kk > pt->numpart[jj]){
				end_kk = pt->numpart[jj];
			}

			if(pt->pid > 0){
				part_end_kk = part_y * pt->pid;
			}

			for(int kk=0; kk<pt->numpart[jj]; kk++) {

				int PIDX = pt->PIDX_array[L][jj][kk];
				int dims0 = pt->size_array[L][PIDX*2];
				int dims1 = pt->size_array[L][PIDX*2+1];
				if(start_kk <= kk && kk < end_kk){
					part_size += dims0 * dims1;
				}
				if(pt->pid > 0){
					if(0 <= kk && kk < part_end_kk){
						pointer_size += dims0 * dims1;
					}
				}
				move_size += dims0 * dims1;
			}

			sum_part_size += part_size;
			sum_pointer_size += pointer_size;

			if(pt->pid*part_y < pt->numpart[jj]){

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_start, NULL);
				}

				res = cuMemcpyDtoH((void *)(pointer_dst_tmpIy+(unsigned long long int)(pointer_size*sizeof(int))), (CUdeviceptr)(pointer_tmpIy_dev+(unsigned long long int)(pointer_size*sizeof(int))), part_size*sizeof(int));
				if(res != CUDA_SUCCESS) {
					printf("error pid = %d\n",pt->pid);
					printf("cuMemcpyDtoH(tmpIy) failed: res = %s\n", cuda_response_to_string(res));
					exit(1);
				}

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_end, NULL);
					tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
					time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
				}

			}

			pointer_dst_tmpIy += (unsigned long long int)(move_size * sizeof(int));
			pointer_tmpIy_dev += (unsigned long long int)(move_size * sizeof(int));

			part_size = 0;
			pointer_size = 0;
			move_size = 0;

		}

	}


	/* end of thread */
	CUT_THREADEND;
}
Пример #23
0
void draw()
{
    rmt_LogText("start profiling");

    //rmt_BeginCPUSample(uv_run);
    uv_run(uv_default_loop(), UV_RUN_NOWAIT);
    //rmt_EndCPUSample();

    CUstream stream0 = 0;
    rmt_BeginCUDASample(main, stream0);
    {
        if (isResized())
        {
            setupSizeResource();
        }
        // Launch the Vector Add CUDA Kernel
        int threadsPerBlock = 256;
        int blocksPerGrid = (img.width * img.height + threadsPerBlock - 1) / threadsPerBlock;
        //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
        dim3 blockDim = { 32, 32, 1 };
        dim3 gridDim = { width / blockDim.x, height / blockDim.y, 1 };

        float3 iResolution = { width, height, 1 };
        float iGlobalTime = glfwGetTime();
        float4 iMouse = { mouseX, mouseY, mouseX, mouseY };
        rmt_BeginCUDASample(cuMemcpyHtoD, stream0);
        checkCudaErrors(cuMemcpyHtoD(d_iResolution, &iResolution, sizeof iResolution));
        checkCudaErrors(cuMemcpyHtoD(d_iGlobalTime, &iGlobalTime, sizeof iGlobalTime));
        checkCudaErrors(cuMemcpyHtoD(d_iMouse, &iMouse, sizeof iMouse));
        rmt_EndCUDASample(stream0);

        rmt_BeginCUDASample(cuLaunchKernel, stream0);
        checkCudaErrors(cuLaunchKernel(kernel_addr,
            gridDim.x, gridDim.y, gridDim.z, /* grid dim */
            blockDim.x, blockDim.y, blockDim.z, /* block dim */
            0, 0, /* shared mem, stream */
            0, /* arguments */
            0));
        rmt_EndCUDASample(stream0);
        rmt_BeginCUDASample(cuCtxSynchronize, stream0);
        checkCudaErrors(cuCtxSynchronize());
        rmt_EndCUDASample(stream0);
        
        rmt_BeginCUDASample(cuMemcpyDtoH, stream0);
        checkCudaErrors(cuMemcpyDtoH(img_content, d_img_content, item_size));
        rmt_EndCUDASample(stream0);
    }
    rmt_EndCUDASample(stream0);

    rmt_BeginOpenGLSample(main);
    {
        background(color(0,0,0));
        updateImage(img, img_content);
        image(img, 0, 0, width, height);

        TwDraw();
    }
    rmt_EndOpenGLSample();

    rmt_LogText("end profiling");
}
Пример #24
0
static void calc_a_score_GPU(FLOAT *ac_score,  FLOAT **score,
			     int *ssize_start,  Model_info *MI,
			     FLOAT scale, int *size_score_array,
			     int NoC)
{
	CUresult res;

	const int IHEI = MI->IM_HEIGHT;
	const int IWID = MI->IM_WIDTH;
	int pady_n = MI->pady;
	int padx_n = MI->padx;
	int block_pad = (int)(scale/2.0);

	struct timeval tv;

	int *RY_array, *RX_array;
	res = cuMemHostAlloc((void**)&RY_array, NoC*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP);
	if(res != CUDA_SUCCESS) {
		printf("cuMemHostAlloc(RY_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemHostAlloc((void**)&RX_array, NoC*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP);
	if(res != CUDA_SUCCESS) {
		printf("cuMemHostAlloc(RX_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	for(int i = 0; i < NoC; i++) {
		int rsize[2] = {MI->rsize[i*2], MI->rsize[i*2+1]};

		RY_array[i] = (int)((FLOAT)rsize[0]*scale/2.0-1.0+block_pad);
		RX_array[i] = (int)((FLOAT)rsize[1]*scale/2.0-1.0+block_pad);
	}

	CUdeviceptr ac_score_dev, score_dev;
	CUdeviceptr ssize_dev, size_score_dev;
	CUdeviceptr RY_dev, RX_dev;

	int size_score=0;
	for(int i = 0; i < NoC; i++) {
		size_score += size_score_array[i];
	}

	/* allocate GPU memory */
	res = cuMemAlloc(&ac_score_dev, gpu_size_A_SCORE);
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(ac_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&score_dev, size_score);
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&ssize_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(ssize) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&size_score_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(size_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&RY_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(RY) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&RX_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(RX) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	gettimeofday(&tv_memcpy_start, nullptr);
	/* upload date to GPU */
	res = cuMemcpyHtoD(ac_score_dev, &ac_score[0], gpu_size_A_SCORE);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(ac_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(score_dev, &score[0][0], size_score);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(ssize_dev, &ssize_start[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(ssize) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(size_score_dev, &size_score_array[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(size_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(RY_dev, &RY_array[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(RY) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(RX_dev, &RX_array[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(RX) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	gettimeofday(&tv_memcpy_end, nullptr);
	tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
	time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	void* kernel_args[] = {
		(void*)&IWID,
		(void*)&IHEI,
		(void*)&scale,
		(void*)&padx_n,
		(void*)&pady_n,
		&RX_dev,
		&RY_dev,
		&ac_score_dev,
		&score_dev,
		&ssize_dev,
		(void*)&NoC,
		&size_score_dev
	};

	int sharedMemBytes = 0;

	/* define CUDA block shape */
	int max_threads_num = 0;
	int thread_num_x, thread_num_y;
	int block_num_x, block_num_y;

	res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[0]);
	if(res != CUDA_SUCCESS){
		printf("\ncuDeviceGetAttribute() failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	NR_MAXTHREADS_X[0] = (int)sqrt((double)max_threads_num/NoC);
	NR_MAXTHREADS_Y[0] = (int)sqrt((double)max_threads_num/NoC);

	thread_num_x = (IWID < NR_MAXTHREADS_X[0]) ? IWID : NR_MAXTHREADS_X[0];
	thread_num_y = (IHEI < NR_MAXTHREADS_Y[0]) ? IHEI : NR_MAXTHREADS_Y[0];

	block_num_x = IWID / thread_num_x;
	block_num_y = IHEI / thread_num_y;
	if(IWID % thread_num_x != 0) block_num_x++;
	if(IHEI % thread_num_y != 0) block_num_y++;

	gettimeofday(&tv_kernel_start, nullptr);
	/* launch GPU kernel */
	res = cuLaunchKernel(
		func_calc_a_score[0], // call function
		block_num_x,       // gridDimX
		block_num_y,       // gridDimY
		1,                 // gridDimZ
		thread_num_x,      // blockDimX
		thread_num_y,      // blockDimY
		NoC,               // blockDimZ
		sharedMemBytes,    // sharedMemBytes
		nullptr,              // hStream
		kernel_args,       // kernelParams
		nullptr               // extra
		);
	if(res != CUDA_SUCCESS) {
		printf("cuLaunchKernel(calc_a_score) failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(calc_a_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}
	gettimeofday(&tv_kernel_end, nullptr);
	tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
	time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	gettimeofday(&tv_memcpy_start, nullptr);
	/* download data from GPU */
	res = cuMemcpyDtoH(ac_score, ac_score_dev, gpu_size_A_SCORE);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH(ac_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	gettimeofday(&tv_memcpy_end, nullptr);
	tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
	time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	/* free GPU memory */
	res = cuMemFree(ac_score_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(ac_score_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(score_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(score_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(ssize_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(ssize_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(size_score_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(size_score_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(RY_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(RY_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(RX_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(RX_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	/* free CPU memory */
	res = cuMemFreeHost(RY_array);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(RY_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFreeHost(RX_array);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(RX_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}
}
Пример #25
0
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  for (i = 0; i != 3; i++)
    if (targ_fn->launch->dim[i])
      dims[i] = targ_fn->launch->dim[i];

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device. HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn,
		     dims[0], dims[1], dims[2]);

  // OpenACC		CUDA
  //
  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
		      dims[GOMP_DIM_GANG], 1, 1,
		      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
int main(int argc, char **argv)
{
    // Start logs
    printf("[%s] - Starting...\n", argv[0]);

    //'h_' prefix - CPU (host) memory space
    float
    //Results calculated by CPU for reference
    *h_CallResultCPU,
    *h_PutResultCPU,
    //CPU copy of GPU results
    *h_CallResultGPU,
    *h_PutResultGPU,
    //CPU instance of input data
    *h_StockPrice,
    *h_OptionStrike,
    *h_OptionYears;

    //'d_' prefix - GPU (device) memory space
    CUdeviceptr
    //Results calculated by GPU
    d_CallResult,
    d_PutResult,

    //GPU instance of input data
    d_StockPrice,
    d_OptionStrike,
    d_OptionYears;

    double
    delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime;

    StopWatchInterface *hTimer = NULL;
    int i;

    sdkCreateTimer(&hTimer);

    printf("Initializing data...\n");
    printf("...allocating CPU memory for options.\n");

    h_CallResultCPU = (float *)malloc(OPT_SZ);
    h_PutResultCPU  = (float *)malloc(OPT_SZ);
    h_CallResultGPU = (float *)malloc(OPT_SZ);
    h_PutResultGPU  = (float *)malloc(OPT_SZ);
    h_StockPrice    = (float *)malloc(OPT_SZ);
    h_OptionStrike  = (float *)malloc(OPT_SZ);
    h_OptionYears   = (float *)malloc(OPT_SZ);


    char *ptx, *kernel_file;
    size_t ptxSize;
    kernel_file = sdkFindFilePath("BlackScholes_kernel.cuh", argv[0]);

    // Set a Compiler Option to have maximum register to be used by each thread.
    char *compile_options[1];
    compile_options[0] = (char *) malloc(sizeof(char)*(strlen("--maxrregcount=16")));
    strcpy((char *)compile_options[0],"--maxrregcount=16");

    // Compile the kernel BlackScholes_kernel.
    compileFileToPTX(kernel_file, 1, (const char **)compile_options, &ptx, &ptxSize);
    CUmodule module = loadPTX(ptx, argc, argv);

    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "BlackScholesGPU"));

    printf("...allocating GPU memory for options.\n");
    checkCudaErrors(cuMemAlloc(&d_CallResult, OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_PutResult, OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_StockPrice, OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_OptionStrike,OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_OptionYears, OPT_SZ));

    printf("...generating input data in CPU mem.\n");
    srand(5347);

    //Generate options set
    for (i = 0; i < OPT_N; i++)
    {
        h_CallResultCPU[i] = 0.0f;
        h_PutResultCPU[i]  = -1.0f;
        h_StockPrice[i]    = RandFloat(5.0f, 30.0f);
        h_OptionStrike[i]  = RandFloat(1.0f, 100.0f);
        h_OptionYears[i]   = RandFloat(0.25f, 10.0f);
    }

    printf("...copying input data to GPU mem.\n");
    //Copy options data to GPU memory for further processing
    checkCudaErrors(cuMemcpyHtoD(d_StockPrice, h_StockPrice, OPT_SZ));
    checkCudaErrors(cuMemcpyHtoD(d_OptionStrike, h_OptionStrike, OPT_SZ));
    checkCudaErrors(cuMemcpyHtoD(d_OptionYears, h_OptionYears, OPT_SZ));

    printf("Data init done.\n\n");
    printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", NUM_ITERATIONS);

    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    dim3 cudaBlockSize( 128, 1, 1);
    dim3 cudaGridSize(DIV_UP(OPT_N/2, 128),1,1);

    float risk = RISKFREE;
    float volatility = VOLATILITY;
    int optval = OPT_N;

    void *arr[] = { (void *)&d_CallResult, (void *)&d_PutResult, (void *)&d_StockPrice,
        (void *)&d_OptionStrike, (void *)&d_OptionYears, (void *)&risk, (void *)&volatility, (void *)&optval };

    for (i = 0; i < NUM_ITERATIONS; i++)
    {

        checkCudaErrors(cuLaunchKernel(kernel_addr,
                                            cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, /* grid dim */
                                            cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
                                            0,0, /* shared mem, stream */
                                            &arr[0], /* arguments */
                                            0));

    }

    checkCudaErrors(cuCtxSynchronize());

    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;

    //Both call and put is calculated
    printf("Options count             : %i     \n", 2 * OPT_N);
    printf("BlackScholesGPU() time    : %f msec\n", gpuTime);
    printf("Effective memory bandwidth: %f GB/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3));
    printf("Gigaoptions per second    : %f     \n\n", ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3));
    printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %u, Workgroup = %u\n",
           (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime*1e-3, (2 * OPT_N), 1, 128);

    printf("\nReading back GPU results...\n");

    //Read back GPU results to compare them to CPU results
    checkCudaErrors(cuMemcpyDtoH(h_CallResultGPU, d_CallResult, OPT_SZ));
    checkCudaErrors(cuMemcpyDtoH(h_PutResultGPU, d_PutResult, OPT_SZ));

    printf("Checking the results...\n");
    printf("...running CPU calculations.\n\n");

    //Calculate options values on CPU
    BlackScholesCPU(
        h_CallResultCPU,
        h_PutResultCPU,
        h_StockPrice,
        h_OptionStrike,
        h_OptionYears,
        RISKFREE,
        VOLATILITY,
        OPT_N
    );

    printf("Comparing the results...\n");
    //Calculate max absolute difference and L1 distance
    //between CPU and GPU results
    sum_delta = 0;
    sum_ref   = 0;
    max_delta = 0;

    for (i = 0; i < OPT_N; i++)
    {
        ref   = h_CallResultCPU[i];
        delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]);

        if (delta > max_delta)
        {
            max_delta = delta;
        }

        sum_delta += delta;
        sum_ref   += fabs(ref);
    }

    L1norm = sum_delta / sum_ref;
    printf("L1 norm: %E\n", L1norm);
    printf("Max absolute error: %E\n\n", max_delta);

    printf("Shutting down...\n");
    printf("...releasing GPU memory.\n");

    checkCudaErrors(cuMemFree(d_OptionYears));
    checkCudaErrors(cuMemFree(d_OptionStrike));
    checkCudaErrors(cuMemFree(d_StockPrice));
    checkCudaErrors(cuMemFree(d_PutResult));
    checkCudaErrors(cuMemFree(d_CallResult));

    printf("...releasing CPU memory.\n");

    free(h_OptionYears);
    free(h_OptionStrike);
    free(h_StockPrice);
    free(h_PutResultGPU);
    free(h_CallResultGPU);
    free(h_PutResultCPU);
    free(h_CallResultCPU);

    sdkDeleteTimer(&hTimer);
    printf("Shutdown done.\n");

    printf("\n[%s] - Test Summary\n", argv[0]);

    cuProfilerStop();

    if (L1norm > 1e-6)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed\n");
    exit(EXIT_SUCCESS);
}
Пример #27
0
int cuda_test_madd_vmmap_hybrid(unsigned int n, char *path)
{
	int i, j, idx;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUfunction function;
	CUmodule module;
	CUdeviceptr a_dev, b_dev, c_dev;
	unsigned int *a_buf, *b_buf, *c_buf;
	unsigned long long int a_phys, b_phys, c_phys;
	unsigned int *c = (unsigned int *) malloc (n*n * sizeof(unsigned int));
	int block_x, block_y, grid_x, grid_y;
	char fname[256];
	int ret = 0;
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	float total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;
	struct timeval tv_exec_start, tv_exec_end;
	struct timeval tv_mem_alloc_start;
	struct timeval tv_data_init_start;
	float data_init;
	struct timeval tv_conf_kern_start;
	struct timeval tv_close_start;
	float mem_alloc;
	float exec;
	float init_gpu;
	float configure_kernel;
	float close_gpu;
	float data_read;

	unsigned int dummy_b, dummy_c;
		

	/* block_x * block_y should not exceed 512. */
	block_x = n < 16 ? n : 16;
	block_y = n < 16 ? n : 16;
	grid_x = n / block_x;
	if (n % block_x != 0)
		grid_x++;
	grid_y = n / block_y;
	if (n % block_y != 0)
		grid_y++;

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	sprintf(fname, "%s/madd_gpu.cubin", path);
	res = cuModuleLoad(&module, fname);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		return -1;
	}
	res = cuModuleGetFunction(&function, module, "_Z3addPjS_S_j");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		return -1;
	}
	res = cuFuncSetBlockShape(function, block_x, block_y, 1);
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetBlockShape() failed\n");
		return -1;
	}

	gettimeofday(&tv_mem_alloc_start, NULL);

	/* a[] */
	res = cuMemAlloc(&a_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (a) failed\n");
		return -1;
	}
	res = cuMemMap((void**)&a_buf, a_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (a) failed\n");
		return -1;
	}
	res = cuMemGetPhysAddr(&a_phys, (void*)a_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (a) failed\n");
		return -1;
	}
	/*printf("a[]: Physical Address 0x%llx\n", a_phys);*/

	/* b[] */
	res = cuMemAlloc(&b_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (b) failed\n");
		return -1;
	}
	res = cuMemMap((void**)&b_buf, b_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (b) failed\n");
		return -1;
	}
	res = cuMemGetPhysAddr(&b_phys, (void*)b_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (b) failed\n");
		return -1;
	}
	/*printf("b[]: Physical Address 0x%llx\n", b_phys);*/

	/* c[] */
	res = cuMemAlloc(&c_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (c) failed\n");
		return -1;
	}
	res = cuMemMap((void**)&c_buf, c_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (c) failed\n");
		return -1;
	}
	res = cuMemGetPhysAddr(&c_phys, (void*)c_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (c) failed\n");
		return -1;
	}
	/*printf("c[]: Physical Address 0x%llx\n", c_phys);*/

	gettimeofday(&tv_data_init_start, NULL);

	/* initialize A[] & B[] */
	for (i = 0; i < n; i++) {
		idx = i*n;
		for(j = 0; j < n; j++) {			
			a_buf[idx++] = i;
		}
	}
	for (i = 0; i < n; i++) {
		idx = i*n;
		for(j = 0; j < n; j++) {
			b_buf[idx++] = i;
		}
	}

	gettimeofday(&tv_h2d_start, NULL);
	gettimeofday(&tv_h2d_end, NULL);


	gettimeofday(&tv_conf_kern_start, NULL);

	/* set kernel parameters */
	res = cuParamSeti(function, 0, a_dev);	
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 4, a_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 8, b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 12, b_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 16, c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 20, c_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 24, n);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSetSize(function, 28);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_exec_start, NULL);
	/* launch the kernel */
	res = cuLaunchGrid(function, grid_x, grid_y);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	cuCtxSynchronize();
	gettimeofday(&tv_exec_end, NULL);


	gettimeofday(&tv_d2h_start, NULL);
	/* download c[] */


	memcpy(c, c_buf, n*n*sizeof(unsigned int));

	gettimeofday(&tv_d2h_end, NULL);

	/* Read back */
	for (i = 0; i < n; i++) {
		idx = i*n;
		for(j = 0; j < n; j++) {			
			dummy_c = c[idx++];
		}
	}



	gettimeofday(&tv_close_start, NULL);

	res = cuMemUnmap((void*)a_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(a_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemUnmap((void*)b_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemUnmap((void*)c_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuModuleUnload(module);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_total_end, NULL);




	tvsub(&tv_mem_alloc_start, &tv_total_start, &tv);
	init_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_data_init_start, &tv_mem_alloc_start, &tv);
	mem_alloc = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_h2d_start, &tv_data_init_start, &tv);
	data_init = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_exec_start, &tv_conf_kern_start, &tv);
	configure_kernel = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_exec_end, &tv_exec_start, &tv);
	exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_close_start, &tv_d2h_end, &tv);
	data_read = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_total_end, &tv_close_start, &tv);
	close_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	printf("Init: %f\n", init_gpu);
	printf("MemAlloc: %f\n", mem_alloc);
	printf("DataInit: %f\n", data_init);
	printf("HtoD: %f\n", h2d);
	printf("KernConf: %f\n", configure_kernel);
	printf("Exec: %f\n", exec);
	printf("DtoH: %f\n", d2h);
	printf("DataRead: %f\n", data_read);
	printf("Close: %f\n", close_gpu);
	printf("Total: %f\n", total);


	return ret;
}
Пример #28
0
static CUresult synchronize() {
  CU_ERROR_CHECK(cuCtxSynchronize());
  return CUDA_SUCCESS;
}
Пример #29
0
int cuda_test_fmadd(unsigned int n, char *path)
{
	int i, j, idx;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUfunction function;
	CUmodule module;
	CUdeviceptr a_dev, b_dev, c_dev;
	float *a = (float *) malloc (n*n * sizeof(float));
	float *b = (float *) malloc (n*n * sizeof(float));
	float *c = (float *) malloc (n*n * sizeof(float));
	int block_x, block_y, grid_x, grid_y;
	int offset;
	char fname[256];
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	float total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;
	struct timeval tv_exec_start, tv_exec_end;
	float exec;

	/* initialize A[] & B[] */
	for (i = 0; i < n; i++) {
		for(j = 0; j < n; j++) {
			idx = i * n + j;
			a[idx] = i + 0.1;
			b[idx] = i + 0.1;
		}
	}

	/* block_x * block_y should not exceed 512. */
	block_x = n < 16 ? n : 16;
	block_y = n < 16 ? n : 16;
	grid_x = n / block_x;
	if (n % block_x != 0)
		grid_x++;
	grid_y = n / block_y;
	if (n % block_y != 0)
		grid_y++;
	printf("block = (%d, %d)\n", block_x, block_y);
	printf("grid = (%d, %d)\n", grid_x, grid_y);

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	sprintf(fname, "%s/fmadd_gpu.cubin", path);
	res = cuModuleLoad(&module, fname);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		return -1;
	}
	res = cuModuleGetFunction(&function, module, "_Z3addPfS_S_i");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		return -1;
	}
	res = cuFuncSetSharedSize(function, 0x40); /* just random */
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetSharedSize() failed\n");
		return -1;
	}
	res = cuFuncSetBlockShape(function, block_x, block_y, 1);
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetBlockShape() failed\n");
		return -1;
	}

	/* a[] */
	res = cuMemAlloc(&a_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (a) failed\n");
		return -1;
	}
	/* b[] */
	res = cuMemAlloc(&b_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (b) failed\n");
		return -1;
	}
	/* c[] */
	res = cuMemAlloc(&c_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (c) failed\n");
		return -1;
	}

	gettimeofday(&tv_h2d_start, NULL);
	/* upload a[] and b[] */
	res = cuMemcpyHtoD(a_dev, a, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemcpyHtoD(b_dev, b, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	gettimeofday(&tv_h2d_end, NULL);

	/* set kernel parameters */
	offset = 0;
	res = cuParamSetv(function, offset, &a_dev, sizeof(a_dev));	
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(a_dev);
	res = cuParamSetv(function, offset, &b_dev, sizeof(b_dev));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(b_dev);
	res = cuParamSetv(function, offset, &c_dev, sizeof(c_dev));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(c_dev);
	res = cuParamSetv(function, offset, &n, sizeof(n));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(n);
	res = cuParamSetSize(function, offset);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_exec_start, NULL);
	/* launch the kernel */
	res = cuLaunchGrid(function, grid_x, grid_y);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	cuCtxSynchronize();
	gettimeofday(&tv_exec_end, NULL);

	gettimeofday(&tv_d2h_start, NULL);
	/* download c[] */
	res = cuMemcpyDtoH(c, c_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	gettimeofday(&tv_d2h_end, NULL);

	res = cuMemFree(a_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuModuleUnload(module);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_total_end, NULL);

	/* check the results */
	i = j = idx = 0;
	while (i < n) {
		while (j < n) {
			idx = i * n + j;
			if (c[idx] != a[idx] + b[idx]) {
				printf("c[%d] = %f\n", idx, c[idx]);
				printf("a[%d]+b[%d] = %f\n", idx, idx, a[idx]+b[idx]);
				return -1;
			}
			j++;
		}
		i++;
	}

	free(a);
	free(b);
	free(c);

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_exec_end, &tv_exec_start, &tv);
	exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	printf("HtoD: %f\n", h2d);
	printf("DtoH: %f\n", d2h);
	printf("Exec: %f\n", exec);
	printf("Time (Memcpy + Launch): %f\n", h2d + d2h + exec);
	printf("Total: %f\n", total);

	return 0;
}
Пример #30
0
void octree::compute_properties (tree_structure &tree) {
  
  #if 0
      fprintf(stderr,"This file is not up to date anymore! %s\n", __FILE__);
    exit(0);
  
  //Computes the tree-properties (size, cm, monopole, quadropole, etc)
  //start the kernel for the leaf-type nodes
  propsLeaf.set_arg<int>(0,    &tree.n_leafs);
  propsLeaf.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsLeaf.set_arg<cl_mem>(2, tree.node_bodies.p());
  propsLeaf.set_arg<cl_mem>(3, tree.bodies_Ppos.p());
//   propsLeaf.set_arg<cl_mem>(3, tree.bodies_pos.p());  
  propsLeaf.set_arg<cl_mem>(4, tree.multipole.p());
  propsLeaf.set_arg<cl_mem>(5, tree.nodeLowerBounds.p());
  propsLeaf.set_arg<cl_mem>(6, tree.nodeUpperBounds.p());
  propsLeaf.set_arg<cl_mem>(7, tree.lowerBounds.p());
  propsLeaf.set_arg<cl_mem>(8, tree.upperBounds.p());  
  propsLeaf.set_arg<cl_mem>(9, tree.bodies_Pvel.p());  //Velocity to get max eps  
  
  
  propsLeaf.setWork(tree.n_leafs, 128);
  printf("PropsLeaf: "); propsLeaf.printWorkSize();

  propsLeaf.execute();  


  int temp = tree.n_nodes-tree.n_leafs;
  propsNonLeaf.set_arg<int>(0,    &temp);
  propsNonLeaf.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsNonLeaf.set_arg<cl_mem>(2, tree.node_level_list.p());
  propsNonLeaf.set_arg<cl_mem>(3, tree.n_children.p());  
  propsNonLeaf.set_arg<cl_mem>(4, tree.multipole.p());
  propsNonLeaf.set_arg<cl_mem>(5, tree.nodeLowerBounds.p());
  propsNonLeaf.set_arg<cl_mem>(6, tree.nodeUpperBounds.p());


  for(int i=tree.n_levels; i >= 1; i--)
  {   
      propsNonLeaf.set_arg<int>(0,    &i);  
      {    
        vector<size_t> localWork(2), globalWork(2);
        int totalOnThisLevel;
      
        totalOnThisLevel = tree.node_level_list[i]-tree.node_level_list[i-1];
        
        propsNonLeaf.setWork(totalOnThisLevel, 128);
        
        printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t", i, totalOnThisLevel,
               tree.node_level_list[i-1], tree.node_level_list[i]); 
        propsNonLeaf.printWorkSize();
      }      
      propsNonLeaf.set_arg<int>(0,    &i); //set the level
      propsNonLeaf.execute();     
  }
  
  float theta2 = theta;
  
  propsScaling.set_arg<int>(0,    &tree.n_nodes);
  propsScaling.set_arg<real4>(1,  &tree.corner);
  propsScaling.set_arg<cl_mem>(2, tree.multipole.p());
  propsScaling.set_arg<cl_mem>(3, tree.nodeLowerBounds.p());
  propsScaling.set_arg<cl_mem>(4, tree.nodeUpperBounds.p());
  propsScaling.set_arg<cl_mem>(5, tree.n_children.p());  
  propsScaling.set_arg<cl_mem>(6, tree.node_data.p());
  propsScaling.set_arg<float >(7, &theta2);
  propsScaling.set_arg<cl_mem>(8, tree.boxSizeInfo.p());
  propsScaling.set_arg<cl_mem>(9, tree.boxCenterInfo.p());
  
  propsScaling.setWork(tree.n_nodes, 128);
  printf("propsScaling: \t "); propsScaling.printWorkSize();
  propsScaling.execute();     


  //tree.multipole.d2h();
  //printf("COM: %f %f %f %f \n",tree.multipole[0].x, tree.multipole[0].y, tree.multipole[0].z, tree.multipole[0].w);


  #ifdef USE_CUDA
    cuCtxSynchronize();
  #else
    clFinish(devContext.get_command_queue());
  #endif
  
  tree.nodeLowerBounds.d2h();
  tree.nodeUpperBounds.d2h();
  
  
  
  copyNodeDataToGroupData.set_arg<int>(0,    &tree.n_groups);
  copyNodeDataToGroupData.set_arg<int>(1,    &tree.n_nodes);
  copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.node_data.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_data.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.node_bodies.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.group_list.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(6, tree.boxCenterInfo.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(7, tree.boxSizeInfo.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(8, tree.groupCenterInfo.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(9, tree.groupSizeInfo.p());
  copyNodeDataToGroupData.setWork(tree.n_nodes, 128);

  printf("copyNodeDataToGroupData: \t "); copyNodeDataToGroupData.printWorkSize();
  copyNodeDataToGroupData.execute();  
 
//    tree.multipole.d2h();
//   testRes.d2h();
  
//   for(int i=0; i < tree.n_nodes; i++)
//   for(int i=tree.n_nodes-10; i < tree.n_nodes; i++)
/*   for(int i=0; i < 10; i++)
   {
     fprintf(stderr,"%d\t%f\t%f\t%f\t%f\n", i, tree.multipole[i*3+0].x,tree.multipole[i*3+0].y,tree.multipole[i*3+0].z, tree.multipole[i*3+0].w);
//     fprintf(stderr,"%d\t%f\t%f\t%f\t%f\t%f\n", i, tree.multipole[i*3+1].x,tree.multipole[i*3+1].y,tree.multipole[i*3+1].z, tree.multipole[i*3+1].w, testRes[i]);
    fprintf(stderr,"%d\t%f\t%f\t%f\t%f\t%f\n", i, tree.multipole[i*3+1].x,tree.multipole[i*3+1].y,tree.multipole[i*3+1].z, tree.multipole[i*3+1].w, 0);
    fprintf(stderr,"%d\t%f\t%f\t%f\t%f\n", i, tree.multipole[i*3+2].x,tree.multipole[i*3+2].y,tree.multipole[i*3+2].z, tree.multipole[i*3+2].w);
   }

exit(0);
*/
  #else

  compute_properties_double(tree);
  
  #endif
  
}