C++ (Cpp) kernel_wrapperの例

コード例 #1

0

ファイルを表示

ファイル: main.c プロジェクト: angubenko/cuda-samples

int main(int argc, char const *argv[]) {
	printf("Computing Game Of Life On %d x %d Board.\n", DIM_X, DIM_Y);
	
	int *host_current, *host_future, *host_gpu_results;
	int *gpu_current, *gpu_future;
	
	cudaMallocHost((void**) &host_current, DIM_X * DIM_Y * sizeof(int));
	cudaMallocHost((void**) &host_future, DIM_X * DIM_Y * sizeof(int));	
	cudaMallocHost((void**) &host_gpu_results, DIM_X * DIM_Y * sizeof(int));
	cudaMalloc((void**) &gpu_current, DIM_X * DIM_Y * sizeof(int));
	cudaMalloc((void**) &gpu_future, DIM_X * DIM_Y * sizeof(int));
	assert(cudaGetLastError() == cudaSuccess);
	
	fill_board(host_current, 40);
	
	cudaMemcpy(gpu_current, host_current, DIM_X * DIM_Y * sizeof(int), cudaMemcpyHostToDevice);
	setup_textures(gpu_current, gpu_future);
	
	assert(cudaGetLastError() == cudaSuccess);
	
	clock_t start, stop;
	int current_is_a = 1;
	printf("START!\n");
	for(int i = 1; i < 10; i++) {
		start = clock();
		
		int *output = current_is_a ? gpu_future : gpu_current;
		kernel_wrapper(current_is_a, output);
		current_is_a = current_is_a ? 0 : 1;
		
		cudaMemcpy(host_gpu_results, output, DIM_X * DIM_Y * sizeof(int), cudaMemcpyDeviceToHost);
		assert(cudaGetLastError() == cudaSuccess);
		
		stop = clock();
		printf("Time for Textured GPU To Compute Next Phase: %.5f s\n", (float)(stop - start)/CLOCKS_PER_SEC);
				
		start = clock();
		update_board(host_current, host_future);
		stop = clock();
		printf("Time for CPU To Compute Next Phase: %.5f s\n", (float)(stop - start)/CLOCKS_PER_SEC);
		
		printf("======\n");
		check_boards(host_gpu_results, host_future);
				
		int *temp;
		temp = host_current;
		host_current = host_future;
		host_future = temp;
	}
	
	cudaFree(host_future);
	cudaFree(host_current);
	cudaFree(host_gpu_results);
	cudaFree(gpu_current);
	cudaFree(gpu_future);
	
	return 0;
}

コード例 #2

0

ファイルを表示

ファイル: benchmark.cpp プロジェクト: NVIDIA/gpu-rest-engine

// Runs kernel_wrapper once on the CUDA stream of a context borrowed from
// ctx->pool, reporting the outcome through errno: 0 when the call returns
// normally, EINVAL when std::invalid_argument is thrown. Any other exception
// type is not caught here and propagates to the caller.
void benchmark_execute(benchmark_ctx* ctx)
{
    try
    {
        // The scoped object is destroyed when the try block is left.
        ScopedContext<BenchmarkContext> borrowed(ctx->pool);
        cudaStream_t workStream = borrowed->CUDAStream();
        kernel_wrapper(workStream);

        // Reached only when nothing above threw.
        errno = 0;
    }
    catch (const std::invalid_argument&)
    {
        errno = EINVAL;
    }
}

コード例 #3

0

ファイルを表示

 // Factory: heap-allocates a T from the perfectly-forwarded arguments, sets its
 // internal_alloc flag, and returns a kernel_wrapper built from that pointer.
 // NOTE(review): internal_alloc presumably marks the object as owned by the
 // wrapper so it can be deleted later - confirm against the class definition.
 static kernel_wrapper make( Args&&... params )
 {
    T *instance = new T( std::forward< Args >( params )... );
    instance->internal_alloc = true;

    return kernel_wrapper( instance );
 }

コード例 #4

0

ファイルを表示

ファイル: StructCUDA.v5.cpp プロジェクト: hohoCode/optTreeCUDA

int main(int argc, char** args) {
    /*if(argc < 3) {
    	return -1;
    }

    char* configFile = args[1];
    char* featureFile = args[2];
    */
    char* configFile = "ensemble-3-1.xml.tree.end";
    char* featureFile = "test.txt";

    ////////////////////////////////////////////
    // build DecisionTree
    ////////////////////////////////////////////
    FILE *fp = fopen(configFile, "r");
    int nbTrees;
    fscanf(fp, "%d", &nbTrees);

    int totalNodes = 0;
    int* nodeSizes;
    cudaHostAlloc((void **) &nodeSizes, sizeof(int)*nbTrees, cudaHostAllocDefault);
    //int* nodeSizes = (int*) malloc(nbTrees * sizeof(int));
    StructPlus** trees = (StructPlus**) malloc(nbTrees * sizeof(StructPlus*));
    printf("Starting Tree Reading....\n");
    int tindex = 0;
    for(tindex = 0; tindex < nbTrees; tindex++) {
        int treeSize;
        fscanf(fp, "%d", &treeSize);
        int internalSize = pow(2.0, treeSize) - 1;
        int fullSize = 2* pow(2.0, treeSize) - 1;
        nodeSizes[tindex] = fullSize;
        totalNodes += fullSize;
        int* pointers = (int*) malloc(internalSize * sizeof(int));
        trees[tindex] = createNodes(fullSize);

        char text[20];
        int line = 0;
        for(line = 0; line < internalSize; line++) pointers[line] = -1;
        fscanf(fp, "%s", text);
        while(strcmp(text, "end") != 0) {
            int id;
            fscanf(fp, "%d", &id);

            if(strcmp(text, "root") == 0) {
                int fid;
                float threshold;
                fscanf(fp, "%d %f", &fid, &threshold);
                setRoot(trees[tindex], id, fid, threshold);
                pointers[id] = 0;
            } else if(strcmp(text, "node") == 0) {
                int fid;
                int pid;
                float threshold;
                int leftChild = 0;
                fscanf(fp, "%d %d %d %f", &pid, &fid, &leftChild, &threshold);
                if(pointers[pid] >= 0 && trees[tindex][pointers[pid]].fid >= 0) {
                    pointers[id] = addNode(trees[tindex], pointers[pid], id, leftChild, fid, threshold);
                }
            } else if(strcmp(text, "leaf") == 0) {
                int pid;
                int leftChild = 0;
                float value;
                fscanf(fp, "%d %d %f", &pid, &leftChild, &value);
                if(pointers[pid] >= 0 && trees[tindex][pointers[pid]].fid >= 0) {
                    addNode(trees[tindex], pointers[pid], id, leftChild, -1, value);
                }
            }
            fscanf(fp, "%s", text);
        }
        free(pointers);
    }
    fclose(fp);

    // Pack all trees into a single array, thus avoiding two-D arrays.
    printf("Starting Rearrange the Tree....\n");
    //StructSimple* all_nodes = (StructSimple*) malloc(totalNodes * sizeof(StructSimple));
    StructSimple* all_nodes = NULL;
    cudaHostAlloc((void **) &all_nodes, sizeof(StructSimple)*totalNodes, cudaHostAllocDefault);
    int newIndex = 0;

    for(tindex = 0; tindex < nbTrees; tindex++) {
        int nsize = nodeSizes[tindex];
        nodeSizes[tindex] = newIndex;
        int telement;
        //printf("Size of the tree is %d\n", nsize);
        for(telement = 0; telement < nsize; telement++) {
            printf("tindex %d telement %d - FID %d Threshold %f\n",tindex, telement, trees[tindex][telement].fid,trees[tindex][telement].threshold);
            if(telement == 0) {
                all_nodes[newIndex].fid = abs(trees[tindex][telement].fid);
                all_nodes[newIndex].threshold = trees[tindex][telement].threshold;
                all_nodes[newIndex].leaf = (!trees[tindex][telement].left && !trees[tindex][telement].right)?'y':'n';
            } else if(trees[tindex][telement].fid && trees[tindex][telement].id) {
                all_nodes[newIndex].fid = trees[tindex][telement].fid;
                all_nodes[newIndex].threshold = trees[tindex][telement].threshold;
                all_nodes[newIndex].leaf = (!trees[tindex][telement].left && !trees[tindex][telement].right)?'y':'n';
            } else {
                all_nodes[newIndex].fid = NULL;
                all_nodes[newIndex].threshold = NULL;
                all_nodes[newIndex].leaf = NULL;
            }
            //printf("---fid=%d, threshold=%f, left=%d, right=%d\n", trees[tindex][telement].fid, trees[tindex][telement].threshold, trees[tindex][telement].left, trees[tindex][telement].right);
            //printf("fid=%d, threshold=%f, leaf=%c\n", all_nodes[newIndex].fid, all_nodes[newIndex].threshold, all_nodes[newIndex].leaf);
            newIndex++;
        }
    }

    ///////////////////////////////////////////////////////////
    ///////////FEATURES FILES READING//////////////////////////////
    //////////////////////////////////////////////////////////
    printf("Reading Feature File....\n");
    int numberOfFeatures = 0;
    int numberOfInstances = 0;
    fp = fopen(featureFile, "r");
    fscanf(fp, "%d %d", &numberOfInstances, &numberOfFeatures);

    ///New Code On Feature Array
    float* features = NULL;
    cudaHostAlloc((void **) &features, sizeof(float)*numberOfFeatures * numberOfInstances,  cudaHostAllocDefault);
    //float* features = (float*) malloc(numberOfFeatures * numberOfInstances * sizeof(float));
    float fvalue;
    int fIndex = 0, iIndex = 0;
    int ignore;
    char text[20];
    for(iIndex = 0; iIndex < numberOfInstances; iIndex++) {
        fscanf(fp, "%d %[^:]:%d", &ignore, text, &ignore);
        for(fIndex = 0; fIndex < numberOfFeatures; fIndex++) {
            fscanf(fp, "%[^:]:%f", text, &fvalue);
            features[iIndex*numberOfFeatures+fIndex] = fvalue;
        }
    }

    ///////////////////////////////////////////////
    /////////////TIMER
    //////////////////////////////////////////////
    float time;
    cudaEvent_t start_event, stop_event;
    cudaEventCreate(&start_event);
    cudaEventCreate(&stop_event);
    cudaEventRecord(start_event, 0);

    ///////////////////KERNEL////////////////////////
    kernel_wrapper(features, all_nodes, nodeSizes, numberOfInstances, nbTrees, numberOfFeatures, totalNodes);
    //////////////////////////////////////////////////

    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&time, start_event, stop_event);
    float timeperinstance = time*1000000/(float)numberOfInstances;
    printf ("Outside Total Time is %f ns, and Time/each instance: %f ns\n", time*1000000, timeperinstance);

    cudaFreeHost(nodeSizes);
    cudaFreeHost(all_nodes);
    cudaFreeHost(features);
    free(trees);
    fclose(fp);
    return 0;
}