/**
 * Benchmark driver: advances a DIM_X x DIM_Y Game of Life board for nine
 * generations on both the GPU (via kernel_wrapper) and the CPU (via
 * update_board), printing the wall time of each and handing both results to
 * check_boards after every generation.
 *
 * Host buffers are allocated with cudaMallocHost and therefore must be
 * released with cudaFreeHost; device buffers use cudaMalloc / cudaFree.
 *
 * @return 0 on completion; CUDA failures trip the assert()s instead.
 */
int main(int argc, char const *argv[]) {
    printf("Computing Game Of Life On %d x %d Board.\n", DIM_X, DIM_Y);

    // Size in bytes of one full board; every buffer below has this size.
    const size_t board_bytes = DIM_X * DIM_Y * sizeof(int);

    int *host_current, *host_future, *host_gpu_results;
    int *gpu_current, *gpu_future;

    cudaMallocHost((void**) &host_current, board_bytes);
    cudaMallocHost((void**) &host_future, board_bytes);
    cudaMallocHost((void**) &host_gpu_results, board_bytes);

    cudaMalloc((void**) &gpu_current, board_bytes);
    cudaMalloc((void**) &gpu_future, board_bytes);
    assert(cudaGetLastError() == cudaSuccess);

    // NOTE(review): the meaning of the second argument (40) is defined by
    // fill_board, which is not visible here.
    fill_board(host_current, 40);

    cudaMemcpy(gpu_current, host_current, board_bytes, cudaMemcpyHostToDevice);
    setup_textures(gpu_current, gpu_future);
    assert(cudaGetLastError() == cudaSuccess);

    clock_t start, stop;
    // Selects which device buffer receives the next generation; toggled after
    // every kernel launch so the two device buffers alternate roles.
    int current_is_a = 1;

    printf("START!\n");
    for (int i = 1; i < 10; i++) {
        // --- GPU generation (kernel + blocking copy back to the host) ---
        start = clock();
        int *output = current_is_a ? gpu_future : gpu_current;
        kernel_wrapper(current_is_a, output);
        current_is_a = current_is_a ? 0 : 1;
        cudaMemcpy(host_gpu_results, output, board_bytes, cudaMemcpyDeviceToHost);
        assert(cudaGetLastError() == cudaSuccess);
        stop = clock();
        printf("Time for Textured GPU To Compute Next Phase: %.5f s\n",
               (float)(stop - start)/CLOCKS_PER_SEC);

        // --- CPU reference generation ---
        start = clock();
        update_board(host_current, host_future);
        stop = clock();
        printf("Time for CPU To Compute Next Phase: %.5f s\n",
               (float)(stop - start)/CLOCKS_PER_SEC);
        printf("======\n");

        check_boards(host_gpu_results, host_future);

        // The freshly computed CPU board becomes the input of the next round.
        int *temp;
        temp = host_current;
        host_current = host_future;
        host_future = temp;
    }

    // Pinned host allocations come from cudaMallocHost, so they are released
    // with cudaFreeHost; cudaFree is only valid for device allocations.
    cudaFreeHost(host_future);
    cudaFreeHost(host_current);
    cudaFreeHost(host_gpu_results);

    cudaFree(gpu_current);
    cudaFree(gpu_future);

    return 0;
}
/**
 * Launches kernel_wrapper on the CUDA stream of a scoped context built from
 * ctx->pool, reporting the outcome through errno.
 *
 * errno is set to 0 when the launch path completes and to EINVAL when a
 * std::invalid_argument is thrown along the way. Any other exception type is
 * not caught here and propagates to the caller.
 *
 * @param ctx benchmark context; only its `pool` member is read.
 */
void benchmark_execute(benchmark_ctx* ctx)
{
    try
    {
        // The scoped context lives for the duration of the launch only.
        ScopedContext<BenchmarkContext> scoped(ctx->pool);

        cudaStream_t launch_stream = scoped->CUDAStream();
        kernel_wrapper(launch_stream);

        // Success is recorded while the scoped context is still alive.
        errno = 0;
    }
    catch (const std::invalid_argument&)
    {
        errno = EINVAL;
    }
}
/**
 * Factory: heap-constructs a T from the forwarded arguments, flags it as
 * internally allocated, and returns a kernel_wrapper built from the pointer.
 *
 * NOTE(review): `Args` and `T` are declared outside this view (presumably a
 * template parameter pack on this function and a class template parameter).
 * `internal_alloc` presumably tells the wrapper that it owns the object and
 * must delete it — confirm against the wrapper's destructor.
 *
 * @param params constructor arguments, perfectly forwarded to T.
 * @return a kernel_wrapper constructed from the new T*.
 */
static kernel_wrapper make( Args&&... params )
{
    T *instance = new T( std::forward< Args >( params )... );

    // Mark the object as created by this factory before handing it over.
    instance->internal_alloc = true;

    return kernel_wrapper( instance );
}
/**
 * Reads `nbTrees` tree descriptions from `fp` into `trees`.
 *
 * For each tree the file supplies a size value `treeSize`, followed by
 * records introduced by the tokens "root", "node" or "leaf" and terminated by
 * the token "end". On return nodeSizes[t] holds the full node count of tree t
 * (2 * 2^treeSize - 1) and *totalNodes the sum over all trees.
 *
 * NOTE(review): `id` and `pid` read from the file index the `pointers`
 * scratch array (2^treeSize - 1 entries) without a range check, exactly as
 * before. Add one once the id numbering scheme of the input format has been
 * confirmed.
 *
 * @return true on success; false on truncated/malformed input or allocation
 *         failure.
 */
static bool dtReadForest(FILE* fp, int nbTrees, int* nodeSizes,
                         StructPlus** trees, int* totalNodes) {
    // Largest size value for which 2 * 2^treeSize - 1 still fits in an int.
    const int kMaxTreeSize = 30;

    for (int tindex = 0; tindex < nbTrees; tindex++) {
        int treeSize = 0;
        if (fscanf(fp, "%d", &treeSize) != 1 || treeSize < 0 || treeSize > kMaxTreeSize) {
            return false;
        }

        const int internalSize = (int) (pow(2.0, treeSize) - 1);
        const int fullSize = (int) (2 * pow(2.0, treeSize) - 1);
        nodeSizes[tindex] = fullSize;
        *totalNodes += fullSize;

        // Maps a file-level node id to its slot inside trees[tindex];
        // -1 means "not placed yet".
        int* pointers = (int*) malloc(internalSize * sizeof(int));
        if (internalSize > 0 && pointers == NULL) {
            return false;
        }
        trees[tindex] = createNodes(fullSize);
        if (trees[tindex] == NULL) {
            free(pointers);
            return false;
        }
        for (int line = 0; line < internalSize; line++) {
            pointers[line] = -1;
        }

        char text[20];
        bool wellFormed = true;
        for (;;) {
            // Width-limited read: a long token can no longer overflow `text`,
            // and hitting EOF before "end" stops the loop instead of spinning.
            if (fscanf(fp, "%19s", text) != 1) {
                wellFormed = false;
                break;
            }
            if (strcmp(text, "end") == 0) {
                break;
            }

            int id = -1;
            const bool haveId = (fscanf(fp, "%d", &id) == 1);

            if (strcmp(text, "root") == 0) {
                int fid = 0;
                float threshold = 0.0f;
                if (!haveId || fscanf(fp, "%d %f", &fid, &threshold) != 2) {
                    wellFormed = false;
                    break;
                }
                setRoot(trees[tindex], id, fid, threshold);
                pointers[id] = 0;
            } else if (strcmp(text, "node") == 0) {
                int fid = 0;
                int pid = 0;
                float threshold = 0.0f;
                int leftChild = 0;
                if (!haveId ||
                    fscanf(fp, "%d %d %d %f", &pid, &fid, &leftChild, &threshold) != 4) {
                    wellFormed = false;
                    break;
                }
                // Attach only below a parent that was placed and is internal.
                if (pointers[pid] >= 0 && trees[tindex][pointers[pid]].fid >= 0) {
                    pointers[id] = addNode(trees[tindex], pointers[pid], id,
                                           leftChild, fid, threshold);
                }
            } else if (strcmp(text, "leaf") == 0) {
                int pid = 0;
                int leftChild = 0;
                float value = 0.0f;
                if (!haveId || fscanf(fp, "%d %d %f", &pid, &leftChild, &value) != 3) {
                    wellFormed = false;
                    break;
                }
                if (pointers[pid] >= 0 && trees[tindex][pointers[pid]].fid >= 0) {
                    addNode(trees[tindex], pointers[pid], id, leftChild, -1, value);
                }
            }
            // Unknown tokens are skipped, as before.
        }

        free(pointers);
        if (!wellFormed) {
            return false;
        }
    }
    return true;
}

/**
 * Packs all trees into the single array `all_nodes`, thus avoiding two-D
 * arrays. nodeSizes[t] is overwritten with the offset of tree t's first node
 * inside `all_nodes`. Each node is also printed to stdout.
 */
static void dtFlattenForest(StructPlus** trees, int nbTrees, int* nodeSizes,
                            StructSimple* all_nodes) {
    int newIndex = 0;
    for (int tindex = 0; tindex < nbTrees; tindex++) {
        const int nsize = nodeSizes[tindex];
        nodeSizes[tindex] = newIndex;

        for (int telement = 0; telement < nsize; telement++) {
            const StructPlus& src = trees[tindex][telement];
            printf("tindex %d telement %d - FID %d Threshold %f\n",
                   tindex, telement, src.fid, src.threshold);

            const bool isLeaf = (!src.left && !src.right);
            if (telement == 0) {
                all_nodes[newIndex].fid = abs(src.fid);
                all_nodes[newIndex].threshold = src.threshold;
                all_nodes[newIndex].leaf = isLeaf ? 'y' : 'n';
            } else if (src.fid && src.id) {
                all_nodes[newIndex].fid = src.fid;
                all_nodes[newIndex].threshold = src.threshold;
                all_nodes[newIndex].leaf = isLeaf ? 'y' : 'n';
            } else {
                // Unused slot: zero the scalars (NULL is a pointer constant
                // and must not be assigned to int/float/char fields).
                all_nodes[newIndex].fid = 0;
                all_nodes[newIndex].threshold = 0.0f;
                all_nodes[newIndex].leaf = '\0';
            }
            newIndex++;
        }
    }
}

/**
 * Reads numberOfInstances rows of numberOfFeatures "<label>:<value>" pairs
 * from `fp` into the row-major array `features`.
 *
 * @return true on success; false when a feature pair cannot be parsed.
 */
static bool dtReadFeatures(FILE* fp, int numberOfInstances, int numberOfFeatures,
                           float* features) {
    char text[20];
    int ignore = 0;
    for (int iIndex = 0; iIndex < numberOfInstances; iIndex++) {
        // Row prefix: all three fields are discarded, so the result is not
        // inspected (unchanged from before apart from the width limit).
        fscanf(fp, "%d %19[^:]:%d", &ignore, text, &ignore);
        for (int fIndex = 0; fIndex < numberOfFeatures; fIndex++) {
            float fvalue = 0.0f;
            if (fscanf(fp, "%19[^:]:%f", text, &fvalue) != 2) {
                return false;
            }
            features[(size_t) iIndex * numberOfFeatures + fIndex] = fvalue;
        }
    }
    return true;
}

/**
 * Loads a decision-tree ensemble and a feature file, flattens the trees into
 * pinned host arrays, runs kernel_wrapper over them and prints the elapsed
 * time measured with CUDA events.
 *
 * The input file names are hard-coded; argc/args are currently unused.
 *
 * @return 0 on success, 1 when an input file is missing or malformed or an
 *         allocation fails.
 */
int main(int argc, char** args) {
    /*if(argc < 3) {
        return -1;
    }
    char* configFile = args[1];
    char* featureFile = args[2];
    */
    (void) argc;
    (void) args;
    const char* configFile = "ensemble-3-1.xml.tree.end";
    const char* featureFile = "test.txt";

    ////////////////////////////////////////////
    // build DecisionTree
    ////////////////////////////////////////////
    FILE* fp = fopen(configFile, "r");
    if (fp == NULL) {
        fprintf(stderr, "Cannot open tree file '%s'\n", configFile);
        return 1;
    }

    int nbTrees = 0;
    if (fscanf(fp, "%d", &nbTrees) != 1 || nbTrees <= 0) {
        fprintf(stderr, "Missing or invalid tree count in '%s'\n", configFile);
        fclose(fp);
        return 1;
    }

    int totalNodes = 0;
    int numberOfFeatures = 0;
    int numberOfInstances = 0;
    StructSimple* all_nodes = NULL;
    float* features = NULL;

    // Pinned allocations start out NULL and are verified non-NULL afterwards.
    int* nodeSizes = NULL;
    cudaHostAlloc((void **) &nodeSizes, sizeof(int)*nbTrees, cudaHostAllocDefault);
    StructPlus** trees = (StructPlus**) malloc(nbTrees * sizeof(StructPlus*));

    bool ok = (nodeSizes != NULL && trees != NULL);
    if (!ok) {
        fprintf(stderr, "Cannot allocate bookkeeping for %d trees\n", nbTrees);
    }

    if (ok) {
        printf("Starting Tree Reading....\n");
        ok = dtReadForest(fp, nbTrees, nodeSizes, trees, &totalNodes);
        if (!ok) {
            fprintf(stderr, "Tree file '%s' is truncated or malformed\n", configFile);
        }
    }
    fclose(fp);
    fp = NULL;

    // Pack all trees into a single array, thus avoiding two-D arrays.
    if (ok) {
        printf("Starting Rearrange the Tree....\n");
        cudaHostAlloc((void **) &all_nodes, sizeof(StructSimple)*totalNodes, cudaHostAllocDefault);
        ok = (all_nodes != NULL);
        if (ok) {
            dtFlattenForest(trees, nbTrees, nodeSizes, all_nodes);
        } else {
            fprintf(stderr, "Cannot allocate %d flattened nodes\n", totalNodes);
        }
    }

    ///////////////////////////////////////////////////////////
    /////////// FEATURES FILE READING /////////////////////////
    ///////////////////////////////////////////////////////////
    if (ok) {
        printf("Reading Feature File....\n");
        fp = fopen(featureFile, "r");
        if (fp == NULL) {
            fprintf(stderr, "Cannot open feature file '%s'\n", featureFile);
            ok = false;
        } else if (fscanf(fp, "%d %d", &numberOfInstances, &numberOfFeatures) != 2 ||
                   numberOfInstances <= 0 || numberOfFeatures <= 0) {
            fprintf(stderr, "Missing or invalid header in '%s'\n", featureFile);
            ok = false;
        }
    }

    if (ok) {
        cudaHostAlloc((void **) &features, sizeof(float)*numberOfFeatures * numberOfInstances, cudaHostAllocDefault);
        if (features == NULL) {
            fprintf(stderr, "Cannot allocate the feature array\n");
            ok = false;
        } else if (!dtReadFeatures(fp, numberOfInstances, numberOfFeatures, features)) {
            fprintf(stderr, "Feature file '%s' is truncated or malformed\n", featureFile);
            ok = false;
        }
    }

    if (ok) {
        /////////////// TIMER ///////////////
        float elapsedMs;  // cudaEventElapsedTime reports milliseconds
        cudaEvent_t start_event, stop_event;
        cudaEventCreate(&start_event);
        cudaEventCreate(&stop_event);
        cudaEventRecord(start_event, 0);

        /////////////// KERNEL ///////////////
        kernel_wrapper(features, all_nodes, nodeSizes, numberOfInstances, nbTrees, numberOfFeatures, totalNodes);

        cudaEventRecord(stop_event, 0);
        cudaEventSynchronize(stop_event);
        cudaEventElapsedTime(&elapsedMs, start_event, stop_event);

        // ms -> ns
        float timeperinstance = elapsedMs*1000000/(float)numberOfInstances;
        printf ("Outside Total Time is %f ns, and Time/each instance: %f ns\n", elapsedMs*1000000, timeperinstance);
    }

    // Single cleanup path for both success and failure.
    if (nodeSizes != NULL) {
        cudaFreeHost(nodeSizes);
    }
    if (all_nodes != NULL) {
        cudaFreeHost(all_nodes);
    }
    if (features != NULL) {
        cudaFreeHost(features);
    }
    // NOTE(review): the per-tree node arrays returned by createNodes are not
    // released here (nor were they before); their allocator is not visible.
    free(trees);
    if (fp != NULL) {
        fclose(fp);
    }

    return ok ? 0 : 1;
}