int main(int argc,char **argv){ // Print GPU properties //print_properties(); // Files to print the result after the last time step FILE *rho_file; FILE *E_file; rho_file = fopen("rho_final.txt", "w"); E_file = fopen("E_final.txt", "w"); // Construct initial condition for problem ICsinus Config(-1.0, 1.0, -1.0, 1.0); //ICsquare Config(0.5,0.5,gasGam); // Set initial values for Configuration 1 /* Config.set_rho(rhoConfig19); Config.set_pressure(pressureConfig19); Config.set_u(uConfig19); Config.set_v(vConfig19); */ // Determining global border based on left over tiles (a little hack) int globalPadding; globalPadding = (nx+2*border+16)/16; globalPadding = 16*globalPadding - (nx+2*border); //printf("Globalpad: %i\n", globalPadding); // Change border to add padding //border = border + globalPadding/2; // Initiate the matrices for the unknowns in the Euler equations cpu_ptr_2D rho(nx, ny, border,1); cpu_ptr_2D E(nx, ny, border,1); cpu_ptr_2D rho_u(nx, ny, border,1); cpu_ptr_2D rho_v(nx, ny, border,1); cpu_ptr_2D zeros(nx, ny, border,1); // Set initial condition Config.setIC(rho, rho_u, rho_v, E); double timeStart = get_wall_time(); // Test cpu_ptr_2D rho_dummy(nx, ny, border); cpu_ptr_2D E_dummy(nx, ny, border); /* rho_dummy.xmin = -1.0; rho_dummy.ymin = -1.0; E_dummy.xmin = -1.0; E_dummy.ymin = -1.0; */ // Set block and grid sizes dim3 gridBC = dim3(1, 1, 1); dim3 blockBC = dim3(BLOCKDIM_BC,1,1); dim3 gridBlockFlux; dim3 threadBlockFlux; dim3 gridBlockRK; dim3 threadBlockRK; computeGridBlock(gridBlockFlux, threadBlockFlux, nx + 2*border, ny + 2*border, INNERTILEDIM_X, INNERTILEDIM_Y, BLOCKDIM_X, BLOCKDIM_Y); computeGridBlock(gridBlockRK, threadBlockRK, nx + 2*border, ny + 2*border, BLOCKDIM_X_RK, BLOCKDIM_Y_RK, BLOCKDIM_X_RK, BLOCKDIM_Y_RK); int nElements = gridBlockFlux.x*gridBlockFlux.y; // Allocate memory for the GPU pointers gpu_ptr_1D L_device(nElements); gpu_ptr_1D dt_device(1); gpu_ptr_2D rho_device(nx, ny, border); gpu_ptr_2D E_device(nx, ny, border); gpu_ptr_2D rho_u_device(nx, ny, border); gpu_ptr_2D rho_v_device(nx, ny, border); gpu_ptr_2D R0(nx, ny, border); gpu_ptr_2D R1(nx, ny, border); gpu_ptr_2D R2(nx, ny, border); gpu_ptr_2D R3(nx, ny, border); gpu_ptr_2D Q0(nx, ny, border); gpu_ptr_2D Q1(nx, ny, border); gpu_ptr_2D Q2(nx, ny, border); gpu_ptr_2D Q3(nx, ny, border); // Allocate pinned memory on host init_allocate(); // Set BC arguments set_bc_args(BCArgs[0], rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx+2*border, ny+2*border, border); set_bc_args(BCArgs[1], Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), nx+2*border, ny+2*border, border); set_bc_args(BCArgs[2], rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx+2*border, ny+2*border, border); // Set FLUX arguments set_flux_args(fluxArgs[0], L_device.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), R0.getRawPtr(),R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), nx, ny, border, rho.get_dx(), rho.get_dy(), theta, gasGam, INNERTILEDIM_X, INNERTILEDIM_Y); set_flux_args(fluxArgs[1], L_device.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), R0.getRawPtr(),R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), nx, ny, border, rho.get_dx(), rho.get_dy(), theta, gasGam, INNERTILEDIM_X, INNERTILEDIM_Y); // Set TIME argument set_dt_args(dtArgs, L_device.getRawPtr(), dt_device.getRawPtr(), nElements, rho.get_dx(), rho.get_dy(), cfl_number); // Set Rk arguments set_rk_args(RKArgs[0], dt_device.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), R0.getRawPtr(), R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), nx, ny, border); set_rk_args(RKArgs[1], dt_device.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), R0.getRawPtr(), R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx, ny, border); L_device.set(FLT_MAX); /* R0.upload(zeros.get_ptr()); R1.upload(zeros.get_ptr()); R2.upload(zeros.get_ptr()); R3.upload(zeros.get_ptr()); Q0.upload(zeros.get_ptr()); Q1.upload(zeros.get_ptr()); Q2.upload(zeros.get_ptr()); Q3.upload(zeros.get_ptr()); */ R0.set(0,0,0,nx,ny,border); R1.set(0,0,0,nx,ny,border); R2.set(0,0,0,nx,ny,border); R3.set(0,0,0,nx,ny,border); Q0.set(0,0,0,nx,ny,border); Q1.set(0,0,0,nx,ny,border); Q2.set(0,0,0,nx,ny,border); Q3.set(0,0,0,nx,ny,border); rho_device.upload(rho.get_ptr()); rho_u_device.upload(rho_u.get_ptr()); rho_v_device.upload(rho_v.get_ptr()); E_device.upload(E.get_ptr()); // Update boudries callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[0]); //Create cuda stream cudaStream_t stream1; cudaStreamCreate(&stream1); cudaEvent_t dt_complete; cudaEventCreate(&dt_complete); while (currentTime < timeLength && step < maxStep){ //RK1 //Compute flux callFluxKernel(gridBlockFlux, threadBlockFlux, 0, fluxArgs[0]); // Compute timestep (based on CFL condition) callDtKernel(TIMETHREADS, dtArgs); cudaMemcpyAsync(dt_host, dt_device.getRawPtr(), sizeof(float), cudaMemcpyDeviceToHost, stream1); cudaEventRecord(dt_complete, stream1); // Perform RK1 step callRKKernel(gridBlockRK, threadBlockRK, 0, RKArgs[0]); //Update boudries callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[1]); //RK2 // Compute flux callFluxKernel(gridBlockFlux, threadBlockFlux, 1, fluxArgs[1]); //Perform RK2 step callRKKernel(gridBlockRK, threadBlockRK, 1, RKArgs[1]); //cudaEventRecord(srteam_sync, srteam1); callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[2]); cudaEventSynchronize(dt_complete); step++; currentTime += *dt_host; // printf("Step: %i, current time: %.6f dt:%.6f\n" , step,currentTime, dt_host[0]); } //cuProfilerStop(); //cudaProfilerStop(); printf("Elapsed time %.5f", get_wall_time() - timeStart); E_device.download(E.get_ptr()); rho_u_device.download(rho_u.get_ptr()); rho_v_device.download(rho_v.get_ptr()); rho_device.download(rho_dummy.get_ptr()); rho_dummy.printToFile(rho_file, true, false); Config.exactSolution(E_dummy, currentTime); E_dummy.printToFile(E_file, true, false); float LinfError = Linf(E_dummy, rho_dummy); float L1Error = L1(E_dummy, rho_dummy); float L1Error2 = L1test(E_dummy, rho_dummy); printf("nx: %i\t Linf error %.9f\t L1 error %.7f L1test erro %.7f", nx, LinfError, L1Error, L1Error2); printf("nx: %i step: %i, current time: %.6f dt:%.6f\n" , nx, step,currentTime, dt_host[0]); /* cudaMemcpy(L_host, L_device, sizeof(float)*(nElements), cudaMemcpyDeviceToHost); for (int i =0; i < nElements; i++) printf(" %.7f ", L_host[i]); */ printf("%s\n", cudaGetErrorString(cudaGetLastError())); return(0); }
/** * free allocated memory */ static void destroy(const CudaEvent& ev) { CUDA_CHECK(cudaEventSynchronize(ev.event)); CUDA_CHECK(cudaEventDestroy(ev.event)); }
int main(int argc, char** args) { /*if(argc < 3) { return -1; } char* configFile = args[1]; char* featureFile = args[2]; */ char* configFile = "ensemble-3-1.xml.tree.end"; char* featureFile = "test.txt"; //////////////////////////////////////////// // build DecisionTree //////////////////////////////////////////// FILE *fp = fopen(configFile, "r"); int nbTrees; fscanf(fp, "%d", &nbTrees); int totalNodes = 0; int* nodeSizes; cudaHostAlloc((void **) &nodeSizes, sizeof(int)*nbTrees, cudaHostAllocDefault); //int* nodeSizes = (int*) malloc(nbTrees * sizeof(int)); StructPlus** trees = (StructPlus**) malloc(nbTrees * sizeof(StructPlus*)); printf("Starting Tree Reading....\n"); int tindex = 0; for(tindex = 0; tindex < nbTrees; tindex++) { int treeSize; fscanf(fp, "%d", &treeSize); int internalSize = pow(2.0, treeSize) - 1; int fullSize = 2* pow(2.0, treeSize) - 1; nodeSizes[tindex] = fullSize; totalNodes += fullSize; int* pointers = (int*) malloc(internalSize * sizeof(int)); trees[tindex] = createNodes(fullSize); char text[20]; int line = 0; for(line = 0; line < internalSize; line++) pointers[line] = -1; fscanf(fp, "%s", text); while(strcmp(text, "end") != 0) { int id; fscanf(fp, "%d", &id); if(strcmp(text, "root") == 0) { int fid; float threshold; fscanf(fp, "%d %f", &fid, &threshold); setRoot(trees[tindex], id, fid, threshold); pointers[id] = 0; } else if(strcmp(text, "node") == 0) { int fid; int pid; float threshold; int leftChild = 0; fscanf(fp, "%d %d %d %f", &pid, &fid, &leftChild, &threshold); if(pointers[pid] >= 0 && trees[tindex][pointers[pid]].fid >= 0) { pointers[id] = addNode(trees[tindex], pointers[pid], id, leftChild, fid, threshold); } } else if(strcmp(text, "leaf") == 0) { int pid; int leftChild = 0; float value; fscanf(fp, "%d %d %f", &pid, &leftChild, &value); if(pointers[pid] >= 0 && trees[tindex][pointers[pid]].fid >= 0) { addNode(trees[tindex], pointers[pid], id, leftChild, -1, value); } } fscanf(fp, "%s", text); } free(pointers); } fclose(fp); // Pack all trees into a single array, thus avoiding two-D arrays. printf("Starting Rearrange the Tree....\n"); //StructSimple* all_nodes = (StructSimple*) malloc(totalNodes * sizeof(StructSimple)); StructSimple* all_nodes = NULL; cudaHostAlloc((void **) &all_nodes, sizeof(StructSimple)*totalNodes, cudaHostAllocDefault); int newIndex = 0; for(tindex = 0; tindex < nbTrees; tindex++) { int nsize = nodeSizes[tindex]; nodeSizes[tindex] = newIndex; int telement; //printf("Size of the tree is %d\n", nsize); for(telement = 0; telement < nsize; telement++) { printf("tindex %d telement %d - FID %d Threshold %f\n",tindex, telement, trees[tindex][telement].fid,trees[tindex][telement].threshold); if(telement == 0) { all_nodes[newIndex].fid = abs(trees[tindex][telement].fid); all_nodes[newIndex].threshold = trees[tindex][telement].threshold; all_nodes[newIndex].leaf = (!trees[tindex][telement].left && !trees[tindex][telement].right)?'y':'n'; } else if(trees[tindex][telement].fid && trees[tindex][telement].id) { all_nodes[newIndex].fid = trees[tindex][telement].fid; all_nodes[newIndex].threshold = trees[tindex][telement].threshold; all_nodes[newIndex].leaf = (!trees[tindex][telement].left && !trees[tindex][telement].right)?'y':'n'; } else { all_nodes[newIndex].fid = NULL; all_nodes[newIndex].threshold = NULL; all_nodes[newIndex].leaf = NULL; } //printf("---fid=%d, threshold=%f, left=%d, right=%d\n", trees[tindex][telement].fid, trees[tindex][telement].threshold, trees[tindex][telement].left, trees[tindex][telement].right); //printf("fid=%d, threshold=%f, leaf=%c\n", all_nodes[newIndex].fid, all_nodes[newIndex].threshold, all_nodes[newIndex].leaf); newIndex++; } } /////////////////////////////////////////////////////////// ///////////FEATURES FILES READING////////////////////////////// ////////////////////////////////////////////////////////// printf("Reading Feature File....\n"); int numberOfFeatures = 0; int numberOfInstances = 0; fp = fopen(featureFile, "r"); fscanf(fp, "%d %d", &numberOfInstances, &numberOfFeatures); ///New Code On Feature Array float* features = NULL; cudaHostAlloc((void **) &features, sizeof(float)*numberOfFeatures * numberOfInstances, cudaHostAllocDefault); //float* features = (float*) malloc(numberOfFeatures * numberOfInstances * sizeof(float)); float fvalue; int fIndex = 0, iIndex = 0; int ignore; char text[20]; for(iIndex = 0; iIndex < numberOfInstances; iIndex++) { fscanf(fp, "%d %[^:]:%d", &ignore, text, &ignore); for(fIndex = 0; fIndex < numberOfFeatures; fIndex++) { fscanf(fp, "%[^:]:%f", text, &fvalue); features[iIndex*numberOfFeatures+fIndex] = fvalue; } } /////////////////////////////////////////////// /////////////TIMER ////////////////////////////////////////////// float time; cudaEvent_t start_event, stop_event; cudaEventCreate(&start_event); cudaEventCreate(&stop_event); cudaEventRecord(start_event, 0); ///////////////////KERNEL//////////////////////// kernel_wrapper(features, all_nodes, nodeSizes, numberOfInstances, nbTrees, numberOfFeatures, totalNodes); ////////////////////////////////////////////////// cudaEventRecord(stop_event, 0); cudaEventSynchronize(stop_event); cudaEventElapsedTime(&time, start_event, stop_event); float timeperinstance = time*1000000/(float)numberOfInstances; printf ("Outside Total Time is %f ns, and Time/each instance: %f ns\n", time*1000000, timeperinstance); cudaFreeHost(nodeSizes); cudaFreeHost(all_nodes); cudaFreeHost(features); free(trees); fclose(fp); return 0; }
void cudaTimer::stop(){ cudaEventRecord(fin, 0); cudaEventSynchronize(fin); cudaEventElapsedTime(&tiempo, inicio, fin); }
int main(int argc, char **argv) { int block_size = 32; dim3 dimsA(1*1*block_size, 1*1*block_size, 1); dim3 dimsB(1*2*block_size, 1*1*block_size, 1); unsigned int size_A = dimsA.x * dimsA.y; unsigned int mem_size_A = sizeof(int) * size_A; int *h_A = (int *)malloc(mem_size_A); unsigned int size_B = dimsB.x * dimsB.y; unsigned int mem_size_B = sizeof(int) * size_B; int *h_B = (int *)malloc(mem_size_B); const float valB = 0.01f; constantInit(h_A, size_A, 1); constantInit(h_B, size_B, 1); int *d_A, *d_B, *d_C; dim3 dimsC(dimsB.x, dimsA.y, 1); unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(int); int *h_C = (int *) malloc(mem_size_C); if (h_C == NULL) { fprintf(stderr, "Failed to allocate host matrix C!\n"); exit(EXIT_FAILURE); } cudaError_t error; error = cudaMalloc((void **) &d_A, mem_size_A); if (error != cudaSuccess) { printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } error = cudaMalloc((void **) &d_B, mem_size_B); if (error != cudaSuccess) { printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } error = cudaMalloc((void **) &d_C, mem_size_C); if (error != cudaSuccess) { printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); if (error != cudaSuccess) { printf("cudaMemcpy (d_A,h_A) returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); if (error != cudaSuccess) { printf("cudaMemcpy (d_B,h_B) returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } dim3 threads(block_size, block_size); dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); printf("Computing result using CUDA Kernel...\n"); if (block_size == 16) { { __set_CUDAConfig(grid, threads ); matrixMulCUDA<16>(d_C, d_A, d_B, dimsA.x, dimsB.x);} } else { { __set_CUDAConfig(grid, threads ); matrixMulCUDA<32>(d_C, d_A, d_B, dimsA.x, dimsB.x);} } printf("done\n"); cudaDeviceSynchronize(); cudaEvent_t start; error = cudaEventCreate(&start); if (error != cudaSuccess) { fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } cudaEvent_t stop; error = cudaEventCreate(&stop); if (error != cudaSuccess) { fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } error = cudaEventRecord(start, NULL); if (error != cudaSuccess) { fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } int nIter = 300; for (int j = 0; j < nIter; j++) { if (block_size == 16) { { __set_CUDAConfig(grid, threads ); matrixMulCUDA<16>(d_C, d_A, d_B, dimsA.x, dimsB.x);} } else { { __set_CUDAConfig(grid, threads ); matrixMulCUDA<32>(d_C, d_A, d_B, dimsA.x, dimsB.x);} } } error = cudaEventRecord(stop, NULL); if (error != cudaSuccess) { fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } error = cudaEventSynchronize(stop); if (error != cudaSuccess) { fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } float msecTotal = 0.0f; error = cudaEventElapsedTime(&msecTotal, start, stop); if (error != cudaSuccess) { fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } float msecPerMatrixMul = msecTotal / nIter; double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x; double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); printf( "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y); error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); if (error != cudaSuccess) { printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } printf("Checking computed result for correctness: "); bool correct = true; double eps = 1.e-6 ; for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) { double abs_err = fabs(h_C[i] - (dimsA.x * valB)); double dot_length = dimsA.x; double abs_val = fabs(h_C[i]); double rel_err = abs_err/abs_val/dot_length ; if (rel_err > eps) { printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps); correct = false; } } printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); free(h_A); free(h_B); free(h_C); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); printf("\nNote: For peak performance, please refer to the matrixMulCUBLAS example.\n"); cudaDeviceReset(); if (correct) { return EXIT_SUCCESS; } else { return EXIT_FAILURE; } }
void GranularGPUDataTransferer::WaitForCopyGPUToCPU() { PrepareDevice(m_deviceId); cudaEventSynchronize(m_fetchCompleteEvent) || "cudaEventSynchronize failed"; }
int main(int argc, char*argv[]) { FILE *fp; uint16_t i, fsize, pad_size, stream_id; char * rtp_pkt; uint8_t default_aes_keys[AES_KEY_SIZE], default_ivs[AES_IV_SIZE], default_hmac_keys[HMAC_KEY_SIZE]; struct timespec start, end; #if defined(KERNEL_TEST) struct timespec kernel_start, kernel_end; #endif cudaEvent_t startE, stopE; cudaEventCreate(&startE); cudaEventCreate(&stopE); uint32_t NUM_FLOWS, STREAM_NUM; if (argc > 2) { NUM_FLOWS = atoi(argv[1]); STREAM_NUM = atoi(argv[2]); } else { NUM_FLOWS = 8192; STREAM_NUM = 1; } //printf ("Num of flows is %d, stream num is %d\n", NUM_FLOWS, STREAM_NUM); cudaStream_t stream[STREAM_NUM]; for (i = 0; i < STREAM_NUM; i ++) { cudaStreamCreate(&stream[i]); } uint8_t * host_in,*device_in[STREAM_NUM]; uint8_t * host_aes_keys,* device_aes_keys[STREAM_NUM]; uint8_t * host_ivs,* device_ivs[STREAM_NUM]; uint8_t * host_hmac_keys,*device_hmac_keys[STREAM_NUM]; uint32_t * host_pkt_offset,*device_pkt_offset[STREAM_NUM]; uint16_t * host_actual_length,*device_actual_length[STREAM_NUM]; double diff; uint8_t a = 123; fp = fopen("rtp.pkt", "rb"); fseek(fp, 0, SEEK_END); // NOTE: fsize should be 1356 bytes //fsize = ftell(fp); fsize = 1328; fseek(fp, 0, SEEK_SET); rtp_pkt = (char *)calloc(fsize, sizeof(char)); fread(rtp_pkt, fsize, sizeof(char), fp); pad_size = (fsize + 63 + 9) & (~0x03F); //printf("the original package is %d bytes,now we pad it to %d bytes\n", fsize, pad_size); for (i = 0; i < AES_KEY_SIZE; i ++) default_aes_keys[i] = a; for (i = 0; i < AES_IV_SIZE; i ++) default_ivs[i] = a; for (i = 0; i < HMAC_KEY_SIZE; i ++) default_hmac_keys[i] = a; //printf("duplicate it %d times, takes %d bytes\n",NUM_FLOWS,pad_size*NUM_FLOWS); cudaHostAlloc((void **)&host_in, pad_size * NUM_FLOWS * sizeof(uint8_t), cudaHostAllocDefault); cudaHostAlloc((void **)&host_aes_keys, NUM_FLOWS * AES_KEY_SIZE, cudaHostAllocWriteCombined); cudaHostAlloc((void **)&host_ivs, NUM_FLOWS * AES_IV_SIZE, cudaHostAllocWriteCombined); cudaHostAlloc((void **)&host_hmac_keys, NUM_FLOWS * HMAC_KEY_SIZE, cudaHostAllocWriteCombined); cudaHostAlloc((void **)&host_pkt_offset, NUM_FLOWS * PKT_OFFSET_SIZE, cudaHostAllocWriteCombined); cudaHostAlloc((void **)&host_actual_length, NUM_FLOWS * PKT_LENGTH_SIZE, cudaHostAllocWriteCombined); for (i = 0; i < NUM_FLOWS; i ++){ memcpy(host_in + i * pad_size, rtp_pkt, fsize * sizeof(uint8_t)); memcpy((uint8_t *)host_aes_keys + i * AES_KEY_SIZE, default_aes_keys, AES_KEY_SIZE); memcpy((uint8_t *)host_ivs + i * AES_IV_SIZE, default_ivs, AES_IV_SIZE); memcpy((uint8_t *)host_hmac_keys + i * HMAC_KEY_SIZE, default_hmac_keys, HMAC_KEY_SIZE); host_pkt_offset[i] = i * pad_size; host_actual_length[i] = fsize; } for (i = 0; i < STREAM_NUM; i ++) { cudaMalloc((void **)&(device_in[i]), pad_size * NUM_FLOWS * sizeof(uint8_t)); cudaMalloc((void **)&(device_aes_keys[i]), NUM_FLOWS * AES_KEY_SIZE); cudaMalloc((void **)&(device_ivs[i]), NUM_FLOWS * AES_IV_SIZE); cudaMalloc((void **)&(device_hmac_keys[i]), NUM_FLOWS * HMAC_KEY_SIZE); cudaMalloc((void **)&(device_pkt_offset[i]), NUM_FLOWS * PKT_OFFSET_SIZE); cudaMalloc((void **)&(device_actual_length[i]), NUM_FLOWS * PKT_LENGTH_SIZE); } /* warm up */ for (stream_id = 0; stream_id < STREAM_NUM; stream_id ++) { cudaMemcpyAsync(device_in[stream_id], host_in, pad_size * NUM_FLOWS * sizeof(uint8_t), cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_aes_keys[stream_id], host_aes_keys, NUM_FLOWS * AES_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_ivs[stream_id], host_ivs, NUM_FLOWS * AES_IV_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_hmac_keys[stream_id], host_hmac_keys, NUM_FLOWS * HMAC_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_pkt_offset[stream_id], host_pkt_offset, NUM_FLOWS * PKT_OFFSET_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_actual_length[stream_id], host_actual_length, NUM_FLOWS * PKT_LENGTH_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); co_aes_sha1_gpu( device_in[stream_id], device_in[stream_id], device_aes_keys[stream_id], device_ivs[stream_id], device_hmac_keys[stream_id], device_pkt_offset[stream_id], device_actual_length[stream_id], NUM_FLOWS, NULL, THREADS_PER_BLK, stream[stream_id]); cudaDeviceSynchronize(); } /* Real test */ for (i = 0; i < 1; i ++) { clock_gettime(CLOCK_MONOTONIC, &start); cudaEventRecord(startE, 0); for (stream_id = 0; stream_id < STREAM_NUM; stream_id ++) { cudaMemcpyAsync(device_in[stream_id], host_in, pad_size * NUM_FLOWS * sizeof(uint8_t), cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_aes_keys[stream_id], host_aes_keys, NUM_FLOWS * AES_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_ivs[stream_id], host_ivs, NUM_FLOWS * AES_IV_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_hmac_keys[stream_id], host_hmac_keys, NUM_FLOWS * HMAC_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_pkt_offset[stream_id], host_pkt_offset, NUM_FLOWS * PKT_OFFSET_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_actual_length[stream_id], host_actual_length, NUM_FLOWS * PKT_LENGTH_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); #if defined(KERNEL_TEST) cudaDeviceSynchronize(); clock_gettime(CLOCK_MONOTONIC, &kernel_start); //gettimeofday(&kernel_start, NULL); #endif co_aes_sha1_gpu( device_in[stream_id], device_in[stream_id], device_aes_keys[stream_id], device_ivs[stream_id], device_hmac_keys[stream_id], device_pkt_offset[stream_id], device_actual_length[stream_id], NUM_FLOWS, NULL, THREADS_PER_BLK, stream[stream_id]); #if defined(KERNEL_TEST) cudaDeviceSynchronize(); clock_gettime(CLOCK_MONOTONIC, &kernel_end); //gettimeofday(&kernel_end, NULL); #endif cudaMemcpyAsync(host_in, device_in[stream_id], pad_size * NUM_FLOWS * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream[stream_id]); } cudaDeviceSynchronize(); clock_gettime(CLOCK_MONOTONIC, &end); cudaEventRecord(stopE, 0); cudaEventSynchronize(stopE); float time; cudaEventElapsedTime(&time, startE, stopE); //printf("event speed is ------- %f Gbps\n", (fsize * 8 * NUM_FLOWS * STREAM_NUM * 1e-6)/time); #if defined(KERNEL_TEST) diff = 1000000 * (kernel_end.tv_sec-kernel_start.tv_sec)+ (kernel_end.tv_nsec-kernel_start.tv_nsec)/1000; printf("Only Kernel, the difference is %lf ms, speed is %lf Mbps\n", (double)diff/1000, (double)((fsize * 8) * NUM_FLOWS * STREAM_NUM) / diff); #else diff = 1000000 * (end.tv_sec-start.tv_sec)+ (end.tv_nsec-start.tv_nsec)/1000; printf("%lf\n", (double)diff/1000); //printf("%lfms,%lf Mbps\n", (double)diff/1000, (double)((fsize * 8) * NUM_FLOWS * STREAM_NUM) / diff); #endif } return 0; }
bool GIEFeatExtractor::extract_singleFeat_1D(cv::Mat &imMat, vector<float> &features, float (×)[2]) { times[0] = 0.0f; times[1] = 0.0f; // Check input image if (imMat.empty()) { std::cout << "GIEFeatExtractor::extract_singleFeat_1D(): empty imMat!" << std::endl; return false; } // Start timing cudaEvent_t startPrep, stopPrep, startNet, stopNet; if (timing) { cudaEventCreate(&startPrep); cudaEventCreate(&startNet); cudaEventCreate(&stopPrep); cudaEventCreate(&stopNet); cudaEventRecord(startPrep, NULL); cudaEventRecord(startNet, NULL); } // Image preprocessing // resize (to 256x256 or to the size of the mean mean image) if (imMat.rows != resizeDims.h || imMat.cols != resizeDims.w) { if (imMat.rows > resizeDims.h || imMat.cols > resizeDims.w) { cv::resize(imMat, imMat, cv::Size(resizeDims.h, resizeDims.w), 0, 0, CV_INTER_LANCZOS4); } else { cv::resize(imMat, imMat, cv::Size(resizeDims.h, resizeDims.w), 0, 0, CV_INTER_LINEAR); } } // crop and subtract the mean image or the mean pixel int h_off = (imMat.rows - mHeight) / 2; int w_off = (imMat.cols - mWidth) / 2; cv::Mat cv_cropped_img = imMat; cv::Rect roi(w_off, h_off, mWidth, mHeight); cv_cropped_img = imMat(roi); int top_index; for (int h = 0; h < mHeight; ++h) { const uchar* ptr = cv_cropped_img.ptr<uchar>(h); int img_index = 0; for (int w = 0; w < mWidth; ++w) { for (int c = 0; c < imMat.channels(); ++c) { top_index = (c * mHeight + h) * mWidth + w; float pixel = static_cast<float>(ptr[img_index++]); if (mean_values[0]==-1) { int mean_index = (c * imMat.rows + h_off + h) * imMat.cols + w_off + w; mInputCPU[top_index] = pixel - meanData[mean_index]; } else { mInputCPU[top_index] = pixel - mean_values[c]; } } } } /* // subtract mean if (meanR==-1) { if (!meanMat.empty() && imMat.rows==meanMat.rows && imMat.cols==meanMat.cols && imMat.channels()==meanMat.channels() && imMat.type()==meanMat.type()) { imMat = imMat - meanMat; } else { std::cout << "GIEFeatExtractor::extract_singleFeat_1D(): cannot subtract mean image!" << std::endl; return false; } } else { imMat = imMat - cv::Scalar(meanB, meanG, meanR); } // crop to input dimension (central crop) if (imMat.cols>=mWidth && imMat.rows>=mHeight) { cv::Rect imROI(floor((imMat.cols-mWidth)*0.5f), floor((imMat.rows-mHeight)*0.5f), mWidth, mHeight); imMat(imROI).copyTo(imMat); } else { cv::resize(imMat, imMat, cv::Size(mHeight, mWidth), 0, 0, CV_INTER_LINEAR); } // convert to float (with range 0-255) imMat.convertTo(imMat, CV_32FC3); if ( !imMat.isContinuous() ) imMat = imMat.clone();*/ // copy //CUDA( cudaMemcpy(mInputCPU, imMat.data, mInputSize, cudaMemcpyDefault) ); //memcpy(mInputCPU, imMat.data, mInputSize); void* inferenceBuffers[] = { mInputCUDA, mOutputCUDA }; if (timing) { // Record the stop event cudaEventRecord(stopPrep, NULL); // Wait for the stop event to complete cudaEventSynchronize(stopPrep); cudaEventElapsedTime(times, startPrep, stopPrep); } mContext->execute(1, inferenceBuffers); //CUDA(cudaDeviceSynchronize());\ features.insert(features.end(), &mOutputCPU[0], &mOutputCPU[mOutputDims]); if (timing) { // Record the stop event cudaEventRecord(stopNet, NULL); // Wait for the stop event to complete cudaEventSynchronize(stopNet); cudaEventElapsedTime(times+1, startNet, stopNet); } return true; }