コード例 #1
0
int main(int argc,char **argv){

// Print GPU properties
//print_properties();

// Files to print the result after the last time step
FILE *rho_file;
FILE *E_file;
rho_file = fopen("rho_final.txt", "w");
E_file = fopen("E_final.txt", "w");

// Construct initial condition for problem
ICsinus Config(-1.0, 1.0, -1.0, 1.0); 
//ICsquare Config(0.5,0.5,gasGam);

// Set initial values for Configuration 1
/*
Config.set_rho(rhoConfig19);
Config.set_pressure(pressureConfig19);
Config.set_u(uConfig19);
Config.set_v(vConfig19);
*/

// Determining global border based on left over tiles (a little hack)
int globalPadding;
globalPadding = (nx+2*border+16)/16;
globalPadding = 16*globalPadding - (nx+2*border);
//printf("Globalpad: %i\n", globalPadding);

// Change border to add padding
//border = border + globalPadding/2;

// Initiate the matrices for the unknowns in the Euler equations
cpu_ptr_2D rho(nx, ny, border,1);
cpu_ptr_2D E(nx, ny, border,1);
cpu_ptr_2D rho_u(nx, ny, border,1);
cpu_ptr_2D rho_v(nx, ny, border,1);
cpu_ptr_2D zeros(nx, ny, border,1);

// Set initial condition
Config.setIC(rho, rho_u, rho_v, E);

double timeStart = get_wall_time();

// Test 
cpu_ptr_2D rho_dummy(nx, ny, border);
cpu_ptr_2D E_dummy(nx, ny, border);

/*
rho_dummy.xmin = -1.0;
rho_dummy.ymin = -1.0;
E_dummy.xmin = -1.0;
E_dummy.ymin = -1.0;
*/

// Set block and grid sizes
dim3 gridBC = dim3(1, 1, 1);
dim3 blockBC = dim3(BLOCKDIM_BC,1,1);

dim3 gridBlockFlux;
dim3 threadBlockFlux;

dim3 gridBlockRK;
dim3 threadBlockRK;

computeGridBlock(gridBlockFlux, threadBlockFlux, nx + 2*border, ny + 2*border, INNERTILEDIM_X, INNERTILEDIM_Y, BLOCKDIM_X, BLOCKDIM_Y);

computeGridBlock(gridBlockRK, threadBlockRK, nx + 2*border, ny + 2*border, BLOCKDIM_X_RK, BLOCKDIM_Y_RK, BLOCKDIM_X_RK, BLOCKDIM_Y_RK);

int nElements = gridBlockFlux.x*gridBlockFlux.y;

// Allocate memory for the GPU pointers
gpu_ptr_1D L_device(nElements);
gpu_ptr_1D dt_device(1);

gpu_ptr_2D rho_device(nx, ny, border);
gpu_ptr_2D E_device(nx, ny, border);
gpu_ptr_2D rho_u_device(nx, ny, border);
gpu_ptr_2D rho_v_device(nx, ny, border); 

gpu_ptr_2D R0(nx, ny, border);
gpu_ptr_2D R1(nx, ny, border);
gpu_ptr_2D R2(nx, ny, border);
gpu_ptr_2D R3(nx, ny, border);

gpu_ptr_2D Q0(nx, ny, border);
gpu_ptr_2D Q1(nx, ny, border);
gpu_ptr_2D Q2(nx, ny, border);
gpu_ptr_2D Q3(nx, ny, border);

// Allocate pinned memory on host
init_allocate();

// Set BC arguments
set_bc_args(BCArgs[0], rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx+2*border, ny+2*border, border);
set_bc_args(BCArgs[1], Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), nx+2*border, ny+2*border, border);
set_bc_args(BCArgs[2], rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx+2*border, ny+2*border, border);

// Set FLUX arguments
set_flux_args(fluxArgs[0], L_device.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), R0.getRawPtr(),R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), nx, ny, border, rho.get_dx(), rho.get_dy(), theta, gasGam, INNERTILEDIM_X, INNERTILEDIM_Y);
set_flux_args(fluxArgs[1], L_device.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), R0.getRawPtr(),R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), nx, ny, border, rho.get_dx(), rho.get_dy(), theta, gasGam, INNERTILEDIM_X, INNERTILEDIM_Y);

// Set TIME argument
set_dt_args(dtArgs, L_device.getRawPtr(), dt_device.getRawPtr(), nElements, rho.get_dx(), rho.get_dy(), cfl_number);

// Set Rk arguments
set_rk_args(RKArgs[0], dt_device.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), R0.getRawPtr(), R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), nx, ny, border); 
set_rk_args(RKArgs[1], dt_device.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), R0.getRawPtr(), R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx, ny, border); 


L_device.set(FLT_MAX);

/*
R0.upload(zeros.get_ptr()); 
R1.upload(zeros.get_ptr()); 
R2.upload(zeros.get_ptr()); 
R3.upload(zeros.get_ptr()); 

Q0.upload(zeros.get_ptr()); 
Q1.upload(zeros.get_ptr()); 
Q2.upload(zeros.get_ptr()); 
Q3.upload(zeros.get_ptr()); 
*/

R0.set(0,0,0,nx,ny,border); 
R1.set(0,0,0,nx,ny,border); 
R2.set(0,0,0,nx,ny,border); 
R3.set(0,0,0,nx,ny,border); 

Q0.set(0,0,0,nx,ny,border); 
Q1.set(0,0,0,nx,ny,border); 
Q2.set(0,0,0,nx,ny,border); 
Q3.set(0,0,0,nx,ny,border); 


rho_device.upload(rho.get_ptr());
rho_u_device.upload(rho_u.get_ptr());
rho_v_device.upload(rho_v.get_ptr());
E_device.upload(E.get_ptr());

// Update boudries
callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[0]);

//Create cuda stream
cudaStream_t stream1;
cudaStreamCreate(&stream1);
cudaEvent_t dt_complete;
cudaEventCreate(&dt_complete);


while (currentTime < timeLength && step < maxStep){	
	
	//RK1	
	//Compute flux
	callFluxKernel(gridBlockFlux, threadBlockFlux, 0, fluxArgs[0]);	
	
	// Compute timestep (based on CFL condition)
	callDtKernel(TIMETHREADS, dtArgs);
	
	cudaMemcpyAsync(dt_host, dt_device.getRawPtr(), sizeof(float), cudaMemcpyDeviceToHost, stream1);
	cudaEventRecord(dt_complete, stream1);

	// Perform RK1 step
	callRKKernel(gridBlockRK, threadBlockRK, 0, RKArgs[0]);
	
	//Update boudries
	callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[1]);		

	//RK2
	// Compute flux
	callFluxKernel(gridBlockFlux, threadBlockFlux, 1, fluxArgs[1]);

	//Perform RK2 step
	callRKKernel(gridBlockRK, threadBlockRK, 1, RKArgs[1]);	

	//cudaEventRecord(srteam_sync, srteam1);

	callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[2]);

	cudaEventSynchronize(dt_complete);

	step++;	
	currentTime += *dt_host;	
//	printf("Step: %i, current time: %.6f dt:%.6f\n" , step,currentTime, dt_host[0]);

}


//cuProfilerStop();
//cudaProfilerStop();

printf("Elapsed time %.5f", get_wall_time() - timeStart);

E_device.download(E.get_ptr());
rho_u_device.download(rho_u.get_ptr());
rho_v_device.download(rho_v.get_ptr());
rho_device.download(rho_dummy.get_ptr());

rho_dummy.printToFile(rho_file, true, false);


Config.exactSolution(E_dummy, currentTime);
E_dummy.printToFile(E_file, true, false);


float LinfError = Linf(E_dummy, rho_dummy);
float L1Error = L1(E_dummy, rho_dummy); 
float L1Error2 = L1test(E_dummy, rho_dummy);

printf("nx: %i\t Linf error %.9f\t L1 error %.7f L1test erro %.7f", nx, LinfError, L1Error, L1Error2);


printf("nx: %i step: %i, current time: %.6f dt:%.6f\n" , nx, step,currentTime, dt_host[0]); 


/*
cudaMemcpy(L_host, L_device, sizeof(float)*(nElements), cudaMemcpyDeviceToHost);
for (int i =0; i < nElements; i++)
	printf(" %.7f ", L_host[i]); 
*/


printf("%s\n", cudaGetErrorString(cudaGetLastError()));

return(0);
}
コード例 #2
0
ファイル: CudaEvent.hpp プロジェクト: AK9527lq/picongpu
 /**
  * free allocated memory
  */
 static void destroy(const CudaEvent& ev)
 {
     CUDA_CHECK(cudaEventSynchronize(ev.event));
     CUDA_CHECK(cudaEventDestroy(ev.event));
 }
コード例 #3
0
int main(int argc, char** args) {
    /*if(argc < 3) {
    	return -1;
    }

    char* configFile = args[1];
    char* featureFile = args[2];
    */
    char* configFile = "ensemble-3-1.xml.tree.end";
    char* featureFile = "test.txt";

    ////////////////////////////////////////////
    // build DecisionTree
    ////////////////////////////////////////////
    FILE *fp = fopen(configFile, "r");
    int nbTrees;
    fscanf(fp, "%d", &nbTrees);

    int totalNodes = 0;
    int* nodeSizes;
    cudaHostAlloc((void **) &nodeSizes, sizeof(int)*nbTrees, cudaHostAllocDefault);
    //int* nodeSizes = (int*) malloc(nbTrees * sizeof(int));
    StructPlus** trees = (StructPlus**) malloc(nbTrees * sizeof(StructPlus*));
    printf("Starting Tree Reading....\n");
    int tindex = 0;
    for(tindex = 0; tindex < nbTrees; tindex++) {
        int treeSize;
        fscanf(fp, "%d", &treeSize);
        int internalSize = pow(2.0, treeSize) - 1;
        int fullSize = 2* pow(2.0, treeSize) - 1;
        nodeSizes[tindex] = fullSize;
        totalNodes += fullSize;
        int* pointers = (int*) malloc(internalSize * sizeof(int));
        trees[tindex] = createNodes(fullSize);

        char text[20];
        int line = 0;
        for(line = 0; line < internalSize; line++) pointers[line] = -1;
        fscanf(fp, "%s", text);
        while(strcmp(text, "end") != 0) {
            int id;
            fscanf(fp, "%d", &id);

            if(strcmp(text, "root") == 0) {
                int fid;
                float threshold;
                fscanf(fp, "%d %f", &fid, &threshold);
                setRoot(trees[tindex], id, fid, threshold);
                pointers[id] = 0;
            } else if(strcmp(text, "node") == 0) {
                int fid;
                int pid;
                float threshold;
                int leftChild = 0;
                fscanf(fp, "%d %d %d %f", &pid, &fid, &leftChild, &threshold);
                if(pointers[pid] >= 0 && trees[tindex][pointers[pid]].fid >= 0) {
                    pointers[id] = addNode(trees[tindex], pointers[pid], id, leftChild, fid, threshold);
                }
            } else if(strcmp(text, "leaf") == 0) {
                int pid;
                int leftChild = 0;
                float value;
                fscanf(fp, "%d %d %f", &pid, &leftChild, &value);
                if(pointers[pid] >= 0 && trees[tindex][pointers[pid]].fid >= 0) {
                    addNode(trees[tindex], pointers[pid], id, leftChild, -1, value);
                }
            }
            fscanf(fp, "%s", text);
        }
        free(pointers);
    }
    fclose(fp);

    // Pack all trees into a single array, thus avoiding two-D arrays.
    printf("Starting Rearrange the Tree....\n");
    //StructSimple* all_nodes = (StructSimple*) malloc(totalNodes * sizeof(StructSimple));
    StructSimple* all_nodes = NULL;
    cudaHostAlloc((void **) &all_nodes, sizeof(StructSimple)*totalNodes, cudaHostAllocDefault);
    int newIndex = 0;

    for(tindex = 0; tindex < nbTrees; tindex++) {
        int nsize = nodeSizes[tindex];
        nodeSizes[tindex] = newIndex;
        int telement;
        //printf("Size of the tree is %d\n", nsize);
        for(telement = 0; telement < nsize; telement++) {
            printf("tindex %d telement %d - FID %d Threshold %f\n",tindex, telement, trees[tindex][telement].fid,trees[tindex][telement].threshold);
            if(telement == 0) {
                all_nodes[newIndex].fid = abs(trees[tindex][telement].fid);
                all_nodes[newIndex].threshold = trees[tindex][telement].threshold;
                all_nodes[newIndex].leaf = (!trees[tindex][telement].left && !trees[tindex][telement].right)?'y':'n';
            } else if(trees[tindex][telement].fid && trees[tindex][telement].id) {
                all_nodes[newIndex].fid = trees[tindex][telement].fid;
                all_nodes[newIndex].threshold = trees[tindex][telement].threshold;
                all_nodes[newIndex].leaf = (!trees[tindex][telement].left && !trees[tindex][telement].right)?'y':'n';
            } else {
                all_nodes[newIndex].fid = NULL;
                all_nodes[newIndex].threshold = NULL;
                all_nodes[newIndex].leaf = NULL;
            }
            //printf("---fid=%d, threshold=%f, left=%d, right=%d\n", trees[tindex][telement].fid, trees[tindex][telement].threshold, trees[tindex][telement].left, trees[tindex][telement].right);
            //printf("fid=%d, threshold=%f, leaf=%c\n", all_nodes[newIndex].fid, all_nodes[newIndex].threshold, all_nodes[newIndex].leaf);
            newIndex++;
        }
    }

    ///////////////////////////////////////////////////////////
    ///////////FEATURES FILES READING//////////////////////////////
    //////////////////////////////////////////////////////////
    printf("Reading Feature File....\n");
    int numberOfFeatures = 0;
    int numberOfInstances = 0;
    fp = fopen(featureFile, "r");
    fscanf(fp, "%d %d", &numberOfInstances, &numberOfFeatures);

    ///New Code On Feature Array
    float* features = NULL;
    cudaHostAlloc((void **) &features, sizeof(float)*numberOfFeatures * numberOfInstances,  cudaHostAllocDefault);
    //float* features = (float*) malloc(numberOfFeatures * numberOfInstances * sizeof(float));
    float fvalue;
    int fIndex = 0, iIndex = 0;
    int ignore;
    char text[20];
    for(iIndex = 0; iIndex < numberOfInstances; iIndex++) {
        fscanf(fp, "%d %[^:]:%d", &ignore, text, &ignore);
        for(fIndex = 0; fIndex < numberOfFeatures; fIndex++) {
            fscanf(fp, "%[^:]:%f", text, &fvalue);
            features[iIndex*numberOfFeatures+fIndex] = fvalue;
        }
    }

    ///////////////////////////////////////////////
    /////////////TIMER
    //////////////////////////////////////////////
    float time;
    cudaEvent_t start_event, stop_event;
    cudaEventCreate(&start_event);
    cudaEventCreate(&stop_event);
    cudaEventRecord(start_event, 0);

    ///////////////////KERNEL////////////////////////
    kernel_wrapper(features, all_nodes, nodeSizes, numberOfInstances, nbTrees, numberOfFeatures, totalNodes);
    //////////////////////////////////////////////////

    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&time, start_event, stop_event);
    float timeperinstance = time*1000000/(float)numberOfInstances;
    printf ("Outside Total Time is %f ns, and Time/each instance: %f ns\n", time*1000000, timeperinstance);

    cudaFreeHost(nodeSizes);
    cudaFreeHost(all_nodes);
    cudaFreeHost(features);
    free(trees);
    fclose(fp);
    return 0;
}
コード例 #4
0
void cudaTimer::stop(){

	cudaEventRecord(fin, 0);
	cudaEventSynchronize(fin);
	cudaEventElapsedTime(&tiempo, inicio, fin);
}
コード例 #5
0
int main(int argc, char **argv)
{
    int block_size = 32;

    dim3 dimsA(1*1*block_size, 1*1*block_size, 1);
    dim3 dimsB(1*2*block_size, 1*1*block_size, 1);

     
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(int) * size_A;
    int *h_A = (int *)malloc(mem_size_A);
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(int) * size_B;
    int *h_B = (int *)malloc(mem_size_B);
    const float valB = 0.01f;

     
    constantInit(h_A, size_A, 1);
    constantInit(h_B, size_B, 1);

     
    int *d_A, *d_B, *d_C;

     
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(int);
    int *h_C = (int *) malloc(mem_size_C);

    if (h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    cudaError_t error;

    error = cudaMalloc((void **) &d_A, mem_size_A);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_B, mem_size_B);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_C, mem_size_C);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

     
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_A,h_A) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_B,h_B) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

     
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

     
    printf("Computing result using CUDA Kernel...\n");

     
    if (block_size == 16)
    {
{ __set_CUDAConfig(grid, threads ); 
          
 matrixMulCUDA<16>(d_C, d_A, d_B, dimsA.x, dimsB.x);}
          
    }
    else
    {
{ __set_CUDAConfig(grid, threads ); 
          
 matrixMulCUDA<32>(d_C, d_A, d_B, dimsA.x, dimsB.x);}
          
    }

    printf("done\n");

    cudaDeviceSynchronize();

     
    cudaEvent_t start;
    error = cudaEventCreate(&start);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    cudaEvent_t stop;
    error = cudaEventCreate(&stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

     
    error = cudaEventRecord(start, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

     
    int nIter = 300;

    for (int j = 0; j < nIter; j++)
    {
        if (block_size == 16)
        {
{ __set_CUDAConfig(grid, threads ); 
          
 matrixMulCUDA<16>(d_C, d_A, d_B, dimsA.x, dimsB.x);}
          
        }
        else
        {
{ __set_CUDAConfig(grid, threads ); 
          
 matrixMulCUDA<32>(d_C, d_A, d_B, dimsA.x, dimsB.x);}
          
        }
    }

     
    error = cudaEventRecord(stop, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

     
    error = cudaEventSynchronize(stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

     
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf(
        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
        gigaFlops,
        msecPerMatrixMul,
        flopsPerMatrixMul,
        threads.x * threads.y);

     
    error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    printf("Checking computed result for correctness: ");
    bool correct = true;

     
     
    double eps = 1.e-6 ;  
    for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
    {
        double abs_err = fabs(h_C[i] - (dimsA.x * valB));
        double dot_length = dimsA.x;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err/abs_val/dot_length ;
        if (rel_err > eps)
        {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

     
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    printf("\nNote: For peak performance, please refer to the matrixMulCUBLAS example.\n");

    cudaDeviceReset();

    if (correct)
    {
        return EXIT_SUCCESS;
    }
    else
    {
        return EXIT_FAILURE;
    }
}
コード例 #6
0
ファイル: GPUDataTransferer.cpp プロジェクト: AllanYiin/CNTK
void GranularGPUDataTransferer::WaitForCopyGPUToCPU()
{
    PrepareDevice(m_deviceId);
    cudaEventSynchronize(m_fetchCompleteEvent) || "cudaEventSynchronize failed";
}
コード例 #7
0
ファイル: workright.c プロジェクト: H1d3r/UPro
int main(int argc, char*argv[])
{
	FILE *fp;
	uint16_t i, fsize, pad_size, stream_id;
	char * rtp_pkt;
	uint8_t default_aes_keys[AES_KEY_SIZE], default_ivs[AES_IV_SIZE], default_hmac_keys[HMAC_KEY_SIZE];

	struct  timespec start, end;
#if defined(KERNEL_TEST)
	struct  timespec kernel_start, kernel_end;
#endif


	cudaEvent_t startE, stopE;
    cudaEventCreate(&startE);
    cudaEventCreate(&stopE);


	uint32_t NUM_FLOWS, STREAM_NUM;
	if (argc > 2) {
		NUM_FLOWS = atoi(argv[1]);
		STREAM_NUM = atoi(argv[2]);
	} else {
		NUM_FLOWS = 8192;
		STREAM_NUM = 1;
	}
	//printf ("Num of flows is %d, stream num is %d\n", NUM_FLOWS, STREAM_NUM);

	cudaStream_t stream[STREAM_NUM];
	for (i = 0; i < STREAM_NUM; i ++) {
		cudaStreamCreate(&stream[i]);
	}

	uint8_t * host_in,*device_in[STREAM_NUM];
	uint8_t * host_aes_keys,* device_aes_keys[STREAM_NUM];
	uint8_t * host_ivs,* device_ivs[STREAM_NUM];
	uint8_t * host_hmac_keys,*device_hmac_keys[STREAM_NUM];
	uint32_t * host_pkt_offset,*device_pkt_offset[STREAM_NUM];
	uint16_t * host_actual_length,*device_actual_length[STREAM_NUM];
       
	double diff;
	uint8_t a = 123;

	fp = fopen("rtp.pkt", "rb");
	fseek(fp, 0, SEEK_END);
	// NOTE: fsize should be 1356 bytes
	//fsize = ftell(fp);
	fsize = 1328;
	fseek(fp, 0, SEEK_SET);

	rtp_pkt = (char *)calloc(fsize, sizeof(char));
	fread(rtp_pkt, fsize, sizeof(char), fp);

	pad_size = (fsize + 63 + 9) & (~0x03F);

	//printf("the original package is %d bytes,now we pad it to %d bytes\n", fsize, pad_size);

	for (i = 0; i < AES_KEY_SIZE; i ++)
		default_aes_keys[i] = a;
	for (i = 0; i < AES_IV_SIZE; i ++)
		default_ivs[i] = a;
	for (i = 0; i < HMAC_KEY_SIZE; i ++)
		default_hmac_keys[i] = a;

	//printf("duplicate it %d times, takes %d bytes\n",NUM_FLOWS,pad_size*NUM_FLOWS);
	cudaHostAlloc((void **)&host_in, pad_size * NUM_FLOWS * sizeof(uint8_t), cudaHostAllocDefault);
	cudaHostAlloc((void **)&host_aes_keys, NUM_FLOWS * AES_KEY_SIZE, cudaHostAllocWriteCombined);
	cudaHostAlloc((void **)&host_ivs, NUM_FLOWS * AES_IV_SIZE, cudaHostAllocWriteCombined);
	cudaHostAlloc((void **)&host_hmac_keys, NUM_FLOWS * HMAC_KEY_SIZE, cudaHostAllocWriteCombined);
	cudaHostAlloc((void **)&host_pkt_offset, NUM_FLOWS * PKT_OFFSET_SIZE, cudaHostAllocWriteCombined);
	cudaHostAlloc((void **)&host_actual_length, NUM_FLOWS * PKT_LENGTH_SIZE, cudaHostAllocWriteCombined);

	for (i = 0; i < NUM_FLOWS; i ++){
		memcpy(host_in + i * pad_size, rtp_pkt, fsize * sizeof(uint8_t));
		memcpy((uint8_t *)host_aes_keys + i * AES_KEY_SIZE, default_aes_keys, AES_KEY_SIZE);
		memcpy((uint8_t *)host_ivs + i * AES_IV_SIZE, default_ivs, AES_IV_SIZE);
		memcpy((uint8_t *)host_hmac_keys + i * HMAC_KEY_SIZE, default_hmac_keys, HMAC_KEY_SIZE);
		host_pkt_offset[i] = i * pad_size;
		host_actual_length[i] = fsize;
	}

	for (i = 0; i < STREAM_NUM; i ++) {
		cudaMalloc((void **)&(device_in[i]), pad_size * NUM_FLOWS * sizeof(uint8_t));
		cudaMalloc((void **)&(device_aes_keys[i]), NUM_FLOWS * AES_KEY_SIZE);
		cudaMalloc((void **)&(device_ivs[i]), NUM_FLOWS * AES_IV_SIZE);
		cudaMalloc((void **)&(device_hmac_keys[i]), NUM_FLOWS * HMAC_KEY_SIZE);
		cudaMalloc((void **)&(device_pkt_offset[i]), NUM_FLOWS * PKT_OFFSET_SIZE);
		cudaMalloc((void **)&(device_actual_length[i]), NUM_FLOWS * PKT_LENGTH_SIZE);
	}

	/* warm up */
	for (stream_id = 0; stream_id < STREAM_NUM; stream_id ++) {
		cudaMemcpyAsync(device_in[stream_id], host_in, pad_size * NUM_FLOWS * sizeof(uint8_t), cudaMemcpyHostToDevice, stream[stream_id]);
		cudaMemcpyAsync(device_aes_keys[stream_id], host_aes_keys, NUM_FLOWS * AES_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
		cudaMemcpyAsync(device_ivs[stream_id], host_ivs, NUM_FLOWS * AES_IV_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
		cudaMemcpyAsync(device_hmac_keys[stream_id], host_hmac_keys, NUM_FLOWS * HMAC_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
		cudaMemcpyAsync(device_pkt_offset[stream_id], host_pkt_offset, NUM_FLOWS * PKT_OFFSET_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
		cudaMemcpyAsync(device_actual_length[stream_id], host_actual_length, NUM_FLOWS * PKT_LENGTH_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);

		co_aes_sha1_gpu(
					device_in[stream_id],
					device_in[stream_id],
					device_aes_keys[stream_id],
					device_ivs[stream_id],
					device_hmac_keys[stream_id],
					device_pkt_offset[stream_id],
					device_actual_length[stream_id],
					NUM_FLOWS,
					NULL,
					THREADS_PER_BLK,
					stream[stream_id]);

		cudaDeviceSynchronize();
	}

	/* Real test */
	for (i = 0; i < 1; i ++) {
		clock_gettime(CLOCK_MONOTONIC, &start);
		cudaEventRecord(startE, 0);

		for (stream_id = 0; stream_id < STREAM_NUM; stream_id ++) {

			cudaMemcpyAsync(device_in[stream_id], host_in, pad_size * NUM_FLOWS * sizeof(uint8_t), cudaMemcpyHostToDevice, stream[stream_id]);
			cudaMemcpyAsync(device_aes_keys[stream_id], host_aes_keys, NUM_FLOWS * AES_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
			cudaMemcpyAsync(device_ivs[stream_id], host_ivs, NUM_FLOWS * AES_IV_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
			cudaMemcpyAsync(device_hmac_keys[stream_id], host_hmac_keys, NUM_FLOWS * HMAC_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
			cudaMemcpyAsync(device_pkt_offset[stream_id], host_pkt_offset, NUM_FLOWS * PKT_OFFSET_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
			cudaMemcpyAsync(device_actual_length[stream_id], host_actual_length, NUM_FLOWS * PKT_LENGTH_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);

#if defined(KERNEL_TEST)
			cudaDeviceSynchronize();
			clock_gettime(CLOCK_MONOTONIC, &kernel_start);
			//gettimeofday(&kernel_start, NULL);
#endif
			co_aes_sha1_gpu(
					device_in[stream_id],
					device_in[stream_id],
					device_aes_keys[stream_id],
					device_ivs[stream_id],
					device_hmac_keys[stream_id],
					device_pkt_offset[stream_id],
					device_actual_length[stream_id],
					NUM_FLOWS,
					NULL,
					THREADS_PER_BLK,
					stream[stream_id]);
#if defined(KERNEL_TEST)
			cudaDeviceSynchronize();
			clock_gettime(CLOCK_MONOTONIC, &kernel_end);
			//gettimeofday(&kernel_end, NULL);
#endif
			cudaMemcpyAsync(host_in, device_in[stream_id], pad_size * NUM_FLOWS * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream[stream_id]);
		}

		cudaDeviceSynchronize();
		clock_gettime(CLOCK_MONOTONIC, &end);

		cudaEventRecord(stopE, 0);
		cudaEventSynchronize(stopE);
		float time;
		cudaEventElapsedTime(&time, startE, stopE);
		//printf("event speed is ------- %f Gbps\n", (fsize * 8 * NUM_FLOWS * STREAM_NUM * 1e-6)/time);

#if defined(KERNEL_TEST)
		diff = 1000000 * (kernel_end.tv_sec-kernel_start.tv_sec)+ (kernel_end.tv_nsec-kernel_start.tv_nsec)/1000;
		printf("Only Kernel, the difference is %lf ms, speed is %lf Mbps\n", (double)diff/1000, (double)((fsize * 8) * NUM_FLOWS * STREAM_NUM) / diff);
#else
		diff = 1000000 * (end.tv_sec-start.tv_sec)+ (end.tv_nsec-start.tv_nsec)/1000;
		printf("%lf\n", (double)diff/1000);
		//printf("%lfms,%lf Mbps\n", (double)diff/1000, (double)((fsize * 8) * NUM_FLOWS * STREAM_NUM) / diff);
#endif
	}

	return 0;
}
コード例 #8
0
bool GIEFeatExtractor::extract_singleFeat_1D(cv::Mat &imMat, vector<float> &features, float (&times)[2])
{

    times[0] = 0.0f;
    times[1] = 0.0f;

    // Check input image 
    if (imMat.empty())
    {
        std::cout << "GIEFeatExtractor::extract_singleFeat_1D(): empty imMat!" << std::endl;
        return false;
    }

    // Start timing
    cudaEvent_t startPrep, stopPrep, startNet, stopNet;
    if (timing)
    {
        cudaEventCreate(&startPrep);
        cudaEventCreate(&startNet);
        cudaEventCreate(&stopPrep);
        cudaEventCreate(&stopNet);
        cudaEventRecord(startPrep, NULL);
        cudaEventRecord(startNet, NULL);
    }

    // Image preprocessing
 
    // resize (to 256x256 or to the size of the mean mean image)
    if (imMat.rows != resizeDims.h || imMat.cols != resizeDims.w)
    {
       if (imMat.rows > resizeDims.h || imMat.cols > resizeDims.w)
       {
           cv::resize(imMat, imMat, cv::Size(resizeDims.h, resizeDims.w), 0, 0, CV_INTER_LANCZOS4);
       }
       else
       {
           cv::resize(imMat, imMat, cv::Size(resizeDims.h, resizeDims.w), 0, 0, CV_INTER_LINEAR);
       }
    }

    // crop and subtract the mean image or the mean pixel
    int h_off = (imMat.rows - mHeight) / 2;
    int w_off = (imMat.cols - mWidth) / 2;

    cv::Mat cv_cropped_img = imMat;
    cv::Rect roi(w_off, h_off, mWidth, mHeight);
    cv_cropped_img = imMat(roi);

    int top_index;
    for (int h = 0; h < mHeight; ++h)
    {
       const uchar* ptr = cv_cropped_img.ptr<uchar>(h);
       int img_index = 0;
       for (int w = 0; w < mWidth; ++w)
       {
           for (int c = 0; c < imMat.channels(); ++c)
           {
               top_index = (c * mHeight + h) * mWidth + w;
               float pixel = static_cast<float>(ptr[img_index++]);
               if (mean_values[0]==-1)
               {
                   int mean_index = (c * imMat.rows + h_off + h) * imMat.cols + w_off + w;
                   mInputCPU[top_index] = pixel - meanData[mean_index];
                }
                else
                {
                    mInputCPU[top_index] = pixel - mean_values[c]; 
                }
            }
         }
      }

    /*

    // subtract mean 
    if (meanR==-1)
    {
        if (!meanMat.empty() && imMat.rows==meanMat.rows && imMat.cols==meanMat.cols && imMat.channels()==meanMat.channels() && imMat.type()==meanMat.type())
        {
            imMat = imMat - meanMat;
        }
        else
        {
            std::cout << "GIEFeatExtractor::extract_singleFeat_1D(): cannot subtract mean image!" << std::endl;
            return false;
        }
    }
    else
    {  
        imMat = imMat - cv::Scalar(meanB, meanG, meanR);
    }

    // crop to input dimension (central crop)
    if (imMat.cols>=mWidth && imMat.rows>=mHeight)
    {
        cv::Rect imROI(floor((imMat.cols-mWidth)*0.5f), floor((imMat.rows-mHeight)*0.5f), mWidth, mHeight);
        imMat(imROI).copyTo(imMat);
    } 
        else
    {
        cv::resize(imMat, imMat, cv::Size(mHeight, mWidth), 0, 0, CV_INTER_LINEAR);
    }

    // convert to float (with range 0-255)
    imMat.convertTo(imMat, CV_32FC3);

    if ( !imMat.isContinuous() ) 
          imMat = imMat.clone();*/

    // copy 
    //CUDA( cudaMemcpy(mInputCPU, imMat.data, mInputSize, cudaMemcpyDefault) );
    //memcpy(mInputCPU, imMat.data, mInputSize);

    void* inferenceBuffers[] = { mInputCUDA, mOutputCUDA };

    if (timing)
    {
        // Record the stop event
        cudaEventRecord(stopPrep, NULL);

        // Wait for the stop event to complete
        cudaEventSynchronize(stopPrep);

        cudaEventElapsedTime(times, startPrep, stopPrep);

    }

     mContext->execute(1, inferenceBuffers);
     //CUDA(cudaDeviceSynchronize());\

    features.insert(features.end(), &mOutputCPU[0], &mOutputCPU[mOutputDims]);

    if (timing)
    {
        // Record the stop event
        cudaEventRecord(stopNet, NULL);

        // Wait for the stop event to complete
        cudaEventSynchronize(stopNet);

        cudaEventElapsedTime(times+1, startNet, stopNet);

    }

    return true;

}