コード例 #1
0
ファイル: profiler_test.cpp プロジェクト: jstebel/flow123d
TEST(Profiler, one_timer) {

    Profiler::initialize();
    double wait_time = clock_resolution();
    double total=0;
    { // uninitialize can not be in the same block as the START_TIMER
    START_TIMER("test_tag");
    EXPECT_EQ( 1, AC);
    total += wait(wait_time);
    END_TIMER("test_tag");

    START_TIMER("test_tag");
    EXPECT_EQ( total, ACT);
    EXPECT_EQ( 2, AC);
    total += wait(wait_time);
    total += wait(wait_time);
    END_TIMER("test_tag");

    START_TIMER("test_tag");
    EXPECT_EQ( total, ACT);
    EXPECT_EQ( 3, AC);

    }

    // test add_call
    {
        START_TIMER("add_call");
        ADD_CALLS(1000);
        EXPECT_EQ(1000, AC);
    }

    Profiler::uninitialize();

}
コード例 #2
0
ファイル: profiler_test.cpp プロジェクト: jbrezmorf/flow123d
void ProfilerTest::test_petsc_memory_monitor() {
    int ierr, mpi_rank;
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
    EXPECT_EQ( ierr, 0 );

    Profiler::initialize(); {
        PetscInt size = 10000;
        START_TIMER("A");
            Vec tmp_vector;
            VecCreateSeq(PETSC_COMM_SELF, size, &tmp_vector);
            VecDestroy(&tmp_vector);
            
            START_TIMER("C");
            END_TIMER("C");
        END_TIMER("A");
        
        START_TIMER("B");
            Vec tmp_vector1, tmp_vector2;
            VecCreateSeq(PETSC_COMM_SELF, size, &tmp_vector1);
            VecCreateSeq(PETSC_COMM_SELF, size, &tmp_vector2);
            VecDestroy(&tmp_vector1);
            VecDestroy(&tmp_vector2);
        END_TIMER("B");
    }
    PI->output(MPI_COMM_WORLD, cout);
    Profiler::uninitialize();
}
コード例 #3
0
ファイル: profiler_test.cpp プロジェクト: jbrezmorf/flow123d
 void ProfilerTest::test_one_timer() {
    const double TIMER_RESOLUTION = Profiler::get_resolution();
    const double DELTA = TIMER_RESOLUTION*1000;
    double total=0;
    Profiler::initialize();

    { // uninitialize can not be in the same block as the START_TIMER


    START_TIMER("test_tag");
        // test that number of calls of current timer is
        EXPECT_EQ( 1, ACC);

        // wait a TIMER_RESOLUTION time
        total += wait_sec(TIMER_RESOLUTION);
    END_TIMER("test_tag");



    START_TIMER("test_tag");
        // test that number of calls of current timer is
        EXPECT_EQ( 2, ACC);

        // test whether difference between measured time and total time is within TIMER_RESOLUTION
        EXPECT_LE( abs(ACT-total), DELTA);
        cout << "difference: " << abs(total-ACT) << ", tolerance: " << DELTA << endl;

        // wait a TIMER_RESOLUTION time
        total += wait_sec (TIMER_RESOLUTION);
        total += wait_sec (TIMER_RESOLUTION);

    END_TIMER("test_tag");

    START_TIMER("test_tag");
        EXPECT_EQ( 3, ACC);
        EXPECT_LE( abs(ACT-total), DELTA);
        cout << "difference: " << abs(total-ACT) << ", tolerance: " << DELTA << endl;
    }

    // test add_call
    {
        START_TIMER("add_call");
        ADD_CALLS(1000);
        EXPECT_EQ(1000, ACC);
    }

    // test absolute time
    {
        START_TIMER("one_second");
        wait_sec(1);
    }
    std::stringstream sout;
    PI->output(MPI_COMM_WORLD, sout);
    PI->output(MPI_COMM_WORLD, cout);

    //EXPECT_NE( sout.str().find("\"tag\": \"Whole Program\""), string::npos );

    Profiler::uninitialize();
}
コード例 #4
0
ファイル: profiler_test.cpp プロジェクト: jbrezmorf/flow123d
void ProfilerTest::test_memory_propagation(){
    const int SIZE = 25;
    int allocated_whole = 0;
    int allocated_A = 0;
    int allocated_B = 0;
    int allocated_C = 0;
    int allocated_D = 0;
    
    Profiler::initialize();
    {
        allocated_whole = MALLOC;
        allocated_whole += alloc_and_dealloc<int>(SIZE);
        EXPECT_EQ(MALLOC, allocated_whole);
        
        START_TIMER("A");
            allocated_A += alloc_and_dealloc<int>(10 * SIZE);
            EXPECT_EQ(MALLOC, allocated_A);
            
            START_TIMER("B");
                allocated_B += alloc_and_dealloc<int>(100 * SIZE);
                
                START_TIMER("C");
                    EXPECT_EQ(MALLOC, allocated_C);
                END_TIMER("C");
                allocated_B += allocated_C;
                
            END_TIMER("B");
            allocated_A += allocated_B;
            
            allocated_A += alloc_and_dealloc<int>(10 * SIZE);
            
            for(int i = 0; i < 5; i++) {
                START_TIMER("D");
                    allocated_D += alloc_and_dealloc<int>(1 * SIZE);
                END_TIMER("D");
                START_TIMER("D");
                    allocated_D += alloc_and_dealloc<int>(1 * SIZE);
                END_TIMER("D");
            }
            allocated_A += allocated_D;
            
            
        END_TIMER("A");
        allocated_whole += allocated_A;
    }
    PI->propagate_timers();
    EXPECT_EQ(MALLOC, allocated_whole);
    EXPECT_EQ(MALLOC, DEALOC);
    Profiler::uninitialize();
}
コード例 #5
0
ファイル: profiler_test.cpp プロジェクト: jbrezmorf/flow123d
// testing non-fatal functioning of Profiler when debug is off
TEST(Profiler, test_calls_only) {
    Profiler::initialize();
    START_TIMER("sub1");
    END_TIMER("sub1");
    PI->output(MPI_COMM_WORLD, cout);
    Profiler::uninitialize();

}
コード例 #6
0
ファイル: cfd.cpp プロジェクト: Ethan999/OpenDwarfs
void upload(cl_command_queue commands, cl_mem dst, T* src, int N)
{
	int err = clEnqueueWriteBuffer(commands, dst, CL_TRUE, 0, sizeof(T) * N, src, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CFD Data Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Unable to write memory to device");
}
コード例 #7
0
ファイル: cfd.cpp プロジェクト: Ethan999/OpenDwarfs
void download(cl_command_queue commands, T* dst, cl_mem src, int N)
{
	int err = clEnqueueReadBuffer(commands, src, CL_TRUE, 0, sizeof(T)*N, dst, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "CFD Data Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Unable to read memory from device");
}
コード例 #8
0
ファイル: profiler_test.cpp プロジェクト: jbrezmorf/flow123d
void ProfilerTest::test_memory_profiler() {
    const int ARR_SIZE = 1000;
    const int LOOP_CNT = 1000;
    Profiler::initialize();

    {
        START_TIMER("memory-profiler-int");
        // alloc and dealloc array of int
        for (int i = 0; i < LOOP_CNT; i++) alloc_and_dealloc<int>(ARR_SIZE);
        // test that we deallocated all allocated space
        EXPECT_EQ(MALLOC, DEALOC);
        // test that allocated space is correct size
        EXPECT_EQ(MALLOC, ARR_SIZE * LOOP_CNT * sizeof(int));
        END_TIMER("memory-profiler-int");


        START_TIMER("memory-profiler-double");
        // alloc and dealloc array of float
        for (int i = 0; i < LOOP_CNT; i++) alloc_and_dealloc<double>(ARR_SIZE);
        // test that we deallocated all allocated space
        EXPECT_EQ(MALLOC, DEALOC);
        // test that allocated space is correct size
        EXPECT_EQ(MALLOC, ARR_SIZE * LOOP_CNT * sizeof(double));
        END_TIMER("memory-profiler-double");


        START_TIMER("memory-profiler-simple");
        // alloc and dealloc array of float
        for (int i = 0; i < LOOP_CNT; i++) {
            int * j = new int;
            delete j;
        }
        // test that we deallocated all allocated space
        EXPECT_EQ(MALLOC, DEALOC);
        // test that allocated space is correct size
        EXPECT_EQ(MALLOC, LOOP_CNT * sizeof(int));
        END_TIMER("memory-profiler-simple");
    }

    PI->output(MPI_COMM_WORLD, cout);
    Profiler::uninitialize();
}
コード例 #9
0
ファイル: profiler_test.cpp プロジェクト: jbrezmorf/flow123d
void ProfilerTest::test_propagate_values() {
    int allocated = 0;
    Profiler::initialize(); {
            START_TIMER("A");
                START_TIMER("B");
                    START_TIMER("C");
                        allocated += alloc_and_dealloc<int>(25);
                    END_TIMER("C");
                END_TIMER("B");
                
                START_TIMER("D");
                END_TIMER("D");
                
                PI->propagate_timers();
                EXPECT_EQ(MALLOC, allocated);
            END_TIMER("A");
    }
    PI->output(MPI_COMM_WORLD, cout);
    Profiler::uninitialize();
}
コード例 #10
0
void IngredientInput::reloadCombos()
{
	//these only needed to be loaded once
	if ( ingredientBox->count() == 0 ) {
		START_TIMER("Loading ingredient input auto-completion");
		ingredientBox->reload();
		END_TIMER();
	}
	if ( headerBox->count() == 0 ) {
		START_TIMER("Loading ingredient header input auto-completion");
		headerBox->reload();
		END_TIMER();
	}
	if ( prepMethodBox->count() == 0 ) {
		START_TIMER("Loading prep method input auto-completion");
		prepMethodBox->reload();
		END_TIMER();
	}

	loadUnitListCombo();
}
コード例 #11
0
ファイル: profiler_test.cpp プロジェクト: jbrezmorf/flow123d
void ProfilerTest::test_petsc_memory() {
    int ierr, mpi_rank;
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
    EXPECT_EQ( ierr, 0 );
    
    Profiler::initialize(); {
        PetscLogDouble mem;
        START_TIMER("A");
            PetscInt size = 100*1000;
            PetscScalar value = 0.1;
            Vec tmp_vector;
            VecCreateSeq(PETSC_COMM_SELF, size, &tmp_vector);
            VecSet(tmp_vector, value);
            // VecSetRandom(tmp_vector, NULL);
        END_TIMER("A");
        
        START_TIMER("A");
            // allocated memory MUST be greater or equal to size * size of double
            EXPECT_GE(AN.petsc_memory_difference, size*sizeof(double));
        END_TIMER("A");
        
        START_TIMER("B");
            PetscScalar sum;
            VecSum(tmp_vector, &sum);
        END_TIMER("B");
        
        START_TIMER("C");
            VecDestroy(&tmp_vector);
        END_TIMER("C");
        
        START_TIMER("C");
            // since we are destroying vector, we expect to see negative memory difference
            EXPECT_LE(AN.petsc_memory_difference, 0);
        END_TIMER("C");
    }
    PI->output(MPI_COMM_WORLD, cout);
    Profiler::uninitialize();
}
コード例 #12
0
            void StableFitting::shrink(BoundingPolyhedron& poly,
                                      const std::vector<math::vec3>& points,
                                      math::vec3 backgroundPoint,
                                      float minimumDistance) const
            {
                START_TIMER(Shrinking);

                poly.positionAround(backgroundPoint, points);

                float step = poly.radius()/2.f;

                float minDistanceSquared = minimumDistance*minimumDistance;

                int originalPointsInside = countPointsInside(points, poly);

                for(int iIteration = 0; iIteration < mNoOfIterations; ++iIteration)
                {
                    float stepSquared = step*step;

                    for(auto vertex = poly.mVertices.rbegin(); vertex!=poly.mVertices.rend(); ++vertex)
                    {
                        //If too close, or move distance too great
                        const float distanceCentreToVertex = poly.centre().distanceSquared(*vertex);
                        if(distanceCentreToVertex < minDistanceSquared || distanceCentreToVertex < stepSquared)
                            continue;

                        math::vec3 moveNormal = (poly.centre() - *vertex).normalize();
                        const math::vec3 vec = moveNormal*step;

                        //Move
                        *vertex += vec;

                        int newPointsInside = countPointsInside(points, poly);

                        //Move back if movement violates rule
                        if(newPointsInside<originalPointsInside)
                            *vertex -= vec;
                    }

                    step *= 0.5f;
                }

                END_TIMER(Shrinking);
            }
コード例 #13
0
ファイル: profiler_test.cpp プロジェクト: jbrezmorf/flow123d
void ProfilerTest::test_structure() {
    Profiler::initialize();

    {
        START_TIMER("main");
        EXPECT_EQ("main", ATN);

        START_TIMER("sub1");
           EXPECT_EQ("sub1", ATN);
           START_TIMER("cross");
           EXPECT_EQ("cross", ATN);

        END_TIMER("sub1");
        EXPECT_EQ("main", ATN );


        START_TIMER("sub2");
            END_TIMER("cross");
            START_TIMER("sub_sub");
                START_TIMER("sub1");
                END_TIMER("sub1");
            END_TIMER("sub_sub");
        END_TIMER("sub2");

        START_TIMER("sub1");
        END_TIMER("sub1");
    }

    std::stringstream sout;
    PI->output(MPI_COMM_WORLD, cout);
    PI->output(MPI_COMM_WORLD, sout);


    int ierr, mpi_rank;
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
    EXPECT_EQ( ierr, 0 );
    
    // 0 processor will have valid profiler report
    // other processors should have empty string only
    if (mpi_rank == 0) {
        // when using find, we need to compare result to string::npos value (which indicates not found)
        EXPECT_NE( sout.str().find("\"tag\": \"Whole Program\""), string::npos );
        EXPECT_NE( sout.str().find("\"tag\": \"sub1\""), string::npos );
    } else {
        EXPECT_TRUE( sout.str().empty() );
    }
    
    Profiler::uninitialize();

}
コード例 #14
0
void countCandidatesStatic(size_t* globalWork, size_t* localWork, cl_mem episodeSupport, long eventSize, int level, int sType, int numCandidates,
                    cl_mem candidateTex, cl_mem intervalTex, cl_mem eventTex, cl_mem timeTex, size_t sharedMemNeeded)
{
    int errcode;
    errcode  = clSetKernelArg(kernel_countCandidatesStatic, 0, sizeof(cl_mem), (void *) &episodeSupport);
    errcode |= clSetKernelArg(kernel_countCandidatesStatic, 1, sizeof(long), (void *) &eventSize);
    errcode |= clSetKernelArg(kernel_countCandidatesStatic, 2, sizeof(int), (void *) &level);
    errcode |= clSetKernelArg(kernel_countCandidatesStatic, 3, sizeof(int), (void *) &sType);
    errcode |= clSetKernelArg(kernel_countCandidatesStatic, 4, sizeof(int), (void *) &numCandidates);
    errcode |= clSetKernelArg(kernel_countCandidatesStatic, 5, sizeof(cl_mem), (void *) &candidateTex);
    errcode |= clSetKernelArg(kernel_countCandidatesStatic, 6, sizeof(cl_mem), (void *) &intervalTex);
    errcode |= clSetKernelArg(kernel_countCandidatesStatic, 7, sizeof(cl_mem), (void *) &eventTex);
    errcode |= clSetKernelArg(kernel_countCandidatesStatic, 8, sizeof(cl_mem), (void *) &timeTex);
    errcode |= clSetKernelArg(kernel_countCandidatesStatic, 9, sharedMemNeeded, NULL);
    CHKERR(errcode, "Unable to set arguments for countCandidates");

	errcode = clEnqueueNDRangeKernel(commands, kernel_countCandidatesStatic, 3, NULL, globalWork, localWork, 0, NULL, &ocdTempEvent);
    	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "TDM Candidate Kernels", ocdTempTimer)
    END_TIMER(ocdTempTimer)
    CHKERR(errcode, "Error running countCandidatesStatic");
}
コード例 #15
0
ファイル: profiler_test.cpp プロジェクト: jstebel/flow123d
TEST(Profiler, structure) {
    Profiler::initialize();

    {
        START_TIMER("main");
        EXPECT_EQ("main", AT );

        START_TIMER("sub1");
           EXPECT_EQ("sub1", AT);
           START_TIMER("cross");
           EXPECT_EQ("cross", AT);

        END_TIMER("sub1");
        EXPECT_EQ("main", AT );


        START_TIMER("sub2");
            END_TIMER("cross");
            START_TIMER("sub_sub");
                START_TIMER("sub1");
                END_TIMER("sub1");
            END_TIMER("sub_sub");
        END_TIMER("sub2");

        START_TIMER("sub1");
        END_TIMER("sub1");
    }

    std::stringstream sout;
    Profiler::instance()->output(MPI_COMM_WORLD, cout);
    Profiler::instance()->output(MPI_COMM_WORLD, sout);

    EXPECT_TRUE( sout.str().find("Whole Program   0") );
    EXPECT_TRUE( sout.str().find("  sub1          2") );

    Profiler::uninitialize();

}
コード例 #16
0
ファイル: csr.c プロジェクト: daz88/OpenDwarfs
int main(int argc, char** argv)
{
	cl_int err;
	int usegpu = USEGPU;
    int do_verify = 0;
    int opt, option_index=0;

    unsigned int correct;

    size_t global_size;
    size_t local_size;

    cl_device_id device_id;
    cl_context context;
    cl_command_queue commands;
    cl_program program;
    cl_kernel kernel;

    stopwatch sw;

    cl_mem csr_ap;
    cl_mem csr_aj;
    cl_mem csr_ax;
    cl_mem x_loc;
    cl_mem y_loc;

    FILE *kernelFile;
    char *kernelSource;
    size_t kernelLength;
    size_t lengthRead;


    ocd_init(&argc, &argv, NULL);
    ocd_options opts = ocd_get_options();
    platform_id = opts.platform_id;
    n_device = opts.device_id;

    while ((opt = getopt_long(argc, argv, "::vc::", 
                            long_options, &option_index)) != -1 ) {
      switch(opt){
        //case 'i':
          //input_file = optarg;
          //break;
        case 'v':
          fprintf(stderr, "verify\n");
          do_verify = 1;
          break;
        case 'c':
          fprintf(stderr, "using cpu\n");
          usegpu = 0;
	  break;
        default:
          fprintf(stderr, "Usage: %s [-v Warning: lots of output] [-c use CPU]\n",
                  argv[0]);
          exit(EXIT_FAILURE);
      }
  }

    /* Fill input set with random float values */
    int i;

    csr_matrix csr;
    csr = laplacian_5pt(512);
    int k = 0;
      for(k = 0; k < csr.num_nonzeros; k++){
         csr.Ax[k] = 1.0 - 2.0 * (rand() / (RAND_MAX + 1.0));
      }

    //The other arrays
    float * x_host = float_new_array(csr.num_cols);
    float * y_host = float_new_array(csr.num_rows);

    unsigned int ii;
    for(ii = 0; ii < csr.num_cols; ii++){
        x_host[ii] = rand() / (RAND_MAX + 1.0);
    }
    for(ii = 0; ii < csr.num_rows; ii++){
        y_host[ii] = rand() / (RAND_MAX + 2.0);
    }

    /* Retrieve an OpenCL platform */
    device_id = GetDevice(platform_id, n_device);

    /* Create a compute context */
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    CHKERR(err, "Failed to create a compute context!");

    /* Create a command queue */
    commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
    CHKERR(err, "Failed to create a command queue!");

    /* Load kernel source */
    kernelFile = fopen("spmv_csr_kernel.cl", "r");
    fseek(kernelFile, 0, SEEK_END);
    kernelLength = (size_t) ftell(kernelFile);
    kernelSource = (char *) malloc(sizeof(char)*kernelLength);
    rewind(kernelFile);
    lengthRead = fread((void *) kernelSource, kernelLength, 1, kernelFile);
    fclose(kernelFile);

    /* Create the compute program from the source buffer */
    program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, &kernelLength, &err);
    CHKERR(err, "Failed to create a compute program!");

    /* Free kernel source */
    free(kernelSource);

    /* Build the program executable */
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err == CL_BUILD_PROGRAM_FAILURE)                                                                                                                                       
    {
        char *buildLog;
        size_t logLen;
        err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLen);
        buildLog = (char *) malloc(sizeof(char)*logLen);
        err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logLen, (void *) buildLog, NULL);
        fprintf(stderr, "CL Error %d: Failed to build program! Log:\n%s", err, buildLog);
        free(buildLog);
        exit(1);
    }
    CHKERR(err, "Failed to build program!");

    /* Create the compute kernel in the program we wish to run */
    kernel = clCreateKernel(program, "csr", &err);
    CHKERR(err, "Failed to create a compute kernel!");

    /* Create the input and output arrays in device memory for our calculation */
    csr_ap = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int)*csr.num_rows+4, NULL, &err);
    CHKERR(err, "Failed to allocate device memory!");
    csr_aj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int)*csr.num_nonzeros, NULL, &err);
    CHKERR(err, "Failed to allocate device memory!");
    csr_ax = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_nonzeros, NULL, &err);
    CHKERR(err, "Failed to allocate device memory!");
    x_loc = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_cols, NULL, &err);
    CHKERR(err, "Failed to allocate device memory!");
    y_loc = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_rows, NULL, &err);
    CHKERR(err, "Failed to allocate device memory!");

    /* beginning of timing point */
    stopwatch_start(&sw); 
   
    /* Write our data set into the input array in device memory */
	err = clEnqueueWriteBuffer(commands, csr_ap, CL_TRUE, 0, sizeof(unsigned int)*csr.num_rows+4, csr.Ap, 0, NULL, &ocdTempEvent);
        clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
    CHKERR(err, "Failed to write to source array!");
    err = clEnqueueWriteBuffer(commands, csr_aj, CL_TRUE, 0, sizeof(unsigned int)*csr.num_nonzeros, csr.Aj, 0, NULL, &ocdTempEvent);
        clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
    CHKERR(err, "Failed to write to source array!");
    err = clEnqueueWriteBuffer(commands, csr_ax, CL_TRUE, 0, sizeof(float)*csr.num_nonzeros, csr.Ax, 0, NULL, &ocdTempEvent);
        clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
    CHKERR(err, "Failed to write to source array!");
    err = clEnqueueWriteBuffer(commands, x_loc, CL_TRUE, 0, sizeof(float)*csr.num_cols, x_host, 0, NULL, &ocdTempEvent);
        clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
    CHKERR(err, "Failed to write to source array!");
    err = clEnqueueWriteBuffer(commands, y_loc, CL_TRUE, 0, sizeof(float)*csr.num_rows, y_host, 0, NULL, &ocdTempEvent);
        clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer)
    CHKERR(err, "Failed to write to source array!");
	END_TIMER(ocdTempTimer)
    /* Set the arguments to our compute kernel */
    err = 0;
    err = clSetKernelArg(kernel, 0, sizeof(unsigned int), &csr.num_rows);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &csr_ap);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &csr_aj);
    err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &csr_ax);
    err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &x_loc);
    err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &y_loc);
    CHKERR(err, "Failed to set kernel arguments!");

    /* Get the maximum work group size for executing the kernel on the device */
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
    CHKERR(err, "Failed to retrieve kernel work group info!");

    /* Execute the kernel over the entire range of our 1d input data set */
    /* using the maximum number of work group items for this device */
    global_size = csr.num_rows;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &ocdTempEvent);
        clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CSR Kernel", ocdTempTimer)
    END_TIMER(ocdTempTimer)
    CHKERR(err, "Failed to execute kernel!");

    /* Wait for the command commands to get serviced before reading back results */
    float output[csr.num_rows];
    
    /* Read back the results from the device to verify the output */
	err = clEnqueueReadBuffer(commands, y_loc, CL_TRUE, 0, sizeof(float)*csr.num_rows, output, 0, NULL, &ocdTempEvent);
        clFinish(commands);
    	START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "CSR Data Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to read output array!");

    /* end of timing point */
    stopwatch_stop(&sw);
    printf("Time consumed(ms): %lf Gflops: %f \n", 1000*get_interval_by_sec(&sw), (2.0 * (double) csr.num_nonzeros / get_interval_by_sec(&sw)) / 1e9);

   /* Validate our results */
   if(do_verify){
       for (i = 0; i < csr.num_rows; i++){
           printf("row: %d	output: %f \n", i, output[i]);  
       }
   }

   int row = 0;
   float sum = 0;
   int row_start = 0;
   int row_end = 0;
   for(row =0; row < csr.num_rows; row++){     
        sum = y_host[row];
        
        row_start = csr.Ap[row];
        row_end   = csr.Ap[row+1];
        
        unsigned int jj = 0;
        for (jj = row_start; jj < row_end; jj++){             
            sum += csr.Ax[jj] * x_host[csr.Aj[jj]];      
        }
        y_host[row] = sum;
    }
    for (i = 0; i < csr.num_rows; i++){
        if((fabsf(y_host[i]) - fabsf(output[i])) > .001)
             printf("Possible error, difference greater then .001 at row %d \n", i);
    }

    /* Print a brief summary detailing the results */
    ocd_finalize();

    /* Shutdown and cleanup */
    clReleaseMemObject(csr_ap);
    clReleaseMemObject(csr_aj);
    clReleaseMemObject(csr_ax);
    clReleaseMemObject(x_loc);
    clReleaseMemObject(y_loc);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    return 0;
}
コード例 #17
0
void IngredientMatcherDialog::findRecipes( void )
{
	KApplication::setOverrideCursor( Qt::WaitCursor );

	START_TIMER("Ingredient Matcher: loading database data");

	RecipeList rlist;
	database->loadRecipes( &rlist, RecipeDB::Title | RecipeDB::NamesOnly | RecipeDB::Ingredients | RecipeDB::IngredientAmounts );

	END_TIMER();
	START_TIMER("Ingredient Matcher: analyzing data for matching recipes");

	// Clear the list
	recipeListView->listView() ->clear();

	// Now show the recipes with ingredients that are contained in the previous set
	// of ingredients
	RecipeList incompleteRecipes;
	QList <int> missingNumbers;
	Q3ValueList <IngredientList> missingIngredients;

	RecipeList::Iterator it;
	for ( it = rlist.begin();it != rlist.end();++it ) {
		IngredientList il = ( *it ).ingList;
		if ( il.isEmpty() )
			continue;

		IngredientList missing;
		if ( m_ingredientList.containsSubSet( il, missing, true, database ) ) {
			new CustomRecipeListItem( recipeListView->listView(), *it );
		}
		else {
			incompleteRecipes.append( *it );
			missingIngredients.append( missing );
			missingNumbers.append( missing.count() );
		}
	}
	END_TIMER();

	//Check if the user wants to show missing ingredients

	if ( missingNumberSpinBox->value() == 0 ) {
		KApplication::restoreOverrideCursor();
		return ;
	} //"None"



	START_TIMER("Ingredient Matcher: searching for and displaying partial matches");

	IngredientList requiredIngredients;
	for ( Q3ListViewItem *it = ingListView->listView()->firstChild(); it; it = it->nextSibling() ) {
		if ( ((Q3CheckListItem*)it)->isOn() )
			requiredIngredients << *m_item_ing_map[it];
	}

	// Classify recipes with missing ingredients in different lists by amount
	QList<int>::Iterator nit;
	Q3ValueList<IngredientList>::Iterator ilit;
	int missingNoAllowed = missingNumberSpinBox->value();

	if ( missingNoAllowed == -1 )  // "Any"
	{
		for ( nit = missingNumbers.begin();nit != missingNumbers.end();++nit )
			if ( ( *nit ) > missingNoAllowed )
				missingNoAllowed = ( *nit );
	}


	for ( int missingNo = 1; missingNo <= missingNoAllowed; missingNo++ ) {
		nit = missingNumbers.begin();
		ilit = missingIngredients.begin();

		bool titleShownYet = false;

		for ( it = incompleteRecipes.begin();it != incompleteRecipes.end();++it, ++nit, ++ilit ) {
			if ( !( *it ).ingList.containsAny( m_ingredientList ) )
				continue;

			if ( !( *it ).ingList.containsSubSet( requiredIngredients ) )
				continue;

			if ( ( *nit ) == missingNo ) {
				if ( !titleShownYet ) {
					new SectionItem( recipeListView->listView(), i18ncp( "@label:textbox", "You are missing 1 ingredient for:", "You are missing %1 ingredients for:", missingNo ) );
					titleShownYet = true;
				}
				new CustomRecipeListItem( recipeListView->listView(), *it, *ilit );
			}
		}
	}
	END_TIMER();

	KApplication::restoreOverrideCursor();
}
コード例 #18
0
ファイル: kmeans_opencl.cpp プロジェクト: daz88/OpenDwarfs
void allocateMemory(int npoints, int nfeatures, int nclusters, float **features)
{	
    cl_int errcode;
    size_t globalWorkSize;
    size_t localWorkSize;

	num_blocks = npoints / num_threads;
	if (npoints % num_threads > 0)		/* defeat truncation */
		num_blocks++;

	num_blocks_perdim = sqrt((double) num_blocks);
	while (num_blocks_perdim * num_blocks_perdim < num_blocks)	// defeat truncation (should run once)
		num_blocks_perdim++;

	num_blocks = num_blocks_perdim*num_blocks_perdim;

	/* allocate memory for memory_new[] and initialize to -1 (host) */
	membership_new = (int*) malloc(npoints * sizeof(int));
	for(int i=0;i<npoints;i++) {
		membership_new[i] = -1;
	}

	/* allocate memory for block_new_centers[] (host) */
	block_new_centers = (float *) malloc(nclusters*nfeatures*sizeof(float));
	
	/* allocate memory for feature_flipped_d[][], feature_d[][] (device) */
    feature_flipped_d = clCreateBuffer(clContext, CL_MEM_READ_ONLY, npoints*nfeatures*sizeof(float), NULL, &errcode);
    CHECKERR(errcode);
	 
    errcode = clEnqueueWriteBuffer(clCommands, feature_flipped_d, CL_TRUE, 0, npoints*nfeatures*sizeof(float), features[0], 0, NULL, &ocdTempEvent);
    
    clFinish(clCommands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Point/Feature Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHECKERR(errcode);
    feature_d = clCreateBuffer(clContext, CL_MEM_READ_WRITE, npoints*nfeatures*sizeof(float), NULL, &errcode);
    CHECKERR(errcode);
		
	/* invert the data array (kernel execution) */	
    unsigned int arg = 0;
    errcode = clSetKernelArg(clKernel_invert_mapping, arg++, sizeof(cl_mem), (void *) &feature_flipped_d);
    errcode |= clSetKernelArg(clKernel_invert_mapping, arg++, sizeof(cl_mem), (void *) &feature_d);
    errcode |= clSetKernelArg(clKernel_invert_mapping, arg++, sizeof(int), (void *) &npoints);
    errcode |= clSetKernelArg(clKernel_invert_mapping, arg++, sizeof(int), (void *) &nfeatures);
    CHECKERR(errcode);
    globalWorkSize = num_blocks*num_threads;
    localWorkSize = num_threads;
	 
    errcode = clEnqueueNDRangeKernel(clCommands, clKernel_invert_mapping, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &ocdTempEvent);
    clFinish(clCommands);
    	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Invert Mapping Kernel", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHECKERR(errcode);
		
	/* allocate memory for membership_d[] and clusters_d[][] (device) */
    membership_d = clCreateBuffer(clContext, CL_MEM_READ_WRITE, npoints*sizeof(int), NULL, &errcode);
    CHECKERR(errcode);
    clusters_d = clCreateBuffer(clContext, CL_MEM_READ_ONLY, nclusters*nfeatures*sizeof(float), NULL, &errcode);
    CHECKERR(errcode);

	
#ifdef BLOCK_DELTA_REDUCE
	// allocate array to hold the per block deltas on the gpu side
	
    block_deltas_d = clCreateBuffer(clContext, CL_MEM_READ_WRITE, num_blocks_perdim*num_blocks_perdim*sizeof(int), NULL, &errcode);
    CHECKERR(errcode);
	//cudaMemcpy(block_delta_d, &delta_h, sizeof(int), cudaMemcpyHostToDevice);
#endif

#ifdef BLOCK_CENTER_REDUCE
	// allocate memory and copy to card cluster  array in which to accumulate center points for the next iteration
    block_clusters_d = clCreateBuffer(clContext, CL_MEM_READ_WRITE, num_blocks_perdim*num_blocks_perdim*nclusters*nfeatures*sizeof(float), NULL, &errcode);
    CHECKERR(errcode);
	//cudaMemcpy(new_clusters_d, new_centers[0], nclusters*nfeatures*sizeof(float), cudaMemcpyHostToDevice);
#endif

}
コード例 #19
0
ファイル: cfd.cpp プロジェクト: Ethan999/OpenDwarfs
int main(int argc, char** argv)
{
	ocd_init(&argc, &argv, NULL);
	ocd_initCL();

	cl_int err;

	size_t global_size;
	size_t local_size;

	cl_program program;
	cl_kernel kernel_compute_flux;
	cl_kernel kernel_compute_flux_contributions;
	cl_kernel kernel_compute_step_factor;
	cl_kernel kernel_time_step;
	cl_kernel kernel_initialize_variables;

	cl_mem ff_variable;
	cl_mem ff_fc_momentum_x;
	cl_mem ff_fc_momentum_y;
	cl_mem ff_fc_momentum_z;
	cl_mem ff_fc_density_energy;

	if (argc < 2)
	{
		printf("Usage ./cfd <data input file>\n");
		return 0;
	}


	const char* data_file_name = argv[1];


	// set far field conditions and load them into constant memory on the gpu
	{
		float h_ff_variable[NVAR];
		const float angle_of_attack = (float)(3.1415926535897931 / 180.0) * (float)(deg_angle_of_attack);

		h_ff_variable[VAR_DENSITY] = (float)(1.4);

		float ff_pressure = (float)(1.0);
		float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]);
		float ff_speed = (float)(ff_mach)*ff_speed_of_sound;

		float3 ff_velocity;
		ff_velocity.x = ff_speed*(float)(cos((float)angle_of_attack));
		ff_velocity.y = ff_speed*(float)(sin((float)angle_of_attack));
		ff_velocity.z = 0.0;

		h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x;
		h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y;
		h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z;

		h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*((float)(0.5)*(ff_speed*ff_speed)) + (ff_pressure / (float)(GAMMA-1.0));

		float3 h_ff_momentum;
		h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0);
		h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1);
		h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2);
		float3 h_ff_fc_momentum_x;
		float3 h_ff_fc_momentum_y;
		float3 h_ff_fc_momentum_z;
		float3 h_ff_fc_density_energy;
		compute_flux_contribution(&h_ff_variable[VAR_DENSITY], &h_ff_momentum,
				&h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, &ff_velocity,
				&h_ff_fc_momentum_x, &h_ff_fc_momentum_y, &h_ff_fc_momentum_z,
				&h_ff_fc_density_energy);

		// copy far field conditions to the gpu
		ff_variable = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * NVAR, h_ff_variable, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_x, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_y, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_momentum_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_z, &err);
		CHKERR(err, "Unable to allocate ff data");
		ff_fc_density_energy = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_density_energy, &err);
		CHKERR(err, "Unable to allocate ff data");
	}
	int nel;
	int nelr;

	// read in domain geometry
	cl_mem areas;
	cl_mem elements_surrounding_elements;
	cl_mem normals;
	{
		std::ifstream file(data_file_name);

		file >> nel;

		nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length));

		//float* h_areas = new float[nelr];
		//int* h_elements_surrounding_elements = new int[nelr*NNB];
		//float* h_normals = new float[nelr*NDIM*NNB];

		float* h_areas ;
		int* h_elements_surrounding_elements ;
		float* h_normals ;

        h_areas                         = (float*)  memalign(AOCL_ALIGNMENT,nelr*sizeof(float));
        h_elements_surrounding_elements = (int*)    memalign(AOCL_ALIGNMENT,nelr*NNB*sizeof(int));
        h_normals                       = (float *) memalign(AOCL_ALIGNMENT,nelr*NDIM*NNB*sizeof(float));


        //posix_memalign(&h_areas                         , AOCL_ALIGNMENT, nelr);
        //posix_memalign(&h_elements_surrounding_elements , AOCL_ALIGNMENT, nelr*NNB);
        //posix_memalign(&h_normals                       , AOCL_ALIGNMENT, nelr*NDIM*NNB);


		// read in data
		for(int i = 0; i < nel; i++)
		{
			file >> h_areas[i];
			for(int j = 0; j < NNB; j++)
			{
				file >> h_elements_surrounding_elements[i + j*nelr];
				if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1;
				h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering

				for(int k = 0; k < NDIM; k++)
				{
					file >> h_normals[i + (j + k*NNB)*nelr];
					h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr];
				}
			}
		}

		// fill in remaining data
		int last = nel-1;
		for(int i = nel; i < nelr; i++)
		{
			h_areas[i] = h_areas[last];
			for(int j = 0; j < NNB; j++)
			{
				// duplicate the last element
				h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr];
				for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr];
			}
		}

		areas = alloc<float>(context, nelr);
		upload<float>(commands, areas, h_areas, nelr);

		elements_surrounding_elements = alloc<int>(context, nelr*NNB);
		upload<int>(commands, elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB);

		normals = alloc<float>(context, nelr*NDIM*NNB);
		upload<float>(commands, normals, h_normals, nelr*NDIM*NNB);

		delete[] h_areas;
		delete[] h_elements_surrounding_elements;
		delete[] h_normals;
	}

	char* kernel_files;
	int num_kernels = 20;
	kernel_files = (char*) malloc(sizeof(char*)*num_kernels);

	strcpy(kernel_files,"cfd_kernel");
      
    program = ocdBuildProgramFromFile(context,device_id,kernel_files, NULL);

	// Create the compute kernel in the program we wish to run
	kernel_compute_flux = clCreateKernel(program, "compute_flux", &err);
	CHKERR(err, "Failed to create a compute kernel!");

	// Create the reduce kernel in the program we wish to run
	kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err);
	CHKERR(err, "Failed to create a compute_flux_contributions kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err);
	CHKERR(err, "Failed to create a compute_step_factor kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_time_step = clCreateKernel(program, "time_step", &err);
	CHKERR(err, "Failed to create a time_step kernel!");
	// Create the reduce kernel in the program we wish to run
	kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err);
	CHKERR(err, "Failed to create a initialize_variables kernel!");

	// Create arrays and set initial conditions
	cl_mem variables = alloc<cl_float>(context, nelr*NVAR);

	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	//err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!");
	local_size = 1;//std::min(local_size, (size_t)nelr);
	global_size = nelr;
	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	err = clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0");


	cl_mem old_variables = alloc<float>(context, nelr*NVAR);
	cl_mem fluxes = alloc<float>(context, nelr*NVAR);
	cl_mem step_factors = alloc<float>(context, nelr);
	clFinish(commands);
	cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM);
	cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM);
	cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM);
	cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM);
	clFinish(commands);

	// make sure all memory is floatly allocated before we start timing
	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!");
	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1");
	err = 0;
	err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
	err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes);
	err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
	CHKERR(err, "Failed to set kernel arguments!");
	// Get the maximum work group size for executing the kernel on the device
	err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
	CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!");

	err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 2");
	std::cout << "About to memcopy" << std::endl;
	err = clReleaseMemObject(step_factors);
	float temp[nelr];
	for(int i = 0; i < nelr; i++)
		temp[i] = 0;
	step_factors = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * nelr, temp, &err);
	CHKERR(err, "Unable to memset step_factors");
	// make sure CUDA isn't still doing something before we start timing

	clFinish(commands);

	// these need to be computed the first time in order to compute time step
	std::cout << "Starting..." << std::endl;


	// Begin iterations
	for(int i = 0; i < iterations; i++)
	{
		copy<float>(commands, old_variables, variables, nelr*NVAR);

		// for the first iteration we compute the time step
		err = 0;
		err = clSetKernelArg(kernel_compute_step_factor, 0, sizeof(int), &nelr);
		err |= clSetKernelArg(kernel_compute_step_factor, 1, sizeof(cl_mem),&variables);
		err |= clSetKernelArg(kernel_compute_step_factor, 2, sizeof(cl_mem), &areas);
		err |= clSetKernelArg(kernel_compute_step_factor, 3, sizeof(cl_mem), &step_factors);
		CHKERR(err, "Failed to set kernel arguments!");
		// Get the maximum work group size for executing the kernel on the device
		err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
		CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!");
		err = clEnqueueNDRangeKernel(commands, kernel_compute_step_factor, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Step Factor Kernel", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "Failed to execute kernel[kernel_compute_step_factor]!");
		for(int j = 0; j < RK; j++)
		{
			err = 0;
			err = clSetKernelArg(kernel_compute_flux_contributions, 0, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 1, sizeof(cl_mem),&variables);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 2, sizeof(cl_mem), &fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 3, sizeof(cl_mem), &fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 4, sizeof(cl_mem), &fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux_contributions, 5, sizeof(cl_mem), &fc_density_energy);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_compute_flux_contributions, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_compute_flux_contributions work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_compute_flux_contributions, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Contribution Kernel", ocdTempTimer)
			//compute_flux_contributions(nelr, variables, fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy);
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_compute_flux_contributions]!");
			err = 0;
			err = clSetKernelArg(kernel_compute_flux, 0, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_compute_flux, 1, sizeof(cl_mem), &elements_surrounding_elements);
			err |= clSetKernelArg(kernel_compute_flux, 2, sizeof(cl_mem), &normals);
			err |= clSetKernelArg(kernel_compute_flux, 3, sizeof(cl_mem), &variables);
			err |= clSetKernelArg(kernel_compute_flux, 4, sizeof(cl_mem), &fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux, 5, sizeof(cl_mem), &fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux, 6, sizeof(cl_mem), &fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux, 7, sizeof(cl_mem), &fc_density_energy);
			err |= clSetKernelArg(kernel_compute_flux, 8, sizeof(cl_mem), &fluxes);
			err |= clSetKernelArg(kernel_compute_flux, 9, sizeof(cl_mem), &ff_variable);
			err |= clSetKernelArg(kernel_compute_flux, 10, sizeof(cl_mem), &ff_fc_momentum_x);
			err |= clSetKernelArg(kernel_compute_flux, 11, sizeof(cl_mem), &ff_fc_momentum_y);
			err |= clSetKernelArg(kernel_compute_flux, 12, sizeof(cl_mem), &ff_fc_momentum_z);
			err |= clSetKernelArg(kernel_compute_flux, 13, sizeof(cl_mem), &ff_fc_density_energy);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_compute_flux, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_compute_flux work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_compute_flux, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Kernel", ocdTempTimer)
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_compute_flux]!");
			err = 0;
			err = clSetKernelArg(kernel_time_step, 0, sizeof(int), &j);
			err |= clSetKernelArg(kernel_time_step, 1, sizeof(int), &nelr);
			err |= clSetKernelArg(kernel_time_step, 2, sizeof(cl_mem), &old_variables);
			err |= clSetKernelArg(kernel_time_step, 3, sizeof(cl_mem), &variables);
			err |= clSetKernelArg(kernel_time_step, 4, sizeof(cl_mem), &step_factors);
			err |= clSetKernelArg(kernel_time_step, 5, sizeof(cl_mem), &fluxes);
			CHKERR(err, "Failed to set kernel arguments!");
			// Get the maximum work group size for executing the kernel on the device
			err = clGetKernelWorkGroupInfo(kernel_time_step, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
			CHKERR(err, "Failed to retrieve kernel_time_step work group info!");
			err = clEnqueueNDRangeKernel(commands, kernel_time_step, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
			clFinish(commands);
			START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Time Step Kernel", ocdTempTimer)
			END_TIMER(ocdTempTimer)
			CHKERR(err, "Failed to execute kernel [kernel_time_step]!");
		}
	}

	clFinish(commands);
	std::cout << "Finished" << std::endl;
	std::cout << "Saving solution..." << std::endl;
	dump(commands, variables, nel, nelr);
	std::cout << "Saved solution..." << std::endl;
	std::cout << "Cleaning up..." << std::endl;

	clReleaseProgram(program);
	clReleaseKernel(kernel_compute_flux);
	clReleaseKernel(kernel_compute_flux_contributions);
	clReleaseKernel(kernel_compute_step_factor);
	clReleaseKernel(kernel_time_step);
	clReleaseKernel(kernel_initialize_variables);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	dealloc<float>(areas);
	dealloc<int>(elements_surrounding_elements);
	dealloc<float>(normals);

	dealloc<float>(variables);
	dealloc<float>(old_variables);
	dealloc<float>(fluxes);
	dealloc<float>(step_factors);
	dealloc<float>(fc_momentum_x);
	dealloc<float>(fc_momentum_y);
	dealloc<float>(fc_momentum_z);
	dealloc<float>(fc_density_energy);

	std::cout << "Done..." << std::endl;
	ocd_finalize();
	return 0;
}
コード例 #20
0
int main(int argc, char ** argv)
{
	ocd_init(&argc, &argv, NULL);
	ocd_initCL();

	if (argc < 3)
	{
		printf("Calculate similarities between two strings.\n");
		printf("Maximum length of each string is: %d\n", MAX_LEN);
		printf("Usage: %s query database\n", argv[0]);
		printf("or: %s query database [openPenalty extensionPenalty block#]\n", argv[0]);
		printf("openPenalty (5.0), extensionPenalty (0.5)\n");
		return 1;
	}



	/////////////////////////////////////
	//      00 --> 01
	//		|	   |	
	//		10 --> 11
	////////////////////////////////////
	char queryFilePathName[255], dbDataFilePathName[255], dbLenFilePathName[255];
	int querySize, subSequenceNum, subSequenceSize;
	float openPenalty, extensionPenalty;
	int coalescedOffset = COALESCED_OFFSET;
	int nblosumWidth = 23;
	size_t blockSize = 64;
	size_t setZeroThreadNum, mfThreadNum;
	int blockNum = 14;

	cl_ulong maxLocalSize;

	int arraySize;

	struct timeval t1, t2;
	float tmpTime;
	FILE *pfile;

	//record time
	memset(&strTime, 0, sizeof(STRUCT_TIME));
	timerStart();

	openPenalty = 5.0f;
	extensionPenalty = 0.5;

	if (argc == 6)
	{
		openPenalty = atof(argv[3]);
		extensionPenalty = atof(argv[4]);
		blockNum = atoi(argv[5]);
	}

	//relocated to after MAX_COMPUTE_UNITS check
	//mfThreadNum = blockNum * blockSize;

	cl_program hProgram;
	cl_kernel hMatchStringKernel, hTraceBackKernel, hSetZeroKernel;
	size_t sourceFileSize;
	char *cSourceCL = NULL;

	//err = clGetPlatformIDs(1, &platformID, NULL);
	//CHKERR(err, "Get platform ID error!");

	cl_int err;

	//check to make sure the device supports this block count
	//then scale threads appropriately
	cl_uint devBlockNum = 0;
	CHKERR(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS,\
				sizeof(cl_uint), &devBlockNum, 0), \
			"Error while querying CL_DEVICE_MAX_COMPUTE_UNITS.");
	if (devBlockNum == MIN(blockNum, devBlockNum)) {
		printf("Scaling blocks from %d to %d to fit on device\n",\
				blockNum, devBlockNum);
		blockNum = devBlockNum;
	}
	mfThreadNum = blockNum * blockSize;

	CHKERR(clGetDeviceInfo(device_id, CL_DEVICE_LOCAL_MEM_SIZE,\
				sizeof(cl_ulong), &maxLocalSize, 0), \
			"Error while querying CL_DEVICE_LOCAL_MEM_SIZE.");

	//load the source file
	char kernel_file[] = "kernels.cl";
	cSourceCL = loadSource(kernel_file, &sourceFileSize);

	hProgram = clCreateProgramWithSource(context, 1, (const char **)&cSourceCL, 
			&sourceFileSize, &err);
	CHKERR(err, "Create program with source error");

	err = clBuildProgram(hProgram, 0, 0, 0, 0, 0);
	//debug================================
	int logSize = 3000, i;
	size_t retSize;
	char logTxt[3000];
	err = clGetProgramBuildInfo(hProgram, device_id, CL_PROGRAM_BUILD_LOG, logSize, logTxt, &retSize);
	for (i = 0; i < retSize; i++)
	{
		printf("%c", logTxt[i]);
	}
	//===================================
	CHKERR(err, "Build program error");

	hMatchStringKernel = clCreateKernel(hProgram, "MatchStringGPUSync", &err);
	CHKERR(err, "Create MatchString kernel error");
	hTraceBackKernel = clCreateKernel(hProgram, "trace_back2", &err);
	CHKERR(err, "Create trace_back2 kernel error");
	hSetZeroKernel = clCreateKernel(hProgram, "setZero", &err);
	CHKERR(err, "Create setZero kernel error");

	sprintf(queryFilePathName, "%s", argv[1]);
	sprintf(dbDataFilePathName, "%s.data", argv[2]);
	sprintf(dbLenFilePathName, "%s.loc", argv[2]);

	char *allSequences, *querySequence, *subSequence;
	char *seq1, *seq2;
	cl_mem seq1D, seq2D;

	allSequences = new char[2 * (MAX_LEN)];
	if (allSequences == NULL)
	{
		printf("Allocate sequence buffer error!\n");
		return 1;
	}
	querySequence = allSequences;

	seq1D = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_char) * MAX_LEN, 0, &err);
	CHKERR(err, "Create seq1D memory");
	seq2D = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_char) * MAX_LEN, 0, &err);
	CHKERR(err, "Create seq2D memory");

	//read query sequence
	querySize = readQuerySequence(queryFilePathName, querySequence);
	if (querySize <= 0 || querySize > MAX_LEN)
	{
		printf("Query size %d is out of range (0, %d)\n",
				MAX_LEN,
				querySize);
		return 1;
	}
	encoding(querySequence, querySize);
	subSequence = allSequences + querySize;

	//allocate output sequence buffer
	char *outSeq1, *outSeq2;
	outSeq1 = new char[2 * MAX_LEN];
	outSeq2 = new char[2 * MAX_LEN];
	if (outSeq1 == NULL ||
			outSeq2 == NULL)
	{
		printf("Allocate output sequence buffer on host error!\n");
		return 1;
	}

	cl_mem outSeq1D, outSeq2D;
	outSeq1D = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_char) * MAX_LEN * 2, 0, &err);
	CHKERR(err, "Create outSeq1D memory");
	outSeq2D = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_char) * MAX_LEN * 2, 0, &err);
	CHKERR(err, "Create outSeq2D memory");

	//allocate thread number per launch and 
	//location difference information
	int *threadNum, *diffPos;
	threadNum = new int[2 * MAX_LEN];
	diffPos = new int[2 * MAX_LEN];
	if (threadNum == NULL ||
			diffPos == NULL)
	{
		printf("Allocate location buffer on host error!\n");
		return 1;
	}

	cl_mem threadNumD, diffPosD;
	threadNumD = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int) * (2 * MAX_LEN), 0, &err);
	CHKERR(err, "Create threadNumD memory");
	diffPosD = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int) * (2 * MAX_LEN), 0, &err);
	CHKERR(err, "Create diffPosD memory");

	//allocate matrix buffer
	char *pathFlag, *extFlag; 
	float *nGapDist, *hGapDist, *vGapDist;
	int maxElemNum = (MAX_LEN + 1) * (MAX_LEN + 1);
	pathFlag  = new char[maxElemNum];
	extFlag   = new char[maxElemNum];
	nGapDist = new float[maxElemNum];
	hGapDist = new float[maxElemNum];
	vGapDist = new float[maxElemNum];
	if (pathFlag  == NULL ||
			extFlag   == NULL ||
			nGapDist == NULL ||
			hGapDist == NULL ||
			vGapDist == NULL)
	{
		printf("Allocate DP matrices on host error!\n");
		return 1;
	}

	cl_mem pathFlagD, extFlagD,	nGapDistD, hGapDistD, vGapDistD;
	pathFlagD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_char) * maxElemNum, 0, &err);
	CHKERR(err, "Create pathFlagD memory");
	extFlagD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_char) * maxElemNum, 0, &err);
	CHKERR(err, "Create extFlagD memory");
	nGapDistD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * maxElemNum, 0, &err);
	CHKERR(err, "Create nGapDistD memory");
	hGapDistD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * maxElemNum, 0, &err);
	CHKERR(err, "Create hGapDistD memory");
	vGapDistD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * maxElemNum, 0, &err);
	CHKERR(err, "Create vGapDistD memory");

	//Allocate the MAX INFO structure
	MAX_INFO *maxInfo;
	maxInfo = new MAX_INFO[1];
	if (maxInfo == NULL)
	{
		printf("Alloate maxInfo on host error!\n");
		return 1;
	}

	cl_mem maxInfoD;
	maxInfoD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(MAX_INFO) * mfThreadNum, 0, &err);
	CHKERR(err, "Create maxInfoD memory");

	//allocate the distance table
	cl_mem blosum62D;
	int nblosumHeight = 23;
	blosum62D = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float) * nblosumWidth * nblosumHeight, 0, &err);
	err = clEnqueueWriteBuffer(commands, blosum62D, CL_TRUE, 0,
			nblosumWidth * nblosumHeight * sizeof(cl_float), blosum62[0], 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Scoring Matrix Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "copy blosum62 to device");
	cl_mem mutexMem;
	mutexMem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), 0, &err);
	CHKERR(err, "create mutex mem error!");

	//copy the scoring matrix to the constant memory
	//copyScoringMatrixToConstant();

	//open the database
	pDBDataFile = fopen(dbDataFilePathName, "rb");
	if (pDBDataFile == NULL)
	{
		printf("DB data file %s open error!\n", dbDataFilePathName);
		return 1;
	}

	pDBLenFile = fopen(dbLenFilePathName, "rb");
	if (pDBLenFile == NULL)
	{
		printf("DB length file %s open error!\n", dbLenFilePathName);
		return 1;
	}

	//record time
	timerEnd();
	strTime.iniTime = elapsedTime();

	//read the total number of sequences
	fread(&subSequenceNum, sizeof(int), 1, pDBLenFile);

	//get the larger and smaller of the row and colum number
	int subSequenceNo, launchNum, launchNo;
	int rowNum, columnNum, matrixIniNum;
	int DPMatrixSize;
	int seq1Pos, seq2Pos, nOffset, startPos;

	for (subSequenceNo = 0; subSequenceNo < subSequenceNum; subSequenceNo++)
	{
		//record time
		timerStart();

		//read subject sequence
		fread(&subSequenceSize, sizeof(int), 1, pDBLenFile);
		if (subSequenceSize <= 0 || subSequenceSize > MAX_LEN)
		{
			printf("Size %d of bubject sequence %d is out of range!\n",
					subSequenceSize,
					subSequenceNo);
			break;
		}
		fread(subSequence, sizeof(char), subSequenceSize, pDBDataFile);

		gettimeofday(&t1, NULL);
		if (subSequenceSize > querySize)
		{
			seq1 = subSequence;
			seq2 = querySequence;
			rowNum = subSequenceSize + 1;
			columnNum = querySize + 1;
		}
		else
		{
			seq1 = querySequence;
			seq2 = subSequence;
			rowNum = querySize + 1;
			columnNum = subSequenceSize + 1;
		}

		launchNum = rowNum + columnNum - 1;

		//preprocessing for sequences
		DPMatrixSize = preProcessing(rowNum,
				columnNum,
				threadNum,
				diffPos,
				matrixIniNum);

		//record time
		timerEnd();
		strTime.preprocessingTime += elapsedTime();

		//record time
		timerStart();

		//use a kernel to initialize the matrix
		arraySize = DPMatrixSize * sizeof(char);
		setZeroThreadNum = ((arraySize - 1) / blockSize + 1) * blockSize;
		err  = clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&pathFlagD);
		err |= clSetKernelArg(hSetZeroKernel, 1, sizeof(int), (void *)&arraySize);
		err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum,
				&blockSize, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT DP Matrix Init", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		err |= clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&extFlagD);
		err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum,
				&blockSize, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT DP Matrix Init", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "Initialize flag matrice");

		arraySize = matrixIniNum * sizeof(float);
		setZeroThreadNum = ((arraySize - 1) / blockSize + 1) * blockSize;
		err  = clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&nGapDistD);
		err |= clSetKernelArg(hSetZeroKernel, 1, sizeof(int), (void *)&arraySize);
		err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum,
				&blockSize, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Distance Matrix Init", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		err |= clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&hGapDistD);
		err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum,
				&blockSize, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Distance Matrix Init", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		err |= clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&vGapDistD);
		err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum,
				&blockSize, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Distance Matrix Init", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "Initialize dist matrice");

		arraySize = sizeof(MAX_INFO) * mfThreadNum;
		setZeroThreadNum = ((arraySize - 1) / blockSize + 1) * blockSize;
		err  = clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&maxInfoD);
		err |= clSetKernelArg(hSetZeroKernel, 1, sizeof(int), (void *)&arraySize);
		err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum,
				&blockSize, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Max Info Matrix Init", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "Initialize max info");

		arraySize = sizeof(int);
		setZeroThreadNum = ((arraySize - 1) / blockSize + 1) * blockSize;
		err  = clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&mutexMem);
		err |= clSetKernelArg(hSetZeroKernel, 1, sizeof(int), (void *)&arraySize);
		err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum,
				&blockSize, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Mutex Init", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "Initialize mutex variable");

		//copy input sequences to device
		err  = clEnqueueWriteBuffer(commands, seq1D, CL_FALSE, 0, (rowNum - 1) * sizeof(cl_char), seq1, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Sequence Copy", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		err |= clEnqueueWriteBuffer(commands, seq2D, CL_FALSE, 0, (columnNum - 1) * sizeof(cl_char), seq2, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Sequence Copy", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "copy input sequence");

		err  = clEnqueueWriteBuffer(commands, diffPosD, CL_FALSE, 0, launchNum * sizeof(cl_int), diffPos, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Mutex Info Copy", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		err |= clEnqueueWriteBuffer(commands, threadNumD, CL_FALSE, 0, launchNum * sizeof(cl_int), threadNum, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Mutex Info Copy", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "copy diffpos and/or threadNum mutexMem info error!");

		//record time
		timerEnd();
		strTime.copyTimeHostToDevice += elapsedTime();

		//record time
		timerStart();

		//set arguments
		err  = clSetKernelArg(hMatchStringKernel, 0, sizeof(cl_mem), (void *)&pathFlagD);
		err |= clSetKernelArg(hMatchStringKernel, 1, sizeof(cl_mem), (void *)&extFlagD);
		err |= clSetKernelArg(hMatchStringKernel, 2, sizeof(cl_mem), (void *)&nGapDistD);
		err |= clSetKernelArg(hMatchStringKernel, 3, sizeof(cl_mem), (void *)&hGapDistD);
		err |= clSetKernelArg(hMatchStringKernel, 4, sizeof(cl_mem), (void *)&vGapDistD);
		err |= clSetKernelArg(hMatchStringKernel, 5, sizeof(cl_mem), (void *)&diffPosD);
		err |= clSetKernelArg(hMatchStringKernel, 6, sizeof(cl_mem), (void *)&threadNumD);
		err |= clSetKernelArg(hMatchStringKernel, 7, sizeof(cl_int), (void *)&rowNum);
		err |= clSetKernelArg(hMatchStringKernel, 8, sizeof(cl_int), (void *)&columnNum);
		err |= clSetKernelArg(hMatchStringKernel, 9, sizeof(cl_mem), (void *)&seq1D);
		err |= clSetKernelArg(hMatchStringKernel, 10, sizeof(cl_mem), (void *)&seq2D);	
		err |= clSetKernelArg(hMatchStringKernel, 11, sizeof(cl_int), (void *)&nblosumWidth);
		err |= clSetKernelArg(hMatchStringKernel, 12, sizeof(cl_float), (void *)&openPenalty);
		err |= clSetKernelArg(hMatchStringKernel, 13, sizeof(cl_float), (void *)&extensionPenalty);
		err |= clSetKernelArg(hMatchStringKernel, 14, sizeof(cl_mem), (void *)&maxInfoD);
		err |= clSetKernelArg(hMatchStringKernel, 15, sizeof(cl_mem), (void *)&blosum62D);
		err |= clSetKernelArg(hMatchStringKernel, 16, sizeof(cl_mem), (void *)&mutexMem);
		//err |= clSetKernelArg(hMatchStringKernel, 17, maxLocalSize, NULL);
		CHKERR(err, "Set match string argument error!");

		err = clEnqueueNDRangeKernel(commands, hMatchStringKernel, 1, NULL, &mfThreadNum,
				&blockSize, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Kernels", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "Launch kernel match string error");

		//record time
		timerEnd();
		strTime.matrixFillingTime += elapsedTime();

		//record time
		timerStart();
		err  = clSetKernelArg(hTraceBackKernel, 0, sizeof(cl_mem), (void *)&pathFlagD);
		err |= clSetKernelArg(hTraceBackKernel, 1, sizeof(cl_mem), (void *)&extFlagD);
		err |= clSetKernelArg(hTraceBackKernel, 2, sizeof(cl_mem), (void *)&diffPosD);
		err |= clSetKernelArg(hTraceBackKernel, 3, sizeof(cl_mem), (void *)&seq1D);
		err |= clSetKernelArg(hTraceBackKernel, 4, sizeof(cl_mem), (void *)&seq2D);	
		err |= clSetKernelArg(hTraceBackKernel, 5, sizeof(cl_mem), (void *)&outSeq1D);
		err |= clSetKernelArg(hTraceBackKernel, 6, sizeof(cl_mem), (void *)&outSeq2D);	
		err |= clSetKernelArg(hTraceBackKernel, 7, sizeof(cl_mem), (void *)&maxInfoD);
		err |= clSetKernelArg(hTraceBackKernel, 8, sizeof(int), (void *)&mfThreadNum);

		size_t tbGlobalSize[1] = {1};
		size_t tbLocalSize[1]  = {1};
		err = clEnqueueNDRangeKernel(commands, hTraceBackKernel, 1, NULL, tbGlobalSize,
				tbLocalSize, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Kernels", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "Launch kernel trace back error");
		clFinish(commands);
		//record time
		timerEnd();
		strTime.traceBackTime += elapsedTime();

		//record time
		timerStart();
		//copy matrix score structure back
		err = clEnqueueReadBuffer(commands, maxInfoD, CL_FALSE, 0, sizeof(MAX_INFO),
				maxInfo, 0, 0, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "SWAT Max Info Copy", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "Read maxInfo buffer error!");

		int maxOutputLen = rowNum + columnNum - 2;
		err  = clEnqueueReadBuffer(commands, outSeq1D, CL_FALSE, 0, maxOutputLen * sizeof(cl_char),
				outSeq1, 0, 0, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "SWAT Sequence Copy", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		err = clEnqueueReadBuffer(commands, outSeq2D, CL_FALSE, 0, maxOutputLen * sizeof(cl_char),
					outSeq2, 0, 0, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "SWAT Sequence Copy", ocdTempTimer)
		END_TIMER(ocdTempTimer)
		CHKERR(err, "Read output sequence error!");
		//record time
		clFinish(commands);
		gettimeofday(&t2, NULL);
		timerEnd();
		strTime.copyTimeDeviceToHost += elapsedTime();

		//call the print function to print the match result
		printf("============================================================\n");
		printf("Sequence pair %d:\n", subSequenceNo);
		int nlength = maxInfo->noutputlen;
		PrintAlignment(outSeq1, outSeq2, nlength, CHAR_PER_LINE, openPenalty, extensionPenalty);
		printf("Max alignment score (on device) is %.1f\n", maxInfo->fmaxscore);
		//obtain max alignment score on host
		//err = clEnqueueReadBuffer(commands, nGapDistD, CL_TRUE, 0, sizeof(cl_float) * DPMatrixSize,
		//						  nGapDist, 0, 0, 0);
		//printf("Max alignment score (on host) is %.1f\n", maxScore(nGapDist, DPMatrixSize));

		printf("openPenalty = %.1f, extensionPenalty = %.1f\n", openPenalty, extensionPenalty);
		printf("Input sequence size, querySize: %d, subSequenceSize: %d\n", 
				querySize, subSequenceSize);

		printf("Max position, seq1 = %d, seq2 = %d\n", maxInfo->nposi, maxInfo->nposj);
	}
	tmpTime = 1000.0 * (t2.tv_sec - t1.tv_sec) + (t2.tv_usec - t1.tv_usec) / 1000.0;
	pfile = fopen("../kernelTime.txt", "at");
	fprintf(pfile, "verOpencl4:\t%.3f\n", tmpTime);
	fclose(pfile);

	//print time
	printTime_toStandardOutput();
	printTime_toFile();

	fclose(pDBLenFile);
	fclose(pDBDataFile);

	clReleaseKernel(hMatchStringKernel);
	clReleaseKernel(hTraceBackKernel);
	clReleaseKernel(hSetZeroKernel);

	delete allSequences;
	clReleaseMemObject(seq1D);
	clReleaseMemObject(seq2D);

	delete outSeq1;
	delete outSeq2;
	clReleaseMemObject(outSeq1D);
	clReleaseMemObject(outSeq2D);

	delete threadNum;
	clReleaseMemObject(threadNumD);
	delete diffPos;
	clReleaseMemObject(diffPosD);

	delete pathFlag;
	delete extFlag;
	delete nGapDist;
	delete hGapDist;
	delete vGapDist;
	clReleaseMemObject(pathFlagD);
	clReleaseMemObject(extFlagD);
	clReleaseMemObject(nGapDistD);
	clReleaseMemObject(hGapDistD);
	clReleaseMemObject(vGapDistD);

	delete maxInfo;
	clReleaseMemObject(maxInfoD);

	free(cSourceCL);

	clReleaseMemObject(blosum62D);
	clReleaseMemObject(mutexMem);

	clReleaseProgram(hProgram);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);
	ocd_finalize();
	return 0;
}
コード例 #21
0
KrecipesView::KrecipesView( QWidget *parent )
	: QWidget( parent ), m_actionshandler( 0, 0 )
{
	new KrecipesAdaptor(this );
	QDBusConnection::sessionBus().registerObject("/Krecipes", this);

#ifndef NDEBUG
	QTime dbg_total_timer; dbg_total_timer.start();
#endif
	// Init the setup wizard if necessary
	kDebug() << "Beginning wizard" ;
	wizard();
	kDebug() << "Wizard finished correctly" ;

	// Show Splash Screen

	KStartupLogo* start_logo = 0L;
	start_logo = new KStartupLogo();
	start_logo -> setHideEnabled( true );
	start_logo->show();
	start_logo->raise();

	// Initialize Database

	// Check if the database type is among those supported
	// and initialize the database in each case
	START_TIMER("Initializing database")
	initDatabase();
	END_TIMER()

	// Design the GUI
	QHBoxLayout *layout = new QHBoxLayout;
	setLayout( layout );
	splitter = new KHBox( this );
	layout->addWidget( splitter );
	// Create Left and Right Panels (splitter)

	leftPanelFrame = new QFrame( splitter );
	leftPanel = new KreMenu;
	QHBoxLayout *leftPanelFrameLayout = new QHBoxLayout;
	leftPanelFrame->setLayout( leftPanelFrameLayout );
	leftPanelFrameLayout->addWidget( leftPanel );
	leftPanelFrameLayout->setMargin( 0 );
	leftPanelFrame->setFrameStyle( QFrame::StyledPanel | QFrame::Raised );
	leftPanelFrame->setFrameRect( QRect( 0, 0, 0, 0 ) );
	rightPanel = new PanelDeco( splitter, i18n( "Find/Edit Recipes" ), "system-search" );

	// Design Left Panel

	START_TIMER("Setting up buttons")
	// Buttons

	button0 = new KreMenuButton( leftPanel, SelectP );
	button0->setIconSet( KIcon( "system-search" ) );
	buttonsList.append( button0 );

	button1 = new KreMenuButton( leftPanel, ShoppingP );
	button1->setIconSet( KIcon( "view-pim-tasks" ) );
	buttonsList.append( button1 );

	button7 = new KreMenuButton( leftPanel, DietP );
	button7->setIconSet( KIcon( "diet" ) );
	buttonsList.append( button7 );

	button8 = new KreMenuButton( leftPanel, MatcherP );
	button8->setIconSet( KIcon( "view-filter" ) );
	buttonsList.append( button8 );


	// Submenus
	dataMenu = leftPanel->createSubMenu( i18n( "Data..." ), "server-database" );
	
	recipeButton = new KreMenuButton( leftPanel, RecipeEdit );
	recipeButton->setIconSet( KIcon( "document-save" ) );
	buttonsList.append( recipeButton );
	recipeButton->setEnabled( false );
	recipeButton->hide();

	button2 = new KreMenuButton( leftPanel, IngredientsP, dataMenu );
	button2->setIconSet( KIcon( "ingredients" ) );
	buttonsList.append(button2);

	button3 = new KreMenuButton( leftPanel, PropertiesP, dataMenu );
	button3->setIconSet( KIcon( "properties" ) );
	buttonsList.append( button3 );

	button4 = new KreMenuButton( leftPanel, UnitsP, dataMenu );
	button4->setIconSet( KIcon( "units" ) );
	buttonsList.append( button4 );

	button9 = new KreMenuButton( leftPanel, PrepMethodsP, dataMenu );
	button9->setIconSet( KIcon( "methods" ) );
	buttonsList.append( button9 );

	button5 = new KreMenuButton( leftPanel, CategoriesP, dataMenu );
	button5->setIconSet( KIcon( "folder-yellow" ) );
	buttonsList.append( button5 );

	button6 = new KreMenuButton( leftPanel, AuthorsP, dataMenu );
	button6->setIconSet( KIcon( "authors" ) );
	buttonsList.append( button6 );	

	contextButton = new QPushButton( leftPanel );
	contextButton->setObjectName( "contextButton" );
	contextButton->setIcon( KIcon( "system-help" ) );
	contextButton->setGeometry( leftPanel->width() - 42, leftPanel->height() - 42, 32, 32 );

	QPalette p = palette();
	p.setColor(backgroundRole(), contextButton->palette().color(backgroundRole()).light( 140 ) );
	contextButton->setPalette(p);
	contextButton->setFlat( true );
	END_TIMER()

	KConfigGroup config(KGlobal::config(), "Performance" );
	int limit = config.readEntry( "CategoryLimit", -1 );
	database->updateCategoryCache(limit);

	// Right Panel Widgets
	START_TIMER("Creating input dialog")
	inputPanel = new RecipeInputDialog( rightPanel, database );
	rightPanel->addStackWidget( inputPanel );
	END_TIMER()

	START_TIMER("Creating recipe view")
	viewPanel = new RecipeViewDialog( rightPanel, database );
	rightPanel->addStackWidget( viewPanel );
	END_TIMER()

	START_TIMER("Creating recipe selection dialog")
	selectPanel = new SelectRecipeDialog( rightPanel, database );
	rightPanel->addStackWidget( selectPanel );

	END_TIMER()

	START_TIMER("Creating ingredients component")
	ingredientsPanel = new IngredientsDialog( rightPanel, database );
	rightPanel->addStackWidget( ingredientsPanel );
	END_TIMER()

	START_TIMER("Creating properties component")
	propertiesPanel = new PropertiesDialog( rightPanel, database );
	rightPanel->addStackWidget( propertiesPanel );

	END_TIMER()

	START_TIMER("Creating units component")
	unitsPanel = new UnitsDialog( rightPanel, database );
	rightPanel->addStackWidget( unitsPanel );

	END_TIMER()

	START_TIMER("Creating shopping list dialog")
	shoppingListPanel = new ShoppingListDialog( rightPanel, database );
	rightPanel->addStackWidget( shoppingListPanel );

	END_TIMER()

	START_TIMER("Creating diet wizard dialog")
	dietPanel = new DietWizardDialog( rightPanel, database );
	rightPanel->addStackWidget( dietPanel );

	END_TIMER()

	START_TIMER("Creating categories component")
	categoriesPanel = new CategoriesEditorDialog( rightPanel, database );
	rightPanel->addStackWidget( categoriesPanel );

	END_TIMER()

	START_TIMER("Creating authors component")
	authorsPanel = new AuthorsDialog( rightPanel, database );
	rightPanel->addStackWidget( authorsPanel );

	END_TIMER()

	START_TIMER("Creating prep methods component")
	prepMethodsPanel = new PrepMethodsDialog( rightPanel, database );
	rightPanel->addStackWidget( prepMethodsPanel );

	END_TIMER()

	START_TIMER("Creating ingredients matcher dialog")
	ingredientMatcherPanel = new IngredientMatcherDialog( rightPanel, database );
	rightPanel->addStackWidget( ingredientMatcherPanel );

	END_TIMER()

	database->clearCategoryCache();

	// Use to keep track of the panels
	panelMap.insert( inputPanel, RecipeEdit );
	panelMap.insert( viewPanel, RecipeView );
	panelMap.insert( selectPanel, SelectP );
	panelMap.insert( ingredientsPanel, IngredientsP );
	panelMap.insert( propertiesPanel, PropertiesP );
	panelMap.insert( unitsPanel, UnitsP );
	panelMap.insert( shoppingListPanel, ShoppingP );
	panelMap.insert( dietPanel, DietP );
	panelMap.insert( categoriesPanel, CategoriesP );
	panelMap.insert( authorsPanel, AuthorsP );
	panelMap.insert( prepMethodsPanel, PrepMethodsP );
	panelMap.insert( ingredientMatcherPanel, MatcherP );

	m_activePanel = SelectP;
	m_previousActivePanel = SelectP;
	slotSetPanel( SelectP );

	// i18n
	translate();


	// Connect Signals from Left Panel to slotSetPanel()
	connect( leftPanel, SIGNAL( clicked( KrePanel ) ), this, SLOT( slotSetPanel( KrePanel ) ) );

	connect( contextButton, SIGNAL( clicked() ), SLOT( activateContextHelp() ) );

	connect( leftPanel, SIGNAL( resized( int, int ) ), this, SLOT( resizeRightPane( int, int ) ) );


	// Retransmit signal to parent to Enable/Disable the Save Button
	connect ( inputPanel, SIGNAL( enableSaveOption( bool ) ), this, SLOT( enableSaveOptionSlot( bool ) ) );

	// Create a new button when a recipe is unsaved
	connect ( inputPanel, SIGNAL( createButton( QWidget*, const QString & ) ), this, SLOT( addRecipeButton( QWidget*, const QString & ) ) );

	// Connect Signals from selectPanel (SelectRecipeDialog)

	connect ( selectPanel, SIGNAL( recipeSelected( int, int ) ), this, SLOT( actionRecipe( int, int ) ) );
	connect ( selectPanel, SIGNAL( recipesSelected( const QList<int>&, int ) ), this, SLOT( actionRecipes( const QList<int>&, int ) ) );

	// Connect Signals from ingredientMatcherPanel (IngredientMatcherDialog)

	connect ( ingredientMatcherPanel, SIGNAL( recipeSelected( int, int ) ), SLOT( actionRecipe( int, int ) ) );

	// Close a recipe when requested (just switch panels)
	connect( inputPanel, SIGNAL( closeRecipe() ), this, SLOT( closeRecipe() ) );

	// Show a recipe when requested (just switch panels)
	connect( inputPanel, SIGNAL( showRecipe( int ) ), this, SLOT( showRecipe( int ) ) );

	// Close the recipe view when requested (just switch panels)
	connect( viewPanel, SIGNAL( closeRecipeView() ), this, SLOT( closeRecipe() ) );

	// Create a new shopping list when a new diet is generated and accepted
	connect( dietPanel, SIGNAL( dietReady() ), this, SLOT( createShoppingListFromDiet() ) );

	// Place the Tip Button in correct position when the left pane is resized
	connect( leftPanel, SIGNAL( resized( int, int ) ), this, SLOT( moveTipButton( int, int ) ) );

	connect( rightPanel, SIGNAL( panelRaised( QWidget*, QWidget* ) ), SLOT( panelRaised( QWidget*, QWidget* ) ) );

	connect( selectPanel, SIGNAL( recipeSelected(bool) ), SIGNAL( recipeSelected(bool) ) );
	
	connect( ingredientMatcherPanel, SIGNAL( recipeSelected(bool) ), SIGNAL( recipeSelected(bool) ) );

	// Close Splash Screen
	delete start_logo;

#ifndef NDEBUG
	kDebug()<<"Total time elapsed: "<<dbg_total_timer.elapsed()/1000<<" sec";
#endif
}
コード例 #22
0
ファイル: variational.cpp プロジェクト: abhimanu/CVPETpgm
int main(int argc, char *argv[]){
  boost_po::options_description options("Allowed options");

  std::string trainfname;
  std::string testfname;
  int get_train_err = 0;
  
  options.add_options()
    ("train", boost_po::value<std::string>(&trainfname)->required(), "train")
    ("test", boost_po::value<std::string>(&testfname)->required(), "test")
    ("terr", boost_po::value<int>(&get_train_err), "get train err");
  boost_po::variables_map options_map;
  boost_po::store(boost_po::parse_command_line(argc, argv, options), options_map);
  boost_po::notify(options_map);

  arma::mat X;
  arma::icolvec Y;
  
  clock_t t;
  std::cout << "start loading " << trainfname << std::endl;
  START_TIMER(t);
  X.load(trainfname, arma::csv_ascii);
  END_TIMER(t, t);

  std::cout << "data matrix loaded, size: " << X.n_rows << ", " << X.n_cols << " time = " << TO_SEC(t) << " sec" << std::endl;

  Y = arma::conv_to<arma::icolvec>::from(X.col(X.n_cols - 1));
  X.shed_col(X.n_cols - 1);

  convert_binary(Y);

  const int32_t num_samples = X.n_rows;
  const int32_t num_features = X.n_cols;

  arma::colvec mu(num_features); //gaussian mean
  arma::mat sigma(num_features, num_features); //gaussian covariance  
  double xi = 2;
  double delta = 1;

  //initialize gaussian parameters
  mu.zeros();
  sigma = sigma.eye()*COV_FAC;

  std::cout << "start inverting sigma" << std::endl;
  
  START_TIMER(t);
  arma::mat sigma_inv = sigma.i();
  END_TIMER(t, t);
  std::cout << "finished inverting, time = " << TO_SEC(t) << " secs" << std::endl;

  clock_t ts;
  std::cout << "starts training" << std::endl;
  START_TIMER(ts);
  
  int iter_cnt = 0;
  while(std::abs(delta) > CONVERGE_THRE){
    
    arma::mat post_sigma_inv;
    arma::mat post_sigma;
    arma::colvec post_mu;
    double xi_sqr;
    double post_xi;
    int32_t i;

    for(i = 0; i < num_samples; i++){
      std::cout << "sample = i = " << i << std::endl;
      arma::colvec x = X.row(i).t();
      std::cout << "x.print()" << std::endl;
      x.print();

      post_sigma_inv = get_post_sigma_inv(xi, x, sigma_inv);
  
      post_sigma = post_sigma_inv.i();
      
      post_mu = get_post_mu(x, sigma_inv, post_sigma, mu, Y(i));
      
      xi_sqr = get_xi_sqr(x, post_sigma, post_mu);
      post_xi = sqrt(xi_sqr);
      
      sigma = post_sigma;
      sigma_inv = post_sigma_inv;
      mu = post_mu;

      delta = post_xi - xi;
      xi = post_xi;
      END_TIMER(t, ts);
      std::cout << "xi = " << xi << " delta = " << delta << " time = " << TO_SEC(t) << " sec" << std::endl;
    }
    ++iter_cnt;
    std::cout << "iteration " << iter_cnt << " done. xi = " << xi << " delta = " << delta << " std::abs(delta) = " << std::abs(delta) << " time = " << TO_SEC(t) << " sec" << std::endl;
  }
  
  END_TIMER(t, ts);
  std::cout << "time = " << TO_SEC(t) << " sec" << std::endl;
  std::cout << " training done" << std::endl;
  std::cout << "xi = " << xi << std::endl;

  arma::mat X_test;
  arma::icolvec Y_test;

  std::cout << "start loading test matrix " << testfname << std::endl;
  START_TIMER(t);
  X_test.load(testfname, arma::csv_ascii);
  END_TIMER(t, t);

  std::cout << "test data matrix loaded, size: " << X_test.n_rows << ", " << X_test.n_cols << " time = " << TO_SEC(t) << " sec" << std::endl;

  Y_test = arma::conv_to<arma::icolvec>::from(X_test.col(X_test.n_cols - 1));
  X_test.shed_col(X_test.n_cols - 1);

  double test_err = classification_error(xi, sigma, sigma_inv, mu, X_test, Y_test);
  std::cout << "test error = " << test_err << std::endl;

  if(get_train_err){
    double train_err = classification_error(xi, sigma, sigma_inv, mu, X, Y);
    std::cout << "train error = " << train_err << std::endl;
  }

  sigma.save(trainfname + ".sigma.out", arma::csv_ascii);
  mu.save(trainfname + ".mu.out", arma::csv_ascii);
  X.save(trainfname + ".out", arma::csv_ascii);
  
}
コード例 #23
0
ファイル: dietwizarddialog.cpp プロジェクト: KDE/krecipes
void DietWizardDialog::createDiet( void )
{
	KApplication::setOverrideCursor( Qt::WaitCursor );

	START_TIMER("Creating the diet");

	RecipeList rlist;
	dietRList->clear();

	// Get the whole list of recipes, detailed
	int flags = RecipeDB::Title | getNecessaryFlags();
	database->loadRecipes( &rlist, flags );

	// temporal iterator list so elements can be removed without reloading them again from the DB
	// this list prevents the same meal from showing up in the same day twice
	Q3ValueList <RecipeList::Iterator> tempRList;

	bool alert = false;

	for ( int day = 0;day < dayNumber;day++ )  // Create the diet for the number of days defined by the user
	{
		populateIteratorList( rlist, &tempRList ); // temporal iterator list so elements can be removed without reloading them again from the DB
		for ( int meal = 0;meal < mealNumber;meal++ )
		{
			int dishNo = ( ( MealInput* ) ( mealTabs->widget( meal ) ) ) ->dishNo();

			for ( int dish = 0;dish < dishNo;dish++ ) {
				bool found = false;
				Q3ValueList <RecipeList::Iterator> tempDishRList = tempRList;
				while ( ( !found ) && !tempDishRList.empty() ) {
					int random_index = ( int ) ( ( float ) ( KRandom::random() ) / ( float ) RAND_MAX * tempDishRList.count() );
					Q3ValueList<RecipeList::Iterator>::Iterator iit = tempDishRList.at( random_index ); // note that at() retrieves an iterator to the iterator list, so we need to use * in order to get the RecipeList::Iterator

					RecipeList::Iterator rit = *iit;
					if ( found = ( ( ( !categoryFiltering( meal, dish ) ) || checkCategories( *rit, meal, dish ) ) && checkConstraints( *rit, meal, dish ) ) )  // Check that the recipe is inside the constraint limits and in the categories specified
					{
						dietRList->append( *rit ); // Add recipe to the diet list
						tempRList.remove( tempRList.find(*iit) ); //can't just remove()... the iterator isn't from this list (its an iterator from tempDishRList)
					}
					else {
						tempDishRList.remove( iit ); // Remove this analized recipe from teh list
					}
				}
				if ( !found )
					alert = true;
			}
		}
	}

	if ( alert ) {
		KApplication::restoreOverrideCursor();
		KMessageBox::sorry( this, i18nc( "@info", "Given the constraints, a full diet list could not be constructed. Either the recipe list is too short or the constraints are too demanding. " ) );
	}

	else // show the resulting diet
	{

		// make a list of dishnumbers
		QList<int> dishNumbers;

		for ( int meal = 0;meal < mealNumber;meal++ ) {
			int dishNo = ( ( MealInput* ) ( mealTabs->widget( meal ) ) ) ->dishNo();
			dishNumbers << dishNo;
		}

		KApplication::restoreOverrideCursor();

		// display the list
		QPointer<DietViewDialog> dietDisplay = new DietViewDialog( this, *dietRList, dayNumber, mealNumber, dishNumbers );
		connect( dietDisplay, SIGNAL( signalOk() ), this, SLOT( createShoppingList() ) );
		dietDisplay->exec();
		delete dietDisplay;
	}

	END_TIMER();
}
コード例 #24
0
ファイル: kmeans_opencl.cpp プロジェクト: daz88/OpenDwarfs
int	// delta -- had problems when return value was of float type
kmeansCuda(float  **feature,				/* in: [npoints][nfeatures] */
           int      nfeatures,				/* number of attributes for each point */
           int      npoints,				/* number of data points */
           int      nclusters,				/* number of clusters */
           int     *membership,				/* which cluster the point belongs to */
		   float  **clusters,				/* coordinates of cluster centers */
		   int     *new_centers_len,		/* number of elements in each cluster */
           float  **new_centers				/* sum of elements in each cluster */
		   )
{
    cl_int errcode;

	int delta = 0;			/* if point has moved */
	int i,j;				/* counters */


	/* copy membership (host to device) */
    	 
	errcode = clEnqueueWriteBuffer(clCommands, membership_d, CL_TRUE, 0, npoints*sizeof(int), (void *) membership_new, 0, NULL, &ocdTempEvent);
        clFinish(clCommands);
    	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Membership Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHECKERR(errcode);

	/* copy clusters (host to device) */
    	 
	errcode = clEnqueueWriteBuffer(clCommands, clusters_d, CL_TRUE, 0, nclusters*nfeatures*sizeof(float), (void *) clusters[0], 0, NULL, &ocdTempEvent);
        clFinish(clCommands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Cluster Copy", ocdTempTimer)
        END_TIMER(ocdTempTimer)
	CHECKERR(errcode);

	/* set up texture */
    /*cudaChannelFormatDesc chDesc0 = cudaCreateChannelDesc<float>();
    t_features.filterMode = cudaFilterModePoint;   
    t_features.normalized = false;
    t_features.channelDesc = chDesc0;

	if(cudaBindTexture(NULL, &t_features, feature_d, &chDesc0, npoints*nfeatures*sizeof(float)) != CUDA_SUCCESS)
        printf("Couldn't bind features array to texture!\n");

	cudaChannelFormatDesc chDesc1 = cudaCreateChannelDesc<float>();
    t_features_flipped.filterMode = cudaFilterModePoint;   
    t_features_flipped.normalized = false;
    t_features_flipped.channelDesc = chDesc1;

	if(cudaBindTexture(NULL, &t_features_flipped, feature_flipped_d, &chDesc1, npoints*nfeatures*sizeof(float)) != CUDA_SUCCESS)
        printf("Couldn't bind features_flipped array to texture!\n");

	cudaChannelFormatDesc chDesc2 = cudaCreateChannelDesc<float>();
    t_clusters.filterMode = cudaFilterModePoint;   
    t_clusters.normalized = false;
    t_clusters.channelDesc = chDesc2;

	if(cudaBindTexture(NULL, &t_clusters, clusters_d, &chDesc2, nclusters*nfeatures*sizeof(float)) != CUDA_SUCCESS)
        printf("Couldn't bind clusters array to texture!\n");*/

	/* copy clusters to constant memory */
	//cudaMemcpyToSymbol("c_clusters",clusters[0],nclusters*nfeatures*sizeof(float),0,cudaMemcpyHostToDevice);


    /* setup execution parameters.
	   changed to 2d (source code on NVIDIA CUDA Programming Guide) */
    size_t localWorkSize[2] = {num_threads_perdim*num_threads_perdim, 1};
    size_t globalWorkSize[2] = {num_blocks_perdim*localWorkSize[0], num_blocks_perdim*localWorkSize[1]};

    unsigned int arg = 0;
    errcode = clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(cl_mem), (void *) &feature_d);
    errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(cl_mem), (void *) &feature_flipped_d);
    errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(int), (void *) &nfeatures);
    errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(int), (void *) &npoints);
    errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(int), (void *) &nclusters);
    errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(cl_mem), (void *) &membership_d);
    errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(cl_mem), (void *) &clusters_d);
#ifdef BLOCK_DELTA_REDUCE
    errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(cl_mem), (void *) &block_clusters_d);
#endif
#ifdef BLOCK_CENTER_REDUCE
    errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(cl_mem), (void *) &block_deltas_d);
#endif
    CHECKERR(errcode);
    
	/* execute the kernel */
	 
    errcode = clEnqueueNDRangeKernel(clCommands, clKernel_kmeansPoint, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
	CHECKERR(errcode);
        errcode = clFinish(clCommands);
   	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Point Kernel", ocdTempTimer)

    END_TIMER(ocdTempTimer)
    CHECKERR(errcode);
	/* copy back membership (device to host) */
    	 
	errcode = clEnqueueReadBuffer(clCommands, membership_d, CL_TRUE, 0, npoints*sizeof(int), (void *) membership_new, 0, NULL, &ocdTempEvent);
        clFinish(clCommands);
    	START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "Membership Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHECKERR(errcode);

#ifdef BLOCK_CENTER_REDUCE
    /*** Copy back arrays of per block sums ***/
    float * block_clusters_h = (float *) malloc(
        num_blocks_perdim * num_blocks_perdim * 
        nclusters * nfeatures * sizeof(float));
         
    errcode = clEnqueueReadBuffer(clCommands, block_clusters_d, CL_TRUE, 0, num_blocks_perdim*num_blocks_perdim*nclusters*nfeatures*sizeof(float), (void *) block_clusters_h, 0, NULL, &ocdTempEvent);
	clFinish(clCommands)
		END_TIMER
		COUNT_D2H
	    CHECKERR(errcode);
#endif
#ifdef BLOCK_DELTA_REDUCE
    int * block_deltas_h = (int *) malloc(
        num_blocks_perdim * num_blocks_perdim * sizeof(int));
         
    errcode = clEnqueueReadBuffer(clCommands, block_deltas_d, CL_TRUE, 0, num_blocks_perdim*num_blocks_perdim*sizeof(int), (void *) block_deltas_h, 0, NULL, &ocdTempEvent);
	clFinish(clCommands)
    	END_TIMER
	COUNT_D2H
	CHECKERR(errcode);
#endif
    
	/* for each point, sum data points in each cluster
	   and see if membership has changed:
	     if so, increase delta and change old membership, and update new_centers;
	     otherwise, update new_centers */
	delta = 0;
	for (i = 0; i < npoints; i++)
	{		
		int cluster_id = membership_new[i];
		new_centers_len[cluster_id]++;
		if (membership_new[i] != membership[i])
		{
#ifdef CPU_DELTA_REDUCE
			delta++;
#endif
			membership[i] = membership_new[i];
		}
#ifdef CPU_CENTER_REDUCE
		for (j = 0; j < nfeatures; j++)
		{			
			new_centers[cluster_id][j] += feature[i][j];
		}
#endif
	}
	

#ifdef BLOCK_DELTA_REDUCE	
    /*** calculate global sums from per block sums for delta and the new centers ***/    
	
	//debug
	//printf("\t \t reducing %d block sums to global sum \n",num_blocks_perdim * num_blocks_perdim);
    for(i = 0; i < num_blocks_perdim * num_blocks_perdim; i++) {
		//printf("block %d delta is %d \n",i,block_deltas_h[i]);
        delta += block_deltas_h[i];
    }
        
#endif
#ifdef BLOCK_CENTER_REDUCE	
	
	for(int j = 0; j < nclusters;j++) {
		for(int k = 0; k < nfeatures;k++) {
			block_new_centers[j*nfeatures + k] = 0.f;
		}
	}

    for(i = 0; i < num_blocks_perdim * num_blocks_perdim; i++) {
		for(int j = 0; j < nclusters;j++) {
			for(int k = 0; k < nfeatures;k++) {
				block_new_centers[j*nfeatures + k] += block_clusters_h[i * nclusters*nfeatures + j * nfeatures + k];
			}
		}
    }
	

#ifdef CPU_CENTER_REDUCE
	//debug
	/*for(int j = 0; j < nclusters;j++) {
		for(int k = 0; k < nfeatures;k++) {
			if(new_centers[j][k] >	1.001 * block_new_centers[j*nfeatures + k] || new_centers[j][k] <	0.999 * block_new_centers[j*nfeatures + k]) {
				printf("\t \t for %d:%d, normal value is %e and gpu reduced value id %e \n",j,k,new_centers[j][k],block_new_centers[j*nfeatures + k]);
			}
		}
	}*/
#endif

#ifdef BLOCK_CENTER_REDUCE
	for(int j = 0; j < nclusters;j++) {
		for(int k = 0; k < nfeatures;k++)
			new_centers[j][k]= block_new_centers[j*nfeatures + k];		
	}
#endif

#endif

	return delta;
	
}
コード例 #25
0
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runTest( int argc, char** argv)
{
	ocd_options opts = ocd_get_options();
	platform_id = opts.platform_id;
	n_device = opts.device_id;

	if ( argc != 8)
	{
		printf("Usage: GpuTemporalDataMining [<platform> <device> --] <file path> <temporal constraint path> <threads> <support> <(a)bsolute or (r)atio> <(s)tatic | (d)ynamic> <(m)ap and merge | (n)aive | (o)hybrid> \n");
		return;
	}

//    CUT_DEVICE_INIT();
    initGpu();

    getDeviceVariables(device_id);


	printf("Dataset, Support Threshold, PTPE or MapMerge, A1 or A1+A2, Level, Episodes (N), Episodes Culled (X), A1 Counting Time, A2 Counting Time, Generation Time, Total Counting Time\n");

    //CUT_SAFE_CALL( cutCreateTimer( &timer));
    //CUT_SAFE_CALL( cutCreateTimer( &generating_timer));
    //CUT_SAFE_CALL( cutCreateTimer( &a1_counting_timer));
    //CUT_SAFE_CALL( cutCreateTimer( &a2_counting_timer));
    //CUT_SAFE_CALL( cutCreateTimer( &total_timer));

    //CUT_SAFE_CALL( cutStartTimer( total_timer));
    //CUT_SAFE_CALL( cutStartTimer( timer));
    //CUT_SAFE_CALL( cutStartTimer( generating_timer));
    //CUT_SAFE_CALL( cutStartTimer( a1_counting_timer));
    //CUT_SAFE_CALL( cutStartTimer( a2_counting_timer));

    unsigned int num_threads = atoi(argv[3]);
    // allocate host memory
    //initEpisodeCandidates();
	if ( loadData( argv[1] ) != 0 )
		return;
	if ( loadTemporalConstraints(argv[2]) != 0 )
		return;

	// Check whether value supplied is absolute or ratio support
	supportType = *(argv[5]) == 'a' ? ABSOLUTE : RATIO;
	memoryModel = *(argv[6]) == 's' ? STATIC : DYNAMIC;

	switch (*(argv[7]))
	{
	case 'm':
		algorithmType = MAP_AND_MERGE;
		break;
	case 'n':
		algorithmType = NAIVE;
		break;
	case 'o':
		algorithmType = OPTIMAL;
		break;
	}

	support = atof(argv[4]);

	dumpFile = fopen( "episode.txt", "w" );

	//printf("Initializing GPU Data...\n");

	setupGpu();

	// setup execution parameters
    size_t grid[3];
    size_t threads[3];

	//printf("Event stream size: %i\n", eventSize);

	// BEGIN LOOP
	for ( int level = 1; level <= eventSize; level++ )
	{
		printf("Generating episode candidates for level %i...\n", level);
//		CUT_SAFE_CALL( cutResetTimer( total_timer));
//		CUT_SAFE_CALL( cutStartTimer( total_timer));

		//CUDA_SAFE_CALL( cudaUnbindTexture( candidateTex ) );
		if(level != 1){
			unbindTexture(&candidateTex, d_episodeCandidates, numCandidates * (level-1) * sizeof(UBYTE) );
		//CUDA_SAFE_CALL( cudaUnbindTexture( intervalTex ) );
		unbindTexture(&intervalTex, d_episodeIntervals, numCandidates * (level-2) * 2 * sizeof(float));
        }

//		CUT_SAFE_CALL( cutResetTimer( generating_timer));
//		CUT_SAFE_CALL( cutStartTimer( generating_timer));

//		int test1, test = numCandidates;
//		generateEpisodeCandidatesCPU( level );
//		test1 = numCandidates;
//		numCandidates = test;
        printf("Generating Episodes\n");
#ifdef CPU_EPISODE_GENERATION
		generateEpisodeCandidatesCPU( level );
#else
		generateEpisodeCandidatesGPU( level, num_threads );
#endif

//		CUT_SAFE_CALL( cutStopTimer( generating_timer));
		//printf( "\tGenerating time: %f (ms)\n", cutGetTimerValue( generating_timer));


		if ( numCandidates == 0 )
			break;
        printf("Writing to buffer\n");
		// Copy candidates to GPU
#ifdef CPU_EPISODE_GENERATION
		clEnqueueWriteBuffer(commands, d_episodeCandidates, CL_TRUE, 0, numCandidates * level * sizeof(UBYTE), h_episodeCandidates, 0, NULL, &ocdTempEvent);
                START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer)
                END_TIMER(ocdTempTimer)
		clEnqueueWriteBuffer(commands, d_episodeIntervals, CL_TRUE, 0, numCandidates * (level-1) * 2 * sizeof(float), h_episodeIntervals, 0, NULL, &ocdTempEvent);
                clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer)
                END_TIMER(ocdTempTimer)
#endif

        bindTexture( 0, &candidateTex, d_episodeCandidates, numCandidates * level * sizeof(UBYTE), CL_UNSIGNED_INT8);

		bindTexture( 0, &intervalTex, d_episodeIntervals, numCandidates * (level-1) * 2 * sizeof(float), CL_FLOAT );

		//printf("Executing kernel on %i candidates...\n", numCandidates, level);

		// execute the kernel
		calculateGrid(grid, num_threads, numCandidates);
		calculateBlock(threads, num_threads, numCandidates);

		int sections;
		unsigned int shared_mem_needed;
		//CUT_SAFE_CALL( cutStartTimer( counting_timer));

		int aType = algorithmType;
		if ( algorithmType == OPTIMAL )
			aType = chooseAlgorithmType( level, numCandidates, num_threads );

		if ( memoryModel == DYNAMIC )
		{
			if ( aType == NAIVE )
			{
				shared_mem_needed = MaxListSize*level*threads[0]*sizeof(float);
                printf("Shared memory needed %d\n", shared_mem_needed);
				//CUT_SAFE_CALL( cutResetTimer( a1_counting_timer));
				//CUT_SAFE_CALL( cutStartTimer( a1_counting_timer));
				countCandidates(grid, threads, d_episodeSupport, eventSize, level, supportType, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );

			}
			else
			{
                printf("DYNAMIC MAP MERGE\n");
				calculateLevelParameters(level, threads, grid, sections);
				shared_mem_needed = 16000;
                printf("numCandidates=%d\n", numCandidates);
				//CUT_SAFE_CALL( cutResetTimer( a1_counting_timer));
				//CUT_SAFE_CALL( cutStartTimer( a1_counting_timer));
				countCandidatesMapMerge(grid, threads, d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates,
                    candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );
				//countCandidatesMapMergeStatic<<< grid, threads, shared_mem_needed >>>( d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates );
			}
		}
		else
		{
			if ( aType == NAIVE )
			{
				shared_mem_needed = level*threads[0]*sizeof(float);
			}
			else
			{
				calculateLevelParameters(level, threads, grid, sections);
				shared_mem_needed = 16000;
			}

				//CUT_SAFE_CALL( cutResetTimer( a2_counting_timer));
				//CUT_SAFE_CALL( cutStartTimer( a2_counting_timer));
			if ( aType == NAIVE )
                countCandidatesStatic(grid, threads, d_episodeSupport, eventSize, level, supportType, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed  );
			else
                countCandidatesMapMergeStatic(grid, threads, d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates,
                    candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );
			clFinish(commands);

			//CUT_SAFE_CALL( cutStopTimer( a2_counting_timer));

            int err;
            err = clEnqueueReadBuffer(commands,d_episodeSupport, CL_TRUE, 0, numCandidates * sizeof(float), h_episodeSupport, 0, NULL, &ocdTempEvent);
            clFinish(commands);
            	START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "TDM Episode Copy", ocdTempTimer)
            END_TIMER(ocdTempTimer)
		CHKERR(err, "Unable to read buffer from device.");

			unbindTexture(&candidateTex, d_episodeCandidates, numCandidates * level * sizeof(UBYTE) );
			unbindTexture(&intervalTex, d_episodeIntervals, numCandidates * (level-1) * 2 * sizeof(float));

			// Remove undersupported episodes
			cullCandidates( level );

			if ( numCandidates == 0 )
				break;
			unsigned int mmthreads = num_threads;
			if ( MaxListSize*level*num_threads*sizeof(float) > 16384 )
			{
				if ( MaxListSize*level*96*sizeof(float) < 16384 )
					mmthreads = 96;
				else if ( MaxListSize*level*64*sizeof(float) < 16384)
					mmthreads = 64;
				else if ( MaxListSize*level*32*sizeof(float) < 16384)
					mmthreads = 32;
				printf("More shared memory needed for %d threads. Changed to %d threads.\n", num_threads, mmthreads );
			}

#ifdef CPU_EPISODE_GENERATION
            err = clEnqueueWriteBuffer(commands, d_episodeCandidates, CL_TRUE, 0, numCandidates * level * sizeof(UBYTE), h_episodeCandidates, 0, NULL, &ocdTempEvent);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer)
            END_TIMER(ocdTempTimer)
            CHKERR(err, "Unable to write buffer 1.");
            if(numCandidates * (level - 1) * 2 * sizeof(float) != 0)
            err = clEnqueueWriteBuffer(commands, d_episodeIntervals, CL_TRUE, 0, numCandidates * (level-1) * 2 * sizeof(float), h_episodeIntervals, 0, NULL, &ocdTempEvent);
        clFinish(commands);
            START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer)
        CHKERR(err, "Unable to write buffer 2.");
		END_TIMER(ocdTempTimer)
#endif
            bindTexture( 0, &candidateTex, d_episodeCandidates, numCandidates * level * sizeof(UBYTE), CL_UNSIGNED_INT8);
            bindTexture( 0, &intervalTex, d_episodeIntervals, numCandidates * (level-1) * 2 * sizeof(float), CL_FLOAT );

			if ( algorithmType == OPTIMAL )
				aType = chooseAlgorithmType( level, numCandidates, mmthreads );

			// Run (T1,T2] algorithm
			if ( aType == NAIVE )
			{
				shared_mem_needed = MaxListSize*level* mmthreads*sizeof(float);
				calculateGrid(grid, mmthreads, numCandidates );
				calculateBlock(threads, mmthreads, numCandidates );
			}
			else
			{
				calculateLevelParameters(level, threads, grid, sections);
				shared_mem_needed = 16000;
			}
			//CUT_SAFE_CALL( cutResetTimer( a1_counting_timer));
			//CUT_SAFE_CALL( cutStartTimer( a1_counting_timer));

			if ( aType == NAIVE )
                countCandidates(grid, threads, d_episodeSupport, eventSize, level, supportType, numCandidates,
                    candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );
			else
                countCandidatesMapMerge(grid, threads, d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates,
                    candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );

		}
        printf("Finishing\n");
		clFinish(commands);
		//CUT_SAFE_CALL( cutStopTimer( a1_counting_timer));
		//printf( "\tCounting time: %f (ms)\n", cutGetTimerValue( counting_timer));

		// check if kernel execution generated an error
		//CUT_CHECK_ERROR("Kernel execution failed");

		//printf("Copying result back to host...\n\n");

        int err = clEnqueueReadBuffer(commands, d_episodeSupport, CL_TRUE, 0, numCandidates * sizeof(float), h_episodeSupport, 0, NULL, &ocdTempEvent);
        clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "TDM Episode Copy", ocdTempTimer)
        END_TIMER(ocdTempTimer)
		CHKERR(err, "Unable to read memory 1.");
		err = clEnqueueReadBuffer(commands, d_episodeCandidates, CL_TRUE, 0, numCandidates * level * sizeof(UBYTE), h_episodeCandidates, 0, NULL, &ocdTempEvent);
                clFinish(commands);
                START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "TDM Episode Copy", ocdTempTimer)
                END_TIMER(ocdTempTimer)
		CHKERR(err, "Unable to read memory 2.");
		//CUDA_SAFE_CALL( cudaMemcpy( h_mapRecords, d_mapRecords, 3 * numSections * maxLevel * maxCandidates * sizeof(float), cudaMemcpyDeviceToHost ));
		saveResult(level);
		fflush(dumpFile);
	// END LOOP

		//CUT_SAFE_CALL( cutStopTimer( total_timer));

		// Print Statistics for this run
		printf("%s, %f, %s, %s, %d, %d, %d\n",
			argv[1],											// Dataset
			support,											// Support Threshold
			algorithmType == NAIVE ? "PTPE" : algorithmType == MAP_AND_MERGE ? "MapMerge" : "Episode-Based",		// PTPE or MapMerge or Episode-Based
			memoryModel == STATIC ? "A1+A2" : "A1",				// A1 or A1+A2
			level,												// Level
			numCandidates+episodesCulled,						// Episodes counted
			episodesCulled  									// Episodes removed by A2
	//		cutGetTimerValue( a1_counting_timer),				// Time for A1
//			memoryModel == STATIC ? cutGetTimerValue( a2_counting_timer) : 0.0f,				// Time for A2
		//	cutGetTimerValue( generating_timer),				// Episode generation time
		//	cutGetTimerValue( total_timer) );					// Time for total loop
            );
	}
	printf("Done!\n");

    cleanup();

    //CUT_SAFE_CALL( cutStopTimer( timer));
    //printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));
    //CUT_SAFE_CALL( cutDeleteTimer( timer));
    //CUT_SAFE_CALL( cutDeleteTimer( generating_timer));
    //CUT_SAFE_CALL( cutDeleteTimer( a1_counting_timer));
    //CUT_SAFE_CALL( cutDeleteTimer( a2_counting_timer));
    //CUT_SAFE_CALL( cutDeleteTimer( total_timer));

}
コード例 #26
0
ファイル: cdg.cpp プロジェクト: Grainspring/xoc
void CDG::build(IN OUT OPT_CTX & oc, DGRAPH & cfg)
{
	if (cfg.get_vertex_num() == 0) { return; }
	START_TIMER("CDG");
	IS_TRUE0(OPTC_is_cfg_valid(oc));
	m_ru->check_valid_and_recompute(&oc, OPT_PDOM, OPT_UNDEF);

	GRAPH pdom_tree;
	cfg.get_pdom_tree(pdom_tree);
	if (pdom_tree.get_vertex_num() == 0) { return; }

	SVECTOR<UINT> top_order;
	pdom_tree.sort_in_toplog_order(top_order, false);
	//dump_vec(top_order);

	BITSET_MGR bs_mgr;
	SVECTOR<BITSET*> cd_set;
	for (INT j = 0; j <= top_order.get_last_idx(); j++) {
		UINT ii = top_order.get(j);
		VERTEX * v = cfg.get_vertex(ii);
		IS_TRUE0(v != NULL);
		add_vertex(VERTEX_id(v));
		BITSET * cd_of_v = cd_set.get(VERTEX_id(v));
		if (cd_of_v == NULL) {
			cd_of_v = bs_mgr.create();
			cd_set.set(VERTEX_id(v), cd_of_v);
		}

		EDGE_C * in = VERTEX_in_list(v);
		while (in != NULL) {
			VERTEX * pred = EDGE_from(EC_edge(in));
			if (VERTEX_id(v) != ((DGRAPH&)cfg).get_ipdom(VERTEX_id(pred))) {
				cd_of_v->bunion(VERTEX_id(pred));
				//if (pred != v)
				{
					add_edge(VERTEX_id(pred), VERTEX_id(v));
				}
			}
			in = EC_next(in);
		}
		INT c;
		for (VERTEX * z = cfg.get_first_vertex(c);
			 z != NULL; z = cfg.get_next_vertex(c)) {
			if (((DGRAPH&)cfg).get_ipdom(VERTEX_id(z)) == VERTEX_id(v)) {
				BITSET * cd = cd_set.get(VERTEX_id(z));
				if (cd == NULL) {
					cd = bs_mgr.create();
					cd_set.set(VERTEX_id(z), cd);
				}
				for (INT i = cd->get_first(); i != -1; i = cd->get_next(i)) {
					if (VERTEX_id(v) != ((DGRAPH&)cfg).get_ipdom(i)) {
						cd_of_v->bunion(i);
						//if (i != (INT)VERTEX_id(v))
						{
							add_edge(i, VERTEX_id(v));
						}
					}
				}
			}
		}
	} //end for

	OPTC_is_cdg_valid(oc) = true;
	END_TIMER();
}
コード例 #27
0
ファイル: lud.c プロジェクト: nz-ocl/OpenDwarfs
int
main ( int argc, char *argv[] )
{
    int matrix_dim = 32; /* default matrix_dim */
    int opt, option_index=0;
    func_ret_t ret;
    const char *input_file = NULL;
    float *m, *mm;
    stopwatch sw;

    cl_device_id clDevice;
    cl_context clContext;
    cl_command_queue clCommands;
    cl_program clProgram;
    cl_kernel clKernel_diagonal;
    cl_kernel clKernel_perimeter;
    cl_kernel clKernel_internal;
    cl_int dev_type;

    cl_int errcode;

    FILE *kernelFile;
    char *kernelSource;
    size_t kernelLength;

    cl_mem d_m;

    ocd_init(&argc, &argv, NULL);
    ocd_options opts = ocd_get_options();
    platform_id = opts.platform_id;
    device_id = opts.device_id;


    while ((opt = getopt_long(argc, argv, "::vs:i:",
                              long_options, &option_index)) != -1 ) {
        switch(opt) {
        case 'i':
            input_file = optarg;
            break;
        case 'v':
            do_verify = 1;
            break;
        case 's':
            matrix_dim = atoi(optarg);
            fprintf(stderr, "Currently not supported, use -i instead\n");
            fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-p platform|-d device]\n", argv[0]);
            exit(EXIT_FAILURE);
        case '?':
            fprintf(stderr, "invalid option\n");
            break;
        case ':':
            fprintf(stderr, "missing argument\n");
            break;
        default:
            fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file||-p platform|-d device]\n",
                    argv[0]);
            exit(EXIT_FAILURE);
        }
    }

    if ( (optind < argc) || (optind == 1)) {
        fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-p platform|-d device]\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    if (input_file) {
        printf("Reading matrix from file %s\n", input_file);
        ret = create_matrix_from_file(&m, input_file, &matrix_dim);
        if (ret != RET_SUCCESS) {
            m = NULL;
            fprintf(stderr, "error create matrix from file %s\n", input_file);
            exit(EXIT_FAILURE);
        }
    } else {
        printf("No input file specified!\n");
        exit(EXIT_FAILURE);
    }

    if (do_verify) {
        printf("Before LUD\n");
        print_matrix(m, matrix_dim);
        matrix_duplicate(m, &mm, matrix_dim);
    }

//  errcode = clGetPlatformIDs(NUM_PLATFORM, clPlatform, NULL);
//  CHECKERR(errcode);
//
//  errcode = clGetDeviceIDs(clPlatform[PLATFORM_ID], USEGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
//  CHECKERR(errcode);
#ifdef USEGPU
    dev_type = CL_DEVICE_TYPE_GPU;
#elif defined(USE_AFPGA)
    dev_type = CL_DEVICE_TYPE_ACCELERATOR;
#else
    dev_type = CL_DEVICE_TYPE_CPU;
#endif


    clDevice = GetDevice(platform_id, device_id,dev_type);
    size_t max_worksize[3];
    errcode = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(size_t)*3, &max_worksize, NULL);
    CHECKERR(errcode);
    while(BLOCK_SIZE*BLOCK_SIZE>max_worksize[0])
        BLOCK_SIZE = BLOCK_SIZE/2;

    clContext = clCreateContext(NULL, 1, &clDevice, NULL, NULL, &errcode);
    CHECKERR(errcode);

    clCommands = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &errcode);
    CHECKERR(errcode);

    kernelFile = fopen("lud_kernel.cl", "r");
    fseek(kernelFile, 0, SEEK_END);
    kernelLength = (size_t) ftell(kernelFile);
    kernelSource = (char *) malloc(sizeof(char)*kernelLength);
    rewind(kernelFile);
    fread((void *) kernelSource, kernelLength, 1, kernelFile);
    fclose(kernelFile);

    clProgram = clCreateProgramWithSource(clContext, 1, (const char **) &kernelSource, &kernelLength, &errcode);
    CHECKERR(errcode);

    free(kernelSource);
    char arg[100];
    sprintf(arg,"-D BLOCK_SIZE=%d", (int)BLOCK_SIZE);
    errcode = clBuildProgram(clProgram, 1, &clDevice, arg, NULL, NULL);
    if (errcode == CL_BUILD_PROGRAM_FAILURE)
    {
        char *log;
        size_t logLength;
        errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength);
        log = (char *) malloc(sizeof(char)*logLength);
        errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, logLength, (void *) log, NULL);
        fprintf(stderr, "Kernel build error! Log:\n%s", log);
        free(log);
        return 0;
    }
    CHECKERR(errcode);

    clKernel_diagonal = clCreateKernel(clProgram, "lud_diagonal", &errcode);
    CHECKERR(errcode);
    clKernel_perimeter = clCreateKernel(clProgram, "lud_perimeter", &errcode);
    CHECKERR(errcode);
    clKernel_internal = clCreateKernel(clProgram, "lud_internal", &errcode);
    CHECKERR(errcode);

    d_m = clCreateBuffer(clContext, CL_MEM_READ_WRITE, matrix_dim*matrix_dim*sizeof(float), NULL, &errcode);
    CHECKERR(errcode);

    /* beginning of timing point */
    stopwatch_start(&sw);

    errcode = clEnqueueWriteBuffer(clCommands, d_m, CL_TRUE, 0, matrix_dim*matrix_dim*sizeof(float), (void *) m, 0, NULL, &ocdTempEvent);

    clFinish(clCommands);
    START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Matrix Copy", ocdTempTimer)
    END_TIMER(ocdTempTimer)
    CHECKERR(errcode);

    int i=0;
    size_t localWorkSize[2];
    size_t globalWorkSize[2];
    //printf("BLOCK_SIZE: %d\n",BLOCK_SIZE);
//	printf("max Work-item Size: %d\n",(int)max_worksize[0]);
#ifdef START_POWER
    for( int iter = 0; iter < 1000; iter++)
#endif
        for (i=0; i < matrix_dim-BLOCK_SIZE; i += BLOCK_SIZE) {
            errcode = clSetKernelArg(clKernel_diagonal, 0, sizeof(cl_mem), (void *) &d_m);
            errcode |= clSetKernelArg(clKernel_diagonal, 1, sizeof(int), (void *) &matrix_dim);
            errcode |= clSetKernelArg(clKernel_diagonal, 2, sizeof(int), (void *) &i);
            CHECKERR(errcode);

            localWorkSize[0] = BLOCK_SIZE;
            globalWorkSize[0] = BLOCK_SIZE;

            errcode = clEnqueueNDRangeKernel(clCommands, clKernel_diagonal, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
            clFinish(clCommands);
            START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Diagonal Kernels", ocdTempTimer)
            END_TIMER(ocdTempTimer)
            CHECKERR(errcode);
            errcode = clSetKernelArg(clKernel_perimeter, 0, sizeof(cl_mem), (void *) &d_m);
            errcode |= clSetKernelArg(clKernel_perimeter, 1, sizeof(int), (void *) &matrix_dim);
            errcode |= clSetKernelArg(clKernel_perimeter, 2, sizeof(int), (void *) &i);
            CHECKERR(errcode);
            localWorkSize[0] = BLOCK_SIZE*2;
            globalWorkSize[0] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[0];

            errcode = clEnqueueNDRangeKernel(clCommands, clKernel_perimeter, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
            clFinish(clCommands);
            START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Perimeter Kernel", ocdTempTimer)
            CHECKERR(errcode);
            END_TIMER(ocdTempTimer)
            errcode = clSetKernelArg(clKernel_internal, 0, sizeof(cl_mem), (void *) &d_m);
            errcode |= clSetKernelArg(clKernel_internal, 1, sizeof(int), (void *) &matrix_dim);
            errcode |= clSetKernelArg(clKernel_internal, 2, sizeof(int), (void *) &i);
            CHECKERR(errcode);
            localWorkSize[0] = BLOCK_SIZE;
            localWorkSize[1] = BLOCK_SIZE;
            globalWorkSize[0] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[0];
            globalWorkSize[1] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[1];

            errcode = clEnqueueNDRangeKernel(clCommands, clKernel_internal, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
            clFinish(clCommands);
            START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Internal Kernel", ocdTempTimer)
            END_TIMER(ocdTempTimer)
            CHECKERR(errcode);
        }
    errcode = clSetKernelArg(clKernel_diagonal, 0, sizeof(cl_mem), (void *) &d_m);
    errcode |= clSetKernelArg(clKernel_diagonal, 1, sizeof(int), (void *) &matrix_dim);
    errcode |= clSetKernelArg(clKernel_diagonal, 2, sizeof(int), (void *) &i);
    CHECKERR(errcode);
    localWorkSize[0] = BLOCK_SIZE;
    globalWorkSize[0] = BLOCK_SIZE;

    errcode = clEnqueueNDRangeKernel(clCommands, clKernel_diagonal, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
    clFinish(clCommands);
    START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Diagonal Kernels", ocdTempTimer)
    CHECKERR(errcode);

    END_TIMER(ocdTempTimer)

    errcode = clEnqueueReadBuffer(clCommands, d_m, CL_TRUE, 0, matrix_dim*matrix_dim*sizeof(float), (void *) m, 0, NULL, &ocdTempEvent);
    clFinish(clCommands);
    START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "Matrix copy", ocdTempTimer)
    END_TIMER(ocdTempTimer)
    /* end of timing point */
    stopwatch_stop(&sw);
    printf("Time consumed(ms): %lf\n", 1000*get_interval_by_sec(&sw));

    clReleaseMemObject(d_m);

    if (do_verify) {
        printf("After LUD\n");
        print_matrix(m, matrix_dim);
        printf(">>>Verify<<<<\n");
        printf("matrix_dim: %d\n",matrix_dim);
        lud_verify(mm, m, matrix_dim);
        free(mm);
    }

    clReleaseKernel(clKernel_diagonal);
    clReleaseKernel(clKernel_perimeter);
    clReleaseKernel(clKernel_internal);
    clReleaseProgram(clProgram);
    clReleaseCommandQueue(clCommands);
    clReleaseContext(clContext);

    free(m);
    ocd_finalize();
    return EXIT_SUCCESS;
}				/* ----------  end of function main  ---------- */
コード例 #28
0
ファイル: alphalocator.cpp プロジェクト: bonalex01/Primatte
        cv::Mat AlphaRayLocator::findAlphas(
                const BoundingPolyhedron* polyhedrons,
                const size_t polyhedronCount,
                const ia::InputAssembler& input) const
            {
                assert(polyhedronCount>1);
                START_TIMER(AlphaLocator);

                const cv::Mat& mat = input.mat();
                const unsigned r = input.mat().rows, c = input.mat().cols;
                const math::vec3 background = input.background();

                cv::Mat out;
                out.create(r, c, CV_32FC1);

                const SpherePolyhedron& outerPoly = polyhedrons[1];
                const SpherePolyhedron& innerPoly = polyhedrons[0];

                //For each point, send rays
                for (unsigned i = 0; i < r; ++i)
                {
                    float* data = (float*)(mat.data + mat.step*i);
                    float* dataOut = (float*)(out.data + out.step*i);
                    for(unsigned j = 0; j < c; ++j)
                    {
                        math::vec3& point = *((math::vec3*)(data + j*3));
                        float alpha;

                        //If right in the middle
                        if(background==point)
                            alpha = 0;
                        else
                        {
                            //Prepare vector
                            const math::vec3 vector = point - background;
                            const float vectorLen = vector.length();
                            const math::vec3 vectorNorm = vector/vectorLen;
                            const float distanceToPoint = point.distance(background);

                            const float distanceToOuterPoly = outerPoly.findDistanceToPolyhedron(vectorNorm);

                            //If inside outer poly, alpha < 1
                            if(distanceToPoint < distanceToOuterPoly)
                            {
                                float distanceToInnerPoly = innerPoly.findDistanceToPolyhedron(vectorNorm);

                                //If does not intersect with inner, fully inside
                                if(distanceToPoint < distanceToInnerPoly)
                                    alpha = 0;
                                else //interpolate between inner and outer
                                    alpha = (vectorLen - distanceToInnerPoly) /
                                            (distanceToOuterPoly - distanceToInnerPoly);

                            }
                            else //If intersects with middle, it's outside. Alpha = 1.
                                alpha = 1;
                        }

                        *(dataOut+j) = alpha;
                    }
                }

                END_TIMER(AlphaLocator);

                return out;
            }
コード例 #29
0
            void StableFitting::expand(BoundingPolyhedron& poly,
                                      const std::vector<math::vec3>& points, IColourSegmenter* segmenter,
                                      math::vec3 backgroundPoint, float startRadius, float endRadius) const
            {

                START_TIMER(Expanding);

                auto innerouter = segmenter->segment(points, backgroundPoint, startRadius);

                //Indicates whether a vertex was unable to move at least once due to outer points.
                std::vector<bool> didVertexEncounterResistance;
                didVertexEncounterResistance.resize(poly.mVertices.size(), false);

                //Position the polygon around the inner points.
                poly.positionAround(backgroundPoint, innerouter.inner);

                //Make a copy that is to be scaled.
                //We need a copy to deal with the case where no resistance is met.
                BoundingPolyhedron newPoly = poly;

                //The starting step is set to be half way between the start and end distance.
                float step = (endRadius-startRadius)/2.f;

                //Count number of points outside for later reference.
                int originalPointsOutside = innerouter.outer.size()-countPointsInside(innerouter.outer, newPoly);

                //Iterate...
                for(int iIteration = 0; iIteration < mNoOfIterations; ++iIteration)
                {
                    //Try moving each vertex outwards
                    for(size_t iVertex = 0; iVertex < newPoly.mVertices.size(); ++iVertex)
                    {
                        //Find movement vector
                        math::vec3 moveNormal = (newPoly.mVertices[iVertex]-newPoly.centre()).normalize();
                        const math::vec3 vec = moveNormal*step;

                        //Move
                        newPoly.mVertices[iVertex] += vec;

                        //Find the number of points now outside after movement.
                        int newPointsOutside = innerouter.outer.size()-countPointsInside(innerouter.outer, newPoly);

                        //If there are now less points outside, we have gone through something. Move back and mark resistance.
                        if(newPointsOutside < originalPointsOutside)
                        {
                            newPoly.mVertices[iVertex] -= vec;
                            didVertexEncounterResistance[iVertex] = true;
                        }
                    }

                    //halve the step and try again.
                    step *= 0.5f;
                }

                //The idea here is that if a vertex did not encounter any resistance while moving,
                //Move it by the maximin movement of the vertices that _did_ encounter resistance.

                //You might try replacing maximum with mode, etc. (mean gave really low deltas)

                //Find max movement of vertices that found resistance:
                float maxMovement = poly.mVertices[0].distance(newPoly.mVertices[0]);
                for(size_t i = 1; i < didVertexEncounterResistance.size(); ++i)
                        maxMovement = std::max(maxMovement,
                                               poly.mVertices[i].distance(newPoly.mVertices[i]));



                //If a vertex found resistance, keep it. Otherwise, restore it and move by the average.
                for(size_t i = 0; i < didVertexEncounterResistance.size(); ++i)
                    if(didVertexEncounterResistance[i])
                        poly.mVertices[i] = newPoly.mVertices[i];
                else
                        poly.mVertices[i] = poly.mVertices[i]+(newPoly.mVertices[i]-poly.mVertices[i]).normalize()*maxMovement;

                END_TIMER(Expanding);
            }