// main function
//*****************************************************************************
int main(int argc, const char **argv)
{
    cl_context cxGPUContext;                   // OpenCL context
    cl_command_queue cqCommandQue[MAX_GPU_COUNT];             // OpenCL command que
    cl_device_id* cdDevices;                   // OpenCL device list    
    cl_int err = 1;                     // Error code var

    shrSetLogFileName ("oclHiddenMarkovModel.txt");
    shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]); 

    shrLog(LOGBOTH, 0, "Create context\n");
    cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
    shrCheckErrorEX(err, CL_SUCCESS, NULL);

    shrLog(LOGBOTH, 0, "Get device info...\n");
    int nDevice = 0;
    size_t nDeviceBytes;
    err |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
    cdDevices = (cl_device_id*)malloc(nDeviceBytes);
    err |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, nDeviceBytes, cdDevices, NULL);
    nDevice = (int)(nDeviceBytes/sizeof(cl_device_id));
    
    shrLog(LOGBOTH, 0, "clCreateCommandQueue\n"); 
    int id_device;
    if(shrGetCmdLineArgumenti(argc, argv, "device", &id_device)) // Set up command queue(s) for GPU specified on the command line
    {
        // get & log device index # and name
        cl_device_id cdDevice = cdDevices[id_device];

        // create a command que
        cqCommandQue[0] = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &err);
        shrCheckErrorEX(err, CL_SUCCESS, NULL);
        oclPrintDevInfo(LOGBOTH, cdDevice);
        nDevice = 1;   
    } 
    else 
    { // create command queues for all available devices        
        for (int i = 0; i < nDevice; i++) 
        {
            cqCommandQue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], 0, &err);
            shrCheckErrorEX(err, CL_SUCCESS, NULL);
        }
        for (int i = 0; i < nDevice; i++) oclPrintDevInfo(LOGBOTH, cdDevices[i]);
    }

    shrLog(LOGBOTH, 0, "\nUsing %d GPU(s)...\n\n", nDevice);
	int wgSize;
	if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &wgSize)) 
	{
		wgSize = 256;
	}


    shrLog(LOGBOTH, 0, "Init Hidden Markov Model parameters\n");
    int nState = 256*16; // number of states
    int nEmit  = 128; // number of possible observations
    
    float *initProb = (float*)malloc(sizeof(float)*nState); // initial probability
    float *mtState  = (float*)malloc(sizeof(float)*nState*nState); // state transition matrix
    float *mtEmit   = (float*)malloc(sizeof(float)*nEmit*nState); // emission matrix
    initHMM(initProb, mtState, mtEmit, nState, nEmit);

    // define observational sequence
    int nObs = 500; // size of observational sequence
    int **obs = (int**)malloc(nDevice*sizeof(int*));
    int **viterbiPathCPU = (int**)malloc(nDevice*sizeof(int*));
    int **viterbiPathGPU = (int**)malloc(nDevice*sizeof(int*));
    float *viterbiProbCPU = (float*)malloc(nDevice*sizeof(float)); 
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        obs[iDevice] = (int*)malloc(sizeof(int)*nObs);
        for (int i = 0; i < nObs; i++)
            obs[iDevice][i] = i % 15;
        viterbiPathCPU[iDevice] = (int*)malloc(sizeof(int)*nObs);
        viterbiPathGPU[iDevice] = (int*)malloc(sizeof(int)*nObs);
    }

    shrLog(LOGBOTH, 0, "# of states = %d\n# of possible observations = %d \nSize of observational sequence = %d\n\n",
        nState, nEmit, nObs);


    shrLog(LOGBOTH, 0, "Compute Viterbi path on GPU\n\n");

    HMM **oHmm = (HMM**)malloc(nDevice*sizeof(HMM*));
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        oHmm[iDevice] = new HMM(cxGPUContext, cqCommandQue[iDevice], initProb, mtState, mtEmit, nState, nEmit, nObs, argv[0], wgSize);
    }

    cl_mem *vProb = (cl_mem*)malloc(sizeof(cl_mem)*nDevice);
    cl_mem *vPath = (cl_mem*)malloc(sizeof(cl_mem)*nDevice);
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        vProb[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(float), NULL, &err);
        vPath[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)*nObs, NULL, &err);
    }

#ifdef GPU_PROFILING
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clFinish(cqCommandQue[iDevice]);;
    }
	shrDeltaT(1);
#endif

    size_t szWorkGroup;
	for (int iDevice = 0; iDevice < nDevice; iDevice++)
	{
		szWorkGroup = oHmm[iDevice]->ViterbiSearch(vProb[iDevice], vPath[iDevice], obs[iDevice]);
	}

	for (int iDevice = 0; iDevice < nDevice; iDevice++)
	{
	  clFinish(cqCommandQue[iDevice]);
	}

#ifdef GPU_PROFILING
    double dElapsedTime = shrDeltaT(1);
    shrLog(LOGBOTH | MASTER, 0, "oclHiddenMarkovModel, Throughput = %.4f, Time = %.5f, Size = %u, NumDevsUsed = %u, Workgroup = %u\n",
        (1.0e-6 * nDevice * nState * nObs)/dElapsedTime, dElapsedTime, (nDevice * nState * nObs), nDevice, szWorkGroup); 
#endif

    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        err = clEnqueueReadBuffer(cqCommandQue[iDevice], vPath[iDevice], CL_TRUE, 0, sizeof(int)*nObs, viterbiPathGPU[iDevice], 0, NULL, NULL);
    }

    shrLog(LOGBOTH, 0, "\nCompute Viterbi path on CPU\n");
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        err = ViterbiCPU(viterbiProbCPU[iDevice], viterbiPathCPU[iDevice], obs[iDevice], nObs, initProb, mtState, nState, mtEmit);
    }
    
    if (!err)
    {
        shrEXIT(argc, argv);
    }

    bool pass = true;
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        for (int i = 0; i < nObs; i++)
        {
            if (viterbiPathCPU[iDevice][i] != viterbiPathGPU[iDevice][i]) 
            {
                pass = false;
                break;
            }
        }
    }
    shrLog(LOGBOTH, 0, "\nTEST %s\n\n", (pass) ? "PASSED" : "FAILED !!!");

        
    // NOTE:  Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
    shrLog(LOGBOTH, 0, "Release CPU buffers and OpenCL objects...\n"); 
    free(initProb);
    free(mtState);
    free(mtEmit);
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        free(obs[iDevice]);
        free(viterbiPathCPU[iDevice]);
        free(viterbiPathGPU[iDevice]);
        delete oHmm[iDevice];
        clReleaseCommandQueue(cqCommandQue[iDevice]);
    }
    free(obs);
    free(viterbiPathCPU);
    free(viterbiPathGPU);
    free(viterbiProbCPU);
    free(cdDevices);
    free(oHmm);
    clReleaseContext(cxGPUContext);

    shrEXIT(argc, argv);
}
// main function
//*****************************************************************************
int main(int argc, const char **argv)
{
    cl_platform_id cpPlatform;      // OpenCL platform
    cl_uint nDevice;                // OpenCL device count
    cl_device_id* cdDevices;        // OpenCL device list    
    cl_context cxGPUContext;        // OpenCL context
    cl_command_queue cqCommandQue[MAX_GPU_COUNT];             // OpenCL command que
    cl_int ciErrNum = 1;            // Error code var

    shrQAStart(argc, (char **)argv);

    // Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("clGetPlatformID...\n"); 

    //Get all the devices
    cl_uint uiNumDevices = 0;           // Number of devices available
    cl_uint uiTargetDevice = 0;	        // Default Device to compute on
    cl_uint uiNumComputeUnits;          // Number of compute units (SM's on NV GPU)
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    // Get command line device options and config accordingly
    shrLog("  # of Devices Available = %u\n", uiNumDevices); 
    if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE) 
    {
        uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
    }
    shrLog("  Using Device %u: ", uiTargetDevice); 
    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
    ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("\n  # of Compute Units = %u\n", uiNumComputeUnits); 
	
    shrSetLogFileName ("oclHiddenMarkovModel.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    shrLog("Get platform...\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("Get devices...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id *)malloc(nDevice * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, nDevice, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clCreateContext\n");
    cxGPUContext = clCreateContext(0, nDevice, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clCreateCommandQueue\n"); 
    int id_device;
    if(shrGetCmdLineArgumenti(argc, argv, "device", &id_device)) // Set up command queue(s) for GPU specified on the command line
    {
        // create a command que
        cqCommandQue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[id_device], 0, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        oclPrintDevInfo(LOGBOTH, cdDevices[id_device]);
        nDevice = 1;   
    } 
    else 
    { // create command queues for all available devices        
        for (cl_uint i = 0; i < nDevice; i++) 
        {
            cqCommandQue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        }
        for (cl_uint i = 0; i < nDevice; i++) oclPrintDevInfo(LOGBOTH, cdDevices[i]);
    }

    shrLog("\nUsing %d GPU(s)...\n\n", nDevice);
	int wgSize;
	if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &wgSize)) 
	{
		wgSize = 256;
	}

    shrLog("Init Hidden Markov Model parameters\n");
    int nState = 256*16; // number of states, must be a multiple of 256
    int nEmit  = 128; // number of possible observations
    
    float *initProb = (float*)malloc(sizeof(float)*nState); // initial probability
    float *mtState  = (float*)malloc(sizeof(float)*nState*nState); // state transition matrix
    float *mtEmit   = (float*)malloc(sizeof(float)*nEmit*nState); // emission matrix
    initHMM(initProb, mtState, mtEmit, nState, nEmit);

    // define observational sequence
    int nObs = 100; // size of observational sequence
    int **obs = (int**)malloc(nDevice*sizeof(int*));
    int **viterbiPathCPU = (int**)malloc(nDevice*sizeof(int*));
    int **viterbiPathGPU = (int**)malloc(nDevice*sizeof(int*));
    float *viterbiProbCPU = (float*)malloc(nDevice*sizeof(float)); 
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        obs[iDevice] = (int*)malloc(sizeof(int)*nObs);
        for (int i = 0; i < nObs; i++)
            obs[iDevice][i] = i % 15;
        viterbiPathCPU[iDevice] = (int*)malloc(sizeof(int)*nObs);
        viterbiPathGPU[iDevice] = (int*)malloc(sizeof(int)*nObs);
    }

    shrLog("# of states = %d\n# of possible observations = %d \nSize of observational sequence = %d\n\n",
        nState, nEmit, nObs);

    shrLog("Compute Viterbi path on GPU\n\n");

    HMM **oHmm = (HMM**)malloc(nDevice*sizeof(HMM*));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        oHmm[iDevice] = new HMM(cxGPUContext, cqCommandQue[iDevice], initProb, mtState, mtEmit, nState, nEmit, nObs, argv[0], wgSize);
    }

    cl_mem *vProb = (cl_mem*)malloc(sizeof(cl_mem)*nDevice);
    cl_mem *vPath = (cl_mem*)malloc(sizeof(cl_mem)*nDevice);
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        vProb[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(float), NULL, &ciErrNum);
        vPath[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)*nObs, NULL, &ciErrNum);
    }

#ifdef GPU_PROFILING
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clFinish(cqCommandQue[iDevice]);;
    }
	shrDeltaT(1);
#endif

    size_t szWorkGroup;
	for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
	{
		szWorkGroup = oHmm[iDevice]->ViterbiSearch(vProb[iDevice], vPath[iDevice], obs[iDevice]);
	}

	for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
	{
	  clFinish(cqCommandQue[iDevice]);
	}

#ifdef GPU_PROFILING
    double dElapsedTime = shrDeltaT(1);
    shrLogEx(LOGBOTH | MASTER, 0, "oclHiddenMarkovModel, Throughput = %.4f GB/s, Time = %.5f s, Size = %u items, NumDevsUsed = %u, Workgroup = %u\n",
        (1.0e-9 * 2.0 * sizeof(float) * nDevice * nState * nState * (nObs-1))/dElapsedTime, dElapsedTime, (nDevice * nState * nObs), nDevice, szWorkGroup); 

#endif

    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        ciErrNum = clEnqueueReadBuffer(cqCommandQue[iDevice], vPath[iDevice], CL_TRUE, 0, sizeof(int)*nObs, viterbiPathGPU[iDevice], 0, NULL, NULL);
    }

    shrLog("\nCompute Viterbi path on CPU\n");
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        ciErrNum = ViterbiCPU(viterbiProbCPU[iDevice], viterbiPathCPU[iDevice], obs[iDevice], nObs, initProb, mtState, nState, mtEmit);
    }
    
    if (!ciErrNum)
    {
        shrEXIT(argc, argv);
    }

    bool pass = true;
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        for (int i = 0; i < nObs; i++)
        {
            if (viterbiPathCPU[iDevice][i] != viterbiPathGPU[iDevice][i]) 
            {
                pass = false;
                break;
            }
        }
    }
        
    // NOTE:  Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
    shrLog("Release CPU buffers and OpenCL objects...\n"); 
    free(initProb);
    free(mtState);
    free(mtEmit);
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        free(obs[iDevice]);
        free(viterbiPathCPU[iDevice]);
        free(viterbiPathGPU[iDevice]);
        delete oHmm[iDevice];
        clReleaseCommandQueue(cqCommandQue[iDevice]);
    }
    free(obs);
    free(viterbiPathCPU);
    free(viterbiPathGPU);
    free(viterbiProbCPU);
    free(cdDevices);
    free(oHmm);
    clReleaseContext(cxGPUContext);

    // finish
    shrQAFinishExit(argc, (const char **)argv, pass ? QA_PASSED : QA_FAILED);

    shrEXIT(argc, argv);
}
Esempio n. 3
0
int main(int argc, char *argv[])
{
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel_one, kernel_path;
    cl_mem d_mt_state, d_mt_emit, d_max_prob_old;
    cl_mem d_max_prob_new, d_path, v_prob, v_path;

    int wg_size = 256;
    int n_state = 256*16;
    int n_emit = 128;
    int n_obs = 100;

    size_t init_prob_size = sizeof(float) * n_state;
    size_t mt_state_size = sizeof(float) * n_state * n_state;
    size_t mt_emit_size = sizeof(float) * n_emit * n_state;

    float *init_prob = (float *) malloc(init_prob_size);
    float *mt_state = (float *) malloc(mt_state_size);
    float *mt_emit = (float *) malloc(mt_emit_size);
    int *obs = (int *) malloc(sizeof(int) * n_obs);
    int *viterbi_gpu = (int *) malloc(sizeof(int) * n_obs);

    srand(2012);
    initHMM(init_prob, mt_state, mt_emit, n_state, n_emit);

    int i;
    for (i = 0; i < n_obs; i++) {
        obs[i] = i % 15;
    }

    const char *source = load_program_source("Viterbi.cl");
    size_t source_len = strlen(source);;
    cl_uint err = 0;

    char *flags = "-cl-fast-relaxed-math";

    clGetPlatformIDs(1, &platform, NULL);
    printf("platform %p err %d\n", platform, err);

    clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err);
    printf("device %p err %d\n", device, err);

    context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    printf("context %p err %d\n", context, err);

    queue = clCreateCommandQueue(context, device, 0, &err);
    printf("queue %p err %d\n", queue, err);

    program = clCreateProgramWithSource(context, 1, &source, &source_len, &err);
    printf("program %p err %d\n", program, err);

    err = clBuildProgram(program, 0, NULL, flags, NULL, NULL);
    printf("err %d\n", err);

    /*
    char tmp[102400];
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(tmp),
        tmp, NULL);

    printf("error %s\n", tmp);
    */

    kernel_one = clCreateKernel(program, "ViterbiOneStep", &err);
    printf("kernel %p err %d\n", kernel_one, err);

    kernel_path = clCreateKernel(program, "ViterbiPath", &err);
    printf("kernel %p err %d\n", kernel_path, err);

    d_mt_state = clCreateBuffer(context, CL_MEM_READ_ONLY, mt_state_size, 
        NULL, &err);
    printf("buffer %p\n", d_mt_state);

    d_mt_emit = clCreateBuffer(context, CL_MEM_READ_ONLY, mt_emit_size, 
        NULL, &err);
    printf("buffer %p\n", d_mt_emit);

    d_max_prob_new = clCreateBuffer(context, CL_MEM_READ_WRITE, 
        init_prob_size, NULL, &err);
    printf("buffer %p\n", d_max_prob_new);

    d_max_prob_old = clCreateBuffer(context, CL_MEM_READ_WRITE, 
        init_prob_size, NULL, &err);
    printf("buffer %p\n", d_max_prob_old);

    d_path = clCreateBuffer(context, CL_MEM_READ_WRITE, 
        sizeof(int)*(n_obs-1)*n_state, NULL, &err);
    printf("buffer %p\n", d_path);

    v_prob = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float),
        NULL, &err);
    printf("buffer %p\n", v_prob);

    v_path = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*n_obs,
        NULL, &err);
    printf("buffer %p\n", v_prob);

    err = clEnqueueWriteBuffer(queue, d_mt_state, CL_TRUE, 0, mt_state_size,
        mt_state, 0, NULL, NULL);
    printf("err %d\n", err);

    err = clEnqueueWriteBuffer(queue, d_mt_emit, CL_TRUE, 0, mt_emit_size,
        mt_emit, 0, NULL, NULL);
    printf("err %d\n", err);

    err = clEnqueueWriteBuffer(queue, d_max_prob_old, CL_TRUE, 0, init_prob_size,
        init_prob, 0, NULL, NULL);
    printf("err %d\n", err);

    // max_wg_size is 1024 for Intel Core 2 CPU
    size_t max_wg_size;
    err = clGetKernelWorkGroupInfo(kernel_one, device, 
        CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL);
    printf("max_wg_size %d\n", max_wg_size);

    size_t local_work_size[2], global_work_size[2];
    local_work_size[0] = wg_size;
    local_work_size[1] = 1;
    global_work_size[0] = local_work_size[0] * 256;
    global_work_size[1] = n_state/256;

    for (i = 1; i < n_obs; i++) {
        err = clSetKernelArg(kernel_one, 0, sizeof(cl_mem), 
            (void*)&d_max_prob_new);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 1, sizeof(cl_mem), 
            (void*)&d_path);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 2, sizeof(cl_mem), 
            (void*)&d_max_prob_old);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 3, sizeof(cl_mem), 
            (void*)&d_mt_state);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 4, sizeof(cl_mem),
            (void*)&d_mt_emit);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 5, sizeof(float)*local_work_size[0],
            NULL);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 6, sizeof(int)*local_work_size[0],
            NULL);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 7, sizeof(int),
            (void*)&n_state);
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 8, sizeof(int),
            (void*)&(obs[i]));
        printf("err %d\n", err);

        err = clSetKernelArg(kernel_one, 9, sizeof(int),
            (void*)&i);
        printf("err %d\n", err);


        err = clEnqueueNDRangeKernel(queue, kernel_one, 2, NULL, 
            global_work_size, local_work_size, 0, NULL, NULL);
        printf("err %d\n", err);

        err = clEnqueueCopyBuffer(queue, d_max_prob_new, d_max_prob_old, 0, 0,
            sizeof(float)*n_state, 0, NULL, NULL);
        printf("err %d\n", err);
    }

    local_work_size[0] = 1;
    global_work_size[0] = 1;

    err = clSetKernelArg(kernel_path, 0, sizeof(cl_mem), (void*)&v_prob);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel_path, 1, sizeof(cl_mem), (void*)&v_path);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel_path, 2, sizeof(cl_mem), 
        (void*)&d_max_prob_new);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel_path, 3, sizeof(cl_mem), (void*)&d_path);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel_path, 4, sizeof(int), (void*)&n_state);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel_path, 5, sizeof(int), (void*)&n_obs);
    printf("err %d\n", err);

    err = clEnqueueNDRangeKernel(queue, kernel_path, 1, NULL, 
        global_work_size, local_work_size, 0, NULL, NULL);
    printf("err %d\n", err);

    clFinish(queue);
    printf("finish done\n");

    err = clEnqueueReadBuffer(queue, v_path, CL_TRUE, 0, sizeof(int)*n_obs, 
        viterbi_gpu, 0, NULL, NULL);
    printf("err %d\n", err);

    for (i = 0; i < n_obs; i++) {
        printf("%d %d\n", i, viterbi_gpu[i]);
    }

    clReleaseMemObject(d_mt_state);
    clReleaseMemObject(d_mt_emit);
    clReleaseMemObject(d_max_prob_old);
    clReleaseMemObject(d_max_prob_new);
    clReleaseMemObject(d_path);
    clReleaseMemObject(v_prob);
    clReleaseMemObject(v_path);
    clReleaseProgram(program);
    clReleaseKernel(kernel_one);
    clReleaseKernel(kernel_path);
    clReleaseCommandQueue(queue);
}