Пример #1
0
/*
 * Reads the text file `fileName` and splits its contents into two heap
 * buffers: the first floor(size/2) bytes and the remainder.
 *
 * Each buffer is allocated one byte larger than its share of the file and is
 * zero-filled by calloc, so it is always NUL-terminated. The reported lengths
 * are the byte counts actually returned by fread, not strlen results.
 *
 * On success returns 0 and hands ownership of both buffers to the caller
 * (release with free). On failure prints a message to stderr, returns -1 and
 * leaves both output pointers NULL. A short read is reported ("reading
 * error") but is not treated as fatal.
 *
 * Files whose size does not fit in an int are rejected.
 */
static int LoadSplitHaystackFromFile(const char *fileName,
                                     char **firstData, int *firstLen,
                                     char **secondData, int *secondLen)
{
    *firstData = NULL;
    *secondData = NULL;
    *firstLen = 0;
    *secondLen = 0;

    FILE *filePtr = fopen(fileName, "r");
    if (!filePtr) {
        fprintf(stderr, "can not open the text file!\n");
        return -1;
    }

    // Determine the file size by seeking to the end.
    long endPos = -1;
    if (fseek(filePtr, 0, SEEK_END) == 0) {
        endPos = ftell(filePtr);
    }
    int fileSize = (int)endPos;
    if (endPos < 0 || (long)fileSize != endPos) {
        fprintf(stderr, "can not determine the size of the text file!\n");
        fclose(filePtr);
        return -1;
    }
    rewind(filePtr);

    int firstSize = fileSize / 2;
    int secondSize = fileSize - firstSize;

    // +1 keeps each buffer NUL-terminated and avoids a zero-byte allocation.
    char *first = (char*)calloc((size_t)firstSize + 1, sizeof(char));
    char *second = (char*)calloc((size_t)secondSize + 1, sizeof(char));
    if (!first || !second) {
        fprintf(stderr, "can not allocate memory for the text file!\n");
        free(first);
        free(second);
        fclose(filePtr);
        return -1;
    }

    size_t firstRead = fread(first, sizeof(char), (size_t)firstSize, filePtr);
    size_t secondRead = fread(second, sizeof(char), (size_t)secondSize, filePtr);
    if (firstRead + secondRead != (size_t)fileSize) {
        fprintf(stderr, "reading error");
    }
    fclose(filePtr);

    *firstData = first;
    *firstLen = (int)firstRead;
    *secondData = second;
    *secondLen = (int)secondRead;
    return 0;
}

/*
 * Quick Search string matching, with the work divided between an OpenCL CPU
 * device and an OpenCL GPU device.
 *
 * Usage: <program> <needle> <text file>
 *
 * The text file is cut in two: the first half is handed to the GPU kernel and
 * the remainder to the CPU kernel. Both kernels are named "QSMatch" and are
 * loaded from the ./kernel directory. The per-work-group result arrays are
 * passed to PrintTheResults.
 *
 * NOTE(review): the two halves do not overlap, so an occurrence of the needle
 * that straddles the split point is not seen by either device.
 *
 * Returns 0 on normal completion and on a usage error (as before), 1 when the
 * skip table or the text file cannot be prepared. Failures of individual
 * OpenCL calls are reported on stdout and execution continues, so that one
 * device failing does not prevent the other from being tried.
 */
int main(int argc, char ** argv) {
    if (argc != 3) {
        printf("wrong option... \n");
        return 0;
    }

    // Work decomposition. Enumerators (not variables) so that the result
    // arrays below are ordinary fixed-size arrays sized from the group counts.
    enum {
        cpuLocalSize = 64,
        cpuGroupNum = 4,
        gpuLocalSize = 256,
        gpuGroupNum = 5 * 8
    };

    char *needleData = argv[1];
    int needleLen = strlen(needleData);
    int characterSetSize = 128; // ASCII set

    int *skipTable = QSPrecomputation(needleData, characterSetSize);
    if (!skipTable) {
        fprintf(stderr, "can not build the skip table!\n");
        return 1;
    }

    char *cckernelFile = "./kernel/QS_CPU_LocalCounter_OpenCL_Kernel.cl"; // the kernel for CPU
    char *ggkernelFile = "./kernel/QS_GPU_LocalMem_LocalCounter_OpenCL_Kernel.cl"; // the kernel for GPU
    char *cckernelSrc = LoadKernelSrcFromFile(cckernelFile);
    char *ggkernelSrc = LoadKernelSrcFromFile(ggkernelFile);
    char *fileName = argv[2];

    // Load the haystacks from file: first half for the GPU, rest for the CPU.
    char *gghaystackData = NULL;
    char *cchaystackData = NULL;
    int gghaystackLen = 0;
    int cchaystackLen = 0;
    if (LoadSplitHaystackFromFile(fileName,
                                  &gghaystackData, &gghaystackLen,
                                  &cchaystackData, &cchaystackLen) != 0) {
        free(skipTable);
        free(cckernelSrc);
        free(ggkernelSrc);
        return 1;
    }

    cl_int err;
    // NULL-initialised so that a failed lookup never leaves an indeterminate
    // handle to be passed on to the following calls.
    cl_platform_id platform = NULL;
    cl_device_id cdevice = NULL;
    cl_device_id ggdevice = NULL;

    // Find a platform
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf("cant find a platform! \n");
    }

    // Set up devices
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &cdevice, NULL);
    if (err != CL_SUCCESS) {
        printf("cant find CPU! \n");
    }
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ggdevice, NULL);
    if (err != CL_SUCCESS) {
        printf("cant find GPU! \n");
    }

    // Create one context per device
    cl_context ccontext = clCreateContext(NULL, 1, &cdevice, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("cant create a ccontext! \n");
    }
    cl_context ggcontext = clCreateContext(NULL, 1, &ggdevice, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("cant create a ggcontext! \n");
    }

    // Create command queues
    cl_command_queue cqueue = clCreateCommandQueue(ccontext, cdevice, 0, &err);
    if (err != CL_SUCCESS) {
        printf("cant create a  cqueue! \n");
    }
    cl_command_queue ggqueue = clCreateCommandQueue(ggcontext, ggdevice, 0, &err);
    if (err != CL_SUCCESS) {
        printf("cant create a  ggqueue! \n");
    }

    // Create the program objects
    cl_program cprogram = clCreateProgramWithSource(ccontext, 1, (const char**)&cckernelSrc, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("cant create the cprogram! \n");
    }
    cl_program ggprogram = clCreateProgramWithSource(ggcontext, 1, (const char**)&ggkernelSrc, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("cant create the ggprogram! \n");
    }

    // Build the program executables
    err = clBuildProgram(cprogram, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("cant build the cprogramm exe! \n");
    }
    err = clBuildProgram(ggprogram, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("cant build the ggprogramm exe! \n");
    }

    // Create the kernels
    cl_kernel ckernel = clCreateKernel(cprogram, "QSMatch", &err);
    if (err != CL_SUCCESS) {
        printf("cant create the ckernel! \n");
    }
    cl_kernel ggkernel = clCreateKernel(ggprogram, "QSMatch", &err);
    if (err != CL_SUCCESS) {
        printf("cant create the ggkernel! \n");
    }

    // Create the haystack buffers (one per context, each holding its own half)
    cl_mem cchaystackBuffer = clCreateBuffer(ccontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cchaystackLen, cchaystackData, &err);
    if (err != CL_SUCCESS) {
        printf("couldn't create the cchaystackBuffer \n");
    }
    cl_mem gghaystackBuffer = clCreateBuffer(ggcontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, gghaystackLen, gghaystackData, &err);
    if (err != CL_SUCCESS) {
        printf("couldn't create the gghaystackBuffer \n");
    }

    // Determine global size and local size
    size_t cclocalSize = cpuLocalSize;
    size_t gglocalSize = gpuLocalSize;
    size_t ccGlobalSize = (size_t)cpuLocalSize * cpuGroupNum;
    size_t ggGlobalSize = (size_t)gpuLocalSize * gpuGroupNum;

    // Characters handled by each work-item. The +1 rounds up so that the
    // tail of the text is still covered when the length is not a multiple of
    // the global size. Each device uses the length of its own half.
    int cccharsPerItem = cchaystackLen / ccGlobalSize + 1;
    int ggcharsPerItem = gghaystackLen / ggGlobalSize + 1;

    // Create the results buffers, one int per work-group, zero-initialised
    int ccres[cpuGroupNum] = {0};
    int ggres[gpuGroupNum] = {0};

    cl_mem ccresBuffer = clCreateBuffer(ccontext, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(ccres), ccres, &err);
    if (err != CL_SUCCESS) {
        printf("couldn't create the ccresBuffer");
    }
    cl_mem ggresBuffer = clCreateBuffer(ggcontext, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(ggres), ggres, &err);
    if (err != CL_SUCCESS) {
        printf("couldn't create the ggresBuffer");
    }

    // Create the needle buffers (the needle is copied without its terminator)
    cl_mem ccneedleBuffer = clCreateBuffer(ccontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(char)*needleLen, needleData, &err);
    if (err != CL_SUCCESS) {
        printf("couldn't create the ccneedleBuffer \n");
    }
    cl_mem ggneedleBuffer = clCreateBuffer(ggcontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(char)*needleLen, needleData, &err);
    if (err != CL_SUCCESS) {
        printf("couldn't create the ggneedleBuffer \n");
    }

    // Create the skip table buffers
    cl_mem ccskipTableBuffer = clCreateBuffer(ccontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*characterSetSize, skipTable, &err);
    if (err != CL_SUCCESS) {
        printf("couldn't create the ccskipTableBuffer \n");
    }
    cl_mem ggskipTableBuffer = clCreateBuffer(ggcontext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*characterSetSize, skipTable, &err);
    if (err != CL_SUCCESS) {
        printf("couldn't create the ggskipTableBuffer \n");
    }

    // Set the kernel arguments for CPU.
    // A NULL value with a size reserves local memory of that size.
    err = clSetKernelArg(ckernel, 0, sizeof(cl_mem), &cchaystackBuffer);
    err |= clSetKernelArg(ckernel, 1, sizeof(cl_mem), &ccneedleBuffer);
    err |= clSetKernelArg(ckernel, 2, sizeof(cl_mem), &ccskipTableBuffer);
    err |= clSetKernelArg(ckernel, 3, sizeof(int)*1, NULL);
    err |= clSetKernelArg(ckernel, 4, sizeof(needleLen), &needleLen);
    err |= clSetKernelArg(ckernel, 5, sizeof(cccharsPerItem), &cccharsPerItem);
    err |= clSetKernelArg(ckernel, 6, sizeof(cl_mem), &ccresBuffer);
    if (err != CL_SUCCESS) {
        printf("couldn't set the ckernel arguments \n");
    }

    // Set the kernel arguments for GPU. Arguments 2 and 4 reserve local
    // memory sized for the needle and for the skip table respectively.
    err = clSetKernelArg(ggkernel, 0, sizeof(cl_mem), &gghaystackBuffer);
    err |= clSetKernelArg(ggkernel, 1, sizeof(cl_mem), &ggneedleBuffer);
    err |= clSetKernelArg(ggkernel, 2, sizeof(char)*needleLen, NULL);
    err |= clSetKernelArg(ggkernel, 3, sizeof(cl_mem), &ggskipTableBuffer);
    err |= clSetKernelArg(ggkernel, 4, sizeof(int)*characterSetSize, NULL);
    err |= clSetKernelArg(ggkernel, 5, sizeof(int)*1, NULL);
    err |= clSetKernelArg(ggkernel, 6, sizeof(needleLen), &needleLen);
    err |= clSetKernelArg(ggkernel, 7, sizeof(ggcharsPerItem), &ggcharsPerItem);
    err |= clSetKernelArg(ggkernel, 8, sizeof(cl_mem), &ggresBuffer);
    if (err != CL_SUCCESS) {
        printf("couldn't set the ggkernel arguments \n");
    }

    // Execute the kernel on CPU
    err = clEnqueueNDRangeKernel(cqueue, ckernel, 1, NULL, &ccGlobalSize, &cclocalSize, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("ckernel could not be executed... \n");
    }
    else {
        printf("ckernel has been executed successfully! \n");
    }

    // Execute the kernel on GPU
    err = clEnqueueNDRangeKernel(ggqueue, ggkernel, 1, NULL, &ggGlobalSize, &gglocalSize, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("ggkernel could not be executed... \n");
    }
    else {
        printf("ggkernel has been executed successfully! \n");
    }

    // Wait until both queues have drained
    clFinish(ggqueue);
    clFinish(cqueue);

    // Copy the results back (blocking reads) and report them
    clEnqueueReadBuffer(cqueue, ccresBuffer, CL_TRUE, 0, sizeof(ccres), ccres, 0, NULL, NULL);
    clEnqueueReadBuffer(ggqueue, ggresBuffer, CL_TRUE, 0, sizeof(ggres), ggres, 0, NULL, NULL);
    PrintTheResults(ccres, ggres, gpuGroupNum);

    // Clean up
    free(skipTable);
    free(cckernelSrc);
    free(ggkernelSrc);
    free(cchaystackData);
    free(gghaystackData);
    clReleaseMemObject(cchaystackBuffer);
    clReleaseMemObject(ccneedleBuffer);
    clReleaseMemObject(ccskipTableBuffer);
    clReleaseMemObject(ccresBuffer);
    clReleaseMemObject(gghaystackBuffer);
    clReleaseMemObject(ggneedleBuffer);
    clReleaseMemObject(ggskipTableBuffer);
    clReleaseMemObject(ggresBuffer);
    clReleaseKernel(ckernel);
    clReleaseKernel(ggkernel);
    clReleaseProgram(cprogram);
    clReleaseProgram(ggprogram);
    clReleaseCommandQueue(cqueue);
    clReleaseCommandQueue(ggqueue);
    clReleaseContext(ccontext);
    clReleaseContext(ggcontext);

    return 0;
}
Пример #2
0
int main(int argc, char * argv[]) {
	if(argc < 3){
		printf("wrong option... \n");
		return 0;
	}
	int t;// Reusing t can improve the performance somehow
	int needleNum = argc - 2;//Calculate the number of needle
    string needles[needleNum]; 
    for(t=0;t<needleNum;t++){
		needles[t] = argv[t+1];
	} 
	int cpuNeedleNum = needleNum*0.5;
	int gpuNeedleNum = needleNum - cpuNeedleNum;
	if(cpuNeedleNum + gpuNeedleNum == needleNum){
		printf("the needles number is right \n");
	}
	string cpuNeedles[cpuNeedleNum];
	string gpuNeedles[gpuNeedleNum];
	for(t=0;t<cpuNeedleNum;t++){
		cpuNeedles[t] = needles[t];
	}
	for(t=0;t<gpuNeedleNum;t++){
		gpuNeedles[t] = needles[t+cpuNeedleNum];
	}
	
    //Concatenating the needles into one
    char *cpuNeedlesData = ConcatenateNeedlesIntoOneDataBlock(cpuNeedles, cpuNeedleNum);
    printf("cpuNeedlesData = %s \n", cpuNeedlesData);
    char *gpuNeedlesData = ConcatenateNeedlesIntoOneDataBlock(gpuNeedles, gpuNeedleNum);
    printf("gpuNeedlesData = %s \n", gpuNeedlesData);
    int cpuLenOfneedlesData = strlen(cpuNeedlesData);
    int gpuLenOfneedlesData = strlen(gpuNeedlesData);
    int cpuShortestNeedleLen = findShortestNeedle(cpuNeedles,cpuNeedleNum);
    int gpuShortestNeedleLen = findShortestNeedle(gpuNeedles,gpuNeedleNum);
    int cpuLongestNeedleLen = findLongestNeedle(cpuNeedles,cpuNeedleNum);
    int gpuLongestNeedleLen = findLongestNeedle(gpuNeedles,gpuNeedleNum);
   
    int characterSetSize = 128;//ASCii set
    //~ int characterSetSize = 256; //UTF-8
    int *cpuSkipTable; //CPU skipTable
    int *gpuSkipTable; //GPU skipTable
    cpuSkipTable = SetHorspoolPrecomputation(cpuNeedles, characterSetSize, cpuShortestNeedleLen, cpuNeedleNum);
    gpuSkipTable = SetHorspoolPrecomputation(gpuNeedles, characterSetSize, gpuShortestNeedleLen, gpuNeedleNum);
    
    //Calculating the lastPosOfEachNeedle array and lenOfEachNeedle array
    int cpuLastPosOfEachNeedle[cpuNeedleNum];
    int gpuLastPosOfEachNeedle[gpuNeedleNum];
    int cpuLenOfEachNeedle[cpuNeedleNum]; 
    int gpuLenOfEachNeedle[gpuNeedleNum]; 
    for(t=0;t<cpuNeedleNum;t++){
		cpuLenOfEachNeedle[t] = strlen(cpuNeedles[t]); 
		cpuLastPosOfEachNeedle[t] = strlen(cpuNeedles[t]) - 1;
	}
	for(t=0;t<gpuNeedleNum;t++){
		gpuLenOfEachNeedle[t] = strlen(gpuNeedles[t]); 
		gpuLastPosOfEachNeedle[t] = strlen(gpuNeedles[t]) - 1;
	}

    char *cpuKernelFile = "./kernel/SH_CPU_Kernel.cl";//The kernel file's name
    char *gpuKernelFile = "./kernel/SH_GPU_LocalMem_OpenCL_Kernel.cl";//The kernel file's name
    char *cpuKernelSrc = LoadKernelSrcFromFile(cpuKernelFile);
    char *gpuKernelSrc = LoadKernelSrcFromFile(gpuKernelFile);
    char *textFile = argv[argc-1];// The text file's name
    char *haystackData = LoadHaystackDataFromFile(textFile);
    int haystackLen = strlen(haystackData);

    cl_int err;
    cl_platform_id platform;
    cl_device_id cpuDevice;
    cl_device_id gpuDevice;

    //Find a platform
    err = clGetPlatformIDs(1, &platform, NULL);
    if(err != CL_SUCCESS) {
        printf("cant find a platform! \n");
    }
    
    //Set up devices
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &cpuDevice, NULL);
    if(err != CL_SUCCESS) {
        printf("cant find a cpu device! \n");
    }
     err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &gpuDevice, NULL);
    if(err != CL_SUCCESS) {
        printf("cant find a gpu device! \n");
    }
    //Create context
    cl_context cpuContext = clCreateContext(NULL, 1, &cpuDevice, NULL, NULL, &err);
    if(err != CL_SUCCESS) {
        printf("cant create a cpu context! \n");
    }
    cl_context gpuContext = clCreateContext(NULL, 1, &gpuDevice, NULL, NULL, &err);
    if(err != CL_SUCCESS) {
        printf("cant create a gpu context! \n");
    }
    
    //Create command queue
    cl_command_queue cpuQueue = clCreateCommandQueue(cpuContext, cpuDevice, 0, &err);
    if(err != CL_SUCCESS) {
        printf("cant create a cpu queue! \n");
    }
    cl_command_queue gpuQueue = clCreateCommandQueue(gpuContext, gpuDevice, 0, &err);
    if(err != CL_SUCCESS) {
        printf("cant create a gpu queue! \n");
    }
    
    //Create the programm object
    cl_program cpuProgram = clCreateProgramWithSource(cpuContext, 1, (const char**)&cpuKernelSrc, NULL, &err);
    if(err != CL_SUCCESS) {
        printf("cant build the cpu program! \n");
    }
    cl_program gpuProgram = clCreateProgramWithSource(gpuContext, 1, (const char**)&gpuKernelSrc, NULL, &err);
    if(err != CL_SUCCESS) {
        printf("cant build the gpu program! \n");
    }
    
    //Build the programm executable
    err = clBuildProgram(cpuProgram, 0, NULL, NULL, NULL, NULL);
    if(err != CL_SUCCESS) {
        printf("cant build the cpu programm exe! \n");
    }
    err = clBuildProgram(gpuProgram, 0, NULL, NULL, NULL, NULL);
    if(err != CL_SUCCESS) {
        printf("cant build the gpu programm exe! \n");
    }
    
    //Create the kernel
    cl_kernel cpuKernel = clCreateKernel(cpuProgram, "SetHorspoolMatch", &err);
    if(err != CL_SUCCESS) {
        printf("cant create the cpu kernel! \n");
    }
    cl_kernel gpuKernel = clCreateKernel(gpuProgram, "SetHorspoolMatch", &err);
    if(err != CL_SUCCESS) {
        printf("cant create the gpu kernel! \n");
    }
    
    //Create the haystack buffer 
    //CPU and GPU program share the haystack buffer
    cl_mem cpuHaystackBuffer = clCreateBuffer(cpuContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, haystackLen, haystackData, &err);
    if(err != CL_SUCCESS) {
        printf("couldn't create the cpuHaystackData buffer \n");
    }
    cl_mem gpuHaystackBuffer = clCreateBuffer(cpuContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, haystackLen, haystackData, &err);
    if(err != CL_SUCCESS) {
        printf("couldn't create the gpuHaystackData buffer \n");
    }
    
    size_t cclocalSize;
    size_t gglocalSize;
    cclocalSize = 64;
    gglocalSize = 256;
    size_t cpuGlobalSize = 64*4;//CPU
    size_t gpuGlobalSize = 256*24*8;//GPU
    //~ size_t gpuGlobalSize = 256*5*8;//GPU
    //Group Number
    int cpuGroupNum = (int)cpuGlobalSize/cclocalSize;
    int gpuGroupNum = (int)gpuGlobalSize/gglocalSize;
    //CPU GPU share the charsPerItem
    int cpuCharsPerItem = haystackLen/cpuGlobalSize + 1;
    int gpuCharsPerItem = haystackLen/gpuGlobalSize + 1;
    
    //Create the results buffer
    int cpuRes[cpuNeedleNum*cpuGroupNum];
    int gpuRes[gpuNeedleNum*gpuGroupNum];
    for(t=0;t<cpuNeedleNum*cpuGroupNum;t++){
		cpuRes[t] = 0;
	}
	for(t=0;t<gpuNeedleNum*gpuGroupNum;t++){
		gpuRes[t] = 0;
	}
    cl_mem cpuResBuffer = clCreateBuffer(cpuContext, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cpuRes), cpuRes, &err);//Should be write only
    if(err != CL_SUCCESS) {
        printf("cant create the cpuResBuffer! \n");
    }
    cl_mem gpuResBuffer = clCreateBuffer(gpuContext, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(gpuRes), gpuRes, &err);//Should be write only
    if(err != CL_SUCCESS) {
        printf("cant create the gpuResBuffer! \n");
    }
        
    //Create the needlesBuffer 
    cl_mem cpuNeedlesBuffer = clCreateBuffer(cpuContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(char)*cpuLenOfneedlesData, cpuNeedlesData, &err);//!!cpuNeedlesLen
    if(err != CL_SUCCESS) {
        printf("cant create the cpuNeedlesBuffer! \n");
    }
    cl_mem gpuNeedlesBuffer = clCreateBuffer(gpuContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(char)*gpuLenOfneedlesData, gpuNeedlesData, &err);
    if(err != CL_SUCCESS) {
        printf("cant create the gpuNeedlesBuffer! \n");
    }
    
    //Create the skipTableBuffer
    //The skipTables of CPU and CPU are different
    cl_mem cpuSkipTableBuffer = clCreateBuffer(cpuContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*characterSetSize, cpuSkipTable, &err);
    if(err != CL_SUCCESS) {
        printf("cant create the cpuSkipTableBuffer! \n");
    }
    cl_mem gpuSkipTableBuffer = clCreateBuffer(gpuContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*characterSetSize, gpuSkipTable, &err);
    if(err != CL_SUCCESS) {
        printf("cant create the gpuSkipTableBuffer! \n");
    }
    
    //Create the lastPosOfEachNeedleBuffer
    cl_mem cpuLastPosOfEachNeedleBuffer = clCreateBuffer(cpuContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*cpuNeedleNum, cpuLastPosOfEachNeedle, &err);//!!lastPosOfEachNeedle
    if(err != CL_SUCCESS) {
        printf("cant create the cpuLastPosOfEachNeedleBuffer! \n");
    }
    cl_mem gpuLastPosOfEachNeedleBuffer = clCreateBuffer(gpuContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*gpuNeedleNum, gpuLastPosOfEachNeedle, &err);//!!gpuNeedleNum
    if(err != CL_SUCCESS) {
        printf("cant create the gpuLastPosOfEachNeedleBuffer! \n");
    }
    
    //Create the lenOfEachNeedleBuffer
    cl_mem cpuLenOfEachNeedleBuffer = clCreateBuffer(cpuContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*cpuNeedleNum, cpuLenOfEachNeedle, &err);
    if(err != CL_SUCCESS) {
        printf("cant create the cpuLenOfEachNeedleBuffer! \n");
    }
    cl_mem gpuLenOfEachNeedleBuffer = clCreateBuffer(gpuContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*gpuNeedleNum, gpuLenOfEachNeedle, &err);
    if(err != CL_SUCCESS) {
        printf("cant create the gpuLenOfEachNeedleBuffer! \n");
    }
    
    //Set the kernel arguments for CPU
    err = clSetKernelArg(cpuKernel, 0, sizeof(cl_mem), &cpuHaystackBuffer);
    err |= clSetKernelArg(cpuKernel, 1, sizeof(cl_mem), &cpuNeedlesBuffer);
    err |= clSetKernelArg(cpuKernel, 2, sizeof(cl_mem), &cpuSkipTableBuffer);
    err |= clSetKernelArg(cpuKernel, 3, sizeof(cpuLongestNeedleLen), &cpuLongestNeedleLen);
    err |= clSetKernelArg(cpuKernel, 4, sizeof(cpuCharsPerItem), &cpuCharsPerItem);
    err |= clSetKernelArg(cpuKernel, 5, sizeof(cpuNeedleNum), &cpuNeedleNum);
    err |= clSetKernelArg(cpuKernel, 6, sizeof(int)*needleNum, NULL);//localCounter
    err |= clSetKernelArg(cpuKernel, 7, sizeof(cl_mem), &cpuLastPosOfEachNeedleBuffer);
    err |= clSetKernelArg(cpuKernel, 8, sizeof(cl_mem), &cpuLenOfEachNeedleBuffer);
    err |= clSetKernelArg(cpuKernel, 9, sizeof(cl_mem), &cpuResBuffer);
    if(err != CL_SUCCESS) {
        printf("couldn't set the cpu cpuKernel arguments \n");
    }
    
    //Set the kernel arguments for GPU
    err = clSetKernelArg(gpuKernel, 0, sizeof(cl_mem), &gpuHaystackBuffer);
    err |= clSetKernelArg(gpuKernel, 1, sizeof(cl_mem), &gpuNeedlesBuffer);
    err |= clSetKernelArg(gpuKernel, 2, sizeof(int), &gpuLenOfneedlesData);
    err |= clSetKernelArg(gpuKernel, 3, sizeof(char)*gpuLenOfneedlesData, NULL);//Local needlesData
    err |= clSetKernelArg(gpuKernel, 4, sizeof(cl_mem), &gpuSkipTableBuffer);
    err |= clSetKernelArg(gpuKernel, 5, sizeof(int)*128, NULL);//Local tempTable
    err |= clSetKernelArg(gpuKernel, 6, sizeof(int), &gpuLongestNeedleLen);
    err |= clSetKernelArg(gpuKernel, 7, sizeof(int), &gpuCharsPerItem);
    err |= clSetKernelArg(gpuKernel, 8, sizeof(int), &gpuNeedleNum);
    err |= clSetKernelArg(gpuKernel, 9, sizeof(int)*gpuNeedleNum, NULL);
    err |= clSetKernelArg(gpuKernel, 10, sizeof(cl_mem), &gpuLastPosOfEachNeedleBuffer);
    //~ err |= clSetKernelArg(gpuKernel, 11, sizeof(int)*gpuNeedleNum, NULL);//localLPOEN
    err |= clSetKernelArg(gpuKernel, 11, sizeof(cl_mem), &gpuLenOfEachNeedleBuffer);
    //~ err |= clSetKernelArg(gpuKernel, 13, sizeof(cl_mem), NULL);//localLEN
    err |= clSetKernelArg(gpuKernel, 12, sizeof(cl_mem), &gpuResBuffer);
    if(err != CL_SUCCESS) {
        printf("couldn't set the gpu kernel arguments \n");
    }
    
    //Execut the kernel
    //Exe on CPU
    err = clEnqueueNDRangeKernel(cpuQueue, cpuKernel, 1, NULL, &cpuGlobalSize, &cclocalSize, 0, NULL, NULL);
    if(err != CL_SUCCESS) {
        printf("CPU kernel could not be executed... \n");
    } else {
        printf("CPU kernel has been executed successfully! \n");
    }
    //Exe on GPU
    err = clEnqueueNDRangeKernel(gpuQueue, gpuKernel, 1, NULL, &gpuGlobalSize, &gglocalSize, 0, NULL, NULL);
    if(err != CL_SUCCESS) {
        printf("gPU kernel could not be executed... \n");
    } else {
        printf("gPU kernel has been executed successfully! \n");
    }
  
    //Finish the queue list
    clFinish(gpuQueue);
    clFinish(cpuQueue);
    
    //Copy the results
    clEnqueueReadBuffer(cpuQueue, cpuResBuffer, CL_TRUE, 0, sizeof(cpuRes), cpuRes, 0, NULL, NULL);
    clEnqueueReadBuffer(gpuQueue, gpuResBuffer, CL_TRUE, 0, sizeof(gpuRes), gpuRes, 0, NULL, NULL);
    PrintTheResults(cpuRes, cpuNeedles, cpuNeedleNum, cpuGroupNum);
    PrintTheResults(gpuRes, gpuNeedles, gpuNeedleNum, gpuGroupNum);

    //Clean up
    free(cpuSkipTable); 
    free(gpuSkipTable); 
    free(cpuKernelSrc);
    free(gpuKernelSrc);
    free(haystackData);
    free(cpuNeedlesData);
    free(gpuNeedlesData);
    clReleaseMemObject(cpuHaystackBuffer);
    clReleaseMemObject(gpuHaystackBuffer);
    clReleaseMemObject(cpuNeedlesBuffer);
    clReleaseMemObject(gpuNeedlesBuffer);
    clReleaseMemObject(cpuSkipTableBuffer);
    clReleaseMemObject(gpuSkipTableBuffer);
    clReleaseMemObject(cpuLenOfEachNeedleBuffer);
    clReleaseMemObject(gpuLenOfEachNeedleBuffer);
    clReleaseMemObject(cpuLastPosOfEachNeedleBuffer);
    clReleaseMemObject(gpuLastPosOfEachNeedleBuffer);
    clReleaseMemObject(cpuResBuffer);
    clReleaseMemObject(gpuResBuffer);
    clReleaseKernel(cpuKernel);
    clReleaseKernel(gpuKernel);
    clReleaseProgram(cpuProgram);
    clReleaseProgram(gpuProgram);
    clReleaseCommandQueue(cpuQueue);
    clReleaseCommandQueue(gpuQueue);
    clReleaseContext(cpuContext);
    clReleaseContext(gpuContext);

    return 0;
}