/*
 * Builds the heads-up display for one frame.
 *
 * Fills a bar covering the top `Global->screenTop` rows of the screen, then
 * creates, positions and frees a text object for each of: the "Score" label,
 * the score value, the "Lives" label, the ship's life count, and the timer
 * value (placed against the right edge of the screen).
 *
 * ship  - object whose `lives` field is displayed
 * font  - texture handed to createTextObject() for every piece of text
 * timer - unsigned value printed at the right-hand side of the bar
 *
 * NOTE(review): positionTextObject() presumably also draws the text, since
 * each object is freed right after being positioned - confirm.
 */
void displayHUD(Object * ship, SDL_Texture * font, uint32_t timer)
{
    Object * temp;
    char buffer[BUFFER_SIZE] = {0};
    int previous = 0;
    SDL_Rect bar;

    /* Set the HUD bar */
    bar.x = 0;
    bar.y = 0;
    bar.w = Global->screenWidth;
    bar.h = Global->screenTop;

    setWindowColor(0, 51, 102, 0xFF);
    SDL_RenderFillRect(Global->renderer, &bar);
    setWindowColor(0x0, 0x0, 0x0, 0xFF);

    /* Display score text */
    strncpy(buffer, "Score", BUFFER_SIZE);
    temp = createTextObject(font, buffer, FONT_LARGE, 1.0);
    positionTextObject(temp, 0, 0);

    /* Display score number.
     * The snprintf() call both writes the digits into `buffer` and returns
     * how many digits there are; that count feeds the x offset computation. */
    previous = (temp->x + ((countObjects(temp) + 5 - snprintf(buffer, BUFFER_SIZE, "%d", score)) * temp->clip.w * 0.5 * temp->scale));
    freeObjects(temp);
    temp = createTextObject(font, buffer, FONT_LARGE, 1.0);
    positionTextObject(temp, previous, 0);

    /* Display lives text */
    previous += getTextObjectLength(temp, 5);
    freeObjects(temp);
    strncpy(buffer, "Lives", BUFFER_SIZE);
    temp = createTextObject(font, buffer, FONT_LARGE, 1.0);
    positionTextObject(temp, previous, 0);

    /* Display lives number (same offset scheme as the score number) */
    previous = (temp->x + ((countObjects(temp) + 3 - snprintf(buffer, BUFFER_SIZE, "%d", ship->lives)) * temp->clip.w * 0.5 * temp->scale));
    freeObjects(temp);
    temp = createTextObject(font, buffer, FONT_LARGE, 1.0);
    positionTextObject(temp, previous, 0);
    freeObjects(temp);

    /* Display Timer.
     * `timer` is unsigned, so it is printed with an unsigned conversion;
     * unsigned long is guaranteed to be wide enough for any uint32_t. */
    snprintf(buffer, BUFFER_SIZE, "%lu", (unsigned long)timer);
    temp = createTextObject(font, buffer, FONT_LARGE, 1.0);
    positionTextObject(temp, ((Global->screenWidth - (countObjects(temp) * temp->clip.w))), 0);
    freeObjects(temp);
}
/**
 * Recursively deletes the Drawables stored in the leaves at or below `node`.
 *
 * The same pointer may be stored in more than one leaf, so `freedSet` records
 * every pointer that has already been deleted and each one is deleted once.
 *
 * @param node      Subtree to process; a null node is ignored.
 * @param freedSet  Pointers deleted so far; updated by this call.
 *
 * Note: the pointers are left in `node->objects` after deletion and must not
 * be dereferenced afterwards.
 */
void KDTree::freeObjects(KDNode *node, std::unordered_set<Drawable *> &freedSet)
{
    // An empty tree (or a missing child) has nothing to free.
    if(node == nullptr)
    {
        return;
    }

    if(!node->is_leaf)
    {
        freeObjects(node->left, freedSet);
        freeObjects(node->right, freedSet);
        return;
    }

    for(unsigned int i = 0; i < node->objects.size(); i++)
    {
        // insert() reports whether the pointer was new, so one lookup both
        // records it and tells us whether it still needs deleting.
        if(freedSet.insert(node->objects[i]).second)
        {
            delete node->objects[i];
        }
    }
}
/*
 * Creates a text object from `text`, positions it using the screen metrics
 * in `Global` so that it is offset by half its own length and half its clip
 * height, and then frees it again.
 *
 * font  - texture passed to createTextObject()
 * text  - string to show
 * type  - object type passed to createTextObject()
 * scale - scale passed to createTextObject()
 */
void displayTextMiddle(SDL_Texture * font, char * text, objectType type, float scale)
{
    Object * label;

    label = createTextObject(font, text, type, scale);

    positionTextObject(
        label,
        /* x: half the width left after screenRight, shifted by screenLeft,
         *    minus half the text length */
        (Global->screenLeft + ((Global->screenWidth - Global->screenRight) / 2)) - (getTextObjectLength(label, 0) / 2),
        /* y: half the height left after screenBottom, shifted by screenTop,
         *    minus half the glyph height */
        (Global->screenTop + ((Global->screenHeight - Global->screenBottom) / 2)) - (label->clip.h / 2));

    freeObjects(label);
}
/*
 * Allocates a MyWindow, gives it a text-layer list and a new window, clears
 * its button configuration, and registers the load/unload window handlers.
 *
 * Returns -ENOMEM when the MyWindow, the text-layer list or the window
 * cannot be created.
 *
 * NOTE(review): only the beginning of this function is visible here; the
 * body below stops exactly where the visible text stops.
 */
int allocWindow(DictionaryIterator *rdi) {
  Tuple *t;
  MyWindow *mw = malloc(sizeof(MyWindow));
  if (mw == NULL) {
    return -ENOMEM;
  }

  mw->myTextLayers = createObjects(myTextLayerDestructor);
  /* Test the list that was just created. The previous code re-tested `mw`,
   * which is known to be non-NULL here, so this failure was never caught. */
  if (mw->myTextLayers == NULL) {
    free(mw);
    return -ENOMEM;
  }

  for (int i = 0; i < NUM_BUTTONS; ++i) {
    mw->button_config[i] = 0;
  }
  mw->id = 0;
  mw->w = window_create();
  mw->appTimer = NULL;
  if (mw->w == NULL) {
    freeObjects(mw->myTextLayers);
    free(mw);
    return -ENOMEM;
  }

  window_set_user_data(mw->w, mw);

  // Set handlers to manage the elements inside the Window
  window_set_window_handlers(mw->w, (WindowHandlers) {
    .load = window_load,
    .unload = window_unload
  });
int main(int argc, char **argv) { double min_x; double max_x; double min_y; double max_y; if (! assignIndex(atoi(argv[1]), atoi(argv[2])) ) { cerr << "Invalid arguments for field indices" << endl; return -1; } // initlize the GEOS ibjects gf = new GeometryFactory(new PrecisionModel(),0); wkt_reader= new WKTReader(gf); // process input data map<int,Geometry*> geom_polygons; string input_line; vector<string> fields; cerr << "Reading input from stdin..." <<endl; id_type id ; Geometry* geom; const Envelope * env; while(cin && getline(cin, input_line) && !cin.eof()){ fields = parse(input_line); if (fields[ID_IDX].length() <1 ) continue ; // skip lines which has empty id field id = std::strtoul(fields[ID_IDX].c_str(), NULL, 0); if (fields[GEOM_IDX].length() <2 ) { #ifndef NDEBUG cerr << "skipping record [" << id <<"]"<< endl; #endif continue ; // skip lines which has empty geometry } // try { geom = wkt_reader->read(fields[GEOM_IDX]); env = geom->getEnvelopeInternal(); cout << fields[ID_IDX] << TAB << env->getMinX() << TAB << env->getMinY() << TAB << env->getMaxX() << TAB << env->getMaxY() << endl; } cout.flush(); cerr.flush(); freeObjects(); return 0; // success }
void keyboard(unsigned char key, int x, int y) { switch (key) { case 'q':case 'Q': //free(scene); freeObjects(); exit(0); break; case 's':case 'S': save_image(); glutPostRedisplay(); break; default: break; } }
// main body of the engine int main(int argc, char** argv) { /* if (argc < 4) { cerr << "usage: resque [predicate] [shape_idx 1] [shape_idx 2] [distance]" <<endl; return 1; } */ init(); int c = 0 ; if (!extractParams(argc,argv)) { std::cerr <<"ERROR: query parameter extraction error." << std::endl << "Please see documentations, or contact author." << std::endl; usage(); return 1; } switch (stop.join_cardinality){ case 1: case 2: c = mJoinQuery(); // std::cerr <<"ERROR: input data parsing error." << std::endl << "Please see documentations, or contact author." << std::endl; break; default: std::cerr <<"ERROR: join cardinality does not match engine capacity." << std::endl ; return 1; break; } if (c >= 0 ) std::cerr <<"Query Load: [" << c << "]" <<std::endl; else { std::cerr <<"Error: ill formatted data. Terminating ....... " << std::endl; return 1; } freeObjects(); cout.flush(); cerr.flush(); return 0; }
/*
 * Creates the global window list and the root window, then pushes the root
 * window.
 *
 * If `myWindows` already exists the function does nothing and returns 0
 * (resetting existing windows is not supported yet).
 *
 * Returns 0 on success. If the root window cannot be looked up, the error
 * is logged and a negative value is returned instead of pushing a NULL
 * window: the handle from allocWindow() when that is negative, else -1.
 */
int resetWindows(DictionaryIterator *rdi) {
  MyWindow *mw;
  int rh;
  objects *tmpWindows;

  APP_LOG(APP_LOG_LEVEL_DEBUG, "resetWindows");

  // Doesn't currently work once windows exist.
  if (myWindows != NULL) {
    return 0;
  }

  // Because of the guard above this is always NULL today; the hand-over is
  // kept so the old list is released once resetting is supported.
  tmpWindows = myWindows;
  myWindows = NULL;

  APP_LOG(APP_LOG_LEVEL_DEBUG, "About to create windows structure.");
  myWindows = createObjects(MyWindowDestructor);
  APP_LOG(APP_LOG_LEVEL_DEBUG, "objects created. ");

  // Need to create a window to keep the app from
  // exiting, so we might as well make it available.
  rh = allocWindow(NULL);
  APP_LOG(APP_LOG_LEVEL_DEBUG, "root window handle = %d", rh);
  if (rh != 0) {
    APP_LOG(APP_LOG_LEVEL_ERROR, "Root window handle %d != 0", rh);
  }

  mw = getWindowByHandle(rh);
  if (mw == NULL) {
    APP_LOG(APP_LOG_LEVEL_ERROR, "Root window null");
    if (tmpWindows) {
      freeObjects(tmpWindows);
    }
    // Do not hand a NULL window to pushWindow(); report the failure.
    return (rh < 0) ? rh : -1;
  }

  pushWindow(mw, rdi);

  if (tmpWindows) {
    freeObjects(tmpWindows);
  }
  return 0;
}
static void MyWindowDestructor(void *vptr) { MyWindow *mw = (MyWindow *)vptr; if (mw->myTextLayers) { freeObjects(mw->myTextLayers); mw->myTextLayers = NULL; } if (mw->appTimer) { app_timer_cancel(mw->appTimer); mw->appTimer = NULL; } if (mw->w) { APP_LOG(APP_LOG_LEVEL_DEBUG, "About to call window_destroy. mw=%p w=%p", mw, mw->w); window_stack_remove(mw->w, false); APP_LOG(APP_LOG_LEVEL_DEBUG, "Window removed from stack."); window_destroy(mw->w); APP_LOG(APP_LOG_LEVEL_DEBUG, "Window Destroyed."); mw->w = NULL; } free(mw); }
/**
 * Builds a parser over `file`: validates the minimum size, wraps the data in
 * a byte stream matching the host endianness, checks the signature and
 * version, creates the decoder and reads the directory.
 *
 * @param file  Mapped input file; stored in mFile.
 * @throws RawDecoderException when the file is too small, has the wrong
 *         signature, is too old, or an IOException occurs while reading the
 *         header. Any other exception propagates unchanged.
 *
 * The destructor does not run when a constructor throws, so everything
 * allocated here is released with freeObjects() before the exception leaves.
 */
X3fParser::X3fParser(FileMap* file) {
  decoder = NULL;
  bytes = NULL;
  mFile = file;

  uint32 size = file->getSize();
  if (size < 104 + 128)
    ThrowRDE("X3F file too small");

  if (getHostEndianness() == little)
    bytes = new ByteStream(file->getData(0), size);
  else
    bytes = new ByteStreamSwap(file->getData(0), size);

  try {
    try {
      // Read signature
      if (bytes->getUInt() != 0x62564f46)
        ThrowRDE("X3F Decoder: Not an X3f file (Signature)");

      uint32 version = bytes->getUInt();
      if (version < 0x00020000)
        ThrowRDE("X3F Decoder: File version too old");

      // Skip identifier + mark bits
      bytes->skipBytes(16 + 4);
      bytes->setAbsoluteOffset(0);

      decoder = new X3fDecoder(file);
      readDirectory();
    } catch (const IOException &e) {
      // Caught by const reference: no copy, no slicing.
      ThrowRDE("X3F Decoder: IO Error while reading header: %s", e.what());
    }
  } catch (...) {
    // Release bytes/decoder for every exception type, then rethrow the
    // original exception object (a bare `throw;` neither copies nor slices).
    freeObjects();
    throw;
  }
}
void benchmark_csr(char* clspmvpath, char* oclfilename, int ntimes, cl_device_type deviceType) { char outname[1000]; sprintf(outname, "%s%s", clspmvpath, "/benchmark/csr.ben"); FILE* outfile = fopen(outname, "w"); int methodnum = 2; double floptable[methodnum]; //Get device info cl_device_id* devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1); cl_int errorCode = CL_SUCCESS; //Assuming GPU is at devices[0] cl_uint dev_exec_num; size_t devicesSize = 0; errorCode = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &devicesSize ); CHECKERROR; devices = new cl_device_id[devicesSize / sizeof(cl_device_id)]; CHECKERROR; errorCode = clGetContextInfo(context, CL_CONTEXT_DEVICES, devicesSize, devices, NULL ); CHECKERROR; errorCode = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof( dev_exec_num ), &dev_exec_num, NULL ); CHECKERROR; freeObjects(devices, &context, &cmdQueue, &program); printf("\nCompute units %d\n", dev_exec_num); unsigned int warp_per_group = CSR_VEC_GROUP_SIZE/WARPSIZE; unsigned int max_group_num = dev_exec_num*MAX_WARP_PER_PROC/warp_per_group; for (unsigned int size = 1024; size <= 262144; size*=2) { float* vec = (float*)malloc(sizeof(float)*size); float* res = (float*)malloc(sizeof(float)*size); initVectorOne<int, float>(vec, size); initVectorZero<int, float>(res, size); for (unsigned int csrnum = 2; csrnum <= 2048; csrnum *= 4) //for (unsigned int csrnum = 128; csrnum <= 2048; csrnum *= 4) { if (size*csrnum > 67108864) break; if (csrnum > size) break; csr_matrix<int, float> csrmat; init_csr_mat(csrmat, size, csrnum); for (unsigned int groupnum = dev_exec_num; groupnum <= max_group_num; groupnum += dev_exec_num) { double opttime = 10000.0f; double optflop = 0.0f; int optmethod = 0; spmv_csr_ocl(&csrmat, vec, res, 1, opttime, optflop, optmethod, oclfilename, deviceType, ntimes, floptable, 
groupnum); printf("\n------------------------------------------------------------------------\n"); printf("CSR Dim %d BN %d GN %d opttime %f ms optflop %f optmethod %d", size, csrnum, groupnum, opttime*1000.0, optflop, optmethod); printf("\n------------------------------------------------------------------------\n"); fprintf(outfile, "%d %d %d", size, csrnum, groupnum); for (unsigned int k = 0; k < methodnum; k++) fprintf(outfile, " %f", floptable[k]); fprintf(outfile, "\n"); } free_csr_matrix(csrmat); } free(vec); free(res); } fclose(outfile); }
void spmv_csr_ocl(csr_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, double& optflop, int& optmethod, char* oclfilename, cl_device_type deviceType, int ntimes, double* floptable, int groupnum) { cl_device_id* devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1); cl_int errorCode = CL_SUCCESS; //Create device memory objects cl_mem devRowPtr; cl_mem devColId; cl_mem devData; cl_mem devVec; cl_mem devTexVec; cl_mem devRes; //Initialize values int nnz = mat->matinfo.nnz; int vecsize = mat->matinfo.width; int rownum = mat->matinfo.height; int rowptrsize = rownum + 1; ALLOCATE_GPU_READ(devRowPtr, mat->csr_row_ptr, sizeof(int)*rowptrsize); ALLOCATE_GPU_READ(devColId, mat->csr_col_id, sizeof(int)*nnz); ALLOCATE_GPU_READ(devData, mat->csr_data, sizeof(float)*nnz); ALLOCATE_GPU_READ(devVec, vec, sizeof(float)*vecsize); int paddedres = findPaddedSize(rownum, 16); devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR; //errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; const cl_image_format floatFormat = { CL_R, CL_FLOAT, }; int width = VEC2DWIDTH; int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH; float* image2dVec = (float*)malloc(sizeof(float)*width*height); memset(image2dVec, 0, sizeof(float)*width*height); for (int i = 0; i < vecsize; i++) { image2dVec[i] = vec[i]; } size_t origin[] = {0, 0, 0}; size_t vectorSize[] = {width, height, 1}; devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height, 0, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); opttime = 10000.0f; optmethod = 0; int dim2 = dim2Size; { int methodid = 0; 
cl_mem devRowPtrPad; int padrowsize = findPaddedSize(rownum, CSR_VEC_GROUP_SIZE/WARPSIZE); int* rowptrpad = (int*)malloc(sizeof(int)*(padrowsize+1)); memset(rowptrpad, 0, sizeof(int)*(padrowsize+1)); for (int i = 0; i <= mat->matinfo.height; i++) rowptrpad[i] = mat->csr_row_ptr[i]; ALLOCATE_GPU_READ(devRowPtrPad, rowptrpad, sizeof(int)*(padrowsize+1)); clFinish(cmdQueue); printf("\nRow Num %d padded size %d\n", rownum, padrowsize); cl_uint work_dim = 2; //int dim2 = 16; size_t blocksize[] = {CSR_VEC_GROUP_SIZE, 1}; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_csr_ve_slm_pm_fs", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devRowPtrPad); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColId); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &rownum); CHECKERROR; { size_t globalsize[] = {groupnum * CSR_VEC_GROUP_SIZE, dim2}; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nCSR vector SLM row ptr padded mat strided rows fixed size:%d cpu time %lf ms GFLOPS %lf code %d \n\n", groupnum * CSR_VEC_GROUP_SIZE, time_in_sec / (double) ntimes * 1000, gflops, methodid); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = 
gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } } if (devRowPtrPad) clReleaseMemObject(devRowPtrPad); if (csrKernel) clReleaseKernel(csrKernel); free(rowptrpad); } { int methodid = 1; cl_mem devRowPtrPad; int padrowsize = findPaddedSize(rownum, CSR_VEC_GROUP_SIZE/WARPSIZE); int* rowptrpad = (int*)malloc(sizeof(int)*(padrowsize+1)); memset(rowptrpad, 0, sizeof(int)*(padrowsize+1)); for (int i = 0; i <= mat->matinfo.height; i++) rowptrpad[i] = mat->csr_row_ptr[i]; ALLOCATE_GPU_READ(devRowPtrPad, rowptrpad, sizeof(int)*(padrowsize+1)); clFinish(cmdQueue); printf("\nRow Num %d padded size %d\n", rownum, padrowsize); cl_uint work_dim = 2; //int dim2 = 16; size_t blocksize[] = {CSR_VEC_GROUP_SIZE, 1}; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_csr_ve_reduction_fs", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devRowPtrPad); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColId); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &rownum); CHECKERROR; { size_t globalsize[] = {groupnum * CSR_VEC_GROUP_SIZE, dim2}; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nCSR vector SLM row ptr padded mat 
strided rows fixed size:%d cpu time %lf ms GFLOPS %lf code %d \n\n", groupnum * CSR_VEC_GROUP_SIZE, time_in_sec / (double) ntimes * 1000, gflops, methodid); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } } if (devRowPtrPad) clReleaseMemObject(devRowPtrPad); if (csrKernel) clReleaseKernel(csrKernel); free(rowptrpad); } //Clean up if (image2dVec) free(image2dVec); if (devRowPtr) clReleaseMemObject(devRowPtr); if (devColId) clReleaseMemObject(devColId); if (devData) clReleaseMemObject(devData); if (devVec) clReleaseMemObject(devVec); if (devTexVec) clReleaseMemObject(devTexVec); if (devRes) clReleaseMemObject(devRes); freeObjects(devices, &context, &cmdQueue, &program); }
int main(int argc, char **argv) { double min_x; double max_x; double min_y; double max_y; if (argc < 3) { cerr << "Not enough arguments:" << argv[0] << " [geomidx] [ratio] " << endl; return -1; } GEOM_IDX = atoi(argv[1]) -1; if (GEOM_IDX < 0) { cerr << "Invalid arguments for field indices" << endl; return -1; } ratio = strtod(argv[2], NULL); // initlize the GEOS ibjects gf = new GeometryFactory(new PrecisionModel(),0); wkt_reader= new WKTReader(gf); // process input data map<int,Geometry*> geom_polygons; string input_line; vector<string> fields; cerr << "Reading input from stdin..." <<endl; id_type id ; Geometry* geom; const Envelope * env; long count = 1; while(cin && getline(cin, input_line) && !cin.eof()){ fields = parse(input_line); //if (fields[ID_IDX].length() <1 ) // continue ; // skip lines which has empty id field // id = std::strtoul(fields[ID_IDX].c_str(), NULL, 0); if (fields[GEOM_IDX].length() <2 ) { #ifndef NDEBUG cerr << "skipping record [" << id <<"]"<< endl; #endif continue ; // skip lines which has empty geometry } // try { geom = wkt_reader->read(fields[GEOM_IDX]); env = geom->getEnvelopeInternal(); if ( (double) rand() / (double) (RAND_MAX) < ratio) { cout << count++ << TAB << env->getMinX() << TAB << env->getMinY() << TAB << env->getMaxX() << TAB << env->getMaxY() << endl; } delete geom; } cout.flush(); cerr.flush(); freeObjects(); return 0; // success }
void spmv_sell_ocl(sell_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, double& optflop, int& optmethod, char* oclfilename, cl_device_type deviceType, int ntimes, double* floptable) { cl_device_id* devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1); cl_int errorCode = CL_SUCCESS; //Create device memory objects cl_mem devSlicePtr; cl_mem devColid; cl_mem devData; cl_mem devVec; cl_mem devRes; cl_mem devTexVec; //Initialize values int nnz = mat->matinfo.nnz; int rownum = mat->matinfo.height; int vecsize = mat->matinfo.width; int sliceheight = mat->sell_slice_height; int slicenum = mat->sell_slice_num; int datasize = mat->sell_slice_ptr[slicenum]; ALLOCATE_GPU_READ(devSlicePtr, mat->sell_slice_ptr, sizeof(int)*(slicenum + 1)); ALLOCATE_GPU_READ(devColid, mat->sell_col_id, sizeof(int)*datasize); ALLOCATE_GPU_READ(devData, mat->sell_data, sizeof(float)*datasize); ALLOCATE_GPU_READ(devVec, vec, sizeof(float)*vecsize); int paddedres = findPaddedSize(rownum, 512); devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; const cl_image_format floatFormat = { CL_R, CL_FLOAT, }; int width = VEC2DWIDTH; int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH; float* image2dVec = (float*)malloc(sizeof(float)*width*height); memset(image2dVec, 0, sizeof(float)*width*height); for (int i = 0; i < vecsize; i++) { image2dVec[i] = vec[i]; } size_t origin[] = {0, 0, 0}; size_t vectorSize[] = {width, height, 1}; devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height, 0, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); 
CHECKERROR; clFinish(cmdQueue); //printf("\nvec length %d padded length %d", mat->matinfo.width, padveclength); int dim2 = dim2Size; if (sliceheight == WARPSIZE) { int methodid = 0; cl_uint work_dim = 2; size_t blocksize[] = {SELL_GROUP_SIZE, 1}; int gsize = ((rownum + SELL_GROUP_SIZE - 1)/SELL_GROUP_SIZE)*SELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; //printf("gsize %d rownum %d slicenum %d sliceheight %d datasize %d nnz %d vecsize %d \n", gsize, rownum, slicenum, sliceheight, datasize, nnz, vecsize); //int warpnum = SELL_GROUP_SIZE / WARPSIZE; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_sell_warp", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devSlicePtr); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &slicenum); CHECKERROR; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nSELL cpu warp time %lf ms GFLOPS %lf code %d \n\n", time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; 
optflop = gflops; } } if (sliceheight == SELL_GROUP_SIZE) { int methodid = 1; cl_uint work_dim = 2; size_t blocksize[] = {SELL_GROUP_SIZE, 1}; int gsize = slicenum * SELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_sell_group", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devSlicePtr); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &slicenum); CHECKERROR; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nSELL cpu group time %lf ms GFLOPS %lf code %d \n\n", time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } } //Clean up if (image2dVec) free(image2dVec); if (devSlicePtr) clReleaseMemObject(devSlicePtr); if (devColid) clReleaseMemObject(devColid); if (devData) clReleaseMemObject(devData); if (devVec) clReleaseMemObject(devVec); if (devTexVec) clReleaseMemObject(devTexVec); if (devRes) clReleaseMemObject(devRes); 
freeObjects(devices, &context, &cmdQueue, &program); }
/*
 * Releases the window's text-layer list and clears the pointer.
 * `rdi` is not used. Always returns 0.
 *
 * The list is only freed when present, matching the other callers of
 * freeObjects() in this code; calling this twice on the same window is
 * therefore harmless.
 */
int clearWindow(MyWindow *mw, DictionaryIterator *rdi) {
  if (mw->myTextLayers) {
    freeObjects(mw->myTextLayers);
    mw->myTextLayers = NULL;
  }
  return 0;
}
/* Frees the global window list, if there is one, and clears the pointer. */
void deinit_windows() {
  if (myWindows == NULL) {
    return;
  }

  freeObjects(myWindows);
  myWindows = NULL;
}
/*
 * Purges the system: releases everything through freeObjects() and then
 * records SUCCEEDED in SysStatus.
 */
void Sys_Purge()
{
    freeObjects();

    SysStatus = SUCCEEDED;
}
// Destructor: hands all cleanup to freeObjects().
X3fParser::~X3fParser()
{
  freeObjects();
}
// Frees the objects reachable from the root. A fresh set is passed down so
// the recursive freeObjects() can track which pointers it has already
// deleted.
void KDTree::freeAllObj()
{
    std::unordered_set<Drawable *> alreadyFreed;

    freeObjects(root, alreadyFreed);
}
void spmv_b4ell_ocl(b4ell_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, int& optmethod, char* oclfilename, cl_device_type deviceType, float* coores, int ntimes, int bw, int bh) { cl_device_id* devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1); cl_int errorCode = CL_SUCCESS; //Create device memory objects cl_mem devColid; cl_mem devData; cl_mem devVec; cl_mem devRes; cl_mem devTexVec; //Initialize values int col_align = mat->b4ell_height_aligned; int data_align = mat->b4ell_float4_aligned; int nnz = mat->matinfo.nnz; int rownum = mat->matinfo.height; int blockrownum = mat->b4ell_row_num; int vecsize = mat->matinfo.width; int b4ellnum = mat->b4ell_block_num; int bwidth = mat->b4ell_bwidth; int bheight = mat->b4ell_bheight; int width4num = bwidth / 4; int padveclen = findPaddedSize(vecsize, 8); float* paddedvec = (float*)malloc(sizeof(float)*padveclen); memset(paddedvec, 0, sizeof(float)*padveclen); memcpy(paddedvec, vec, sizeof(float)*vecsize); ALLOCATE_GPU_READ(devColid, mat->b4ell_col_id, sizeof(int)*col_align*b4ellnum); ALLOCATE_GPU_READ(devData, mat->b4ell_data, sizeof(float)*data_align*bheight*width4num*b4ellnum); ALLOCATE_GPU_READ(devVec, paddedvec, sizeof(float)*padveclen); int paddedres = findPaddedSize(rownum, 512); devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; const cl_image_format floatFormat = { CL_RGBA, CL_FLOAT, }; int width = VEC2DWIDTH; int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH; if (height % 4 != 0) height += (4 - (height % 4)); float* image2dVec = (float*)malloc(sizeof(float)*width*height); memset(image2dVec, 0, sizeof(float)*width*height); for (int i = 0; i < vecsize; i++) { 
image2dVec[i] = vec[i]; } size_t origin[] = {0, 0, 0}; size_t vectorSize[] = {width, height/4, 1}; devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height/4, 0, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); //printf("\nvec length %d padded length %d", mat->matinfo.width, padveclength); opttime = 10000.0f; optmethod = 0; int dim2 = dim2Size; { int methodid = 0; cl_uint work_dim = 2; size_t blocksize[] = {BELL_GROUP_SIZE, 1}; int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; int data_align4 = data_align / 4; char kernelname[100] = "gpu_bell00"; kernelname[8] += bh; kernelname[9] += bw; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(int), &data_align4); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &col_align); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &b4ellnum); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(int), &blockrownum); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); float* tmpresult = (float*)malloc(sizeof(float)*rownum); errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, 
NULL, NULL); CHECKERROR; clFinish(cmdQueue); two_vec_compare(coores, tmpresult, rownum); free(tmpresult); for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nBELL %dx%d block cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw, time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; if (onetime < opttime) { opttime = onetime; optmethod = methodid; } } { int methodid = 1; cl_uint work_dim = 2; size_t blocksize[] = {BELL_GROUP_SIZE, 1}; int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; int data_align4 = data_align / 4; char kernelname[100] = "gpu_bell00_mad"; kernelname[8] += bh; kernelname[9] += bw; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(int), &data_align4); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &col_align); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &b4ellnum); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(int), &blockrownum); 
CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); float* tmpresult = (float*)malloc(sizeof(float)*rownum); errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); two_vec_compare(coores, tmpresult, rownum); free(tmpresult); for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nBELL %dx%d block mad cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw, time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; if (onetime < opttime) { opttime = onetime; optmethod = methodid; } } { int methodid = 100; cl_uint work_dim = 2; size_t blocksize[] = {BELL_GROUP_SIZE, 1}; int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; int data_align4 = data_align / 4; char kernelname[100] = "gpu_bell00_tx"; kernelname[8] += bh; kernelname[9] += bw; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR; errorCode = 
clSetKernelArg(csrKernel, 2, sizeof(int), &data_align4); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &col_align); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &b4ellnum); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devTexVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(int), &blockrownum); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); float* tmpresult = (float*)malloc(sizeof(float)*rownum); errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); two_vec_compare(coores, tmpresult, rownum); free(tmpresult); for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nBELL %dx%d block tx cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw, time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; if (onetime < opttime) { opttime = onetime; optmethod = methodid; } } { int methodid = 101; cl_uint work_dim = 2; size_t blocksize[] = {BELL_GROUP_SIZE, 1}; int gsize = ((blockrownum + BELL_GROUP_SIZE - 
1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; int data_align4 = data_align / 4; char kernelname[100] = "gpu_bell00_mad_tx"; kernelname[8] += bh; kernelname[9] += bw; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(int), &data_align4); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &col_align); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &b4ellnum); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devTexVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(int), &blockrownum); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); float* tmpresult = (float*)malloc(sizeof(float)*rownum); errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); two_vec_compare(coores, tmpresult, rownum); free(tmpresult); for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = 
(double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nBELL %dx%d block mad tx cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw, time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; if (onetime < opttime) { opttime = onetime; optmethod = methodid; } } //Clean up if (image2dVec) free(image2dVec); if (devColid) clReleaseMemObject(devColid); if (devData) clReleaseMemObject(devData); if (devVec) clReleaseMemObject(devVec); if (devTexVec) clReleaseMemObject(devTexVec); if (devRes) clReleaseMemObject(devRes); freeObjects(devices, &context, &cmdQueue, &program); }
int main(int argc, char **argv) { if (argc != 6 && argc != 5) { cerr << "ERROR: Not enough arguments. Usage: " << argv[0] << " [geomid1] [geomid2] [partition_file] [prefixpath1] [prefixpath2]" << endl; return -1; } //int uid_idx = args_info.uid_arg; filename = argv[3]; GEOM_IDX = 2; JOIN_IDX = -1; char* stdinfilename = getenv("mapreduce_map_input_file"); char* prefix1 = argv[4]; char* prefix2 = NULL; if (argc == 6) { prefix2 = argv[5]; } if ( strstr(stdinfilename, prefix1) == NULL) { JOIN_IDX = 2; GEOM_IDX = atoi(argv[2]); } else { JOIN_IDX = 1; GEOM_IDX = atoi(argv[1]); } // cerr << "JOIN_IDX: " << JOIN_IDX << " Geom: " << GEOM_IDX <<endl; if (JOIN_IDX < 0) { cerr << "Invalid join index" << endl; return -1; } gf = new GeometryFactory(new PrecisionModel(),0); wkt_reader= new WKTReader(gf); // process input data // map<int,Geometry*> geom_polygons; string input_line; vector<string> fields; id_type id = 0; Geometry* geom ; genTiles(); bool ret = buildIndex(); if (ret == false) { cerr << "ERROR: Index building on tile structure has failed ." << std::endl; return 1 ; } else #ifndef NDEBUG cerr << "GRIDIndex Generated successfully." << endl; #endif cerr << "Reading input from stdin..." <<endl; while(cin && getline(cin, input_line) && !cin.eof()){ fields = parse(input_line); if (fields[GEOM_IDX].length() <2 ) { #ifndef NDEBUG cerr << "skipping record [" << id <<"]"<< endl; #endif continue ; // skip lines which has empty geometry } // try { geom = wkt_reader->read(fields[GEOM_IDX]); //} /*catch (...) { cerr << "WARNING: Record [id = " <<i << "] is not well formatted "<<endl; cerr << input_line << endl; continue ; }*/ // cout << input_line << endl; doQuery(geom); emitHits(geom, input_line); delete geom; } // cerr << "Number of tiles: " << geom_tiles.size() << endl; // build spatial index for input polygons cout.flush(); cerr.flush(); freeObjects(); return 0; // success }
void spmv_coo_ocl(coo_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, double& optflop, int& optmethod, char* oclfilename, cl_device_type deviceType, int ntimes, double* floptable, int maxgroupnum) { for (int i = 0; i < mat->matinfo.height; i++) result[i] = 0.0f; cl_device_id* devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1); cl_int errorCode = CL_SUCCESS; //Create device memory objects cl_mem devRowid; cl_mem devColid; cl_mem devData; cl_mem devVec; cl_mem devRes; cl_mem devTexVec; cl_mem devTmpRow; cl_mem devTmpData; //Initialize values int nnz = mat->matinfo.nnz; int rownum = mat->matinfo.height; int vecsize = mat->matinfo.width; int num_units = nnz / COO_GROUP_SIZE; if (nnz % COO_GROUP_SIZE != 0) num_units++; int group_num = (num_units < maxgroupnum) ? num_units : maxgroupnum; int work_size = group_num * COO_GROUP_SIZE; int num_iters = nnz / work_size; if (nnz % work_size != 0) num_iters++; int process_size = num_iters * COO_GROUP_SIZE; int active_warp = num_units / num_iters; if (num_units % num_iters != 0) active_warp++; int paddedNNZ = findPaddedSize(nnz, COO_ALIGNMENT); int* paddedRow = (int*)malloc(sizeof(int)*paddedNNZ); int* paddedCol = (int*)malloc(sizeof(int)*paddedNNZ); float* paddedData = (float*)malloc(sizeof(float)*paddedNNZ); memcpy(paddedRow, mat->coo_row_id, sizeof(int)*nnz); memcpy(paddedCol, mat->coo_col_id, sizeof(int)*nnz); memcpy(paddedData, mat->coo_data, sizeof(float)*nnz); for (int i = nnz; i < paddedNNZ; i++) { paddedRow[i] = mat->coo_row_id[nnz - 1]; paddedCol[i] = mat->coo_col_id[nnz - 1]; paddedData[i] = 0.0f; } ALLOCATE_GPU_READ(devRowid, paddedRow, sizeof(int)*paddedNNZ); ALLOCATE_GPU_READ(devColid, paddedCol, sizeof(int)*paddedNNZ); ALLOCATE_GPU_READ(devData, paddedData, sizeof(float)*paddedNNZ); ALLOCATE_GPU_READ(devVec, vec, sizeof(float)*vecsize); 
int paddedres = findPaddedSize(rownum, 512); devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; devTmpRow = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*maxgroupnum, NULL, &errorCode); CHECKERROR; devTmpData = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*maxgroupnum, NULL, &errorCode); CHECKERROR; const cl_image_format floatFormat = { CL_R, CL_FLOAT, }; int width = VEC2DWIDTH; int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH; float* image2dVec = (float*)malloc(sizeof(float)*width*height); memset(image2dVec, 0, sizeof(float)*width*height); for (int i = 0; i < vecsize; i++) { image2dVec[i] = vec[i]; } size_t origin[] = {0, 0, 0}; size_t vectorSize[] = {width, height, 1}; devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height, 0, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); opttime = 10000.0f; optmethod = 0; int dim2 = dim2Size; { int methodid = 0; cl_uint work_dim = 2; size_t blocksize[] = {COO_GROUP_SIZE, 1}; int gsize = group_num * COO_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_coo_s1", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devRowid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &process_size); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &paddedNNZ); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, 
sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(cl_mem), &devTmpRow); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 8, sizeof(cl_mem), &devTmpData); CHECKERROR; printf("process size %d nnz %d gsize %d active_warp %d\n", process_size, paddedNNZ, gsize, active_warp); size_t blocksize2[] = {COO_GROUP_SIZE * 2, 1}; size_t globalsize2[] = {COO_GROUP_SIZE * 2, dim2}; cl_kernel csrKernel2 = NULL; csrKernel2 = clCreateKernel(program, "gpu_coo_s2", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel2, 0, sizeof(cl_mem), &devTmpRow); CHECKERROR; errorCode = clSetKernelArg(csrKernel2, 1, sizeof(cl_mem), &devTmpData); CHECKERROR; errorCode = clSetKernelArg(csrKernel2, 2, sizeof(int), &active_warp); CHECKERROR; errorCode = clSetKernelArg(csrKernel2, 3, sizeof(cl_mem), &devRes); CHECKERROR; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); //int* tmpRow = (int*)malloc(sizeof(int)*maxgroupnum); //float* tmpData = (float*)malloc(sizeof(float)*maxgroupnum); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel2, work_dim, NULL, globalsize2, blocksize2, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nCOO cpu time %lf ms GFLOPS %lf code %d \n\n", time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); if (csrKernel2) clReleaseKernel(csrKernel2); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } //for 
(int i = 0; i < active_warp; i++) //printf("Row %d Data %f\n", tmpRow[i], tmpData[i]); } //Clean up if (paddedRow) free(paddedRow); if (paddedCol) free(paddedCol); if (paddedData) free(paddedData); if (image2dVec) free(image2dVec); if (devRowid) clReleaseMemObject(devRowid); if (devColid) clReleaseMemObject(devColid); if (devData) clReleaseMemObject(devData); if (devVec) clReleaseMemObject(devVec); if (devTexVec) clReleaseMemObject(devTexVec); if (devRes) clReleaseMemObject(devRes); if (devTmpRow) clReleaseMemObject(devTmpRow); if (devTmpData) clReleaseMemObject(devTmpData); freeObjects(devices, &context, &cmdQueue, &program); }