void Chunk::Init() { int width = g_BoxWidth; int depth = g_BoxDepth; int height = g_BoxHeight; int nCount = 0; float xLimit = 0; float yLimit = 0; float zLimit = 0; CPerformanceCounter counter; counter.Start(); m_ChunkPrim.Init(); m_pPhysContainer = new PhysContainer; m_GameObjects = new GameObject[g_BoxWidth * g_BoxDepth * g_BoxHeight]; m_pBoxGrid = NULL;//new Prim_t*[g_BoxWidth * g_BoxDepth * g_BoxHeight]; //m_pPhysContainer->halfwidths.Set(width * 4.0f + 0.01f, height * 4.0f + 0.01f, depth * 4.0f + 0.01f); //m_pPhysContainer->center.Set( ); Vector3d transPos; transPos.Set( m_Pos );//-128.0f, -10.0f, -128.0f); for ( int h = 0; h < height; ++h ){ for ( int i = 0; i < depth; ++i ){ for ( int j = 0; j < width; ++j ){ int idx = (h*(depth*width)) + (i*width) + j; //printf("idx: %d\n", idx); int r = rand() % 100; if( r > 65 ) { float x = transPos.x + (j*8); float y = transPos.y + (h*8); float z = transPos.z + (i*8); float xFabs = fabs(x); float yFabs = fabs(y); float zFabs = fabs(z); if( xFabs > xLimit) xLimit = xFabs; if( yFabs > yLimit) yLimit = yFabs; if( zFabs > zLimit) zLimit = zFabs; /* if( r > 75) { m_pBoxGrid[ idx ] = CreateBox( "dirt.raw", 4 ); } else { m_pBoxGrid[ idx ] = CreateBox( "test.raw", 4 ); } */ //m_pBoxGrid[ idx ] = CreateBox( "test.raw", 4 ); //m_pBoxGrid[ idx ]->vPos.Set( x, y, z ); m_GameObjects[ idx ].renderable = NULL; Vector3d boxPos; boxPos.Set( transPos.x * -1.0f, transPos.y * -1.0f, transPos.z * -1.0f ); boxPos.x += x; boxPos.y += y; boxPos.z += z; #if CHUNK_OPTIMIZATIONS m_ChunkPrim.AddBoxOpt(boxPos); #else m_ChunkPrim.AddBox(boxPos); #endif //if(m_pBoxGrid[ idx ] == NULL) { //printf("Box call failed\n"); //} //printf("Box Pos: %f %f %f\n", g_pBoxGrid[ idx ]->vPos.x, g_pBoxGrid[ idx ]->vPos.y, g_pBoxGrid[ idx ]->vPos.z); //m_GameObjects[ idx ].renderable = m_pBoxGrid[ idx ]; m_GameObjects[ idx ].phys = new PhysObject; //center point m_GameObjects[ idx ].phys->center.Set(transPos.x + (j*8), (transPos.y *0.5f) + (h*8), transPos.z + (i*8)); m_GameObjects[ idx ].phys->halfwidths.Set(4.0f,4.0f,4.0f); m_pPhysContainer->AddObject( m_GameObjects[ idx ].phys ); nCount++; } else { m_GameObjects[ idx ].renderable = NULL; m_GameObjects[ idx ].phys = NULL; //printf("renderable set to NULL\n"); } } } } m_pPhysContainer->halfwidths.Set(width * 4.0f + 0.01f, height * 4.0f + 0.01f, depth * 4.0f + 0.01f); m_pPhysContainer->center.Set( transPos.x + ((width-1)*4.0f) , (height * 4.0f) + transPos.y, transPos.z + ((depth-1.0f)*4.0f)); PhysicsManager::GetInstance().AddContainer( m_pPhysContainer ); //m_pPhysContainer->SetDebug( true ); //Vector3d boxPos; //m_ChunkPrim.AddBox(boxPos); #if CHUNK_OPTIMIZATIONS m_ChunkPrim.AddFaces(); #endif m_ChunkPrim.Bake(); m_ChunkPrim.m_Prim.vPos.Set( transPos ); //printf("%d boxes created\n", nCount); //m_ChunkPrim.PrintFaces(); counter.Stop(); printf("Chunk generated in %f seconds.\n", counter.TimeInMilliseconds()); }
int main(int argc, char** argv) { bool srcbin = 0; bool invbk = 0; if(argc < 3){ printf("Not enough args!\narg1: target image\narg2: source image\narg3: do source image adaptive threshold or not\narg4: invert back ground or not\n"); getchar(); return 1; } if(argc >= 4){ if(!strcmp(argv[3], "1")) srcbin = 1; } if(argc >= 5){ if(!strcmp(argv[4], "1")) invbk = 1; } IplImage* srcimg= 0, *srcimgb= 0, *srcimgb2 = 0, *bimg = 0, *b2img = 0,*bugimg = 0, *alg2dst = 0; srcimg= cvLoadImage(argv[2], 1); if (!srcimg) { printf("src img %s load failed!\n", argv[2]); getchar(); return 1; } //choosing the parameters for our ccl int bn = 8; //how many partitions int nwidth = 512; if(srcimg->width > 512){ nwidth = 1024; bn = 6; } if(srcimg->width > 1024){ nwidth = 2048; bn = 3; } if(srcimg->width > 2048){ printf("warning, image too wide, max support 2048. image is truncated.\n"); getchar(); return 1; } //start selection gpu devices int devCount; int smCnt = 0; cudaGetDeviceCount(&devCount); // Iterate through devices int devChosen = 0; for (int i = 0; i < devCount; ++i) { cudaDeviceProp devProp; cudaGetDeviceProperties(&devProp, i); if(devProp.major >= 2){//only one device supported smCnt = max(smCnt, devProp.multiProcessorCount); if(devProp.multiProcessorCount == smCnt) devChosen = i; } } if(smCnt == 0){ //our ccl require CUDA cap 2.0 or above, but the Ostava's ccl can be run on any CUDA gpu printf("Error, no device with cap 2.x found. Only cpu alg will be run.\n"); getchar(); return 1; } if(smCnt != 0){ cudaSetDevice(devChosen); bn = bn * smCnt; } int nheight = (cvGetSize(srcimg).height-2) / (2*bn); if((nheight*2*bn+2) < cvGetSize(srcimg).height) nheight++; nheight = nheight*2*bn+2; if(smCnt != 0) printf("gpu ccl for image width 512, 1024, 2048.\nchoosing device %d, width %d, height %d, blocks %d\n", devChosen, nwidth, nheight, bn); srcimgb= cvCreateImage(cvSize(nwidth, cvGetSize(srcimg).height),IPL_DEPTH_8U,1); srcimgb2= cvCreateImage(cvSize(nwidth, cvGetSize(srcimg).height),IPL_DEPTH_8U,1); cvSetImageROI(srcimg, cvRect(0, 0, min(cvGetSize(srcimg).width, nwidth), cvGetSize(srcimg).height)); cvSetImageROI(srcimgb2, cvRect(0, 0, min(cvGetSize(srcimg).width, nwidth), cvGetSize(srcimg).height)); cvSet(srcimgb2, cvScalar(0,0,0)); cvCvtColor(srcimg, srcimgb2, CV_BGRA2GRAY); cvResetImageROI(srcimgb2); cvReleaseImage(&srcimg); if(srcbin) cvAdaptiveThreshold(srcimgb2, srcimgb, 1.0, CV_ADAPTIVE_THRESH_MEAN_C, invbk ? CV_THRESH_BINARY_INV : CV_THRESH_BINARY); else cvThreshold(srcimgb2, srcimgb, 0.0, 1.0, invbk ? CV_THRESH_BINARY_INV : CV_THRESH_BINARY); boundCheck(srcimgb); cvScale(srcimgb, srcimgb2, 255); //the source binary image to be labeled is saved as bsrc.bmp cvSaveImage("bsrc.bmp", srcimgb2); cvSet(srcimgb2, cvScalar(0,0,0)); float elapsedMilliSeconds1; {//begin cpu labeling algorithm, the SBLA proposed by Zhao LABELDATATYPE *data=(LABELDATATYPE *)malloc(srcimgb->width * srcimgb->height * sizeof(LABELDATATYPE)); for(int j = 0; j<srcimgb->height; j++) for(int i = 0; i<srcimgb->width; i++) data[i + j*srcimgb->width] = (srcimgb->imageData[i + j*srcimgb->widthStep]) ? 1 : 0; int iNumLabels; CPerformanceCounter perf; perf.Start(); iNumLabels = LabelSBLA(data, srcimgb->width, srcimgb->height); elapsedMilliSeconds1 = (float)perf.GetElapsedMilliSeconds(); printf("cpu SBLA used %f ms, total labels %u\n", elapsedMilliSeconds1, iNumLabels); free(data); } IplImage *src2(0),*dst2(0); int iNumLabels; float elapsedMilliSeconds2; {//begin cpu labeling algorithm, the BBDT proposed by C. Grana, D. Borghesani, R. Cucchiara CPerformanceCounter perf; src2 = cvCreateImage( cvGetSize(srcimgb), IPL_DEPTH_8U, 1 ); cvCopyImage(srcimgb,src2); dst2 = cvCreateImage( cvGetSize(srcimgb), IPL_DEPTH_32S, 1 ); perf.Start(); cvLabelingImageLab(src2, dst2, 1, &iNumLabels); elapsedMilliSeconds2 = (float)perf.GetElapsedMilliSeconds(); printf("cpu BBDT used %f ms, total labels %u\n", elapsedMilliSeconds2, iNumLabels); cvSaveImage("bbdt.bmp", dst2); // cvReleaseImage(&src2); // cvReleaseImage(&dst2); } if(smCnt != 0){ bugimg = cvCreateImage(cvSize(nwidth, 9*bn),IPL_DEPTH_8U,1); bimg = cvCreateImage(cvSize(nwidth, 2*bn),IPL_DEPTH_8U,1); b2img = cvCreateImage(cvSize(nwidth, 2*bn),IPL_DEPTH_8U,1); // cvNamedWindow("src",CV_WINDOW_AUTOSIZE); // cvShowImage("src",srcimg); //prepare buffers for our gpu algorithm CudaBuffer srcBuf, dstBuf, dstBuf2, bBuf, b2Buf, errBuf, glabel; srcBuf.Create2D(nwidth, nheight); //the binary image to be processed dstBuf.Create2D(nwidth, (nheight-2)/2); //the label result, only about 1/4 the size of source image contains the final labels dstBuf2.Create2D(nwidth,(nheight-2)/2); //a copy of the pass1 temp result, for debug purpose glabel.Create2D(4, 1); //a int size global buffer for unique final label errBuf.Create2D(nwidth, 9*bn); //a buffer for debug info bBuf.Create2D(nwidth, 2 * bn); //the intersection info used by pass2 b2Buf.Create2D(nwidth, 2 * bn); //a copy of bBuf for debug purpose srcBuf.SetZeroData(); srcBuf.CopyFrom(srcimgb->imageData, srcimgb->widthStep, nwidth, cvGetSize(srcimgb).height); float elapsedTimeInMs = 0.0f; //-------------------gpu part---------------------------- cudaEvent_t start, stop; cutilSafeCall ( cudaEventCreate( &start ) ); cutilSafeCall ( cudaEventCreate( &stop ) ); cutilSafeCall( cudaEventRecord( start, 0 ) ); if(nwidth == 512) label_512(&dstBuf, &dstBuf2, &srcBuf, &bBuf, &b2Buf, &glabel, nheight, bn, &errBuf); else if(nwidth == 1024) label_1024(&dstBuf, &dstBuf2, &srcBuf, &bBuf, &b2Buf, &glabel, nheight, bn, &errBuf); else if(nwidth == 2048) label_2048(&dstBuf, &dstBuf2, &srcBuf, &bBuf, &b2Buf, &glabel, nheight, bn, &errBuf); cutilSafeCall( cudaEventRecord( stop, 0 ) ); // cutilCheckMsg("kernel launch failure"); cudaEventSynchronize(stop); cutilSafeCall( cudaEventElapsedTime( &elapsedTimeInMs, start, stop ) ); uint tlabel = 0; cudaMemcpy(&tlabel, glabel.GetData(), 4, cudaMemcpyDeviceToHost); printf("gpu alg 1 used %f ms, total labels %u\n", elapsedTimeInMs, tlabel); dstBuf.CopyToHost(srcimgb->imageData, srcimgb->widthStep, nwidth, (nheight-2)/2); dstBuf2.CopyToHost(srcimgb2->imageData, srcimgb->widthStep, nwidth, (nheight-2)/2); errBuf.CopyToHost(bugimg->imageData, bugimg->widthStep, nwidth, 9*bn); bBuf.CopyToHost(bimg->imageData, bimg->widthStep, nwidth, 2*bn); b2Buf.CopyToHost(b2img->imageData, bimg->widthStep, nwidth, 2*bn); // cvNamedWindow("gpu",CV_WINDOW_AUTOSIZE); // cvShowImage("gpu",srcimgb); cvSaveImage(argv[1], srcimgb); cvSaveImage("gpu2.bmp", srcimgb2); //the final labels of our algorithm cvSaveImage("bug.bmp", bugimg); cvSaveImage("b.bmp", bimg); cvSaveImage("b2.bmp", b2img); //now start the gpu ccl implemented by Ostava alg2dst= cvCreateImage(cvSize(nwidth*4, cvGetSize(srcimgb).height),IPL_DEPTH_8U,1); CCLBase* m_ccl; m_ccl = new CCL(); m_ccl->FindRegions(nwidth, cvGetSize(srcimgb).height, &srcBuf); m_ccl->GetConnectedRegionsBuffer()->CopyToHost(alg2dst->imageData, alg2dst->widthStep, nwidth*4, cvGetSize(srcimgb).height); delete m_ccl; cvSaveImage("alg2.bmp", alg2dst); cvReleaseImage(&bugimg); cvReleaseImage(&bimg); cvReleaseImage(&b2img); cvReleaseImage(&alg2dst); // } //cvWaitKey(0); //now start cross compare label results of our ccl and the BBDT, to check the correctness // if(smCnt != 0){ ushort *gpures, *cpures; uint sz = nwidth * (cvGetSize(srcimgb).height/2); gpures = (ushort*)malloc(sz); cpures = (ushort*)malloc(sz); dstBuf.CopyToHost(gpures, nwidth, nwidth, (cvGetSize(srcimgb).height/2)); //first, reduce cpu labels from one label for each pixel to one label for a 2x2 block, assuming 8-connectivity for(int j = 0; j < (cvGetSize(srcimgb).height/2); j++) for(int i = 0; i < (nwidth/2); i++){ uint* cpup; ushort res = LBMAX; uint y = j*2, x = i*2; cpup = (uint*)(dst2->imageData + y*dst2->widthStep); // if(y < cvGetSize(srcimgb).height){ if(cpup[x] != 0) res = cpup[x]-1; if(cpup[x+1] != 0) res = cpup[x+1]-1; // } y++; cpup = (uint*)(dst2->imageData + y*dst2->widthStep); // if(y < cvGetSize(srcimgb).height){ if(cpup[x] != 0) res = cpup[x]-1; if(cpup[x+1] != 0) res = cpup[x+1]-1; // } cpures[i + j*(nwidth/2)] = res; } //our algo use unsigned short to represent a label, the first label starts a 0, and maximun labels is LBMAX if(iNumLabels > LBMAX) printf("too much cc, compare abort.\n"); else{ //create a error //cpures[5] = 12; //cpures[15] = 18; printf("Checking correctness of gpu alg1\nChecking gpu ref by cpu.\n"); checkLabels(cpures, gpures, nwidth/2, cvGetSize(srcimgb).height/2, iNumLabels); printf("Checking cpu ref by gpu.\n"); checkLabels(gpures, cpures, nwidth/2, cvGetSize(srcimgb).height/2, tlabel); } free(gpures); free(cpures); printf("speedup is %f, %f, %f\n", gpu2time/elapsedTimeInMs, elapsedMilliSeconds1/elapsedTimeInMs, elapsedMilliSeconds2/elapsedTimeInMs); } cvReleaseImage(&srcimgb); cvReleaseImage(&srcimgb2); cvReleaseImage(&dst2); cvReleaseImage(&src2); cutilSafeCall( cudaThreadExit() ); return 0; }