Example #1
// if we use 2 threads on the same gpu, we need to reinit the threads
void cuda_reset_device(int thr_id, bool *init)
{
	int dev_id = device_map[thr_id % MAX_GPUS];
	cudaSetDevice(dev_id);
	if (init != NULL) {
		// with the init array, it's meant to be used in the algo's scan code...
		for (int i=0; i < MAX_GPUS; i++) {
			if (device_map[i] == dev_id) {
				init[i] = false;
			}
		}
		// force exit from algo's scan loops/function
		restart_threads();
		cudaDeviceSynchronize();
		while (cudaStreamQuery(NULL) == cudaErrorNotReady)
			usleep(1000);
	}
	cudaDeviceReset();
	if (opt_cudaschedule >= 0) {
		cudaSetDeviceFlags((unsigned)(opt_cudaschedule & cudaDeviceScheduleMask));
	} else {
		cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
	}
	cudaDeviceSynchronize();
}
Example #2
void MFNHashTypePlainCUDA::setupDevice() {
    trace_printf("MFNHashTypePlainCUDA::setupDevice()\n");

    CHCUDAUtils *CudaUtils = MultiforcerGlobalClassFactory.getCudaUtilsClass();

    // Set the CUDA device
    trace_printf("Thread %d setting device to %d\n",this->threadId, this->gpuDeviceId);
    cudaSetDevice(this->gpuDeviceId);

    // If the user requests zerocopy and the device can handle it, add it.
    if (this->CommandLineData->GetUseZeroCopy() &&
            CudaUtils->getCudaCanMapHostMemory(this->gpuDeviceId)) {
        this->useZeroCopy = 1;
    }

    // If the device is integrated & can map memory, add it - integrated devices
    // are already sharing host memory, so no point in copying the data over.
    if (CudaUtils->getCudaIsIntegrated(this->gpuDeviceId) &&
            CudaUtils->getCudaCanMapHostMemory(this->gpuDeviceId)) {
        this->useZeroCopy = 1;
    }

    // Enable blocking sync.  This dramatically reduces CPU usage.
    // If zero copy is being used, set DeviceMapHost as well
    if (this->useZeroCopy) {
        cudaSetDeviceFlags(cudaDeviceBlockingSync | cudaDeviceMapHost);
    } else {
        cudaSetDeviceFlags(cudaDeviceBlockingSync);
    }

}
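A note on the flag names above: cudaDeviceBlockingSync is an older alias that CUDA 4.0 deprecated in favor of cudaDeviceScheduleBlockingSync. A minimal sketch of the same setup with the newer spelling (the helper name and its useZeroCopy parameter are illustrative, not from the original class):

#include <cuda_runtime.h>

// Sketch: same effect as the flag-setting block above, spelled with the
// newer flag name. Must run before the CUDA context is created.
static void enableBlockingSync(int useZeroCopy)
{
    unsigned int flags = cudaDeviceScheduleBlockingSync;
    if (useZeroCopy)
        flags |= cudaDeviceMapHost;  // required before any mapped (zero-copy) allocation
    cudaSetDeviceFlags(flags);
}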
Example #3
void THInit()
{
	static int init;

	if(init)
		return;
	init_yuv2rgb();
#ifndef USEBLAS
	blas_init();
#endif
	init = 1;
#if defined CUDNN && defined USECUDAHOSTALLOC
	// cuda_maphostmem = 1 requires that memory was allocated with cudaHostAlloc
	// cuda_maphostmem = 2 will work with malloc, but Tegra TX1 does not support cudaHostRegister with cudaHostRegisterMapped
	struct cudaDeviceProp prop;

	cudaGetDeviceProperties(&prop, 0);
	if(prop.canMapHostMemory)
	{
		errcheck(cudaSetDeviceFlags(cudaDeviceMapHost));
		cuda_maphostmem = 1;
	}
#endif
#ifdef OPENCL
	thopencl_init();
#endif
}
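errcheck() above is defined elsewhere in the project; a minimal sketch of what such a helper typically looks like (this exact definition is an assumption):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Assumed helper: abort with a readable message when a CUDA runtime call
// returns anything other than cudaSuccess.
#define errcheck(call) do { \
	cudaError_t err_ = (call); \
	if (err_ != cudaSuccess) { \
		fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
		exit(EXIT_FAILURE); \
	} \
} while (0)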
Example #4
 void setDeviceFlags(unsigned int flags)
 {
     cudaError_t cudaError = cudaSetDeviceFlags(flags);
     if(cudaError != cudaSuccess) {
         throw cudaError;
     }
 }
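Since the wrapper above throws the raw cudaError_t, a caller typically catches it and formats the message; a minimal usage sketch (the main() below is illustrative):

#include <cstdio>
#include <cuda_runtime.h>

void setDeviceFlags(unsigned int flags);  // the wrapper shown above

int main()
{
    try {
        setDeviceFlags(cudaDeviceScheduleBlockingSync | cudaDeviceMapHost);
    } catch (cudaError_t err) {
        std::fprintf(stderr, "cudaSetDeviceFlags failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}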
Example #5
Args<T>::Args(int _width){
	width = _width;
	gpuErrchk( cudaDeviceReset() );
	gpuErrchk( cudaSetDeviceFlags(cudaDeviceMapHost) );
	gpuErrchk( cudaHostAlloc((void **) &hostArray, width*sizeof(T), cudaHostAllocWriteCombined | cudaHostAllocMapped) );
	gpuErrchk( cudaHostGetDevicePointer(&deviceArray, hostArray, 0) );
}
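gpuErrchk() is assumed by the constructor above but not shown; one common definition, given here as a sketch rather than the project's actual macro:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Assumed helper: wrap each CUDA call and abort on failure with
// file/line context.
#define gpuErrchk(ans) gpuAssert((ans), __FILE__, __LINE__)
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
	if (code != cudaSuccess) {
		fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		exit(code);
	}
}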
Example #6
int main(int argc, char **argv)
{
    int gpu = 0;
    char *str = getenv("MV2_COMM_WORLD_LOCAL_RANK");
    if (!str) str =getenv("OMPI_COMM_WORLD_LOCAL_RANK");

    if (str) gpu=atoi(str);

    str =getenv("MV2_COMM_WORLD_RANK");
    if (!str) str =getenv("OMPI_COMM_WORLD_RANK");

    int s_env = 0;
    if (str) s_env = atoi(str);
    printf("Rank %d running on GPU %d\n",s_env, gpu);
    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaSetDevice(gpu);
    /* initialize context */
    cudaFree(0);

    putenv(enable_cuda);
    putenv(enable_threads);
    MPI_Init(&argc, &argv);

    int s,p;
    MPI_Comm_rank(MPI_COMM_WORLD, &s);
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    /* use a basic grid - processor mapping */
    proc_map = malloc(sizeof(int)*p);
    int i;
    for (i = 0; i < p; i++) proc_map[i] = i;

    /* basic test in n = 1 .. 7 dimensions */
    int nd;
    for (nd = 1; nd <= 7; nd++)
    {
        if (!s) printf("Testing distributed FFT in %d dimensions ...\n",nd);
        test_distributed_fft_nd(nd);
    }

    if (!s) printf("Compare against KISS FFT (d=1)...\n");

    for (i = 1; i < 14; ++i)
        /*for (i = 1; i < 24; ++i) */
    {
        int n = (1 << i);
        if (n <= p) continue;
        if (!s) printf("N=%d\n",n);
        test_distributed_fft_1d_compare(n);
    }

    if (!s) printf("Compare against KISS FFT (d=3)... \n");
    test_distributed_fft_3d_compare();

    free(proc_map);
    MPI_Finalize();
}
Example #7
int main() {

	time_t start = time(NULL);

	int dim = L * (nmax + 1);

	const real epsg = EPSG;
	const real epsf = EPSF;
	const real epsx = EPSX;
	const int maxits = MAXITS;
	stpscal = 0.5;
	int info;

	real* x;
	int* nbd;
	real* l;
	real* u;
	memAlloc<real>(&x, dim);
	memAlloc<int>(&nbd, dim);
	memAlloc<real>(&l, dim);
	memAlloc<real>(&u, dim);
	memAllocHost<real>(&f_tb_host, &f_tb_dev, 1);

	cudaSetDeviceFlags(cudaDeviceMapHost);
	cublasCreate_v2(&cublasHd);

	U = 1;
	J = 0.1;
	mu = 0.5;

	initProb(x, nbd, l, u, dim);
	lbfgsbminimize(dim, 4, x, epsg, epsf, epsx, maxits, nbd, l, u, info);
	printf("info: %d\n", info);

	printf("f: %e\n", *f_tb_host);
	real* x_host = new real[dim];
	memCopy(x_host, x, dim * sizeof(real), cudaMemcpyDeviceToHost);
	printf("x: ");
	for (int i = 0; i < dim; i++) {
		printf("%f, ", x_host[i]);
	}
	printf("\n");

	memFreeHost(f_tb_host);
	memFree(x);
	memFree(nbd);
	memFree(l);
	memFree(u);

	cublasDestroy_v2(cublasHd);

	cudaDeviceReset();

	time_t end = time(NULL);

	printf("Runtime: %ld", end-start);
}
Example #8
bool initCUDA(void)
{
#if __DEVICE_EMULATION__
	return true;
#else
	int count = 0;
	int i = 0;

	cudaGetDeviceCount(&count);
	if(count == 0) {
		fprintf(stderr, "There is no device.\n");
		return false;
	}

	printf("There are %d devices.\n", count);
	
	for(i = 0; i < count; i++) {
		cudaDeviceProp prop;
		if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
			if(prop.major >= 1) {
				// check mapping support on the device we are about to pick,
				// not on a possibly uninitialized prop after the loop
				if(!prop.canMapHostMemory) {
					fprintf(stderr, "Device %d cannot map host memory!\n", i);
					exit(EXIT_FAILURE);
				}
				break;
			}
		}
	}

	if(i == count) {
		fprintf(stderr, "Nu exista nici un device care suporta CUDA.\n");
		return false;
	}
	cudaSetDevice(cutGetMaxGflopsDeviceId());

	cudaSetDeviceFlags(cudaDeviceMapHost);
	checkCUDAError("cudaSetDeviceFlags");
	
	
	printf("CUDA initializat cu succes\n");

	// Create the CUTIL timer
    cutilCheckError( cutCreateTimer( &timer));

	return true;
#endif
}
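checkCUDAError() above follows the usual SDK-style pattern; a minimal sketch of such a helper (an assumption, not this project's exact code):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Assumed helper: report the last CUDA runtime error, if any, and exit.
static void checkCUDAError(const char *msg)
{
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA error: %s: %s\n", msg, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}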
Example #9
int main(int argc, char ** argv)
{
    srand(static_cast<unsigned int>(time(0)));

    std::cout.setf(std::ios_base::scientific, std::ios_base::floatfield);
    std::cout.setf(std::ios_base::showpos);
    std::cout.setf(std::ios_base::showpoint);
    std::cout.precision(7);

#ifdef _CUDA_
    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
#endif

#ifdef Test
    return Test_Func();
#else
    return Filtering(argc, argv);
#endif
}
Example #10
        void setDevice(int deviceNumber)
        {
            int num_gpus = 0; //count of gpus
            cudaGetDeviceCount(&num_gpus);
            //##ERROR handling
            if (num_gpus < 1) //check if a CUDA device is found
            {
                throw std::runtime_error("no CUDA capable devices detected");
            }
            else if (num_gpus < deviceNumber) //check if the device with deviceNumber can be selected
            {
                std::cerr << "no CUDA device " << deviceNumber << ", only " << num_gpus << " devices found" << std::endl;
                throw std::runtime_error("CUDA capable devices can't be selected");
            }

            cudaDeviceProp devProp;
            cudaError rc;
            CUDA_CHECK(cudaGetDeviceProperties(&devProp, deviceNumber));
            if (devProp.computeMode == cudaComputeModeDefault)
            {
                CUDA_CHECK(rc = cudaSetDevice(deviceNumber));
                if (cudaSuccess == rc)
                {
                    cudaDeviceProp dprop;
                    cudaGetDeviceProperties(&dprop, deviceNumber);
                    //!\todo: write this only on debug
                    log<ggLog::CUDA_RT > ("Set device to %1%: %2%") % deviceNumber % dprop.name;
                }
            }
            else
            {
                //gpu mode is cudaComputeModeExclusiveProcess and a free device is automatically selected.
                log<ggLog::CUDA_RT > ("Device is selected by CUDA automatically. (because cudaComputeModeDefault is not set)");
            }
            CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceScheduleYield));
        }
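CUDA_CHECK() is defined elsewhere in this codebase; a minimal sketch of such a macro, shaped so that the CUDA_CHECK(rc = ...) form used above also works (an assumption, not the project's definition):

#include <sstream>
#include <stdexcept>
#include <cuda_runtime.h>

// Assumed macro: evaluate a CUDA runtime expression once and throw a
// std::runtime_error carrying the error string and source location.
#define CUDA_CHECK(cmd)                                           \
    do {                                                          \
        cudaError_t err_ = (cmd);                                 \
        if (err_ != cudaSuccess) {                                \
            std::ostringstream msg_;                              \
            msg_ << "CUDA error " << cudaGetErrorString(err_)     \
                 << " at " << __FILE__ << ":" << __LINE__;        \
            throw std::runtime_error(msg_.str());                 \
        }                                                         \
    } while (0)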
Example #11
//////////////////////////////////////////////////////////////////////////////
// Program main
//////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    bool bTestResults = true;

    if (checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        printf("\n> Command line options\n");
        showHelp();
        return 0;
    }

    printf("Run \"nbody -benchmark [-numbodies=<numBodies>]\" to measure perfomance.\n");
    showHelp();

    bFullscreen  = (checkCmdLineFlag(argc, (const char **) argv, "fullscreen") != 0);

    if (bFullscreen)
    {
        bShowSliders = false;
    }

    benchmark    = (checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0);

    compareToCPU = ((checkCmdLineFlag(argc, (const char **) argv, "compare") != 0) ||
                    (checkCmdLineFlag(argc, (const char **) argv, "qatest")  != 0));

    QATest       = (checkCmdLineFlag(argc, (const char **) argv, "qatest")  != 0);
    useHostMem   = (checkCmdLineFlag(argc, (const char **) argv, "hostmem") != 0);
    fp64         = (checkCmdLineFlag(argc, (const char **) argv, "fp64") != 0);

    flopsPerInteraction = fp64 ? 30 : 20;

    useCpu       = (checkCmdLineFlag(argc, (const char **) argv, "cpu") != 0);

    if (checkCmdLineFlag(argc, (const char **)argv, "numdevices"))
    {
        numDevsRequested = getCmdLineArgumentInt(argc, (const char **) argv, "numdevices");

        if (numDevsRequested < 1)
        {
            printf("Error: \"number of CUDA devices\" specified %d is invalid.  Value should be >= 1\n", numDevsRequested);
            exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
        }
        else
        {
            printf("number of CUDA devices  = %d\n", numDevsRequested);
        }
    }

    // for multi-device we currently require using host memory -- the devices share
    // data via the host
    if (numDevsRequested > 1)
    {
        useHostMem = true;
    }

    int numDevsAvailable = 0;
    bool customGPU = false;
    cudaGetDeviceCount(&numDevsAvailable);

    if (numDevsAvailable < numDevsRequested)
    {
        printf("Error: only %d Devices available, %d requested.  Exiting.\n", numDevsAvailable, numDevsRequested);
        exit(EXIT_SUCCESS);
    }

    printf("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed");
    printf("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video");
    printf("> %s precision floating point simulation\n", fp64 ? "Double" : "Single");
    printf("> %d Devices used for simulation\n", numDevsRequested);

    int devID;
    cudaDeviceProp props;

    if (useCpu)
    {
        useHostMem = true;
        compareToCPU = false;
        bSupportDouble = true;

#ifdef OPENMP
        printf("> Simulation with CPU using OpenMP\n");
#else
        printf("> Simulation with CPU\n");
#endif
    }

    // Initialize GL and GLUT if necessary
    if (!benchmark && !compareToCPU)
    {
        initGL(&argc, argv);
        initParameters();
    }

    
    if(!useCpu)
    {
        // Now choose the CUDA Device
        // Either without GL interop:
        if (benchmark || compareToCPU || useHostMem)
        {
            // Note if we are using host memory for the body system, we
            // don't use CUDA-GL interop.

            if (checkCmdLineFlag(argc, (const char **)argv, "device"))
            {
                customGPU = true;
            }

            devID = findCudaDevice(argc, (const char **)argv);
        }
        else   // or with GL interop:
        {
            if (checkCmdLineFlag(argc, (const char **)argv, "device"))
            {
                customGPU = true;
            }

            devID = findCudaGLDevice(argc, (const char **)argv);
        }

        checkCudaErrors(cudaGetDevice(&devID));
        checkCudaErrors(cudaGetDeviceProperties(&props, devID));

        bSupportDouble = true;

#if CUDART_VERSION < 4000

        if (numDevsRequested > 1)
        {
            printf("MultiGPU n-body requires CUDA 4.0 or later\n");
            cudaDeviceReset();
            exit(EXIT_SUCCESS);
        }

#endif

        // Initialize devices
        if (numDevsRequested > 1 && customGPU)
        {
            printf("You can't use --numdevices and --device at the same time.\n");
            exit(EXIT_SUCCESS);
        }

        if (customGPU)
        {
            cudaDeviceProp props;
            checkCudaErrors(cudaGetDeviceProperties(&props, devID));
            printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name);
        }
        else
        {
            for (int i = 0; i < numDevsRequested; i++)
            {
                cudaDeviceProp props;
                checkCudaErrors(cudaGetDeviceProperties(&props, i));

                printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name);

                if (useHostMem)
                {
#if CUDART_VERSION >= 2020

                    if (!props.canMapHostMemory)
                    {
                        fprintf(stderr, "Device %d cannot map host memory!\n", devID);
                        cudaDeviceReset();
                        exit(EXIT_SUCCESS);
                    }

                    if (numDevsRequested > 1)
                    {
                        checkCudaErrors(cudaSetDevice(i));
                    }

                    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
#else
                    fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field\n");
                    cudaDeviceReset();
                    exit(EXIT_SUCCESS);
#endif
                }
            }

            // CC 1.2 and earlier do not support double precision
            if (props.major*10 + props.minor <= 12)
            {
                bSupportDouble = false;
            }
        }

        //if(numDevsRequested > 1)
        //    checkCudaErrors(cudaSetDevice(devID));

        if (fp64 && !bSupportDouble)
        {
            fprintf(stderr, "One or more of the requested devices does not support double precision floating-point\n");
            cudaDeviceReset();
            exit(EXIT_SUCCESS);
        }
    }

    numIterations = 0;
    p = 0;
    q = 1;

    if (checkCmdLineFlag(argc, (const char **)argv, "i"))
    {
        numIterations = getCmdLineArgumentInt(argc, (const char **)argv, "i");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "p"))
    {
        p = getCmdLineArgumentInt(argc, (const char **)argv, "p");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "q"))
    {
        q = getCmdLineArgumentInt(argc, (const char **)argv, "q");
    }

    if (p == 0)   // p not set on command line
    {
        p = 256;

        if (q * p > 256)
        {
            p = 256 / q;
            printf("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256);
        }
    }

    // default number of bodies is #SMs * 4 * CTA size
    if (useCpu)
#ifdef OPENMP
        numBodies = 8192;

#else
        numBodies = 4096;
#endif
    else if (numDevsRequested == 1)
Example #12
File: cu2.c Project: E-LLP/QuIP
static void init_cu2_device(QSP_ARG_DECL  int index, Compute_Platform *cpp)
{
	struct cudaDeviceProp deviceProp;
	cudaError_t e;
	Platform_Device *pdp;
	char name[LLEN];
	char dev_name[LLEN];
	char area_name[LLEN];
	const char *name_p;
	char *s;
	Data_Area *ap;
	float comp_cap;

	if( index >= MAX_CUDA_DEVICES ){
		sprintf(ERROR_STRING,"Program is compiled for a maximum of %d CUDA devices, can't inititialize device %d.",
			MAX_CUDA_DEVICES,index);
		ERROR1(ERROR_STRING);
	}

	if( verbose ){
		sprintf(ERROR_STRING,"init_cu2_device %d BEGIN",index);
		advise(ERROR_STRING);
	}

	if( (e=cudaGetDeviceProperties(&deviceProp, index)) != cudaSuccess ){
		describe_cuda_driver_error2("init_cu2_device","cudaGetDeviceProperties",e);
		return;
	}

	if (deviceProp.major == 9999 && deviceProp.minor == 9999){
		sprintf(ERROR_STRING,"There is no CUDA device with dev = %d!?.\n",index);
		WARN(ERROR_STRING);

		/* What should we do here??? */
		return;
	}

	/* Put the compute capability into a script variable so that we can use it */
	comp_cap = deviceProp.major * 10 + deviceProp.minor;
	if( comp_cap > CUDA_COMP_CAP ){
		sprintf(ERROR_STRING,"init_cu2_device:  CUDA device %s has compute capability %d.%d, but program was configured for %d.%d!?",
			deviceProp.name,deviceProp.major,deviceProp.minor,
			CUDA_COMP_CAP/10,CUDA_COMP_CAP%10);
		WARN(ERROR_STRING);
	}

	/* BUG if there are multiple devices, we need to make sure that this is set
	 * correctly for the current context!?
	 */
	sprintf(ERROR_STRING,"%d.%d",deviceProp.major,deviceProp.minor);
	assign_var(QSP_ARG  "cuda_comp_cap",ERROR_STRING);


	/* What does this do??? */
	e = cudaSetDeviceFlags( cudaDeviceMapHost );
	if( e != cudaSuccess ){
		describe_cuda_driver_error2("init_cu2_device",
			"cudaSetDeviceFlags",e);
	}

	strcpy(name,deviceProp.name);

	/* change spaces to underscores */
	s=name;
	while(*s){
		if( *s==' ' ) *s='_';
		s++;
	}

	/* We might have two of the same devices installed in a single system.
	 * In this case, we can't use the device name twice, because there will
	 * be a conflict.  The first one gets the name, then we have to check and
	 * make sure that the name is not in use already.  If it is, then we append
	 * a number to the string...
	 */
	name_p = available_pfdev_name(QSP_ARG  name,dev_name,cpp,MAX_CUDA_DEVICES);	// reuse name as scratch string
	pdp = new_pfdev(QSP_ARG  name_p);

#ifdef CAUTIOUS
	if( pdp == NO_PFDEV ){
		sprintf(ERROR_STRING,"CAUTIOUS:  init_cu2_device:  Error creating cuda device struct for %s!?",name_p);
		WARN(ERROR_STRING);
		return;
	}
#endif /* CAUTIOUS */

	/* Remember this name in case the default is not found */
	if( first_cuda_dev_name == NULL )
		first_cuda_dev_name = PFDEV_NAME(pdp);

	/* Compare this name against the default name set in
	 * the environment, if it exists...
	 */
	if( default_cuda_dev_name != NULL && ! default_cuda_dev_found ){
		if( !strcmp(PFDEV_NAME(pdp),default_cuda_dev_name) )
			default_cuda_dev_found=1;
	}

	SET_PFDEV_PLATFORM(pdp,cpp);
	SET_PFDEV_CUDA_INFO( pdp, getbuf(sizeof(Cuda_Dev_Info)) );

	SET_PFDEV_CUDA_DEV_INDEX(pdp,index);
	SET_PFDEV_CUDA_DEV_PROP(pdp,deviceProp);
	SET_PFDEV_CUDA_RNGEN(pdp,NULL);

	if( comp_cap >= 20 ){
		SET_PFDEV_MAX_DIMS(pdp,3);
	} else {
		SET_PFDEV_MAX_DIMS(pdp,2);
	}

	//set_cuda_device(pdp);	// is this call just so we can call cudaMalloc?
	PF_FUNC_NAME(set_device)(QSP_ARG  pdp);	// is this call just so we can call cudaMalloc?

	// address set to NULL says use custom allocator - see dobj/makedobj.c

	// BUG??  with pdp we may not need the DA_ flag???
	sprintf(area_name,"%s.%s",PLATFORM_NAME(cpp),name_p);
	ap = pf_area_init(QSP_ARG  area_name,NULL,0,
			MAX_CUDA_GLOBAL_OBJECTS,DA_CUDA_GLOBAL,pdp);
	if( ap == NO_AREA ){
		sprintf(ERROR_STRING,
	"init_cu2_device:  error creating global data area %s",area_name);
		WARN(ERROR_STRING);
	}
	// g++ won't take this line!?
	SET_AREA_CUDA_DEV(ap,pdp);
	//set_device_for_area(ap,pdp);

	SET_PFDEV_AREA(pdp,PFDEV_GLOBAL_AREA_INDEX,ap);

	/* We used to declare a heap for constant memory here,
	 * but there wasn't much of a point because:
	 * Constant memory can't be allocated, rather it is declared
	 * in the .cu code, and placed by the compiler as it sees fit.
	 * To have objects use this, we would have to declare a heap and
	 * manage it ourselves...
	 * There's only 64k, so we should be sparing...
	 * We'll try this later...
	 */


	/* Make up another area for the host memory
	 * which is locked and mappable to the device.
	 * We don't allocate a pool here, but do it as needed...
	 */

	//strcpy(area_name,name_p);
	//strcat(area_name,"_host");
	sprintf(area_name,"%s.%s_host",PLATFORM_NAME(cpp),name_p);
	ap = pf_area_init(QSP_ARG  area_name,(u_char *)NULL,0,MAX_CUDA_MAPPED_OBJECTS,
								DA_CUDA_HOST,pdp);
	if( ap == NO_AREA ){
		sprintf(ERROR_STRING,
	"init_cu2_device:  error creating host data area %s",area_name);
		ERROR1(ERROR_STRING);
	}
	SET_AREA_CUDA_DEV(ap, pdp);
	//cuda_data_area[index][CUDA_HOST_AREA_INDEX] = ap;
	SET_PFDEV_AREA(pdp,PFDEV_HOST_AREA_INDEX,ap);

	/* Make up another pseudo-area for the mapped host memory;
	 * This is the same memory as above, but mapped to the device.
	 * In the current implementation, we create objects in the host
	 * area, and then automatically create an alias on the device side.
	 * There is a BUG in that by having this pseudo-area in the data
	 * area name space, a user could select it as the data area and
	 * then try to create an object.  We will detect this in make_dobj,
	 * and complain.
	 */

	//strcpy(area_name,name_p);
	//strcat(area_name,"_host_mapped");
	sprintf(area_name,"%s.%s_host_mapped",PLATFORM_NAME(cpp),name_p);
	ap = pf_area_init(QSP_ARG  area_name,(u_char *)NULL,0,MAX_CUDA_MAPPED_OBJECTS,
							DA_CUDA_HOST_MAPPED,pdp);
	if( ap == NO_AREA ){
		sprintf(ERROR_STRING,
	"init_cu2_device:  error creating host-mapped data area %s",area_name);
		ERROR1(ERROR_STRING);
	}
	SET_AREA_CUDA_DEV(ap,pdp);
	//cuda_data_area[index][CUDA_HOST_MAPPED_AREA_INDEX] = ap;
	SET_PFDEV_AREA(pdp,PFDEV_HOST_MAPPED_AREA_INDEX,ap);

	// We don't change the data area by default any more when initializing...
	/* Restore the normal area */
	//set_data_area(PFDEV_AREA(pdp,PFDEV_GLOBAL_AREA_INDEX));

	if( verbose ){
		sprintf(ERROR_STRING,"init_cu2_device %d DONE",index);
		advise(ERROR_STRING);
	}
}
Exemple #13
0
void InitializeGlut(int *argc, char *argv[])
{
	int i,j;

	glutInit(argc, argv);
	glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
    glutInitWindowSize(screenwidth, screenheight);
	glutCreateWindow(argv[0]);
	glutDisplayFunc(Display);
	glutKeyboardFunc(Keyboard);

	// Support mapped pinned allocations
	cudaSetDeviceFlags(cudaDeviceMapHost);

	cudaGLSetGLDevice(0);

	cublasCreate_v2(&cublasHd);

	glewInit();
	GLint max_texture_size;
	glGetIntegerv(GL_MAX_TEXTURE_SIZE, &max_texture_size);
	if(max_texture_size < screenwidth || max_texture_size < screenheight) {
		printf("Max texture size (%d) is less than screen size (%d, %d)\n", max_texture_size, screenwidth, screenheight);
		exit(0);
	}

	//Create the textures
	glActiveTextureARB(GL_TEXTURE0_ARB);

	// Two textures for processing
	// Q. Why two?
	glGenTextures(2, Processed_Texture);
	glBindTexture(GL_TEXTURE_RECTANGLE_NV, Processed_Texture[0]);
	glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2, 0, 
		GL_RGBA, GL_FLOAT, NULL);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP);

	glBindTexture(GL_TEXTURE_RECTANGLE_NV, Processed_Texture[1]);
	glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2, 0, 
		GL_RGBA, GL_FLOAT, NULL);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP);

	// Texture for sites
	// Q. Why is this separate from the processing textures?
	glGenTextures(1, &Site_Texture);
	glBindTexture(GL_TEXTURE_RECTANGLE_NV, Site_Texture);
	glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2, 0, 
		GL_RGBA, GL_FLOAT, NULL);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP);

	// Registers the texture or renderbuffer object specified by image for access by CUDA.
	// A handle to the registered object is returned as resource
	cutilSafeCall(cudaGraphicsGLRegisterImage(&grSite, Site_Texture, 
		GL_TEXTURE_RECTANGLE_NV, cudaGraphicsMapFlagsReadOnly));

	// Energy textures
	// Two of them, same as the processing textures
	// Q. Why??
	glGenTextures(2, Energy_Texture);
	glBindTexture(GL_TEXTURE_RECTANGLE_NV, Energy_Texture[0]);
	glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2, 0, 
		GL_RGBA, GL_FLOAT, NULL);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP);

	glBindTexture(GL_TEXTURE_RECTANGLE_NV, Energy_Texture[1]);
	glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2, 0, 
		GL_RGBA, GL_FLOAT, NULL);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP);

	// Index texture
	// Indices are represented as colors
	glGenTextures(1, &IndexColor_Texture);
	glBindTexture(GL_TEXTURE_RECTANGLE_NV, IndexColor_Texture);
	glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA, screenwidth, screenheight, 0, 
		GL_RGBA, GL_FLOAT, NULL);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP);
	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP);

	// Render Buffer Object
	glGenFramebuffersEXT(1, &RB_object);
	glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, RB_object);
	glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_RGBA32F_ARB, screenwidth+2, screenheight+2);

	// Frame(?) Buffer Object
	glGenFramebuffersEXT(1, &FB_objects);

	// ????
	// Only verified that this is NVIDIA-specific
	// http://developer.download.nvidia.com/opengl/specs/nvOpenGLspecs.pdf
	glGetQueryiv(GL_SAMPLES_PASSED_ARB, GL_QUERY_COUNTER_BITS_ARB, &oq_bitsSupported);
	glGenQueriesARB(1, &occlusion_query);

	InitCg();

	// Precompiled display list of screen pixels
	ScreenPointsList = glGenLists(1);
	glNewList(ScreenPointsList, GL_COMPILE);
	glBegin(GL_POINTS);
	for (i=0; i<screenwidth; i++)
		for (j=0; j<screenheight; j++)
			glVertex2f(i+1.5, j+1.5);
	glEnd();
	glEndList();

}
Example #14
int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t _ALIGN(64) endiandata[32];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	uint32_t start_nonce = pdata[19];
	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 17);
	if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce);

	uint32_t *outputHash = (uint32_t*)malloc(throughput * 64);

	if (opt_benchmark)
		ptarget[7] = 0x0000ff;

	// init
	if(!init[thr_id])
	{
		cudaSetDevice(device_map[thr_id]);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			CUDA_LOG_ERROR();
		}
		myriadgroestl_cpu_init(thr_id, throughput);
		init[thr_id] = true;
	}

	for (int k=0; k < 20; k++)
		be32enc(&endiandata[k], pdata[k]);

	// Prepare the context with the endian-swapped block header (the nonce is replaced later)
	myriadgroestl_cpu_setBlock(thr_id, endiandata, (void*)ptarget);

	do {
		// GPU
		uint32_t foundNounce = UINT32_MAX;

		myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce);

		*hashes_done = pdata[19] - start_nonce + throughput;

		if (foundNounce < UINT32_MAX && bench_algo < 0)
		{
			uint32_t _ALIGN(64) vhash[8];
			endiandata[19] = swab32(foundNounce);
			myriadhash(vhash, endiandata);
			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
				work_set_target_ratio(work, vhash);
				pdata[19] = foundNounce;
				free(outputHash);
				return 1;
			} else {
				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNounce);
			}
		}

		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}
		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart);

	*hashes_done = max_nonce - start_nonce;

	free(outputHash);
	return 0;
}
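The reset-before-flags ordering above is deliberate: on older CUDA runtimes, cudaSetDeviceFlags() can fail with cudaErrorSetOnActiveProcess once the primary context already exists, so the miner resets the device before setting the scheduling flag. A minimal sketch of that pattern (the function name is illustrative):

#include <cuda_runtime.h>

// Sketch: apply a scheduling flag to a fresh context. Resetting first
// matters on older runtimes, where cudaSetDeviceFlags() can fail with
// cudaErrorSetOnActiveProcess if the primary context is already active.
static cudaError_t resetWithBlockingSync(int dev)
{
	cudaSetDevice(dev);
	cudaDeviceReset();  // drop any existing context on this device
	return cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
}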
Example #15
PetscErrorCode  PetscOptionsCheckInitial_Private(void)
{
  char              string[64],mname[PETSC_MAX_PATH_LEN],*f;
  MPI_Comm          comm = PETSC_COMM_WORLD;
  PetscBool         flg1 = PETSC_FALSE,flg2 = PETSC_FALSE,flg3 = PETSC_FALSE,flag;
  PetscErrorCode    ierr;
  PetscReal         si;
  PetscInt          intensity;
  int               i;
  PetscMPIInt       rank;
  char              version[256];
#if !defined(PETSC_HAVE_THREADSAFETY)
  PetscReal         logthreshold;
#endif
#if defined(PETSC_USE_LOG)
  PetscViewerFormat format;
  PetscBool         flg4 = PETSC_FALSE;
#endif
  
  PetscFunctionBegin;
  ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);

#if !defined(PETSC_HAVE_THREADSAFETY)
  /*
      Setup the memory management; support for tracing malloc() usage
  */
  ierr = PetscOptionsHasName(NULL,"-malloc_log",&flg3);CHKERRQ(ierr);
  logthreshold = 0.0;
  ierr = PetscOptionsGetReal(NULL,"-malloc_log_threshold",&logthreshold,&flg1);CHKERRQ(ierr);
  if (flg1) flg3 = PETSC_TRUE;
#if defined(PETSC_USE_DEBUG)
  ierr = PetscOptionsGetBool(NULL,"-malloc",&flg1,&flg2);CHKERRQ(ierr);
  if ((!flg2 || flg1) && !petscsetmallocvisited) {
    if (flg2 || !(PETSC_RUNNING_ON_VALGRIND)) {
      /* turn off default -malloc if valgrind is being used */
      ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscOptionsGetBool(NULL,"-malloc_dump",&flg1,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsGetBool(NULL,"-malloc",&flg2,NULL);CHKERRQ(ierr);
  if (flg1 || flg2 || flg3) {ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);}
#endif
  if (flg3) {
    ierr = PetscMallocSetDumpLogThreshold((PetscLogDouble)logthreshold);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-malloc_debug",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);
    ierr = PetscMallocDebug(PETSC_TRUE);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-malloc_test",&flg1,NULL);CHKERRQ(ierr);
#if defined(PETSC_USE_DEBUG)
  if (flg1 && !PETSC_RUNNING_ON_VALGRIND) {
    ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);
    ierr = PetscMallocSetDumpLog();CHKERRQ(ierr);
    ierr = PetscMallocDebug(PETSC_TRUE);CHKERRQ(ierr);
  }
#endif

  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-malloc_info",&flg1,NULL);CHKERRQ(ierr);
  if (!flg1) {
    flg1 = PETSC_FALSE;
    ierr = PetscOptionsGetBool(NULL,"-memory_view",&flg1,NULL);CHKERRQ(ierr);
  }
  if (flg1) {
    ierr = PetscMemorySetGetMaximumUsage();CHKERRQ(ierr);
  }
#endif

#if defined(PETSC_USE_LOG)
  ierr = PetscOptionsHasName(NULL,"-objects_dump",&PetscObjectsLog);CHKERRQ(ierr);
#endif

  /*
      Set the display variable for graphics
  */
  ierr = PetscSetDisplay();CHKERRQ(ierr);

  /*
      Print the PETSc version information
  */
  ierr = PetscOptionsHasName(NULL,"-v",&flg1);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-version",&flg2);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-help",&flg3);CHKERRQ(ierr);
  if (flg1 || flg2 || flg3) {

    /*
       Print "higher-level" package version message
    */
    if (PetscExternalVersionFunction) {
      ierr = (*PetscExternalVersionFunction)(comm);CHKERRQ(ierr);
    }

    ierr = PetscGetVersion(version,256);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"--------------------------------------------\
------------------------------\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"%s\n",version);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"%s",PETSC_AUTHOR_INFO);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"See docs/changes/index.html for recent updates.\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"See docs/faq.html for problems.\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"See docs/manualpages/index.html for help. \n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"Libraries linked from %s\n",PETSC_LIB_DIR);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"--------------------------------------------\
------------------------------\n");CHKERRQ(ierr);
  }

  /*
       Print "higher-level" package help message
  */
  if (flg3) {
    if (PetscExternalHelpFunction) {
      ierr = (*PetscExternalHelpFunction)(comm);CHKERRQ(ierr);
    }
  }

  /*
      Setup the error handling
  */
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-on_error_abort",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    ierr = MPI_Comm_set_errhandler(PETSC_COMM_WORLD,MPI_ERRORS_ARE_FATAL);CHKERRQ(ierr);
    ierr = PetscPushErrorHandler(PetscAbortErrorHandler,0);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-on_error_mpiabort",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) { ierr = PetscPushErrorHandler(PetscMPIAbortErrorHandler,0);CHKERRQ(ierr);}
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-mpi_return_on_error",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    ierr = MPI_Comm_set_errhandler(comm,MPI_ERRORS_RETURN);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-no_signal_handler",&flg1,NULL);CHKERRQ(ierr);
  if (!flg1) {ierr = PetscPushSignalHandler(PetscSignalHandlerDefault,(void*)0);CHKERRQ(ierr);}
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-fp_trap",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {ierr = PetscSetFPTrap(PETSC_FP_TRAP_ON);CHKERRQ(ierr);}
  ierr = PetscOptionsGetInt(NULL,"-check_pointer_intensity",&intensity,&flag);CHKERRQ(ierr);
  if (flag) {ierr = PetscCheckPointerSetIntensity(intensity);CHKERRQ(ierr);}

  /*
      Setup debugger information
  */
  ierr = PetscSetDefaultDebugger();CHKERRQ(ierr);
  ierr = PetscOptionsGetString(NULL,"-on_error_attach_debugger",string,64,&flg1);CHKERRQ(ierr);
  if (flg1) {
    MPI_Errhandler err_handler;

    ierr = PetscSetDebuggerFromString(string);CHKERRQ(ierr);
    ierr = MPI_Comm_create_errhandler((MPI_Handler_function*)Petsc_MPI_DebuggerOnError,&err_handler);CHKERRQ(ierr);
    ierr = MPI_Comm_set_errhandler(comm,err_handler);CHKERRQ(ierr);
    ierr = PetscPushErrorHandler(PetscAttachDebuggerErrorHandler,0);CHKERRQ(ierr);
  }
  ierr = PetscOptionsGetString(NULL,"-debug_terminal",string,64,&flg1);CHKERRQ(ierr);
  if (flg1) { ierr = PetscSetDebugTerminal(string);CHKERRQ(ierr); }
  ierr = PetscOptionsGetString(NULL,"-start_in_debugger",string,64,&flg1);CHKERRQ(ierr);
  ierr = PetscOptionsGetString(NULL,"-stop_for_debugger",string,64,&flg2);CHKERRQ(ierr);
  if (flg1 || flg2) {
    PetscMPIInt    size;
    PetscInt       lsize,*nodes;
    MPI_Errhandler err_handler;
    /*
       we have to make sure that all processors have opened
       connections to all other processors, otherwise once the
       debugger has started it is likely to receive a SIGUSR1
       and kill the program.
    */
    ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
    if (size > 2) {
      PetscMPIInt dummy = 0;
      MPI_Status  status;
      for (i=0; i<size; i++) {
        if (rank != i) {
          ierr = MPI_Send(&dummy,1,MPI_INT,i,109,PETSC_COMM_WORLD);CHKERRQ(ierr);
        }
      }
      for (i=0; i<size; i++) {
        if (rank != i) {
          ierr = MPI_Recv(&dummy,1,MPI_INT,i,109,PETSC_COMM_WORLD,&status);CHKERRQ(ierr);
        }
      }
    }
    /* check if this processor node should be in debugger */
    ierr  = PetscMalloc1(size,&nodes);CHKERRQ(ierr);
    lsize = size;
    ierr  = PetscOptionsGetIntArray(NULL,"-debugger_nodes",nodes,&lsize,&flag);CHKERRQ(ierr);
    if (flag) {
      for (i=0; i<lsize; i++) {
        if (nodes[i] == rank) { flag = PETSC_FALSE; break; }
      }
    }
    if (!flag) {
      ierr = PetscSetDebuggerFromString(string);CHKERRQ(ierr);
      ierr = PetscPushErrorHandler(PetscAbortErrorHandler,0);CHKERRQ(ierr);
      if (flg1) {
        ierr = PetscAttachDebugger();CHKERRQ(ierr);
      } else {
        ierr = PetscStopForDebugger();CHKERRQ(ierr);
      }
      ierr = MPI_Comm_create_errhandler((MPI_Handler_function*)Petsc_MPI_AbortOnError,&err_handler);CHKERRQ(ierr);
      ierr = MPI_Comm_set_errhandler(comm,err_handler);CHKERRQ(ierr);
    }
    ierr = PetscFree(nodes);CHKERRQ(ierr);
  }

  ierr = PetscOptionsGetString(NULL,"-on_error_emacs",emacsmachinename,128,&flg1);CHKERRQ(ierr);
  if (flg1 && !rank) {ierr = PetscPushErrorHandler(PetscEmacsClientErrorHandler,emacsmachinename);CHKERRQ(ierr);}

  /*
        Setup profiling and logging
  */
#if defined(PETSC_USE_INFO)
  {
    char logname[PETSC_MAX_PATH_LEN]; logname[0] = 0;
    ierr = PetscOptionsGetString(NULL,"-info",logname,250,&flg1);CHKERRQ(ierr);
    if (flg1 && logname[0]) {
      ierr = PetscInfoAllow(PETSC_TRUE,logname);CHKERRQ(ierr);
    } else if (flg1) {
      ierr = PetscInfoAllow(PETSC_TRUE,NULL);CHKERRQ(ierr);
    }
  }
#endif
#if defined(PETSC_USE_LOG)
  mname[0] = 0;
  ierr = PetscOptionsGetString(NULL,"-history",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr);
  if (flg1) {
    if (mname[0]) {
      ierr = PetscOpenHistoryFile(mname,&petsc_history);CHKERRQ(ierr);
    } else {
      ierr = PetscOpenHistoryFile(NULL,&petsc_history);CHKERRQ(ierr);
    }
  }
#if defined(PETSC_HAVE_MPE)
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsHasName(NULL,"-log_mpe",&flg1);CHKERRQ(ierr);
  if (flg1) {ierr = PetscLogMPEBegin();CHKERRQ(ierr);}
#endif
  flg1 = PETSC_FALSE;
  flg3 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-log_all",&flg1,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-log_summary",&flg3);CHKERRQ(ierr);
  if (flg1)                      { ierr = PetscLogAllBegin();CHKERRQ(ierr); }
  else if (flg3)                 { ierr = PetscLogDefaultBegin();CHKERRQ(ierr);}

  ierr = PetscOptionsGetString(NULL,"-log_trace",mname,250,&flg1);CHKERRQ(ierr);
  if (flg1) {
    char name[PETSC_MAX_PATH_LEN],fname[PETSC_MAX_PATH_LEN];
    FILE *file;
    if (mname[0]) {
      sprintf(name,"%s.%d",mname,rank);
      ierr = PetscFixFilename(name,fname);CHKERRQ(ierr);
      file = fopen(fname,"w");
      if (!file) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_FILE_OPEN,"Unable to open trace file: %s",fname);
    } else file = PETSC_STDOUT;
    ierr = PetscLogTraceBegin(file);CHKERRQ(ierr);
  }

  ierr   = PetscOptionsGetViewer(PETSC_COMM_WORLD,NULL,"-log_view",NULL,&format,&flg4);CHKERRQ(ierr);
  if (flg4) {
    if (format == PETSC_VIEWER_ASCII_XML){
      ierr = PetscLogNestedBegin();CHKERRQ(ierr);
    } else {
      ierr = PetscLogDefaultBegin();CHKERRQ(ierr);
    }
  }
#endif

  ierr = PetscOptionsGetBool(NULL,"-saws_options",&PetscOptionsPublish,NULL);CHKERRQ(ierr);

#if defined(PETSC_HAVE_CUDA)
  ierr = PetscOptionsHasName(NULL,"-cuda_show_devices",&flg1);CHKERRQ(ierr);
  if (flg1) {
    struct cudaDeviceProp prop;
    int                   devCount;
    int                   device;
    cudaError_t           err = cudaSuccess;

    err = cudaGetDeviceCount(&devCount);
    if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceCount %s",cudaGetErrorString(err));
    for (device = 0; device < devCount; ++device) {
      err = cudaGetDeviceProperties(&prop, device);
      if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceProperties %s",cudaGetErrorString(err));
      ierr = PetscPrintf(PETSC_COMM_WORLD, "CUDA device %d: %s\n", device, prop.name);CHKERRQ(ierr);
    }
  }
  {
    int size;
    ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
    if (size>1) {
      int         devCount, device, rank;
      cudaError_t err = cudaSuccess;

      /* check to see if we force multiple ranks to hit the same GPU */
      ierr = PetscOptionsGetInt(NULL,"-cuda_set_device", &device, &flg1);CHKERRQ(ierr);
      if (flg1) {
        err = cudaSetDevice(device);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err));
      } else {
        /* we're not using the same GPU on multiple MPI processes, so try to allocate different GPUs to different processes */

        /* First get the device count */
        err   = cudaGetDeviceCount(&devCount);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceCount %s",cudaGetErrorString(err));

        /* next determine the rank and then set the device via a mod */
        ierr   = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
        device = rank % devCount;
        err    = cudaSetDevice(device);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err));
      }

      /* set the device flags so that it can map host memory ... do NOT throw exception on err!=cudaSuccess
       multiple devices may try to set the flags on the same device. So long as one of them succeeds, things
       are ok. */
      err = cudaSetDeviceFlags(cudaDeviceMapHost);
      if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDeviceFlags %s",cudaGetErrorString(err));
    } else {
      int         device;
      cudaError_t err = cudaSuccess;

      /* the code below works for serial GPU simulations */
      ierr = PetscOptionsGetInt(NULL,"-cuda_set_device", &device, &flg1);CHKERRQ(ierr);
      if (flg1) {
        err = cudaSetDevice(device);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err));
      }

      /* set the device flags so that it can map host memory ... here, we error check. */
      err = cudaSetDeviceFlags(cudaDeviceMapHost);
      if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDeviceFlags %s",cudaGetErrorString(err));
    }
  }
#endif


  /*
       Print basic help message
  */
  ierr = PetscOptionsHasName(NULL,"-help",&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = (*PetscHelpPrintf)(comm,"Options for all PETSc programs:\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -help: prints help method for each option\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -on_error_abort: cause an abort when an error is detected. Useful \n ");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       only when run in the debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -on_error_attach_debugger [gdb,dbx,xxgdb,ups,noxterm]\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       start the debugger in new xterm\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       unless noxterm is given\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -start_in_debugger [gdb,dbx,xxgdb,ups,noxterm]\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       start all processes in the debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -on_error_emacs <machinename>\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"    emacs jumps to error file\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -debugger_nodes [n1,n2,..] Nodes to start in debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -debugger_pause [m] : delay (in seconds) to attach debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -stop_for_debugger : prints message on how to attach debugger manually\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"                      waits the delay for you to attach\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -display display: Location where X window graphics and debuggers are displayed\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -no_signal_handler: do not trap error signals\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -mpi_return_on_error: MPI returns error code, rather than abort on internal error\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -fp_trap: stop on floating point exceptions\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"           note on IBM RS6000 this slows run greatly\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_dump <optional filename>: dump list of unfreed memory at conclusion\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc: use our error checking malloc\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc no: don't use error checking malloc\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_info: prints total memory usage\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_log: keeps log of all memory allocations\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_debug: enables extended checking for memory corruption\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_table: dump list of options inputted\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_left: dump list of unused options\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_left no: don't dump list of unused options\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -tmp tmpdir: alternative /tmp directory\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -shared_tmp: tmp directory is shared by all processors\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -not_shared_tmp: each processor has separate tmp directory\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -memory_view: print memory usage at end of run\n");CHKERRQ(ierr);
#if defined(PETSC_USE_LOG)
    ierr = (*PetscHelpPrintf)(comm," -get_total_flops: total flops over all processors\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -log[_summary _summary_python]: logging objects and events\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -log_trace [filename]: prints trace of all PETSc calls\n");CHKERRQ(ierr);
#if defined(PETSC_HAVE_MPE)
    ierr = (*PetscHelpPrintf)(comm," -log_mpe: Also create logfile viewable through Jumpshot\n");CHKERRQ(ierr);
#endif
    ierr = (*PetscHelpPrintf)(comm," -info <optional filename>: print informative messages about the calculations\n");CHKERRQ(ierr);
#endif
    ierr = (*PetscHelpPrintf)(comm," -v: prints PETSc version number and release date\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_file <file>: reads options from file\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -petsc_sleep n: sleeps n seconds before running program\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"-----------------------------------------------\n");CHKERRQ(ierr);
  }

#if defined(PETSC_HAVE_POPEN)
  {
  char machine[128];
  ierr = PetscOptionsGetString(NULL,"-popen_machine",machine,128,&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscPOpenSetMachine(machine);CHKERRQ(ierr);
  }
  }
#endif

  ierr = PetscOptionsGetReal(NULL,"-petsc_sleep",&si,&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscSleep(si);CHKERRQ(ierr);
  }

  ierr = PetscOptionsGetString(NULL,"-info_exclude",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscStrstr(mname,"null",&f);CHKERRQ(ierr);
    if (f) {
      ierr = PetscInfoDeactivateClass(0);CHKERRQ(ierr);
    }
  }

#if defined(PETSC_HAVE_CUSP) || defined(PETSC_HAVE_VIENNACL)
  ierr = PetscOptionsHasName(NULL,"-log_summary",&flg3);CHKERRQ(ierr);
  if (!flg3) {
  ierr = PetscOptionsHasName(NULL,"-log_view",&flg3);CHKERRQ(ierr);
  }
#endif
#if defined(PETSC_HAVE_CUSP)
  ierr = PetscOptionsGetBool(NULL,"-cusp_synchronize",&flg3,NULL);CHKERRQ(ierr);
  PetscCUSPSynchronize = flg3;
#elif defined(PETSC_HAVE_VIENNACL)
  ierr = PetscOptionsGetBool(NULL,"-viennacl_synchronize",&flg3,NULL);CHKERRQ(ierr);
  PetscViennaCLSynchronize = flg3;
#endif
  PetscFunctionReturn(0);
}
Example #16
/**
 * This function does what it says on the tin.
 */
void OptiXRenderer::performRender(long long int photons, int argc_mpi, char* argv_mpi[], int width, int height, float film_location) {
	// Keep track of time
	timeval tic;

	// Create OptiX context
	optix::Context context = optix::Context::create();
	context->setRayTypeCount( 1 );

	// Debug, this will make everything SLOOOOOW
	context->setPrintEnabled(false);

	// Set some CUDA flags
	cudaSetDeviceFlags(cudaDeviceMapHost | cudaDeviceLmemResizeToMax);

	// Set used devices
	int tmp[] = { 0, 1 };
	std::vector<int> v( tmp, tmp+2 );
	context->setDevices(v.begin(), v.end());

	// Report device usage
	int num_devices = context->getEnabledDeviceCount();
	printf("Using %d devices:\n", num_devices);
	std::vector<int> enabled_devices =  context->getEnabledDevices();
	for(int i=0;i<num_devices;i++) {
		printf("    Device #%d [%s]\n", enabled_devices[i], context->getDeviceName(enabled_devices[i]).c_str());
	}

	// Set some OptiX variables
	context->setStackSize(4096);

	// Report OptiX information
	int stack_size_in_bytes = context->getStackSize();
	printf("OptiX stack size is %d bytes (~%d KB).\n", stack_size_in_bytes, stack_size_in_bytes/1024);

	// Declare some variables
	int threads = 500000; //20000000;
	unsigned int iterations_on_device = 1;

	// Set some scene-wide variables
	context["photon_ray_type"]->setUint( 0u );
	context["scene_bounce_limit"]->setUint( 10u );
	context["scene_epsilon"]->setFloat( 1.e-4f );
	context["iterations"]->setUint(iterations_on_device);
	context["follow_photon"]->setInt(66752);

	// Convert our existing scene into an OptiX one
	convertToOptiXScene(context, width, height, film_location);

	// Report information
	printf("Rendering with:\n");
	printf("    %lld photons.\n", photons);
	printf("    %d threads.\n", threads);
	printf("    %d iterations per thread.\n", iterations_on_device);
	int launches = (photons/threads)/iterations_on_device;
	if(launches*threads*iterations_on_device<photons) {
		launches++;
		printf("    NOTE: You have asked for %lld photons, we are providing %lld photons instead.\n", photons, launches*threads*iterations_on_device);
	}
	printf("    %d optix launches.\n", launches);

	// Create buffer for random numbers
	optix::Buffer random_buffer = context->createBufferForCUDA( RT_BUFFER_INPUT_OUTPUT | RT_BUFFER_GPU_LOCAL, RT_FORMAT_USER, threads );
	random_buffer->setElementSize(sizeof(curandState));
	curandState* states_ptr[num_devices];

	// Initialise
	for(int i=0;i<num_devices;i++) {
		int device_id = enabled_devices[i];
		long memory_in_bytes = threads * sizeof(curandState);
		long memory_in_megabytes = memory_in_bytes/(1024*1024);
		printf("Allocating %ld bytes (~%ld MB) of memory on device #%d for random states...\n", memory_in_bytes, memory_in_megabytes, device_id);
		gettimeofday(&tic, NULL);
		cudaSetDevice(device_id);
		cudaMalloc((void **)&states_ptr[i], memory_in_bytes);
		done(tic);
		CUDAWrapper executer;
		executer.curand_setup(threads, (void **)&states_ptr[i], time(NULL), i);
	}

	// Set as buffer on context
	context["states"]->set(random_buffer);

	// Wait
	printf("Waiting for random states to initalise...\n");
	gettimeofday(&tic, NULL);
	for(int i=0;i<num_devices;i++) {
		cudaSetDevice(enabled_devices[i]);
		sync_all_threads();
	}
	done(tic);

	// Bind to the OptiX buffer
	// We do this here because it causes a synchronise, apparently
	for(int i=0;i<num_devices;i++) {
		random_buffer->setDevicePointer(enabled_devices[i], (CUdeviceptr) states_ptr[i]);
	}

	// Create Image buffer
	optix::Buffer buffer = context->createBufferForCUDA( RT_BUFFER_INPUT_OUTPUT | RT_BUFFER_GPU_LOCAL, RT_FORMAT_FLOAT4, width, height );
	optix::float4* imgs_ptr[num_devices];

	//cudaSetDevice(0);
	//cudaMalloc((void **)&states_ptr_0, threads * sizeof(curandState));
	for(int i=0;i<num_devices;i++) {
		int device_id = enabled_devices[i];
		long memory_in_bytes = width * height * sizeof(optix::float4);
		long memory_in_megabytes = memory_in_bytes/(1024*1024);
		printf("Allocating %ld bytes (~%ld MB) of memory on device #%d for image result...\n", memory_in_bytes, memory_in_megabytes, device_id);
		gettimeofday(&tic, NULL);
		cudaSetDevice(device_id);
		cudaMalloc((void **)&imgs_ptr[i], memory_in_bytes);
		done(tic);
		CUDAWrapper executer;
		executer.img_setup((void **)&imgs_ptr[i], width, height);
	}

	// Set as buffer on context
	context["output_buffer"]->set(buffer);

	// Wait for everything to execute
	printf("Waiting for image data to initialise...\n");
	gettimeofday(&tic, NULL);
	for(int i=0;i<num_devices;i++) {
		cudaSetDevice(enabled_devices[i]);
		sync_all_threads();
	}
	done(tic);

	// Bind to the OptiX buffer
	// We do this here because it causes a synchronise, apparently
	for(int i=0;i<num_devices;i++) {
		buffer->setDevicePointer(enabled_devices[i], (CUdeviceptr) imgs_ptr[i]);
	}

	// Construct MPI
	int size, rank = 0;
	#ifndef PHOTON_MPI
	(void)size;
	(void)argc_mpi;
	(void)argv_mpi;
	#endif

	#ifdef PHOTON_MPI
		MPI::Init( argc_mpi, argv_mpi );

		//MPI_Get_processor_name(hostname,&strlen);
		rank = MPI::COMM_WORLD.Get_rank();
		size = MPI::COMM_WORLD.Get_size();
		printf("Hello, world; from process %d of %d\n", rank, size);

		// Adjust number of photons for MPI
		long long int long_size = (long long int) size;
		photons = photons/long_size;
		if(rank==0)	printf("MPI adjusted to %lld photons per thread", photons);
	#endif /* MPI */

	// Validate
	try{
		context->validate();
	}catch(Exception& e){
		printf("Validate error!\n");
		printf("    CUDA says  : %s\n",  cudaGetErrorString(cudaPeekAtLastError()));
		printf("    OptiX says : %s\n", e.getErrorString().c_str() );
		return;
	}

	// Compile context
	try{
		context->compile();
	}catch(Exception& e){
		printf("Compile error!\n");
		printf("    CUDA says  : %s\n",  cudaGetErrorString(cudaPeekAtLastError()));
		printf("    OptiX says : %s\n", e.getErrorString().c_str() );
		return;
	}

	// Render
	int current_launch = 0;
	try{
		printf("Begin render...\n");
		gettimeofday(&tic, NULL);
		for(current_launch=0;current_launch<launches;current_launch++) {
			printf("    ... %f percent\n", 100*((current_launch*1.0f*threads)/photons));
			context->launch(0 , threads );
		}
		done(tic);
	}catch(Exception& e){
		printf("Launch error on launch #%d!\n", current_launch);
		printf("    CUDA says  : %s\n",  cudaGetErrorString(cudaPeekAtLastError()));
		printf("    OptiX says : %s\n", e.getErrorString().c_str() );
		return;
	}

	#ifndef PHOTON_MPI

	#endif /* If not MPI */
	#ifdef PHOTON_MPI

		// Create MPI handles
		accImg = new Image(img->getWidth(), img->getHeight());
		MPI::Win window_r;
		MPI::Win window_g;
		MPI::Win window_b;

		// Construct an MPI Window to copy some data into, one for each colour.
		int size_in_bytes = sizeof(float)*img->getWidth()*img->getHeight();
		window_r = MPI::Win::Create(accImg->imageR, size_in_bytes, sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD);
		window_g = MPI::Win::Create(accImg->imageG, size_in_bytes, sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD);
		window_b = MPI::Win::Create(accImg->imageB, size_in_bytes, sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD);

		// Perform transfer
		window_r.Fence(0);
		window_g.Fence(0);
		window_b.Fence(0);
		window_r.Accumulate(
				img->imageR,
				img->getWidth()*img->getHeight(),
				MPI_FLOAT,
				0,
				0,
				img->getWidth()*img->getHeight(),
				MPI_FLOAT,
				MPI_SUM
		);
		window_g.Accumulate(
				img->imageG,
				img->getWidth()*img->getHeight(),
				MPI_FLOAT,
				0,
				0,
				img->getWidth()*img->getHeight(),
				MPI_FLOAT,
				MPI_SUM
		);
		window_b.Accumulate(
				img->imageB,
				img->getWidth()*img->getHeight(),
				MPI_FLOAT,
				0,
				0,
				img->getWidth()*img->getHeight(),
				MPI_FLOAT,
				MPI_SUM
		);
		window_r.Fence(0);
		window_g.Fence(0);
		window_b.Fence(0);
		window_r.Free();
	#endif /* MPI */
	// Output the image
	if(rank==0) {
		// Construct filename
		char sbuffer[100];
		sprintf(sbuffer, "photons-%d.ppm", 0);
		// This is the collected image data on the host
		optix::float4* img_host_ptr = (optix::float4*) malloc(width*height*sizeof(optix::float4));
		// If we have more than one device we have to accumulate everything back into one buffer
		if(num_devices == 1) {
			cudaMemcpy(img_host_ptr, imgs_ptr[0], width*height*sizeof(optix::float4), cudaMemcpyDeviceToHost);
		} else {
			printf("We are using %d GPUs, accumulating result...", num_devices);
			gettimeofday(&tic, NULL);
			// Create an accumulate buffer on GPU #0#
// 			int device_id = enabled_devices[0];
// 			cudaSetDevice(device_id);
// 			optix::float4* accumulate_dev_ptr;
// 			cudaMalloc((void **)&accumulate_dev_ptr, width*height*sizeof(optix::float4));
// 			// put the array of memory ptrs on device 0
// 			optix::float4** ptrs_dev_ptr;
// 			cudaMalloc((void **)&ptrs_dev_ptr, num_devices*sizeof(optix::float4*));
// 			// Copy data over
// 			cudaMemcpy(ptrs_dev_ptr, imgs_ptr, num_devices*sizeof(optix::float4*), cudaMemcpyHostToDevice);
// 			CUDAWrapper executer;
// 			executer.img_accumulate((void ***)&ptrs_dev_ptr, (void **)&accumulate_dev_ptr, num_devices, width, height);
// 			cudaMemcpy(img_host_ptr, accumulate_dev_ptr, width*height*sizeof(optix::float4), cudaMemcpyDeviceToHost);
			// Copy everything to host and accumulate here
			// num_devices is not a compile-time constant; allocate the
			// pointer table on the heap instead of relying on a VLA extension
			optix::float4** host_buffers = (optix::float4**) malloc(num_devices*sizeof(optix::float4*));
			for(int i=0;i<num_devices;i++) {
				host_buffers[i] = (optix::float4*) malloc(width*height*sizeof(optix::float4));
				cudaMemcpy(host_buffers[i], imgs_ptr[i], width*height*sizeof(optix::float4), cudaMemcpyDeviceToHost);
			}
			// Accumulate the per-device buffers into the output image
			for(int i=0;i<width*height;i++) {
				img_host_ptr[i] = make_float4(0, 0, 0, 0);
				for(int j=0;j<num_devices;j++) {
					img_host_ptr[i].x += host_buffers[j][i].x;
					img_host_ptr[i].y += host_buffers[j][i].y;
					img_host_ptr[i].z += host_buffers[j][i].z;
					img_host_ptr[i].w += host_buffers[j][i].w;
				}
			}
			for(int i=0;i<num_devices;i++) {
				free(host_buffers[i]);
			}
			free(host_buffers);
			done(tic);
		}
		printf("Saving Image to %s...\n", sbuffer);
		gettimeofday(&tic, NULL);
		saveToPPMFile(sbuffer, img_host_ptr, width, height);
		free(img_host_ptr);
		done(tic);
	}
	#ifdef PHOTON_MPI
		// Teardown MPI
		MPI::Finalize();
	#endif /* MPI */
}
Example #17
int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	int dev_id = device_map[thr_id];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[19];
	const int swap = 1; // to toggle nonce endian

	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 256*256*8
	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

	if (opt_benchmark)
		((uint32_t*)ptarget)[7] = 0;

	if (!init[thr_id])
	{
		cudaSetDevice(dev_id);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			CUDA_LOG_ERROR();
		}

		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));

		quark_skein512_cpu_init(thr_id, throughput);
		cuda_check_cpu_init(thr_id, throughput);

		CUDA_SAFE_CALL(cudaDeviceSynchronize());

		init[thr_id] = true;
	}

	uint32_t endiandata[20];
	// byte-swap the first 19 header words; the nonce (word 19) is set per launch
	for (int k=0; k < 19; k++)
		be32enc(&endiandata[k], pdata[k]);

	skein512_cpu_setBlock_80((void*)endiandata);
	cuda_check_cpu_setTarget(ptarget);

	do {
		int order = 0;

		// Hash with CUDA
		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], swap);
		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

		*hashes_done = pdata[19] - first_nonce + throughput;

		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
		if (foundNonce != UINT32_MAX)
		{
			uint32_t _ALIGN(64) vhash64[8];

			endiandata[19] = swab32_if(foundNonce, swap);
			skein2hash(vhash64, endiandata);

			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
				int res = 1;
				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
				work_set_target_ratio(work, vhash64);
				if (secNonce != 0) {
					if (!opt_quiet)
						applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", dev_id, swab32(secNonce));

					endiandata[19] = swab32_if(secNonce, swap);
					skein2hash(vhash64, endiandata);
					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio)
						work_set_target_ratio(work, vhash64);
					pdata[21] = swab32_if(secNonce, !swap);
					res++;
				}
				pdata[19] = swab32_if(foundNonce, !swap);
				return res;
			} else {
				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
			}
		}

		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}

		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce;

	return 0;
}
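The swab32_if() calls above toggle the nonce's byte order only when the `swap` flag is set. The helper itself is not shown in this example; a plausible definition consistent with its use here (an assumption, not taken from this source) is:

static inline uint32_t swab32_if(uint32_t val, bool iftrue)
{
	// byte-swap val only when the condition holds, else pass it through
	return iftrue ? __builtin_bswap32(val) : val;
}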
Example #18
extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, uint32_t *ptarget,
    uint32_t max_nonce, uint32_t *hashes_done)
{
	static THREAD uint32_t *foundNounce = nullptr;

    uint32_t start_nonce = pdata[19];
	unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 24 : 23;
	uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << intensity);
	uint32_t throughput = min(throughputmax, max_nonce - start_nonce) & 0xfffffc00; // round down to a multiple of 1024

    if (opt_benchmark)
        ptarget[7] = 0x0000000f;

    // init
	static THREAD volatile bool init = false;
	if(!init)
    {
		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
		cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
		cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
		CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id]));

		groestlcoin_cpu_init(thr_id, throughputmax);
		CUDA_SAFE_CALL(cudaMallocHost(&foundNounce, 2 * sizeof(uint32_t)));
		init = true;
    }

    // Endian swap of the block header is required
    uint32_t endiandata[32];
    for (int kk=0; kk < 32; kk++)
        be32enc(&endiandata[kk], pdata[kk]);

    // Prepare the context with the endian-swapped block header (the nonce is patched in later)
    groestlcoin_cpu_setBlock(thr_id, endiandata);

	do
	{
		// GPU
		const uint32_t Htarg = ptarget[7];

		groestlcoin_cpu_hash(thr_id, throughput, pdata[19], foundNounce, ptarget[7]);

		if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);}
		if(foundNounce[0] < 0xffffffff)
		{
			uint32_t tmpHash[8];
			endiandata[19] = SWAP32(foundNounce[0]);
			groestlhash(tmpHash, endiandata);

			if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget))
			{
				int res = 1;
				if(opt_benchmark)
					applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], foundNounce[0]);
				*hashes_done = pdata[19] - start_nonce + throughput;
				if(foundNounce[1] != 0xffffffff)
				{
					endiandata[19] = SWAP32(foundNounce[1]);
					groestlhash(tmpHash, endiandata);
					if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget))
					{
						pdata[21] = foundNounce[1];
						res++;
						if(opt_benchmark)
							applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], foundNounce[1]);
					}
					else
					{
						if(tmpHash[7] != Htarg)
						{
							applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[1]);
						}
					}
				}
				pdata[19] = foundNounce[0];
				return res;
			}
			else
			{
				if(tmpHash[7] != Htarg)
				{
					applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[0]);
				}
			}
		}

		pdata[19] += throughput;
		cudaError_t err = cudaGetLastError();
		if(err != cudaSuccess)
		{
			applog(LOG_ERR, "GPU #%d: %s", device_map[thr_id], cudaGetErrorString(err));
			exit(EXIT_FAILURE);
		}
	} while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));

    *hashes_done = pdata[19] - start_nonce;
    return 0;
}
Example #19
/**
 * @ingroup virginian
 * @brief Initializes the Virginian database
 *
 * Initializes or re-initializes the struct that holds the
 * state of the database. It sets a number of options via defaults hard-coded or
 * defined in virginian.h. Additionally, this function is responsible for
 * allocating the tablet memory areas in both main memory and GPU memory. If the
 * VIRG_DEBUG macro is defined, both the state struct and the tablet memory
 * areas are filled with a sentinel byte pattern (memset() keeps only the low
 * byte of 0xDEADBEEF, so the bytes written are 0xEF). Finally, this function
 * also initializes the CUDA context. The allocations made in this function
 * are freed with virg_close().
 *
 * @param v Pointer to the state struct of the database system
 * @return VIRG_SUCCESS or VIRG_FAIL depending on errors during the function
 * call
 */
int virg_init(virginian *v)
{
	int i;

	cudaError_t r;

#ifdef VIRG_DEBUG
	// fill the db struct with sentinel bytes for valgrind
	// (memset keeps only the low byte of 0xDEADBEEF, i.e. 0xEF)
	memset(&v->db, 0xDEADBEEF, sizeof(virg_db));
#endif

	// set struct defaults
	v->tablet_slot_counter = 0;
	v->tablet_slots_taken = 0;
	v->threads_per_block = VIRG_THREADSPERBLOCK;
	v->multi_threads = VIRG_MULTITHREADS;
	v->use_multi = 0;
	v->use_gpu = 0;
	v->use_stream = 0;
	v->use_mmap = 0;
	v->dbfd = -1;

	// init mutex for locking tablet slots
	VIRG_CHECK(pthread_mutex_init(&v->slot_lock, NULL), "Could not init mutex")

	// Set the device flags in a single call *before* the CUDA context is
	// created: successive cudaSetDeviceFlags() calls overwrite rather than
	// OR the previous flags, so only the last of several calls would take
	// effect. Note also that the schedule flags (spin, yield, blocking sync)
	// are mutually exclusive policies. Mapped host memory plus blocking sync
	// enables zero-copy while keeping CPU usage low.
	cudaSetDeviceFlags(cudaDeviceMapHost | cudaDeviceScheduleBlockingSync);

	VIRG_CUDCHK("set device flags");

	// initialize cuda context
	cudaSetDevice(VIRG_CUDADEVICE);
	VIRG_CUDCHK("set device");

	// initialize tablets
	for(i = 0; i < VIRG_MEM_TABLETS; i++) {
		v->tablet_slot_status[i] = 0;

		// if pinned, then use cuda alloc
#ifndef VIRG_NOPINNED
		r = cudaHostAlloc((void**)&v->tablet_slots[i], VIRG_TABLET_SIZE, cudaHostAllocMapped);
		VIRG_CUDCHK("Allocating pinned tablet memory");
#else
		v->tablet_slots[i] = malloc(VIRG_TABLET_SIZE);
		VIRG_CHECK(v->tablet_slots[i] == NULL, "Problem allocating tablet memory");
#endif

#ifdef VIRG_DEBUG
		// this should only be read when status is nonzero
		v->tablet_slot_ids[i] = 0;

		// fill tablet memory with a sentinel pattern for debugging
		memset(v->tablet_slots[i], 0xDEADBEEF, VIRG_TABLET_SIZE);
#endif
	}

	if(VIRG_GPU_TABLETS > 0) {
		// initialize gpu tablets as a single block
		r = cudaMalloc((void**)&v->gpu_slots, VIRG_TABLET_SIZE * VIRG_GPU_TABLETS);
		VIRG_CHECK(r != cudaSuccess, "Problem allocating GPU tablet memory");
#ifdef VIRG_DEBUG
		cudaMemset(v->gpu_slots, 0xDEADBEEF, VIRG_TABLET_SIZE * VIRG_GPU_TABLETS);
#endif
	}

	return VIRG_SUCCESS;
}
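As the comment block notes, the allocations made by virg_init() are released with virg_close(). A minimal usage sketch, assuming only the names documented above (error handling elided):

#include "virginian.h"

int main(void)
{
	virginian v;

	// set defaults, allocate host and GPU tablet slots, create the CUDA context
	if (virg_init(&v) != VIRG_SUCCESS)
		return 1;

	// ... open a database and run queries here ...

	// free the tablet slots and tear down the CUDA context
	virg_close(&v);
	return 0;
}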
Example #20
cudaError_t WINAPI wine_cudaSetDeviceFlags( unsigned int flags ) {
    WINE_TRACE("\n");
    return cudaSetDeviceFlags( flags );
}
// This is the GPU thread where we do the per-GPU tasks.
void GRTRegenerateChains::GPU_Thread(void *pointer) {
    struct GRTRegenerateThreadRunData *data;
    GRTWorkunitElement *WU;

    data = (GRTRegenerateThreadRunData *) pointer;

    // Set the device.
    cudaSetDevice(data->gpuDeviceId);

    // Enable blocking sync.  This dramatically reduces CPU usage.
    // If zero copy is being used, set DeviceMapHost as well
    if (this->CommandLineData->GetUseZeroCopy()) {
        cudaSetDeviceFlags(cudaDeviceBlockingSync | cudaDeviceMapHost);
    } else {
        cudaSetDeviceFlags(cudaDeviceBlockingSync);
    }

    //printf("Copying to GPU mem\n");
    this->AllocatePerGPUMemory(data);
    this->copyDataToConstant(data);
   // printf("Back from copy constant\n");
    cudaDeviceSynchronize();

    // I... *think* we're ready to rock!
    // As long as we aren't supposed to exit, keep running.
    while (1) {
        WU = this->Workunit->GetNextWorkunit();
        if (WU == NULL) {
            if (!silent) {
                if (this->Display) {
                    sprintf(this->statusStrings, "Thread %d out of WU", data->threadID);
                    this->Display->addStatusLine(this->statusStrings);
                } else {
                    printf("Thread %d out of workunits\n", data->threadID);
                }
            }
            break;
        }
        if (this->HashFile->GetUncrackedHashCount() == 0) {
            if (!silent) {
                if (this->Display) {
                    sprintf(this->statusStrings, "All hashes found");
                    this->Display->addStatusLine(this->statusStrings);
                } else {
                    printf("Thread %d no unfound hashes left!\n", data->threadID);
                }
            }
            break;
        }

        this->RunGPUWorkunit(WU, data);
        this->Workunit->SubmitWorkunit(WU);
        if (this->Display) {
            this->Display->setWorkunitsCompleted(this->Workunit->GetNumberOfCompletedWorkunits());
            this->Display->setThreadFractionDone(data->threadID, 0.0);
        }
    }
    if (this->Display) {
        this->Display->setThreadCrackSpeed(data->threadID, 0, 0.0);
    }
    this->FreePerGPUMemory(data);
    // Clean up thread context for subsequent setting of thread ID
    cudaDeviceReset();
}