Ejemplo n.º 1
0
void mexFunction(int nlhs, mxArray *plhs[],
                 int nrhs, const mxArray *prhs[]) {

  CUresult cudastatus = CUDA_SUCCESS;

  if (nrhs != 2)
    mexErrMsgTxt("Wrong number of arguments");


  if (init == 0) {
    // Initialize function
    //mexLock();

    // load GPUmat
    gm = gmGetGPUmat();


    init = 1;
  }

  // mex parameters are:
  GPUtype A = gm->gputype.getGPUtype(prhs[0]);
  GPUtype B = gm->gputype.getGPUtype(prhs[1]);

  int r[] = {3,4,6,1};
  gm->gputype.mxAssign(A, B, Range(3,r), 0);


}
Ejemplo n.º 2
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  // At least 1 arguments expected
  // Input and result
  if (nrhs!=1)
     mexErrMsgTxt("Wrong number of arguments");
  if (init == 0) {
    // Initialize function
    //mexLock();
    // load GPUmat
    gm = gmGetGPUmat();
    gmCheckGPUmat(gm);
    init = 1;
  }
  // log
  gm->debug.log("> REAL\n",0);
  gm->debug.logPush();
  // mex parameters are:
  // IN1
  GPUtype IN1  = gm->gputype.getGPUtype(prhs[0]);
  if (gm->comp.getCompileMode() == 1) {
    GPUtype R = gm->gputype.create(gpuFLOAT, 0, NULL, NULL);
    gm->comp.pushGPUtype(&R);
    gm->comp.functionStart("GPUMAT_RealDrv");
    gm->comp.functionSetParamGPUtype(&R);
    gm->comp.functionSetParamGPUtype(&IN1);
    gm->comp.functionEnd();
    plhs[0] = gm->gputype.createMxArray(R);
  } else {
    GPUtype R = gm->numerics.RealDrv(IN1);
    plhs[0] = gm->gputype.createMxArray(R);
  }
  gm->debug.logPop();
}
Ejemplo n.º 3
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  // At least 2 arguments expected
  // Input and result
  if (nrhs!=2)
     mexErrMsgTxt("Wrong number of arguments");
  if (init == 0) {
    // Initialize function
    //mexLock();
    // load GPUmat
    gm = gmGetGPUmat();
    gmCheckGPUmat(gm);
    init = 1;
  }
  // log
  gm->debug.log("> GPUuminus\n",0);
  gm->debug.logPush();
  // mex parameters are:
  // IN1
  // OUT
  GPUtype IN1  = gm->gputype.getGPUtype(prhs[0]);
  GPUtype OUT  = gm->gputype.getGPUtype(prhs[1]);
  if (gm->comp.getCompileMode() == 1) {
    gm->comp.functionStart("GPUMAT_Uminus");
    gm->comp.functionSetParamGPUtype(&OUT);
    gm->comp.functionSetParamGPUtype(&IN1);
    gm->comp.functionEnd();
  } else {
    gm->numerics.Uminus(IN1,OUT);
  }
  gm->debug.logPop();
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  // At least 1 arguments expected
  // Input and result
  if (nrhs!=1)
     mexErrMsgTxt("Wrong number of arguments");
  if (init == 0) {
    // Initialize function
    //mexLock();
    // load GPUmat
    gm = gmGetGPUmat();
    gmCheckGPUmat(gm);
    init = 1;
  }
  // log
  gm->debug.log("> mxToGPUtype\n",0);
  gm->debug.logPush();
  // mex parameters are:
  // MX
  GPUtype IN1  = gm->gputype.mxToGPUtype(prhs[0]);

  if (gm->comp.getCompileMode() == 1) {
    gm->comp.abort(ERROR_NUMERICS_COMPNOTIMPLEMENTED);

    /*GPUtype R = gm->gputype.create(gpuFLOAT, 0, NULL, NULL);
    gm->comp.pushGPUtype(&R);
    gm->comp.functionStart("GPUMAT_mxToGPUtype");
    gm->comp.functionSetParamGPUtype(&R);
    gm->comp.functionSetParamMx(prhs[0]);
    gm->comp.functionEnd();
    plhs[0] = gm->gputype.createMxArray(R);*/
  } else {
    plhs[0] = gm->gputype.createMxArray(IN1);
  }
  gm->debug.logPop();
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  // At least 1 arguments expected
  // Input and result
  if (nrhs!=2)
     mexErrMsgTxt("Wrong number of arguments");
  if (init == 0) {
    // Initialize function
    //mexLock();
    // load GPUmat
    gm = gmGetGPUmat();
    gmCheckGPUmat(gm);
    init = 1;
  }
  // log
  gm->debug.log("> GPUdoubleToGPUsinglePtr\n",0);
  gm->debug.logPush();
  // mex parameters are:
  // IN1
  GPUtype IN1  = gm->gputype.getGPUtype(prhs[1]);
  int isdouble = gm->gputype.isDouble(IN1);
  if (!isdouble) {
    mexErrMsgTxt(ERROR_CASTING_GPUDOUBLE);
  }
  if (gm->comp.getCompileMode() == 1) {
    gm->comp.abort(ERROR_NUMERICS_COMPNOTIMPLEMENTED);
  } else {
    GPUtype R = gm->gputype.doubleToFloat(IN1);
    plhs[0] = gm->gputype.createMxArrayPtr(prhs[0], R);
  }
  gm->debug.logPop();
}
Ejemplo n.º 6
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
    // At least 2 arguments expected
    // Input and result
    if ((nrhs!=2) || (nlhs != 1))
        mexErrMsgTxt("Wrong number of arguments");
    if (init == 0) {
        gm = gmGetGPUmat();
        init = 1;
    }
    /* mex parameters are:
     0 array 1
     1 array 2
     This function is a wrapper for the CUBLAS double-prec dot product function pending newer matcuda
     */
    GPUtype arrayA = gm->gputype.getGPUtype(prhs[0]);
    GPUtype arrayB = gm->gputype.getGPUtype(prhs[1]);

    int numElements = gm->gputype.getNumel(arrayA);
    if (numElements != gm->gputype.getNumel(arrayB)) mexErrMsgTxt("Arrays contain different numbers of elements.\n");

    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); //mxReal is our data-type
    double *AdotB = mxGetPr(plhs[0]);

    void *pointerA = (void *)gm->gputype.getGPUptr(arrayA);
    void *pointerB = (void *)gm->gputype.getGPUptr(arrayB);

    double *u = (double*)pointerA;
    double *v = (double *)pointerB;

    AdotB[0] = cublasDdot(numElements, u, 1, v, 1);

}
Ejemplo n.º 7
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  //CUresult cudastatus = CUDA_SUCCESS;

  // simple garbage collection
  MyGCObj<Range> mygc1;

  // more than 4 arguments expected
  if (nrhs != 7)
    mexErrMsgTxt(ERROR_GPUFILL_WRONGARGS);

  if (init == 0) {
    // Initialize function
    //mexLock();

    // load GPUmat
    gm = gmGetGPUmat();

    // check gm
    gmCheckGPUmat(gm);

    // load module
    // NO MODULE REQUIRED

    // load float GPU function
    // NO FUNCTION REQUIRED

    init = 1;
  }

  // log
  gm->debug.log("> GPUfill\n",0);
  gm->debug.logPush();

  // mex parameters are:
  // DST: Destination GPUtype variable
  // offset
  // incr
  // m
  // p
  // type

  GPUtype DST = gm->gputype.getGPUtype(prhs[0]);
  if (gm->comp.getCompileMode() == 1) {

    gm->comp.functionStart("GPUMAT_mxFill");
    gm->comp.functionSetParamGPUtype(&DST);
    gm->comp.functionSetParamMxMx(nrhs-1, &(prhs[1]));
    gm->comp.functionEnd();

  } else {
    gm->gputype.mxFill(DST, nrhs-1, &(prhs[1]));
  }
  gm->debug.logPop();


}
Ejemplo n.º 8
0
/*
 * Initializes numerics MODULE.
 * 1) Load GPU kernels
 * 2) Register functions in GPUmat structure
 *
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  if (nrhs != 0)
    mexErrMsgTxt("Wrong number of arguments");

  if (init == 0) {
    // Initialize function
    mexLock();

    // load GPUmat
    gm = gmGetGPUmat();

    // load module
    // nothing to load

    //************************************************************************
    // INIT
    //************************************************************************

    seed = 0;
    // init curand
    //if (curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT)!=CURAND_STATUS_SUCCESS) {
    //  mexErrMsgTxt(ERROR_CURAND_INIT);
    //}


    //************************************************************************
    // REGISTER FUNCTION IN GPUMAT STRUCTURE
    //************************************************************************

    // put here functions to be registered
    gm->rand.rand      = GPUrand;
    gm->rand.mxRandDrv = GPUmxRandDrv;

    gm->rand.randn      = GPUrandn;
    gm->rand.mxRandnDrv = GPUmxRandnDrv;


    //************************************************************************
    // UPDATE FLAGS
    //************************************************************************
    gm->mod.rand = 1; // rand module was loaded


    init = 1;
  }

}
Ejemplo n.º 9
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  // At least 2 arguments expected
  // Input1, Input2
  if (nrhs!=2)
     mexErrMsgTxt("Wrong number of arguments");
  if (init == 0) {
    // Initialize function
    //mexLock();
    // load GPUmat
    gm = gmGetGPUmat();
    gmCheckGPUmat(gm);
    init = 1;
  }
  // log
  gm->debug.log("> MRDIVIDE\n",0);
  gm->debug.logPush();
  // mex parameters are:
  // IN1
  // IN2
  GPUtype IN1  = gm->gputype.getGPUtype(prhs[0]);
  GPUtype IN2  = gm->gputype.getGPUtype(prhs[1]);
  
  int in1_iscalar =  gm->gputype.isScalar(IN1);
  int in2_iscalar =  gm->gputype.isScalar(IN2);

  if (in1_iscalar||in2_iscalar) {
  } else  {
    mexErrMsgTxt(ERROR_MRDIVIDE_MATRICES);
  }
  if (gm->comp.getCompileMode() == 1) {
    GPUtype R = gm->gputype.create(gpuFLOAT, 0, NULL, NULL);
    gm->comp.pushGPUtype(&R);

    gm->comp.functionStart("GPUMAT_RdivideDrv");
    gm->comp.functionSetParamGPUtype(&R);
    gm->comp.functionSetParamGPUtype(&IN1);
    gm->comp.functionSetParamGPUtype(&IN2);
    gm->comp.functionEnd();
    plhs[0] = gm->gputype.createMxArray(R);
  } else {
    GPUtype R    = gm->numerics.RdivideDrv(IN1,IN2);
    plhs[0] = gm->gputype.createMxArray(R);
  }
  gm->debug.logPop();
}
Ejemplo n.º 10
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  // At least 1 arguments expected
  // Input and result
  if (nrhs!=1)
     mexErrMsgTxt("Wrong number of arguments");
  if (init == 0) {
    // Initialize function
    //mexLock();
    // load GPUmat
    gm = gmGetGPUmat();
    init = 1;
  }
  // mex parameters are:
  // IN1
  GPUtype IN1  = gm->gputype.getGPUtype(prhs[0]);
  GPUtype r    = gm->numerics.Log10Drv(IN1);
  plhs[0] = gm->gputype.createMxArray(r);
}
Ejemplo n.º 11
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  // At least 2 arguments expected
  // Input and result
  if (nrhs!=2)
     mexErrMsgTxt("Wrong number of arguments");
  if (init == 0) {
    // Initialize function
    //mexLock();
    // load GPUmat
    gm = gmGetGPUmat();
    init = 1;
  }
  // mex parameters are:
  // IN1
  // OUT
  GPUtype IN1  = gm->gputype.getGPUtype(prhs[0]);
  GPUtype OUT  = gm->gputype.getGPUtype(prhs[1]);
  gm->numerics.Transpose(IN1,OUT);
}
Ejemplo n.º 12
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  CUresult cudastatus = CUDA_SUCCESS;

  if (nrhs != 1)
    mexErrMsgTxt("Wrong number of arguments");

  if (init == 0) {
    // Initialize function
    //mexLock();

    // load GPUmat
    gm = gmGetGPUmat();

    // check gm
    gmCheckGPUmat(gm);

    init = 1;
  }

  // log
  gm->debug.log("> GPUrandn\n",0);
  gm->debug.logPush();

  // mex parameters are:
  // 1. OUT
  //OUT is the output GPU array (result)
  GPUtype OUT = gm->gputype.getGPUtype(prhs[0]);

  if (gm->comp.getCompileMode() == 1) {
    gm->comp.functionStart("GPUMAT_Randn");
    gm->comp.functionSetParamGPUtype(&OUT);
    gm->comp.functionEnd();
  } else {
    gm->rand.randn(OUT);
  }

  gm->debug.logPop();

}
Ejemplo n.º 13
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  CUresult cudastatus = CUDA_SUCCESS;

  // At least 2 arguments expected
  // Input and result
  if (nrhs!=2)
     mexErrMsgTxt("Wrong number of arguments");

  if (init == 0) {
    // Initialize function
    //mexLock();

    // load GPUmat
    gm = gmGetGPUmat();

    // load module
    // NOT REQUIRED

    // load float GPU function
    // NOT REQUIRED

    init = 1;
  }



  // mex parameters are:
  // IN
  // OUT

  GPUtype IN  = gm->gputype.getGPUtype(prhs[0]);
  GPUtype OUT = gm->gputype.getGPUtype(prhs[1]);


  gm->numerics.Exp(IN,OUT);

}
Ejemplo n.º 14
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  // At least 1 arguments expected
  // Input and result
  if (nrhs!=1)
     mexErrMsgTxt("Wrong number of arguments");
  if (init == 0) {
    // Initialize function
    //mexLock();
    // load GPUmat
    gm = gmGetGPUmat();
    gmCheckGPUmat(gm);
    init = 1;
  }
  // log
  gm->debug.log("> GPUdoubleToGPUsingle\n",0);
  gm->debug.logPush();
  // mex parameters are:
  // IN1
  GPUtype IN1  = gm->gputype.getGPUtype(prhs[0]);
  int isdouble = gm->gputype.isDouble(IN1);
  if (!isdouble) {
    mexErrMsgTxt(ERROR_CASTING_GPUDOUBLE);
  }
  if (gm->comp.getCompileMode() == 1) {
    GPUtype R = gm->gputype.create(gpuFLOAT, 0, NULL, NULL);
    gm->comp.pushGPUtype(&R);
    gm->comp.functionStart("GPUMAT_DoubleToFloat");
    gm->comp.functionSetParamGPUtype(&R);
    gm->comp.functionSetParamGPUtype(&IN1);
    gm->comp.functionEnd();
    plhs[0] = gm->gputype.createMxArray(R);
  } else {
    GPUtype R = gm->gputype.doubleToFloat(IN1);
    plhs[0] = gm->gputype.createMxArray(R);
  }
  gm->debug.logPop();
}
Ejemplo n.º 15
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  CUresult cudastatus = CUDA_SUCCESS;

  if (nrhs != 3)
    mexErrMsgTxt("Wrong number of arguments");

  if (init == 0) {
    // Initialize function
    //mexLock();

    // load GPUmat
		gm = gmGetGPUmat();

		// load module
		CUmodule *drvmod = gmGetModule("examples_numerics");

    // load float GPU function
    CUresult status = cuModuleGetFunction(&drvfunf, *drvmod, "TIMESF");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }

    // load complex GPU function
    status = cuModuleGetFunction(&drvfunc, *drvmod, "TIMESC");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }

    // load double GPU function
    status = cuModuleGetFunction(&drvfund, *drvmod, "TIMESD");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }

    // load complex GPU function
    status = cuModuleGetFunction(&drvfuncd, *drvmod, "TIMESCD");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }

    init = 1;
  }

  // mex parameters are:

  // 1. IN1
  // 2. IN2
  // 3. OUT

  //IN1 is the input GPU array
    GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]);

    //IN2 is the input GPU array
    GPUtype IN2 = gm->gputype.getGPUtype(prhs[1]);

    //OUT is the output GPU array (result)
    GPUtype OUT = gm->gputype.getGPUtype(prhs[2]);

    // number of elements
    int nin1 = gm->gputype.getNumel(IN1);
    int nin2 = gm->gputype.getNumel(IN2);
    int nout = gm->gputype.getNumel(OUT);

    gpuTYPE_t tin1 = gm->gputype.getType(IN1);
    gpuTYPE_t tin2 = gm->gputype.getType(IN2);
    gpuTYPE_t tout = gm->gputype.getType(OUT);

    // check input/out size and type
    if (nin1!=nin2)
      mexErrMsgTxt("Input arguments must have the same number of elements.");

    if (nin1!=nout)
      mexErrMsgTxt("Input and output arguments must have the same number of elements.");

    if (tin1!=tin2)
      mexErrMsgTxt("Input arguments must be of the same type.");

    if (tin1!=tout)
      mexErrMsgTxt("Input and output arguments must be of the same type.");

    // I need the pointers to GPU memory
    CUdeviceptr d_IN1  = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN1));
    CUdeviceptr d_IN2  = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN2));
    CUdeviceptr d_OUT = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(OUT));

  // The GPU kernel depends on the type of input/output
  CUfunction drvfun;
  if (tin1 == gpuFLOAT) {
    drvfun = drvfunf;
  } else if (tin1 == gpuCFLOAT) {
    drvfun = drvfunc;
  } else if (tin1 == gpuDOUBLE) {
    drvfun = drvfund;
  } else if (tin1 == gpuCDOUBLE) {
    drvfun = drvfuncd;
  }

  hostdrv_pars_t gpuprhs[3];
  int gpunrhs = 3;
  gpuprhs[0] = hostdrv_pars(&d_IN1,sizeof(d_IN1),__alignof(d_IN1));
  gpuprhs[1] = hostdrv_pars(&d_IN2,sizeof(d_IN2),__alignof(d_IN2));
  gpuprhs[2] = hostdrv_pars(&d_OUT,sizeof(d_OUT),__alignof(d_OUT));

  int N = nin1;

  hostGPUDRV(drvfun, N, gpunrhs, gpuprhs);

}
Ejemplo n.º 16
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  // At least 2 arguments expected
  // Input and result
  if (nrhs!=3)
     mexErrMsgTxt("Wrong number of arguments");
  if (init == 0) {
    // Initialize function
    //mexLock();
    // load GPUmat
    gm = gmGetGPUmat();
    init = 1;
  }
  // mex parameters are:
  // kernel_fft_rot
  // vis_fft
  // C

  GPUtype kernel_fft_rot  = gm->gputype.getGPUtype(prhs[0]);
  GPUtype vis_fft         = gm->gputype.getGPUtype(prhs[1]);
  GPUtype C               = gm->gputype.getGPUtype(prhs[2]);

  // SIZE
  const int *ks = gm->gputype.getSize(kernel_fft_rot);
  const int *vs = gm->gputype.getSize(vis_fft);
  const int *cs = gm->gputype.getSize(C);

  // DIMENSIONS
  int kn = gm->gputype.getNdims(kernel_fft_rot);
  int vn = gm->gputype.getNdims(vis_fft);
  int cn = gm->gputype.getNdims(C);

  // check size
  // first 2 dimensions must agree
  // kn is 4
  // vn is 3
  // cn is 3
  if ((kn!=4)||(vn!=3)||(cn!=3))
    mexErrMsgTxt("Check input parameters dimensions.");

  if ((ks[0]!=vs[0])||(ks[0]!=cs[0])||(ks[1]!=vs[1])||(ks[1]!=cs[1]))
  	mexErrMsgTxt("Check input parameters size.");

  if (ks[2]!=vs[2])
      mexErrMsgTxt("Check input parameters size.");

  if (ks[2]!=cs[2])
        mexErrMsgTxt("Check input parameters size.");

  for (int i=0;i<ks[3];i++) {
  	for (int j=0;j<vs[2];j++) {
  	  GPUtype tmp1 = gm->gputype.slice(vis_fft,
  				                          Range(0,1,END,
  				                          Range(0,1,END,
  				                          Range(j,0,0))));
  		GPUtype tmp2 = gm->gputype.slice(kernel_fft_rot,
  				                              Range(0,1,END,
  				                              Range(0,1,END,
  				                              Range(j,0,0,
  				                              Range(i,0,0)))));
  		gm->numerics.Times(tmp1,tmp2,tmp1);
  		gm->numerics.Real(gm->fft.IFFT2Drv(tmp1),tmp2);

  		tmp1 = gm->gputype.slice(C,
                               Range(0,1,END,
                               Range(0,1,END,
                               Range(j,0,0))));

  		gm->numerics.Plus(tmp1,tmp2,tmp1);
  		gm->gputype.assign(C, tmp1,
                         Range(0,1,END,
                         Range(0,1,END,
                         Range(j,0,0))),1);


  	}
  }



}
Ejemplo n.º 17
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  CUresult cudastatus = CUDA_SUCCESS;

  if (nrhs != 3) 
    mexErrMsgTxt("Wrong number of arguments");

  if (init == 0) {
    // Initialize function
    //mexLock();

    // load GPUmat
    gm = gmGetGPUmat();

    // load module
    CUmodule *drvmod = gmGetModule("misc");

    // load float GPU function
    CUresult status = cuModuleGetFunction(&drvfunf, *drvmod, "gpuNCAreg");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }

    init = 1;
  }

  // mex parameters are:

  // 1. IN1 (in)
  // 2. IN2 (w)
  // 3. OUT (out)

  //IN is the input GPU array
  GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]);
  gpuTYPE_t tIn1 = gm->gputype.getType(IN1);

  //IN is the input GPU array
  GPUtype IN2 = gm->gputype.getGPUtype(prhs[1]);
  gpuTYPE_t tIn2 = gm->gputype.getType(IN2);

  //IN is the input GPU array
  GPUtype OUT = gm->gputype.getGPUtype(prhs[2]);
  gpuTYPE_t tOut = gm->gputype.getType(OUT);

  //dimensions
  const int * sIn1 = gm->gputype.getSize(IN1);
  const int * sIn2 = gm->gputype.getSize(IN2);
  const int * sOut = gm->gputype.getSize(OUT);

  int m1 = sIn1[0]; //# of dimensions
  int n1 = sIn1[1]; //# of cases

  int n2a = sIn2[0]; //# of cases
  int n2b = sIn2[1]; //# of cases

  int sout1 = sOut[0]; //# of cases
  int sout2 = sOut[1]; //# of dimensions

  if ( (n1!=n2a) || (n2a!=n2b) || (n1!=sout1) )
    mexErrMsgTxt("Number of cases must be consistent");

  if (m1!=sout2)
    mexErrMsgTxt("Number of dimensions must be consistent");

  /* Output for debugging
     mexPrintf("numcases: %d numdims: %d\n",n,m);
     mexPrintf("output: %d x %d\n",sout1,sout2);
  */

  CUfunction drvfun;
  if ((tIn1 == gpuFLOAT) && (tIn2 == gpuFLOAT))
    drvfun = drvfunf;
  else {
    mexErrMsgTxt("Only singles are supported at present.");
  }

  // I need the pointers to GPU memory
  CUdeviceptr d_IN1  = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN1));
  CUdeviceptr d_IN2  = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN2));
  CUdeviceptr d_OUT = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(OUT));
  
  hostdrv_pars_t gpuprhs[3];
  int gpunrhs = 3;
  gpuprhs[0] = hostdrv_pars(&d_IN1,sizeof(d_IN1));
  gpuprhs[1] = hostdrv_pars(&d_IN2,sizeof(d_IN2));
  gpuprhs[2] = hostdrv_pars(&d_OUT,sizeof(d_OUT));

  //hostGPUDRV(drvfun, N, gpunrhs, gpuprhs);
  hostGPUPdist(drvfun, gpunrhs, gpuprhs, n1, m1); 
  
}
Ejemplo n.º 18
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
    
    CUresult cudastatus = CUDA_SUCCESS;
    
    if (nrhs < 2)
        mexErrMsgTxt("Wrong number of arguments, atleast two required.");
    
    
    if (init == 0) {
        // Initialize function
        //mexLock();
        
        // load GPUmat
        gm = gmGetGPUmat();
        
        // load module
        CUmodule *drvmod = gmGetModule("avg_pool");
        
        // load float GPU function
//         CUresult status = cuModuleGetFunction(&drvfunf, *drvmod, "MAXPOOLF");
        CUresult status = cuModuleGetFunction(&drvfunfnoind, *drvmod,  "AVGPOOLFNOIND");
        if (CUDA_SUCCESS != status) {
            mexErrMsgTxt("Unable to load user function.");
        }
        
//         status = cuModuleGetFunction(&drvfunfind, *drvmod,  "AVGPOOLFIND");
//         if (CUDA_SUCCESS != status) {
//             mexErrMsgTxt("Unable to load user function.");
//         }
        init = 1;
    }
    
    double *psize = mxGetPr(prhs[1]); // For the pool_size array.
    
    
    // mex parameters are:
    
    // 1. IN1
    // 2. IN2
    // 3. OUT
    
    //IN1 is the input GPU array
    GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]);
    
    
    
    
    
    //OUT is the output GPU array (result)
    //   GPUtype OUT = gm->gputype.getGPUtype(prhs[2]);
    
    // number of elements
    int nin1 = gm->gputype.getNumel(IN1);
//     int nin2 = gm->gputype.getNumel(IN2);
    
    // Get the number of dimensions.
    int ndims = gm->gputype.getNdims(IN1);
    
    gpuTYPE_t tin1 = gm->gputype.getType(IN1);
//     gpuTYPE_t tin2 = gm->gputype.getType(IN2);
    
    // create GPUtype, with given dimensions (same size as input).
    int* im_size;
    int* out_size;
    im_size = (int*) mxCalloc(ndims, sizeof(mwSize));
    out_size = (int*) mxCalloc(ndims, sizeof(mwSize));
    im_size = (int*) gm->gputype.getSize(IN1);
    
    // Calculate the reduced size (just spatially reduced in 2D pooling).
    out_size[0] = (int) ceil(((float)im_size[0])/((float)psize[0]));
    out_size[1] = (int) ceil(((float)im_size[1])/((float)psize[1]));
    for(int dim=ndims;dim>2;dim--){
        out_size[dim-1] = im_size[dim-1];
    }
    
//     mexErrMsgTxt("Right after out_size\n");
    GPUtype OUT = gm->gputype.create(tin1, ndims, out_size, NULL);
    
    // The kernel we ultimately call depends on if the indices were passed in.
    CUfunction drvfun;
    
//     GPUtype IND;
    
//     //IND is the input GPU index array
//     if(nrhs >=3){
//         IND = gm->gputype.getGPUtype(prhs[2]);
//         drvfun = drvfunfind;
//     }
//     
//     if(nrhs<3 || gm->gputype.isEmpty(IND)){ // If IND wasn't passed int we need to create it here.
//         IND = gm->gputype.create(tin1, ndims, out_size, NULL);
        drvfun = drvfunfnoind;
//     }
    
//     mexPrintf("After creating new array.\n");
    
    int nout = gm->gputype.getNumel(OUT);
    gpuTYPE_t tout = gm->gputype.getType(OUT);
//     mexPrintf("nout: %d, nin1: %d, out_size: %d x %d\n", nout, nin1, out_size[0], out_size[1]);
    
    if (tin1 !=gpuFLOAT)
        mexErrMsgTxt("Input must be GPUsingle");
    
    // check input/out size and type
//     if (nin1!=nin2)
//         mexErrMsgTxt("Input arguments must have the same number of elements.");
    
//       if (nin1!=nout)
//         mexErrMsgTxt("Input and output arguments must have the same number of elements.");
    
//     if (tin1!=tin2)
//         mexErrMsgTxt("Input arguments must be of the same type.");
    
    if (tin1!=tout)
        mexErrMsgTxt("Input and output arguments must be of the same type.");
    
    
    // I need the pointers to GPU memory
    CUdeviceptr d_IN1  = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN1));
//     CUdeviceptr d_IND  = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IND));
    CUdeviceptr d_OUT = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(OUT));
    
    // CUdeviceptr d_IN1  = (UINTPTR gm->gputype.getGPUptr(IN1));
    // CUdeviceptr d_IN2  = (UINTPTR gm->gputype.getGPUptr(IN2));
    // CUdeviceprt d_OUT = (UINTPTR gm->gputype.getGPUptr(OUT));
    
    
    hostdrv_pars_t gpuprhs[2];
    int gpunrhs = 2;
    gpuprhs[0] = hostdrv_pars(&d_IN1, sizeof(d_IN1), __alignof(d_IN1));
    gpuprhs[1] = hostdrv_pars(&d_OUT, sizeof(d_OUT), __alignof(d_OUT));
//     gpuprhs[2] = hostdrv_pars(&d_IND, sizeof(d_IND), __alignof(d_IND));
//     GPUtype pool_size = gm->gputype.mxToGPUtype(prhs[2]);
//     gpuprhs[3] = hostdrv_pars(&psize, sizeof(psize), __alignof(psize));
    
    
    int N = nout;
//         mexErrMsgTxt("just before avg pooling");

//      hostGPUDRV(drvfun, N, gpunrhs, gpuprhs);
    hostDriver(drvfun, N, gpunrhs, gpuprhs, (int)im_size[0], (int)im_size[1], (int)out_size[0], (int)out_size[1], (int)psize[0], (int)psize[1]);
    //   int numThreadsPerBlock = 256;
    //   const unsigned int numBlocks = (out_size[0] + numThreadsPerBlock - 1) / numThreadsPerBlock;
    
    //  PLUSF<<<numBlocks,numThreadsPerBlock>>>(N, 0, (float *)d_IN1,(float *)d_IN2,(float *)d_OUT);
    
    
    // Finally make the output available to MATLAB
    plhs[0] = gm->gputype.createMxArray(OUT);
//     if(nlhs>1)
//         plhs[1] = gm->gputype.createMxArray(IND);
    // Create empty output array.
//     if(nlhs>1){
//         GPUtype IND;
//         int* ind_size;    
//         ind_size = (int*) mxCalloc(2, sizeof(int));
//         ind_size[0] = 0;
//         ind_size[1] = 0;
//         IND = gm->gputype.create(tin1, 0, ind_size, NULL);
//         plhs[1] = gm->gputype.createMxArray(IND);
//     }
    
    
    
    
}
Ejemplo n.º 19
0
/*
 * Initializes numerics MODULE.
 * 1) Load GPU kernels
 * 2) Register functions in GPUmat structure
 *
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  if (nrhs != 0)
    mexErrMsgTxt("Wrong number of arguments");

  if (init == 0) {
    // Initialize function
    mexLock();

    // load GPUmat
    gm = gmGetGPUmat();

    // load module
    CUmodule *drvmod = gmGetModule("numerics");

    //************************************************************************
    // EYE GPU KERNELS
    //************************************************************************

    // load float GPU function
    CUresult status = cuModuleGetFunction(&EYEdrvfuns[N_EYEF], *drvmod, "EYEF");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }

    // load complex GPU function
    status = cuModuleGetFunction(&EYEdrvfuns[N_EYEC], *drvmod, "EYEC");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }

    // load double GPU function
    status = cuModuleGetFunction(&EYEdrvfuns[N_EYED], *drvmod, "EYED");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }

    // load complex GPU function
    status = cuModuleGetFunction(&EYEdrvfuns[N_EYEDC], *drvmod, "EYEDC");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }

    //************************************************************************
    // REGISTER FUNCTION IN GPUMAT STRUCTURE
    //************************************************************************

    // put here functions to be registered
    gm->gputype.mxRepmatDrv = GPUmxRepmatDrv;
    gm->gputype.mxPermuteDrv = GPUmxPermuteDrv;
    gm->gputype.mxEyeDrv = GPUmxEyeDrv;
    gm->gputype.eye = GPUeye;

    gm->gputype.mxZerosDrv = GPUmxZerosDrv;
    gm->gputype.zeros = GPUzeros;

    gm->gputype.mxOnesDrv = GPUmxOnesDrv;
    gm->gputype.ones = GPUones;

    gm->gputype.mxFill = GPUmxFill;

    gm->gputype.mxColonDrv = GPUmxColon;

    gm->gputype.mxMemCpyDtoD = GPUmxMemCpyDtoD;
    gm->gputype.mxMemCpyHtoD = GPUmxMemCpyHtoD;



    // aux
    gm->aux.mxAssign = GPUmxAssign;

    gm->aux.mxSliceDrv = GPUmxSliceDrv;


    //************************************************************************
    // UPDATE FLAGS
    //************************************************************************
    gm->mod.numerics = 1; // numerics module was loaded


    init = 1;
  }

}
Ejemplo n.º 20
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  CUresult cudastatus = CUDA_SUCCESS;

  if (nrhs != 3)
    mexErrMsgTxt("Wrong number of arguments");

  if (init == 0) {
    // Initialize function
    //mexLock();

    // load GPUmat
    gm = gmGetGPUmat();

    // load module
    CUmodule *drvmod = gmGetModule("subsample");

    //load appropriate GPU kernel (mangled name)
    CUresult status;

    status = cuModuleGetFunction(&subsample_noreduc_2T, *drvmod, "_Z18kSubsample_noreducILi2ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_2F, *drvmod, "_Z18kSubsample_noreducILi2ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_3T, *drvmod, "_Z18kSubsample_noreducILi3ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_3F, *drvmod, "_Z18kSubsample_noreducILi3ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_4T, *drvmod, "_Z18kSubsample_noreducILi4ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_4F, *drvmod, "_Z18kSubsample_noreducILi4ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_5T, *drvmod, "_Z18kSubsample_noreducILi5ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_5F, *drvmod, "_Z18kSubsample_noreducILi5ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_6T, *drvmod, "_Z18kSubsample_noreducILi6ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_6F, *drvmod, "_Z18kSubsample_noreducILi6ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_7T, *drvmod, "_Z18kSubsample_noreducILi7ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_7F, *drvmod, "_Z18kSubsample_noreducILi7ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_8T, *drvmod, "_Z18kSubsample_noreducILi8ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_8F, *drvmod, "_Z18kSubsample_noreducILi8ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_9T, *drvmod, "_Z18kSubsample_noreducILi9ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_9F, *drvmod, "_Z18kSubsample_noreducILi9ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_10T, *drvmod, "_Z18kSubsample_noreducILi10ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_10F, *drvmod, "_Z18kSubsample_noreducILi10ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_11T, *drvmod, "_Z18kSubsample_noreducILi11ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_11F, *drvmod, "_Z18kSubsample_noreducILi11ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_12T, *drvmod, "_Z18kSubsample_noreducILi12ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_12F, *drvmod, "_Z18kSubsample_noreducILi12ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_13T, *drvmod, "_Z18kSubsample_noreducILi13ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_13F, *drvmod, "_Z18kSubsample_noreducILi13ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_14T, *drvmod, "_Z18kSubsample_noreducILi14ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_14F, *drvmod, "_Z18kSubsample_noreducILi14ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_15T, *drvmod, "_Z18kSubsample_noreducILi15ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_15F, *drvmod, "_Z18kSubsample_noreducILi15ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_16T, *drvmod, "_Z18kSubsample_noreducILi16ELb1EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&subsample_noreduc_16F, *drvmod, "_Z18kSubsample_noreducILi16ELb0EEvPfS0_iii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }

    init = 1;
  }

  // mex parameters are:

  // 1. IN1
  // 2. OUT
  // 3. subsampling factor

  bool avoidBankConflicts = true; //hard-coded

  //IN1 is the input GPU array
  GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]);

  //OUT is the output GPU array (result)
  GPUtype OUT = gm->gputype.getGPUtype(prhs[1]);

  //last parameter is the filterSize (int)
  int factor = (int) mxGetScalar(prhs[2]);

  // number of elements
  int nin1 = gm->gputype.getNumel(IN1);
  int nout = gm->gputype.getNumel(OUT);

  //dimensions 
  const int * sin1 = gm->gputype.getSize(IN1);
  const int * sout = gm->gputype.getSize(OUT);

  int imgPixels = sin1[0];
  if ( floor(sqrt(float(imgPixels))) != sqrt(float(imgPixels)) )
    mexErrMsgTxt("Images not square");
  int imgSize = int(sqrt(imgPixels));

  if (imgSize <= factor)
    mexErrMsgTxt("imgSize must be > factor");
  if (imgSize % factor !=0)
    mexErrMsgTxt("imgSize must be evenly divisible by factor");
  if (factor > 16)
    mexErrMsgTxt("factor > 16");
  if (factor < 2)
    mexErrMsgTxt("factor < 2");
  if (imgSize > 512)
    mexErrMsgTxt("max imgSize: 512");

  int numRegions = nin1 / (factor*factor);
  int numRegionsY = (imgSize / factor) * sin1[1];
  if (nout != numRegions)
    mexErrMsgTxt("Target dimensions not consistent");


  int regionsXPerBlock = imgSize / factor;
  int numThreadsX = imgSize;
  int SHMEM_MAX = 8192; // don't use more than this much shmem
  int regionsYPerBlock = MIN(512 / numThreadsX, SHMEM_MAX / (4*imgSize)); // to avoid running out of shmem
//    regionsYPerBlock--;
  int regionsPerBlock = regionsYPerBlock * regionsXPerBlock;

  // this will avoid all bank conflicts but may (?) use up too much shmem
  int shmemPadX = avoidBankConflicts * (DIVUP(16,factor) + (regionsPerBlock % 16 == 0 ? 0 : 16 - regionsPerBlock % 16));
  //    shmemPadX = 0;
  int shmemY = factor, shmemX = regionsPerBlock + shmemPadX;
  int shmem = 4 * shmemX * shmemY;
  if (shmem == 0 || shmem > 16300) {
    // this really shouldn't happen and i've only put this here as a precautionary measure
    // to avoid getting mysteriously wrong results.
    mexErrMsgTxt("subsample: not enough shared memory!");
  }

  int numThreadsY = regionsYPerBlock;
  //    int blocks = numRegionsY / regionsYPerBlock;
  int blocksX = imgSize / factor, blocksY = DIVUP(sin1[1], regionsYPerBlock);
  if (blocksX >=65535 || blocksY >= 65535)
    mexErrMsgTxt("Exceeded max block size");

  //    assert(numRegionsY % regionsYPerBlock == 0);
  bool checkThreadBounds = numRegionsY % regionsYPerBlock != 0;
  //    printf("num regions y: %d, regions y per block: %d\n", numRegionsY, regionsYPerBlock);
  dim3 grid(blocksX, blocksY);
  dim3 threads(numThreadsX, numThreadsY);
  /*
  mexPrintf("grid: %ux%u, threads: %ux%u\n", grid.y, grid.x, threads.y, threads.x);
  mexPrintf("check bounds: %u\n", checkThreadBounds);
  mexPrintf("using %u bytes of shmem\n", shmem);
  */

  gpuTYPE_t tin1 = gm->gputype.getType(IN1);
  gpuTYPE_t tout = gm->gputype.getType(OUT);

  // check input/out size and type
  if (tin1!=tout)
    mexErrMsgTxt("Input and output arguments must be of the same type.");

  // I need the pointers to GPU memory
  CUdeviceptr d_IN1  = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN1));
  CUdeviceptr d_OUT = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(OUT));

  // // The GPU kernel depends on the type of input/output
  // CUfunction drvfun;
  // if (tin1 == gpuFLOAT) {
  //   drvfun = drvfunf;
  // } else 
  //   mexErrMsgTxt("Currently only single types supported.");

  hostdrv_pars_t gpuprhs[2];
  int gpunrhs = 2;
  gpuprhs[0] = hostdrv_pars(&d_IN1,sizeof(d_IN1));
  gpuprhs[1] = hostdrv_pars(&d_OUT,sizeof(d_OUT));

  //int N = nin1;

  if (factor == 2) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_2T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_2F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  } else if (factor == 3) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_3T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_3F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  } else if (factor == 4) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_4T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_4F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  } else if (factor == 5) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_5T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_5F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  } else if (factor == 6) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_6T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_6F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  } else if (factor == 7) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_7T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_7F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  } else if (factor == 8) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_8T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_8F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  } else if (factor == 9) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_9T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_9F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  } else if (factor == 10) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_10T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_10F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  } else if (factor == 11) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_11T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_11F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  } else if (factor == 12) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_12T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_12F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  } else if (factor == 13) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_13T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_13F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  } else if (factor == 14) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_14T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_14F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  } else if (factor == 15) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_15T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_15F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  } else if (factor == 16) {
    if (checkThreadBounds) {
      hostDriver(subsample_noreduc_16T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    } else {
      hostDriver(subsample_noreduc_16F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs);
    }
  }
    //cutilCheckMsg("kernel execution failed");
    
    //    if(factor == 4) {
    ////        kSubsample_reduc<4><<<grid, threads,4*numThreadsX*numThreadsY>>>(images->getDevData(), targets->getDevData(), imgSize, numRegionsY);
    //    }

}
Ejemplo n.º 21
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  CUresult cudastatus = CUDA_SUCCESS;

  if (nrhs != 2)
    mexErrMsgTxt("Wrong number of arguments");

  if (init == 0) {
    // Initialize function
    //mexLock();

    // load GPUmat
		gm = gmGetGPUmat();

		// load module
		CUmodule *drvmod = gmGetModule("examples_texture");


    // load float GPU function
    CUresult status = cuModuleGetFunction(&drvfunf, *drvmod, "LININTERF");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }

    // load double GPU function
    status = cuModuleGetFunction(&drvfund, *drvmod, "LININTERD");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }

    // load textures defined in module
    status = cuModuleGetTexRef(&texf, *drvmod, "texref_f1_a");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load texture.");
    }

    status = cuModuleGetTexRef(&texd, *drvmod, "texref_d1_a");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load texture.");
    }

    // no complex function support
    init = 1;
  }

  // mex parameters are:

  // 1. IN1. Input array
  // 2. IN2. Input indexes array

  //IN1 is the input GPU array
  GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]);

  //IN2 is the input GPU array
  GPUtype IN2 = gm->gputype.getGPUtype(prhs[1]);

  //OUT is the output GPU array (result)
  // Create of the same size of IN1
  gpuTYPE_t in1_t = gm->gputype.getType(IN1);
  int in1_d = gm->gputype.getNdims(IN1);
  const int * in1_s = gm->gputype.getSize(IN1);
  int in1_n = gm->gputype.getNumel(IN1);
  int in1_b = gm->gputype.getDataSize(IN1);

  gpuTYPE_t in2_t = gm->gputype.getType(IN2);
  int in2_d = gm->gputype.getNdims(IN2);
  const int * in2_s = gm->gputype.getSize(IN2);
  int in2_n = gm->gputype.getNumel(IN2);

  if ((in1_t==gpuCFLOAT) || (in1_t==gpuCDOUBLE)) {
    mexErrMsgTxt("Complex TYPE not supported");
  }

  if (in1_t != in2_t) {
    mexErrMsgTxt("Input arguments must be of the same type");
  }

  if (in1_n != in2_n) {
    mexErrMsgTxt("Input arguments must have the same number of elements");
  }

  //OUT is the output GPU array (result)
  // Create of the same size of IN1
  GPUtype OUT = gm->gputype.create(in1_t, in1_d, in1_s, NULL);



  // I need the pointers to GPU memory
  CUdeviceptr d_IN1  = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN1));
  CUdeviceptr d_IN2  = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN2));
  CUdeviceptr d_OUT = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(OUT));

  // The GPU kernel depends on the type of input/output
  CUfunction drvfun;
  CUtexref   drvtex;
  CUarray_format_enum drvtexformat;
  int drvtexnum;

  if (in1_t == gpuFLOAT) {
    drvfun = drvfunf;
    drvtex = texf;
    drvtexformat = CU_AD_FORMAT_FLOAT;
    drvtexnum = 1;
  } else if (in1_t == gpuDOUBLE) {
    drvfun = drvfund;
    drvtex = texd;
    drvtexformat = CU_AD_FORMAT_SIGNED_INT32;
    drvtexnum = 2;
  }

  if (CUDA_SUCCESS != cuTexRefSetFormat(drvtex, drvtexformat, drvtexnum)) {
    mexErrMsgTxt("Execution error (texture).");
  }
  if (CUDA_SUCCESS != cuTexRefSetAddress(NULL, drvtex, UINTPTR d_IN1, in1_n*in1_b)) {
    mexErrMsgTxt("Execution error (texture).");
  }

  if (CUDA_SUCCESS != cuParamSetTexRef(drvfun, CU_PARAM_TR_DEFAULT, drvtex)) {
    mexErrMsgTxt("Execution error (texture1).");
  }


  hostdrv_pars_t gpuprhs[2];
	int gpunrhs = 2;
	gpuprhs[0] = hostdrv_pars(&d_IN2,sizeof(d_IN2),__alignof(d_IN2));
	gpuprhs[1] = hostdrv_pars(&d_OUT,sizeof(d_OUT),__alignof(d_OUT));

	int N = in1_n;

	hostGPUDRV(drvfun, N, gpunrhs, gpuprhs);


	// return result
	plhs[0] = gm->gputype.createMxArray(OUT);


}
Ejemplo n.º 22
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  CUresult cudastatus = CUDA_SUCCESS;

  if (nrhs != 3)
    mexErrMsgTxt("Wrong number of arguments");

  if (init == 0) {
    // Initialize function
    //mexLock();

    // load GPUmat
    gm = gmGetGPUmat();

    // load module
    CUmodule *drvmod = gmGetModule("subsample");

    //load appropriate GPU kernel (mangled name)
    CUresult status;

    status = cuModuleGetFunction(&_supersampleMedium_2, *drvmod, "_Z18kSupersampleMediumILi2EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMedium_3, *drvmod, "_Z18kSupersampleMediumILi3EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMedium_4, *drvmod, "_Z18kSupersampleMediumILi4EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMedium_5, *drvmod, "_Z18kSupersampleMediumILi5EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMedium_6, *drvmod, "_Z18kSupersampleMediumILi6EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMedium_7, *drvmod, "_Z18kSupersampleMediumILi7EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMedium_8, *drvmod, "_Z18kSupersampleMediumILi8EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMedium_9, *drvmod, "_Z18kSupersampleMediumILi9EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMedium_10, *drvmod, "_Z18kSupersampleMediumILi10EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMedium_11, *drvmod, "_Z18kSupersampleMediumILi11EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMedium_12, *drvmod, "_Z18kSupersampleMediumILi12EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMedium_13, *drvmod, "_Z18kSupersampleMediumILi13EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMedium_14, *drvmod, "_Z18kSupersampleMediumILi14EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMedium_15, *drvmod, "_Z18kSupersampleMediumILi15EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMedium_16, *drvmod, "_Z18kSupersampleMediumILi16EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_2, *drvmod, "_Z23kSupersampleMediumLoopyILi2EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_3, *drvmod, "_Z23kSupersampleMediumLoopyILi3EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_4, *drvmod, "_Z23kSupersampleMediumLoopyILi4EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_5, *drvmod, "_Z23kSupersampleMediumLoopyILi5EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_6, *drvmod, "_Z23kSupersampleMediumLoopyILi6EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_7, *drvmod, "_Z23kSupersampleMediumLoopyILi7EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_8, *drvmod, "_Z23kSupersampleMediumLoopyILi8EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_9, *drvmod, "_Z23kSupersampleMediumLoopyILi9EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_10, *drvmod, "_Z23kSupersampleMediumLoopyILi10EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_11, *drvmod, "_Z23kSupersampleMediumLoopyILi11EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_12, *drvmod, "_Z23kSupersampleMediumLoopyILi12EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_13, *drvmod, "_Z23kSupersampleMediumLoopyILi13EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_14, *drvmod, "_Z23kSupersampleMediumLoopyILi14EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_15, *drvmod, "_Z23kSupersampleMediumLoopyILi15EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }
    status = cuModuleGetFunction(&_supersampleMediumLoopy_16, *drvmod, "_Z23kSupersampleMediumLoopyILi16EEvPfS0_ii");
    if (CUDA_SUCCESS != status) {
      mexErrMsgTxt("Unable to load user function.");
    }

    init = 1;
  }

  // mex parameters are:

  // 1. IN1
  // 2. OUT
  // 3. supersampling factor

  bool avoidBankConflicts = true; //hard-coded
  bool trans = false; //hard-coded

  //IN1 is the input GPU array
  GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]);

  //OUT is the output GPU array (result)
  GPUtype OUT = gm->gputype.getGPUtype(prhs[1]);

  //last parameter is the filterSize (int)
  int factor = (int) mxGetScalar(prhs[2]);

  // number of elements
  int nin1 = gm->gputype.getNumel(IN1);
  int nout = gm->gputype.getNumel(OUT);

  //dimensions 
  const int * sin1 = gm->gputype.getSize(IN1);
  const int * sout = gm->gputype.getSize(OUT);

  int imgPixels = sin1[0];
  if ( floor(sqrt(float(imgPixels))) != sqrt(float(imgPixels)) )
    mexErrMsgTxt("Images not square");
  int imgSize = int(sqrt(imgPixels));
  int numImages = sin1[1];
  

  /* if (imgSize <= factor)
     mexErrMsgTxt("imgSize must be > factor"); */
  if (factor > 16)
    mexErrMsgTxt("factor > 16");
  if (factor < 2)
    mexErrMsgTxt("factor < 2");
  if (imgSize > 512)
    mexErrMsgTxt("max imgSize: 512");
  if (imgSize < 1)
    mexErrMsgTxt("min imgSize: 1");

  int targetPixels = sout[0];
  if ( floor(sqrt(float(targetPixels))) != sqrt(float(targetPixels)) )
    mexErrMsgTxt("Targets not square");
  int targetSize = int(sqrt(targetPixels));
  if (targetSize % factor !=0)
    mexErrMsgTxt("imgSize must be evenly divisible by factor");
  if (targetSize / factor != imgSize)
    mexErrMsgTxt("targetSize/ factor must = imgSize");
  if (nout != nin1 * factor*factor)
    mexErrMsgTxt("Target dimensions not consistent");

  int threadsX, threadsY;
  int SHMEM_MAX = 8192; // don't use more than this much shmem
  int shmemX, shmemY, blocksX, blocksY;
  bool useLoopy = false;
  int THREADS_MAX_LOOPY = 512, THREADS_MAX = trans ? 256 : 512;
  if (!trans) {
    threadsX = imgSize;
    threadsY = factor * MIN(THREADS_MAX / (factor*threadsX), SHMEM_MAX / (4*threadsX*factor)); // to avoid running out of shmem

    if(threadsY == 0) {
      if (factor > 32)
	mexErrMsgTxt("factor can't be > 32");
      //assert(factor <= 32); // yes this is covered by assert above but in case i ever remove that
      THREADS_MAX = 512;
      useLoopy = true;
      threadsX = MIN(16, imgSize); // not that imgsize can be < 16 here under current conditions
      threadsY = factor * MIN(THREADS_MAX_LOOPY / (factor*threadsX), SHMEM_MAX / (4*threadsX*factor)); // to avoid running out of shmem
    }

    shmemY = threadsY;
    shmemX = threadsX;
    blocksX = imgSize;
    blocksY = DIVUP(numImages, threadsY);
    //        printf("boundary problems: %u\n", numImages % threadsY != 0);
  } else {

    threadsY = imgSize;
    threadsX = factor * MIN(THREADS_MAX / (factor*threadsY), SHMEM_MAX / (4*threadsY*factor)); // to avoid running out of shmem

    if(threadsX < 8) {
      useLoopy = true;
      int xFactorMult = DIVUP(16, factor);
      threadsX = xFactorMult * factor;
      threadsY = THREADS_MAX / threadsX;
      int newThreadsX = threadsX, newThreadsY = threadsY;
      while (newThreadsY > 0 && imgSize % newThreadsY != 0) { // let's see if we can make threadsY divide imgSize
	newThreadsX += factor;
	newThreadsY = THREADS_MAX / newThreadsX;
      }
      if (newThreadsY > 0) {
	threadsY = newThreadsY;
	threadsX = newThreadsX;
      }

      if (threadsY <=0)
	mexErrMsgTxt("threadsY <=0; not expected");
      //assert(threadsY > 0);
    }

    shmemY = threadsX;
    shmemX = threadsY + (1 - (threadsY % 2));
    blocksX = DIVUP(numImages, threadsX);
    blocksY = imgSize;
    //        printf("boundary problems: %u\n", numImages % threadsX != 0);
  }
  int shmem = 4 * shmemX * shmemY;
  if (shmem == 0 || shmem > 16300) {
    // this really shouldn't happen and i've only put this here as a precautionary measure
    // to avoid getting mysteriously wrong results.
    mexErrMsgTxt("supersample: not enough shared memory!");
    //exit(EXIT_FAILURE);
  }

  dim3 grid(blocksX, blocksY);
  dim3 threads(threadsX, threadsY);
  //mexPrintf("blocks: %dx%d, threads: %dx%d\n", blocksY, blocksX, threadsY, threadsX);
  //mexPrintf("using %dx%d = %d bytes of shmem\n", shmemY, shmemX, shmem);


  gpuTYPE_t tin1 = gm->gputype.getType(IN1);
  gpuTYPE_t tout = gm->gputype.getType(OUT);

  // check input/out size and type
  if (tin1!=tout)
    mexErrMsgTxt("Input and output arguments must be of the same type.");

  // I need the pointers to GPU memory
  CUdeviceptr d_IN1  = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN1));
  CUdeviceptr d_OUT = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(OUT));

  // // The GPU kernel depends on the type of input/output
  // CUfunction drvfun;
  // if (tin1 == gpuFLOAT) {
  //   drvfun = drvfunf;
  // } else 
  //   mexErrMsgTxt("Currently only single types supported.");

  hostdrv_pars_t gpuprhs[2];
  int gpunrhs = 2;
  gpuprhs[0] = hostdrv_pars(&d_IN1,sizeof(d_IN1));
  gpuprhs[1] = hostdrv_pars(&d_OUT,sizeof(d_OUT));


  /* trans not implemented; so always !trans    if(!trans) { */
        if(!useLoopy) {
            if(factor == 2) {
	      hostDriver(_supersampleMedium_2, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
	      //kSupersampleMedium<2><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), imgSize, numImages*imgSize);
            } else if(factor == 3) {
                hostDriver(_supersampleMedium_3, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 4) {
                hostDriver(_supersampleMedium_4, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 5) {
                hostDriver(_supersampleMedium_5, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 6) {
                hostDriver(_supersampleMedium_6, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 7) {
                hostDriver(_supersampleMedium_7, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 8) {
                hostDriver(_supersampleMedium_8, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 9) {
                hostDriver(_supersampleMedium_9, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 10) {
                hostDriver(_supersampleMedium_10, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 11) {
                hostDriver(_supersampleMedium_11, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 12) {
                hostDriver(_supersampleMedium_12, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 13) {
                hostDriver(_supersampleMedium_13, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 14) {
                hostDriver(_supersampleMedium_14, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 15) {
                hostDriver(_supersampleMedium_15, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 16) {
                hostDriver(_supersampleMedium_16, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            }
        } else {
            if(factor == 2) {
                hostDriver(_supersampleMediumLoopy_2, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 3) {
                hostDriver(_supersampleMediumLoopy_3, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 4) {
                hostDriver(_supersampleMediumLoopy_4, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 5) {
                hostDriver(_supersampleMediumLoopy_5, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 6) {
                hostDriver(_supersampleMediumLoopy_6, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 7) {
                hostDriver(_supersampleMediumLoopy_7, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 8) {
                hostDriver(_supersampleMediumLoopy_8, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 9) {
                hostDriver(_supersampleMediumLoopy_9, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 10) {
                hostDriver(_supersampleMediumLoopy_10, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 11) {
                hostDriver(_supersampleMediumLoopy_11, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 12) {
                hostDriver(_supersampleMediumLoopy_12, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 13) {
                hostDriver(_supersampleMediumLoopy_13, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 14) {
                hostDriver(_supersampleMediumLoopy_14, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 15) {
                hostDriver(_supersampleMediumLoopy_15, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            } else if(factor == 16) {
                hostDriver(_supersampleMediumLoopy_16, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs);
            }
        }
	/* } else {
        if(!useLoopy) {
            if(factor == 2) {
                kSupersampleMediumTrans<2><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 3) {
                kSupersampleMediumTrans<3><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 4) {
                kSupersampleMediumTrans<4><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 5) {
                kSupersampleMediumTrans<5><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 6) {
                kSupersampleMediumTrans<6><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 7) {
                kSupersampleMediumTrans<7><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 8) {
                kSupersampleMediumTrans<8><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 9) {
                kSupersampleMediumTrans<9><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 10) {
                kSupersampleMediumTrans<10><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 11) {
                kSupersampleMediumTrans<11><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 12) {
                kSupersampleMediumTrans<12><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 13) {
                kSupersampleMediumTrans<13><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 14) {
                kSupersampleMediumTrans<14><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 15) {
                kSupersampleMediumTrans<15><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 16) {
                kSupersampleMediumTrans<16><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            }
        } else {
            if(factor == 2) {
                kSupersampleMediumTransLoopy<2><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 3) {
                kSupersampleMediumTransLoopy<3><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 4) {
                kSupersampleMediumTransLoopy<4><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 5) {
                kSupersampleMediumTransLoopy<5><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 6) {
                kSupersampleMediumTransLoopy<6><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 7) {
                kSupersampleMediumTransLoopy<7><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 8) {
                kSupersampleMediumTransLoopy<8><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 9) {
                kSupersampleMediumTransLoopy<9><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 10) {
                kSupersampleMediumTransLoopy<10><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 11) {
                kSupersampleMediumTransLoopy<11><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 12) {
                kSupersampleMediumTransLoopy<12><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 13) {
                kSupersampleMediumTransLoopy<13><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 14) {
                kSupersampleMediumTransLoopy<14><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 15) {
                kSupersampleMediumTransLoopy<15><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            } else if(factor == 16) {
                kSupersampleMediumTransLoopy<16><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX);
            }
        }
    }
	*/

}
Ejemplo n.º 23
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  CUresult cudastatus = CUDA_SUCCESS;



  // more than 4 arguments expected
  if (nrhs < 4)
    mexErrMsgTxt("Wrong number of arguments");

  if (init == 0) {
    // Initialize function
    //mexLock();

    // load GPUmat
    gm = gmGetGPUmat();

    // check gm
    gmCheckGPUmat(gm);

    // load module
    // NO MODULE REQUIRED

    // load float GPU function
    // NO FUNCTION REQUIRED

    init = 1;
  }

  // log
  gm->debug.log("> ASSIGN\n",0);
  gm->debug.logPush();

  // mex parameters are:
  // dir: direction. Range is applied to the left or the right
  // LHS: GPUtype variable (left hand side)
  // RHS: GPUtype variable (right hand side)
  //      or Matlab array (converted to GPUtype)
  // ...: variable number of arguments after 'dir' representing the range

  if (mxGetClassID(prhs[0]) != mxDOUBLE_CLASS) {
    mexErrMsgTxt(ERROR_ASSIGN_FIRSTARG);
  }
  int dir = (int) mxGetScalar(prhs[0]);
  GPUtype LHS = gm->gputype.getGPUtype(prhs[1]);

  if (gm->comp.getCompileMode() == 1) {
    GPUtype RHS;
    if ((mxGetClassID(prhs[2]) == mxDOUBLE_CLASS)||(mxGetClassID(prhs[2]) == mxSINGLE_CLASS)) {
      // compile this option
      // create dummy RHS
      RHS = gm->gputype.create(gpuFLOAT, 0, NULL, NULL);
      gm->comp.pushGPUtype(&RHS);

      gm->comp.functionStart("GPUMAT_mxToGPUtype");
      gm->comp.functionSetParamGPUtype(&RHS);
      gm->comp.functionSetParamMx(prhs[2]);
      gm->comp.functionEnd();
      //RHS = gm->gputype.mxToGPUtype(prhs[2]);
    } else {
      RHS = gm->gputype.getGPUtype(prhs[2]);
    }

    gm->comp.functionStart("GPUMAT_mxAssign");
    gm->comp.functionSetParamGPUtype(&LHS);
    gm->comp.functionSetParamGPUtype(&RHS);
    gm->comp.functionSetParamInt(dir);
    gm->comp.functionSetParamMxMx(nrhs-3, &prhs[3]);
    gm->comp.functionEnd();


  } else {
    GPUtype RHS;
    // convert Matlab array to GPUtype
    if ((mxGetClassID(prhs[2]) == mxDOUBLE_CLASS)||(mxGetClassID(prhs[2]) == mxSINGLE_CLASS)) {
      RHS = gm->gputype.mxToGPUtype(prhs[2]);
    } else {
      RHS = gm->gputype.getGPUtype(prhs[2]);
    }

    gm->aux.mxAssign(LHS, RHS, dir, nrhs-3, &prhs[3] );
  }

  gm->debug.logPop();


}
Ejemplo n.º 24
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  CUresult cudastatus = CUDA_SUCCESS;

  // no more than 2 arguments expected
  if (nrhs > 2)
    mexErrMsgTxt("Wrong number of arguments");

  if (init == 0) {
    // Initialize function
    //mexLock();

    // load GPUmat
    gm = gmGetGPUmat();

    // check gm
    gmCheckGPUmat(gm);

    // load module
    // NO MODULE REQUIRED

    // load float GPU function
    // NO FUNCTION REQUIRED

    init = 1;
  }

  // log
  gm->debug.log("> SIZE\n",0);
  gm->debug.logPush();

  if (gm->comp.getCompileMode() == 1) {
    mexWarnMsgTxt(WARNING_NUMERICS_COMPNOTIMPLEMENTED);
  }


  // mex parameters are:
  // IN: GPUtype variable

  GPUtype IN = gm->gputype.getGPUtype(prhs[0]);
  const int *in_size = gm->gputype.getSize(IN);
  int in_ndims = gm->gputype.getNdims(IN);
  // 2 cases
  // 1. s = size(A)
  // 2. s = size(A,dim)
  if (nrhs == 1) {
    // 2 cases:
    // 1. s = size(A)
    // 2. [a,b,c,...] = size(A)
    if (nlhs<=1) {
      // 1. s = size(A)
      // create output plhs[0]
      plhs[0] = mxCreateDoubleMatrix(1, in_ndims, mxREAL);
      // fill in plhs[0] with IN dimensions
      double *plhs_size = mxGetPr(plhs[0]);
      for (int i = 0; i < in_ndims; i++)
        plhs_size[i] = (double) in_size[i];
    } else {
      // 2. [a,b,c,...] = size(A)
      for (int i=0;i<nlhs;i++) {
        // create output
        // create output plhs[i]
        int r = 1;
        // if i is greater than IN dims return 1
        if (i>(in_ndims-1)) {
          // r = 1
        } else {
          r = in_size[i];
        }
        plhs[i] = mxCreateDoubleScalar(r);
      }
    }
  } else {
    // retrieve dim
    int dim  =  (int) mxGetScalar(prhs[1]);
    int r = 1;
    // if dim is greater than IN dims return 1
    if (dim>in_ndims) {
      // r = 1
    } else {
      r = in_size[dim-1];
    }
    // create output plhs[0]
    plhs[0] = mxCreateDoubleScalar(r);
  }



}
Ejemplo n.º 25
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  // At least 2 arguments expected
  // Input and result
  if (nrhs!=6)
     mexErrMsgTxt("Wrong number of arguments");
  if (init == 0) {
    // Initialize function
    // mexLock();
    // load GPUmat
    gm = gmGetGPUmat();
    init = 1;
  }
  /* mex parameters are:
   0 Source array [X or one of the y-edge-zeroed copies of it]
   1 Destination array [Accumulator, basically]
   2 Stack array 1 [We swap XY or YZ planes with this before copying to assure clean shift] - Must be all zeroes and of size max(NxNy, NyNz, NxNz)
   3 Stack array 2
   4 Shift directions
   5 Coefficient on shift */

  // Get GPU array pointers
  GPUtype srcArray    = gm->gputype.getGPUtype(prhs[0]);
  GPUtype dstArray    = gm->gputype.getGPUtype(prhs[1]);
  GPUtype stackArrayX = gm->gputype.getGPUtype(prhs[2]);
//GPUtype stackArrayY = gm->gputype.getGPUtype(prhs[3]);
  GPUtype stackArrayZ = gm->gputype.getGPUtype(prhs[3]);

  // Get some control variables sorted out
  double *shiftdirs  = mxGetPr(prhs[4]);
  const int *dims    = gm->gputype.getSize(srcArray);

  double alpha       = *mxGetPr(prhs[5]);

  int shifts[3];
  shifts[0] = (int)shiftdirs[0];
  shifts[1] = (int)shiftdirs[1];
  shifts[2] = (int)shiftdirs[2];

  double *cubSrc = (double*)gm->gputype.getGPUptr(srcArray);

  // Remove appropriate YZ plane if any
  double *cubDst = (double*)gm->gputype.getGPUptr(stackArrayX);

  if(shifts[0] == -1) cublasDswap(dims[1]*dims[2], cubSrc,             dims[0], cubDst, 1);
  if(shifts[0] ==  1) cublasDswap(dims[1]*dims[2], cubSrc + dims[0]-1, dims[0], cubDst, 1);

  // Remove appropriate XZ plane if any
  //stackSwapXZplane(cubSrc, (double*)gm->gputype.getGPUptr(stackArrayY), (int *)dims, shifts);

  // Remove appropriate XY plane if any
  cubDst = (double*)gm->gputype.getGPUptr(stackArrayZ);

  if(shifts[2] == -1) cublasDswap(dims[0]*dims[1], cubSrc,                               1, cubDst, 1);
  if(shifts[2] ==  1) cublasDswap(dims[0]*dims[1], cubSrc + dims[0]*dims[1]*(dims[2]-1), 1, cubDst, 1);

  // Decide the amount of offset to acheive desired shift
  int theta = shifts[0] + dims[0]*shifts[1] + dims[0]*dims[1]*shifts[2];
  int Ntot  = dims[0] * dims[1] * dims[2];

  cubDst = (double*)gm->gputype.getGPUptr(dstArray);
  if(theta >= 0) {
    cublasDaxpy(Ntot-theta, alpha, cubSrc,         1, cubDst + theta, 1);
  } else {
    cublasDaxpy(Ntot+theta, alpha, cubSrc - theta, 1, cubDst,         1);
  }

  // Replace the XY plane if it was removed
  cubDst = (double*)gm->gputype.getGPUptr(stackArrayZ);
  if(shifts[2] == -1) cublasDswap(dims[0]*dims[1], cubSrc,                               1, cubDst, 1);
  if(shifts[2] ==  1) cublasDswap(dims[0]*dims[1], cubSrc + dims[0]*dims[1]*(dims[2]-1), 1, cubDst, 1);

  // replace the XZ plane if it was removed
  //stackSwapXZplane(cubSrc, (double*)gm->gputype.getGPUptr(stackArrayY), (int *)dims, shifts);

  // Replace the YZ plane if it was removed
  cubDst = (double*)gm->gputype.getGPUptr(stackArrayX);
  if(shifts[0] == -1) cublasDswap(dims[1]*dims[2], cubSrc,             dims[0], cubDst, 1);
  if(shifts[0] ==  1) cublasDswap(dims[1]*dims[2], cubSrc + dims[0]-1, dims[0], cubDst, 1);
}