void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { CUresult cudastatus = CUDA_SUCCESS; if (nrhs != 2) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); init = 1; } // mex parameters are: GPUtype A = gm->gputype.getGPUtype(prhs[0]); GPUtype B = gm->gputype.getGPUtype(prhs[1]); int r[] = {3,4,6,1}; gm->gputype.mxAssign(A, B, Range(3,r), 0); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { // At least 1 arguments expected // Input and result if (nrhs!=1) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); gmCheckGPUmat(gm); init = 1; } // log gm->debug.log("> REAL\n",0); gm->debug.logPush(); // mex parameters are: // IN1 GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]); if (gm->comp.getCompileMode() == 1) { GPUtype R = gm->gputype.create(gpuFLOAT, 0, NULL, NULL); gm->comp.pushGPUtype(&R); gm->comp.functionStart("GPUMAT_RealDrv"); gm->comp.functionSetParamGPUtype(&R); gm->comp.functionSetParamGPUtype(&IN1); gm->comp.functionEnd(); plhs[0] = gm->gputype.createMxArray(R); } else { GPUtype R = gm->numerics.RealDrv(IN1); plhs[0] = gm->gputype.createMxArray(R); } gm->debug.logPop(); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { // At least 2 arguments expected // Input and result if (nrhs!=2) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); gmCheckGPUmat(gm); init = 1; } // log gm->debug.log("> GPUuminus\n",0); gm->debug.logPush(); // mex parameters are: // IN1 // OUT GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]); GPUtype OUT = gm->gputype.getGPUtype(prhs[1]); if (gm->comp.getCompileMode() == 1) { gm->comp.functionStart("GPUMAT_Uminus"); gm->comp.functionSetParamGPUtype(&OUT); gm->comp.functionSetParamGPUtype(&IN1); gm->comp.functionEnd(); } else { gm->numerics.Uminus(IN1,OUT); } gm->debug.logPop(); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { // At least 1 arguments expected // Input and result if (nrhs!=1) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); gmCheckGPUmat(gm); init = 1; } // log gm->debug.log("> mxToGPUtype\n",0); gm->debug.logPush(); // mex parameters are: // MX GPUtype IN1 = gm->gputype.mxToGPUtype(prhs[0]); if (gm->comp.getCompileMode() == 1) { gm->comp.abort(ERROR_NUMERICS_COMPNOTIMPLEMENTED); /*GPUtype R = gm->gputype.create(gpuFLOAT, 0, NULL, NULL); gm->comp.pushGPUtype(&R); gm->comp.functionStart("GPUMAT_mxToGPUtype"); gm->comp.functionSetParamGPUtype(&R); gm->comp.functionSetParamMx(prhs[0]); gm->comp.functionEnd(); plhs[0] = gm->gputype.createMxArray(R);*/ } else { plhs[0] = gm->gputype.createMxArray(IN1); } gm->debug.logPop(); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { // At least 1 arguments expected // Input and result if (nrhs!=2) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); gmCheckGPUmat(gm); init = 1; } // log gm->debug.log("> GPUdoubleToGPUsinglePtr\n",0); gm->debug.logPush(); // mex parameters are: // IN1 GPUtype IN1 = gm->gputype.getGPUtype(prhs[1]); int isdouble = gm->gputype.isDouble(IN1); if (!isdouble) { mexErrMsgTxt(ERROR_CASTING_GPUDOUBLE); } if (gm->comp.getCompileMode() == 1) { gm->comp.abort(ERROR_NUMERICS_COMPNOTIMPLEMENTED); } else { GPUtype R = gm->gputype.doubleToFloat(IN1); plhs[0] = gm->gputype.createMxArrayPtr(prhs[0], R); } gm->debug.logPop(); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { // At least 2 arguments expected // Input and result if ((nrhs!=2) || (nlhs != 1)) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { gm = gmGetGPUmat(); init = 1; } /* mex parameters are: 0 array 1 1 array 2 This function is a wrapper for the CUBLAS double-prec dot product function pending newer matcuda */ GPUtype arrayA = gm->gputype.getGPUtype(prhs[0]); GPUtype arrayB = gm->gputype.getGPUtype(prhs[1]); int numElements = gm->gputype.getNumel(arrayA); if (numElements != gm->gputype.getNumel(arrayB)) mexErrMsgTxt("Arrays contain different numbers of elements.\n"); plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); //mxReal is our data-type double *AdotB = mxGetPr(plhs[0]); void *pointerA = (void *)gm->gputype.getGPUptr(arrayA); void *pointerB = (void *)gm->gputype.getGPUptr(arrayB); double *u = (double*)pointerA; double *v = (double *)pointerB; AdotB[0] = cublasDdot(numElements, u, 1, v, 1); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { //CUresult cudastatus = CUDA_SUCCESS; // simple garbage collection MyGCObj<Range> mygc1; // more than 4 arguments expected if (nrhs != 7) mexErrMsgTxt(ERROR_GPUFILL_WRONGARGS); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); // check gm gmCheckGPUmat(gm); // load module // NO MODULE REQUIRED // load float GPU function // NO FUNCTION REQUIRED init = 1; } // log gm->debug.log("> GPUfill\n",0); gm->debug.logPush(); // mex parameters are: // DST: Destination GPUtype variable // offset // incr // m // p // type GPUtype DST = gm->gputype.getGPUtype(prhs[0]); if (gm->comp.getCompileMode() == 1) { gm->comp.functionStart("GPUMAT_mxFill"); gm->comp.functionSetParamGPUtype(&DST); gm->comp.functionSetParamMxMx(nrhs-1, &(prhs[1])); gm->comp.functionEnd(); } else { gm->gputype.mxFill(DST, nrhs-1, &(prhs[1])); } gm->debug.logPop(); }
/* * Initializes numerics MODULE. * 1) Load GPU kernels * 2) Register functions in GPUmat structure * */ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { if (nrhs != 0) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function mexLock(); // load GPUmat gm = gmGetGPUmat(); // load module // nothing to load //************************************************************************ // INIT //************************************************************************ seed = 0; // init curand //if (curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT)!=CURAND_STATUS_SUCCESS) { // mexErrMsgTxt(ERROR_CURAND_INIT); //} //************************************************************************ // REGISTER FUNCTION IN GPUMAT STRUCTURE //************************************************************************ // put here functions to be registered gm->rand.rand = GPUrand; gm->rand.mxRandDrv = GPUmxRandDrv; gm->rand.randn = GPUrandn; gm->rand.mxRandnDrv = GPUmxRandnDrv; //************************************************************************ // UPDATE FLAGS //************************************************************************ gm->mod.rand = 1; // rand module was loaded init = 1; } }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { // At least 2 arguments expected // Input1, Input2 if (nrhs!=2) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); gmCheckGPUmat(gm); init = 1; } // log gm->debug.log("> MRDIVIDE\n",0); gm->debug.logPush(); // mex parameters are: // IN1 // IN2 GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]); GPUtype IN2 = gm->gputype.getGPUtype(prhs[1]); int in1_iscalar = gm->gputype.isScalar(IN1); int in2_iscalar = gm->gputype.isScalar(IN2); if (in1_iscalar||in2_iscalar) { } else { mexErrMsgTxt(ERROR_MRDIVIDE_MATRICES); } if (gm->comp.getCompileMode() == 1) { GPUtype R = gm->gputype.create(gpuFLOAT, 0, NULL, NULL); gm->comp.pushGPUtype(&R); gm->comp.functionStart("GPUMAT_RdivideDrv"); gm->comp.functionSetParamGPUtype(&R); gm->comp.functionSetParamGPUtype(&IN1); gm->comp.functionSetParamGPUtype(&IN2); gm->comp.functionEnd(); plhs[0] = gm->gputype.createMxArray(R); } else { GPUtype R = gm->numerics.RdivideDrv(IN1,IN2); plhs[0] = gm->gputype.createMxArray(R); } gm->debug.logPop(); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { // At least 1 arguments expected // Input and result if (nrhs!=1) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); init = 1; } // mex parameters are: // IN1 GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]); GPUtype r = gm->numerics.Log10Drv(IN1); plhs[0] = gm->gputype.createMxArray(r); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { // At least 2 arguments expected // Input and result if (nrhs!=2) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); init = 1; } // mex parameters are: // IN1 // OUT GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]); GPUtype OUT = gm->gputype.getGPUtype(prhs[1]); gm->numerics.Transpose(IN1,OUT); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { CUresult cudastatus = CUDA_SUCCESS; if (nrhs != 1) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); // check gm gmCheckGPUmat(gm); init = 1; } // log gm->debug.log("> GPUrandn\n",0); gm->debug.logPush(); // mex parameters are: // 1. OUT //OUT is the output GPU array (result) GPUtype OUT = gm->gputype.getGPUtype(prhs[0]); if (gm->comp.getCompileMode() == 1) { gm->comp.functionStart("GPUMAT_Randn"); gm->comp.functionSetParamGPUtype(&OUT); gm->comp.functionEnd(); } else { gm->rand.randn(OUT); } gm->debug.logPop(); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { CUresult cudastatus = CUDA_SUCCESS; // At least 2 arguments expected // Input and result if (nrhs!=2) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); // load module // NOT REQUIRED // load float GPU function // NOT REQUIRED init = 1; } // mex parameters are: // IN // OUT GPUtype IN = gm->gputype.getGPUtype(prhs[0]); GPUtype OUT = gm->gputype.getGPUtype(prhs[1]); gm->numerics.Exp(IN,OUT); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { // At least 1 arguments expected // Input and result if (nrhs!=1) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); gmCheckGPUmat(gm); init = 1; } // log gm->debug.log("> GPUdoubleToGPUsingle\n",0); gm->debug.logPush(); // mex parameters are: // IN1 GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]); int isdouble = gm->gputype.isDouble(IN1); if (!isdouble) { mexErrMsgTxt(ERROR_CASTING_GPUDOUBLE); } if (gm->comp.getCompileMode() == 1) { GPUtype R = gm->gputype.create(gpuFLOAT, 0, NULL, NULL); gm->comp.pushGPUtype(&R); gm->comp.functionStart("GPUMAT_DoubleToFloat"); gm->comp.functionSetParamGPUtype(&R); gm->comp.functionSetParamGPUtype(&IN1); gm->comp.functionEnd(); plhs[0] = gm->gputype.createMxArray(R); } else { GPUtype R = gm->gputype.doubleToFloat(IN1); plhs[0] = gm->gputype.createMxArray(R); } gm->debug.logPop(); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { CUresult cudastatus = CUDA_SUCCESS; if (nrhs != 3) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); // load module CUmodule *drvmod = gmGetModule("examples_numerics"); // load float GPU function CUresult status = cuModuleGetFunction(&drvfunf, *drvmod, "TIMESF"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } // load complex GPU function status = cuModuleGetFunction(&drvfunc, *drvmod, "TIMESC"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } // load double GPU function status = cuModuleGetFunction(&drvfund, *drvmod, "TIMESD"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } // load complex GPU function status = cuModuleGetFunction(&drvfuncd, *drvmod, "TIMESCD"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } init = 1; } // mex parameters are: // 1. IN1 // 2. IN2 // 3. OUT //IN1 is the input GPU array GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]); //IN2 is the input GPU array GPUtype IN2 = gm->gputype.getGPUtype(prhs[1]); //OUT is the output GPU array (result) GPUtype OUT = gm->gputype.getGPUtype(prhs[2]); // number of elements int nin1 = gm->gputype.getNumel(IN1); int nin2 = gm->gputype.getNumel(IN2); int nout = gm->gputype.getNumel(OUT); gpuTYPE_t tin1 = gm->gputype.getType(IN1); gpuTYPE_t tin2 = gm->gputype.getType(IN2); gpuTYPE_t tout = gm->gputype.getType(OUT); // check input/out size and type if (nin1!=nin2) mexErrMsgTxt("Input arguments must have the same number of elements."); if (nin1!=nout) mexErrMsgTxt("Input and output arguments must have the same number of elements."); if (tin1!=tin2) mexErrMsgTxt("Input arguments must be of the same type."); if (tin1!=tout) mexErrMsgTxt("Input and output arguments must be of the same type."); // I need the pointers to GPU memory CUdeviceptr d_IN1 = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN1)); CUdeviceptr d_IN2 = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN2)); CUdeviceptr d_OUT = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(OUT)); // The GPU kernel depends on the type of input/output CUfunction drvfun; if (tin1 == gpuFLOAT) { drvfun = drvfunf; } else if (tin1 == gpuCFLOAT) { drvfun = drvfunc; } else if (tin1 == gpuDOUBLE) { drvfun = drvfund; } else if (tin1 == gpuCDOUBLE) { drvfun = drvfuncd; } hostdrv_pars_t gpuprhs[3]; int gpunrhs = 3; gpuprhs[0] = hostdrv_pars(&d_IN1,sizeof(d_IN1),__alignof(d_IN1)); gpuprhs[1] = hostdrv_pars(&d_IN2,sizeof(d_IN2),__alignof(d_IN2)); gpuprhs[2] = hostdrv_pars(&d_OUT,sizeof(d_OUT),__alignof(d_OUT)); int N = nin1; hostGPUDRV(drvfun, N, gpunrhs, gpuprhs); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { // At least 2 arguments expected // Input and result if (nrhs!=3) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); init = 1; } // mex parameters are: // kernel_fft_rot // vis_fft // C GPUtype kernel_fft_rot = gm->gputype.getGPUtype(prhs[0]); GPUtype vis_fft = gm->gputype.getGPUtype(prhs[1]); GPUtype C = gm->gputype.getGPUtype(prhs[2]); // SIZE const int *ks = gm->gputype.getSize(kernel_fft_rot); const int *vs = gm->gputype.getSize(vis_fft); const int *cs = gm->gputype.getSize(C); // DIMENSIONS int kn = gm->gputype.getNdims(kernel_fft_rot); int vn = gm->gputype.getNdims(vis_fft); int cn = gm->gputype.getNdims(C); // check size // first 2 dimensions must agree // kn is 4 // vn is 3 // cn is 3 if ((kn!=4)||(vn!=3)||(cn!=3)) mexErrMsgTxt("Check input parameters dimensions."); if ((ks[0]!=vs[0])||(ks[0]!=cs[0])||(ks[1]!=vs[1])||(ks[1]!=cs[1])) mexErrMsgTxt("Check input parameters size."); if (ks[2]!=vs[2]) mexErrMsgTxt("Check input parameters size."); if (ks[2]!=cs[2]) mexErrMsgTxt("Check input parameters size."); for (int i=0;i<ks[3];i++) { for (int j=0;j<vs[2];j++) { GPUtype tmp1 = gm->gputype.slice(vis_fft, Range(0,1,END, Range(0,1,END, Range(j,0,0)))); GPUtype tmp2 = gm->gputype.slice(kernel_fft_rot, Range(0,1,END, Range(0,1,END, Range(j,0,0, Range(i,0,0))))); gm->numerics.Times(tmp1,tmp2,tmp1); gm->numerics.Real(gm->fft.IFFT2Drv(tmp1),tmp2); tmp1 = gm->gputype.slice(C, Range(0,1,END, Range(0,1,END, Range(j,0,0)))); gm->numerics.Plus(tmp1,tmp2,tmp1); gm->gputype.assign(C, tmp1, Range(0,1,END, Range(0,1,END, Range(j,0,0))),1); } } }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { CUresult cudastatus = CUDA_SUCCESS; if (nrhs != 3) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); // load module CUmodule *drvmod = gmGetModule("misc"); // load float GPU function CUresult status = cuModuleGetFunction(&drvfunf, *drvmod, "gpuNCAreg"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } init = 1; } // mex parameters are: // 1. IN1 (in) // 2. IN2 (w) // 3. OUT (out) //IN is the input GPU array GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]); gpuTYPE_t tIn1 = gm->gputype.getType(IN1); //IN is the input GPU array GPUtype IN2 = gm->gputype.getGPUtype(prhs[1]); gpuTYPE_t tIn2 = gm->gputype.getType(IN2); //IN is the input GPU array GPUtype OUT = gm->gputype.getGPUtype(prhs[2]); gpuTYPE_t tOut = gm->gputype.getType(OUT); //dimensions const int * sIn1 = gm->gputype.getSize(IN1); const int * sIn2 = gm->gputype.getSize(IN2); const int * sOut = gm->gputype.getSize(OUT); int m1 = sIn1[0]; //# of dimensions int n1 = sIn1[1]; //# of cases int n2a = sIn2[0]; //# of cases int n2b = sIn2[1]; //# of cases int sout1 = sOut[0]; //# of cases int sout2 = sOut[1]; //# of dimensions if ( (n1!=n2a) || (n2a!=n2b) || (n1!=sout1) ) mexErrMsgTxt("Number of cases must be consistent"); if (m1!=sout2) mexErrMsgTxt("Number of dimensions must be consistent"); /* Output for debugging mexPrintf("numcases: %d numdims: %d\n",n,m); mexPrintf("output: %d x %d\n",sout1,sout2); */ CUfunction drvfun; if ((tIn1 == gpuFLOAT) && (tIn2 == gpuFLOAT)) drvfun = drvfunf; else { mexErrMsgTxt("Only singles are supported at present."); } // I need the pointers to GPU memory CUdeviceptr d_IN1 = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN1)); CUdeviceptr d_IN2 = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN2)); CUdeviceptr d_OUT = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(OUT)); hostdrv_pars_t gpuprhs[3]; int gpunrhs = 3; gpuprhs[0] = hostdrv_pars(&d_IN1,sizeof(d_IN1)); gpuprhs[1] = hostdrv_pars(&d_IN2,sizeof(d_IN2)); gpuprhs[2] = hostdrv_pars(&d_OUT,sizeof(d_OUT)); //hostGPUDRV(drvfun, N, gpunrhs, gpuprhs); hostGPUPdist(drvfun, gpunrhs, gpuprhs, n1, m1); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { CUresult cudastatus = CUDA_SUCCESS; if (nrhs < 2) mexErrMsgTxt("Wrong number of arguments, atleast two required."); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); // load module CUmodule *drvmod = gmGetModule("avg_pool"); // load float GPU function // CUresult status = cuModuleGetFunction(&drvfunf, *drvmod, "MAXPOOLF"); CUresult status = cuModuleGetFunction(&drvfunfnoind, *drvmod, "AVGPOOLFNOIND"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } // status = cuModuleGetFunction(&drvfunfind, *drvmod, "AVGPOOLFIND"); // if (CUDA_SUCCESS != status) { // mexErrMsgTxt("Unable to load user function."); // } init = 1; } double *psize = mxGetPr(prhs[1]); // For the pool_size array. // mex parameters are: // 1. IN1 // 2. IN2 // 3. OUT //IN1 is the input GPU array GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]); //OUT is the output GPU array (result) // GPUtype OUT = gm->gputype.getGPUtype(prhs[2]); // number of elements int nin1 = gm->gputype.getNumel(IN1); // int nin2 = gm->gputype.getNumel(IN2); // Get the number of dimensions. int ndims = gm->gputype.getNdims(IN1); gpuTYPE_t tin1 = gm->gputype.getType(IN1); // gpuTYPE_t tin2 = gm->gputype.getType(IN2); // create GPUtype, with given dimensions (same size as input). int* im_size; int* out_size; im_size = (int*) mxCalloc(ndims, sizeof(mwSize)); out_size = (int*) mxCalloc(ndims, sizeof(mwSize)); im_size = (int*) gm->gputype.getSize(IN1); // Calculate the reduced size (just spatially reduced in 2D pooling). out_size[0] = (int) ceil(((float)im_size[0])/((float)psize[0])); out_size[1] = (int) ceil(((float)im_size[1])/((float)psize[1])); for(int dim=ndims;dim>2;dim--){ out_size[dim-1] = im_size[dim-1]; } // mexErrMsgTxt("Right after out_size\n"); GPUtype OUT = gm->gputype.create(tin1, ndims, out_size, NULL); // The kernel we ultimately call depends on if the indices were passed in. CUfunction drvfun; // GPUtype IND; // //IND is the input GPU index array // if(nrhs >=3){ // IND = gm->gputype.getGPUtype(prhs[2]); // drvfun = drvfunfind; // } // // if(nrhs<3 || gm->gputype.isEmpty(IND)){ // If IND wasn't passed int we need to create it here. // IND = gm->gputype.create(tin1, ndims, out_size, NULL); drvfun = drvfunfnoind; // } // mexPrintf("After creating new array.\n"); int nout = gm->gputype.getNumel(OUT); gpuTYPE_t tout = gm->gputype.getType(OUT); // mexPrintf("nout: %d, nin1: %d, out_size: %d x %d\n", nout, nin1, out_size[0], out_size[1]); if (tin1 !=gpuFLOAT) mexErrMsgTxt("Input must be GPUsingle"); // check input/out size and type // if (nin1!=nin2) // mexErrMsgTxt("Input arguments must have the same number of elements."); // if (nin1!=nout) // mexErrMsgTxt("Input and output arguments must have the same number of elements."); // if (tin1!=tin2) // mexErrMsgTxt("Input arguments must be of the same type."); if (tin1!=tout) mexErrMsgTxt("Input and output arguments must be of the same type."); // I need the pointers to GPU memory CUdeviceptr d_IN1 = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN1)); // CUdeviceptr d_IND = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IND)); CUdeviceptr d_OUT = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(OUT)); // CUdeviceptr d_IN1 = (UINTPTR gm->gputype.getGPUptr(IN1)); // CUdeviceptr d_IN2 = (UINTPTR gm->gputype.getGPUptr(IN2)); // CUdeviceprt d_OUT = (UINTPTR gm->gputype.getGPUptr(OUT)); hostdrv_pars_t gpuprhs[2]; int gpunrhs = 2; gpuprhs[0] = hostdrv_pars(&d_IN1, sizeof(d_IN1), __alignof(d_IN1)); gpuprhs[1] = hostdrv_pars(&d_OUT, sizeof(d_OUT), __alignof(d_OUT)); // gpuprhs[2] = hostdrv_pars(&d_IND, sizeof(d_IND), __alignof(d_IND)); // GPUtype pool_size = gm->gputype.mxToGPUtype(prhs[2]); // gpuprhs[3] = hostdrv_pars(&psize, sizeof(psize), __alignof(psize)); int N = nout; // mexErrMsgTxt("just before avg pooling"); // hostGPUDRV(drvfun, N, gpunrhs, gpuprhs); hostDriver(drvfun, N, gpunrhs, gpuprhs, (int)im_size[0], (int)im_size[1], (int)out_size[0], (int)out_size[1], (int)psize[0], (int)psize[1]); // int numThreadsPerBlock = 256; // const unsigned int numBlocks = (out_size[0] + numThreadsPerBlock - 1) / numThreadsPerBlock; // PLUSF<<<numBlocks,numThreadsPerBlock>>>(N, 0, (float *)d_IN1,(float *)d_IN2,(float *)d_OUT); // Finally make the output available to MATLAB plhs[0] = gm->gputype.createMxArray(OUT); // if(nlhs>1) // plhs[1] = gm->gputype.createMxArray(IND); // Create empty output array. // if(nlhs>1){ // GPUtype IND; // int* ind_size; // ind_size = (int*) mxCalloc(2, sizeof(int)); // ind_size[0] = 0; // ind_size[1] = 0; // IND = gm->gputype.create(tin1, 0, ind_size, NULL); // plhs[1] = gm->gputype.createMxArray(IND); // } }
/* * Initializes numerics MODULE. * 1) Load GPU kernels * 2) Register functions in GPUmat structure * */ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { if (nrhs != 0) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function mexLock(); // load GPUmat gm = gmGetGPUmat(); // load module CUmodule *drvmod = gmGetModule("numerics"); //************************************************************************ // EYE GPU KERNELS //************************************************************************ // load float GPU function CUresult status = cuModuleGetFunction(&EYEdrvfuns[N_EYEF], *drvmod, "EYEF"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } // load complex GPU function status = cuModuleGetFunction(&EYEdrvfuns[N_EYEC], *drvmod, "EYEC"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } // load double GPU function status = cuModuleGetFunction(&EYEdrvfuns[N_EYED], *drvmod, "EYED"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } // load complex GPU function status = cuModuleGetFunction(&EYEdrvfuns[N_EYEDC], *drvmod, "EYEDC"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } //************************************************************************ // REGISTER FUNCTION IN GPUMAT STRUCTURE //************************************************************************ // put here functions to be registered gm->gputype.mxRepmatDrv = GPUmxRepmatDrv; gm->gputype.mxPermuteDrv = GPUmxPermuteDrv; gm->gputype.mxEyeDrv = GPUmxEyeDrv; gm->gputype.eye = GPUeye; gm->gputype.mxZerosDrv = GPUmxZerosDrv; gm->gputype.zeros = GPUzeros; gm->gputype.mxOnesDrv = GPUmxOnesDrv; gm->gputype.ones = GPUones; gm->gputype.mxFill = GPUmxFill; gm->gputype.mxColonDrv = GPUmxColon; gm->gputype.mxMemCpyDtoD = GPUmxMemCpyDtoD; gm->gputype.mxMemCpyHtoD = GPUmxMemCpyHtoD; // aux gm->aux.mxAssign = GPUmxAssign; gm->aux.mxSliceDrv = GPUmxSliceDrv; //************************************************************************ // UPDATE FLAGS //************************************************************************ gm->mod.numerics = 1; // numerics module was loaded init = 1; } }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { CUresult cudastatus = CUDA_SUCCESS; if (nrhs != 3) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); // load module CUmodule *drvmod = gmGetModule("subsample"); //load appropriate GPU kernel (mangled name) CUresult status; status = cuModuleGetFunction(&subsample_noreduc_2T, *drvmod, "_Z18kSubsample_noreducILi2ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_2F, *drvmod, "_Z18kSubsample_noreducILi2ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_3T, *drvmod, "_Z18kSubsample_noreducILi3ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_3F, *drvmod, "_Z18kSubsample_noreducILi3ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_4T, *drvmod, "_Z18kSubsample_noreducILi4ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_4F, *drvmod, "_Z18kSubsample_noreducILi4ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_5T, *drvmod, "_Z18kSubsample_noreducILi5ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_5F, *drvmod, "_Z18kSubsample_noreducILi5ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_6T, *drvmod, "_Z18kSubsample_noreducILi6ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_6F, *drvmod, "_Z18kSubsample_noreducILi6ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_7T, *drvmod, "_Z18kSubsample_noreducILi7ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_7F, *drvmod, "_Z18kSubsample_noreducILi7ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_8T, *drvmod, "_Z18kSubsample_noreducILi8ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_8F, *drvmod, "_Z18kSubsample_noreducILi8ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_9T, *drvmod, "_Z18kSubsample_noreducILi9ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_9F, *drvmod, "_Z18kSubsample_noreducILi9ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_10T, *drvmod, "_Z18kSubsample_noreducILi10ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_10F, *drvmod, "_Z18kSubsample_noreducILi10ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_11T, *drvmod, "_Z18kSubsample_noreducILi11ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_11F, *drvmod, "_Z18kSubsample_noreducILi11ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_12T, *drvmod, "_Z18kSubsample_noreducILi12ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_12F, *drvmod, "_Z18kSubsample_noreducILi12ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_13T, *drvmod, "_Z18kSubsample_noreducILi13ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_13F, *drvmod, "_Z18kSubsample_noreducILi13ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_14T, *drvmod, "_Z18kSubsample_noreducILi14ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_14F, *drvmod, "_Z18kSubsample_noreducILi14ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_15T, *drvmod, "_Z18kSubsample_noreducILi15ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_15F, *drvmod, "_Z18kSubsample_noreducILi15ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_16T, *drvmod, "_Z18kSubsample_noreducILi16ELb1EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&subsample_noreduc_16F, *drvmod, "_Z18kSubsample_noreducILi16ELb0EEvPfS0_iii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } init = 1; } // mex parameters are: // 1. IN1 // 2. OUT // 3. subsampling factor bool avoidBankConflicts = true; //hard-coded //IN1 is the input GPU array GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]); //OUT is the output GPU array (result) GPUtype OUT = gm->gputype.getGPUtype(prhs[1]); //last parameter is the filterSize (int) int factor = (int) mxGetScalar(prhs[2]); // number of elements int nin1 = gm->gputype.getNumel(IN1); int nout = gm->gputype.getNumel(OUT); //dimensions const int * sin1 = gm->gputype.getSize(IN1); const int * sout = gm->gputype.getSize(OUT); int imgPixels = sin1[0]; if ( floor(sqrt(float(imgPixels))) != sqrt(float(imgPixels)) ) mexErrMsgTxt("Images not square"); int imgSize = int(sqrt(imgPixels)); if (imgSize <= factor) mexErrMsgTxt("imgSize must be > factor"); if (imgSize % factor !=0) mexErrMsgTxt("imgSize must be evenly divisible by factor"); if (factor > 16) mexErrMsgTxt("factor > 16"); if (factor < 2) mexErrMsgTxt("factor < 2"); if (imgSize > 512) mexErrMsgTxt("max imgSize: 512"); int numRegions = nin1 / (factor*factor); int numRegionsY = (imgSize / factor) * sin1[1]; if (nout != numRegions) mexErrMsgTxt("Target dimensions not consistent"); int regionsXPerBlock = imgSize / factor; int numThreadsX = imgSize; int SHMEM_MAX = 8192; // don't use more than this much shmem int regionsYPerBlock = MIN(512 / numThreadsX, SHMEM_MAX / (4*imgSize)); // to avoid running out of shmem // regionsYPerBlock--; int regionsPerBlock = regionsYPerBlock * regionsXPerBlock; // this will avoid all bank conflicts but may (?) use up too much shmem int shmemPadX = avoidBankConflicts * (DIVUP(16,factor) + (regionsPerBlock % 16 == 0 ? 0 : 16 - regionsPerBlock % 16)); // shmemPadX = 0; int shmemY = factor, shmemX = regionsPerBlock + shmemPadX; int shmem = 4 * shmemX * shmemY; if (shmem == 0 || shmem > 16300) { // this really shouldn't happen and i've only put this here as a precautionary measure // to avoid getting mysteriously wrong results. mexErrMsgTxt("subsample: not enough shared memory!"); } int numThreadsY = regionsYPerBlock; // int blocks = numRegionsY / regionsYPerBlock; int blocksX = imgSize / factor, blocksY = DIVUP(sin1[1], regionsYPerBlock); if (blocksX >=65535 || blocksY >= 65535) mexErrMsgTxt("Exceeded max block size"); // assert(numRegionsY % regionsYPerBlock == 0); bool checkThreadBounds = numRegionsY % regionsYPerBlock != 0; // printf("num regions y: %d, regions y per block: %d\n", numRegionsY, regionsYPerBlock); dim3 grid(blocksX, blocksY); dim3 threads(numThreadsX, numThreadsY); /* mexPrintf("grid: %ux%u, threads: %ux%u\n", grid.y, grid.x, threads.y, threads.x); mexPrintf("check bounds: %u\n", checkThreadBounds); mexPrintf("using %u bytes of shmem\n", shmem); */ gpuTYPE_t tin1 = gm->gputype.getType(IN1); gpuTYPE_t tout = gm->gputype.getType(OUT); // check input/out size and type if (tin1!=tout) mexErrMsgTxt("Input and output arguments must be of the same type."); // I need the pointers to GPU memory CUdeviceptr d_IN1 = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN1)); CUdeviceptr d_OUT = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(OUT)); // // The GPU kernel depends on the type of input/output // CUfunction drvfun; // if (tin1 == gpuFLOAT) { // drvfun = drvfunf; // } else // mexErrMsgTxt("Currently only single types supported."); hostdrv_pars_t gpuprhs[2]; int gpunrhs = 2; gpuprhs[0] = hostdrv_pars(&d_IN1,sizeof(d_IN1)); gpuprhs[1] = hostdrv_pars(&d_OUT,sizeof(d_OUT)); //int N = nin1; if (factor == 2) { if (checkThreadBounds) { hostDriver(subsample_noreduc_2T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_2F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } else if (factor == 3) { if (checkThreadBounds) { hostDriver(subsample_noreduc_3T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_3F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } else if (factor == 4) { if (checkThreadBounds) { hostDriver(subsample_noreduc_4T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_4F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } else if (factor == 5) { if (checkThreadBounds) { hostDriver(subsample_noreduc_5T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_5F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } else if (factor == 6) { if (checkThreadBounds) { hostDriver(subsample_noreduc_6T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_6F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } else if (factor == 7) { if (checkThreadBounds) { hostDriver(subsample_noreduc_7T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_7F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } else if (factor == 8) { if (checkThreadBounds) { hostDriver(subsample_noreduc_8T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_8F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } else if (factor == 9) { if (checkThreadBounds) { hostDriver(subsample_noreduc_9T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_9F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } else if (factor == 10) { if (checkThreadBounds) { hostDriver(subsample_noreduc_10T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_10F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } else if (factor == 11) { if (checkThreadBounds) { hostDriver(subsample_noreduc_11T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_11F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } else if (factor == 12) { if (checkThreadBounds) { hostDriver(subsample_noreduc_12T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_12F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } else if (factor == 13) { if (checkThreadBounds) { hostDriver(subsample_noreduc_13T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_13F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } else if (factor == 14) { if (checkThreadBounds) { hostDriver(subsample_noreduc_14T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_14F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } else if (factor == 15) { if (checkThreadBounds) { hostDriver(subsample_noreduc_15T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_15F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } else if (factor == 16) { if (checkThreadBounds) { hostDriver(subsample_noreduc_16T, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } else { hostDriver(subsample_noreduc_16F, grid, threads, imgSize, numRegionsY, shmemX, shmem, gpunrhs, gpuprhs); } } //cutilCheckMsg("kernel execution failed"); // if(factor == 4) { //// kSubsample_reduc<4><<<grid, threads,4*numThreadsX*numThreadsY>>>(images->getDevData(), targets->getDevData(), imgSize, numRegionsY); // } }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { CUresult cudastatus = CUDA_SUCCESS; if (nrhs != 2) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); // load module CUmodule *drvmod = gmGetModule("examples_texture"); // load float GPU function CUresult status = cuModuleGetFunction(&drvfunf, *drvmod, "LININTERF"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } // load double GPU function status = cuModuleGetFunction(&drvfund, *drvmod, "LININTERD"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } // load textures defined in module status = cuModuleGetTexRef(&texf, *drvmod, "texref_f1_a"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load texture."); } status = cuModuleGetTexRef(&texd, *drvmod, "texref_d1_a"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load texture."); } // no complex function support init = 1; } // mex parameters are: // 1. IN1. Input array // 2. IN2. Input indexes array //IN1 is the input GPU array GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]); //IN2 is the input GPU array GPUtype IN2 = gm->gputype.getGPUtype(prhs[1]); //OUT is the output GPU array (result) // Create of the same size of IN1 gpuTYPE_t in1_t = gm->gputype.getType(IN1); int in1_d = gm->gputype.getNdims(IN1); const int * in1_s = gm->gputype.getSize(IN1); int in1_n = gm->gputype.getNumel(IN1); int in1_b = gm->gputype.getDataSize(IN1); gpuTYPE_t in2_t = gm->gputype.getType(IN2); int in2_d = gm->gputype.getNdims(IN2); const int * in2_s = gm->gputype.getSize(IN2); int in2_n = gm->gputype.getNumel(IN2); if ((in1_t==gpuCFLOAT) || (in1_t==gpuCDOUBLE)) { mexErrMsgTxt("Complex TYPE not supported"); } if (in1_t != in2_t) { mexErrMsgTxt("Input arguments must be of the same type"); } if (in1_n != in2_n) { mexErrMsgTxt("Input arguments must have the same number of elements"); } //OUT is the output GPU array (result) // Create of the same size of IN1 GPUtype OUT = gm->gputype.create(in1_t, in1_d, in1_s, NULL); // I need the pointers to GPU memory CUdeviceptr d_IN1 = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN1)); CUdeviceptr d_IN2 = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN2)); CUdeviceptr d_OUT = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(OUT)); // The GPU kernel depends on the type of input/output CUfunction drvfun; CUtexref drvtex; CUarray_format_enum drvtexformat; int drvtexnum; if (in1_t == gpuFLOAT) { drvfun = drvfunf; drvtex = texf; drvtexformat = CU_AD_FORMAT_FLOAT; drvtexnum = 1; } else if (in1_t == gpuDOUBLE) { drvfun = drvfund; drvtex = texd; drvtexformat = CU_AD_FORMAT_SIGNED_INT32; drvtexnum = 2; } if (CUDA_SUCCESS != cuTexRefSetFormat(drvtex, drvtexformat, drvtexnum)) { mexErrMsgTxt("Execution error (texture)."); } if (CUDA_SUCCESS != cuTexRefSetAddress(NULL, drvtex, UINTPTR d_IN1, in1_n*in1_b)) { mexErrMsgTxt("Execution error (texture)."); } if (CUDA_SUCCESS != cuParamSetTexRef(drvfun, CU_PARAM_TR_DEFAULT, drvtex)) { mexErrMsgTxt("Execution error (texture1)."); } hostdrv_pars_t gpuprhs[2]; int gpunrhs = 2; gpuprhs[0] = hostdrv_pars(&d_IN2,sizeof(d_IN2),__alignof(d_IN2)); gpuprhs[1] = hostdrv_pars(&d_OUT,sizeof(d_OUT),__alignof(d_OUT)); int N = in1_n; hostGPUDRV(drvfun, N, gpunrhs, gpuprhs); // return result plhs[0] = gm->gputype.createMxArray(OUT); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { CUresult cudastatus = CUDA_SUCCESS; if (nrhs != 3) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); // load module CUmodule *drvmod = gmGetModule("subsample"); //load appropriate GPU kernel (mangled name) CUresult status; status = cuModuleGetFunction(&_supersampleMedium_2, *drvmod, "_Z18kSupersampleMediumILi2EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMedium_3, *drvmod, "_Z18kSupersampleMediumILi3EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMedium_4, *drvmod, "_Z18kSupersampleMediumILi4EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMedium_5, *drvmod, "_Z18kSupersampleMediumILi5EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMedium_6, *drvmod, "_Z18kSupersampleMediumILi6EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMedium_7, *drvmod, "_Z18kSupersampleMediumILi7EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMedium_8, *drvmod, "_Z18kSupersampleMediumILi8EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMedium_9, *drvmod, "_Z18kSupersampleMediumILi9EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMedium_10, *drvmod, "_Z18kSupersampleMediumILi10EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMedium_11, *drvmod, "_Z18kSupersampleMediumILi11EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMedium_12, *drvmod, "_Z18kSupersampleMediumILi12EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMedium_13, *drvmod, "_Z18kSupersampleMediumILi13EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMedium_14, *drvmod, "_Z18kSupersampleMediumILi14EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMedium_15, *drvmod, "_Z18kSupersampleMediumILi15EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMedium_16, *drvmod, "_Z18kSupersampleMediumILi16EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_2, *drvmod, "_Z23kSupersampleMediumLoopyILi2EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_3, *drvmod, "_Z23kSupersampleMediumLoopyILi3EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_4, *drvmod, "_Z23kSupersampleMediumLoopyILi4EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_5, *drvmod, "_Z23kSupersampleMediumLoopyILi5EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_6, *drvmod, "_Z23kSupersampleMediumLoopyILi6EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_7, *drvmod, "_Z23kSupersampleMediumLoopyILi7EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_8, *drvmod, "_Z23kSupersampleMediumLoopyILi8EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_9, *drvmod, "_Z23kSupersampleMediumLoopyILi9EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_10, *drvmod, "_Z23kSupersampleMediumLoopyILi10EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_11, *drvmod, "_Z23kSupersampleMediumLoopyILi11EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_12, *drvmod, "_Z23kSupersampleMediumLoopyILi12EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_13, *drvmod, "_Z23kSupersampleMediumLoopyILi13EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_14, *drvmod, "_Z23kSupersampleMediumLoopyILi14EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_15, *drvmod, "_Z23kSupersampleMediumLoopyILi15EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } status = cuModuleGetFunction(&_supersampleMediumLoopy_16, *drvmod, "_Z23kSupersampleMediumLoopyILi16EEvPfS0_ii"); if (CUDA_SUCCESS != status) { mexErrMsgTxt("Unable to load user function."); } init = 1; } // mex parameters are: // 1. IN1 // 2. OUT // 3. supersampling factor bool avoidBankConflicts = true; //hard-coded bool trans = false; //hard-coded //IN1 is the input GPU array GPUtype IN1 = gm->gputype.getGPUtype(prhs[0]); //OUT is the output GPU array (result) GPUtype OUT = gm->gputype.getGPUtype(prhs[1]); //last parameter is the filterSize (int) int factor = (int) mxGetScalar(prhs[2]); // number of elements int nin1 = gm->gputype.getNumel(IN1); int nout = gm->gputype.getNumel(OUT); //dimensions const int * sin1 = gm->gputype.getSize(IN1); const int * sout = gm->gputype.getSize(OUT); int imgPixels = sin1[0]; if ( floor(sqrt(float(imgPixels))) != sqrt(float(imgPixels)) ) mexErrMsgTxt("Images not square"); int imgSize = int(sqrt(imgPixels)); int numImages = sin1[1]; /* if (imgSize <= factor) mexErrMsgTxt("imgSize must be > factor"); */ if (factor > 16) mexErrMsgTxt("factor > 16"); if (factor < 2) mexErrMsgTxt("factor < 2"); if (imgSize > 512) mexErrMsgTxt("max imgSize: 512"); if (imgSize < 1) mexErrMsgTxt("min imgSize: 1"); int targetPixels = sout[0]; if ( floor(sqrt(float(targetPixels))) != sqrt(float(targetPixels)) ) mexErrMsgTxt("Targets not square"); int targetSize = int(sqrt(targetPixels)); if (targetSize % factor !=0) mexErrMsgTxt("imgSize must be evenly divisible by factor"); if (targetSize / factor != imgSize) mexErrMsgTxt("targetSize/ factor must = imgSize"); if (nout != nin1 * factor*factor) mexErrMsgTxt("Target dimensions not consistent"); int threadsX, threadsY; int SHMEM_MAX = 8192; // don't use more than this much shmem int shmemX, shmemY, blocksX, blocksY; bool useLoopy = false; int THREADS_MAX_LOOPY = 512, THREADS_MAX = trans ? 256 : 512; if (!trans) { threadsX = imgSize; threadsY = factor * MIN(THREADS_MAX / (factor*threadsX), SHMEM_MAX / (4*threadsX*factor)); // to avoid running out of shmem if(threadsY == 0) { if (factor > 32) mexErrMsgTxt("factor can't be > 32"); //assert(factor <= 32); // yes this is covered by assert above but in case i ever remove that THREADS_MAX = 512; useLoopy = true; threadsX = MIN(16, imgSize); // not that imgsize can be < 16 here under current conditions threadsY = factor * MIN(THREADS_MAX_LOOPY / (factor*threadsX), SHMEM_MAX / (4*threadsX*factor)); // to avoid running out of shmem } shmemY = threadsY; shmemX = threadsX; blocksX = imgSize; blocksY = DIVUP(numImages, threadsY); // printf("boundary problems: %u\n", numImages % threadsY != 0); } else { threadsY = imgSize; threadsX = factor * MIN(THREADS_MAX / (factor*threadsY), SHMEM_MAX / (4*threadsY*factor)); // to avoid running out of shmem if(threadsX < 8) { useLoopy = true; int xFactorMult = DIVUP(16, factor); threadsX = xFactorMult * factor; threadsY = THREADS_MAX / threadsX; int newThreadsX = threadsX, newThreadsY = threadsY; while (newThreadsY > 0 && imgSize % newThreadsY != 0) { // let's see if we can make threadsY divide imgSize newThreadsX += factor; newThreadsY = THREADS_MAX / newThreadsX; } if (newThreadsY > 0) { threadsY = newThreadsY; threadsX = newThreadsX; } if (threadsY <=0) mexErrMsgTxt("threadsY <=0; not expected"); //assert(threadsY > 0); } shmemY = threadsX; shmemX = threadsY + (1 - (threadsY % 2)); blocksX = DIVUP(numImages, threadsX); blocksY = imgSize; // printf("boundary problems: %u\n", numImages % threadsX != 0); } int shmem = 4 * shmemX * shmemY; if (shmem == 0 || shmem > 16300) { // this really shouldn't happen and i've only put this here as a precautionary measure // to avoid getting mysteriously wrong results. mexErrMsgTxt("supersample: not enough shared memory!"); //exit(EXIT_FAILURE); } dim3 grid(blocksX, blocksY); dim3 threads(threadsX, threadsY); //mexPrintf("blocks: %dx%d, threads: %dx%d\n", blocksY, blocksX, threadsY, threadsX); //mexPrintf("using %dx%d = %d bytes of shmem\n", shmemY, shmemX, shmem); gpuTYPE_t tin1 = gm->gputype.getType(IN1); gpuTYPE_t tout = gm->gputype.getType(OUT); // check input/out size and type if (tin1!=tout) mexErrMsgTxt("Input and output arguments must be of the same type."); // I need the pointers to GPU memory CUdeviceptr d_IN1 = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(IN1)); CUdeviceptr d_OUT = (CUdeviceptr) (UINTPTR gm->gputype.getGPUptr(OUT)); // // The GPU kernel depends on the type of input/output // CUfunction drvfun; // if (tin1 == gpuFLOAT) { // drvfun = drvfunf; // } else // mexErrMsgTxt("Currently only single types supported."); hostdrv_pars_t gpuprhs[2]; int gpunrhs = 2; gpuprhs[0] = hostdrv_pars(&d_IN1,sizeof(d_IN1)); gpuprhs[1] = hostdrv_pars(&d_OUT,sizeof(d_OUT)); /* trans not implemented; so always !trans if(!trans) { */ if(!useLoopy) { if(factor == 2) { hostDriver(_supersampleMedium_2, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); //kSupersampleMedium<2><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), imgSize, numImages*imgSize); } else if(factor == 3) { hostDriver(_supersampleMedium_3, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 4) { hostDriver(_supersampleMedium_4, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 5) { hostDriver(_supersampleMedium_5, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 6) { hostDriver(_supersampleMedium_6, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 7) { hostDriver(_supersampleMedium_7, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 8) { hostDriver(_supersampleMedium_8, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 9) { hostDriver(_supersampleMedium_9, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 10) { hostDriver(_supersampleMedium_10, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 11) { hostDriver(_supersampleMedium_11, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 12) { hostDriver(_supersampleMedium_12, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 13) { hostDriver(_supersampleMedium_13, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 14) { hostDriver(_supersampleMedium_14, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 15) { hostDriver(_supersampleMedium_15, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 16) { hostDriver(_supersampleMedium_16, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } } else { if(factor == 2) { hostDriver(_supersampleMediumLoopy_2, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 3) { hostDriver(_supersampleMediumLoopy_3, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 4) { hostDriver(_supersampleMediumLoopy_4, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 5) { hostDriver(_supersampleMediumLoopy_5, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 6) { hostDriver(_supersampleMediumLoopy_6, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 7) { hostDriver(_supersampleMediumLoopy_7, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 8) { hostDriver(_supersampleMediumLoopy_8, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 9) { hostDriver(_supersampleMediumLoopy_9, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 10) { hostDriver(_supersampleMediumLoopy_10, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 11) { hostDriver(_supersampleMediumLoopy_11, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 12) { hostDriver(_supersampleMediumLoopy_12, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 13) { hostDriver(_supersampleMediumLoopy_13, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 14) { hostDriver(_supersampleMediumLoopy_14, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 15) { hostDriver(_supersampleMediumLoopy_15, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } else if(factor == 16) { hostDriver(_supersampleMediumLoopy_16, grid, threads, shmem, imgSize, numImages*imgSize, gpunrhs, gpuprhs); } } /* } else { if(!useLoopy) { if(factor == 2) { kSupersampleMediumTrans<2><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 3) { kSupersampleMediumTrans<3><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 4) { kSupersampleMediumTrans<4><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 5) { kSupersampleMediumTrans<5><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 6) { kSupersampleMediumTrans<6><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 7) { kSupersampleMediumTrans<7><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 8) { kSupersampleMediumTrans<8><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 9) { kSupersampleMediumTrans<9><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 10) { kSupersampleMediumTrans<10><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 11) { kSupersampleMediumTrans<11><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 12) { kSupersampleMediumTrans<12><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 13) { kSupersampleMediumTrans<13><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 14) { kSupersampleMediumTrans<14><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 15) { kSupersampleMediumTrans<15><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 16) { kSupersampleMediumTrans<16><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } } else { if(factor == 2) { kSupersampleMediumTransLoopy<2><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 3) { kSupersampleMediumTransLoopy<3><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 4) { kSupersampleMediumTransLoopy<4><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 5) { kSupersampleMediumTransLoopy<5><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 6) { kSupersampleMediumTransLoopy<6><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 7) { kSupersampleMediumTransLoopy<7><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 8) { kSupersampleMediumTransLoopy<8><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 9) { kSupersampleMediumTransLoopy<9><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 10) { kSupersampleMediumTransLoopy<10><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 11) { kSupersampleMediumTransLoopy<11><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 12) { kSupersampleMediumTransLoopy<12><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 13) { kSupersampleMediumTransLoopy<13><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 14) { kSupersampleMediumTransLoopy<14><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 15) { kSupersampleMediumTransLoopy<15><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } else if(factor == 16) { kSupersampleMediumTransLoopy<16><<<grid, threads, shmem>>>(images->getDevData(), targets->getDevData(), numImages*imgSize, imgSize, shmemX); } } } */ }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { CUresult cudastatus = CUDA_SUCCESS; // more than 4 arguments expected if (nrhs < 4) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); // check gm gmCheckGPUmat(gm); // load module // NO MODULE REQUIRED // load float GPU function // NO FUNCTION REQUIRED init = 1; } // log gm->debug.log("> ASSIGN\n",0); gm->debug.logPush(); // mex parameters are: // dir: direction. Range is applied to the left or the right // LHS: GPUtype variable (left hand side) // RHS: GPUtype variable (right hand side) // or Matlab array (converted to GPUtype) // ...: variable number of arguments after 'dir' representing the range if (mxGetClassID(prhs[0]) != mxDOUBLE_CLASS) { mexErrMsgTxt(ERROR_ASSIGN_FIRSTARG); } int dir = (int) mxGetScalar(prhs[0]); GPUtype LHS = gm->gputype.getGPUtype(prhs[1]); if (gm->comp.getCompileMode() == 1) { GPUtype RHS; if ((mxGetClassID(prhs[2]) == mxDOUBLE_CLASS)||(mxGetClassID(prhs[2]) == mxSINGLE_CLASS)) { // compile this option // create dummy RHS RHS = gm->gputype.create(gpuFLOAT, 0, NULL, NULL); gm->comp.pushGPUtype(&RHS); gm->comp.functionStart("GPUMAT_mxToGPUtype"); gm->comp.functionSetParamGPUtype(&RHS); gm->comp.functionSetParamMx(prhs[2]); gm->comp.functionEnd(); //RHS = gm->gputype.mxToGPUtype(prhs[2]); } else { RHS = gm->gputype.getGPUtype(prhs[2]); } gm->comp.functionStart("GPUMAT_mxAssign"); gm->comp.functionSetParamGPUtype(&LHS); gm->comp.functionSetParamGPUtype(&RHS); gm->comp.functionSetParamInt(dir); gm->comp.functionSetParamMxMx(nrhs-3, &prhs[3]); gm->comp.functionEnd(); } else { GPUtype RHS; // convert Matlab array to GPUtype if ((mxGetClassID(prhs[2]) == mxDOUBLE_CLASS)||(mxGetClassID(prhs[2]) == mxSINGLE_CLASS)) { RHS = gm->gputype.mxToGPUtype(prhs[2]); } else { RHS = gm->gputype.getGPUtype(prhs[2]); } gm->aux.mxAssign(LHS, RHS, dir, nrhs-3, &prhs[3] ); } gm->debug.logPop(); }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { CUresult cudastatus = CUDA_SUCCESS; // no more than 2 arguments expected if (nrhs > 2) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function //mexLock(); // load GPUmat gm = gmGetGPUmat(); // check gm gmCheckGPUmat(gm); // load module // NO MODULE REQUIRED // load float GPU function // NO FUNCTION REQUIRED init = 1; } // log gm->debug.log("> SIZE\n",0); gm->debug.logPush(); if (gm->comp.getCompileMode() == 1) { mexWarnMsgTxt(WARNING_NUMERICS_COMPNOTIMPLEMENTED); } // mex parameters are: // IN: GPUtype variable GPUtype IN = gm->gputype.getGPUtype(prhs[0]); const int *in_size = gm->gputype.getSize(IN); int in_ndims = gm->gputype.getNdims(IN); // 2 cases // 1. s = size(A) // 2. s = size(A,dim) if (nrhs == 1) { // 2 cases: // 1. s = size(A) // 2. [a,b,c,...] = size(A) if (nlhs<=1) { // 1. s = size(A) // create output plhs[0] plhs[0] = mxCreateDoubleMatrix(1, in_ndims, mxREAL); // fill in plhs[0] with IN dimensions double *plhs_size = mxGetPr(plhs[0]); for (int i = 0; i < in_ndims; i++) plhs_size[i] = (double) in_size[i]; } else { // 2. [a,b,c,...] = size(A) for (int i=0;i<nlhs;i++) { // create output // create output plhs[i] int r = 1; // if i is greater than IN dims return 1 if (i>(in_ndims-1)) { // r = 1 } else { r = in_size[i]; } plhs[i] = mxCreateDoubleScalar(r); } } } else { // retrieve dim int dim = (int) mxGetScalar(prhs[1]); int r = 1; // if dim is greater than IN dims return 1 if (dim>in_ndims) { // r = 1 } else { r = in_size[dim-1]; } // create output plhs[0] plhs[0] = mxCreateDoubleScalar(r); } }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { // At least 2 arguments expected // Input and result if (nrhs!=6) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { // Initialize function // mexLock(); // load GPUmat gm = gmGetGPUmat(); init = 1; } /* mex parameters are: 0 Source array [X or one of the y-edge-zeroed copies of it] 1 Destination array [Accumulator, basically] 2 Stack array 1 [We swap XY or YZ planes with this before copying to assure clean shift] - Must be all zeroes and of size max(NxNy, NyNz, NxNz) 3 Stack array 2 4 Shift directions 5 Coefficient on shift */ // Get GPU array pointers GPUtype srcArray = gm->gputype.getGPUtype(prhs[0]); GPUtype dstArray = gm->gputype.getGPUtype(prhs[1]); GPUtype stackArrayX = gm->gputype.getGPUtype(prhs[2]); //GPUtype stackArrayY = gm->gputype.getGPUtype(prhs[3]); GPUtype stackArrayZ = gm->gputype.getGPUtype(prhs[3]); // Get some control variables sorted out double *shiftdirs = mxGetPr(prhs[4]); const int *dims = gm->gputype.getSize(srcArray); double alpha = *mxGetPr(prhs[5]); int shifts[3]; shifts[0] = (int)shiftdirs[0]; shifts[1] = (int)shiftdirs[1]; shifts[2] = (int)shiftdirs[2]; double *cubSrc = (double*)gm->gputype.getGPUptr(srcArray); // Remove appropriate YZ plane if any double *cubDst = (double*)gm->gputype.getGPUptr(stackArrayX); if(shifts[0] == -1) cublasDswap(dims[1]*dims[2], cubSrc, dims[0], cubDst, 1); if(shifts[0] == 1) cublasDswap(dims[1]*dims[2], cubSrc + dims[0]-1, dims[0], cubDst, 1); // Remove appropriate XZ plane if any //stackSwapXZplane(cubSrc, (double*)gm->gputype.getGPUptr(stackArrayY), (int *)dims, shifts); // Remove appropriate XY plane if any cubDst = (double*)gm->gputype.getGPUptr(stackArrayZ); if(shifts[2] == -1) cublasDswap(dims[0]*dims[1], cubSrc, 1, cubDst, 1); if(shifts[2] == 1) cublasDswap(dims[0]*dims[1], cubSrc + dims[0]*dims[1]*(dims[2]-1), 1, cubDst, 1); // Decide the amount of offset to acheive desired shift int theta = shifts[0] + dims[0]*shifts[1] + dims[0]*dims[1]*shifts[2]; int Ntot = dims[0] * dims[1] * dims[2]; cubDst = (double*)gm->gputype.getGPUptr(dstArray); if(theta >= 0) { cublasDaxpy(Ntot-theta, alpha, cubSrc, 1, cubDst + theta, 1); } else { cublasDaxpy(Ntot+theta, alpha, cubSrc - theta, 1, cubDst, 1); } // Replace the XY plane if it was removed cubDst = (double*)gm->gputype.getGPUptr(stackArrayZ); if(shifts[2] == -1) cublasDswap(dims[0]*dims[1], cubSrc, 1, cubDst, 1); if(shifts[2] == 1) cublasDswap(dims[0]*dims[1], cubSrc + dims[0]*dims[1]*(dims[2]-1), 1, cubDst, 1); // replace the XZ plane if it was removed //stackSwapXZplane(cubSrc, (double*)gm->gputype.getGPUptr(stackArrayY), (int *)dims, shifts); // Replace the YZ plane if it was removed cubDst = (double*)gm->gputype.getGPUptr(stackArrayX); if(shifts[0] == -1) cublasDswap(dims[1]*dims[2], cubSrc, dims[0], cubDst, 1); if(shifts[0] == 1) cublasDswap(dims[1]*dims[2], cubSrc + dims[0]-1, dims[0], cubDst, 1); }