/*
 * The matlab mex function.
 *
 * Inputs (exactly seven are required):
 *   prhs[0..2]  Ox, Oy, Oz : grid point arrays (read as double). The grid
 *                            size is taken from the dimensions of prhs[0].
 *   prhs[3]     Isize      : three doubles giving the size of the output volume.
 *   prhs[4..6]  dx, dy, dz : spacing of the uniform b-spline grid (only the
 *                            first element of each is used).
 * Output:
 *   plhs[0]     double volume of size Isize. Its data pointer is handed to
 *               the worker threads running transformvolume_jacobiandet.
 *
 * The gateway builds look-up tables of the b-spline coefficients and of their
 * derivatives (divided by the spacing), starts as many worker threads as
 * maxNumCompThreads reports, waits for all of them and releases every buffer
 * it allocated.
 */
void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] )
{
    /* Grid points, output size and grid spacing as passed in from MATLAB */
    double *Ox, *Oy, *Oz, *Isize_d, *dxa, *dya, *dza;
    /* Data of the output volume */
    double *Iout;

    /* Used to ask MATLAB for the allowed number of threads */
    mxArray *matlabCallOut[1]={0};
    mxArray *matlabCallIn[1]={0};
    double *Nthreadsd;
    int Nthreads;

    /* Per-thread argument tables, thread ids and thread handles */
    double ***ThreadArgs=NULL;
    double **ThreadID=NULL;
    ThreadHANDLE *ThreadList=NULL;
    /* Number of worker threads that were really started */
    int Nstarted=0;
    /* Set as soon as one allocation fails */
    int failed=0;

    /* Size of the output volume and of the grid */
    mwSize dims[3];
    const mwSize *dimso;
    double Osize_d[3]={0,0,0};

    /* B-spline look-up tables (values and derivatives) */
    double *Bu=NULL, *Bv=NULL, *Bw=NULL;
    double *Bdu=NULL, *Bdv=NULL, *Bdw=NULL;
    double t;

    /* Loop variables */
    int i, k;
    /* Grid distance */
    int dx, dy, dz;

    /* nlhs is not needed: plhs[0] is always created */
    (void)nlhs;

    /* Check for proper number of arguments. */
    if(nrhs!=7) {
        mexErrMsgTxt("Seven inputs are required.");
    }

    /* Get the sizes of the grid.
       NOTE(review): dimso[2] is read without checking that prhs[0] really has
       three dimensions - confirm that callers always pass a 3-D grid. */
    dimso = mxGetDimensions(prhs[0]);
    Osize_d[0]=(double)dimso[0];
    Osize_d[1]=(double)dimso[1];
    Osize_d[2]=(double)dimso[2];

    /* Assign pointers to each input. */
    Ox=(double *)mxGetData(prhs[0]);
    Oy=(double *)mxGetData(prhs[1]);
    Oz=(double *)mxGetData(prhs[2]);
    Isize_d=(double *)mxGetData(prhs[3]);
    dxa=(double *)mxGetData(prhs[4]);
    dya=(double *)mxGetData(prhs[5]);
    dza=(double *)mxGetData(prhs[6]);

    /* Get the spacing of the uniform b-spline grid. A spacing below one would
       give empty look-up tables, so reject it before anything is allocated. */
    dx=(int)dxa[0]; dy=(int)dya[0]; dz=(int)dza[0];
    if(dx<1 || dy<1 || dz<1) {
        mexErrMsgTxt("Grid spacing must be at least 1.");
    }

    /* Create the output volume with the requested size */
    dims[0]=(mwSize)Isize_d[0];
    dims[1]=(mwSize)Isize_d[1];
    dims[2]=(mwSize)Isize_d[2];
    plhs[0] = mxCreateNumericArray(3, dims, mxDOUBLE_CLASS, mxREAL);
    Iout = (double *)mxGetData(plhs[0]);

    /* Get number of allowed threads */
    mexCallMATLAB(1, matlabCallOut, 0, matlabCallIn, "maxNumCompThreads");
    Nthreadsd=mxGetPr(matlabCallOut[0]);
    Nthreads=(int)Nthreadsd[0];
    if(Nthreads<1) {
        mexErrMsgTxt("Invalid number of computation threads.");
    }

    /* Reserve room for the thread bookkeeping. The pointer tables are zeroed
       so that the clean-up below can free them unconditionally. */
    ThreadList = (ThreadHANDLE *)malloc(Nthreads*sizeof(ThreadHANDLE));
    ThreadID = (double **)calloc(Nthreads, sizeof(double *));
    ThreadArgs = (double ***)calloc(Nthreads, sizeof(double **));

    /* Reserve room for the polynomial look-up tables */
    Bu=(double *)malloc(dx*4*sizeof(double));
    Bv=(double *)malloc(dy*4*sizeof(double));
    Bw=(double *)malloc(dz*4*sizeof(double));
    Bdu=(double *)malloc(dx*4*sizeof(double));
    Bdv=(double *)malloc(dy*4*sizeof(double));
    Bdw=(double *)malloc(dz*4*sizeof(double));

    if(ThreadList==NULL || ThreadID==NULL || ThreadArgs==NULL ||
       Bu==NULL || Bv==NULL || Bw==NULL || Bdu==NULL || Bdv==NULL || Bdw==NULL) {
        failed=1;
    }

    if(!failed)
    {
        /* Make polynomial look up tables: four coefficients per offset */
        for (i=0; i<dx; i++)
        {
            t=((double)i/(double)dx)-floor((double)i/(double)dx);
            for (k=0; k<4; k++)
            {
                Bu[mindex2(k,i,4)] = BsplineCoefficient(t,k);
                Bdu[mindex2(k,i,4)] = BsplineCoefficientDerivative(t,k)/dxa[0];
            }
        }

        for (i=0; i<dy; i++)
        {
            t=((double)i/(double)dy)-floor((double)i/(double)dy);
            for (k=0; k<4; k++)
            {
                Bv[mindex2(k,i,4)] = BsplineCoefficient(t,k);
                Bdv[mindex2(k,i,4)] = BsplineCoefficientDerivative(t,k)/dya[0];
            }
        }

        for (i=0; i<dz; i++)
        {
            t=((double)i/(double)dz)-floor((double)i/(double)dz);
            for (k=0; k<4; k++)
            {
                Bw[mindex2(k,i,4)] = BsplineCoefficient(t,k);
                Bdw[mindex2(k,i,4)] = BsplineCoefficientDerivative(t,k)/dza[0];
            }
        }

        /* Build the 17-entry argument table of every thread and start it */
        for (i=0; i<Nthreads; i++)
        {
            double **args;

            ThreadID[i] = (double *)malloc(sizeof(double));
            ThreadArgs[i] = (double **)malloc(17*sizeof(double *));
            if(ThreadID[i]==NULL || ThreadArgs[i]==NULL) { failed=1; break; }

            /*  Make Thread ID  */
            ThreadID[i][0]=(double)i;

            /*  Make Thread Structure  */
            args=ThreadArgs[i];
            args[0]=Bu;
            args[1]=Bv;
            args[2]=Bw;
            args[3]=Isize_d;
            args[4]=Osize_d;
            args[5]=Iout;
            args[6]=dxa;
            args[7]=dya;
            args[8]=dza;
            args[9]=ThreadID[i];
            args[10]=Ox;
            args[11]=Oy;
            args[12]=Oz;
            args[13]=Nthreadsd;
            args[14]=Bdu;
            args[15]=Bdv;
            args[16]=Bdw;

            Nstarted=i+1;
            /* The macro is written without a trailing semicolon, as everywhere in this file */
            StartThread(ThreadList[i], &transformvolume_jacobiandet, ThreadArgs[i])
        }

        /* Only wait for the threads that were really started */
        for (i=0; i<Nstarted; i++) { WaitForThreadFinish(ThreadList[i]); }
    }

    /* Release everything; free() accepts the NULL entries left by a failure */
    if(ThreadArgs!=NULL) {
        for (i=0; i<Nthreads; i++) { free(ThreadArgs[i]); }
    }
    if(ThreadID!=NULL) {
        for (i=0; i<Nthreads; i++) { free(ThreadID[i]); }
    }
    free(ThreadArgs);
    free(ThreadID);
    free(ThreadList);

    free(Bu);
    free(Bv);
    free(Bw);
    free(Bdu);
    free(Bdv);
    free(Bdw);

    /* Report the failure last: mexErrMsgTxt does not return to this function */
    if(failed) {
        mexErrMsgTxt("Out of memory.");
    }
}
/*
 * The matlab mex function.
 *
 * Inputs (exactly five are required):
 *   prhs[0], prhs[1]  Ox, Oy : grid point arrays; the grid size is the
 *                              M-by-N size of prhs[0].
 *   prhs[2]           Isize  : image size, the first two elements are used.
 *   prhs[3], prhs[4]  dx, dy : spacing of the b-spline knots (first element used).
 * Outputs:
 *   plhs[0]  1x1 double: the sum of the per-thread error values divided by
 *            Isize[0]*Isize[1].
 *   plhs[1]  (only when nlhs>1) M-by-N-by-2 double: the per-thread gradient
 *            buffers summed over all threads and divided by
 *            Isize[0]*Isize[1]*step.
 *
 * The worker threads run jacobian_errorgradient when a gradient is requested
 * and jacobian_error otherwise.
 */
void mexFunction( int nlhs, mxArray *plhs[],
        int nrhs, const mxArray *prhs[] ) {
    /* Grid points, image size and knot spacing as passed in from MATLAB */
    double *Ox, *Oy, *Isize_d, *dxa, *dya;
    /* Output data pointers (Egradient stays NULL when no gradient is asked) */
    double *E, *Egradient=NULL;
    /* Per-thread result buffers */
    double *ThreadErrorOut=NULL, *ThreadGradientOutX=NULL, *ThreadGradientOutY=NULL;
    /* Used to ask MATLAB for the allowed number of threads */
    mxArray *matlabCallOut[1]={0};
    mxArray *matlabCallIn[1]={0};
    double *Nthreadsd;
    int Nthreads;
    /* Finite difference step size */
    double step=0.001;
    /* index offsets */
    int offset1;
    /* Per-thread argument tables, thread ids and thread handles */
    double ***ThreadArgs=NULL;
    double **ThreadID=NULL;
    ThreadHANDLE *ThreadList=NULL;
    /* Number of worker threads that were really started */
    int Nstarted=0;
    /* Set as soon as one allocation fails */
    int failed=0;
    /* Dims outputs; mxCreateNumericArray reads them as mwSize values */
    const mwSize dims_error[2]={1, 1};
    mwSize dims_error_gradient[3]={1, 1, 2};
    /* Size of grid */
    mwSize  Osizex, Osizey;
    int Onumel;
    double Inumel;
    double Osize_d[2]={0, 0};
    /* Number of doubles in one gradient scratch buffer */
    size_t gradientCount=0;
    /* B-spline look-up tables (values and derivatives) */
    double *Bu=NULL, *Bv=NULL, *Bdu=NULL, *Bdv=NULL;
    double t;
    /* Loop variables  */
    int i, j, k;
    /* Grid distance */
    int dx, dy;

    /* Check for proper number of arguments. */
    if(nrhs!=5) {
        mexErrMsgTxt("Five inputs are required.");
    }

    /* Get the sizes of the grid */
    Osizex = (mwSize)mxGetM(prhs[0]);
    Osizey = (mwSize)mxGetN(prhs[0]);

    /* Assign pointers to each input. */
    Ox=mxGetPr(prhs[0]);
    Oy=mxGetPr(prhs[1]);
    Isize_d=mxGetPr(prhs[2]);
    dxa=mxGetPr(prhs[3]);
    dya=mxGetPr(prhs[4]);

    Onumel = (int)(Osizex*Osizey);
    Inumel = Isize_d[0]*Isize_d[1];

    /* Get the spacing of the uniform b-spline grid. A spacing below one would
       give empty look-up tables, so reject it before anything is allocated. */
    dx=(int)dxa[0]; dy=(int)dya[0];
    if(dx<1 || dy<1) {
        mexErrMsgTxt("Grid spacing must be at least 1.");
    }

    /* Create image matrix for the Error return argument  */
    plhs[0] = mxCreateNumericArray(2, dims_error, mxDOUBLE_CLASS, mxREAL);
    E = mxGetPr(plhs[0]);
    if(nlhs>1) {
        /* Error Gradient needed */
        dims_error_gradient[0]=Osizex;
        dims_error_gradient[1]=Osizey;
        plhs[1] = mxCreateNumericArray(3, dims_error_gradient, mxDOUBLE_CLASS, mxREAL);
        Egradient = mxGetPr(plhs[1]);
    }

    /* Get number of allowed threads */
    mexCallMATLAB(1, matlabCallOut, 0, matlabCallIn, "maxNumCompThreads");
    Nthreadsd=mxGetPr(matlabCallOut[0]);
    Nthreads=(int)Nthreadsd[0];
    if(Nthreads<1) {
        mexErrMsgTxt("Invalid number of computation threads.");
    }

    /* Reserve room for the thread bookkeeping. The pointer tables are zeroed
       so that the clean-up below can free them unconditionally. */
    ThreadList = (ThreadHANDLE *)malloc(Nthreads*sizeof(ThreadHANDLE));
    ThreadID = (double **)calloc(Nthreads, sizeof(double *));
    ThreadArgs = (double ***)calloc(Nthreads, sizeof(double **));
    ThreadErrorOut = (double *)malloc(Nthreads*sizeof(double));

    /* Gradient scratch space is only needed when the gradient is returned;
       otherwise the threads receive NULL pointers. */
    if(nlhs>1) {
        gradientCount=(size_t)Nthreads*(size_t)Onumel;
        ThreadGradientOutX=(double *)malloc(gradientCount*sizeof(double));
        ThreadGradientOutY=(double *)malloc(gradientCount*sizeof(double));
    }

    /* Reserve room for the polynomial look-up tables */
    Bu=(double *)malloc(dx*4*sizeof(double));
    Bv=(double *)malloc(dy*4*sizeof(double));
    Bdu=(double *)malloc(dx*4*sizeof(double));
    Bdv=(double *)malloc(dy*4*sizeof(double));

    if(ThreadList==NULL || ThreadID==NULL || ThreadArgs==NULL || ThreadErrorOut==NULL ||
       Bu==NULL || Bv==NULL || Bdu==NULL || Bdv==NULL) {
        failed=1;
    }
    /* An empty grid asks for zero bytes, for which malloc may return NULL */
    if(gradientCount>0 && (ThreadGradientOutX==NULL || ThreadGradientOutY==NULL)) {
        failed=1;
    }

    if(!failed) {
        /* Make polynomial look up tables: four coefficients per offset */
        for (i=0; i<dx; i++) {
            t=(i/(double)dx)-floor(i/(double)dx);
            for (k=0; k<4; k++) {
                Bu[mindex2(k, i, 4)] = BsplineCoefficient(t, k);
                Bdu[mindex2(k, i, 4)] = BsplineCoefficientDerivative(t, k)/dxa[0];
            }
        }

        for (i=0; i<dy; i++) {
            t=(i/(double)dy)-floor(i/(double)dy);
            for (k=0; k<4; k++) {
                Bv[mindex2(k, i, 4)] = BsplineCoefficient(t, k);
                Bdv[mindex2(k, i, 4)] = BsplineCoefficientDerivative(t, k)/dya[0];
            }
        }

        Osize_d[0]=Osizex;  Osize_d[1]=Osizey;

        /* Build the 15-entry argument table of every thread and start it */
        for (i=0; i<Nthreads; i++) {
            double **args;

            ThreadID[i] = (double *)malloc(sizeof(double));
            ThreadArgs[i] = (double **)malloc(15*sizeof(double *));
            if(ThreadID[i]==NULL || ThreadArgs[i]==NULL) { failed=1; break; }

            /*  Make Thread ID  */
            ThreadID[i][0]=i;

            /*  Make Thread Structure  */
            args=ThreadArgs[i];
            args[0]=Bu;
            args[1]=Bv;
            args[2]=Isize_d;
            args[3]=Osize_d;
            args[4]=ThreadErrorOut;
            args[5]=dxa;
            args[6]=dya;
            args[7]=ThreadID[i];
            args[8]=Ox;
            args[9]=Oy;
            args[10]=Nthreadsd;
            args[11]=Bdu;
            args[12]=Bdv;
            args[13]=ThreadGradientOutX;
            args[14]=ThreadGradientOutY;

            Nstarted=i+1;
            /* The macro is written without a trailing semicolon, as everywhere in this file */
            if(nlhs>1)
            {
                StartThread(ThreadList[i], &jacobian_errorgradient, ThreadArgs[i])
            }
            else
            {
                StartThread(ThreadList[i], &jacobian_error, ThreadArgs[i])
            }
        }

        /* Only wait for the threads that were really started */
        for (i=0; i<Nstarted; i++) { WaitForThreadFinish(ThreadList[i]); }
    }

    if(!failed) {
        /* Add accumlated error of all threads */
        E[0]=0;
        for (i=0; i<Nthreads; i++)
        {
            E[0]+=ThreadErrorOut[i];
        }
        E[0]/=Inumel;

        if(nlhs>1) {
            /* plhs[1] starts zeroed, so the per-thread buffers can be summed into it */
            for (i=0; i<Nthreads; i++) {
                offset1=i*Onumel;
                for(j=0; j<Onumel; j++) {
                    Egradient[j]+=ThreadGradientOutX[j+offset1];
                    Egradient[j+Onumel]+=ThreadGradientOutY[j+offset1];
                }
            }
            for(j=0; j<Onumel; j++) {
                Egradient[j]/=Inumel*step;
                Egradient[j+Onumel]/=Inumel*step;
            }
        }
    }

    /* Release everything; free() accepts the NULL entries left by a failure */
    if(ThreadArgs!=NULL) {
        for (i=0; i<Nthreads; i++) { free(ThreadArgs[i]); }
    }
    if(ThreadID!=NULL) {
        for (i=0; i<Nthreads; i++) { free(ThreadID[i]); }
    }

    free(ThreadErrorOut);
    free(ThreadGradientOutX);
    free(ThreadGradientOutY);

    free(ThreadArgs);
    free(ThreadID);
    free(ThreadList);
    free(Bu);
    free(Bdu);
    free(Bv);
    free(Bdv);

    /* Report the failure last: mexErrMsgTxt does not return to this function */
    if(failed) {
        mexErrMsgTxt("Out of memory.");
    }
}
/*
 * The matlab mex function.
 *
 * Inputs (at least four are required):
 *   prhs[0]  Iin  : input image; it is treated as a 3-channel image when it
 *                   has more than two dimensions, as single-channel otherwise.
 *   prhs[1]  Tx   : translation of every pixel along the first dimension.
 *   prhs[2]  Ty   : translation of every pixel along the second dimension.
 *   prhs[3]  mode : first element selects the processing path (see below).
 *   prhs[4]  size : used only when exactly five inputs are given; its first
 *                   two elements replace the output width and height.
 * Output (exactly one is required):
 *   plhs[0]  double image of the output size (with a third dimension only
 *            for the 3-channel case).
 *
 * When mode equals 4 the translations are turned into absolute coordinates
 * (x+Tx, y+Ty) and interpolate_forward_2d_double is called directly.
 * For any other mode, worker threads running transformvolume_color or
 * transformvolume_gray are started, one per allowed computation thread.
 */
void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[] )
{
    /* I is the input image, Iout the transformed image  */
    /* Tx and Ty images of the translation of every pixel. */
    double *Iin, *Iout, *Tx, *Ty;
    double *ImageSizeT=NULL;
    double *moded;
    /* Used to ask MATLAB for the allowed number of threads */
    mxArray *matlabCallOut[1]={0};
    mxArray *matlabCallIn[1]={0};
    double *Nthreadsd;
    int Nthreads;
    /* Absolute sample coordinates used by the mode 4 path */
    double *Tlocalx=NULL, *Tlocaly=NULL;
    size_t Npixels;
    int x,y;
    int index;

    /* Per-thread argument tables, thread ids and thread handles */
    double ***ThreadArgs=NULL;
    double **ThreadID=NULL;
    ThreadHANDLE *ThreadList=NULL;
    /* Number of worker threads that were really started */
    int Nstarted=0;
    /* Set as soon as one allocation fails */
    int failed=0;

    /* Size of input image */
    const mwSize *dims;
    double Isize_d[3]={0,0,0};
    int Isize[3]={1,1,1};

    /* Size of output image */
    double ImageSize_d[3]={0,0,0};
    int ImageSize[3]={1,1,1};
    /* Same size as mwSize values, the type mxCreateNumericArray reads */
    mwSize OutDims[3]={1,1,1};

    /* Loop variable  */
    int i;

    /* Check for proper number of arguments. */
    if(nrhs<4) {
      mexErrMsgTxt("Four inputs are required.");
    } else if(nlhs!=1) {
      mexErrMsgTxt("One output required");
    }

    /* Get the sizes of the image */
    dims = mxGetDimensions(prhs[0]);
    Isize_d[0] = (double)dims[0]; Isize_d[1] = (double)dims[1];
    /* Detect if color image */
    if(mxGetNumberOfDimensions(prhs[0])>2) { Isize_d[2]=(double)3; } else { Isize_d[2]=1; }
    Isize[0]=(int)Isize_d[0];
    Isize[1]=(int)Isize_d[1];
    Isize[2]=(int)Isize_d[2];

    /* Assign pointers to each input. */
    Iin=mxGetPr(prhs[0]);
    Tx=mxGetPr(prhs[1]);
    Ty=mxGetPr(prhs[2]);
    moded=mxGetPr(prhs[3]);

    /* The output has the size of the input image unless a fifth argument
       gives another width and height; the channel count never changes. */
    ImageSize_d[0]=Isize_d[0];
    ImageSize_d[1]=Isize_d[1];
    ImageSize_d[2]=Isize_d[2];
    if(nrhs==5)
    {
        ImageSizeT=mxGetPr(prhs[4]);
        ImageSize_d[0]=ImageSizeT[0];
        ImageSize_d[1]=ImageSizeT[1];
    }
    for (i=0; i<3; i++)
    {
        ImageSize[i]=(int)ImageSize_d[i];
        OutDims[i]=(mwSize)ImageSize[i];
    }

    /* Create image matrix for the return argument */
    if(Isize_d[2]>1) {
        plhs[0] = mxCreateNumericArray(3, OutDims, mxDOUBLE_CLASS, mxREAL);
    }
    else  {
        plhs[0] = mxCreateNumericArray(2, OutDims, mxDOUBLE_CLASS, mxREAL);
    }

    /* Assign pointer to output. */
    Iout = mxGetPr(plhs[0]);

    if(moded[0]==4)
    {
        Npixels=(size_t)Isize[0]*(size_t)Isize[1];
        Tlocalx=(double*)malloc(Npixels*sizeof(double));
        Tlocaly=(double*)malloc(Npixels*sizeof(double));
        /* An empty image asks for zero bytes, for which malloc may return NULL */
        if(Npixels>0 && (Tlocalx==NULL || Tlocaly==NULL)) {
            failed=1;
        }

        if(!failed)
        {
            /* Turn the per-pixel translations into absolute coordinates */
            for (y=0; y<Isize[1]; y++)
            {
                for (x=0; x<Isize[0]; x++)
                {
                    index=mindex2(x,y,Isize[0]);
                    Tlocalx[index]=((double)x)+Tx[index];
                    Tlocaly[index]=((double)y)+Ty[index];
                }
            }

            interpolate_forward_2d_double(Iin, Tlocalx, Tlocaly, Isize, ImageSize, Iout);
        }

        free(Tlocalx);
        free(Tlocaly);
    }
    else
    {
        /* Get number of allowed threads */
        mexCallMATLAB(1, matlabCallOut, 0, matlabCallIn, "maxNumCompThreads");
        Nthreadsd=mxGetPr(matlabCallOut[0]);
        Nthreads=(int)Nthreadsd[0];
        if(Nthreads<1) {
            mexErrMsgTxt("Invalid number of computation threads.");
        }

        /* Reserve room for the thread bookkeeping. The pointer tables are
           zeroed so that the clean-up below can free them unconditionally. */
        ThreadList = (ThreadHANDLE*)malloc(Nthreads*sizeof(ThreadHANDLE));
        ThreadID = (double **)calloc(Nthreads, sizeof(double *));
        ThreadArgs = (double ***)calloc(Nthreads, sizeof(double **));
        if(ThreadList==NULL || ThreadID==NULL || ThreadArgs==NULL) {
            failed=1;
        }

        if(!failed)
        {
            /* Build the 9-entry argument table of every thread and start it */
            for (i=0; i<Nthreads; i++)
            {
                double **args;

                ThreadID[i] = (double *)malloc(sizeof(double));
                ThreadArgs[i] = (double **)malloc(9*sizeof(double *));
                if(ThreadID[i]==NULL || ThreadArgs[i]==NULL) { failed=1; break; }

                /*  Make Thread ID  */
                ThreadID[i][0]=i;

                /*  Make Thread Structure  */
                args=ThreadArgs[i];
                args[0]=Iin;
                args[1]=Iout;
                args[2]=Tx;
                args[3]=Ty;
                args[4]=Isize_d;
                args[5]=ThreadID[i];
                args[6]=moded;
                args[7]=Nthreadsd;
                args[8]=ImageSize_d;

                Nstarted=i+1;
                /* The macro is written without a trailing semicolon, as everywhere in this file */
                if(Isize_d[2]>1) {
                    StartThread(ThreadList[i], &transformvolume_color, ThreadArgs[i])
                }
                else
                {
                    StartThread(ThreadList[i], &transformvolume_gray, ThreadArgs[i])
                }
            }

            /* Only wait for the threads that were really started */
            for (i=0; i<Nstarted; i++) { WaitForThreadFinish(ThreadList[i]); }
        }

        /* Release everything; free() accepts the NULL entries left by a failure */
        if(ThreadArgs!=NULL) {
            for (i=0; i<Nthreads; i++) { free(ThreadArgs[i]); }
        }
        if(ThreadID!=NULL) {
            for (i=0; i<Nthreads; i++) { free(ThreadID[i]); }
        }

        free(ThreadArgs);
        free(ThreadID);
        free(ThreadList);
    }

    /* Report the failure last: mexErrMsgTxt does not return to this function */
    if(failed) {
        mexErrMsgTxt("Out of memory.");
    }
}
/*
 * The matlab mex function.
 *
 * Inputs (at least five are required; all data is read as single precision):
 *   prhs[0]     Iin        : input volume.
 *   prhs[1..3]  Xi, Yi, Zi : sample coordinate arrays.
 *   prhs[4]     mode       : handed unchanged to the worker threads.
 * Output (exactly one is required):
 *   plhs[0]  single column vector with as many elements as prhs[1]; its data
 *            pointer is handed to the worker threads running getgrayvalue.
 *
 * One worker thread is started per allowed computation thread
 * (maxNumCompThreads) and the gateway waits for all of them.
 */
void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[] )
{
    /* Iin is the input volume, Vout the sampled values  */
    /* Xi, Yi and Zi are the sample coordinates */
    float *Iin, *Vout, *Xi, *Yi, *Zi;
    float *moded;
    /* Used to ask MATLAB for the allowed number of threads */
    mxArray *matlabCallOut[1]={0};
    mxArray *matlabCallIn[1]={0};
    double *Nthreadsd;
    /* Thread count as float, because the argument tables hold float pointers */
    float Nthreadsf[1];
    int Nthreads;

    /* Per-thread argument tables, thread ids and thread handles */
    float ***ThreadArgs=NULL;
    float **ThreadID=NULL;
    ThreadHANDLE *ThreadList=NULL;
    /* Number of worker threads that were really started */
    int Nstarted=0;
    /* Set as soon as one allocation fails */
    int failed=0;

    /* Size of input image */
    const mwSize *dims;
    float Isize_d[3]={0,0,0};

    /* Size of output, as floats for the threads and as mwSize for MATLAB */
    float VoutSize_d[3]={1,1};
    int VoutCount;
    mwSize VoutDims[2]={1,1};

    /* Loop variable  */
    int i;

    /* Check for proper number of arguments. */
    if(nrhs<5) {
      mexErrMsgTxt("Five inputs are required.");
    } else if(nlhs!=1) {
      mexErrMsgTxt("One output required");
    }

    /* Get the sizes of the image.
       NOTE(review): dims[2] is read without checking that prhs[0] really has
       three dimensions - confirm that callers always pass a 3-D volume. */
    dims = mxGetDimensions(prhs[0]);
    Isize_d[0] = (float)dims[0];
    Isize_d[1] = (float)dims[1];
    Isize_d[2] = (float)dims[2];

    /* Set the sizes of the output: one value per sample coordinate */
    VoutCount=(int)mxGetNumberOfElements(prhs[1]);
    VoutSize_d[0]=(float)VoutCount;
    VoutDims[0]=(mwSize)VoutCount;

    /* Assign pointers to each input. */
    Iin=(float *)mxGetData(prhs[0]);
    Xi=(float *)mxGetData(prhs[1]);
    Yi=(float *)mxGetData(prhs[2]);
    Zi=(float *)mxGetData(prhs[3]);
    moded=(float *)mxGetData(prhs[4]);

    /* Create image matrix for the return arguments   */
    plhs[0] = mxCreateNumericArray(2, VoutDims, mxSINGLE_CLASS, mxREAL);

    /* Assign pointer to output. */
    Vout = (float *)mxGetData(plhs[0]);

    /* Get number of allowed threads */
    mexCallMATLAB(1, matlabCallOut, 0, matlabCallIn, "maxNumCompThreads");
    Nthreadsd=mxGetPr(matlabCallOut[0]);
    Nthreadsf[0]=(float)Nthreadsd[0];
    Nthreads=(int)Nthreadsd[0];
    if(Nthreads<1) {
        mexErrMsgTxt("Invalid number of computation threads.");
    }

    /* Reserve room for the thread bookkeeping. The pointer tables are zeroed
       so that the clean-up below can free them unconditionally. */
    ThreadList = (ThreadHANDLE*)malloc(Nthreads*sizeof(ThreadHANDLE));
    ThreadID = (float **)calloc(Nthreads, sizeof(float *));
    ThreadArgs = (float ***)calloc(Nthreads, sizeof(float **));
    if(ThreadList==NULL || ThreadID==NULL || ThreadArgs==NULL) {
        failed=1;
    }

    if(!failed)
    {
        /* Build the 10-entry argument table of every thread and start it */
        for (i=0; i<Nthreads; i++)
        {
            float **args;

            ThreadID[i] = (float *)malloc(sizeof(float));
            ThreadArgs[i] = (float **)malloc(10*sizeof(float *));
            if(ThreadID[i]==NULL || ThreadArgs[i]==NULL) { failed=1; break; }

            /*  Make Thread ID  */
            ThreadID[i][0]=(float)i;

            /*  Make Thread Structure  */
            args=ThreadArgs[i];
            args[0]=Iin;
            args[1]=Vout;
            args[2]=Xi;
            args[3]=Yi;
            args[4]=Zi;
            args[5]=Isize_d;
            args[6]=VoutSize_d;
            args[7]=ThreadID[i];
            args[8]=moded;
            args[9]=Nthreadsf;

            Nstarted=i+1;
            /* The macro is written without a trailing semicolon, as everywhere in this file */
            StartThread(ThreadList[i], &getgrayvalue, ThreadArgs[i])
        }

        /* Only wait for the threads that were really started */
        for (i=0; i<Nstarted; i++) { WaitForThreadFinish(ThreadList[i]); }
    }

    /* Release everything; free() accepts the NULL entries left by a failure */
    if(ThreadArgs!=NULL) {
        for (i=0; i<Nthreads; i++) { free(ThreadArgs[i]); }
    }
    if(ThreadID!=NULL) {
        for (i=0; i<Nthreads; i++) { free(ThreadID[i]); }
    }

    free(ThreadArgs);
    free(ThreadID);
    free(ThreadList);

    /* Report the failure last: mexErrMsgTxt does not return to this function */
    if(failed) {
        mexErrMsgTxt("Out of memory.");
    }
}
/*
 * The matlab mex function.
 *
 * Inputs (exactly eight are required, all read as double):
 *   prhs[0..2]  Ox, Oy, Oz : grid point arrays; the grid size is taken from
 *                            the dimensions of prhs[0].
 *   prhs[3]     I1         : first image; its dimensions give the image size.
 *   prhs[4]     I2         : second image.
 *   prhs[5..7]  dx, dy, dz : spacing of the uniform b-spline grid (only the
 *                            first element of each is used).
 * Outputs:
 *   plhs[0]  1x1 double: the mean over all threads of the per-thread error.
 *   plhs[1]  (only when nlhs>1) double array of size grid-by-3: the
 *            per-thread gradient values divided by step, averaged over all
 *            threads.
 *
 * The worker threads run transformvolume_error when exactly one output is
 * requested and transformvolume_gradient otherwise.
 */
void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[] )
{
    /* Grid points, the two images and the grid spacing as passed in from MATLAB */
    double *Ox, *Oy, *Oz, *I1, *I2, *dxa, *dya, *dza;
    /* Output data pointers (Egradient stays NULL when no gradient is asked) */
    double *E, *Egradient=NULL;
    /* Result buffer shared by all threads: Nthreads error values, followed
       (in the gradient case) by 3*Onumel gradient values per thread */
    double *ThreadOut=NULL;
    /* Used to ask MATLAB for the allowed number of threads */
    mxArray *matlabCallOut[1]={0};
    mxArray *matlabCallIn[1]={0};
    double *Nthreadsd;
    int Nthreads;
    /* Finite difference step size */
    double step=0.01;
    /* index offsets */
    int offset1, offset2, offset3;
    /* Dims outputs; mxCreateNumericArray reads them as mwSize values */
    const mwSize dims_error[2]={1,1};
    mwSize dims_error_gradient[4]={1,1,1,3};
    /* Per-thread argument tables, thread ids and thread handles */
    double ***ThreadArgs=NULL;
    double **ThreadID=NULL;
    ThreadHANDLE *ThreadList=NULL;
    /* Number of worker threads that were really started */
    int Nstarted=0;
    /* Set as soon as one allocation fails */
    int failed=0;

    /* Size of input image */
    mwSize  Isizex, Isizey, Isizez;
    double Isize_d[3]={0,0,0};
    const mwSize *dims;

    /* Size of grid */
    mwSize  Osizex, Osizey, Osizez;
    int Onumel;
    double Osize_d[3]={0,0,0};

    /* B-spline look-up tables */
    double *Bu=NULL, *Bv=NULL, *Bw=NULL;
    double u,v,w;

    /* Loop variables  */
    int i,j;
    /* Grid distance */
    int dx,dy,dz;
    /* X,Y,Z offset inside one grid cell */
    int x,y,z;

    /* Check for proper number of arguments. */
    if(nrhs!=8) {
        mexErrMsgTxt("Eight inputs are required.");
    }

    /* Get the sizes of the grid.
       NOTE(review): dims[2] is read, here and for prhs[3] below, without
       checking that the array really has three dimensions - confirm that
       callers always pass 3-D arrays. */
    dims = mxGetDimensions(prhs[0]);
    Osizex = dims[0];
    Osizey = dims[1];
    Osizez = dims[2];
    Onumel = (int)(Osizex*Osizey*Osizez);

    /* Get the sizes of the input image */
    dims = mxGetDimensions(prhs[3]);
    Isizex = dims[0];
    Isizey = dims[1];
    Isizez = dims[2];

    /* Assign pointers to each input. */
    Ox=(double *)mxGetData(prhs[0]);
    Oy=(double *)mxGetData(prhs[1]);
    Oz=(double *)mxGetData(prhs[2]);
    I1=(double *)mxGetData(prhs[3]);
    I2=(double *)mxGetData(prhs[4]);
    dxa=(double *)mxGetData(prhs[5]);
    dya=(double *)mxGetData(prhs[6]);
    dza=(double *)mxGetData(prhs[7]);

    /* Get the spacing of the uniform b-spline grid. A spacing below one would
       give empty look-up tables, so reject it before anything is allocated. */
    dx=(int)dxa[0]; dy=(int)dya[0]; dz=(int)dza[0];
    if(dx<1 || dy<1 || dz<1) {
        mexErrMsgTxt("Grid spacing must be at least 1.");
    }

    /* Create image matrix for the Error return argument  */
    plhs[0] = mxCreateNumericArray(2, dims_error, mxDOUBLE_CLASS, mxREAL);
    E = mxGetPr(plhs[0]);
    if(nlhs>1)
    {
        /* Error Gradient needed */
        dims_error_gradient[0]=Osizex; dims_error_gradient[1]=Osizey; dims_error_gradient[2]=Osizez;
        plhs[1] = mxCreateNumericArray(4, dims_error_gradient, mxDOUBLE_CLASS, mxREAL);
        Egradient = mxGetPr(plhs[1]);
    }

    /* Get number of allowed threads */
    mexCallMATLAB(1, matlabCallOut, 0, matlabCallIn, "maxNumCompThreads");
    Nthreadsd=mxGetPr(matlabCallOut[0]);
    Nthreads=(int)Nthreadsd[0];
    if(Nthreads<1) {
        mexErrMsgTxt("Invalid number of computation threads.");
    }

    /* Reserve room for the thread bookkeeping. The pointer tables are zeroed
       so that the clean-up below can free them unconditionally. */
    ThreadList = (ThreadHANDLE*)malloc(Nthreads*sizeof(ThreadHANDLE));
    ThreadID = (double **)calloc(Nthreads, sizeof(double *));
    ThreadArgs = (double ***)calloc(Nthreads, sizeof(double **));
    if(nlhs==1) { ThreadOut = (double *)malloc(Nthreads*sizeof(double)); }
    else { ThreadOut = (double *)malloc((size_t)Nthreads*(1+(size_t)Onumel*3)*sizeof(double)); }

    /* Reserve room for the polynomial look-up tables */
    Bu=(double *)malloc(dx*4*sizeof(double));
    Bv=(double *)malloc(dy*4*sizeof(double));
    Bw=(double *)malloc(dz*4*sizeof(double));

    if(ThreadList==NULL || ThreadID==NULL || ThreadArgs==NULL || ThreadOut==NULL ||
       Bu==NULL || Bv==NULL || Bw==NULL) {
        failed=1;
    }

    if(!failed)
    {
        /*  Make polynomial look up tables   */
        for (x=0; x<dx; x++)
        {
            u=((double)x/(double)dx)-floor((double)x/(double)dx);
            Bu[mindex2(0,x,4)] = (double)pow((1-u),3)/6;
            Bu[mindex2(1,x,4)] = (double)( 3*pow(u,3) - 6*pow(u,2) + 4)/6;
            Bu[mindex2(2,x,4)] = (double)(-3*pow(u,3) + 3*pow(u,2) + 3*u + 1)/6;
            Bu[mindex2(3,x,4)] = (double)pow(u,3)/6;
        }

        for (y=0; y<dy; y++)
        {
            v=((double)y/(double)dy)-floor((double)y/(double)dy);
            Bv[mindex2(0,y,4)] = (double)pow((1-v),3)/6;
            Bv[mindex2(1,y,4)] = (double)( 3*pow(v,3) - 6*pow(v,2) + 4)/6;
            Bv[mindex2(2,y,4)] = (double)(-3*pow(v,3) + 3*pow(v,2) + 3*v + 1)/6;
            Bv[mindex2(3,y,4)] = (double)pow(v,3)/6;
        }

        for (z=0; z<dz; z++)
        {
            w=((double)z/(double)dz)-floor((double)z/(double)dz);
            Bw[mindex2(0,z,4)] = (double)pow((1-w),3)/6;
            Bw[mindex2(1,z,4)] = (double)( 3*pow(w,3) - 6*pow(w,2) + 4)/6;
            Bw[mindex2(2,z,4)] = (double)(-3*pow(w,3) + 3*pow(w,2) + 3*w + 1)/6;
            Bw[mindex2(3,z,4)] = (double)pow(w,3)/6;
        }

        Isize_d[0]=(double)Isizex;  Isize_d[1]=(double)Isizey; Isize_d[2]=(double)Isizez;
        Osize_d[0]=(double)Osizex;  Osize_d[1]=(double)Osizey; Osize_d[2]=(double)Osizez;

        /* Build the 16-entry argument table of every thread and start it */
        for (i=0; i<Nthreads; i++)
        {
            double **args;

            ThreadID[i] = (double *)malloc(sizeof(double));
            ThreadArgs[i] = (double **)malloc(16*sizeof(double *));
            if(ThreadID[i]==NULL || ThreadArgs[i]==NULL) { failed=1; break; }

            /*  Make Thread ID  */
            ThreadID[i][0]=(double)i;

            /*  Make Thread Structure  */
            args=ThreadArgs[i];
            args[0]=Bu;
            args[1]=Bv;
            args[2]=Bw;
            args[3]=Isize_d;
            args[4]=Osize_d;
            args[5]=ThreadOut;
            args[6]=dxa;
            args[7]=dya;
            args[8]=dza;
            args[9]=ThreadID[i];
            args[10]=Ox;
            args[11]=Oy;
            args[12]=Oz;
            args[13]=I1;
            args[14]=I2;
            args[15]=Nthreadsd;

            Nstarted=i+1;
            /* The macro is written without a trailing semicolon, as everywhere in this file */
            if(nlhs==1){
                StartThread(ThreadList[i], &transformvolume_error, ThreadArgs[i])
            }
            else{
                StartThread(ThreadList[i], &transformvolume_gradient, ThreadArgs[i])
            }
        }

        /* Only wait for the threads that were really started */
        for (i=0; i<Nstarted; i++) { WaitForThreadFinish(ThreadList[i]); }
    }

    if(!failed)
    {
        /* Add accumlated error of all threads */
        E[0]=0; for (i=0; i<Nthreads; i++) { E[0]+=ThreadOut[i]; } E[0]/=Nthreads;

        if(nlhs>1)
        {
            /* plhs[1] starts zeroed, so the per-thread values can be summed into
               it. The gradient data follows the Nthreads error values. */
            for (i=0; i<Nthreads; i++)
            {
                offset1=i*(3*Onumel);
                offset2=offset1+Onumel;
                offset3=offset2+Onumel;
                for(j=0; j<Onumel; j++)
                {
                    Egradient[j]+=ThreadOut[Nthreads+j+offset1]/step;
                    Egradient[j+Onumel]+=ThreadOut[Nthreads+j+offset2]/step;
                    Egradient[j+2*Onumel]+=ThreadOut[Nthreads+j+offset3]/step;
                }
            }
            for(j=0; j<3*Onumel; j++)
            {
                Egradient[j]/=Nthreads;
            }
        }
    }

    /* Release everything; free() accepts the NULL entries left by a failure */
    if(ThreadArgs!=NULL) {
        for (i=0; i<Nthreads; i++) { free(ThreadArgs[i]); }
    }
    if(ThreadID!=NULL) {
        for (i=0; i<Nthreads; i++) { free(ThreadID[i]); }
    }

    free(ThreadArgs);
    free(ThreadID);
    free(ThreadList);
    /* The shared result buffer was never released before */
    free(ThreadOut);

    free(Bu);
    free(Bv);
    free(Bw);

    /* Report the failure last: mexErrMsgTxt does not return to this function */
    if(failed) {
        mexErrMsgTxt("Out of memory.");
    }
}