/*
 * The matlab mex function (3-D grid variant).
 *
 * Inputs (exactly seven, each read through mxGetData as double data):
 *   prhs[0..2]  Ox, Oy, Oz : grid point arrays; the grid size is taken from
 *                            the first three dimensions of prhs[0]
 *   prhs[3]     Isize      : first three values give the output dimensions
 *   prhs[4..6]  dx, dy, dz : grid spacing; only element 0 of each is used
 *
 * Output:
 *   plhs[0]     double array of size Isize; its data pointer is handed to
 *               every worker thread
 *
 * The function builds B-spline coefficient / derivative lookup tables (one
 * row of four values per integer offset inside a grid cell), starts one
 * worker per thread reported by maxNumCompThreads, each running
 * transformvolume_jacobiandet, waits for all of them and releases every
 * temporary buffer.  Invalid spacing, an invalid thread count or a failed
 * allocation is reported with mexErrMsgTxt after the temporaries are freed.
 */
void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] )
{
    /* Grid point arrays, spacing inputs and output data */
    double *Ox, *Oy, *Oz, *dxa, *dya, *dza, *Iout;

    /* Output / (empty) input list for the maxNumCompThreads call */
    mxArray *matlabCallOut[1]={0};
    mxArray *matlabCallIn[1]={0};
    double *Nthreadsd;
    int Nthreads;

    /* Number of threads whose argument blocks were fully allocated */
    int Nprepared=0;

    /* Per-thread argument tables (arrays of double pointers) */
    double ***ThreadArgs=NULL;
    double **ThreadArgs1;

    /* Handles to the worker threads */
    ThreadHANDLE *ThreadList=NULL;

    /* ID of Threads */
    double **ThreadID=NULL;
    double *ThreadID1;

    /* Size of output image */
    double *Isize_d;
    mwSize dims[3];

    /* Size of grid */
    mwSize Osizex, Osizey, Osizez;
    double Osize_d[3]={0,0,0};
    const mwSize *dimso;

    /* B-spline variables */
    double u, v, w;
    double *Bu=NULL, *Bv=NULL, *Bw=NULL;
    double *Bdu=NULL, *Bdv=NULL, *Bdw=NULL;

    /* Loop variables */
    int i, k;
    /* Grid distance */
    int dx, dy, dz;
    /* Offsets inside one grid cell */
    int x, y, z;

    /* Pending error message; reported only after cleanup */
    const char *errmsg=NULL;

    /* Check for proper number of arguments. */
    if(nrhs!=7) {
        mexErrMsgTxt("Seven inputs are required.");
    }

    /* The number of requested outputs is not needed here */
    (void)nlhs;

    /* Get the sizes of the grid */
    /* NOTE(review): dimso[2] is read without confirming that prhs[0] has
       three dimensions - confirm callers always pass a 3-D array. */
    dimso = mxGetDimensions(prhs[0]);
    Osizex = dimso[0];
    Osizey = dimso[1];
    Osizez = dimso[2];

    /* Assign pointers to each input. */
    Ox = (double *)mxGetData(prhs[0]);
    Oy = (double *)mxGetData(prhs[1]);
    Oz = (double *)mxGetData(prhs[2]);
    Isize_d = (double *)mxGetData(prhs[3]);
    dxa = (double *)mxGetData(prhs[4]);
    dya = (double *)mxGetData(prhs[5]);
    dza = (double *)mxGetData(prhs[6]);

    /* Create the return array with the requested image size */
    /* NOTE(review): assumes prhs[3] holds at least three doubles - confirm. */
    dims[0] = (mwSize)Isize_d[0];
    dims[1] = (mwSize)Isize_d[1];
    dims[2] = (mwSize)Isize_d[2];
    plhs[0] = mxCreateNumericArray(3, dims, mxDOUBLE_CLASS, mxREAL);

    /* Assign pointer to output. */
    Iout = (double *)mxGetData(plhs[0]);

    /* Get the spacing of the uniform b-spline grid */
    dx = (int)dxa[0];
    dy = (int)dya[0];
    dz = (int)dza[0];

    /* The tables have one row per integer offset, so a spacing below one
       would yield empty tables that the workers would still index. */
    if(dx<1 || dy<1 || dz<1) {
        mexErrMsgTxt("Grid spacing must be at least 1 in every dimension.");
    }

    /* Get number of allowed threads */
    mexCallMATLAB(1, matlabCallOut, 0, matlabCallIn, "maxNumCompThreads");
    Nthreadsd = mxGetPr(matlabCallOut[0]);
    if(Nthreadsd==NULL || Nthreadsd[0]<1) {
        mexErrMsgTxt("maxNumCompThreads returned an invalid thread count.");
    }
    Nthreads = (int)Nthreadsd[0];

    /* Reserve room for thread handles, thread IDs and argument tables */
    ThreadList = malloc((size_t)Nthreads * sizeof *ThreadList);
    ThreadID = malloc((size_t)Nthreads * sizeof *ThreadID);
    ThreadArgs = malloc((size_t)Nthreads * sizeof *ThreadArgs);

    /* Reserve room for the polynomial look up tables (4 values per offset) */
    Bu = malloc((size_t)dx * 4 * sizeof *Bu);
    Bv = malloc((size_t)dy * 4 * sizeof *Bv);
    Bw = malloc((size_t)dz * 4 * sizeof *Bw);
    Bdu = malloc((size_t)dx * 4 * sizeof *Bdu);
    Bdv = malloc((size_t)dy * 4 * sizeof *Bdv);
    Bdw = malloc((size_t)dz * 4 * sizeof *Bdw);

    if(ThreadList==NULL || ThreadID==NULL || ThreadArgs==NULL ||
       Bu==NULL || Bv==NULL || Bw==NULL ||
       Bdu==NULL || Bdv==NULL || Bdw==NULL) {
        errmsg = "Out of memory while allocating work buffers.";
        goto cleanup;
    }

    /* Make polynomial look up tables */
    for (x=0; x<dx; x++)
    {
        u = ((double)x/(double)dx) - floor((double)x/(double)dx);
        for (k=0; k<4; k++)
        {
            Bu[mindex2(k,x,4)] = BsplineCoefficient(u,k);
            Bdu[mindex2(k,x,4)] = BsplineCoefficientDerivative(u,k)/dxa[0];
        }
    }

    for (y=0; y<dy; y++)
    {
        v = ((double)y/(double)dy) - floor((double)y/(double)dy);
        for (k=0; k<4; k++)
        {
            Bv[mindex2(k,y,4)] = BsplineCoefficient(v,k);
            Bdv[mindex2(k,y,4)] = BsplineCoefficientDerivative(v,k)/dya[0];
        }
    }

    for (z=0; z<dz; z++)
    {
        w = ((double)z/(double)dz) - floor((double)z/(double)dz);
        for (k=0; k<4; k++)
        {
            Bw[mindex2(k,z,4)] = BsplineCoefficient(w,k);
            Bdw[mindex2(k,z,4)] = BsplineCoefficientDerivative(w,k)/dza[0];
        }
    }

    Osize_d[0] = (double)Osizex;
    Osize_d[1] = (double)Osizey;
    Osize_d[2] = (double)Osizez;

    /* Build every thread's argument block (17 pointers each) before any
       worker is started, so an allocation failure never has to be handled
       while threads are already running. */
    for (i=0; i<Nthreads; i++)
    {
        ThreadID1 = malloc(sizeof *ThreadID1);
        ThreadArgs1 = malloc(17 * sizeof *ThreadArgs1);
        if(ThreadID1==NULL || ThreadArgs1==NULL) {
            free(ThreadID1);
            free(ThreadArgs1);
            errmsg = "Out of memory while allocating thread arguments.";
            goto cleanup;
        }

        /* Make Thread ID */
        ThreadID1[0] = (double)i;
        ThreadID[i] = ThreadID1;

        /* Make Thread Structure */
        ThreadArgs1[0] = Bu;
        ThreadArgs1[1] = Bv;
        ThreadArgs1[2] = Bw;
        ThreadArgs1[3] = Isize_d;
        ThreadArgs1[4] = Osize_d;
        ThreadArgs1[5] = Iout;
        ThreadArgs1[6] = dxa;
        ThreadArgs1[7] = dya;
        ThreadArgs1[8] = dza;
        ThreadArgs1[9] = ThreadID[i];
        ThreadArgs1[10] = Ox;
        ThreadArgs1[11] = Oy;
        ThreadArgs1[12] = Oz;
        ThreadArgs1[13] = Nthreadsd;
        ThreadArgs1[14] = Bdu;
        ThreadArgs1[15] = Bdv;
        ThreadArgs1[16] = Bdw;
        ThreadArgs[i] = ThreadArgs1;

        Nprepared = i+1;
    }

    /* Start all workers, then wait for every one of them */
    for (i=0; i<Nthreads; i++)
    {
        StartThread(ThreadList[i], &transformvolume_jacobiandet, ThreadArgs[i])
    }

    for (i=0; i<Nthreads; i++) { WaitForThreadFinish(ThreadList[i]); }

cleanup:
    /* Nprepared stays 0 while ThreadArgs / ThreadID may still be NULL */
    for (i=0; i<Nprepared; i++)
    {
        free(ThreadArgs[i]);
        free(ThreadID[i]);
    }

    free(ThreadArgs);
    free(ThreadID);
    free(ThreadList);

    free(Bu);
    free(Bv);
    free(Bw);
    free(Bdu);
    free(Bdv);
    free(Bdw);

    /* mexErrMsgTxt does not return, so report only after everything
       obtained from malloc has been released. */
    if(errmsg!=NULL) {
        mexErrMsgTxt(errmsg);
    }
}
/*
 * The matlab mex function (2-D grid variant).
 *
 * Inputs (exactly five, each read through mxGetPr):
 *   prhs[0..1]  Ox, Oy : grid point arrays; the grid size is the M-by-N
 *                        size of prhs[0]
 *   prhs[2]     Isize  : first two values give the image size
 *   prhs[3..4]  dx, dy : grid spacing; only element 0 of each is used
 *
 * Outputs:
 *   plhs[0]  1x1 double: sum of the per-thread error values divided by
 *            Isize[0]*Isize[1]
 *   plhs[1]  (only when nlhs>1) Osizex x Osizey x 2 double: per-thread
 *            gradient buffers summed and divided by
 *            Isize[0]*Isize[1]*step, with step = 0.001
 *
 * One worker per thread reported by maxNumCompThreads is started, running
 * jacobian_errorgradient when a gradient is requested and jacobian_error
 * otherwise.  Invalid spacing, an invalid thread count or a failed
 * allocation is reported with mexErrMsgTxt after the temporaries are freed.
 */
void mexFunction( int nlhs, mxArray *plhs[],
        int nrhs, const mxArray *prhs[] ) {
    /* Grid point arrays, spacing inputs and output data */
    double *Ox, *Oy, *dxa, *dya, *E;
    double *Egradient=NULL;

    /* Per-thread result buffers */
    double *ThreadErrorOut=NULL;
    double *ThreadGradientOutX=NULL;
    double *ThreadGradientOutY=NULL;

    /* Output / (empty) input list for the maxNumCompThreads call */
    mxArray *matlabCallOut[1]={0};
    mxArray *matlabCallIn[1]={0};
    double *Nthreadsd;
    int Nthreads;

    /* Number of threads whose argument blocks were fully allocated */
    int Nprepared=0;

    /* Finite difference step size */
    double step=0.001;
    /* index offsets */
    int offset1;

    /* Per-thread argument tables (arrays of double pointers) */
    double ***ThreadArgs=NULL;
    double **ThreadArgs1;
    /* Handles to the worker threads */
    ThreadHANDLE *ThreadList=NULL;
    /* ID of Threads */
    double **ThreadID=NULL;
    double *ThreadID1;

    /* Dims outputs: mxCreateNumericArray takes mwSize dimensions, which
       need not have the same width as int */
    mwSize dims_error[2]={1, 1};
    mwSize dims_error_gradient[3]={1, 1, 2};

    /* Size of input image */
    double *Isize_d;
    /* Size of grid */
    mwSize Osizex, Osizey;
    int Onumel;
    double Inumel;
    double Osize_d[2]={0, 0};

    /* B-spline variables */
    double u, v;
    double *Bu=NULL, *Bv=NULL, *Bdu=NULL, *Bdv=NULL;

    /* Loop variables */
    int i, j, k;
    /* Offsets inside one grid cell */
    int x, y;
    /* Grid distance */
    int dx, dy;

    /* Pending error message; reported only after cleanup */
    const char *errmsg=NULL;

    /* Check for proper number of arguments. */
    if(nrhs!=5) {
        mexErrMsgTxt("Five inputs are required.");
    }

    /* Get the sizes of the grid */
    Osizex = (mwSize)mxGetM(prhs[0]);
    Osizey = (mwSize)mxGetN(prhs[0]);

    /* Assign pointers to each input. */
    Ox = mxGetPr(prhs[0]);
    Oy = mxGetPr(prhs[1]);
    Isize_d = mxGetPr(prhs[2]);
    dxa = mxGetPr(prhs[3]);
    dya = mxGetPr(prhs[4]);

    /* NOTE(review): assumes prhs[2] holds at least two doubles - confirm. */
    Onumel = (int)(Osizex*Osizey);
    Inumel = Isize_d[0]*Isize_d[1];

    /* Create the 1x1 array for the error return argument */
    plhs[0] = mxCreateNumericArray(2, dims_error, mxDOUBLE_CLASS, mxREAL);
    E = mxGetPr(plhs[0]);

    /* Error gradient needed: Osizex x Osizey x 2 */
    if(nlhs>1) {
        dims_error_gradient[0] = Osizex;
        dims_error_gradient[1] = Osizey;
        plhs[1] = mxCreateNumericArray(3, dims_error_gradient, mxDOUBLE_CLASS, mxREAL);
        Egradient = mxGetPr(plhs[1]);
    }

    /* Get the spacing of the uniform b-spline grid */
    dx = (int)dxa[0];
    dy = (int)dya[0];

    /* The tables have one row per integer offset, so a spacing below one
       would yield empty tables that the workers would still index. */
    if(dx<1 || dy<1) {
        mexErrMsgTxt("Grid spacing must be at least 1 in every dimension.");
    }

    /* Get number of allowed threads */
    mexCallMATLAB(1, matlabCallOut, 0, matlabCallIn, "maxNumCompThreads");
    Nthreadsd = mxGetPr(matlabCallOut[0]);
    if(Nthreadsd==NULL || Nthreadsd[0]<1) {
        mexErrMsgTxt("maxNumCompThreads returned an invalid thread count.");
    }
    Nthreads = (int)Nthreadsd[0];

    /* Reserve room for thread handles, thread IDs and argument tables */
    ThreadList = malloc((size_t)Nthreads * sizeof *ThreadList);
    ThreadID = malloc((size_t)Nthreads * sizeof *ThreadID);
    ThreadArgs = malloc((size_t)Nthreads * sizeof *ThreadArgs);

    /* One error value per thread */
    ThreadErrorOut = malloc((size_t)Nthreads * sizeof *ThreadErrorOut);

    /* Gradient buffers (Onumel values per thread) only when the gradient
       worker will actually run; otherwise the workers receive NULL. */
    if(nlhs>1) {
        ThreadGradientOutX = malloc((size_t)Nthreads * (size_t)Onumel * sizeof *ThreadGradientOutX);
        ThreadGradientOutY = malloc((size_t)Nthreads * (size_t)Onumel * sizeof *ThreadGradientOutY);
    }

    /* Reserve room for the polynomial look up tables (4 values per offset) */
    Bu = malloc((size_t)dx * 4 * sizeof *Bu);
    Bv = malloc((size_t)dy * 4 * sizeof *Bv);
    Bdu = malloc((size_t)dx * 4 * sizeof *Bdu);
    Bdv = malloc((size_t)dy * 4 * sizeof *Bdv);

    if(ThreadList==NULL || ThreadID==NULL || ThreadArgs==NULL ||
       ThreadErrorOut==NULL ||
       Bu==NULL || Bv==NULL || Bdu==NULL || Bdv==NULL ||
       (nlhs>1 && (ThreadGradientOutX==NULL || ThreadGradientOutY==NULL))) {
        errmsg = "Out of memory while allocating work buffers.";
        goto cleanup;
    }

    /* Make polynomial look up tables */
    for (x=0; x<dx; x++) {
        u = (x/(double)dx) - floor(x/(double)dx);
        for (k=0; k<4; k++) {
            Bu[mindex2(k, x, 4)] = BsplineCoefficient(u, k);
            Bdu[mindex2(k, x, 4)] = BsplineCoefficientDerivative(u, k)/dxa[0];
        }
    }

    for (y=0; y<dy; y++) {
        v = (y/(double)dy) - floor(y/(double)dy);
        for (k=0; k<4; k++) {
            Bv[mindex2(k, y, 4)] = BsplineCoefficient(v, k);
            Bdv[mindex2(k, y, 4)] = BsplineCoefficientDerivative(v, k)/dya[0];
        }
    }

    Osize_d[0] = (double)Osizex;
    Osize_d[1] = (double)Osizey;

    /* Build every thread's argument block (15 pointers each) before any
       worker is started, so an allocation failure never has to be handled
       while threads are already running. */
    for (i=0; i<Nthreads; i++) {
        ThreadID1 = malloc(sizeof *ThreadID1);
        ThreadArgs1 = malloc(15 * sizeof *ThreadArgs1);
        if(ThreadID1==NULL || ThreadArgs1==NULL) {
            free(ThreadID1);
            free(ThreadArgs1);
            errmsg = "Out of memory while allocating thread arguments.";
            goto cleanup;
        }

        /* Make Thread ID */
        ThreadID1[0] = i;
        ThreadID[i] = ThreadID1;

        /* Make Thread Structure */
        ThreadArgs1[0] = Bu;
        ThreadArgs1[1] = Bv;
        ThreadArgs1[2] = Isize_d;
        ThreadArgs1[3] = Osize_d;
        ThreadArgs1[4] = ThreadErrorOut;
        ThreadArgs1[5] = dxa;
        ThreadArgs1[6] = dya;
        ThreadArgs1[7] = ThreadID[i];
        ThreadArgs1[8] = Ox;
        ThreadArgs1[9] = Oy;
        ThreadArgs1[10] = Nthreadsd;
        ThreadArgs1[11] = Bdu;
        ThreadArgs1[12] = Bdv;
        ThreadArgs1[13] = ThreadGradientOutX;
        ThreadArgs1[14] = ThreadGradientOutY;
        ThreadArgs[i] = ThreadArgs1;

        Nprepared = i+1;
    }

    /* Start all workers; the worker routine depends on whether the
       gradient output was requested */
    for (i=0; i<Nthreads; i++) {
        if(nlhs>1)
        {
            StartThread(ThreadList[i], &jacobian_errorgradient, ThreadArgs[i])
        }
        else
        {
            StartThread(ThreadList[i], &jacobian_error, ThreadArgs[i])
        }
    }

    for (i=0; i<Nthreads; i++) { WaitForThreadFinish(ThreadList[i]); }

    /* Add accumulated error of all threads */
    E[0] = 0;
    for (i=0; i<Nthreads; i++) {
        E[0] += ThreadErrorOut[i];
    }
    E[0] /= Inumel;

    if(nlhs>1) {
        /* Sum the per-thread gradients: X part in the first Onumel output
           elements, Y part in the following Onumel elements */
        for (i=0; i<Nthreads; i++) {
            offset1 = i*Onumel;
            for (j=0; j<Onumel; j++) {
                Egradient[j] += ThreadGradientOutX[j+offset1];
                Egradient[j+Onumel] += ThreadGradientOutY[j+offset1];
            }
        }
        for (j=0; j<Onumel; j++) {
            Egradient[j] /= Inumel*step;
            Egradient[j+Onumel] /= Inumel*step;
        }
    }

cleanup:
    /* Nprepared stays 0 while ThreadArgs / ThreadID may still be NULL */
    for (i=0; i<Nprepared; i++) {
        free(ThreadArgs[i]);
        free(ThreadID[i]);
    }

    free(ThreadErrorOut);
    free(ThreadGradientOutX);
    free(ThreadGradientOutY);

    free(ThreadArgs);
    free(ThreadID);
    free(ThreadList);
    free(Bu);
    free(Bdu);
    free(Bv);
    free(Bdv);

    /* mexErrMsgTxt does not return, so report only after everything
       obtained from malloc has been released. */
    if(errmsg!=NULL) {
        mexErrMsgTxt(errmsg);
    }
}