//MEXFUNCTION The MATLAB gateway routine. It evaluates a triangular table of
//            values via normHelmHoltzCPP (elsewhere in this file those are
//            described as fully normalized Helmholtz polynomials) up to
//            order M, and optionally the first and second derivative
//            tables, returning each as a MATLAB ClusterSet object.
//
//INPUTS (prhs): prhs[0] u, a real scalar passed to normHelmHoltzCPP.
//               prhs[1] M, the maximum order; M>=3 is required.
//               prhs[2] scalFactor, a real scalar scale factor passed to
//                       normHelmHoltzCPP.
//
//OUTPUTS (plhs): plhs[0] A ClusterSet holding the values. Cluster i
//                        (i=0,...,M) has i+1 elements, so there are
//                        (M+1)*(M+2)/2 elements in total.
//                plhs[1] (Only if nlhs>1) A ClusterSet of the same shape
//                        filled by normHelmHoltzDerivCPP.
//                plhs[2] (Only if nlhs>2) A ClusterSet of the same shape
//                        filled by normHelmHoltzDeriv2CPP.
//
//An error is raised if the number of inputs is not 3, if more than 3
//outputs are requested, or if M<3.
void mexFunction(const int nlhs, mxArray *plhs[], const int nrhs, const mxArray *prhs[]) {
    double u, scalFactor;
    ClusterSetCPP<double> HBar;
    ClusterSetCPP<double> dHBardu;//The first derivatives
    ClusterSetCPP<double> d2HBardu2;//The second derivatives
    size_t M, numH, i;
    mxArray *CSRetVal;
    mxArray *clusterElsMATLAB,*clusterSizesMATLAB, *offsetArrayMATLAB;
    
    if(nrhs!=3){
        mexErrMsgTxt("Incorrect number of inputs.");
        return;
    }
    
    //At most three outputs are ever produced. Rejecting larger requests
    //here avoids doing all of the work and then leaving outputs unassigned.
    if(nlhs>3){
        mexErrMsgTxt("Invalid number of outputs.");
        return;
    }

    u=getDoubleFromMatlab(prhs[0]);
    M=getSizeTFromMatlab(prhs[1]);
    scalFactor=getDoubleFromMatlab(prhs[2]);
    
    if(M<3) {
       mexErrMsgTxt("The maximum order should be at least 3.");
       return; 
    }
    
    //The total number of elements in a triangular table with rows of
    //length 1,2,...,M+1.
    numH=(M+1)*(M+2)/2;
    
    //Allocate space for the results.
    clusterElsMATLAB=mxCreateDoubleMatrix(numH,1,mxREAL);
    clusterSizesMATLAB=allocUnsignedSizeMatInMatlab(M+1,1);
    offsetArrayMATLAB=allocUnsignedSizeMatInMatlab(M+1,1);
    
    //HBar does not own any memory; it just points into the MATLAB arrays
    //allocated above.
    HBar.numClust=M+1;
    HBar.totalNumEl=numH;
    HBar.clusterEls=reinterpret_cast<double*>(mxGetData(clusterElsMATLAB));
    HBar.offsetArray=reinterpret_cast<size_t*>(mxGetData(offsetArrayMATLAB));
    HBar.clusterSizes=reinterpret_cast<size_t*>(mxGetData(clusterSizesMATLAB));
    
    //Initialize the offset array and cluster sizes. Cluster i has i+1
    //elements and starts right after the end of cluster i-1.
    HBar.offsetArray[0]=0;
    HBar.clusterSizes[0]=1;
    for(i=1;i<=M;i++){
        HBar.clusterSizes[i]=i+1;
        HBar.offsetArray[i]=HBar.offsetArray[i-1]+HBar.clusterSizes[i-1];
    }
    
    normHelmHoltzCPP(HBar,u,scalFactor);
    
    //Set the first return value
    mexCallMATLAB(1,&CSRetVal,0, 0, "ClusterSet");
    mxSetProperty(CSRetVal,0,"clusterEls",clusterElsMATLAB);
    mxSetProperty(CSRetVal,0,"clusterSizes",clusterSizesMATLAB);
    mxSetProperty(CSRetVal,0,"offsetArray",offsetArrayMATLAB);
    
    plhs[0]=CSRetVal;
    
    if(nlhs>1) {//Compute the first derivatives, if they are desired.
        mxArray *clusterEls1stDerivMATLAB=mxCreateDoubleMatrix(numH,1,mxREAL);
        
        //The derivative table has the same shape as HBar, so the offset
        //and size arrays are shared; only the element storage is new.
        dHBardu.numClust=M+1;
        dHBardu.totalNumEl=numH;
        dHBardu.clusterEls=reinterpret_cast<double*>(mxGetData(clusterEls1stDerivMATLAB));
        dHBardu.offsetArray=reinterpret_cast<size_t*>(mxGetData(offsetArrayMATLAB));
        dHBardu.clusterSizes=reinterpret_cast<size_t*>(mxGetData(clusterSizesMATLAB));
        
        normHelmHoltzDerivCPP(dHBardu,HBar);
        //Set the second return value
        mexCallMATLAB(1,&CSRetVal,0, 0, "ClusterSet");
        mxSetProperty(CSRetVal,0,"clusterEls",clusterEls1stDerivMATLAB);
        mxSetProperty(CSRetVal,0,"clusterSizes",clusterSizesMATLAB);
        mxSetProperty(CSRetVal,0,"offsetArray",offsetArrayMATLAB);

        plhs[1]=CSRetVal;
        //The data was copied into the object by mxSetProperty, so the
        //temporary element buffer can be freed.
        mxDestroyArray(clusterEls1stDerivMATLAB);
    }
    
    if(nlhs>2) {//Compute the second derivatives if they are desired.
        mxArray *clusterEls2ndDerivMATLAB=mxCreateDoubleMatrix(numH,1,mxREAL);
        
        //As with the first derivatives, the shape information is shared
        //with HBar.
        d2HBardu2.numClust=M+1;
        d2HBardu2.totalNumEl=numH;
        d2HBardu2.clusterEls=reinterpret_cast<double*>(mxGetData(clusterEls2ndDerivMATLAB));
        d2HBardu2.offsetArray=reinterpret_cast<size_t*>(mxGetData(offsetArrayMATLAB));
        d2HBardu2.clusterSizes=reinterpret_cast<size_t*>(mxGetData(clusterSizesMATLAB));
        
        normHelmHoltzDeriv2CPP(d2HBardu2,HBar);
        
        //Set the third return value
        mexCallMATLAB(1,&CSRetVal,0, 0, "ClusterSet");
        mxSetProperty(CSRetVal,0,"clusterEls",clusterEls2ndDerivMATLAB);
        mxSetProperty(CSRetVal,0,"clusterSizes",clusterSizesMATLAB);
        mxSetProperty(CSRetVal,0,"offsetArray",offsetArrayMATLAB);

        plhs[2]=CSRetVal;
        mxDestroyArray(clusterEls2ndDerivMATLAB);
    }
    
    //Free the buffers. The mxSetProperty command copied the data.
    mxDestroyArray(clusterElsMATLAB);
    mxDestroyArray(clusterSizesMATLAB);
    mxDestroyArray(offsetArrayMATLAB);
}
//SPHERHARMONICEVALCPP Evaluate a spherical harmonic series and, if
//              desired, its gradient at a set of points given in spherical
//              coordinates. Away from the poles (or whenever no gradient is
//              requested) the modified forward row algorithm of Holmes and
//              Featherstone is used; within 2 degrees of a pole, when the
//              gradient is requested, the non-singular algorithm of Pines
//              (with the formulation of Fantino and Casotto) is used.
//
//INPUTS: V      A buffer with space for numPoints doubles. V[k] is set to
//               the value of the series at the kth point.
//        gradV  Either NULL, in which case no gradient is computed, or a
//               buffer with space for 3*numPoints doubles. The three
//               components of the gradient at the kth point are written to
//               gradV[3*k], gradV[3*k+1] and gradV[3*k+2].
//        C, S   The cosine and sine coefficient sets, accessed as C[n][m]
//               and S[n][m] for 0<=m<=n<=M, where M=C.numClust-1. The
//               offset and cluster-size arrays of C are reused for the
//               internal tables, so S is assumed to have the same layout.
//        point  3*numPoints doubles. For the kth point, point[3*k] is the
//               range r, point[3*k+1] is the azimuth (lambda) and
//               point[3*k+2] is the elevation/latitude, in radians.
//        numPoints The number of points.
//        a      The numerator of the ratio a/r whose powers weight the
//               degree-n terms of the sum.
//        c      The constant multiplying the sum; the result is scaled by
//               c/r.
//        scalFactor The scale factor handed to the routines that compute
//               the Legendre ratios/Helmholtz polynomials. It is divided
//               back out of the results.
//
//Consecutive points that share the same range and/or latitude reuse
//intermediate results, so ordering the points accordingly makes the
//evaluation faster.
void spherHarmonicEvalCPP(double *V, double *gradV, const ClusterSetCPP<double> &C,const ClusterSetCPP<double> &S, const double *point, const size_t numPoints, const double a, const double c, const double scalFactor) {
    double temp, r, lambda, *nCoeff;
    const size_t M=C.numClust-1;
    const double pi = 2*acos(0.0);
    size_t n,m,curPoint;
    //Floating point copies of the loop counters n and m, kept so that no
    //integer to double conversions are needed in the inner loops.
    double nf,mf;
    double rPrev,thetaPrev;
    ClusterSetCPP<double> FuncVals;
    ClusterSetCPP<double> FuncDerivs;
    //These are to store sin(m*lambda) and cos(m*lambda) for m=0->M.
    double *SinVec,*CosVec;//These are each of length C.numClust.
    //These are never used at the same time as SinVec and CosVec and are
    //the same size, so they will point to the same memory.
    double *rm,*im;
    //These hold values for the modified forward row algorithm that stay
    //constant for points with the same range and latitude but different
    //longitudes.
    double *XC,*XS;
    //These values are only used if gradV!=NULL. They are initialized to
    //NULL here to suppress a warning if compiled using
    //-Wconditional-uninitialized
    double *XCdr=NULL;
    double *XSdr=NULL;
    double *XCdTheta=NULL;
    double *XSdTheta=NULL;
    //A big chunk of memory will be allocated into a single buffer and
    //split between the variables that need it. That is faster than
    //allocating a bunch of small buffers, and all of the variables are of
    //the same type.
    double *buffer;
        
    //Initialize the ClusterSet classes for the coefficients. The space
    //for the elements will be allocated shortly. The shape information is
    //borrowed from C.
    FuncVals.numClust=C.numClust;
    FuncVals.totalNumEl=C.totalNumEl;
    FuncVals.offsetArray=C.offsetArray;
    FuncVals.clusterSizes=C.clusterSizes;
    FuncDerivs.numClust=C.numClust;
    FuncDerivs.totalNumEl=C.totalNumEl;
    FuncDerivs.offsetArray=C.offsetArray;
    FuncDerivs.clusterSizes=C.clusterSizes;

    //Allocate the buffer and partition it between variables.
    //Without the gradient: nCoeff, SinVec, CosVec, XC, XS (5*numClust) and
    //FuncVals (totalNumEl).
    //With the gradient, additionally: XCdr, XSdr, XCdTheta, XSdTheta
    //(4*numClust) and FuncDerivs (totalNumEl).
    if(gradV==NULL){
        buffer = new double[C.totalNumEl+5*C.numClust];
    }else{
        buffer = new double[2*C.totalNumEl+9*C.numClust];
    }
    {
        double *tempPtr=buffer;
        //This stores all of the powers of a/r needed for the sum, regardless
        //of which algorithm is used.
        nCoeff=tempPtr;
        tempPtr+=C.numClust;
        //The sin and cosine values use the same memory as the im and rm
        //values, because only one will be used depending on which
        //algorithm is executed.
        SinVec=tempPtr;
        rm=SinVec;
        tempPtr+=C.numClust;
        CosVec=tempPtr;
        im=CosVec;
        tempPtr+=C.numClust;
        FuncVals.clusterEls=tempPtr;
        tempPtr+=C.totalNumEl;
        XC=tempPtr;
        tempPtr+=C.numClust;
        XS=tempPtr;
                
        if(gradV!=NULL) {
            tempPtr+=C.numClust;
            FuncDerivs.clusterEls=tempPtr;
            //FuncDerivs holds a full table of totalNumEl elements, so the
            //pointer must be advanced by totalNumEl (not numClust) or the
            //arrays below would overlap the derivative table.
            tempPtr+=C.totalNumEl;
            XCdr=tempPtr;
            tempPtr+=C.numClust;
            XSdr=tempPtr;
            tempPtr+=C.numClust;
            XCdTheta=tempPtr;
            tempPtr+=C.numClust;
            XSdTheta=tempPtr;
        }
    }
        
    nCoeff[0]=1;
    
    //Infinity can never equal a finite input, so the first point always
    //registers as a change in both range and latitude.
    rPrev=std::numeric_limits<double>::infinity();
    thetaPrev=std::numeric_limits<double>::infinity();
    for(curPoint=0;curPoint<numPoints;curPoint++) {
        double thetaCur;
        bool rChanged;
        bool thetaChanged;
        
        r=point[0+3*curPoint];
        lambda=point[1+3*curPoint];
        thetaCur=point[2+3*curPoint];
        
        rChanged=rPrev!=r;
        thetaChanged=thetaCur!=thetaPrev;
        rPrev=r;
        thetaPrev=thetaCur;
        
        //The powers of a/r only depend on the range.
        if(rChanged) {
            temp=a/r;
            for(n=1;n<=M;n++) {
                nCoeff[n]=nCoeff[n-1]*temp;
            }
        }

        if(fabs(thetaCur)<88*pi/180||gradV==NULL) {
        //At latitudes that are not near the poles, the algorithm of Holmes and
        //Featherstone is used. It can not be used for the gradient near the
        //poles, because of the singularity of the spherical coordinate system.
            double u, theta;
            
            //Compute the sine and cosine terms 
            //Explicitly set the first two terms. The m=1 and m=2 entries
            //only exist if M is large enough; writing them otherwise would
            //run past the end of SinVec/CosVec into the adjacent parts of
            //the shared buffer.
            SinVec[0]=0;
            CosVec[0]=1;
            if(M>=1) {
                SinVec[1]=sin(lambda);
                CosVec[1]=cos(lambda);
            }
            //Use a double angle identity to get the second order term.
            if(M>=2) {
                SinVec[2]=2*SinVec[1]*CosVec[1];
                CosVec[2]=1-2*SinVec[1]*SinVec[1];
            }
            //Use a two-part recursion for the rest of the terms.
            for(m=3;m<=M;m++){
                SinVec[m]=2*CosVec[1]*SinVec[m-1]-SinVec[m-2];
                CosVec[m]=2*CosVec[1]*CosVec[m-1]-CosVec[m-2];
            }
                        
        //The spherical coordinate system used in ellips2Sphere uses azimuth
        //and elevation (latitude). However the formulae for spherical harmonic
        //synthesis in the Holmes and Featherstone paper use, pi/2-elevation
        //(colatitude). Thus, the point must be transformed.
            theta=pi/2-thetaCur;
            u=sin(theta);
            if(thetaChanged) {
                //Get the associated Legendre function ratios.
                NALegendreCosRatCPP(FuncVals,  theta, scalFactor);

                //Get the derivatives of the ratios if the gradient is desired.
                if(gradV!=NULL) {
                    NALegendreCosRatDerivCPP(FuncDerivs,FuncVals,theta);
                }
            }
            
            //Evaluate Equation 7 from the Holmes and Featherstone paper.
            //The X terms depend on the range and the latitude, but not on
            //the longitude, so they are reused when neither changed.
            if(rChanged||thetaChanged) {
                //Zero the arrays.
                memset(XC,0,sizeof(double)*C.numClust);
                memset(XS,0,sizeof(double)*C.numClust);

                //Compute the X coefficients for the sum
                for(m=0;m<=M;m++) {
                    for(n=m;n<=M;n++) {
                        XC[m]+=nCoeff[n]*C[n][m]*FuncVals[n][m];
                        XS[m]+=nCoeff[n]*S[n][m]*FuncVals[n][m];
                    }
                }
            }
            
            //Use Horner's method to compute V. The loop counts down from
            //m=M to m=0; the do-while form avoids testing an unsigned
            //counter for being negative.
            V[curPoint]=0;
            m=M+1;
            do {
                m--;
                
                V[curPoint]=V[curPoint]*u+XC[m]*CosVec[m]+XS[m]*SinVec[m];
            } while(m>0);

            //Multiply by the constant in front of the sum and get rid of
            //the scale factor.
            V[curPoint]=(c/r)*V[curPoint]/scalFactor;

            //Compute the gradient, if it is desired.
            if(gradV!=NULL) {
                double J[9];
                double dVdr=0;
                double dVdLambda=0;
                double dVdTheta=0;
                
                if(rChanged||thetaChanged) {
                    memset(XCdr,0,sizeof(double)*C.numClust);
                    memset(XSdr,0,sizeof(double)*C.numClust);
                    memset(XCdTheta,0,sizeof(double)*C.numClust);
                    memset(XSdTheta,0,sizeof(double)*C.numClust);

                    //Evaluate Equation 7 from the Holmes and Featherstone paper.
                    mf=0;
                    for(m=0;m<=M;m++) {
                        nf=mf;
                        for(n=m;n<=M;n++) {
                            double CScal=nCoeff[n]*C[n][m];
                            double SScal=nCoeff[n]*S[n][m];

                            XCdr[m]+=(nf+1)*CScal*FuncVals[n][m];
                            XSdr[m]+=(nf+1)*SScal*FuncVals[n][m];

                            XCdTheta[m]+=CScal*FuncDerivs[n][m];
                            XSdTheta[m]+=SScal*FuncDerivs[n][m];
                            
                            nf++;
                        }
                                    
                        mf++;
                    }
                 }
                
                //Use Horner's method to compute the partials.
                m=M+1;
                mf=static_cast<double>(m);
                do {
                    m--;
                    mf--;
                    
                    dVdr=dVdr*u+XCdr[m]*CosVec[m]+XSdr[m]*SinVec[m];
                    dVdLambda=dVdLambda*u+mf*(-XC[m]*SinVec[m]+XS[m]*CosVec[m]);
                    dVdTheta=dVdTheta*u+XCdTheta[m]*CosVec[m]+XSdTheta[m]*SinVec[m];
                } while(m>0);            

                dVdr=-(c/(r*r))*dVdr/scalFactor;
                dVdLambda=(c/r)*dVdLambda/scalFactor;
            //The minus sign is because the input coordinate was with respect
            //to latitude, not the co-latitude that the NALegendreCosRat
            //function uses.
                dVdTheta=-(c/r)*dVdTheta/scalFactor;

                //NOTE(review): the trailing 0 presumably selects the
                //spherical coordinate system convention; confirm against
                //calcSpherJacobCPP.
                calcSpherJacobCPP(J, point+3*curPoint,0);

                //Now, multiply the transpose of the Jacobian Matrix by the
                //vector of [dVdr;dVdLambda;dVdTheta]
                gradV[0+3*curPoint]=dVdr*J[0]+dVdLambda*J[1]+dVdTheta*J[2];
                gradV[1+3*curPoint]=dVdr*J[3]+dVdLambda*J[4]+dVdTheta*J[5];
                gradV[2+3*curPoint]=dVdr*J[6]+dVdLambda*J[7]+dVdTheta*J[8];
            }
        } else {  
        //At latitudes that are near the poles, the non-singular algorithm of
        //Pines using the fully normalized Helmholtz equations from Fantino and
        //Casotto is used. The algorithm has been slightly modified so that the
        //c/r term is out front and the fully normalized Helmholtz polynomials
        //can be scaled. Also, lumped coefficients are not used. The Pines
        //algorithm is generally slower than the algorithm of Holmes and
        //Featherstone and it suffers a loss of precision near the equator.
        //Thus, the Pines algorithm is only used near the poles where the other
        //algorithm has issues with a singularity.
        //This branch is only reached when gradV!=NULL.
            double s,t,u, CartPoint[3];

            spher2CartCPP(CartPoint,point+3*curPoint,0);

            //Get the direction cosines used by Pines' algorithm.
            s=CartPoint[0]/r;
            t=CartPoint[1]/r;
            u=CartPoint[2]/r;

            //Compute the fully normalized Helmholtz polynomials. thetaPrev
            //was already updated at the top of the loop.
            if(thetaChanged) {
                normHelmHoltzCPP(FuncVals,u, scalFactor);
            }
            
            //Recursively compute the rm and im terms for the sums.
            rm[0]=1;
            im[0]=0;
            for(m=1;m<=M;m++) {
                //These are equation 49 in the Fantino and Casotto paper.
                rm[m]=s*rm[m-1]-t*im[m-1];
                im[m]=s*im[m-1]+t*rm[m-1];
            }

            //Perform the sum for the potential from Equation 44 in the
            //Fantino and Casotto paper.
            V[curPoint]=0;
            for(n=0;n<=M;n++) {
                double innerTerm=0;
                for(m=0;m<=n;m++) {
                    innerTerm+=(C[n][m]*rm[m]+S[n][m]*im[m])*FuncVals[n][m];
                }
                V[curPoint]+=nCoeff[n]*innerTerm;
            }

            V[curPoint]=(c/r)*V[curPoint]/scalFactor;

            //Compute the gradient.
            if(gradV!=NULL) {
                double a1=0;
                double a2=0;
                double a3=0;
                double a4=0;

                normHelmHoltzDerivCPP(FuncDerivs,FuncVals);

                //The equations in these loops are from Table 10.
                nf=0.0;
                for(n=0;n<=M;n++) {
                    double a1Loop=0;
                    double a2Loop=0;
                    double a3Loop;
                    double a4Loop;
                    double CProdMN;
                    double HVal;
                    double dHVal;
                    double Lmn;

                    //The m=0 case only applies to a3 and a4.
                    m=0;
                    mf=0.0;
                    HVal=FuncVals[n][m];
                    dHVal=FuncDerivs[n][m];
                    CProdMN=C[n][m]*rm[m]+S[n][m]*im[m];

                    a3Loop=CProdMN*dHVal;
                    Lmn=(nf+mf+1)*HVal+u*dHVal;//Defined in Table 14.
                    a4Loop=-CProdMN*Lmn;

                    mf=1.0;
                    for(m=1;m<=n;m++) {
                        HVal=FuncVals[n][m];
                        dHVal=FuncDerivs[n][m];

                        a1Loop+=mf*(C[n][m]*rm[m-1]+S[n][m]*im[m-1])*HVal;
                        a2Loop+=mf*(S[n][m]*rm[m-1]-C[n][m]*im[m-1])*HVal;

                        CProdMN=C[n][m]*rm[m]+S[n][m]*im[m];

                        a3Loop+=CProdMN*dHVal;
                        Lmn=(nf+mf+1)*HVal+u*dHVal;
                        a4Loop-=CProdMN*Lmn;

                        mf++;
                    }

                    a1+=nCoeff[n]*a1Loop;
                    a2+=nCoeff[n]*a2Loop;
                    a3+=nCoeff[n]*a3Loop;
                    a4+=nCoeff[n]*a4Loop;

                    nf++;
                }

       //These are equation 70. However, an additional 1/r term has been added,
       //which the original paper omitted when going from Equation 68 to 70.
                {
                    temp=c/(r*r);
                    gradV[0+3*curPoint]=temp*(a1+s*a4)/scalFactor;
                    gradV[1+3*curPoint]=temp*(a2+t*a4)/scalFactor;
                    gradV[2+3*curPoint]=temp*(a3+u*a4)/scalFactor;
                }
            }
        }
    }
    
    delete[] buffer;
}