QREngineResultCode GPUQREngine
(
    size_t gpuMemorySize,   // The total available GPU memory size in bytes
    Front *userFronts,      // The list of fronts to factorize
    Int numFronts,          // The number of fronts to factorize
    QREngineStats *stats    // An optional parameter. If present, statistics
                            // are collected and passed back to the caller
                            // via this struct
)
{
    /* Allocate workspaces */
    Front *fronts = (Front*) SuiteSparse_calloc(numFronts, sizeof(Front));
    if(!fronts)
    {
        return QRENGINE_OUTOFMEMORY;
    }

    size_t FSize, RSize;
    FSize = RSize = 0;
    for(int f=0; f<numFronts; f++)
    {
        /* Configure the front */
        Front *userFront = &(userFronts[f]);
        Int m = userFront->fm;
        Int n = userFront->fn;
        Front *front = new (&fronts[f]) Front(f, EMPTY, m, n);
        FSize += front->getNumFrontValues();
        RSize += front->getNumRValues();
    }

    // We have to allocate page-locked CPU-GPU space to leverage asynchronous
    // memory transfers.  This has to be done in a way that the CUDA driver is
    // aware of, which unfortunately means making a copy of the user input.

    // calloc pagelocked space on CPU, and calloc space on the GPU
    Workspace *wsMongoF = Workspace::allocate(FSize,    // CPU and GPU
        sizeof(double), true, true, true, true);

    // calloc pagelocked space on the CPU.  Nothing on the GPU
    Workspace *wsMongoR = Workspace::allocate(RSize,    // CPU
        sizeof(double), true, true, false, true);

    /* Cleanup and return if we ran out of memory. */
    if(!wsMongoF || !wsMongoR)
    {
        return GPUQREngine_Cleanup (QRENGINE_OUTOFMEMORY,
            userFronts, fronts, numFronts, wsMongoF, wsMongoR);
    }

    /* Prepare the fronts for GPU execution. */
    size_t FOffset, ROffset;
    FOffset = ROffset = 0;
    for(int f=0; f<numFronts; f++)
    {
        // Set the front pointers; make the copy from user data into front data.
        Front *front = &(fronts[f]);
        front->F    = CPU_REFERENCE(wsMongoF, double*) + FOffset;
        front->gpuF = GPU_REFERENCE(wsMongoF, double*) + FOffset;
        front->cpuR = CPU_REFERENCE(wsMongoR, double*) + ROffset;
        FOffset += front->getNumFrontValues();
        ROffset += front->getNumRValues();

        /* COPY USER DATA (user's F to our F) */
        Front *userFront = &(userFronts[f]);
        double *userF = userFront->F;
        double *F = front->F;
        Int m = userFront->fm;
        Int n = userFront->fn;
        bool isColMajor = userFront->isColMajor;
        Int ldn = userFront->ldn;
        for(Int i=0; i<m; i++)
        {
            for(Int j=0; j<n; j++)
            {
                F[i*n+j] = (isColMajor ? userF[j*ldn+i] : userF[i*ldn+j]);
            }
        }

        /* Attach either the user-specified Stair, or compute it. */
        front->Stair = userFront->Stair;
        if(!front->Stair) front->Stair = GPUQREngine_FindStaircase(front);

        /* Cleanup and return if we ran out of memory building the staircase */
        if(!front->Stair)
        {
            return GPUQREngine_Cleanup (QRENGINE_OUTOFMEMORY,
                userFronts, fronts, numFronts, wsMongoF, wsMongoR);
        }
    }

    /* Transfer the fronts to the GPU. */
    if(!wsMongoF->transfer(cudaMemcpyHostToDevice))
    {
        return GPUQREngine_Cleanup (QRENGINE_GPUERROR,
            userFronts, fronts, numFronts, wsMongoF, wsMongoR);
    }

    /* Do the factorization for this set of fronts. */
    QREngineResultCode result = GPUQREngine_Internal(gpuMemorySize, fronts,
        numFronts, NULL, NULL, NULL, stats);
    if(result != QRENGINE_SUCCESS)
    {
        return GPUQREngine_Cleanup (result,
            userFronts, fronts, numFronts, wsMongoF, wsMongoR);
    }

    /* COPY USER DATA (our R back to user's R) */
    for(int f=0; f<numFronts; f++)
    {
        Front *userFront = &(userFronts[f]);
        double *R = (&fronts[f])->cpuR;
        double *userR = userFront->cpuR;
        Int m = userFront->fm;
        Int n = userFront->fn;
        Int rank = userFront->rank;
        bool isColMajor = userFront->isColMajor;
        Int ldn = userFront->ldn;
        for(Int i=0; i<rank; i++)
        {
            for(Int j=0; j<n; j++)
            {
                userR[i*ldn+j] = (isColMajor ? R[j*n+i] : R[i*n+j]);
            }
        }
    }

    /* Return that the factorization was successful. */
    return GPUQREngine_Cleanup (QRENGINE_SUCCESS,
        userFronts, fronts, numFronts, wsMongoF, wsMongoR);
}