// GPUQREngine: factorize a batch of user-supplied fronts.
//
// The routine builds an internal array of Front objects mirroring the user's
// fronts, copies each user front into one bulk workspace (wsMongoF), pushes
// that workspace to the GPU, runs GPUQREngine_Internal on the internal fronts,
// and finally copies the resulting R values from a second bulk workspace
// (wsMongoR) into each user front's cpuR array.
//
// Result codes produced here:
//   QRENGINE_OUTOFMEMORY  - the internal front array, either workspace, or a
//                           computed staircase could not be obtained
//   QRENGINE_GPUERROR     - the host-to-device transfer of wsMongoF failed
//   (code from GPUQREngine_Internal) - when the factorization did not succeed
//   QRENGINE_SUCCESS      - otherwise
// Apart from the very first allocation failure, every exit is routed through
// GPUQREngine_Cleanup, which receives the fronts and both workspaces.
QREngineResultCode GPUQREngine
(
    size_t gpuMemorySize,   // The total available GPU memory size in bytes
    Front *userFronts,      // The list of fronts to factorize
    Int numFronts,          // The number of fronts to factorize
    QREngineStats *stats    // An optional parameter. If present, statistics
                            // are collected and passed back to the caller
                            // via this struct
)
{
    /* Allocate the internal front array. Nothing else exists yet, so a
       failure here can return directly without going through cleanup. */
    Front *fronts =
        static_cast<Front*>(SuiteSparse_calloc(numFronts, sizeof(Front)));
    if(!fronts)
    {
        return QRENGINE_OUTOFMEMORY;
    }

    /* Construct each internal front in place and total up how many values
       the F and R workspaces must hold across all fronts. */
    size_t FSize = 0;
    size_t RSize = 0;
    // The index has the same type as numFronts so that it can neither be
    // truncated nor overflow when Int is wider than int.
    for(Int f=0; f<numFronts; f++)
    {
        /* Configure the front */
        Front *userFront = &(userFronts[f]);
        Int m = userFront->fm;
        Int n = userFront->fn;
        Front *front = new (&fronts[f]) Front(f, EMPTY, m, n);
        FSize += front->getNumFrontValues();
        RSize += front->getNumRValues();
    }

    // We have to allocate page-locked CPU-GPU space to leverage asynchronous
    // memory transfers.  This has to be done in a way that the CUDA driver is
    // aware of, which unfortunately means making a copy of the user input.

    // calloc pagelocked space on CPU, and calloc space on the GPU
    Workspace *wsMongoF = Workspace::allocate(FSize,    // CPU and GPU
        sizeof(double), true, true, true, true);

    // calloc pagelocked space on the CPU.  Nothing on the GPU
    Workspace *wsMongoR = Workspace::allocate(RSize,    // CPU
        sizeof(double), true, true, false, true);

    /* Cleanup and return if we ran out of memory. */
    if(!wsMongoF || !wsMongoR)
    {
        return GPUQREngine_Cleanup (QRENGINE_OUTOFMEMORY,
            userFronts, fronts, numFronts, wsMongoF, wsMongoR);
    }

    /* Prepare the fronts for GPU execution. */
    size_t FOffset = 0;
    size_t ROffset = 0;
    for(Int f=0; f<numFronts; f++)
    {
        // Carve this front's slice out of the bulk workspaces. The same
        // offset is applied to the CPU and GPU views of wsMongoF.
        Front *front = &(fronts[f]);
        front->F    = CPU_REFERENCE(wsMongoF, double*) + FOffset;
        front->gpuF = GPU_REFERENCE(wsMongoF, double*) + FOffset;
        front->cpuR = CPU_REFERENCE(wsMongoR, double*) + ROffset;
        FOffset += front->getNumFrontValues();
        ROffset += front->getNumRValues();

        /* COPY USER DATA (user's F to our F) */
        // Our copy is stored with row stride n; the user's data is read with
        // leading dimension ldn, transposing the index when isColMajor is set.
        Front *userFront = &(userFronts[f]);
        double *userF = userFront->F;
        double *F = front->F;
        Int m = userFront->fm;
        Int n = userFront->fn;
        bool isColMajor = userFront->isColMajor;
        Int ldn = userFront->ldn;
        for(Int i=0; i<m; i++)
        {
            for(Int j=0; j<n; j++)
            {
                F[i*n+j] = (isColMajor ? userF[j*ldn+i] : userF[i*ldn+j]);
            }
        }

        /* Attach either the user-specified Stair, or compute it. */
        front->Stair = userFront->Stair;
        if(!front->Stair) front->Stair = GPUQREngine_FindStaircase(front);

        /* Cleanup and return if we ran out of memory building the staircase */
        if(!front->Stair)
        {
            return GPUQREngine_Cleanup (QRENGINE_OUTOFMEMORY,
                userFronts, fronts, numFronts, wsMongoF, wsMongoR);
        }
    }

    /* Transfer the fronts to the GPU. */
    if(!wsMongoF->transfer(cudaMemcpyHostToDevice))
    {
        return GPUQREngine_Cleanup (QRENGINE_GPUERROR,
            userFronts, fronts, numFronts, wsMongoF, wsMongoR);
    }

    /* Do the factorization for this set of fronts. */
    QREngineResultCode result = GPUQREngine_Internal(gpuMemorySize, fronts,
        numFronts, NULL, NULL, NULL, stats);
    if(result != QRENGINE_SUCCESS)
    {
        return GPUQREngine_Cleanup (result,
            userFronts, fronts, numFronts, wsMongoF, wsMongoR);
    }

    /* COPY USER DATA (our R back to user's R) */
    // NOTE(review): the destination is always indexed as i*ldn+j while only
    // the source index switches on isColMajor, which is not the mirror image
    // of the copy-in above. Kept exactly as found - confirm the intended
    // layout of the user's R. The row count is taken from the user front's
    // rank field, not from the internal front.
    for(Int f=0; f<numFronts; f++)
    {
        Front *userFront = &(userFronts[f]);
        double *R = (&fronts[f])->cpuR;
        double *userR = userFront->cpuR;
        Int n = userFront->fn;
        Int rank = userFront->rank;
        bool isColMajor = userFront->isColMajor;
        Int ldn = userFront->ldn;
        for(Int i=0; i<rank; i++)
        {
            for(Int j=0; j<n; j++)
            {
                userR[i*ldn+j] = (isColMajor ? R[j*n+i] : R[i*n+j]);
            }
        }
    }

    /* Return that the factorization was successful. */
    return GPUQREngine_Cleanup (QRENGINE_SUCCESS,
        userFronts, fronts, numFronts, wsMongoF, wsMongoR);
}
// Example no. 2
// 0
// Build a new EdgeCutProblem of dimension _n with _nz entries.
//
// Any of _p, _i, _x, _w that is non-NULL is adopted as-is and the matching
// shallow_* flag is set; when _p or _i is NULL, an array is allocated here
// instead (n + 1 zeroed entries for p, nz entries for i). x and w are always
// taken directly from the arguments. All per-n work arrays are then
// allocated and the scalar state is reset.
//
// Returns the new object, or NULL if the object itself or any array could
// not be allocated. On an array failure the object's destructor is invoked
// explicitly before returning.
// NOTE(review): the raw storage is not released in this function on those
// paths - presumably the destructor takes care of it; confirm.
EdgeCutProblem *EdgeCutProblem::create(const Int _n, const Int _nz, Int *_p,
                                       Int *_i, double *_x, double *_w)
{
    void *storage = SuiteSparse_malloc(1, sizeof(EdgeCutProblem));
    if (storage == NULL)
    {
        return NULL;
    }

    // Construct the object inside the storage obtained above.
    EdgeCutProblem *problem = new (storage) EdgeCutProblem();

    // Record which input arrays are borrowed from the caller.
    problem->shallow_p = (_p != NULL);
    problem->shallow_i = (_i != NULL);
    problem->shallow_x = (_x != NULL);
    problem->shallow_w = (_w != NULL);

    // Dimensions, kept both as given and as size_t for the allocators.
    const size_t sizeN = static_cast<size_t>(_n);
    problem->n         = _n;

    const size_t sizeNz = static_cast<size_t>(_nz);
    problem->nz         = _nz;

    // Column pointers: borrow the caller's, or allocate n + 1 zeroed entries.
    if (problem->shallow_p)
    {
        problem->p = _p;
    }
    else
    {
        problem->p
            = static_cast<Int *>(SuiteSparse_calloc(sizeN + 1, sizeof(Int)));
    }

    // Indices: borrow the caller's, or allocate nz entries.
    if (problem->shallow_i)
    {
        problem->i = _i;
    }
    else
    {
        problem->i
            = static_cast<Int *>(SuiteSparse_malloc(sizeNz, sizeof(Int)));
    }

    problem->x = _x;
    problem->w = _w;
    problem->X = 0.0;
    problem->W = 0.0;
    problem->H = 0.0;

    const bool haveStructure = problem->p && problem->i;
    if (!haveStructure)
    {
        problem->~EdgeCutProblem();
        return NULL;
    }

    // Partition bookkeeping and the two heaps, each of length n.
    problem->partition
        = static_cast<bool *>(SuiteSparse_malloc(sizeN, sizeof(bool)));
    problem->vertexGains
        = static_cast<double *>(SuiteSparse_malloc(sizeN, sizeof(double)));
    problem->externalDegree
        = static_cast<Int *>(SuiteSparse_calloc(sizeN, sizeof(Int)));
    problem->bhIndex
        = static_cast<Int *>(SuiteSparse_calloc(sizeN, sizeof(Int)));
    problem->bhHeap[0]
        = static_cast<Int *>(SuiteSparse_malloc(sizeN, sizeof(Int)));
    problem->bhHeap[1]
        = static_cast<Int *>(SuiteSparse_malloc(sizeN, sizeof(Int)));
    problem->bhSize[0] = 0;
    problem->bhSize[1] = 0;

    const bool havePartitionData
        = problem->partition && problem->vertexGains
          && problem->externalDegree && problem->bhIndex
          && problem->bhHeap[0] && problem->bhHeap[1];
    if (!havePartitionData)
    {
        problem->~EdgeCutProblem();
        return NULL;
    }

    // Cut statistics start at zero.
    problem->heuCost   = 0.0;
    problem->cutCost   = 0.0;
    problem->W0        = 0.0;
    problem->W1        = 0.0;
    problem->imbalance = 0.0;

    // Matching state, each array of length n.
    problem->parent = NULL;
    problem->clevel = 0;
    problem->cn     = 0;
    problem->matching
        = static_cast<Int *>(SuiteSparse_calloc(sizeN, sizeof(Int)));
    problem->matchmap
        = static_cast<Int *>(SuiteSparse_malloc(sizeN, sizeof(Int)));
    problem->invmatchmap
        = static_cast<Int *>(SuiteSparse_malloc(sizeN, sizeof(Int)));
    problem->matchtype
        = static_cast<Int *>(SuiteSparse_malloc(sizeN, sizeof(Int)));
    problem->markArray
        = static_cast<Int *>(SuiteSparse_calloc(sizeN, sizeof(Int)));
    problem->markValue = 1;
    problem->singleton = -1;

    const bool haveMatchingData
        = problem->matching && problem->matchmap && problem->invmatchmap
          && problem->markArray && problem->matchtype;
    if (!haveMatchingData)
    {
        problem->~EdgeCutProblem();
        return NULL;
    }

    problem->initialized = false;

    return problem;
}