// -----------------------------------------------------------------------------
// GPUQREngine
// -----------------------------------------------------------------------------
// Entry point that factorizes a list of user-supplied fronts.
//
// The routine builds an internal array of Front objects mirroring the user's
// fronts, copies each user front's values into a single contiguous workspace
// (stored with row stride n), transfers that workspace to the GPU, runs
// GPUQREngine_Internal, and finally copies the resulting R values back into
// the user's cpuR arrays.
//
// Returns:
//   QRENGINE_OUTOFMEMORY  if the front array, either workspace, or a staircase
//                         could not be allocated;
//   QRENGINE_GPUERROR     if the host-to-device transfer fails;
//   otherwise whatever GPUQREngine_Cleanup returns for the result code of
//   GPUQREngine_Internal (presumably the code it is handed -- confirm against
//   GPUQREngine_Cleanup).
//
// Every exit path after the workspaces are requested goes through
// GPUQREngine_Cleanup, which is handed the fronts and both workspaces.
QREngineResultCode GPUQREngine
(
    size_t gpuMemorySize,   // The total available GPU memory size in bytes
    Front *userFronts,      // The list of fronts to factorize
    Int numFronts,          // The number of fronts to factorize
    QREngineStats *stats    // An optional parameter. If present, statistics
                            // are collected and passed back to the caller
                            // via this struct
)
{
    /* Allocate workspaces */
    Front *fronts = (Front*) SuiteSparse_calloc(numFronts, sizeof(Front));
    if(!fronts)
    {
        return QRENGINE_OUTOFMEMORY;
    }

    // Total number of values needed for all F blocks and all R blocks.
    size_t FSize, RSize;
    FSize = RSize = 0;

    // The loop counter uses Int (the type of numFronts) so that the comparison
    // and the front id cannot overflow a narrower int.
    for(Int f=0; f<numFronts; f++)
    {
        /* Configure the front */
        Front *userFront = &(userFronts[f]);
        Int m = userFront->fm;
        Int n = userFront->fn;

        // Construct the internal front in place inside the calloc'd array.
        Front *front = new (&fronts[f]) Front(f, EMPTY, m, n);
        FSize += front->getNumFrontValues();
        RSize += front->getNumRValues();
    }

    // We have to allocate page-locked CPU-GPU space to leverage asynchronous
    // memory transfers.  This has to be done in a way that the CUDA driver is
    // aware of, which unfortunately means making a copy of the user input.

    // calloc pagelocked space on CPU, and calloc space on the GPU
    Workspace *wsMongoF = Workspace::allocate(FSize,    // CPU and GPU
        sizeof(double), true, true, true, true);

    // calloc pagelocked space on the CPU.  Nothing on the GPU
    Workspace *wsMongoR = Workspace::allocate(RSize,    // CPU
        sizeof(double), true, true, false, true);

    /* Cleanup and return if we ran out of memory. */
    if(!wsMongoF || !wsMongoR)
    {
        return GPUQREngine_Cleanup (QRENGINE_OUTOFMEMORY,
            userFronts, fronts, numFronts, wsMongoF, wsMongoR);
    }

    /* Prepare the fronts for GPU execution. */
    size_t FOffset, ROffset;
    FOffset = ROffset = 0;
    for(Int f=0; f<numFronts; f++)
    {
        // Set the front pointers; make the copy from user data into front data.
        // Each front gets a slice of the shared workspaces, starting at the
        // running offsets.
        Front *front = &(fronts[f]);
        front->F    = CPU_REFERENCE(wsMongoF, double*) + FOffset;
        front->gpuF = GPU_REFERENCE(wsMongoF, double*) + FOffset;
        front->cpuR = CPU_REFERENCE(wsMongoR, double*) + ROffset;
        FOffset += front->getNumFrontValues();
        ROffset += front->getNumRValues();

        /* COPY USER DATA (user's F to our F) */
        Front *userFront = &(userFronts[f]);
        double *userF = userFront->F;
        double *F = front->F;
        Int m = userFront->fm;
        Int n = userFront->fn;
        bool isColMajor = userFront->isColMajor;
        Int ldn = userFront->ldn;
        for(Int i=0; i<m; i++)
        {
            for(Int j=0; j<n; j++)
            {
                // Our copy is indexed as F[i*n+j]; the user's entry (i,j) is
                // read with stride ldn according to the user's layout flag.
                F[i*n+j] = (isColMajor ? userF[j*ldn+i] : userF[i*ldn+j]);
            }
        }

        /* Attach either the user-specified Stair, or compute it. */
        front->Stair = userFront->Stair;
        if(!front->Stair) front->Stair = GPUQREngine_FindStaircase(front);

        /* Cleanup and return if we ran out of memory building the staircase */
        if(!front->Stair)
        {
            return GPUQREngine_Cleanup (QRENGINE_OUTOFMEMORY,
                userFronts, fronts, numFronts, wsMongoF, wsMongoR);
        }
    }

    /* Transfer the fronts to the GPU. */
    if(!wsMongoF->transfer(cudaMemcpyHostToDevice))
    {
        return GPUQREngine_Cleanup (QRENGINE_GPUERROR,
            userFronts, fronts, numFronts, wsMongoF, wsMongoR);
    }

    /* Do the factorization for this set of fronts. */
    QREngineResultCode result = GPUQREngine_Internal(gpuMemorySize, fronts,
        numFronts, NULL, NULL, NULL, stats);
    if(result != QRENGINE_SUCCESS)
    {
        return GPUQREngine_Cleanup (result,
            userFronts, fronts, numFronts, wsMongoF, wsMongoR);
    }

    /* COPY USER DATA (our R back to user's R) */
    for(Int f=0; f<numFronts; f++)
    {
        Front *userFront = &(userFronts[f]);
        double *R = (&fronts[f])->cpuR;
        double *userR = userFront->cpuR;
        Int n = userFront->fn;
        // NOTE(review): rank is taken from the user's front rather than from
        // the internal front that was just factorized -- confirm that the
        // user's front carries the intended value here.
        Int rank = userFront->rank;
        bool isColMajor = userFront->isColMajor;
        Int ldn = userFront->ldn;
        for(Int i=0; i<rank; i++)
        {
            for(Int j=0; j<n; j++)
            {
                // NOTE(review): this is not the mirror image of the copy-in
                // above (which pairs F[i*n+j] with userF[j*ldn+i]).  Left
                // unchanged; confirm against the layout cpuR actually uses.
                userR[i*ldn+j] = (isColMajor ? R[j*n+i] : R[i*n+j]);
            }
        }
    }

    /* Return that the factorization was successful. */
    return GPUQREngine_Cleanup (QRENGINE_SUCCESS,
        userFronts, fronts, numFronts, wsMongoF, wsMongoR);
}
/**
 * Build an EdgeCutProblem for a graph with _n vertices and _nz entries.
 *
 * Storage for the object itself comes from SuiteSparse_malloc and the object
 * is constructed in place. Any of _p, _i, _x, _w that is non-NULL is used
 * directly and flagged as shallow; when _p or _i is NULL a fresh array is
 * allocated instead (n+1 zeroed entries for p, nz entries for i). The x and w
 * pointers are stored exactly as given, including NULL.
 *
 * All per-vertex work arrays (partition, gains, heaps, matching data, mark
 * array) are allocated with length n.
 *
 * @return the new problem, or NULL if any allocation fails. On failure after
 *         construction the destructor is invoked explicitly before returning
 *         (it is presumably responsible for releasing whatever was acquired
 *         so far -- confirm against ~EdgeCutProblem).
 */
EdgeCutProblem *EdgeCutProblem::create(const Int _n, const Int _nz, Int *_p,
                                       Int *_i, double *_x, double *_w)
{
    void *storage = SuiteSparse_malloc(1, sizeof(EdgeCutProblem));
    if (storage == NULL)
    {
        return NULL;
    }

    // Construct the object inside the raw storage obtained above.
    EdgeCutProblem *problem = new (storage) EdgeCutProblem();

    const size_t vertexCount = static_cast<size_t>(_n);
    const size_t entryCount  = static_cast<size_t>(_nz);

    // A caller-provided array is borrowed (shallow) rather than owned.
    problem->shallow_p = (_p != NULL);
    problem->shallow_i = (_i != NULL);
    problem->shallow_x = (_x != NULL);
    problem->shallow_w = (_w != NULL);

    problem->n  = _n;
    problem->nz = _nz;

    // Column pointers: borrow the caller's, or allocate n+1 zeroed entries.
    if (problem->shallow_p)
    {
        problem->p = _p;
    }
    else
    {
        problem->p = (Int *)SuiteSparse_calloc(vertexCount + 1, sizeof(Int));
    }

    // Row indices: borrow the caller's, or allocate nz entries.
    if (problem->shallow_i)
    {
        problem->i = _i;
    }
    else
    {
        problem->i = (Int *)SuiteSparse_malloc(entryCount, sizeof(Int));
    }

    problem->x = _x;
    problem->w = _w;
    problem->X = 0.0;
    problem->W = 0.0;
    problem->H = 0.0;

    if (!(problem->p && problem->i))
    {
        problem->~EdgeCutProblem();
        return NULL;
    }

    // Partition state and boundary heaps, one slot per vertex.
    problem->partition =
        (bool *)SuiteSparse_malloc(vertexCount, sizeof(bool));
    problem->vertexGains =
        (double *)SuiteSparse_malloc(vertexCount, sizeof(double));
    problem->externalDegree =
        (Int *)SuiteSparse_calloc(vertexCount, sizeof(Int));
    problem->bhIndex =
        (Int *)SuiteSparse_calloc(vertexCount, sizeof(Int));
    problem->bhHeap[0] =
        (Int *)SuiteSparse_malloc(vertexCount, sizeof(Int));
    problem->bhHeap[1] =
        (Int *)SuiteSparse_malloc(vertexCount, sizeof(Int));
    problem->bhSize[0] = 0;
    problem->bhSize[1] = 0;

    const bool partitionStateReady =
        problem->partition && problem->vertexGains
        && problem->externalDegree && problem->bhIndex
        && problem->bhHeap[0] && problem->bhHeap[1];
    if (!partitionStateReady)
    {
        problem->~EdgeCutProblem();
        return NULL;
    }

    // Cut metrics start at zero.
    problem->heuCost   = 0.0;
    problem->cutCost   = 0.0;
    problem->W0        = 0.0;
    problem->W1        = 0.0;
    problem->imbalance = 0.0;

    // Coarsening bookkeeping: no parent graph, level zero, nothing coarsened.
    problem->parent = NULL;
    problem->clevel = 0;
    problem->cn     = 0;

    // Matching data and the mark array, one slot per vertex.
    problem->matching =
        (Int *)SuiteSparse_calloc(vertexCount, sizeof(Int));
    problem->matchmap =
        (Int *)SuiteSparse_malloc(vertexCount, sizeof(Int));
    problem->invmatchmap =
        (Int *)SuiteSparse_malloc(vertexCount, sizeof(Int));
    problem->matchtype =
        (Int *)SuiteSparse_malloc(vertexCount, sizeof(Int));
    problem->markArray =
        (Int *)SuiteSparse_calloc(vertexCount, sizeof(Int));
    problem->markValue = 1;
    problem->singleton = -1;

    const bool matchingStateReady =
        problem->matching && problem->matchmap && problem->invmatchmap
        && problem->markArray && problem->matchtype;
    if (!matchingStateReady)
    {
        problem->~EdgeCutProblem();
        return NULL;
    }

    problem->initialized = false;

    return problem;
}