/**
 * Return the value the head should take if some propagation is possible.
 */
lbool MinisatID::canPropagateHead(const Agg& agg, const Weight& CC, const Weight& CP) {
    //if (nomoreprops[agg.getIndex()] || headproptime[agg.getIndex()]!=-1) {
    //    return headvalue[agg.getIndex()];
    //}

    auto result = l_Undef;
    //add if derived: headproptime[agg.getIndex()] = getStack().size();
    auto b = agg.getBound();
    if (agg.hasUB()) {
        if (CC > b) {
            result = l_False;
        } else if (CP <= b) {
            result = l_True;
        }
    } else {
        if (CC >= b) {
            result = l_True;
        } else if (CP < b) {
            result = l_False;
        }
    }
    if (agg.getSem() == AggSem::OR) {
        if (result == l_True) {
            result = l_Undef;
        } else if (result == l_False) {
            result = l_True;
        }
    }
    return result;
}
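// For intuition, a minimal standalone sketch of the upper-bound branch above, assuming
// CC is the sum of the certainly-true literals and CP the maximum still-possible sum
// (the values the solver passes in). The enum and function are hypothetical, for
// illustration only, not MinisatID's API.
#include <cassert>

enum class TV { True, False, Undef }; // stand-in for lbool

TV headValueUB(int CC, int CP, int b) {
    if (CC > b)  return TV::False; // the certain part already exceeds the bound
    if (CP <= b) return TV::True;  // even the largest possible sum still fits
    return TV::Undef;
}

int main() {
    assert(headValueUB(4, 9, 10) == TV::True);    // CP = 9 <= 10: head must be true
    assert(headValueUB(11, 15, 10) == TV::False); // CC = 11 > 10: head must be false
    assert(headValueUB(4, 15, 10) == TV::Undef);  // nothing can be concluded yet
}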
// Template header restored: LOOPS_X and THREADS_X are used as compile-time constants below.
template<class Agg, class UnaryOp, class BinaryOp, int LOOPS_X, int THREADS_X>
__global__ void kAggShortRows(const float* mat, float* matSum, const uint width, const uint height,
                              Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint shmemX = THREADS_X + 1;
    __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y * shmemX];

    const uint tidx = hipThreadIdx_y * THREADS_X + hipThreadIdx_x;
    const uint ty = LOOPS_X == 1 ? tidx / width : hipThreadIdx_y; // when loops==1, width is gonna be smaller than block x dim
    const uint tx = LOOPS_X == 1 ? tidx % width : hipThreadIdx_x;
    const uint bidx = hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x;
    const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y;
    float* shmemWrite = shmem + MUL24(ty, shmemX) + tx;
    matSum += blockRowIdx + tidx;
    //    shmem[MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x] = 0;
    mat += width * blockRowIdx + MUL24(ty, width) + tx;
    float* shmemWriteZeros = &shmem[MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x];

    bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y;

    if (blockRowIdx < height) {
#pragma unroll
        for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) {
            doAgg &= tidx + y + blockRowIdx < height;
            const bool heightIdxOK = ty < AGG_SHORT_ROWS_THREADS_Y && ty + y + blockRowIdx < height;

            shmemWriteZeros[0] = agg.getBaseValue();
            __syncthreads();
#pragma unroll
            for (uint x = 0; x < LOOPS_X * THREADS_X; x += THREADS_X) {
                //                __syncthreads();
                if (heightIdxOK && x + tx < width) {
                    shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]);
                }
            }
            __syncthreads();
            if (doAgg) {
                /*
                 * I tried doing this final sum as a 4-step reduction, with 8 threads
                 * per warp participating. It was slightly slower.
                 */
                float accum = agg.getBaseValue();
                float* shmemRead = shmem + MUL24(tidx, shmemX);
                // this loops too much if the rows are really short :(
#pragma unroll
                for (uint i = 0; i < THREADS_X; i++) {
                    accum = agg(accum, shmemRead[0]);
                    shmemRead++;
                }
                matSum[0] = bop(matSum[0], accum);
                matSum += AGG_SHORT_ROWS_THREADS_Y;
            }
            __syncthreads();
            mat += width * AGG_SHORT_ROWS_THREADS_Y;
        }
    }
}
// Template header restored: Agg is the aggregation functor type.
template<class Agg>
__global__ void kTotalAgg(const float* a, float* const target, const uint numElements, Agg agg) {
    __shared__ float shmem[DP_BLOCKSIZE];
    uint eidx = DP_BLOCKSIZE * hipBlockIdx_x + hipThreadIdx_x;
    shmem[hipThreadIdx_x] = agg.getBaseValue();
    if (eidx < hipGridDim_x * DP_BLOCKSIZE) {
        for (; eidx < numElements; eidx += hipGridDim_x * DP_BLOCKSIZE) {
            shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], a[eidx]);
        }
    }
    __syncthreads();
    if (hipThreadIdx_x < 256) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 256]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 128) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 128]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 64) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 64]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 32) { // final reduction within a single warp
        volatile float* mysh = &shmem[hipThreadIdx_x];
        *mysh = agg(*mysh, mysh[32]);
        *mysh = agg(*mysh, mysh[16]);
        *mysh = agg(*mysh, mysh[8]);
        *mysh = agg(*mysh, mysh[4]);
        *mysh = agg(*mysh, mysh[2]);
        *mysh = agg(*mysh, mysh[1]);
        if (hipThreadIdx_x == 0) {
            target[hipBlockIdx_x] = *mysh;
        }
    }
}
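// A minimal host-side sketch of driving kTotalAgg, assuming DP_BLOCKSIZE is 512 (the
// tree reduction above starts by folding in shmem[tid + 256], so each block must hold
// 512 values) and assuming the kernel above is in scope. The Add functor is
// hypothetical; the repository supplies its own Agg types with this interface.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

struct Add {
    __host__ __device__ float getBaseValue() const { return 0.0f; }
    __host__ __device__ float operator()(float a, float b) const { return a + b; }
};

int main() {
    const unsigned int n = 1u << 20, blocks = 128;
    std::vector<float> h(n, 1.0f);
    float *dA, *dPartial;
    hipMalloc(&dA, n * sizeof(float));
    hipMalloc(&dPartial, blocks * sizeof(float));
    hipMemcpy(dA, h.data(), n * sizeof(float), hipMemcpyHostToDevice);

    // One partial result per block; each block runs DP_BLOCKSIZE (assumed 512) threads.
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kTotalAgg<Add>), dim3(blocks), dim3(512), 0, 0,
                       dA, dPartial, n, Add());

    std::vector<float> partial(blocks);
    hipMemcpy(partial.data(), dPartial, blocks * sizeof(float), hipMemcpyDeviceToHost);
    float total = 0.0f;
    for (float p : partial) total += p; // finish the last reduction step on the host
    printf("total = %f (expected %u)\n", total, n);
    hipFree(dA);
    hipFree(dPartial);
}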
// Template header restored; unlike kAggShortRows, LOOPS_X is computed at runtime here.
template<class Agg, class UnaryOp, class BinaryOp>
__global__ void kAggShortRows2(const float* mat, float* matSum, const uint width, const uint height,
                               Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint shmemX = AGG_SHORT_ROWS_THREADS_X + 1;
    __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y * shmemX];
    const uint LOOPS_X = DIVUP(width, AGG_SHORT_ROWS_THREADS_X);
    const uint tidx = hipThreadIdx_y * AGG_SHORT_ROWS_THREADS_X + hipThreadIdx_x;
    const uint bidx = hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x;
    const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y;

    float* shmemWrite = shmem + MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x;
    matSum += blockRowIdx + tidx;
    //    shmem[MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x] = 0;
    mat += width * blockRowIdx + MUL24(hipThreadIdx_y, width) + hipThreadIdx_x;

    bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y;
    if (blockRowIdx < height) {
        for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) {
            doAgg &= tidx + y + blockRowIdx < height;
            const bool heightIdxOK = hipThreadIdx_y + y + blockRowIdx < height;

            float accum = agg.getBaseValue();
            shmemWrite[0] = agg.getBaseValue();

            for (uint x = 0; x < LOOPS_X * AGG_SHORT_ROWS_THREADS_X; x += AGG_SHORT_ROWS_THREADS_X) {
                //                __syncthreads();
                if (heightIdxOK && x + hipThreadIdx_x < width) {
                    shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]);
                }
            }
            __syncthreads();
            if (doAgg) {
                float* shmemRead = shmem + MUL24(tidx, shmemX);
#pragma unroll
                for (uint i = 0; i < AGG_SHORT_ROWS_THREADS_X; i++) {
                    accum = agg(accum, shmemRead[0]);
                    shmemRead++;
                }
                matSum[0] = bop(matSum[0], accum);
                matSum += AGG_SHORT_ROWS_THREADS_Y;
            }
            __syncthreads();
            mat += width * AGG_SHORT_ROWS_THREADS_Y;
        }
    }
}
/**
 * Returns l_True if this aggregate can already be propagated during initialization, so it
 * will never change truth value and can be left out of any later propagations.
 * Returns l_False if the aggregate is certainly unsat, and l_Undef otherwise.
 */
lbool FWAgg::initialize(const Agg& agg) {
    auto confl = nullPtrClause;
    auto hv = canPropagateHead(agg, getCC(), getCP());
    bool alwaystrue = false;
    if (hv != l_Undef) {
        alwaystrue = true;
    }
    if (hv == l_True) {
        confl = getSet().notifySolver(new HeadReason(agg, agg.getHead()));
    } else if (hv == l_False) {
        confl = getSet().notifySolver(new HeadReason(agg, not agg.getHead()));
    }
    if (confl != nullPtrClause) {
        return l_False;
    }
    return alwaystrue ? l_True : l_Undef;
}
// Template header restored. One thread per column; each walks its whole column.
template<class Agg, class UnaryOp, class BinaryOp>
__global__ void kDumbAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height,
                             Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    if (idx < width) {
        float mx = agg.getBaseValue();
        for (uint j = 0; j < height; j++) {
            mx = agg(uop(tex1Dfetch<float>(mat, width * j + idx)), mx);
        }
        vec[idx] = bop(vec[idx], mx);
    }
}
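// The column kernels read the matrix through a texture object. Below is a minimal
// sketch of wrapping a linear device buffer for tex1Dfetch, using the CUDA runtime API
// to match the cudaTextureObject_t parameter above (HIP mirrors this with hip-prefixed
// names). The helper function name is illustrative.
#include <cuda_runtime.h>

cudaTextureObject_t makeLinearFloatTexture(float* devPtr, size_t n) {
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeLinear;   // plain linear memory, no CUDA array
    resDesc.res.linear.devPtr = devPtr;
    resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
    resDesc.res.linear.sizeInBytes = n * sizeof(float);

    cudaTextureDesc texDesc = {};
    texDesc.readMode = cudaReadModeElementType; // fetches return the raw float values

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, nullptr);
    return tex;
}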
/**
 * Returns non-owning pointer
 */
rClause MaxFWAgg::propagateSpecificAtEnd(const Agg& agg, bool headtrue) {
    //if(nomoreprops[agg.getIndex()] || headproptime[agg.getIndex()]!=-1){ return nullPtrClause; }
    auto confl = nullPtrClause;
    if (headtrue && agg.hasUB()) {
        for (auto i = getSet().getWL().rbegin();
                confl == nullPtrClause && i < getSet().getWL().rend() && agg.getBound() < i->getWeight(); ++i) {
            confl = getSet().notifySolver(new SetLitReason(agg, i->getLit(), i->getWeight(), false));
        }
    } else if (!headtrue && agg.hasLB()) {
        for (auto i = getSet().getWL().rbegin();
                confl == nullPtrClause && i < getSet().getWL().rend() && agg.getBound() <= i->getWeight(); ++i) {
            confl = getSet().notifySolver(new SetLitReason(agg, i->getLit(), i->getWeight(), false));
        }
    }
    if (confl == nullPtrClause) {
        confl = propagateAll(agg, headtrue);
    }
    return confl;
}
// Template header restored. Each block-y slice aggregates sumLength rows into one partial.
template<class Agg, class UnaryOp>
__global__ void kAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height,
                         const uint sumLength, Agg agg, UnaryOp op) {
    const uint idxX = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    const uint idxY = hipBlockIdx_y * sumLength;
    if (idxX < width) {
        float mx = agg.getBaseValue();
        for (uint j = idxY; j < min(height, idxY + sumLength); j++) {
            mx = agg(op(tex1Dfetch<float>(mat, j * width + idxX)), mx);
        }
        vec[hipBlockIdx_y * width + idxX] = mx;
    }
}
// Can return NULL if no head is false (or unknown, when includeunknown is set).
Agg* GenPWAgg::getAggWithMostStringentBound(bool includeunknown) const {
    Agg* strongestagg = NULL;
    for (auto i = getAgg().cbegin(); i < getAgg().cend(); ++i) {
        bool relevantagg = false; // NOTE: recall HEAD OR AGG
        if (includeunknown) {
            relevantagg |= value((*i)->getHead()) != l_True;
        } else {
            relevantagg |= value((*i)->getHead()) == l_False;
        }
        if (relevantagg) {
            if (strongestagg == NULL) {
                strongestagg = *i;
            } else if (strongestagg->hasLB() && strongestagg->getBound() < (*i)->getBound()) {
                // for lower bounds, a larger bound is more stringent
                strongestagg = *i;
            } else if (strongestagg->hasUB() && strongestagg->getBound() > (*i)->getBound()) {
                // for upper bounds, a smaller bound is more stringent
                strongestagg = *i;
            }
        }
    }
    return strongestagg;
}
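// For intuition: among lower-bound aggregates (sum >= b) the largest bound is the most
// stringent, while among upper-bound aggregates (sum <= b) it is the smallest, which is
// exactly the pair of comparisons above. A tiny standalone illustration with made-up bounds:
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
    std::vector<int> lowerBounds = {3, 7, 5};   // sum >= b: hardest to satisfy is b = 7
    assert(*std::max_element(lowerBounds.begin(), lowerBounds.end()) == 7);

    std::vector<int> upperBounds = {12, 8, 10}; // sum <= b: hardest to satisfy is b = 8
    assert(*std::min_element(upperBounds.begin(), upperBounds.end()) == 8);
}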
/**
 * Returns non-owning pointer
 */
rClause MaxFWAgg::propagateAll(const Agg& agg, bool headtrue) {
    rClause confl = nullPtrClause;
    //	if(nomoreprops[agg.getIndex()] || headproptime[agg.getIndex()]!=-1){ return confl; }
    if ((!agg.hasLB() && headtrue) || (!agg.hasUB() && !headtrue)) {
        return confl;
    }
    Lit l = mkPosLit(0);
    Weight w(0);
    int found = 0;
    for (vwl::const_iterator i = getSet().getWL().cbegin(); found < 2 && i < getSet().getWL().cend(); ++i) {
        const WL& wl = (*i);
        if (headtrue) {
            if (agg.hasLB() && wl.getWeight() < agg.getBound()) {
                continue;
            }
            if (agg.hasUB() && wl.getWeight() > agg.getBound()) {
                continue;
            }
        } else { // head false
            if ((!agg.hasLB() || wl.getWeight() >= agg.getBound())
                    && (!agg.hasUB() || wl.getWeight() <= agg.getBound())) {
                continue;
            }
        }
        if (value(wl.getLit()) == l_Undef) {
            ++found;
            l = wl.getLit();
            w = wl.getWeight();
        } else if (value(wl.getLit()) == l_True) {
            found = 2; // hack to stop immediately, because no propagation is necessary
        }
    }
    if (found == 1) {
        confl = getSet().notifySolver(new SetLitReason(agg, l, w, true));
    }
    return confl;
}
// Template header restored.
template<class Agg, class UnaryOp, class BinaryOp>
__global__ void kAggRows_wholerow_nosync(const float* mat, float* matSum, const uint width, const uint height,
                                         Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint tidx = hipThreadIdx_x;
    const uint warpIdx = tidx / WARP_SIZE;
    const uint lane = tidx % WARP_SIZE;

    __shared__ float accum[(WARP_SIZE + 1) * AWR_NUM_WARPS];
    __shared__ float finalAccum[AWR_NUM_WARPS];

    float* myAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane];
    float* myFinalAccum = &finalAccum[tidx];
    //volatile float* vMyAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane];
    matSum += hipBlockIdx_y;
    mat += width * hipBlockIdx_y;

    float rAccum = agg.getBaseValue(); // cache in register, a bit faster than shmem
#pragma unroll 32
    for (uint x = tidx; x < width; x += AWR_NUM_THREADS) {
        rAccum = agg(rAccum, uop(mat[x]));
    }
    myAccum[0] = rAccum;

    // Each warp does a reduction that doesn't require synchronization
#pragma unroll
    for (uint i = 0; i < LOG_WARP_SIZE; i++) {
        const uint d = 1 << i;
        myAccum[0] = agg(myAccum[0], shfl_down(myAccum[0], d));
    }
    __syncthreads();

    // The warps write their results
    if (tidx < AWR_NUM_WARPS) {
        //volatile float* vMyFinalAccum = &finalAccum[tidx];
        myFinalAccum[0] = accum[tidx * (WARP_SIZE + 1)];
#pragma unroll
        for (uint i = 0; i < AWR_LOG_NUM_WARPS; i++) {
            const uint d = 1 << i;
            myFinalAccum[0] = agg(myFinalAccum[0], shfl_down(myFinalAccum[0], d));
        }
        if (tidx == 0) {
            matSum[0] = bop(matSum[0], myFinalAccum[0]);
            matSum += hipGridDim_y;
        }
    }
}
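// The shfl_down calls above exchange registers between lanes without touching shared
// memory. A minimal standalone sketch of the same tree pattern using HIP's __shfl_down
// intrinsic directly; the demo kernel is illustrative and assumes hipBlockDim_x is a
// multiple of the wavefront size.
#include <hip/hip_runtime.h>

__global__ void warpSumDemo(const float* in, float* out) {
    const unsigned gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    float v = in[gid];
    // warpSize is the built-in lane count (64 on most AMD GPUs, 32 on NVIDIA).
    for (int d = warpSize / 2; d > 0; d >>= 1) {
        v += __shfl_down(v, d); // fold in the value held d lanes to the right
    }
    if (hipThreadIdx_x % warpSize == 0) {
        out[gid / warpSize] = v; // lane 0 of each wavefront holds the warp's sum
    }
}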
rClause GenPWAgg::checkHeadPropagationForAgg(bool& propagations, const Agg& agg, const minmaxBounds& bound) {
    auto confl = nullPtrClause;
    auto propagatehead = false;
    if (agg.hasLB() && bound.max < agg.getBound()) {
        propagatehead = true;
    } else if (agg.hasUB() && agg.getBound() < bound.min) {
        propagatehead = true;
    }
    if (propagatehead) {
        propagations = true;
        confl = getSet().notifySolver(new HeadReason(agg, agg.getHead()));
        notifyFirstPropagation(agg.getHead());
    }
    return confl;
}
// Template header restored.
template<class Agg, class BinaryOp>
__global__ void kAggRows_wholerow(const float* mat, float* matSum, const uint width, const uint height,
                                  Agg agg, BinaryOp op) {
    const int tidx = hipThreadIdx_x;

    __shared__ float accum[AWR_NUM_THREADS];
    volatile float* vMyAccum = &accum[tidx];
    float* myAccum = &accum[tidx];

    matSum += hipBlockIdx_y;
    mat += width * hipBlockIdx_y;

    for (uint idxY = hipBlockIdx_y; idxY < height; idxY += hipGridDim_y) {
        myAccum[0] = agg.getBaseValue();
        for (uint x = tidx; x < width; x += AWR_NUM_THREADS) {
            myAccum[0] = agg(myAccum[0], mat[x]);
        }
#pragma unroll
        for (uint i = AWR_LOG_NUM_THREADS - 1; i > LOG_WARP_SIZE; i--) {
            const uint d = 1 << i;
            __syncthreads();
            if (tidx < d) {
                myAccum[0] = agg(myAccum[0], myAccum[d]);
            }
        }
        __syncthreads();
        if (tidx < WARP_SIZE) { // final steps within a single warp, via volatile pointer
#pragma unroll
            for (int i = LOG_WARP_SIZE; i >= 0; i--) {
                const uint d = 1 << i;
                vMyAccum[0] = agg(vMyAccum[0], vMyAccum[d]);
            }
            if (tidx == 0) {
                matSum[0] = op(matSum[0], vMyAccum[0]);
                matSum += hipGridDim_y;
            }
        }
        __syncthreads();
        mat += width * hipGridDim_y;
    }
}
/**
 * if headtrue  && lb => make all literals true  with weight > (CP - lb)
 *              && ub => make all literals false with weight > (ub - CC)
 * if !headtrue && lb => make all literals false with weight > (lb - CC)
 *              && ub => make all literals true  with weight > (CP - ub)
 * if both bounds: do both for headtrue,
 *                 do none for headfalse until cc >= lb or cp <= ub
 */
rClause SPFWAgg::propagateSpecificAtEnd(const Agg& agg, bool headtrue) {
    rClause c = nullPtrClause;
    //if (nomoreprops[agg.getIndex()] || headproptime[agg.getIndex()]!=-1) {
    //    return nullPtrClause;
    //}

    auto& set = getSet();
    const auto& wls = set.getWL();
    auto from = wls.cend();
    Weight weightbound;

    bool ub = agg.hasUB();
    auto bound = agg.getBound();
    // determine the lower bound from which on weighted literals should be considered
    const AggProp& type = getSet().getType();
    if (headtrue) {
        if (ub) {
            weightbound = type.removeMin(bound, getCC());
            //+1 because larger and not equal
            if (type.add(weightbound, getCC()) <= bound) {
                weightbound += 1;
            }
        } else {
            weightbound = type.removeMax(getCP(), bound);
            //+1 because larger and not equal
            if (type.add(weightbound, bound) <= getCP()) {
                weightbound += 1;
            }
        }
    } else { // head false
        if (ub) {
            weightbound = type.removeMax(getCP(), bound);
        } else {
            weightbound = type.removeMin(bound, getCC());
        }
    }

#ifdef NOARBITPREC
    if (weightbound == posInfinity() || weightbound == negInfinity()) {
        return c;
    }
#endif

    from = lower_bound(wls.cbegin(), wls.cend(), weightbound);
    if (from == getSet().getWL().cend()) {
        return c;
    }

    /**
     * The weight bound indicates from which weight on all literals should be propagated
     * that are not yet known to the aggregate solver. Literals already assigned at an
     * earlier decision level, or already propagated by this set at the current level,
     * are skipped.
     */
    for (auto u = from; c == nullPtrClause && u < wls.cend(); ++u) {
        auto l = (*u).getLit();
        bool propagate = value(l) == l_Undef;

        if (!propagate && getSet().getPCSolver().getLevel(var(l)) == getSet().getPCSolver().getCurrentDecisionLevel()) {
            bool found = false;
            for (auto i = getTrail().back()->props.cbegin(); !found && i < getTrail().back()->props.cend(); ++i) {
                if (var(l) == var(i->getLit())) {
                    found = true;
                }
            }
            propagate = !found;
        }

        // Only propagate those that are not already known in the aggregate solver!
        if (propagate) {
            if ((agg.hasUB() && headtrue) || (!agg.hasUB() && !headtrue)) {
                c = getSet().notifySolver(new SetLitReason(agg, (*u).getLit(), (*u).getWeight(), false));
            } else {
                c = getSet().notifySolver(new SetLitReason(agg, (*u).getLit(), (*u).getWeight(), true));
            }
        }
    }
    // TODO: looping over the trail is TOO slow compared to the old cardinality propagator!
    // TODO: the bigger problem is that the same propagations keep being rederived!
    // => add a check that skips propagation if the derived weight bound is unchanged
    // => add a check so that if only CP or CC was adapted, only aggregates with such a bound are checked
    return c;
}
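// A concrete instance of the second rule in the comment above (head true, upper bound),
// specialized to a sum aggregate, where removeMin/add reduce to plain subtraction and
// addition. The numbers are illustrative:
#include <cassert>

int main() {
    const int ub = 10, CC = 4;    // head true, sum <= 10, certainly-true weight 4
    int weightbound = ub - CC;    // type.removeMin(bound, getCC()) for a sum: 6
    if (CC + weightbound <= ub) { // "+1 because larger and not equal"
        weightbound += 1;         // only weights strictly above ub - CC violate the bound
    }
    assert(weightbound == 7);     // weight 7 would raise the sum to 11 > 10, so such
                                  // literals must become false; weight 6 (sum 10) is fine
}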
// Template header restored: blockSize is a compile-time thread count.
template<class Agg, class UnaryOp, class BinaryOp, int blockSize>
__global__ void kAggRows(const float* mat, float* matSum, const uint width, const uint height,
                         const uint sumWidth, Agg agg, UnaryOp uop, BinaryOp bop) {
    const int idxX = hipBlockIdx_x * blockSize * 2 + hipThreadIdx_x;

    __shared__ float accum[blockSize * 2];

    matSum += hipBlockIdx_y * sumWidth + hipBlockIdx_x;
    /*
     * Here it's important to make sure that all threads in a block call __syncthreads,
     * so I have even the redundant threads (for which idxX >= width) enter this loop
     * just so that they may call __syncthreads at the appropriate times.
     */
    mat += width * hipBlockIdx_y + idxX;

    accum[hipThreadIdx_x] = agg.getBaseValue();
    accum[hipThreadIdx_x + blockSize] = agg.getBaseValue();
    for (uint idxY = hipBlockIdx_y; idxY < height; idxY += hipGridDim_y) {
        if (idxX < width) {
            accum[hipThreadIdx_x] = uop(mat[0]);
            if (idxX + blockSize < width) {
                accum[hipThreadIdx_x + blockSize] = uop(mat[blockSize]);
            }
        }
        if (blockSize >= 512) {
            __syncthreads();
            if (hipThreadIdx_x < 512)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x], accum[hipThreadIdx_x + 512]);
        }
        if (blockSize >= 256) {
            __syncthreads();
            if (hipThreadIdx_x < 256)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x], accum[hipThreadIdx_x + 256]);
        }
        if (blockSize >= 128) {
            __syncthreads();
            if (hipThreadIdx_x < 128)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x], accum[hipThreadIdx_x + 128]);
        }
        if (blockSize >= 64) {
            __syncthreads();
            if (hipThreadIdx_x < 64)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x], accum[hipThreadIdx_x + 64]);
        }

        __syncthreads();
        volatile float* myAccum = &accum[hipThreadIdx_x];
        if (hipThreadIdx_x < 32) { // executed only by first warp
            myAccum[0] = agg(myAccum[0], myAccum[32]);
            myAccum[0] = agg(myAccum[0], myAccum[16]);
            myAccum[0] = agg(myAccum[0], myAccum[8]);
            myAccum[0] = agg(myAccum[0], myAccum[4]);
            myAccum[0] = agg(myAccum[0], myAccum[2]);
            myAccum[0] = agg(myAccum[0], myAccum[1]);
        }
        if (hipThreadIdx_x == 0) {
            matSum[0] = bop(matSum[0], myAccum[0]);
            matSum += hipGridDim_y * sumWidth;
        }
        __syncthreads();
        mat += width * hipGridDim_y;
    }
}
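// A sketch of a plausible launch for kAggRows, derived from its indexing: each block of
// blockSize threads covers 2*blockSize columns and writes one partial per row into a
// sumWidth-wide buffer. The grid-y cap and the functors are illustrative, not the
// repository's own dispatch code; rows left with sumWidth > 1 partials need a second pass.
#include <hip/hip_runtime.h>
#include <algorithm>

struct AddAgg {
    __device__ float getBaseValue() const { return 0.0f; }
    __device__ float operator()(float a, float b) const { return a + b; }
};
struct Identity { __device__ float operator()(float x) const { return x; } };
struct TakeNew  { __device__ float operator()(float, float b) const { return b; } }; // ignore old matSum

void aggRowsSketch(const float* dMat, float* dPartials, unsigned width, unsigned height) {
    constexpr int blockSize = 128; // the kernel's compile-time template parameter
    const unsigned sumWidth = (width + 2 * blockSize - 1) / (2 * blockSize); // partials per row
    dim3 grid(sumWidth, std::min(height, 1024u)); // hypothetical cap on grid-y
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kAggRows<AddAgg, Identity, TakeNew, blockSize>),
                       grid, dim3(blockSize), 0, 0,
                       dMat, dPartials, width, height, sumWidth,
                       AddAgg(), Identity(), TakeNew());
    // dPartials must hold height * sumWidth floats.
}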