Code example #1
File: FullyWatched.cpp Project: vonwenm/Minisatid
/**
 * Return the value the head should take if some propagation is possible.
 */
lbool MinisatID::canPropagateHead(const Agg& agg, const Weight& CC, const Weight& CP) {
	//if (nomoreprops[agg.getIndex()] || headproptime[agg.getIndex()]!=-1) {
	//	return headvalue[agg.getIndex()];
	//}

	auto result = l_Undef;

	//add if derived: headproptime[agg.getIndex()] = getStack().size();
	auto b = agg.getBound();
	if (agg.hasUB()) {
		if (CC > b) {
			result = l_False;
		} else if (CP <= b) {
			result = l_True;
		}
	} else {
		if (CC >= b) {
			result = l_True;
		} else if (CP < b) {
			result = l_False;
		}
	}

	if(agg.getSem()==AggSem::OR){
		if(result==l_True){
			result = l_Undef;
		}else if(result==l_False){
			result = l_True;
		}
	}

	return result;
}
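
canPropagateHead compares the aggregate's bound with two running values: CC, the value the set certainly contributes already, and CP, the value it can still possibly reach. Stripped of MinisatID's types (and of the AggSem::OR adjustment at the end of the listed function), the decision is a small self-contained check; the function and parameter names below are hypothetical, not part of the library:

#include <optional>

// Simplified restatement of the bound check above. CC is the certain (current)
// value of the aggregate, CP its maximum still-possible value.
// Returns true/false when the head is forced, std::nullopt when nothing follows yet.
std::optional<bool> headForced(bool hasUpperBound, long bound, long CC, long CP) {
    if (hasUpperBound) {              // head <=> aggregate value <= bound
        if (CC > bound)  return false;    // bound already exceeded: head must be false
        if (CP <= bound) return true;     // bound can never be exceeded: head must be true
    } else {                          // head <=> aggregate value >= bound
        if (CC >= bound) return true;     // bound already reached: head must be true
        if (CP < bound)  return false;    // bound can never be reached: head must be false
    }
    return std::nullopt;              // both outcomes still possible
}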
Code example #2
__global__ void kAggShortRows(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint shmemX = THREADS_X + 1;
    __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX];

    const uint tidx = hipThreadIdx_y * THREADS_X + hipThreadIdx_x;
    const uint ty = LOOPS_X == 1 ? tidx / width : hipThreadIdx_y; // when LOOPS_X == 1, width is smaller than the block's x dimension
    const uint tx = LOOPS_X == 1 ? tidx % width : hipThreadIdx_x;
    const uint bidx = hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x;
    const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y;
    float* shmemWrite = shmem + MUL24(ty, shmemX) + tx;
    matSum += blockRowIdx + tidx;
//    shmem[MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x] = 0;
    mat += width * blockRowIdx + MUL24(ty, width) + tx;
    float* shmemWriteZeros = &shmem[MUL24(hipThreadIdx_y,shmemX) + hipThreadIdx_x];

    bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y ;

    if (blockRowIdx < height) {
#pragma unroll
        for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) {
            doAgg &= tidx + y + blockRowIdx < height;
            const bool heightIdxOK = ty < AGG_SHORT_ROWS_THREADS_Y && ty + y + blockRowIdx < height;

            shmemWriteZeros[0] = agg.getBaseValue();
            __syncthreads();
#pragma unroll
            for(uint x = 0; x < LOOPS_X * THREADS_X; x+= THREADS_X) {
//                __syncthreads();
                if (heightIdxOK && x + tx < width) {
                    shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]);
                }
            }
            __syncthreads();
            if (doAgg) {
                /*
                 * I tried doing this final sum as a 4-step reduction, with 8 threads
                 * per warp participating. It was slightly slower.
                 */
                float accum = agg.getBaseValue();
                float* shmemRead = shmem + MUL24(tidx, shmemX);
                // this loops too much if the rows are really short :(
#pragma unroll
                for (uint i = 0; i < THREADS_X; i++) {
                    accum = agg(accum, shmemRead[0]);
                    shmemRead++;
                }
                matSum[0] = bop(matSum[0], accum);
                matSum += AGG_SHORT_ROWS_THREADS_Y;
            }
            __syncthreads();
            mat += width * AGG_SHORT_ROWS_THREADS_Y;
        }
    }
}
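
The HIP kernels in this listing are parameterized on Agg, UnaryOp and BinaryOp functors, but only ever use them through agg.getBaseValue(), agg(a, b), uop(x) and bop(old, partial). The concrete functors are not part of the listing; the sketch below shows what they presumably look like (SumAgg, MaxAgg, Identity and SecondArg are hypothetical names, marked __host__ __device__ so the same functors can also be reused on the host):

#include <cfloat>
#include <hip/hip_runtime.h>

// Hypothetical functors matching the interface the kernels rely on:
//   agg.getBaseValue()  -> identity element of the reduction
//   agg(a, b)           -> combine two partial results
//   uop(x)              -> per-element transform applied before aggregation
//   bop(old, partial)   -> how the block result is merged into the output buffer
struct SumAgg {
    __host__ __device__ float getBaseValue() const { return 0.0f; }
    __host__ __device__ float operator()(float a, float b) const { return a + b; }
};

struct MaxAgg {
    __host__ __device__ float getBaseValue() const { return -FLT_MAX; }
    __host__ __device__ float operator()(float a, float b) const { return a > b ? a : b; }
};

struct Identity {   // UnaryOp: leave elements unchanged
    __host__ __device__ float operator()(float x) const { return x; }
};

struct SecondArg {  // BinaryOp: overwrite whatever was in the output before
    __host__ __device__ float operator()(float, float x) const { return x; }
};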
Code example #3
__global__ void kTotalAgg(const float* a, float* const target, const uint numElements, Agg agg) {
    __shared__ float shmem[DP_BLOCKSIZE];
    uint eidx = DP_BLOCKSIZE * hipBlockIdx_x + hipThreadIdx_x;
    shmem[hipThreadIdx_x] = agg.getBaseValue();
    if (eidx < hipGridDim_x * DP_BLOCKSIZE) {
        for (; eidx < numElements; eidx += hipGridDim_x * DP_BLOCKSIZE) {
            shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], a[eidx]);
        }
    }
    __syncthreads();
    if (hipThreadIdx_x < 256) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 256]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 128) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 128]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 64) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 64]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 32) {
        volatile float* mysh = &shmem[hipThreadIdx_x];
        *mysh = agg(*mysh, mysh[32]);
        *mysh = agg(*mysh, mysh[16]);
        *mysh = agg(*mysh, mysh[8]);
        *mysh = agg(*mysh, mysh[4]);
        *mysh = agg(*mysh, mysh[2]);
        *mysh = agg(*mysh, mysh[1]);
        if (hipThreadIdx_x == 0) {
            target[hipBlockIdx_x] = *mysh;
        }
    }
}
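
A possible host-side driver for kTotalAgg, assuming (as the parameter list suggests) that the kernel is a template over Agg, that DP_BLOCKSIZE is the block size it was written for (the shared-memory reduction folds at offset 256, so presumably 512), and reusing the hypothetical SumAgg functor sketched above. Each block writes one partial result to target, which still has to be combined, here simply on the host:

#include <hip/hip_runtime.h>
#include <vector>

// Sketch only: launch kTotalAgg for a sum and fold the per-block partials on the host.
float totalAggSum(const float* d_a, unsigned int numElements) {
    const unsigned int numBlocks = 128;            // grid-stride loop in the kernel covers the rest
    float* d_partial = nullptr;
    hipMalloc((void**)&d_partial, numBlocks * sizeof(float));

    hipLaunchKernelGGL(kTotalAgg<SumAgg>, dim3(numBlocks), dim3(DP_BLOCKSIZE), 0, 0,
                       d_a, d_partial, numElements, SumAgg());

    std::vector<float> partial(numBlocks);
    hipMemcpy(partial.data(), d_partial, numBlocks * sizeof(float), hipMemcpyDeviceToHost);
    hipFree(d_partial);

    SumAgg agg;
    float total = agg.getBaseValue();
    for (float p : partial) {
        total = agg(total, p);                     // same combine step as on the device
    }
    return total;
}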
Code example #4
__global__ void kAggShortRows2(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint shmemX = AGG_SHORT_ROWS_THREADS_X + 1;
    __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX];
    const uint LOOPS_X = DIVUP(width, AGG_SHORT_ROWS_THREADS_X);
    const uint tidx = hipThreadIdx_y * AGG_SHORT_ROWS_THREADS_X + hipThreadIdx_x;

    const uint bidx = hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x;
    const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y;

    float* shmemWrite = shmem + MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x;
    matSum += blockRowIdx + tidx;
//    shmem[MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x] = 0;
    mat += width * blockRowIdx + MUL24(hipThreadIdx_y, width) + hipThreadIdx_x;

    bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y;
    if(blockRowIdx < height) {
        for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) {
            doAgg &= tidx + y + blockRowIdx < height;
            const bool heightIdxOK = hipThreadIdx_y + y + blockRowIdx < height;
            float accum = agg.getBaseValue();
            shmemWrite[0] = agg.getBaseValue();

            for(uint x = 0; x < LOOPS_X * AGG_SHORT_ROWS_THREADS_X; x+= AGG_SHORT_ROWS_THREADS_X) {
//                __syncthreads();
                if (heightIdxOK && x + hipThreadIdx_x < width) {
                    shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]);
                }
            }

            __syncthreads();
            if (doAgg) {
                float* shmemRead = shmem + MUL24(tidx, shmemX);

#pragma unroll
                for (uint i = 0; i < AGG_SHORT_ROWS_THREADS_X; i++) {
                    accum = agg(accum, shmemRead[0]);
                    shmemRead++;
                }

                matSum[0] = bop(matSum[0], accum);
                matSum += AGG_SHORT_ROWS_THREADS_Y;
            }
            __syncthreads();
            mat += width * AGG_SHORT_ROWS_THREADS_Y;
        }
    }
}
Code example #5
File: FullyWatched.cpp Project: vonwenm/Minisatid
/**
 * Returns l_True if this aggregate could be fully propagated during initialization, so it will never change
 * truth value and can be left out of any further propagation.
 * Returns l_False if the aggregate is certainly unsat, and l_Undef otherwise.
 */
lbool FWAgg::initialize(const Agg& agg) {
	auto confl = nullPtrClause;

	auto hv = canPropagateHead(agg, getCC(), getCP());
	bool alwaystrue = false;
	if (hv != l_Undef) {
		alwaystrue = true;
	}
	if (hv == l_True) {
		confl = getSet().notifySolver(new HeadReason(agg, agg.getHead()));
	} else if (hv == l_False) {
		confl = getSet().notifySolver(new HeadReason(agg, not agg.getHead()));
	}
	if (confl != nullPtrClause) {
		return l_False;
	}

	return alwaystrue ? l_True : l_Undef;
}
Code example #6
__global__ void kDumbAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    if (idx < width) {
        float mx = agg.getBaseValue();
        for (uint j = 0; j < height; j++) {
            mx = agg(uop(tex1Dfetch<float>(mat, width * j + idx)), mx);
        }
        vec[idx] = bop(vec[idx], mx);
    }
}
Code example #7
File: FullyWatched.cpp Project: vonwenm/Minisatid
/**
 * Returns non-owning pointer
 */
rClause MaxFWAgg::propagateSpecificAtEnd(const Agg& agg, bool headtrue) {
	//if(nomoreprops[agg.getIndex()] || headproptime[agg.getIndex()]!=-1){ return nullPtrClause; }

	auto confl = nullPtrClause;
	if (headtrue && agg.hasUB()) {
		for (auto i = getSet().getWL().rbegin(); confl == nullPtrClause && i < getSet().getWL().rend() && agg.getBound() < i->getWeight();
				++i) {
			confl = getSet().notifySolver(new SetLitReason(agg, i->getLit(), i->getWeight(), false));
		}
	} else if (!headtrue && agg.hasLB()) {
		for (auto i = getSet().getWL().rbegin(); confl == nullPtrClause && i < getSet().getWL().rend() && agg.getBound() <= i->getWeight();
				++i) {
			confl = getSet().notifySolver(new SetLitReason(agg, i->getLit(), i->getWeight(), false));
		}
	}
	if (confl == nullPtrClause) {
		confl = propagateAll(agg, headtrue);
	}
	return confl;
}
Code example #8
__global__ void kAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height, const uint sumLength, Agg agg, UnaryOp op) {
    const uint idxX = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    const uint idxY = hipBlockIdx_y * sumLength;
    if (idxX < width) {
        float mx = agg.getBaseValue();
        for (uint j = idxY; j < min(height,idxY + sumLength); j++) {
            mx = agg(op(tex1Dfetch<float>(mat, j * width + idxX)), mx);
        }
        vec[hipBlockIdx_y * width + idxX] = mx;
    }
}
Code example #9
// Can return NULL if no heads are false (or, when includeunknown, none are false or unknown)
Agg* GenPWAgg::getAggWithMostStringentBound(bool includeunknown) const {
	Agg* strongestagg = NULL;
	for (auto i = getAgg().cbegin(); i < getAgg().cend(); ++i) {
		bool relevantagg = false; // NOTE: recall HEAD OR AGG
		if (includeunknown) {
			relevantagg |= value((*i)->getHead()) != l_True;
		} else {
			relevantagg |= value((*i)->getHead()) == l_False;
		}
		if (relevantagg) {
			if (strongestagg == NULL) {
				strongestagg = *i;
			} else if (strongestagg->hasLB() && strongestagg->getBound() < (*i)->getBound()) {
				strongestagg = *i;
			} else if (strongestagg->hasUB() && strongestagg->getBound() > (*i)->getBound()) {
				strongestagg = *i;
			}
		}
	}
	return strongestagg;
}
Code example #10
File: FullyWatched.cpp Project: vonwenm/Minisatid
/**
 * Returns non-owning pointer
 */
rClause MaxFWAgg::propagateAll(const Agg& agg, bool headtrue) {
	rClause confl = nullPtrClause;

//	if(nomoreprops[agg.getIndex()] || headproptime[agg.getIndex()]!=-1){ return confl; }

	if ((!agg.hasLB() && headtrue) || (!agg.hasUB() && !headtrue)) {
		return confl;
	}

	Lit l = mkPosLit(0);
	Weight w(0);
	int found = 0;
	for (vwl::const_iterator i = getSet().getWL().cbegin(); found < 2 && i < getSet().getWL().cend(); ++i) {
		const WL& wl = (*i);
		if (headtrue) {
			if (agg.hasLB() && wl.getWeight() < agg.getBound()) {
				continue;
			}
			if (agg.hasUB() && wl.getWeight() > agg.getBound()) {
				continue;
			}
		} else { //headfalse
			if ((!agg.hasLB() || wl.getWeight() >= agg.getBound()) && (!agg.hasUB() || wl.getWeight() <= agg.getBound())) {
				continue;
			}
		}

		if (value(wl.getLit()) == l_Undef) {
			++found;
			l = wl.getLit();
			w = wl.getWeight();
		} else if (value(wl.getLit()) == l_True) {
			found = 2; //hack to stop immediately, because no propagation necessary
		}
	}
	if (found == 1) {
		confl = getSet().notifySolver(new SetLitReason(agg, l, w, true));
	}
	return confl;
}
Code example #11
__global__ void kAggRows_wholerow_nosync(const float* mat, float* matSum, const uint width, const uint height,
                                         Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint tidx = hipThreadIdx_x;
    const uint warpIdx = tidx / WARP_SIZE;
    const uint lane = tidx % WARP_SIZE;
    
    __shared__ float accum[(WARP_SIZE + 1) * AWR_NUM_WARPS];
    __shared__ float finalAccum[AWR_NUM_WARPS];

    float* myAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane];
    float* myFinalAccum = &finalAccum[tidx];
    //volatile float* vMyAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane];
    matSum += hipBlockIdx_y;
    mat += width * hipBlockIdx_y;

    float rAccum = agg.getBaseValue(); // cache in register, a bit faster than shmem
    #pragma unroll 32
    for (uint x = tidx; x < width; x += AWR_NUM_THREADS) {
        rAccum = agg(rAccum, uop(mat[x]));
    }
    myAccum[0] = rAccum;
    
    // Each warp does a reduction that doesn't require synchronization
    #pragma unroll
    for (uint i = 0; i < LOG_WARP_SIZE; i++) {
        const uint d = 1 << i;
        myAccum[0] = agg(myAccum[0], shfl_down(myAccum[0], d));
    }
    __syncthreads();
    // The warps write their results
    if (tidx < AWR_NUM_WARPS) {
        //volatile float* vMyFinalAccum = &finalAccum[tidx];
        myFinalAccum[0] = accum[tidx * (WARP_SIZE + 1)];
        #pragma unroll
        for (uint i = 0; i < AWR_LOG_NUM_WARPS; i++) {
            const uint d = 1 << i;
            myFinalAccum[0] = agg(myFinalAccum[0], shfl_down(myFinalAccum[0], d));
        }
        if (tidx == 0) {
            matSum[0] = bop(matSum[0], myFinalAccum[0]);
            matSum += hipGridDim_y;
        }
    }
}
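
shfl_down in the kernel above does not appear to be a HIP built-in; it is presumably a small project wrapper around the warp shuffle intrinsic, roughly like this hedged sketch:

#include <hip/hip_runtime.h>

// Hypothetical helper: shift a value down the warp by delta lanes.
__device__ inline float shfl_down(float var, unsigned int delta) {
    return __shfl_down(var, delta);   // warp shuffle; lanes past the end keep their own value
}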
Code example #12
rClause GenPWAgg::checkHeadPropagationForAgg(bool& propagations, const Agg& agg, const minmaxBounds& bound) {
	auto confl = nullPtrClause;
	auto propagatehead = false;
	if (agg.hasLB() && bound.max < agg.getBound()) {
		propagatehead = true;
	} else if (agg.hasUB() && agg.getBound() < bound.min) {
		propagatehead = true;
	}
	if (propagatehead) {
		propagations = true;
		confl = getSet().notifySolver(new HeadReason(agg, agg.getHead()));
		notifyFirstPropagation(agg.getHead());
	}
	return confl;
}
Code example #13
__global__ void kAggRows_wholerow(const float* mat, float* matSum, const uint width, const uint height, Agg agg, BinaryOp op) {
    const int tidx = hipThreadIdx_x;

    __shared__ float accum[AWR_NUM_THREADS];
    volatile float* vMyAccum = &accum[tidx];
    float* myAccum = &accum[tidx];
    
    matSum += hipBlockIdx_y;
    mat += width * hipBlockIdx_y;

    for (uint idxY = hipBlockIdx_y; idxY < height; idxY += hipGridDim_y) {
        myAccum[0] = agg.getBaseValue();
        for (uint x = tidx; x < width; x += AWR_NUM_THREADS) {
            myAccum[0] = agg(myAccum[0], mat[x]);
        }
        #pragma unroll
        for (uint i = AWR_LOG_NUM_THREADS - 1; i > LOG_WARP_SIZE; i--) {
            const uint d = 1 << i;
            __syncthreads();
            if (tidx < d) {
                myAccum[0] = agg(myAccum[0], myAccum[d]);
            }
        }
        __syncthreads();
        if (tidx < WARP_SIZE) {
            #pragma unroll
            for (int i = LOG_WARP_SIZE; i >= 0; i--) {
                const uint d = 1 << i;
                vMyAccum[0] = agg(vMyAccum[0], vMyAccum[d]);
            }

            if (tidx == 0) {
                matSum[0] = op(matSum[0], vMyAccum[0]);
                matSum += hipGridDim_y;
            }
        }
        __syncthreads();
        mat += width * hipGridDim_y;
    }
}
Code example #14
File: FullyWatched.cpp Project: vonwenm/Minisatid
/**
 * if headtrue && lb => make all literals true with weight > (CP - lb)
 * 				  ub => make all literals false with weight > (ub - CC)
 * if !headtrue && lb => make all literals false with weight > (lb - CC)
 * 				   ub => make all literals true with weight > (CP - ub)
 * if both bounds: do both for headtrue
 * 					do none for headfalse until cc >= lb or cp <= ub
 */
rClause SPFWAgg::propagateSpecificAtEnd(const Agg& agg, bool headtrue) {
	rClause c = nullPtrClause;
	//if (nomoreprops[agg.getIndex()] || headproptime[agg.getIndex()]!=-1) {
	//	return nullPtrClause;
	//}

	auto& set = getSet();
	const auto& wls = set.getWL();
	auto from = wls.cend();
	Weight weightbound;

	bool ub = agg.hasUB();
	auto bound = agg.getBound();
	//determine the lower bound of which weight literals to consider
	const AggProp& type = getSet().getType();
	if (headtrue) {
		if (ub) {
			weightbound = type.removeMin(bound, getCC());
			//+1 because larger and not eq
			if (type.add(weightbound, getCC()) <= bound) {
				weightbound += 1;
			}
		} else {
			weightbound = type.removeMax(getCP(), bound);
			//+1 because larger and not eq
			if (type.add(weightbound, bound) <= getCP()) {
				weightbound += 1;
			}
		}
	} else { //head false
		if (ub) {
			weightbound = type.removeMax(getCP(), bound);
		} else {
			weightbound = type.removeMin(bound, getCC());
		}
	}

#ifdef NOARBITPREC
	if (weightbound == posInfinity() || weightbound == negInfinity()) {
		return c;
	}
#endif

	from = lower_bound(wls.cbegin(), wls.cend(), weightbound);
	if (from == getSet().getWL().cend()) {
		return c;
	}

	/**
	 * The lower bound indicates the weight from which all literals that are not yet known to the aggregate solver should be propagated
	 * All literals known to the sat solver are certainly sa
	 */
	for (auto u = from; c == nullPtrClause && u < wls.cend(); ++u) {
		auto l = (*u).getLit();

		bool propagate = value(l) == l_Undef;

		if (!propagate && getSet().getPCSolver().getLevel(var(l)) == getSet().getPCSolver().getCurrentDecisionLevel()) {
			bool found = false;
			for (auto i = getTrail().back()->props.cbegin(); !found && i < getTrail().back()->props.cend(); ++i) {
				if (var(l) == var(i->getLit())) {
					found = true;
				}
			}
			propagate = !found;
		}

		//Only propagate those that are not already known in the aggregate solver!
		if (propagate) {
			if ((agg.hasUB() && headtrue) || (!agg.hasUB() && !headtrue)) {
				c = getSet().notifySolver(new SetLitReason(agg, (*u).getLit(), (*u).getWeight(), false));
			} else {
				c = getSet().notifySolver(new SetLitReason(agg, (*u).getLit(), (*u).getWeight(), true));
			}
		}
	}

	//TODO the looping over the trail is TOO slow! compared to old card
	//TODO but bigger problem is that he keeps on deriving the same propagations!
	//=> add a check that does not do propagations if the derived weight bound is the same
	//=> add a check that if only cp or cc is adapted, only aggs with such bound are checked!

	return c;
}
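
For a sum aggregate, the rules in the docstring of SPFWAgg::propagateSpecificAtEnd reduce to simple threshold arithmetic (add is +, removeMin and removeMax are -). A standalone illustration of the headtrue-with-upper-bound case, with made-up numbers that are not taken from the listing:

#include <cstdio>

// Worked example of "headtrue && ub => make all literals false with weight > (ub - CC)".
int main() {
    const long ub = 10;                  // head <=> sum of true literals <= 10
    const long CC = 4;                   // weight already certainly contributed
    const long weights[] = {2, 5, 7, 9}; // weights of the still-unassigned literals

    const long weightbound = ub - CC + 1;    // smallest weight that would break the bound
    for (long w : weights) {
        if (w >= weightbound) {
            std::printf("literal with weight %ld must be false (4 + %ld > 10)\n", w, w);
        }
    }
    return 0;
}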
Code example #15
__global__ void kAggRows(const float* mat, float* matSum, const uint width, const uint height, const uint sumWidth, Agg agg, UnaryOp uop, BinaryOp bop) {
    const int idxX = hipBlockIdx_x * blockSize*2 + hipThreadIdx_x;

    __shared__ float accum[blockSize*2];

    matSum += hipBlockIdx_y * sumWidth + hipBlockIdx_x;
    /*
     * Here it's important to make sure that all threads in a block call __syncthreads,
     * so I have even the redundant threads (for which idxX >= width) enter this loop
     * just so that they may call __syncthreads at the appropriate times.
     */
    mat += width * hipBlockIdx_y + idxX;

    accum[hipThreadIdx_x] = agg.getBaseValue();
    accum[hipThreadIdx_x + blockSize] = agg.getBaseValue();
    for (uint idxY = hipBlockIdx_y; idxY < height; idxY += hipGridDim_y) {
        if (idxX < width) {
            accum[hipThreadIdx_x] = uop(mat[0]);
            if(idxX + blockSize < width)
                accum[hipThreadIdx_x + blockSize] = uop(mat[blockSize]);
        }
        if (blockSize >= 512) {
            __syncthreads();
            if (hipThreadIdx_x < 512)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x], accum[hipThreadIdx_x + 512]);
        }
        if (blockSize >= 256) {
            __syncthreads();
            if (hipThreadIdx_x < 256)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x],accum[hipThreadIdx_x + 256]);
        }
        if (blockSize >= 128) {
            __syncthreads();
            if (hipThreadIdx_x < 128)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x],accum[hipThreadIdx_x + 128]);
        }
        if (blockSize >= 64) {
            __syncthreads();
            if (hipThreadIdx_x < 64)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x],accum[hipThreadIdx_x + 64]);
        }

        __syncthreads();
        volatile float* myAccum = &accum[hipThreadIdx_x];
        if (hipThreadIdx_x < 32) { // executed only by first warp
            myAccum[0] = agg(myAccum[0], myAccum[32]);
            myAccum[0] = agg(myAccum[0], myAccum[16]);
            myAccum[0] = agg(myAccum[0], myAccum[8]);
            myAccum[0] = agg(myAccum[0], myAccum[4]);
            myAccum[0] = agg(myAccum[0], myAccum[2]);
            myAccum[0] = agg(myAccum[0], myAccum[1]);
        }

        if (hipThreadIdx_x == 0) {
            matSum[0] = bop(matSum[0], myAccum[0]);
            matSum += hipGridDim_y * sumWidth;
        }
        __syncthreads();
        mat += width * hipGridDim_y;
    }
}