// Produces one packed v128_u8 from the three row pointers held in `buffer`
// (src0, src1, src2), starting at element index `offset`.
//
// Two groups are processed: one at `offset` and one at `offset + HA`. For each
// group the three rows are loaded with Load<align>, combined by BinomialSum and
// passed through DivideBy16; the two v128_u16 results are then merged with
// vec_pack (first group as the first operand).
//
// `align` is forwarded unchanged to every Load call.
template<bool align> SIMD_INLINE v128_u8 BlurRow(const Buffer & buffer, size_t offset)
 {
     // Element indices of the two groups; the second one is HA elements further.
     const size_t first = offset;
     const size_t second = offset + HA;

     v128_u16 head = DivideBy16(BinomialSum(
         Load<align>(buffer.src0 + first),
         Load<align>(buffer.src1 + first),
         Load<align>(buffer.src2 + first)));

     v128_u16 tail = DivideBy16(BinomialSum(
         Load<align>(buffer.src0 + second),
         Load<align>(buffer.src1 + second),
         Load<align>(buffer.src2 + second)));

     return vec_pack(head, tail);
 }
 // Loads one __m128i from each of the three row pointers in `buffer`
 // (src0, src1, src2) at element index `offset`, combines them with
 // BinomialSum16 and returns the result of DivideBy16 applied to that sum.
 //
 // `align` is forwarded unchanged to every Load call.
 template<bool align> SIMD_INLINE __m128i BlurRow16(const Buffer & buffer, size_t offset)
 {
     // The row pointers are reinterpreted as __m128i pointers for the loads.
     const __m128i row0 = Load<align>((__m128i*)(buffer.src0 + offset));
     const __m128i row1 = Load<align>((__m128i*)(buffer.src1 + offset));
     const __m128i row2 = Load<align>((__m128i*)(buffer.src2 + offset));

     return DivideBy16(BinomialSum16(row0, row1, row2));
 }