/*
 * multsum_f64_sse2_unroll4:
 * @dest:  receives the single double result
 * @src1:  first input sequence
 * @sstr1: stride applied to @src1 between elements (presumably in bytes,
 *         as it is handed to OIL_INCREMENT -- confirm against the macro)
 * @src2:  second input sequence
 * @sstr2: stride applied to @src2 between elements (same caveat as @sstr1)
 * @n:     number of element pairs to process
 *
 * Accumulates into a two-lane SSE2 register through MULTSUM_SSE2_STRIDED,
 * four elements per iteration while at least four remain, then two at a
 * time, and finally folds in one trailing scalar product when one element
 * is left over.  *dest is the sum of both lanes plus that trailing product.
 *
 * NOTE(review): MULTSUM_SSE2_STRIDED is defined outside this view;
 * presumably MULTSUM_SSE2_STRIDED(k) multiplies the elements at positions
 * k and k+1 of both inputs and adds the products into sum.reg, using t1/t2
 * as scratch -- confirm before renaming or removing any of these locals.
 */
static void
multsum_f64_sse2_unroll4(double *dest,
    const double *src1, int sstr1,
    const double *src2, int sstr2,
    int n)
{
  /* Not referenced directly below; kept for the macro (see note above). */
  __m128d t1, t2;
  /* Union gives scalar access to the two lanes of the accumulator. */
  union {
    __m128d reg;
    double vals[2];
  } sum;
  int i = 0;

  sum.reg = _mm_setzero_pd();

  /* Main loop: two macro invocations per pass, advancing four elements. */
  while (i < n-3) {
    MULTSUM_SSE2_STRIDED(0);
    MULTSUM_SSE2_STRIDED(2);

    OIL_INCREMENT(src1, 4*sstr1);
    OIL_INCREMENT(src2, 4*sstr2);
    i += 4;
  }

  /* At most one more pair-wide step when two or three elements remain. */
  while (i < n-1) {
    MULTSUM_SSE2_STRIDED(0);

    OIL_INCREMENT(src1, 2*sstr1);
    OIL_INCREMENT(src2, 2*sstr2);
    i+=2;
  }

  /* Horizontal add of the two lanes. */
  *dest = sum.vals[0] + sum.vals[1];

  /* Odd n: one element pair is left at the current src1/src2 position. */
  if (i < n) {
    *dest += (OIL_GET(src1,0,double)*OIL_GET(src2,0,double));
  }
}
/*
 * splat_u32_unroll2:
 * @dest:  first destination element
 * @dstr:  stride handed to OIL_INCREMENT to step @dest between stores
 *         (presumably a byte offset -- confirm against the macro)
 * @param: pointer to the value that is copied into every destination slot
 * @n:     number of destination elements to fill
 *
 * Stores *param into @n strided destination slots.  When @n is odd a single
 * store is peeled off first; the remainder is written two stores per loop
 * iteration.
 */
static void
splat_u32_unroll2 (uint32_t *dest, int dstr, const uint32_t *param, int n)
{
  int pairs;

  /* Peel one store so the remaining count is even. */
  if (n & 1) {
    *dest = *param;
    OIL_INCREMENT(dest, dstr);
  }

  /* Remaining elements, two per iteration. */
  for (pairs = n >> 1; pairs > 0; pairs--) {
    *dest = *param;
    OIL_INCREMENT(dest, dstr);

    *dest = *param;
    OIL_INCREMENT(dest, dstr);
  }
}
/*
 * sum_f64_i10_unroll4:
 * @dest: receives the single double result
 * @src:  first input element
 * @sstr: stride between input elements as consumed by OIL_INCREMENT and
 *        OIL_GET (presumably a byte offset -- confirm against the macros)
 * @n:    number of input elements
 *
 * Sums @n strided doubles using four independent accumulators.  Leading
 * elements are consumed one at a time into the first accumulator until the
 * remaining count is a multiple of four; the rest is processed four per
 * iteration, one element per accumulator.  The accumulators are combined
 * only at the end, so the rounding order differs from a plain running sum.
 */
static void
sum_f64_i10_unroll4 (double *dest, double *src, int sstr, int n)
{
  double acc0 = 0;
  double acc1 = 0;
  double acc2 = 0;
  double acc3 = 0;
  int idx = 0;

  /* Head: consume single elements until n is a multiple of four. */
  for (; n & 3; n--) {
    acc0 += *src;
    OIL_INCREMENT (src, sstr);
  }

  /* Body: four elements per pass, addressed relative to the advanced src. */
  while (idx < n) {
    acc0 += OIL_GET(src, sstr*idx, double);
    acc1 += OIL_GET(src, sstr*(idx+1), double);
    acc2 += OIL_GET(src, sstr*(idx+2), double);
    acc3 += OIL_GET(src, sstr*(idx+3), double);
    idx += 4;
  }

  *dest = acc0 + acc1 + acc2 + acc3;
}