/*
 * dotproduct3 - dot product of u and v, accumulated directly in *dest.
 *
 * The element count is taken from vec_length(u); v is indexed over the same
 * range and its own length is not checked here. *dest is read and rewritten
 * on every iteration, so it always holds the running sum.
 */
void dotproduct3(vec_ptr u, vec_ptr v, data_t *dest)
{
    long int i;
    long int len = vec_length(u);
    data_t *data1 = get_vec_start(u);
    data_t *data2 = get_vec_start(v);

    /* Additive identity: seeding with 1.0 made every result one too large. */
    *dest = 0;
    for (i = 0; i < len; i++) {
        *dest = *dest + data1[i] * data2[i];
    }
}
/*
 * dotproduct4 - dot product of u and v, accumulated in a local variable.
 *
 * The running sum lives in a local and is stored through dest exactly once,
 * after the loop. The element count comes from vec_length(u); v is indexed
 * over the same range and its own length is not checked here.
 */
void dotproduct4(vec_ptr u, vec_ptr v, data_t *dest)
{
    long int i;
    long int length = vec_length(u);
    data_t *data1 = get_vec_start(u);
    data_t *data2 = get_vec_start(v);
    /* Additive identity: seeding with 1.0 made every result one too large. */
    data_t acc = 0;

    for (i = 0; i < length; i++) {
        acc = acc + data1[i] * data2[i];
    }
    *dest = acc;
}
/*
 * dotproduct4 - dot product of u and v, accumulated in a local variable.
 *
 * The element count comes from vec_length(u); v is indexed over the same
 * range. *dest is written exactly once, after the loop. (An earlier
 * "*dest = 1.0" store was always overwritten by the final result and was
 * issued before any input had been read, so it has been dropped.)
 */
void dotproduct4(vec_ptr u, vec_ptr v, data_t *dest)
{
    long int i;
    long int len = vec_length(u);
    data_t *data1 = get_vec_start(u);
    data_t *data2 = get_vec_start(v);
    data_t temp = 0;

    for (i = 0; i < len; i++) {
        temp = temp + data1[i] * data2[i];
    }
    *dest = temp;
}
/*
 * inner4 - inner product of u and v, kept in a local until the end.
 *
 * The element count is vec_length(u); v is read over the same index range.
 * The result is stored through dest once, after the loop.
 */
void inner4(vec_ptr u, vec_ptr v, data_t *dest)
{
    long int idx = 0;
    int count = vec_length(u);
    data_t *lhs = get_vec_start(u);
    data_t *rhs = get_vec_start(v);
    data_t total = (data_t) 0;

    /* The dependent chain through total is the critical path of the function. */
    while (idx < count) {
        total = total + lhs[idx] * rhs[idx];
        idx++;
    }

    /* Only one store through the pointer, instead of one per iteration. */
    *dest = total;
}
/*
 * unroll12aa_combine - fold all elements of v with OP, 12 per iteration.
 *
 * Inside each group of 12 the elements are first combined pairwise, the
 * pairs into three groups of four, and those three into one value that is
 * then folded into the single accumulator. Leftover elements are folded one
 * at a time. IDENT and OP are supplied from outside this function.
 */
void unroll12aa_combine(vec_ptr v, data_t *dest)
{
    long int count = vec_length(v);
    long int bound = count-11;
    data_t *elems = get_vec_start(v);
    data_t total = IDENT;
    long int k = 0;

    /* Full groups of 12 */
    while (k < bound) {
        data_t pair_a = elems[k] OP elems[k+1];
        data_t pair_b = elems[k+2] OP elems[k+3];
        data_t quad_lo = pair_a OP pair_b;
        data_t pair_c = elems[k+4] OP elems[k+5];
        data_t pair_d = elems[k+6] OP elems[k+7];
        data_t quad_mid = pair_c OP pair_d;
        data_t pair_e = elems[k+8] OP elems[k+9];
        data_t pair_f = elems[k+10] OP elems[k+11];
        data_t quad_hi = pair_e OP pair_f;

        total = total OP ((quad_lo OP quad_mid) OP quad_hi);
        k += 12;
    }

    /* Tail: fewer than 12 elements left */
    while (k < count) {
        total = total OP elems[k];
        k++;
    }

    *dest = total;
}
/*
 * unroll7aa_combine - fold all elements of v with OP, 7 per iteration.
 *
 * Each group of 7 is reduced as ((e0 OP e1) OP (e2 OP e3)) and
 * ((e4 OP e5) OP e6) before being folded into the single accumulator.
 * Leftover elements are folded one at a time.
 */
void unroll7aa_combine(vec_ptr v, data_t *dest)
{
    long int count = vec_length(v);
    long int bound = count-6;
    data_t *elems = get_vec_start(v);
    data_t total = IDENT;
    long int k = 0;

    /* Full groups of 7 */
    while (k < bound) {
        data_t pair_a = elems[k] OP elems[k+1];
        data_t pair_b = elems[k+2] OP elems[k+3];
        data_t front = pair_a OP pair_b;
        data_t pair_c = elems[k+4] OP elems[k+5];
        data_t last = elems[k+6];
        data_t back = pair_c OP last;

        total = total OP (front OP back);
        k += 7;
    }

    /* Tail: fewer than 7 elements left */
    while (k < count) {
        total = total OP elems[k];
        k++;
    }

    *dest = total;
}
/*
 * unroll8x8_combine - fold all elements of v with OP, 8 elements per
 * iteration spread over 8 independent accumulators (pointer version).
 *
 * Element k of each group of 8 goes to accumulator k. Elements left over
 * after the last full group are folded into acc0. The eight accumulators
 * are combined left to right into *dest.
 */
void unroll8x8_combine(vec_ptr v, data_t *dest)
{
    long int length = vec_length(v);
    data_t *data = get_vec_start(v);
    /* Stop the unrolled loop while at least 8 elements remain. */
    data_t *dend = data+length-7;
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;
    data_t acc4 = IDENT;
    data_t acc5 = IDENT;
    data_t acc6 = IDENT;
    data_t acc7 = IDENT;

    while (data < dend) {
        acc0 = acc0 OP data[0];
        acc1 = acc1 OP data[1];
        acc2 = acc2 OP data[2];
        acc3 = acc3 OP data[3];
        acc4 = acc4 OP data[4];
        /* Previously stored into acc6, which discarded acc6's running value
           and left acc5 stuck at IDENT. */
        acc5 = acc5 OP data[5];
        acc6 = acc6 OP data[6];
        acc7 = acc7 OP data[7];
        data += 8;
    }

    /* Restore the true end pointer and finish the remaining elements. */
    dend += 7;
    while (data < dend) {
        acc0 = acc0 OP *data;
        data++;
    }

    *dest = acc0 OP acc1 OP acc2 OP acc3 OP acc4 OP acc5 OP acc6 OP acc7;
}
/*
 * simd_v2a_combine - fold all elements of v with OP, reading two vec_t
 * chunks per iteration and combining them with each other before folding
 * them into a single vec_t accumulator.
 *
 * pack_t is used to move values between an array of data_t (member d) and a
 * vec_t (member v) -- presumably a union; confirm against its declaration.
 * Elements before the first VBYTES-aligned address and after the last full
 * pair of chunks are folded one at a time into a scalar.
 */
void simd_v2a_combine(vec_ptr v, data_t *dest)
{
    long int lane;
    pack_t pack;
    vec_t vacc;
    data_t *cursor = get_vec_start(v);
    int remaining = vec_length(v);
    data_t scalar = IDENT;

    /* Start the vector accumulator with IDENT in every lane */
    lane = 0;
    while (lane < VSIZE) {
        pack.d[lane] = IDENT;
        lane++;
    }
    vacc = pack.v;

    /* Scalar steps until the address is a multiple of VBYTES or input runs out */
    while (((long) cursor) % VBYTES && remaining) {
        scalar = scalar OP *cursor++;
        remaining--;
    }

    /* Two chunks per step */
    for (; remaining >= 2*VSIZE; remaining -= 2*VSIZE) {
        vec_t first = *((vec_t *) cursor);
        vec_t second = *((vec_t *) (cursor+VSIZE));

        vacc = vacc OP (first OP second);
        cursor += 2*VSIZE;
    }

    /* Scalar tail */
    for (; remaining; remaining--) {
        scalar = scalar OP *cursor++;
    }

    /* Fold the lanes of the vector accumulator into the scalar result */
    pack.v = vacc;
    for (lane = 0; lane < VSIZE; lane++) {
        scalar = scalar OP pack.d[lane];
    }

    *dest = scalar;
}
/*
 * unroll12x6a_combine - fold all elements of v with OP, 12 elements per
 * iteration over 6 accumulators.
 *
 * Accumulator k receives elements k and k+6 of each group of 12, in that
 * order. Leftover elements go to the first accumulator. The accumulators
 * are merged pairwise into *dest.
 */
void unroll12x6a_combine(vec_ptr v, data_t *dest)
{
    long int count = vec_length(v);
    long int bound = count-11;
    data_t *elems = get_vec_start(v);
    data_t s0 = IDENT;
    data_t s1 = IDENT;
    data_t s2 = IDENT;
    data_t s3 = IDENT;
    data_t s4 = IDENT;
    data_t s5 = IDENT;
    long int k = 0;

    /* Full groups of 12 */
    while (k < bound) {
        /* first half of the group */
        s0 = s0 OP elems[k];
        s1 = s1 OP elems[k+1];
        s2 = s2 OP elems[k+2];
        s3 = s3 OP elems[k+3];
        s4 = s4 OP elems[k+4];
        s5 = s5 OP elems[k+5];
        /* second half of the group */
        s0 = s0 OP elems[k+6];
        s1 = s1 OP elems[k+7];
        s2 = s2 OP elems[k+8];
        s3 = s3 OP elems[k+9];
        s4 = s4 OP elems[k+10];
        s5 = s5 OP elems[k+11];
        k += 12;
    }

    /* Tail: fewer than 12 elements left */
    while (k < count) {
        s0 = s0 OP elems[k];
        k++;
    }

    *dest = ((s0 OP s1) OP (s2 OP s3)) OP (s4 OP s5);
}
/*
 * unroll10x10a_combine - fold all elements of v with OP, 10 elements per
 * iteration, one accumulator per position in the group.
 *
 * Leftover elements go to the first accumulator. The ten accumulators are
 * merged pairwise, then as (first four) OP (next four) OP (last two).
 */
void unroll10x10a_combine(vec_ptr v, data_t *dest)
{
    long int count = vec_length(v);
    long int bound = count-9;
    data_t *elems = get_vec_start(v);
    data_t s0 = IDENT;
    data_t s1 = IDENT;
    data_t s2 = IDENT;
    data_t s3 = IDENT;
    data_t s4 = IDENT;
    data_t s5 = IDENT;
    data_t s6 = IDENT;
    data_t s7 = IDENT;
    data_t s8 = IDENT;
    data_t s9 = IDENT;
    data_t low_four, mid_four, top_two;
    long int k = 0;

    /* Full groups of 10 */
    while (k < bound) {
        s0 = s0 OP elems[k];
        s1 = s1 OP elems[k+1];
        s2 = s2 OP elems[k+2];
        s3 = s3 OP elems[k+3];
        s4 = s4 OP elems[k+4];
        s5 = s5 OP elems[k+5];
        s6 = s6 OP elems[k+6];
        s7 = s7 OP elems[k+7];
        s8 = s8 OP elems[k+8];
        s9 = s9 OP elems[k+9];
        k += 10;
    }

    /* Tail: fewer than 10 elements left */
    while (k < count) {
        s0 = s0 OP elems[k];
        k++;
    }

    low_four = (s0 OP s1) OP (s2 OP s3);
    mid_four = (s4 OP s5) OP (s6 OP s7);
    top_two = s8 OP s9;
    *dest = (low_four OP mid_four) OP top_two;
}
/*
 * unroll16a_combine - fold all elements of v with OP, 16 per iteration,
 * using one accumulator.
 *
 * Elements are folded strictly left to right, two per statement; leftover
 * elements are folded one at a time.
 */
void unroll16a_combine(vec_ptr v, data_t *dest)
{
    long int count = vec_length(v);
    long int bound = count-15;
    data_t *elems = get_vec_start(v);
    data_t total = IDENT;
    long int k;

    /* Full groups of 16 */
    for (k = 0; k < bound; k += 16) {
        const data_t *blk = elems + k;

        total = (total OP blk[0]) OP blk[1];
        total = (total OP blk[2]) OP blk[3];
        total = (total OP blk[4]) OP blk[5];
        total = (total OP blk[6]) OP blk[7];
        total = (total OP blk[8]) OP blk[9];
        total = (total OP blk[10]) OP blk[11];
        total = (total OP blk[12]) OP blk[13];
        total = (total OP blk[14]) OP blk[15];
    }

    /* Tail: fewer than 16 elements left */
    while (k < count) {
        total = total OP elems[k];
        k++;
    }

    *dest = total;
}
//Requires that (length-2) is divisible by the block size void SOR_blocked(vec_ptr v, int *iterations, int b) { long int i, j, ii, jj; long int length = get_vec_length(v); data_t *data = get_vec_start(v); double change, mean_change = 100; int iters = 0; while (((mean_change/(double)(length*length)) > (double)TOL) || 1) { iters++; mean_change = 0; for (ii = 1; ii < length-1; ii+=b) for (jj = 1; jj < length-1; jj+=b) for (i = ii; i < ii+b; i++) for (j = jj; j < jj+b; j++) { change = data[i*length+j] - .25 * (data[(i-1)*length+j] + data[(i+1)*length+j] + data[i*length+j+1] + data[i*length+j-1]); data[i*length+j] -= change * OMEGA; if (change < 0){ change = -change; } mean_change += change; } if (abs(data[(length-2)*(length-2)]) > 10.0*(MAXVAL - MINVAL)) { printf("\n PROBABLY DIVERGENCE iter = %d", iters); break; } if(iters == MAX_ITERS) break; } *iterations = iters; }
/*
 * unroll7x7a_combine - fold all elements of v with OP, 7 elements per
 * iteration, one accumulator per position in the group.
 *
 * Leftover elements go to the first accumulator. The accumulators are
 * merged as ((s0 OP s1) OP (s2 OP s3)) OP ((s4 OP s5) OP s6).
 */
void unroll7x7a_combine(vec_ptr v, data_t *dest)
{
    long int count = vec_length(v);
    long int bound = count-6;
    data_t *elems = get_vec_start(v);
    data_t s0 = IDENT;
    data_t s1 = IDENT;
    data_t s2 = IDENT;
    data_t s3 = IDENT;
    data_t s4 = IDENT;
    data_t s5 = IDENT;
    data_t s6 = IDENT;
    data_t low_four, top_three;
    long int k = 0;

    /* Full groups of 7 */
    while (k < bound) {
        s0 = s0 OP elems[k];
        s1 = s1 OP elems[k+1];
        s2 = s2 OP elems[k+2];
        s3 = s3 OP elems[k+3];
        s4 = s4 OP elems[k+4];
        s5 = s5 OP elems[k+5];
        s6 = s6 OP elems[k+6];
        k += 7;
    }

    /* Tail: fewer than 7 elements left */
    while (k < count) {
        s0 = s0 OP elems[k];
        k++;
    }

    low_four = (s0 OP s1) OP (s2 OP s3);
    top_three = (s4 OP s5) OP s6;
    *dest = low_four OP top_three;
}
/*
 * unroll8x4a_combine - fold all elements of v with OP, 8 elements per
 * iteration over 4 accumulators.
 *
 * Accumulator k receives elements k and k+4 of each group of 8, in that
 * order. Leftover elements go to the first accumulator. The accumulators
 * are merged left to right into *dest.
 */
void unroll8x4a_combine(vec_ptr v, data_t *dest)
{
    long int count = vec_length(v);
    long int bound = count-7;
    data_t *elems = get_vec_start(v);
    data_t s0 = IDENT;
    data_t s1 = IDENT;
    data_t s2 = IDENT;
    data_t s3 = IDENT;
    long int k = 0;

    /* Full groups of 8; the four chains are independent of one another */
    while (k < bound) {
        s0 = s0 OP elems[k];
        s0 = s0 OP elems[k+4];

        s1 = s1 OP elems[k+1];
        s1 = s1 OP elems[k+5];

        s2 = s2 OP elems[k+2];
        s2 = s2 OP elems[k+6];

        s3 = s3 OP elems[k+3];
        s3 = s3 OP elems[k+7];

        k += 8;
    }

    /* Tail: fewer than 8 elements left */
    while (k < count) {
        s0 = s0 OP elems[k];
        k++;
    }

    *dest = ((s0 OP s1) OP s2) OP s3;
}
/*
 * unroll9x3_combine - fold all elements of v with OP, 9 elements per
 * iteration over 3 accumulators.
 *
 * Within each group of 9, element k goes to accumulator k % 3. Leftover
 * elements go to the first accumulator. The accumulators are merged left
 * to right into *dest.
 */
void unroll9x3_combine(vec_ptr v, data_t *dest)
{
    long int count = vec_length(v);
    data_t *elems = get_vec_start(v);
    long int bound = count-8;
    data_t s0 = IDENT;
    data_t s1 = IDENT;
    data_t s2 = IDENT;
    long int k;

    /* Full groups of 9 */
    for (k = 0; k < bound; k += 9) {
        s0 = s0 OP elems[k];
        s1 = s1 OP elems[k+1];
        s2 = s2 OP elems[k+2];
        s0 = s0 OP elems[k+3];
        s1 = s1 OP elems[k+4];
        s2 = s2 OP elems[k+5];
        s0 = s0 OP elems[k+6];
        s1 = s1 OP elems[k+7];
        s2 = s2 OP elems[k+8];
    }

    /* Tail: fewer than 9 elements left */
    for (; k < count; k++) {
        s0 = s0 OP elems[k];
    }

    *dest = (s0 OP s1) OP s2;
}
/*
 * unroll16_combine - fold all elements of v with OP, 16 per iteration,
 * using one accumulator and strict left-to-right order.
 *
 * The unrolled loop covers the largest multiple of 16 not exceeding the
 * length; the remaining length % 16 elements are folded one at a time.
 */
void unroll16_combine(vec_ptr v, data_t *dest)
{
    long int count = vec_length(v);
    data_t *elems = get_vec_start(v);
    int tail = count%16;
    long int bound = count - tail;
    data_t total = IDENT;
    long int k;

    /* Full groups of 16 */
    for (k = 0; k < bound; k += 16) {
        total = total OP elems[k];
        total = total OP elems[k+1];
        total = total OP elems[k+2];
        total = total OP elems[k+3];
        total = total OP elems[k+4];
        total = total OP elems[k+5];
        total = total OP elems[k+6];
        total = total OP elems[k+7];
        total = total OP elems[k+8];
        total = total OP elems[k+9];
        total = total OP elems[k+10];
        total = total OP elems[k+11];
        total = total OP elems[k+12];
        total = total OP elems[k+13];
        total = total OP elems[k+14];
        total = total OP elems[k+15];
    }

    /* The last `tail` elements */
    for (; k < count; k++) {
        total = total OP elems[k];
    }

    *dest = total;
}
/*
 * SOR - relaxation sweeps over a length x length grid stored row-major in v.
 *
 * For every interior cell the difference between the cell and the average
 * of its four neighbours is computed, the cell is moved by that difference
 * times OMEGA, and the magnitude of the difference is summed into
 * mean_change. Sweeps repeat while mean_change / (length*length) exceeds
 * TOL, stopping early on suspected divergence or after MAX_ITERS sweeps.
 * The number of sweeps performed is stored in *iterations.
 */
void SOR(vec_ptr v, int *iterations)
{
    long int i, j;
    long int length = get_vec_length(v);
    data_t *data = get_vec_start(v);
    double change, mean_change = 100;   /* start w/ something big */
    double probe;
    int iters = 0;

    while ((mean_change/(double)(length*length)) > (double)TOL) {
        iters++;
        mean_change = 0;
        for (i = 1; i < length-1; i++)
            for (j = 1; j < length-1; j++) {
                change = data[i*length+j] - .25 * (data[(i-1)*length+j] +
                                                   data[(i+1)*length+j] +
                                                   data[i*length+j+1] +
                                                   data[i*length+j-1]);
                data[i*length+j] -= change * OMEGA;
                if (change < 0){
                    change = -change;
                }
                mean_change += change;
            }

        /* Divergence check on one sampled cell. The magnitude is taken in
           double: abs() takes an int, so it truncated the value and was
           undefined once a diverging value no longer fit in an int. */
        probe = data[(length-2)*(length-2)];
        if (probe < 0) {
            probe = -probe;
        }
        if (probe > 10.0*(MAXVAL - MINVAL)) {
            /* iters is an int, so the conversion must be %d (was %ld). */
            printf("\n PROBABLY DIVERGENCE iter = %d", iters);
            break;
        }

        /* Limit the number of iterations, this adds a constant factor to the kernel */
        if (iters == MAX_ITERS)
            break;
    }
    *iterations = iters;
}
/* $begin combine5px8-ans */
/*
 * combine5px8 - fold all elements of v with OPER, 8 per iteration, using a
 * single accumulator and strict left-to-right order. Leftover elements are
 * folded one at a time.
 */
void combine5px8(vec_ptr v, data_t *dest)
{
    int count = vec_length(v);
    int bound = count - 8;
    data_t *elems = get_vec_start(v);
    data_t total = IDENT;
    int k = 0;

    /* Full groups of 8: runs while at least 8 elements remain */
    while (k <= bound) {
        total = total OPER elems[k] OPER elems[k+1]
                      OPER elems[k+2] OPER elems[k+3]
                      OPER elems[k+4] OPER elems[k+5]
                      OPER elems[k+6] OPER elems[k+7];
        k += 8;
    }

    /* Tail: fewer than 8 elements left */
    while (k < count) {
        total = total OPER elems[k];
        k++;
    }

    *dest = total;
}
/*
 * inner_ged - inner product of u and v, unrolled by 4 with one accumulator.
 *
 * The element count is vec_length(u); v is read over the same index range.
 * The result is stored through dest once, after both loops.
 */
void inner_ged(vec_ptr u, vec_ptr v, data_t *dest)
{
    int i;
    int length = vec_length(u);
    /* The unrolled loop may only run while i+3 is still a valid index.
       Looping on i < length read up to three elements past the end when
       length was not a multiple of 4. */
    int limit = length - 3;
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);
    data_t sum = (data_t) 0;

    for (i = 0; i < limit; i += 4) {
        sum = sum + udata[i] * vdata[i];
        sum = sum + udata[i+1] * vdata[i+1];
        sum = sum + udata[i+2] * vdata[i+2];
        sum = sum + udata[i+3] * vdata[i+3];
    }

    /* Continue from where the unrolled loop stopped. Stepping i back by 4
       first re-added the last four products to the sum. */
    for (; i < length; i++) {
        sum = sum + udata[i] * vdata[i];
    }

    *dest = sum;
}
/*
 * dotproduct7 - dot product of u and v, unrolled by 2.
 *
 * The two products of each pair are added together before being added to
 * the running sum. The element count comes from vec_length(u); v is indexed
 * over the same range. *dest is written exactly once, after the loops. (An
 * earlier "*dest = 1.0" store was always overwritten by the final result
 * and was issued before any input had been read, so it has been dropped.)
 */
void dotproduct7(vec_ptr u, vec_ptr v, data_t *dest)
{
    long int i;
    long int len = vec_length(u);
    long int limit = len - 1;
    data_t *data1 = get_vec_start(u);
    data_t *data2 = get_vec_start(v);
    data_t temp = 0;

    /* Two elements per iteration */
    for (i = 0; i < limit; i += 2) {
        temp = temp + (data1[i] * data2[i] + data1[i+1] * data2[i+1]);
    }

    /* At most one element left */
    for (; i < len; i++) {
        temp = temp + data1[i] * data2[i];
    }

    *dest = temp;
}
/*
 * simd_v8_combine - fold all elements of v with OP, reading eight vec_t
 * chunks per iteration into eight independent vec_t accumulators.
 *
 * pack_t is used to move values between an array of data_t (member d) and a
 * vec_t (member v) -- presumably a union; confirm against its declaration.
 * Elements before the first VBYTES-aligned address and after the last full
 * group of eight chunks are folded one at a time into a scalar.
 */
void simd_v8_combine(vec_ptr v, data_t *dest)
{
    long int lane;
    pack_t pack;
    vec_t va, vb, vc, vd, ve, vf, vg, vh;
    data_t *cursor = get_vec_start(v);
    int remaining = vec_length(v);
    data_t scalar = IDENT;

    /* Every accumulator starts with IDENT in every lane */
    lane = 0;
    while (lane < VSIZE) {
        pack.d[lane] = IDENT;
        lane++;
    }
    va = pack.v;
    vb = pack.v;
    vc = pack.v;
    vd = pack.v;
    ve = pack.v;
    vf = pack.v;
    vg = pack.v;
    vh = pack.v;

    /* Scalar steps until the address is a multiple of VBYTES or input runs out */
    while (((long) cursor) % VBYTES && remaining) {
        scalar = scalar OP *cursor++;
        remaining--;
    }

    /* Eight chunks per step, one per accumulator */
    for (; remaining >= 8*VSIZE; remaining -= 8*VSIZE) {
        vec_t in0 = *((vec_t *) cursor);
        vec_t in1 = *((vec_t *) (cursor+VSIZE));
        vec_t in2 = *((vec_t *) (cursor+2*VSIZE));
        vec_t in3 = *((vec_t *) (cursor+3*VSIZE));
        vec_t in4 = *((vec_t *) (cursor+4*VSIZE));
        vec_t in5 = *((vec_t *) (cursor+5*VSIZE));
        vec_t in6 = *((vec_t *) (cursor+6*VSIZE));
        vec_t in7 = *((vec_t *) (cursor+7*VSIZE));

        va = va OP in0;
        vb = vb OP in1;
        vc = vc OP in2;
        vd = vd OP in3;
        ve = ve OP in4;
        vf = vf OP in5;
        vg = vg OP in6;
        vh = vh OP in7;
        cursor += 8*VSIZE;
    }

    /* Scalar tail */
    for (; remaining; remaining--) {
        scalar = scalar OP *cursor++;
    }

    /* Merge the eight accumulators into one vector */
    pack.v = (va OP vb) OP (vc OP vd);
    pack.v = (pack.v OP (ve OP vf)) OP (vg OP vh);

    /* Fold the lanes of that vector into the scalar result */
    for (lane = 0; lane < VSIZE; lane++) {
        scalar = scalar OP pack.d[lane];
    }

    *dest = scalar;
}
/*
 * process - fold all elements of v with OP, starting from IDENT, and store
 * the result through dest once the loop has finished.
 */
void process(vec_t *v, data_t *dest)
{
    int count = vec_length(v);
    data_t *elems = get_vec_start(v);
    data_t folded = IDENT;
    int k = 0;

    while (k < count) {
        folded = folded OP elems[k];
        k++;
    }

    *dest = folded;
}
/*
 * combine4p - fold all elements of v with OP into a local accumulator,
 * walking the data with a pointer instead of an index. The result is
 * stored through dest once, after the loop.
 */
void combine4p(vec_ptr v, data_t *dest)
{
    long int count = vec_length(v);
    data_t *cursor = get_vec_start(v);
    data_t *stop = cursor+count;
    data_t total = IDENT;

    while (cursor < stop) {
        total = total OP *cursor;
        cursor++;
    }

    *dest = total;
}
/*
 * combine3 - fold all elements of v with OP, indexing the vector's data
 * array directly. The running value is kept in *dest itself, which is read
 * and rewritten on every iteration.
 */
void combine3(vec_ptr v, data_t *dest)
{
    long int count = vec_length(v);
    data_t *elems = get_vec_start(v);
    long int k = 0;

    *dest = IDENT;
    while (k < count) {
        *dest = *dest OP elems[k];
        k++;
    }
}
/*
 * combine4 - fold all elements of v with OP into a local accumulator and
 * store the result through dest once, after the loop.
 */
void combine4(vec_ptr v, data_t *dest)
{
    long int count = vec_length(v);
    data_t *elems = get_vec_start(v);
    data_t total = IDENT;
    long int k = 0;

    while (k < count) {
        total = total OP elems[k];
        k++;
    }

    *dest = total;
}
/*
 * inner4b - inner product of x and y, unrolled by 3.
 *
 * The three products of each group are summed with each other first and
 * that partial sum is then added to the running total. The element count is
 * vec_length(x); y is read over the same index range.
 */
void inner4b(vec_ptr x, vec_ptr y, data_t *dest)
{
    long int k = 0;
    int count = vec_length(x);
    int bound = count - 2;
    data_t *lhs = get_vec_start(x);
    data_t *rhs = get_vec_start(y);
    data_t total = (data_t) 0;
    data_t p0, p1, p2;

    /* Full groups of 3 */
    while (k < bound) {
        p0 = lhs[k] * rhs[k];
        p1 = lhs[k + 1] * rhs[k + 1];
        p2 = lhs[k + 2] * rhs[k + 2];
        total += (p0 + p1 + p2);
        k += 3;
    }

    /* Tail: at most two elements left */
    while (k < count) {
        total += lhs[k] * rhs[k];
        k++;
    }

    *dest = total;
}
/*
 * dotproduct7 - dot product of u and v. Unroll loop by 2 and change the
 * associativity of the combining operation: the two products of a pair are
 * added to each other first, then to the accumulator.
 *
 * The element count comes from vec_length(u); v is indexed over the same
 * range and its own length is not checked here.
 */
void dotproduct7(vec_ptr u, vec_ptr v, data_t *dest)
{
    long int i;
    long int length = vec_length(u);
    long int limit = length - 1;
    data_t *data1 = get_vec_start(u);
    data_t *data2 = get_vec_start(v);
    /* Additive identity: seeding with 1.0 made every result one too large. */
    data_t acc = 0;

    /* Combine 2 elements at a time */
    for (i = 0; i < limit; i += 2) {
        acc = acc + (data1[i] * data2[i] + data1[i+1] * data2[i+1]);
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
        acc = acc + (data1[i] * data2[i]);
    }

    *dest = acc;
}
/*
 * simd_v4_combine - fold all elements of v with OP, reading four vec_t
 * chunks per iteration into four independent vec_t accumulators.
 *
 * pack_t is used to move values between an array of data_t (member d) and a
 * vec_t (member v) -- presumably a union; confirm against its declaration.
 * Elements before the first VBYTES-aligned address and after the last full
 * group of four chunks are folded one at a time into a scalar.
 */
void simd_v4_combine(vec_ptr v, data_t *dest)
{
    long int lane;
    pack_t pack;
    data_t *cursor = get_vec_start(v);
    int remaining = vec_length(v);
    data_t scalar = IDENT;
    vec_t va, vb, vc, vd;

    /* Every accumulator starts with IDENT in every lane */
    lane = 0;
    while (lane < VSIZE) {
        pack.d[lane] = IDENT;
        lane++;
    }
    va = pack.v;
    vb = pack.v;
    vc = pack.v;
    vd = pack.v;

    /* Scalar steps until the address is a multiple of VBYTES or input runs out */
    while (((long) cursor) % VBYTES && remaining) {
        scalar = scalar OP *cursor++;
        remaining--;
    }

    /* $begin simd_v4_loop-c */
    /* Four chunks per step, one per accumulator */
    for (; remaining >= 4*VSIZE; remaining -= 4*VSIZE) {
        vec_t in0 = *((vec_t *) cursor);
        vec_t in1 = *((vec_t *) (cursor+VSIZE));
        vec_t in2 = *((vec_t *) (cursor+2*VSIZE));
        vec_t in3 = *((vec_t *) (cursor+3*VSIZE));

        va = va OP in0;
        vb = vb OP in1;
        vc = vc OP in2;
        vd = vd OP in3;
        cursor += 4*VSIZE;
    }
    /* $end simd_v4_loop-c */

    /* Scalar tail */
    for (; remaining; remaining--) {
        scalar = scalar OP *cursor++;
    }

    /* $begin simd_v4_accum-c */
    /* Merge the four accumulators into one vector */
    pack.v = (va OP vb) OP (vc OP vd);

    /* Fold the lanes of that vector into the scalar result */
    for (lane = 0; lane < VSIZE; lane++) {
        scalar = scalar OP pack.d[lane];
    }
    /* $end simd_v4_accum-c */

    *dest = scalar;
}
/*
 * combine3w - fold all elements of v with OP in a local accumulator while
 * also writing the running value through dest after every element.
 */
void combine3w(vec_ptr v, data_t *dest)
{
    long int count = vec_length(v);
    data_t *elems = get_vec_start(v);
    data_t running = IDENT;
    long int k = 0;

    /* Covers the case where the loop body never executes (count <= 0) */
    *dest = running;

    while (k < count) {
        running = running OP elems[k];
        *dest = running;
        k++;
    }
}
/*
 * combine5p - fold all elements of v with OP, unrolled by 2, walking the
 * data with a pointer. A single accumulator is used and elements are folded
 * strictly left to right.
 */
void combine5p(vec_ptr v, data_t *dest)
{
    data_t *cursor = get_vec_start(v);
    data_t *stop = cursor+vec_length(v);
    data_t *pair_stop = stop-1;
    data_t total = IDENT;

    /* Two elements per step while a full pair remains */
    while (cursor < pair_stop) {
        total = (total OP cursor[0]) OP cursor[1];
        cursor += 2;
    }

    /* At most one element left */
    while (cursor < stop) {
        total = total OP cursor[0];
        cursor++;
    }

    *dest = total;
}