int main (void) { { __v2si val; __v2si val2; __v2si val3; init_val(&val); /* Copy val to val2. */ vector_store (&val2, val); /* Copy val2 to val3. */ val3 = vector_load (&val2); /* Compare val to val3. */ { char *p = (char*)&val; char *p2 = (char*)&val3; if (p[0] != p2[0]) return 1; if (p[1] != p2[1]) return 1; if (p[2] != p2[2]) return 1; if (p[3] != p2[3]) return 1; if (p[4] != p2[4]) return 1; if (p[5] != p2[5]) return 1; if (p[6] != p2[6]) return 1; if (p[7] != p2[7]) return 1; } } { __v2si val4 = vector_const (); char *p = (char*)&val4; if (p[0] != 1) return 1; if (p[1] != 0) return 1; if (p[2] != 0) return 1; if (p[3] != 0) return 1; if (p[4] != 2) return 1; if (p[5] != 0) return 1; if (p[6] != 0) return 1; if (p[7] != 0) return 1; } return 0; }
void down_loop(struct rcvr_cb* rcb, int pass) { int i, j, k = 0; int dsample, count, coeff_len; bool IQ_swap; float* buf, *out; float* pSrc, *orig_buf; const vector_type* pFil; vector_type sum1;//, sum2; struct main_cb* mcb = rcb->mcb; if(1 == pass) { // 1st pass filter is hb for 768000 IQ_swap = true; dsample = 2; count = RTL_READ_COUNT; coeff_len = COEFF3072_H_16_LENGTH; buf = &(rcb->iq_buf[0]); out = &(rcb->iq_buf_final[COEFF1536_H_32_LENGTH * 2]); } else { IQ_swap = false; count = RTL_READ_COUNT / 2; coeff_len = mcb->length_fir; buf = &(rcb->iq_buf_final[0]); out = &(rcb->iqSamples[rcb->iqSamples_remaining * 2]); switch(mcb->output_rate) { case 48000: dsample = DOWNSAMPLE_192 * 2; break; case 96000: dsample = DOWNSAMPLE_192; break; case 192000: dsample = DOWNSAMPLE_192 / 2; break; case 384000: dsample = DOWNSAMPLE_192 / 4; break; } } orig_buf = buf; #if defined(INCLUDE_NEON) || defined(INCLUDE_SSE2) // filter is evaluated for two I/Q samples with each iteration, thus use of 'j += 2' for(j = 0; j < count / 2; j += 2) { pSrc = buf; // filter coefficients. NOTE: Assumes coefficients are aligned to 16-byte boundary if(1 == pass) pFil = (const vector_type*) mcb->align3072_768_H; else { switch(mcb->output_rate) { case 48000: pFil = (const vector_type*) mcb->align1536_48_H; break; case 96000: pFil = (const vector_type*) mcb->align1536_96_H; break; case 192000: pFil = (const vector_type*) mcb->align1536_192_H; break; case 384000: pFil = (const vector_type*) mcb->align1536_384_H; break; } } sum1 = vector_zero; // sum1 = sum2 = vector_zero; for(i = 0; i < coeff_len / 8; i++) { // Unroll loop for efficiency & calculate filter for 2*2 I/Q samples // at each pass // sum1 is accu for 2*2 filtered I/Q data at the primary data offset // sum2 is accu for 2*2 filtered I/Q data for the next sample offset. sum1 = vector_mac(sum1, pSrc, pFil[0]); // sum2 = vector_mac(sum2, pSrc+2, pFil[0]); sum1 = vector_mac(sum1, pSrc + 4, pFil[1]); // sum2 = vector_mac(sum2, pSrc+6, pFil[1]); sum1 = vector_mac(sum1, pSrc + 8, pFil[2]); // sum2 = vector_mac(sum2, pSrc+10, pFil[2]); sum1 = vector_mac(sum1, pSrc + 12, pFil[3]); // sum2 = vector_mac(sum2, pSrc+14, pFil[3]); pSrc += 16; pFil += 4; } buf += 4; // Now sum1 and sum2 both have a filtered 2-channel sample each, but we still need // to sum the two hi- and lo-floats of these registers together. Only perform this // if we actually need it for the downsampled output data. // UPDATE: since we're throwing away sum2, don't bother calculating it. if(0 == ((j + 2) % dsample)) { // post-shuffle & add the filtered values and store to dest. vector_store(rcb->dest, sum1, sum1); #if 0 _mm_store_ps(rcb->dest, _mm_add_ps(_mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1, 0, 3, 2)), // s2_1 s2_0 s1_3 s1_2 _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3, 2, 1, 0)) // s2_3 s2_2 s1_1 s1_0 )); #endif // Since we're decimating we only need one of the sums out[k++] = (IQ_swap) ? rcb->dest[0] : rcb->dest[1]; out[k++] = (IQ_swap) ? rcb->dest[1] : rcb->dest[0]; #else // non SIMD for(j = 0; j < count; j += 2) { pSrc = buf; if(1 == pass) pFil = mcb->coeff_768; else { switch(mcb->output_rate) { case 48000: pFil = mcb->coeff_48; break; case 96000: pFil = mcb->coeff_96; break; case 192000: pFil = mcb->coeff_192; break; case 384000: pFil = mcb->coeff_384; break; } } sum1 = sum2 = 0.0f; for(i = 0; i < coeff_len; i++) { sum1 += pSrc[0] * pFil[0]; sum2 += pSrc[1] * pFil[0]; pSrc += 2; pFil++; } buf += 2; if(0 == (((j + 2) / 2) % dsample)) { out[k++] = (IQ_swap) ? sum2 : sum1; out[k++] = (IQ_swap) ? sum1 : sum2; #endif } } // Move the last coeff_len*2 length of buffer to the front for the next call memmove(orig_buf, &orig_buf[count - (coeff_len * 2)], coeff_len * 2 * sizeof(float)); } void downsample(struct rcvr_cb * rcb) { down_loop(rcb, 1); down_loop(rcb, 2); }