/**
 * Benchmarks an in-place reversal of a DATA_SIZE-element int array done with
 * NEON intrinsics against the same reversal done with a plain scalar loop.
 *
 * The array is filled with 0..DATA_SIZE-1, reversed once with NEON and then
 * reversed back with the scalar loop. The scalar/NEON time ratio is written
 * to the Android log and passed to the Java listener as "benchResult:<ratio>".
 * When the NEON pass takes no measurable CPU time the ratio is reported as 0
 * instead of dividing by zero.
 *
 * @param env   JNI environment of the calling thread.
 * @param jRoot Unused.
 * @param jObj  Object on which onResult(String) is invoked; the method is
 *              looked up on com/tencent/helloneon/BenchListener.
 */
void test(JNIEnv * env, jobject jRoot, jobject jObj) {
    (void)jRoot;

    int *testSet1 = (int*)malloc(sizeof(int)*DATA_SIZE);
    if (!testSet1) {
        __android_log_print(ANDROID_LOG_ERROR, "NEON", "failed to allocate benchmark buffer");
        return;
    }

    for(uint32_t i = 0; i<DATA_SIZE; i++) {
        testSet1[i] = i;
    }

    // Number of (head, tail) pairs of 4-element vectors that can be swapped.
    const uint32_t vecPairs = DATA_SIZE/4/2;

    clock_t begin = clock();

    for (uint32_t i=0; i<vecPairs; i++) {
        int32_t *src = testSet1+i*4;
        int32_t *dest = testSet1+DATA_SIZE - 4*(i+1);

        // Load four consecutive elements from each end. (vld1q_dup_s32 would
        // load a single element and broadcast it to all four lanes.)
        int32x4_t head = vld1q_s32(src);
        int32x4_t tail = vld1q_s32(dest);

        // Full 4-lane reversal takes two steps: vrev64q swaps the elements
        // inside each 64-bit half ([a,b,c,d] -> [b,a,d,c]) and vextq by two
        // lanes then swaps the halves ([b,a,d,c] -> [d,c,b,a]).
        int32x4_t rHead = vrev64q_s32(head);
        rHead = vextq_s32(rHead, rHead, 2);
        int32x4_t rTail = vrev64q_s32(tail);
        rTail = vextq_s32(rTail, rTail, 2);

        vst1q_s32(src, rTail);
        vst1q_s32(dest, rHead);
    }

    // Swap the middle elements the vector loop cannot cover when DATA_SIZE is
    // not a multiple of 8.
    for (uint32_t lo = vecPairs*4, hi = DATA_SIZE - vecPairs*4; lo + 1 < hi; lo++, hi--) {
        int t = testSet1[lo];
        testSet1[lo] = testSet1[hi-1];
        testSet1[hi-1] = t;
    }

    clock_t end = clock();

    for (uint32_t i = 0; i<DATA_SIZE/2; i++) {
        int t = testSet1[i];
        int d = testSet1[DATA_SIZE-1-i];
        testSet1[i] = d;
        testSet1[DATA_SIZE-1-i] = t;
    }

    clock_t end2 = clock();

    clock_t cost1 = end-begin;
    clock_t cost2 = end2-end;

    // clock() may not advance during a short NEON pass; avoid dividing by 0.
    const float ratio = (cost1 > 0) ? 1.f*cost2/cost1 : 0.f;

    __android_log_print(ANDROID_LOG_DEBUG, "NEON", "last number is %d, acc=%.1fx", testSet1[DATA_SIZE-1], ratio);

    free(testSet1);

    jclass clasz = env->FindClass("com/tencent/helloneon/BenchListener");
    if (!clasz) {
        // Lookup failed; do not call further JNI functions with a null class.
        return;
    }

    jmethodID method = env->GetMethodID(clasz, "onResult", "(Ljava/lang/String;)V");
    if (method) {
        std::stringstream out;
        out << "benchResult:" << ratio;
        jstring message = env->NewStringUTF(out.str().c_str());
        if (message) {
            env->CallVoidMethod(jObj, method, message);
            env->DeleteLocalRef(message);
        }
    }
    env->DeleteLocalRef(clasz);
}
// Filters two data channels in place through two cascaded first-order
// sections per channel, using NEON so that both channels and both sections
// are computed in one vector.
//
// For each section, with input x, coefficient f and state s, one step is:
//   y = (s + 2 * x * f) >> 16        (vqdmlal_s16 + vshrn_n_s32, saturating)
//   s = (x << 16) - 2 * y * f        (vshll_n_s16 + vqdmlsl_s16, saturating)
// The output y of section 0 is the input x of section 1.
//
// 32-bit lane layout of |statev| (and the matching 16-bit lanes of |factorv|,
// |datav|, |tmp1|, |tmp2|):
//   lane 0: channel 1, section 0      lane 1: channel 1, section 1
//   lane 2: channel 2, section 0      lane 3: channel 2, section 1
//
// The computation is software-pipelined: section 0 always works one sample
// ahead of section 1, so each vector step produces the section-1 (final)
// output for sample k and the section-0 output for sample k + 1.
//
// |length| must be even (asserted). The preprocessing and post-processing code
// read and write elements 0 and 1 of both data buffers unconditionally, so
// presumably callers always pass length >= 2 -- TODO confirm.
// Two factors and two state words are read per channel; the two state words
// per channel are written back on return.
void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
    int16_t* data_ch1,  // Input and output in channel 1, in Q0
    int16_t* data_ch2,  // Input and output in channel 2, in Q0
    const int16_t* factor_ch1,  // Scaling factor for channel 1, in Q15
    const int16_t* factor_ch2,  // Scaling factor for channel 2, in Q15
    const int length,  // Length of the data buffers
    int32_t* filter_state_ch1,  // Filter state for channel 1, in Q16
    int32_t* filter_state_ch2) {  // Filter state for channel 2, in Q16
  assert(length % 2 == 0);
  int n = 0;
  int16x4_t factorv;
  int16x4_t datav;
  int32x4_t statev;
  int32x2_t tmp;

  // Load factor_ch1 and factor_ch2.
  // Each 32-bit load picks up two adjacent 16-bit factors at once, giving
  // factorv = { factor_ch1[0], factor_ch1[1], factor_ch2[0], factor_ch2[1] }.
  tmp = vld1_dup_s32((int32_t*)factor_ch1);
  tmp = vld1_lane_s32((int32_t*)factor_ch2, tmp, 1);
  factorv = vreinterpret_s16_s32(tmp);
  // Load filter_state_ch1[0] and filter_state_ch2[0].
  // Lanes 1 and 3 hold placeholder copies here; they are overwritten with the
  // section-1 states below before they are used.
  statev = vld1q_dup_s32(filter_state_ch1);
  statev = vld1q_lane_s32(filter_state_ch2, statev, 2);

  // Loop unrolling preprocessing.
  // Primes the pipeline: runs section 0 alone on sample 0 of each channel.
  int32x4_t a;
  int16x4_t tmp1, tmp2;

  // Load data_ch1[0] and data_ch2[0].
  datav = vld1_dup_s16(data_ch1);
  datav = vld1_lane_s16(data_ch2, datav, 2);

  // y = (state + 2 * x * factor) >> 16; only lanes 0 and 2 are meaningful yet.
  a = vqdmlal_s16(statev, datav, factorv);
  tmp1 = vshrn_n_s32(a, 16);

  // Update filter_state_ch1[0] and filter_state_ch2[0].
  statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv);

  // Load filter_state_ch1[1] and filter_state_ch2[1].
  statev = vld1q_lane_s32(filter_state_ch1 + 1, statev, 1);
  statev = vld1q_lane_s32(filter_state_ch2 + 1, statev, 3);

  // Load data_ch1[1] and data_ch2[1].
  // The new samples go into lanes 1 and 3 next to the section-0 outputs in
  // lanes 0 and 2; vrev32_s16 then swaps each 16-bit pair so that the new
  // samples feed section 0 and the section-0 outputs feed section 1.
  tmp1 = vld1_lane_s16(data_ch1 + 1, tmp1, 1);
  tmp1 = vld1_lane_s16(data_ch2 + 1, tmp1, 3);
  datav = vrev32_s16(tmp1);

  // Loop unrolling processing.
  // Two pipeline steps per iteration. On entry datav holds sample n + 1 in the
  // section-0 lanes and the section-0 output for sample n in the section-1
  // lanes. The last iteration (n == length - 4) loads up to data[length - 1].
  for (n = 0; n < length - 2; n += 2) {
    a = vqdmlal_s16(statev, datav, factorv);
    tmp1 = vshrn_n_s32(a, 16);
    // Store data_ch1[n] and data_ch2[n].
    // Lanes 1 and 3 carry the section-1 (final) outputs.
    vst1_lane_s16(data_ch1 + n, tmp1, 1);
    vst1_lane_s16(data_ch2 + n, tmp1, 3);

    // Update filter_state_ch1[0], filter_state_ch1[1]
    // and filter_state_ch2[0], filter_state_ch2[1].
    statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv);

    // Load data_ch1[n + 2] and data_ch2[n + 2].
    // Overwrites the just-stored lanes 1 and 3, then swaps pairs so the new
    // samples land in the section-0 lanes.
    tmp1 = vld1_lane_s16(data_ch1 + n + 2, tmp1, 1);
    tmp1 = vld1_lane_s16(data_ch2 + n + 2, tmp1, 3);
    datav = vrev32_s16(tmp1);

    a = vqdmlal_s16(statev, datav, factorv);
    tmp2 = vshrn_n_s32(a, 16);
    // Store data_ch1[n + 1] and data_ch2[n + 1].
    vst1_lane_s16(data_ch1 + n + 1, tmp2, 1);
    vst1_lane_s16(data_ch2 + n + 1, tmp2, 3);

    // Update filter_state_ch1[0], filter_state_ch1[1]
    // and filter_state_ch2[0], filter_state_ch2[1].
    statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp2, factorv);

    // Load data_ch1[n + 3] and data_ch2[n + 3].
    tmp2 = vld1_lane_s16(data_ch1 + n + 3, tmp2, 1);
    tmp2 = vld1_lane_s16(data_ch2 + n + 3, tmp2, 3);
    datav = vrev32_s16(tmp2);
  }

  // Loop unrolling post-processing.
  // Drains the pipeline. Here n is the index of the second-to-last sample
  // (length - 2 when the loop ran, 0 when length == 2), and section 0 has
  // its last sample (n + 1) queued in datav.
  a = vqdmlal_s16(statev, datav, factorv);
  tmp1 = vshrn_n_s32(a, 16);
  // Store data_ch1[n] and data_ch2[n].
  vst1_lane_s16(data_ch1 + n, tmp1, 1);
  vst1_lane_s16(data_ch2 + n, tmp1, 3);

  // Update filter_state_ch1[0], filter_state_ch1[1]
  // and filter_state_ch2[0], filter_state_ch2[1].
  statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv);
  // Store filter_state_ch1[0] and filter_state_ch2[0].
  // Section 0 has now consumed every sample, so its state is final.
  vst1q_lane_s32(filter_state_ch1, statev, 0);
  vst1q_lane_s32(filter_state_ch2, statev, 2);

  // Only section 1 has work left: feed it the last section-0 outputs. No new
  // samples are loaded, so lanes 0 and 2 of the results below are not used.
  datav = vrev32_s16(tmp1);
  a = vqdmlal_s16(statev, datav, factorv);
  tmp2 = vshrn_n_s32(a, 16);
  // Store data_ch1[n + 1] and data_ch2[n + 1].
  vst1_lane_s16(data_ch1 + n + 1, tmp2, 1);
  vst1_lane_s16(data_ch2 + n + 1, tmp2, 3);

  // Update filter_state_ch1[1] and filter_state_ch2[1].
  statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp2, factorv);
  // Store filter_state_ch1[1] and filter_state_ch2[1].
  vst1q_lane_s32(filter_state_ch1 + 1, statev, 1);
  vst1q_lane_s32(filter_state_ch2 + 1, statev, 3);
}
Example #3
0
/* Exercises the vld1q_dup_s32 intrinsic with a null source pointer and keeps
   the loaded vector in a local. The result is never read, and the function
   would dereference address 0 if it were actually executed.  */
void test_vld1Q_dups32 (void)
{
  int32x4_t out_int32x4_t = vld1q_dup_s32 (0);
}