// 64-bit integer multiplication int test_simd_mul_u64() { int test_result = 0; const int alignment = SIMD_WIDTH_BYTES; // Integer { const int num_elems = SIMD_STREAMS_64; const TEST_TYPES test_type = TEST_U64; unsigned long int *arr_A = NULL, *arr_B = NULL, *arr_C1 = NULL, *arr_C2 = NULL; create_test_array(test_type, (void **)&arr_A, num_elems, alignment); create_test_array(test_type, (void **)&arr_B, num_elems, alignment); create_test_array(test_type, (void **)&arr_C1, num_elems, alignment); create_test_array(test_type, (void **)&arr_C2, num_elems, alignment); SIMD_INT va = simd_load(arr_A); SIMD_INT vb = simd_load(arr_B); SIMD_INT vc = simd_mul_u64(va, vb); for (int i = 0; i < num_elems; ++i) arr_C2[i] = arr_A[i] * arr_B[i]; simd_store(arr_C1, vc); test_result += validate_test_arrays(test_type, (void *)arr_C1, (void *)arr_C2, num_elems); free(arr_A); free(arr_B); free(arr_C1); free(arr_C2); } return test_result; }
// Convert unsigned 64-bit integer to 32/64-bit floating-point int test_simd_cvt_u64_fp() { int test_result = 0; const int alignment = SIMD_WIDTH_BYTES; // Float { const int num_elems = SIMD_STREAMS_32; const TEST_TYPES test_type = TEST_FLT; unsigned long int *arr_A = NULL; float *arr_C1 = NULL, *arr_C2 = NULL; create_test_array(TEST_U64, (void **)&arr_A, SIMD_STREAMS_64, alignment); create_test_array(test_type, (void **)&arr_C1, num_elems, alignment); create_test_array(test_type, (void **)&arr_C2, num_elems, alignment); SIMD_INT va = simd_load(arr_A); SIMD_FLT vc = simd_cvt_u64_f32(va); for (int i = 0; i < num_elems/2; ++i) arr_C2[i] = (float)arr_A[i]; for (int i = 0; i < num_elems/2; ++i) arr_C2[num_elems/2 + i] = 0.0; simd_store(arr_C1, vc); test_result += validate_test_arrays(test_type, (void *)arr_C1, (void *)arr_C2, num_elems); free(arr_A); free(arr_C1); free(arr_C2); } // Double { const int num_elems = SIMD_STREAMS_64; const TEST_TYPES test_type = TEST_DBL; unsigned long int *arr_A = NULL; double *arr_C1 = NULL, *arr_C2 = NULL; create_test_array(TEST_U64, (void **)&arr_A, SIMD_STREAMS_64, alignment); create_test_array(test_type, (void **)&arr_C1, num_elems, alignment); create_test_array(test_type, (void **)&arr_C2, num_elems, alignment); SIMD_INT va = simd_load(arr_A); SIMD_DBL vc = simd_cvt_u64_f64(va); for (int i = 0; i < num_elems; ++i) arr_C2[i] = (double)arr_A[i]; simd_store(arr_C1, vc); test_result += validate_test_arrays(test_type, (void *)arr_C1, (void *)arr_C2, num_elems); free(arr_A); free(arr_C1); free(arr_C2); } return test_result; }
// Aligned loads and stores int test_simd_loadstore() { int test_result = 0; const int alignment = SIMD_WIDTH_BYTES; // Integer { const int num_elems = SIMD_STREAMS_32; const TEST_TYPES test_type = TEST_I32; int *arr_A = NULL, *arr_B = NULL; create_test_array(test_type, (void **)&arr_A, num_elems, alignment); create_test_array(test_type, (void **)&arr_B, num_elems, alignment); SIMD_INT va = simd_load(arr_A); simd_store(arr_B, va); test_result += validate_test_arrays(test_type, (void *)arr_A, (void *)arr_B, num_elems); free(arr_A); free(arr_B); } // Float { const int num_elems = SIMD_STREAMS_32; const TEST_TYPES test_type = TEST_FLT; float *arr_A = NULL, *arr_B = NULL; create_test_array(test_type, (void **)&arr_A, num_elems, alignment); create_test_array(test_type, (void **)&arr_B, num_elems, alignment); SIMD_FLT va = simd_load(arr_A); simd_store(arr_B, va); test_result += validate_test_arrays(test_type, (void *)arr_A, (void *)arr_B, num_elems); free(arr_A); free(arr_B); } return test_result; }
// Pack and merge the low 32-bits of 64-bit integers int test_simd_packmerge_i32() { int test_result = 0; const int alignment = SIMD_WIDTH_BYTES; // Integer { const int num_elems = SIMD_STREAMS_32; const TEST_TYPES test_type = TEST_I32; long int *arr_A = NULL, *arr_B = NULL; int *arr_C1 = NULL, *arr_C2 = NULL; create_test_array(TEST_I64, (void **)&arr_A, SIMD_STREAMS_64, alignment); create_test_array(TEST_I64, (void **)&arr_B, SIMD_STREAMS_64, alignment); create_test_array(test_type, (void **)&arr_C1, num_elems, alignment); create_test_array(test_type, (void **)&arr_C2, num_elems, alignment); SIMD_INT va = simd_load(arr_A); SIMD_INT vb = simd_load(arr_B); SIMD_INT vc = simd_packmerge_i32(va, vb); for (int i = 0; i < num_elems/2; ++i) arr_C2[i] = ((int *)arr_A)[2*i]; for (int i = 0; i < num_elems/2; ++i) arr_C2[num_elems/2 + i] = ((int *)arr_B)[2*i]; simd_store(arr_C1, vc); test_result += validate_test_arrays(test_type, (void *)arr_C1, (void *)arr_C2, num_elems); free(arr_A); free(arr_B); free(arr_C1); free(arr_C2); } return test_result; }
// Floating-point fused multiply-add int test_simd_fmadd() { int test_result = 0; const int alignment = SIMD_WIDTH_BYTES; // Float { const int num_elems = SIMD_STREAMS_32; const TEST_TYPES test_type = TEST_FLT; float *arr_A = NULL, *arr_B = NULL, *arr_C1 = NULL, *arr_C2 = NULL; create_test_array(test_type, (void **)&arr_A, num_elems, alignment); create_test_array(test_type, (void **)&arr_B, num_elems, alignment); create_test_array(test_type, (void **)&arr_C1, num_elems, alignment); create_test_array(test_type, (void **)&arr_C2, num_elems, alignment); SIMD_FLT va = simd_load(arr_A); SIMD_FLT vb = simd_load(arr_B); SIMD_FLT vc = simd_load(arr_C1); vc = simd_fmadd(va, vb, vc); for (int i = 0; i < num_elems; ++i) arr_C2[i] = arr_A[i] * arr_B[i] + arr_C1[i]; simd_store(arr_C1, vc); test_result += validate_test_arrays(test_type, (void *)arr_C1, (void *)arr_C2, num_elems); free(arr_A); free(arr_B); free(arr_C1); free(arr_C2); } // Double { const int num_elems = SIMD_STREAMS_64; const TEST_TYPES test_type = TEST_DBL; double *arr_A = NULL, *arr_B = NULL, *arr_C1 = NULL, *arr_C2 = NULL; create_test_array(test_type, (void **)&arr_A, num_elems, alignment); create_test_array(test_type, (void **)&arr_B, num_elems, alignment); create_test_array(test_type, (void **)&arr_C1, num_elems, alignment); create_test_array(test_type, (void **)&arr_C2, num_elems, alignment); SIMD_DBL va = simd_load(arr_A); SIMD_DBL vb = simd_load(arr_B); SIMD_DBL vc = simd_load(arr_C1); vc = simd_fmadd(va, vb, vc); for (int i = 0; i < num_elems; ++i) arr_C2[i] = arr_A[i] * arr_B[i] + arr_C1[i]; simd_store(arr_C1, vc); test_result += validate_test_arrays(test_type, (void *)arr_C1, (void *)arr_C2, num_elems); free(arr_A); free(arr_B); free(arr_C1); free(arr_C2); } return test_result; }
// Advances rays to their intersection with a surface of class `surf_class`.
//
// Parameters:
//   surf_class    - surface kind; only Surface::Plane does any work here,
//                   Surface::Conic is an empty placeholder.
//   params        - surface parameters (not read by the code in this function).
//   Nrays         - number of rays in the coordinate/cosine arrays.
//   X, Y, Z       - ray coordinates, updated in place.
//   cX, cY, cZ    - ray direction components (read only).
//   N_bad         - counter incremented once for every ray whose computed
//                   distance to the surface is negative.
//   flag          - per-ray flag, set to 0 for such rays.
//
// Only the first Nrays - (Nrays % SIMD_VEC_LEN) rays are processed, and only
// when USE_SIMD is defined.
//
// NOTE(review): the remaining tail rays and the non-SIMD build are not handled
// here - confirm whether callers pad Nrays or whether this is unfinished.
// NOTE(review): no value is returned although the return type is
// RT_engine_error; the proper success value is not visible from this function.
RT_engine_error surface_intersection(Surface::SurfaceClass surf_class, real_t* params, size_t Nrays,
                                     real_t* X, real_t* Y, real_t* Z,
                                     real_t* cX, real_t* cY, real_t* cZ,
                                     size_t *N_bad, beam_flag_t* flag)
{
    size_t N_simd = Nrays - (Nrays % SIMD_VEC_LEN);

    real_t zero = 0.0;

#ifdef USE_SIMD
    // Bad rays are counted in a local that OpenMP can reduce across threads;
    // incrementing *N_bad directly from the parallel loop would be a data race.
    size_t bad_count = 0;

#ifdef USE_OPENMP
#pragma omp parallel for reduction(+:bad_count)
#endif
    for (size_t i = 0; i < N_simd; i += SIMD_VEC_LEN) {
        switch ( surf_class ) {
            case Surface::Plane: { // trivial case of Z=0 plane
                // All scratch storage is declared inside the loop body so that
                // every iteration (and therefore every thread) has its own copy.
                real_t s[SIMD_VEC_LEN];

                simd_real_t simd_Z = simd_load(Z+i);
                simd_real_t simd_cZ = simd_load(cZ+i);

                simd_real_t simd_s = -simd_div(simd_Z,simd_cZ); // -Z/cZ

                simd_Z = simd_broadcast_scalar(&zero); // Z = 0

                // Spill the distances to check their sign lane by lane.
                simd_store(s,simd_s);
                for (int j = 0; j < SIMD_VEC_LEN; ++j) {
                    if (s[j] < 0) { // distance must be non-negative!
                        flag[i+j] = 0;
                        ++bad_count;
                    }
                }

                simd_real_t simd_X = simd_load(X+i);
                simd_real_t simd_Y = simd_load(Y+i);

                simd_real_t simd_cX = simd_load(cX+i);
                simd_real_t simd_cY = simd_load(cY+i);

                simd_cX = simd_mul(simd_cX,simd_s); // cX*s
                simd_cY = simd_mul(simd_cY,simd_s); // cY*s

                simd_X = simd_add(simd_X,simd_cX); // X_new = X + cX*s
                simd_Y = simd_add(simd_Y,simd_cY); // Y_new = Y + cY*s

                // Coordinates are written back for every lane, including the
                // ones that were just flagged as bad.
                simd_store(X+i,simd_X);
                simd_store(Y+i,simd_Y);
                simd_store(Z+i,simd_Z);

                break;
            }
            case Surface::Conic: {
                break;
            }
        }
    }

    // Touch the caller's counter only when something was found, exactly as the
    // per-ray increment did.
    if (bad_count > 0) {
        *N_bad += bad_count;
    }
#else // generic non-SIMD realization
    real_t s;
#endif
}