void DoubleToComplex(double *srcI, double *srcQ, Complex *dst, const unsigned int len) { __m256d avxR_D, avxI_D, avxX_D, avxY_D, avxA_D, avxB_D; __m128 avxA, avxB; #if 1 __m256 avxD; #endif for (unsigned int i=0; i+4<=len; i+=4) { avxR_D = _mm256_loadu_pd(srcI + i); avxI_D = _mm256_loadu_pd(srcQ + i); avxX_D = _mm256_unpacklo_pd(avxR_D, avxI_D); //swizzle avxY_D = _mm256_unpackhi_pd(avxR_D, avxI_D); avxA_D = _mm256_permute2f128_pd(avxX_D, avxY_D, 0x20); avxB_D = _mm256_permute2f128_pd(avxX_D, avxY_D, 0x31); avxA = _mm256_cvtpd_ps(avxA_D); //double to float avxB = _mm256_cvtpd_ps(avxB_D); #if 0 avxD = _mm256_castps128_ps256(avxA); avxD = _mm256_insertf128_ps(avxD, avxB, 1); _mm256_storeu_ps((float*)(dst+i), avxD); #else _mm_maskstore_ps((float*)(dst+i), _mm_set_epi32(SET_1, SET_1, SET_1, SET_1), avxA); _mm_maskstore_ps((float*)(dst+i+2), _mm_set_epi32(SET_1, SET_1, SET_1, SET_1), avxB); #endif } for (unsigned int i=len-(len & 0x03); i<len; ++i) { dst[i].m_real = static_cast<float>(srcI[i]); dst[i].m_imag = static_cast<float>(srcQ[i]); } }
/**
 * Transposes one 4x4 block of doubles from A into B using AVX.
 *
 * Four rows of four doubles are read from A, transposed in registers, and written
 * as four rows of four doubles to B. All loads complete before the first store.
 *
 * @param A    first element of the source block
 * @param B    first element of the destination block
 * @param lda  distance, in doubles, between consecutive rows written to B
 * @param ldb  distance, in doubles, between consecutive rows read from A
 *
 * Aligned load/store intrinsics are used, so every addressed row of A and B must
 * be 32-byte aligned.
 *
 * NOTE(review): A is stepped with ldb and B with lda, which is the reverse of the
 * usual "lda belongs to A" naming - confirm that callers pass the strides accordingly.
 */
inline void transpose_4x4block_AVX_64(double* A, double* B, const size_t lda, const size_t ldb)
{
    // Source rows: aRC denotes the element at row R, column C of the block.
    const __m256d in0 = _mm256_load_pd(A);                 // a00 a01 a02 a03
    const __m256d in1 = _mm256_load_pd(A + ldb);           // a10 a11 a12 a13
    const __m256d in2 = _mm256_load_pd(A + 2 * ldb);       // a20 a21 a22 a23
    const __m256d in3 = _mm256_load_pd(A + 3 * ldb);       // a30 a31 a32 a33

    // Pair up neighbouring rows inside each 128-bit lane.
    const __m256d lo01 = _mm256_unpacklo_pd(in0, in1);     // a00 a10 | a02 a12
    const __m256d hi01 = _mm256_unpackhi_pd(in0, in1);     // a01 a11 | a03 a13
    const __m256d lo23 = _mm256_unpacklo_pd(in2, in3);     // a20 a30 | a22 a32
    const __m256d hi23 = _mm256_unpackhi_pd(in2, in3);     // a21 a31 | a23 a33

    // Combine matching lanes of the two pairs into full columns of the input.
    const __m256d col0 = _mm256_permute2f128_pd(lo01, lo23, 0x20); // a00 a10 a20 a30
    const __m256d col1 = _mm256_permute2f128_pd(hi01, hi23, 0x20); // a01 a11 a21 a31
    const __m256d col2 = _mm256_permute2f128_pd(lo01, lo23, 0x31); // a02 a12 a22 a32
    const __m256d col3 = _mm256_permute2f128_pd(hi01, hi23, 0x31); // a03 a13 a23 a33

    // Each input column becomes an output row.
    _mm256_store_pd(B,           col0);
    _mm256_store_pd(B + lda,     col1);
    _mm256_store_pd(B + 2 * lda, col2);
    _mm256_store_pd(B + 3 * lda, col3);
}