// Computes part of matrix.vector v = Wu. Computes N=8 results. // For details see PartialMatrixDotVector64 with N=8. static void PartialMatrixDotVector8(const int8_t* wi, const double* scales, const int8_t* u, int num_in, int num_out, double* v) { // Register containing 16-bit ones for horizontal add with 16->32 bit // conversion. __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1); // Initialize all the results to 0. __m256i result0 = _mm256_setzero_si256(); // Iterate over the input (u), one registerful at a time. for (int j = 0; j < num_in;) { __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(u + j)); // Inputs are processed in groups of kNumInputsPerGroup, replicated // kNumInputGroups times. for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) { // Replicate the low 32 bits (4 inputs) 8 times. __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs)); // Rotate the inputs in groups of 4, so the next 4 inputs are ready. inputs = _mm256_permutevar8x32_epi32(inputs, shift_id); __m256i weights, reps; // Mul-add, with horizontal add of the 4 inputs to each of the results. MultiplyGroup(rep_input, ones, wi, weights, reps, result0); } } ExtractResults(result0, shift_id, wi, scales, num_out, v); }
// Codegen test for the _mm256_broadcastd_epi32 intrinsic: forwards the 128-bit
// argument `a` to the intrinsic and returns its 256-bit result unchanged.
// The directive comment inside the body names the text
// `@llvm.x86.avx2.pbroadcastd.256` expected in the compiler output
// (presumably matched by LLVM FileCheck against the emitted IR -- confirm
// against the file's RUN line, which is not visible here).
// The directive comment is test input, not documentation; do not reword it.
__m256i test_mm256_broadcastd_epi32(__m128i a) {
  // CHECK: @llvm.x86.avx2.pbroadcastd.256
  return _mm256_broadcastd_epi32(a);
}
// Codegen test for the _mm256_broadcastd_epi32 intrinsic: forwards the 128-bit
// argument `a` to the intrinsic and returns its 256-bit result unchanged.
// The directive comments inside the body (presumably consumed by LLVM
// FileCheck -- confirm against the file's RUN line, which is not visible here)
// express three expectations, in order:
//   1. a label anchoring the match to this function's name,
//   2. the text `@llvm.x86.avx2.pbroadcastd.256` must not occur before the
//      next positive match,
//   3. a `shufflevector` from <4 x i32> operands to <8 x i32> with a
//      zeroinitializer mask, i.e. every result lane selects element 0.
// The directive comments are test input, not documentation; do not reword them.
__m256i test_mm256_broadcastd_epi32(__m128i a) {
  // CHECK-LABEL: test_mm256_broadcastd_epi32
  // CHECK-NOT: @llvm.x86.avx2.pbroadcastd.256
  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> zeroinitializer
  return _mm256_broadcastd_epi32(a);
}