template<> void copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
{
    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
    {
        const uchar* src = (const uchar*)_src;
        uchar* dst = (uchar*)_dst;
        int x = 0;
    #if CV_SSE4_2
        if(USE_SSE4_2)
        {
            __m128i zero = _mm_setzero_si128 ();

            for( ; x <= size.width - 16; x += 16 )
            {
                const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src+x));
                __m128i _mask = _mm_lddqu_si128((const __m128i*)(mask+x));
                __m128i rDst = _mm_lddqu_si128((__m128i*)(dst+x));
                __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
                rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
                _mm_storeu_si128((__m128i*)(dst + x), rDst);
            }
        }
    #endif
        for( ; x < size.width; x++ )
            if( mask[x] )
                dst[x] = src[x];
    }
}
// Bytewise c ? t : e.
static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
#if 0 && defined(__SSE4_1__)  // Make sure we have a bot testing this before enabling.
    return _mm_blendv_epi8(e,t,c);
#else
    return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
#endif
}
/* Bytewise c ? t : e. */
static __m128i if_then_else(__m128i c, __m128i t, __m128i e)
{
#if PNG_INTEL_SSE_IMPLEMENTATION >= 3
   return _mm_blendv_epi8(e,t,c);
#else
   return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
#endif
}
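/*
 * A minimal, hypothetical sanity check (not from either project above) that
 * illustrates why the SSE2 fallback is interchangeable with _mm_blendv_epi8:
 * pblendvb picks t or e per byte based only on the top bit of c, while the
 * and/andnot/or form needs every mask byte to be 0x00 or 0xFF, which is
 * exactly what byte compares such as _mm_cmpeq_epi8 produce.
 */
#include <smmintrin.h>
#include <stdio.h>
#include <string.h>

static __m128i blend_sse2(__m128i c, __m128i t, __m128i e) {
    /* (c & t) | (~c & e): correct whenever c is a full 0x00/0xFF byte mask. */
    return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
}

int main(void) {
    __m128i t = _mm_set1_epi8(0x11);
    __m128i e = _mm_set1_epi8(0x22);
    /* All-ones / all-zeros mask, as produced by a byte compare. */
    __m128i c = _mm_cmpeq_epi8(_mm_setr_epi8(0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1),
                               _mm_setzero_si128());
    __m128i a = _mm_blendv_epi8(e, t, c);  /* top bit of c set -> t, else e */
    __m128i b = blend_sse2(c, t, e);
    printf("%s\n", memcmp(&a, &b, 16) == 0 ? "match" : "mismatch");
    return 0;
}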
__m128i branchfree_search4_avx(int* source, size_t n, __m128i target) {
    __m128i offsets = _mm_setzero_si128();
    if(n == 0) return offsets;
    __m128i ha = _mm_set1_epi32(n>>1);
    while(n>1) {
        n -= n>>1;
        __m128i offsetsplushalf = _mm_add_epi32(offsets,ha);
        ha = _mm_sub_epi32(ha,_mm_srli_epi32(ha,1));
        __m128i keys = _mm_i32gather_epi32(source,offsetsplushalf,4);
        __m128i lt = _mm_cmplt_epi32(keys,target);
        offsets = _mm_blendv_epi8(offsets,offsetsplushalf,lt);
    }
    __m128i lastkeys = _mm_i32gather_epi32(source,offsets,4);
    __m128i lastlt = _mm_cmplt_epi32(lastkeys,target);
    __m128i oneswhereneeded = _mm_srli_epi32(lastlt,31);
    __m128i answer = _mm_add_epi32(offsets,oneswhereneeded);
    return answer;
}
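/*
 * A hedged scalar sketch of what each of the four lanes above computes: a
 * branch-free search over a sorted int array that appears to return the
 * first index whose key is >= target (a lower_bound). The name and this
 * reading of the semantics are mine, not taken from the original source.
 */
#include <stddef.h>

size_t branchfree_search_scalar(const int *source, size_t n, int target) {
    size_t offset = 0;
    if (n == 0) return 0;
    size_t half = n >> 1;
    while (n > 1) {
        n -= n >> 1;
        /* Probe the middle of the remaining range; step right only if the
           probed key is still smaller than the target. */
        offset = (source[offset + half] < target) ? offset + half : offset;
        half -= half >> 1;
    }
    /* Final adjustment, mirroring the last gather/compare/add above. */
    offset += (source[offset] < target) ? 1 : 0;
    return offset;
}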
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM];
      unsigned char c[NUM * 16];
    } dst, src1, src2, mask;
  int i;

  init_pblendvb (src1.c, src2.c, mask.c);

  for (i = 0; i < NUM; i++)
    {
      dst.x[i] = _mm_blendv_epi8 (src1.x[i], src2.x[i], mask.x[i]);
      if (check_pblendvb (&dst.x[i], &src1.c[i * 16], &src2.c[i * 16],
			  &mask.c[i * 16]))
	abort ();
    }
}
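/*
 * A hedged scalar reference (hypothetical, not the testsuite's own
 * check_pblendvb) for what the intrinsic above computes: pblendvb takes a
 * byte from the second source when the corresponding mask byte has its most
 * significant bit set, and from the first source otherwise.
 */
static void
scalar_blendv_epi8 (unsigned char *dst, const unsigned char *src1,
		    const unsigned char *src2, const unsigned char *mask)
{
  int i;
  for (i = 0; i < 16; i++)
    dst[i] = (mask[i] & 0x80) ? src2[i] : src1[i];
}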
/*****************************************************************************
 * This function utilises 3 properties of the cost function lookup tables,  *
 * constructed using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in         *
 * vp9_encoder.c.                                                            *
 * For the joint cost:                                                       *
 *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]          *
 * For the component costs:                                                  *
 *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                        *
 *         (Equal costs for both components)                                *
 *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                       *
 *         (Cost function is even)                                          *
 * If these do not hold, then this function cannot be used without          *
 * modification, in which case you can revert to using the C implementation,*
 * which does not rely on these properties.                                 *
 *****************************************************************************/
int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
                               const search_site_config *cfg,
                               MV *ref_mv, MV *best_mv, int search_param,
                               int sad_per_bit, int *num00,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *center_mv) {
  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);

  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);

  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);

  // search_param determines the length of the initial step and hence the
  // number of iterations.
  // 0 = initial step (MAX_FIRST_STEP) pel
  // 1 = (MAX_FIRST_STEP/2) pel,
  // 2 = (MAX_FIRST_STEP/4) pel...
  const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
  const int tot_steps = cfg->total_steps - search_param;

  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
                                        center_mv->col >> 3);
  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);

  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);

  int_mv bmv = pack_int_mv(ref_row, ref_col);
  int_mv new_bmv = bmv;
  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);

  const int what_stride = x->plane[0].src.stride;
  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
  const uint8_t *const what = x->plane[0].src.buf;
  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
                                 ref_row * in_what_stride + ref_col;

  // Work out the start point for the search
  const uint8_t *best_address = in_what;
  const uint8_t *new_best_address = best_address;
#if ARCH_X86_64
  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

  unsigned int best_sad;

  int i;
  int j;
  int step;

  // Check the prerequisite cost function properties that are easy to check
  // in an assert. See the function-level documentation for details on all
  // prerequisites.
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);

  // Check the starting position
  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);

  *num00 = 0;

  for (i = 0, step = 0; step < tot_steps; step++) {
    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
      __m128i v_sad_d;
      __m128i v_cost_d;
      __m128i v_outside_d;
      __m128i v_inside_d;
      __m128i v_diff_mv_w;
#if ARCH_X86_64
      __m128i v_blocka[2];
#else
      __m128i v_blocka[1];
#endif

      // Compute the candidate motion vectors
      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
      // Clamp them to the search bounds
      __m128i v_these_mv_clamp_w = v_these_mv_w;
      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);

      // The ones that did not change are inside the search area
      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);

      // If none of them are inside, then move on
      if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
        continue;
      }

      // The inverse mask indicates which of the MVs are outside
      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
      // Shift right to keep the sign bit clear, we will use this later
      // to set the cost to the maximum value.
      v_outside_d = _mm_srli_epi32(v_outside_d, 1);

      // Compute the difference MV
      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
      // We utilise the fact that the cost function is even, and use the
      // absolute difference. This allows us to use unsigned indexes later
      // and reduces cache pressure somewhat as only a half of the table
      // is ever referenced.
      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);

      // Compute the SIMD pointer offsets.
      {
#if ARCH_X86_64  //  sizeof(intptr_t) == 8
        // Load the offsets
        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
        // Set the ones falling outside to zero
        v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d));
        v_bo32_q = _mm_and_si128(v_bo32_q,
                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
        // Compute the candidate addresses
        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
#else  // ARCH_X86 //  sizeof(intptr_t) == 4
        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
#endif
      }

      fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
                     in_what_stride, (uint32_t*)&v_sad_d);

      // Look up the component cost of the residual motion vector
      {
        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);

        // Note: This is a use case for vpgather in AVX2
        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];

        __m128i v_cost_10_d, v_cost_32_d;

        v_cost_10_d = _mm_cvtsi32_si128(cost0);
        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);

        v_cost_32_d = _mm_cvtsi32_si128(cost2);
        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);

        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
      }

      // Now add in the joint cost
      {
        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
                                                _mm_setzero_si128());
        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
                                                       v_joint_cost_0_d,
                                                       v_sel_d);
        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
      }

      // Multiply by sad_per_bit
      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
      // ROUND_POWER_OF_TWO(v_cost_d, 8)
      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
      // Add the cost to the sad
      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);

      // Make the motion vectors outside the search area have max cost
      // by or'ing in the comparison mask, this way the minimum search won't
      // pick them.
      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);

      // Find the minimum value and index horizontally in v_sad_d
      {
        // Try speculatively on 16 bits, so we can use the minpos intrinsic
        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);

        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);

        // If the local best value is not saturated, just use it, otherwise
        // find the horizontal minimum again the hard way on 32 bits.
        // This is executed rarely.
        if (__unlikely__(local_best_sad == 0xffff)) {
          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;

          v_loval_d = v_sad_d;
          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
          v_hival_d = _mm_srli_si128(v_loval_d, 8);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
          v_hival_d = _mm_srli_si128(v_loval_d, 4);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);

          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
        }

        // Update the global minimum if the local minimum is smaller
        if (__likely__(local_best_sad < best_sad)) {
          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];

          best_sad = local_best_sad;
        }
      }
    }

    bmv = new_bmv;
    best_address = new_best_address;

    v_bmv_w = _mm_set1_epi32(bmv.as_int);
#if ARCH_X86_64
    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

    if (__unlikely__(best_address == in_what)) {
      (*num00)++;
    }
  }

  *best_mv = bmv.as_mv;
  return best_sad;
}
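/*
 * A minimal standalone sketch (not from libvpx) of the horizontal-minimum
 * trick used above: pack four 32-bit SADs to unsigned 16 bits and let the
 * SSE4.1 phminposuw instruction return both the minimum and its lane index
 * at once. Values that do not fit in 16 bits saturate to 0xffff, which is
 * why the function above falls back to a 32-bit reduction in that case.
 */
#include <smmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const __m128i v = _mm_setr_epi32(900, 70, 4000, 250);
  // packus duplicates the four dwords into eight saturated words.
  const __m128i w = _mm_packus_epi32(v, v);
  const __m128i m = _mm_minpos_epu16(w);
  const uint32_t min_val = (uint32_t)_mm_extract_epi16(m, 0);
  const uint32_t min_idx = (uint32_t)_mm_extract_epi16(m, 1);
  printf("min=%u at lane %u\n", min_val, min_idx);  // expected: min=70 at lane 1
  return 0;
}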
bool WidgetAugmentedView::render()
{
    if (!stream) return false;
    stream->getColorFrame(colorFrame);
    stream->getDepthFrame(depthFrame);

    // Correct the depth map
    if (depthCorrector == nullptr) depthBuffer = depthFrame;
    else depthCorrector->correct(depthFrame, depthBuffer);

    // Setup perspective
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    gluPerspective(fovY, float(ColorFrame::WIDTH) / float(ColorFrame::HEIGHT), zNear, zFar);
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();

    glEnable(GL_DEPTH_TEST);
    glColor4f(1.0f, 1.0f, 1.0f, 1.0f);

    //
    // Draw real world (2D color image)
    //

    glDepthFunc(GL_ALWAYS);
    glActiveTexture(GL_TEXTURE0);
    glBindTexture(GL_TEXTURE_2D, textureColor);
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, ColorFrame::WIDTH, ColorFrame::HEIGHT,
        GL_RGBA, GL_UNSIGNED_BYTE, (GLvoid*)colorFrame.pixels);

    glActiveTexture(GL_TEXTURE1);
    glBindTexture(GL_TEXTURE_2D, textureDepth);

    KinectStream* kinect = dynamic_cast<KinectStream*>(stream.obj);
    if (kinect != nullptr) {
        kinect->mapColorFrameToDepthFrame(depthBuffer, OUT mapping);

        const NUI_DEPTH_IMAGE_POINT* src = mapping;
        GLushort* dest = textureDepthBuffer;
        GLushort* end = textureDepthBuffer + ColorFrame::SIZE;

        #define SRC(i) static_cast<short>(static_cast<unsigned short>((src + i)->depth))
    #ifndef NOT_VECTORIZED
        // Vectorized assuming ColorFrame::SIZE % 8 == 0
        __m128i min = _mm_set1_epi16(static_cast<short>(DepthFrame::MIN_DEPTH));
        __m128i max = _mm_set1_epi16(static_cast<short>(DepthFrame::MAX_DEPTH));
        __m128i _0 = _mm_setzero_si128();
        for (; dest < end; dest += 8, src += 8) {
            __m128i v = _mm_set_epi16(SRC(7), SRC(6), SRC(5), SRC(4), SRC(3), SRC(2), SRC(1), SRC(0));
            v = _mm_max_epu16(min, _mm_min_epu16(max, v));
            v = _mm_blendv_epi8(v, max, _mm_cmpeq_epi16(_0, v));
            _mm_store_si128((__m128i*)dest, v);
        }
    #else
        for (; dest < end; ++dest, ++src) {
            unsigned short s = SRC(0);
            s = (s > DepthFrame::MAX_DEPTH) ? DepthFrame::MAX_DEPTH : s;
            s = (s < DepthFrame::MIN_DEPTH) ? DepthFrame::MIN_DEPTH : s;
            *dest = static_cast<GLushort>(s);
        }
    #endif

        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, ColorFrame::WIDTH, ColorFrame::HEIGHT,
            GL_RED_INTEGER, GL_UNSIGNED_SHORT, (GLvoid*)textureDepthBuffer);
    }

    glActiveTexture(GL_TEXTURE0);

    shader2D.bind();
    RenderUtils::drawRect(-1.0f, 1.0f, 2.0f, -2.0f);
    shader2D.release();

    //
    // Draw augmented world
    //

    glDepthFunc(GL_LESS);
    glScalef(1.0f, 1.0f, -1.0f); // Invert Z axis so that +Z is in front of the camera

    // A plane to test occlusion
    /*glColor3f(0.0f, 1.0f, 0.0f);
    glBegin(GL_TRIANGLE_STRIP);
        glVertex3f(-0.5f, -0.5f, 0.5f);
        glVertex3f(-0.5f, 0.5f, 2.5f);
        glVertex3f(0.5f, -0.5f, 2.5f);
        glVertex3f(0.5f, 0.5f, 4.5f);
    glEnd();*/

    glEnable(GL_LIGHTING);

    // Draw the objects
    world.render(renderManager);

    glDisable(GL_LIGHTING);

    return true;
}
template<class T>
void blurRemoveMinMax_(const Mat& src, Mat& dest, const int r)
{
    const Size ksize = Size(2 * r + 1, 2 * r + 1);
    if (src.data != dest.data) src.copyTo(dest);

    Mat xv;
    Mat nv;
    Mat element = Mat::ones(2 * r + 1, 2 * r + 1, CV_8U);
    dilate(src, xv, element);
    erode(src, nv, element);

    Mat mind;
    Mat maxd;
    Mat mask;
    absdiff(src, nv, mind); // can move to loop
    absdiff(src, xv, maxd);
    min(mind, maxd, mask);

    T* n = nv.ptr<T>(0);
    T* x = xv.ptr<T>(0);
    T* d = dest.ptr<T>(0);
    T* nd = mind.ptr<T>(0);
    T* mk = mask.ptr<T>(0);

    int remsize = src.size().area();

#if CV_SSE4_1
    if (src.depth() == CV_8U)
    {
        const int ssesize = src.size().area() / 16;
        remsize = src.size().area() - ssesize * 16;
        for (int i = 0; i < ssesize; i++)
        {
            __m128i mmk = _mm_load_si128((__m128i*)mk);
            __m128i mnd = _mm_load_si128((__m128i*)nd);

            __m128i mmn = _mm_load_si128((__m128i*)n);
            __m128i mmx = _mm_load_si128((__m128i*)x);
            __m128i msk = _mm_cmpeq_epi8(mnd, mmk);
            _mm_stream_si128((__m128i*)d, _mm_blendv_epi8(mmx, mmn, msk));

            nd += 16;
            mk += 16;
            d += 16;
            n += 16;
            x += 16;
        }
    }
    else if (src.depth() == CV_16S || src.depth() == CV_16U)
    {
        const int ssesize = src.size().area() / 8;
        remsize = src.size().area() - ssesize * 8;
        for (int i = 0; i < ssesize; i++)
        {
            __m128i mmk = _mm_load_si128((__m128i*)mk);
            __m128i mnd = _mm_load_si128((__m128i*)nd);

            __m128i mmn = _mm_load_si128((__m128i*)n);
            __m128i mmx = _mm_load_si128((__m128i*)x);
            __m128i msk = _mm_cmpeq_epi16(mnd, mmk);
            _mm_stream_si128((__m128i*)d, _mm_blendv_epi8(mmx, mmn, msk));

            nd += 8;
            mk += 8;
            d += 8;
            n += 8;
            x += 8;
        }
    }
    else if (src.depth() == CV_32F)
    {
        const int ssesize = src.size().area() / 4;
        remsize = src.size().area() - ssesize * 4;
        for (int i = 0; i < ssesize; i++)
        {
            __m128 mmk = _mm_load_ps((float*)mk);
            __m128 mnd = _mm_load_ps((float*)nd);

            __m128 mmn = _mm_load_ps((float*)n);
            __m128 mmx = _mm_load_ps((float*)x);
            __m128 msk = _mm_cmpeq_ps(mnd, mmk);
            _mm_stream_ps((float*)d, _mm_blendv_ps(mmx, mmn, msk));

            nd += 4;
            mk += 4;
            d += 4;
            n += 4;
            x += 4;
        }
    }
    else if (src.depth() == CV_64F)
    {
        const int ssesize = src.size().area() / 2;
        remsize = src.size().area() - ssesize * 2;
        for (int i = 0; i < ssesize; i++)
        {
            __m128d mmk = _mm_load_pd((double*)mk);
            __m128d mnd = _mm_load_pd((double*)nd);

            __m128d mmn = _mm_load_pd((double*)n);
            __m128d mmx = _mm_load_pd((double*)x);
            __m128d msk = _mm_cmpeq_pd(mnd, mmk);
            _mm_stream_pd((double*)d, _mm_blendv_pd(mmx, mmn, msk));

            nd += 2;
            mk += 2;
            d += 2;
            n += 2;
            x += 2;
        }
    }
#endif
    for (int i = 0; i < remsize; i++)
    {
        if (nd[i] == mk[i])
        {
            d[i] = n[i];
        }
        else
        {
            d[i] = x[i];
        }
    }
}
bool validate_utf8_sse(const char *src, size_t len) {
  const char *end = src + len;
  while (src + 16 < end) {
    __m128i chunk = _mm_loadu_si128((const __m128i *)(src));

    int asciiMask = _mm_movemask_epi8(chunk);
    if (!asciiMask) {
      src += 16;
      continue;
    }

    __m128i chunk_signed = _mm_add_epi8(chunk, _mm_set1_epi8(0x80));
    __m128i cond2 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunk_signed);
    __m128i state = _mm_set1_epi8((char)(0x0 | 0x80));

    state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x2 | 0xc0)), cond2);

    __m128i cond3 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunk_signed);
    state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x3 | 0xe0)), cond3);

    __m128i mask3 = _mm_slli_si128(cond3, 1);

    __m128i cond4 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunk_signed);

    // Fall back to the scalar processing
    if (_mm_movemask_epi8(cond4)) {
      break;
    }

    __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));

    __m128i count_sub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));

    __m128i counts = _mm_add_epi8(count, _mm_slli_si128(count_sub1, 1));

    __m128i shifts = count_sub1;
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1));
    counts = _mm_add_epi8(
        counts, _mm_slli_si128(_mm_subs_epu8(counts, _mm_set1_epi8(0x2)), 2));
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2));

    if (asciiMask ^
        _mm_movemask_epi8(_mm_cmpgt_epi8(counts, _mm_set1_epi8(0))))
      return false;  // error
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4));

    if (_mm_movemask_epi8(
            _mm_cmpgt_epi8(_mm_sub_epi8(_mm_slli_si128(counts, 1), counts),
                           _mm_set1_epi8(1))))
      return false;  // error

    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8));

    __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8));
    shifts =
        _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2)));  // <= 1

    chunk =
        _mm_andnot_si128(mask, chunk);  // from now on, we only have useful bits

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1));

    __m128i chunk_right = _mm_slli_si128(chunk, 1);

    __m128i chunk_low = _mm_blendv_epi8(
        chunk,
        _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6),
                                          _mm_set1_epi8(0xc0))),
        _mm_cmpeq_epi8(counts, _mm_set1_epi8(1)));

    __m128i chunk_high =
        _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2)));

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2));
    chunk_high = _mm_srli_epi32(chunk_high, 2);

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4));
    chunk_high = _mm_or_si128(
        chunk_high, _mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4),
                                                _mm_set1_epi8(0xf0)),
                                  mask3));

    int c = _mm_extract_epi16(counts, 7);
    int source_advance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 15 : 14;

    __m128i high_bits = _mm_and_si128(chunk_high, _mm_set1_epi8(0xf8));
    if (!_mm_testz_si128(
            mask3,
            _mm_or_si128(_mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0x00)),
                         _mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0xd8)))))
      return false;

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 4), 8));

    chunk_high = _mm_slli_si128(chunk_high, 1);

    __m128i shuf =
        _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
                                          5, 4, 3, 2, 1, 0));

    chunk_low = _mm_shuffle_epi8(chunk_low, shuf);
    chunk_high = _mm_shuffle_epi8(chunk_high, shuf);

    __m128i utf16_low = _mm_unpacklo_epi8(chunk_low, chunk_high);
    __m128i utf16_high = _mm_unpackhi_epi8(chunk_low, chunk_high);

    if (_mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_high, 8,
                     _SIDD_UWORD_OPS | _SIDD_CMP_RANGES) |
        _mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_low, 8,
                     _SIDD_UWORD_OPS | _SIDD_CMP_RANGES)) {
      return false;
    }

    src += source_advance;
  }
  return validate_utf8(src, end - src);
}
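/*
 * A minimal standalone sketch (assumptions mine, not part of the validator
 * above) of its ASCII fast path: _mm_movemask_epi8 collects the sign bit of
 * every byte, so a zero mask means all 16 bytes are below 0x80 and the chunk
 * can be accepted without any further UTF-8 state tracking.
 */
#include <emmintrin.h>
#include <stdbool.h>

static bool chunk_is_ascii(const char *p) {
  const __m128i chunk = _mm_loadu_si128((const __m128i *)p);
  return _mm_movemask_epi8(chunk) == 0;
}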
__m128i test_blendv_epi8(__m128i V1, __m128i V2, __m128i V3) {
  // CHECK-LABEL: test_blendv_epi8
  // CHECK: call <16 x i8> @llvm.x86.sse41.pblendvb
  // CHECK-ASM: pblendvb %xmm{{.*}}, %xmm{{.*}}
  return _mm_blendv_epi8(V1, V2, V3);
}
__m128i test_mm_blendv_epi8(__m128i V1, __m128i V2, __m128i V3) {
  // CHECK-LABEL: test_mm_blendv_epi8
  // CHECK: call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
  return _mm_blendv_epi8(V1, V2, V3);
}