// Converts two vectors of eight unsigned 16-bit sums into weighted modifiers,
// in place. For every 16-bit lane of each vector:
//   1. keep the high 16 bits of (sum * mul_constant), treated as unsigned;
//   2. add `rounding` with unsigned saturation;
//   3. logically shift right by `strength` bits;
//   4. clamp to at most 16 using an unsigned comparison;
//   5. subtract the result from 16;
//   6. multiply by `weight`, keeping the low 16 bits.
// *sum_0_u16 is scaled by mul_constants_0 and *sum_1_u16 by mul_constants_1.
// Both inputs are loaded before either output is written.
static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
                       const __m128i mul_constants_0,
                       const __m128i mul_constants_1, const int strength,
                       const int rounding, const int weight) {
  const __m128i cap = _mm_set1_epi16(16);
  const __m128i weight_lanes = _mm_set1_epi16(weight);
  const __m128i round_lanes = _mm_set1_epi16(rounding);
  // _mm_srl_epi16 takes its shift count from the low 64 bits of this operand;
  // only the low 32 bits are non-zero here.
  const __m128i shift_count = _mm_cvtsi32_si128(strength);

  __m128i lane_0 = *sum_0_u16;
  __m128i lane_1 = *sum_1_u16;

  lane_0 = _mm_adds_epu16(_mm_mulhi_epu16(lane_0, mul_constants_0),
                          round_lanes);
  lane_1 = _mm_adds_epu16(_mm_mulhi_epu16(lane_1, mul_constants_1),
                          round_lanes);

  // Unsigned min: lanes with the top bit set must clamp to 16 rather than be
  // treated as negative numbers.
  lane_0 = _mm_min_epu16(_mm_srl_epi16(lane_0, shift_count), cap);
  lane_1 = _mm_min_epu16(_mm_srl_epi16(lane_1, shift_count), cap);

  lane_0 = _mm_mullo_epi16(_mm_sub_epi16(cap, lane_0), weight_lanes);
  lane_1 = _mm_mullo_epi16(_mm_sub_epi16(cap, lane_1), weight_lanes);

  *sum_0_u16 = lane_0;
  *sum_1_u16 = lane_1;
}
static void sse4_1_test (void) { union { __m128i x[NUM / 8]; unsigned short i[NUM]; } dst, src1, src2; int i; unsigned short min; for (i = 0; i < NUM; i++) { src1.i[i] = i * i; src2.i[i] = i + 20; if ((i % 8)) src2.i[i] |= 0x8000; } for (i = 0; i < NUM; i += 8) dst.x[i / 8] = _mm_min_epu16 (src1.x[i / 8], src2.x[i / 8]); for (i = 0; i < NUM; i++) { min = src1.i[i] >= src2.i[i] ? src2.i[i] : src1.i[i]; if (min != dst.i[i]) abort (); } }
/* Test-case driver for _mm_min_epu16.
 *
 * Stores the smaller of tc->i and tc->j (compared with the fields' own types)
 * in tc->expect, then returns the lane-wise unsigned 16-bit minimum of the two
 * values, each broadcast to all eight lanes.
 * NOTE(review): struct testcase is declared elsewhere; whether the scalar
 * comparison matches the unsigned intrinsic depends on its field types. */
static __m128i test_mm_min_epu16(struct testcase *tc) {
  __m128i lhs;
  __m128i rhs;

  tc->expect = (tc->j < tc->i) ? tc->j : tc->i;

  lhs = _mm_set1_epi16(tc->i);
  rhs = _mm_set1_epi16(tc->j);
  return _mm_min_epu16(lhs, rhs);
}
// Turns eight unsigned 16-bit sums into weighted modifiers.
//
// Each lane is scaled by the matching lane of `mul_constants` (high 16 bits of
// the unsigned product), which per the original note averages the sum over the
// number of values summed (9 away from the border, 4 in corners, 6 on other
// edges). `rounding` is then added with unsigned saturation, the lane is
// shifted right by `strength`, clamped to at most 16, subtracted from 16 and
// finally multiplied by `weight` (low 16 bits kept).
static __m128i average_8(__m128i sum, const __m128i mul_constants,
                         const int strength, const int rounding,
                         const int weight) {
  // _mm_srl_epi16 reads its shift count from the low 64 bits of this operand.
  const __m128i shift_count = _mm_setr_epi32(strength, 0, 0, 0);
  const __m128i cap = _mm_set1_epi16(16);

  // modifier * 3 / index
  const __m128i scaled = _mm_mulhi_epu16(sum, mul_constants);
  const __m128i rounded = _mm_adds_epu16(scaled, _mm_set1_epi16(rounding));
  const __m128i shifted = _mm_srl_epi16(rounded, shift_count);

  // The clamp must be the unsigned (epu16, SSE4.1) min: lanes may have the top
  // bit set and would look negative to a signed compare. The original note
  // cites UINT16_MAX * NEIGHBOR_CONSTANT_4 >> 16 = 49151 (0xbfff, or -16385 as
  // a signed 16-bit value) as the largest such value.
  const __m128i clamped = _mm_min_epu16(shifted, cap);
  const __m128i inverted = _mm_sub_epi16(cap, clamped);

  return _mm_mullo_epi16(inverted, _mm_set1_epi16(weight));
}
bool WidgetAugmentedView::render() { if (!stream) return false; stream->getColorFrame(colorFrame); stream->getDepthFrame(depthFrame); // Correct the depth map if (depthCorrector == nullptr) depthBuffer = depthFrame; else depthCorrector->correct(depthFrame, depthBuffer); // Setup perspective glMatrixMode(GL_PROJECTION); glLoadIdentity(); gluPerspective(fovY, float(ColorFrame::WIDTH) / float(ColorFrame::HEIGHT), zNear, zFar); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glEnable(GL_DEPTH_TEST); glColor4f(1.0f, 1.0f, 1.0f, 1.0f); // // Draw real world (2D color image) // glDepthFunc(GL_ALWAYS); glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D, textureColor); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, ColorFrame::WIDTH, ColorFrame::HEIGHT, GL_RGBA, GL_UNSIGNED_BYTE, (GLvoid*)colorFrame.pixels); glActiveTexture(GL_TEXTURE1); glBindTexture(GL_TEXTURE_2D, textureDepth); KinectStream* kinect = dynamic_cast<KinectStream*>(stream.obj); if (kinect != nullptr) { kinect->mapColorFrameToDepthFrame(depthBuffer, OUT mapping); const NUI_DEPTH_IMAGE_POINT* src = mapping; GLushort* dest = textureDepthBuffer; GLushort* end = textureDepthBuffer + ColorFrame::SIZE; #define SRC(i) static_cast<short>(static_cast<unsigned short>((src + i)->depth)) #ifndef NOT_VECTORIZED // Vectorized assuming ColorFrame::SIZE % 8 == 0 __m128i min = _mm_set1_epi16(static_cast<short>(DepthFrame::MIN_DEPTH)); __m128i max = _mm_set1_epi16(static_cast<short>(DepthFrame::MAX_DEPTH)); __m128i _0 = _mm_setzero_si128(); for (; dest < end; dest += 8, src += 8) { __m128i v = _mm_set_epi16(SRC(7), SRC(6), SRC(5), SRC(4), SRC(3), SRC(2), SRC(1), SRC(0)); v = _mm_max_epu16(min, _mm_min_epu16(max, v)); v = _mm_blendv_epi8(v, max, _mm_cmpeq_epi16(_0, v)); _mm_store_si128((__m128i*)dest, v); } #else for (; dest < end; ++dest, ++src) { unsigned short s = SRC(0); s = (s > DepthFrame::MAX_DEPTH) ? DepthFrame::MAX_DEPTH : s; s = (s < DepthFrame::MIN_DEPTH) ? 
DepthFrame::MIN_DEPTH : s; *dest = static_cast<GLushort>(s); } #endif glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, ColorFrame::WIDTH, ColorFrame::HEIGHT, GL_RED_INTEGER, GL_UNSIGNED_SHORT, (GLvoid*)textureDepthBuffer); } glActiveTexture(GL_TEXTURE0); shader2D.bind(); RenderUtils::drawRect(-1.0f, 1.0f, 2.0f, -2.0f); shader2D.release(); // // Draw augmented world // glDepthFunc(GL_LESS); glScalef(1.0f, 1.0f, -1.0f); // Invert Z axis so that +Z is in front of the camera // A plane to test occlusion /*glColor3f(0.0f, 1.0f, 0.0f); glBegin(GL_TRIANGLE_STRIP); glVertex3f(-0.5f, -0.5f, 0.5f); glVertex3f(-0.5f, 0.5f, 2.5f); glVertex3f(0.5f, -0.5f, 2.5f); glVertex3f(0.5f, 0.5f, 4.5f); glEnd();*/ glEnable(GL_LIGHTING); // Draw the objects world.render(renderManager); glDisable(GL_LIGHTING); return true; }
// FileCheck-style codegen test for _mm_min_epu16. The directives inside the
// body expect the intrinsic to be emitted as a call to the
// llvm.x86.sse41.pminuw IR intrinsic and, for the assembly prefix, as a pminuw
// instruction operating on xmm registers. The body is kept minimal on purpose:
// the generated code is what is being checked.
__m128i test_mm_min_epu16(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_min_epu16
  // CHECK: call <8 x i16> @llvm.x86.sse41.pminuw
  // CHECK-ASM: pminuw %xmm{{.*}}, %xmm{{.*}}
  return _mm_min_epu16(x, y);
}
// FileCheck-style codegen test for _mm_min_epu16. The directives inside the
// body expect the intrinsic to lower to generic IR: an unsigned less-than
// compare (icmp ult) of the two <8 x i16> operands, immediately followed by a
// select that picks the first operand where the compare is true and the second
// otherwise. The body is kept minimal on purpose: the generated code is what
// is being checked.
__m128i test_mm_min_epu16(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_min_epu16
  // CHECK: [[CMP:%.*]] = icmp ult <8 x i16> [[X:%.*]], [[Y:%.*]]
  // CHECK-NEXT: select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
  return _mm_min_epu16(x, y);
}