Code Example #1
File: fast.cpp Project: 165-goethals/opencv
template<int patternSize>
void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression)
{
    Mat img = _img.getMat();
    const int K = patternSize/2, N = patternSize + K + 1;
#if CV_SSE2
    const int quarterPatternSize = patternSize/4;
    (void)quarterPatternSize;
#endif
    int i, j, k, pixel[25];
    makeOffsets(pixel, (int)img.step, patternSize);

    keypoints.clear();

    threshold = std::min(std::max(threshold, 0), 255);

#if CV_SSE2
    __m128i delta = _mm_set1_epi8(-128), t = _mm_set1_epi8((char)threshold), K16 = _mm_set1_epi8((char)K);
    (void)K16;
    (void)delta;
    (void)t;
#endif
    uchar threshold_tab[512];
    for( i = -255; i <= 255; i++ )
        threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0);

    AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128);
    uchar* buf[3];
    buf[0] = _buf; buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols;
    int* cpbuf[3];
    cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1;
    cpbuf[1] = cpbuf[0] + img.cols + 1;
    cpbuf[2] = cpbuf[1] + img.cols + 1;
    memset(buf[0], 0, img.cols*3);

    for(i = 3; i < img.rows-2; i++)
    {
        const uchar* ptr = img.ptr<uchar>(i) + 3;
        uchar* curr = buf[(i - 3)%3];
        int* cornerpos = cpbuf[(i - 3)%3];
        memset(curr, 0, img.cols);
        int ncorners = 0;

        if( i < img.rows - 3 )
        {
            j = 3;
    #if CV_SSE2
            if( patternSize == 16 )
            {
                for(; j < img.cols - 16 - 3; j += 16, ptr += 16)
                {
                    __m128i m0, m1;
                    __m128i v0 = _mm_loadu_si128((const __m128i*)ptr);
                    __m128i v1 = _mm_xor_si128(_mm_subs_epu8(v0, t), delta);
                    v0 = _mm_xor_si128(_mm_adds_epu8(v0, t), delta);

                    __m128i x0 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[0])), delta);
                    __m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[quarterPatternSize])), delta);
                    __m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[2*quarterPatternSize])), delta);
                    __m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[3*quarterPatternSize])), delta);
                    m0 = _mm_and_si128(_mm_cmpgt_epi8(x0, v0), _mm_cmpgt_epi8(x1, v0));
                    m1 = _mm_and_si128(_mm_cmpgt_epi8(v1, x0), _mm_cmpgt_epi8(v1, x1));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x1, v0), _mm_cmpgt_epi8(x2, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x1), _mm_cmpgt_epi8(v1, x2)));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x2, v0), _mm_cmpgt_epi8(x3, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x2), _mm_cmpgt_epi8(v1, x3)));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x3, v0), _mm_cmpgt_epi8(x0, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x3), _mm_cmpgt_epi8(v1, x0)));
                    m0 = _mm_or_si128(m0, m1);
                    int mask = _mm_movemask_epi8(m0);
                    if( mask == 0 )
                        continue;
                    if( (mask & 255) == 0 )
                    {
                        j -= 8;
                        ptr -= 8;
                        continue;
                    }

                    __m128i c0 = _mm_setzero_si128(), c1 = c0, max0 = c0, max1 = c0;
                    for( k = 0; k < N; k++ )
                    {
                        __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(ptr + pixel[k])), delta);
                        m0 = _mm_cmpgt_epi8(x, v0);
                        m1 = _mm_cmpgt_epi8(v1, x);

                        c0 = _mm_and_si128(_mm_sub_epi8(c0, m0), m0);
                        c1 = _mm_and_si128(_mm_sub_epi8(c1, m1), m1);

                        max0 = _mm_max_epu8(max0, c0);
                        max1 = _mm_max_epu8(max1, c1);
                    }

                    max0 = _mm_max_epu8(max0, max1);
                    int m = _mm_movemask_epi8(_mm_cmpgt_epi8(max0, K16));

                    for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
                        if(m & 1)
                        {
                            cornerpos[ncorners++] = j+k;
                            if(nonmax_suppression)
                                curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
                        }
                }
            }
    #endif
            for( ; j < img.cols - 3; j++, ptr++ )
            {
                int v = ptr[0];
                const uchar* tab = &threshold_tab[0] - v + 255;
                int d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];

                if( d == 0 )
                    continue;

                d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
                d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
                d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];

                if( d == 0 )
                    continue;

                d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
                d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
                d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
                d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];

                if( d & 1 )
                {
                    int vt = v - threshold, count = 0;

                    for( k = 0; k < N; k++ )
                    {
                        int x = ptr[pixel[k]];
                        if(x < vt)
                        {
                            if( ++count > K )
                            {
                                cornerpos[ncorners++] = j;
                                if(nonmax_suppression)
                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }

                if( d & 2 )
                {
                    int vt = v + threshold, count = 0;

                    for( k = 0; k < N; k++ )
                    {
                        int x = ptr[pixel[k]];
                        if(x > vt)
                        {
                            if( ++count > K )
                            {
                                cornerpos[ncorners++] = j;
                                if(nonmax_suppression)
                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }
            }
        }

        cornerpos[-1] = ncorners;

        if( i == 3 )
            continue;

        const uchar* prev = buf[(i - 4 + 3)%3];
        const uchar* pprev = buf[(i - 5 + 3)%3];
        cornerpos = cpbuf[(i - 4 + 3)%3];
        ncorners = cornerpos[-1];

        for( k = 0; k < ncorners; k++ )
        {
            j = cornerpos[k];
            int score = prev[j];
            if( !nonmax_suppression ||
               (score > prev[j+1] && score > prev[j-1] &&
                score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
                score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
            {
                keypoints.push_back(KeyPoint((float)j, (float)(i-1), 7.f, -1, (float)score));
            }
        }
    }
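The 512-entry threshold_tab above is the heart of the scalar path: one table lookup classifies a neighbor pixel as darker (1), brighter (2), or similar (0) relative to the center pixel. A minimal standalone sketch of the same trick (illustrative only, not part of OpenCV):

#include <cstdio>

// Build the same 512-entry table FAST_t uses: index is (neighbor - center + 255).
static unsigned char threshold_tab[512];

static void init_tab(int threshold) {
    for (int i = -255; i <= 255; i++)
        threshold_tab[i + 255] = (unsigned char)(i < -threshold ? 1 : i > threshold ? 2 : 0);
}

int main() {
    init_tab(10);
    const int center = 100;
    // Shifted pointer, as in FAST_t: tab[x] == threshold_tab[x - center + 255]
    const unsigned char* tab = &threshold_tab[0] - center + 255;
    const int neighbors[4] = { 80, 100, 105, 130 };
    for (int n : neighbors)
        std::printf("neighbor %3d -> class %d\n", n, tab[n]); // 1=darker, 0=similar, 2=brighter
    return 0;
}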
Code Example #2
File: sum_squares_sse2.c Project: brion/aomedia
static uint64_t
aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) {
  int r, c;

  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
  __m128i v_acc_q = _mm_setzero_si128();

  for (r = 0; r < size; r += 8) {
    __m128i v_acc_d = _mm_setzero_si128();

    for (c = 0; c < size; c += 8) {
      const int16_t *b = src + c;

      const __m128i v_val_0_w =
          _mm_load_si128((const __m128i *)(b + 0 * stride));
      const __m128i v_val_1_w =
          _mm_load_si128((const __m128i *)(b + 1 * stride));
      const __m128i v_val_2_w =
          _mm_load_si128((const __m128i *)(b + 2 * stride));
      const __m128i v_val_3_w =
          _mm_load_si128((const __m128i *)(b + 3 * stride));
      const __m128i v_val_4_w =
          _mm_load_si128((const __m128i *)(b + 4 * stride));
      const __m128i v_val_5_w =
          _mm_load_si128((const __m128i *)(b + 5 * stride));
      const __m128i v_val_6_w =
          _mm_load_si128((const __m128i *)(b + 6 * stride));
      const __m128i v_val_7_w =
          _mm_load_si128((const __m128i *)(b + 7 * stride));

      const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
      const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
      const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
      const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
      const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
      const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
      const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
      const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);

      const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
      const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
      const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
      const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);

      const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
      const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);

      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
    }

    v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
    v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));

    src += 8 * stride;
  }

  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));

#if ARCH_X86_64
  return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
#else
  {
    uint64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
    return tmp;
  }
#endif
}
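A plain scalar reference is useful for sanity-checking the SIMD path; a minimal sketch, assuming the same contract (size a multiple of 8, rows reachable through stride):

#include <stdint.h>

/* Scalar reference: sum of squared int16 samples over a size x size block.
   Computes the same value the SSE2 version accumulates. */
static uint64_t sum_squares_2d_i16_nxn_c(const int16_t *src, int stride, int size) {
  uint64_t acc = 0;
  for (int r = 0; r < size; r++)
    for (int c = 0; c < size; c++) {
      const int64_t v = src[r * stride + c];
      acc += (uint64_t)(v * v);
    }
  return acc;
}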
Code Example #3
int bit_vec_filter_m128_sse11(uint8_t *read_vec, uint8_t *ref_vec, int length,
		int max_error) {
	const __m128i zero_mask = _mm_set1_epi8(0x00);
	const __m128i one_mask = _mm_set1_epi8(0xff);

	int total_byte = (length - 1) / BYTE_BASE_NUM11 + 1;

	int total_difference = 0;

	//Start iteration
	int i, j;
	//read data
	__m128i prev_read_XMM = _mm_set1_epi8(0x0);
	__m128i curr_read_XMM = *((__m128i *) (read_vec));
	//ref data
	__m128i prev_ref_XMM = _mm_set1_epi8(0x0);
	__m128i curr_ref_XMM = *((__m128i *) (ref_vec));

	__m128i read_XMM;
	__m128i ref_XMM;
	__m128i temp_diff_XMM;
	__m128i diff_XMM;
	__m128i mask;
	for (i = 0; i < total_byte; i += SSE_BYTE_NUM) {

		curr_read_XMM = *((__m128i *) (read_vec + i));
		curr_ref_XMM = *((__m128i *) (ref_vec + i));

		diff_XMM = _mm_xor_si128(curr_read_XMM, curr_ref_XMM);
		diff_XMM = xor11complement_sse(diff_XMM);
		

		if (i + SSE_BYTE_NUM >= total_byte) {
			if (length % SSE_BASE_NUM11) {
				mask = _mm_load_si128(
						(__m128i *) (MASK_SSE_END11
								+ (length % SSE_BASE_NUM11) * SSE_BYTE_NUM));
				diff_XMM = _mm_and_si128(mask, diff_XMM);
			}
		}

		for (j = 1; j <= max_error; j++) {
			//Right shift read
			read_XMM = shift_right_sse11(prev_read_XMM, curr_read_XMM, j);
			
			temp_diff_XMM = _mm_xor_si128(read_XMM, curr_ref_XMM);
			temp_diff_XMM = xor11complement_sse(temp_diff_XMM);
			
			if (i == 0) {
				mask = _mm_load_si128(
						(__m128i *) (MASK_SSE_BEG11 + (j - 1) * SSE_BYTE_NUM));

				temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM);
			}
			if (i + SSE_BYTE_NUM >= total_byte) {
				if (length % SSE_BASE_NUM11) {
					mask = _mm_load_si128(
							(__m128i *) (MASK_SSE_END11
									+ (length % SSE_BASE_NUM11) * SSE_BYTE_NUM));
					temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM);
				}
			}

			diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM);

			//Right shift ref
			ref_XMM = shift_right_sse11(prev_ref_XMM, curr_ref_XMM, j);
			
			temp_diff_XMM = _mm_xor_si128(curr_read_XMM, ref_XMM);
			temp_diff_XMM = xor11complement_sse(temp_diff_XMM);
			
			if (i == 0) {
				mask = _mm_load_si128(
						(__m128i *) (MASK_SSE_BEG11 + (j - 1) * SSE_BYTE_NUM));

				temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM);
			}
			if (i + SSE_BYTE_NUM >= total_byte) {
				if (length % SSE_BASE_NUM11) {
					mask = _mm_load_si128(
							(__m128i *) (MASK_SSE_END11
									+ (length % SSE_BASE_NUM11) * SSE_BYTE_NUM));
					temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM);
				}
			}

			diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM);
		}

		total_difference += popcount11_m128i_sse(diff_XMM);

		prev_read_XMM = curr_read_XMM;
		prev_ref_XMM = curr_ref_XMM;

		if (total_difference > max_error)
			return 0;
	}

	return 1;
}
Code Example #4
//-------------------------------------------------------------------------------
// For each tile go through all the bins and process all the triangles in it.
// Rasterize each triangle to the CPU depth buffer. 
//-------------------------------------------------------------------------------
void DepthBufferRasterizerSSEST::RasterizeBinnedTrianglesToDepthBuffer(UINT tileId, UINT idx)
{
	// Set DAZ and FTZ MXCSR bits to flush denormals to zero (i.e., make it faster)
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	__m128i colOffset = _mm_setr_epi32(0, 1, 0, 1);
	__m128i rowOffset = _mm_setr_epi32(0, 0, 1, 1);

	__m128i fxptZero = _mm_setzero_si128();
	float* pDepthBuffer = (float*)mpRenderTargetPixels[idx]; 

	// Based on TaskId determine which tile to process
	UINT screenWidthInTiles = SCREENW/TILE_WIDTH_IN_PIXELS;
    UINT tileX = tileId % screenWidthInTiles;
    UINT tileY = tileId / screenWidthInTiles;

    int tileStartX = tileX * TILE_WIDTH_IN_PIXELS;
	int tileEndX   = tileStartX + TILE_WIDTH_IN_PIXELS - 1;
	
	int tileStartY = tileY * TILE_HEIGHT_IN_PIXELS;
	int tileEndY   = tileStartY + TILE_HEIGHT_IN_PIXELS - 1;

	ClearDepthTile(tileStartX, tileStartY, tileEndX+1, tileEndY+1, idx);

	UINT bin = 0;
	UINT binIndex = 0;
	UINT offset1 = YOFFSET1_ST * tileY + XOFFSET1_ST * tileX;
	UINT offset2 = YOFFSET2_ST * tileY + XOFFSET2_ST * tileX;
	UINT numTrisInBin = mpNumTrisInBin[idx][offset1 + bin];

	__m128 gatherBuf[4][3];
	bool done = false;
	bool allBinsEmpty = true;
	mNumRasterizedTris[idx][tileId] = numTrisInBin;
	while(!done)
	{
		// Loop through all the bins and process 4 binned triangles at a time
		UINT ii;
		int numSimdTris = 0;
		for(ii = 0; ii < SSE; ii++)
		{
			while(numTrisInBin <= 0)
			{
				 // This bin is empty.  Move to next bin.
				if(++bin >= 1)
				{
					break;
				}
				numTrisInBin = mpNumTrisInBin[idx][offset1 + bin];
				mNumRasterizedTris[idx][tileId] += numTrisInBin;
				binIndex = 0;
			}
			if(!numTrisInBin)
			{
				break; // No more tris in the bins
			}
			USHORT modelId = mpBinModel[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			USHORT meshId = mpBinMesh[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			UINT triIdx = mpBin[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			mpTransformedModels1[modelId].Gather(gatherBuf[ii], meshId, triIdx, idx);
			allBinsEmpty = false;
			numSimdTris++; 

			++binIndex;
			--numTrisInBin;
		}
		done = bin >= NUM_XFORMVERTS_TASKS;
		
		if(allBinsEmpty)
		{
			return;
		}

		// use fixed-point only for X and Y.  Avoid work for Z and W.
        __m128i fxPtX[3], fxPtY[3];
		__m128 Z[3];
		for(int i = 0; i < 3; i++)
		{
			__m128 v0 = gatherBuf[0][i];
			__m128 v1 = gatherBuf[1][i];
			__m128 v2 = gatherBuf[2][i];
			__m128 v3 = gatherBuf[3][i];

  			// transpose into SoA layout
			_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
			fxPtX[i] = _mm_cvtps_epi32(v0);
			fxPtY[i] = _mm_cvtps_epi32(v1);
			Z[i] = v2;
		}

		// Fab(x, y) =     Ax       +       By     +      C              = 0
		// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]);
		__m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]);
		__m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]);
		__m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]);
		__m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1]));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2]));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0]));

		// Compute triangle area
		__m128i triArea = _mm_mullo_epi32(B2, A1);
		triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2));
		__m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea));

		Z[1] = _mm_mul_ps(_mm_sub_ps(Z[1], Z[0]), oneOverTriArea);
		Z[2] = _mm_mul_ps(_mm_sub_ps(Z[2], Z[0]), oneOverTriArea);

		// Use bounding box traversal strategy to determine which pixels to rasterize 
		__m128i startX = _mm_and_si128(Max(Min(Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(tileStartX)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endX   = Min(_mm_add_epi32(Max(Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndX));

		__m128i startY = _mm_and_si128(Max(Min(Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(tileStartY)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endY   = Min(_mm_add_epi32(Max(Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndY));

		// Now we have 4 triangles set up.  Rasterize them each individually.
        for(int lane=0; lane < numSimdTris; lane++)
        {
			// Extract this triangle's properties from the SIMD versions
            __m128 zz[3];
			for(int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm_set1_ps(Z[vv].m128_f32[lane]);
			}
									
			int startXx = startX.m128i_i32[lane];
			int endXx	= endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy	= endY.m128i_i32[lane];
		
			__m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]);
			__m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]);
			__m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]);

			__m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]);
			__m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]);
			__m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]);

			__m128i aa0Inc = _mm_slli_epi32(aa0, 1);
			__m128i aa1Inc = _mm_slli_epi32(aa1, 1);
			__m128i aa2Inc = _mm_slli_epi32(aa2, 1);

			__m128i row, col;

			// Traverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X
			// This method provides better performance
			int rowIdx = (startYy * SCREENW + 2 * startXx);

			col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx));
			__m128i aa0Col = _mm_mullo_epi32(aa0, col);
			__m128i aa1Col = _mm_mullo_epi32(aa1, col);
			__m128i aa2Col = _mm_mullo_epi32(aa2, col);

			row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy));
			__m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), _mm_set1_epi32(C0.m128i_i32[lane]));
			__m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), _mm_set1_epi32(C1.m128i_i32[lane]));
			__m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), _mm_set1_epi32(C2.m128i_i32[lane]));

			__m128i sum0Row = _mm_add_epi32(aa0Col, bb0Row);
			__m128i sum1Row = _mm_add_epi32(aa1Col, bb1Row);
			__m128i sum2Row = _mm_add_epi32(aa2Col, bb2Row);

			__m128i bb0Inc = _mm_slli_epi32(bb0, 1);
			__m128i bb1Inc = _mm_slli_epi32(bb1, 1);
			__m128i bb2Inc = _mm_slli_epi32(bb2, 1);

			__m128 zx = _mm_mul_ps(_mm_cvtepi32_ps(aa1Inc), zz[1]);
			zx = _mm_add_ps(zx, _mm_mul_ps(_mm_cvtepi32_ps(aa2Inc), zz[2]));

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for(int r = startYy; r < endYy; r += 2,
											rowIdx += 2 * SCREENW,
											sum0Row = _mm_add_epi32(sum0Row, bb0Inc),
											sum1Row = _mm_add_epi32(sum1Row, bb1Inc),
											sum2Row = _mm_add_epi32(sum2Row, bb2Inc))
			{
				// Compute barycentric coordinates 
				int index = rowIdx;
				__m128i alpha = sum0Row;
				__m128i beta = sum1Row;
				__m128i gama = sum2Row;

				//Compute barycentric-interpolated depth
				__m128 depth = zz[0];
				depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1]));
				depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2]));

				for(int c = startXx; c < endXx; c += 2,
												index += 4,
												alpha = _mm_add_epi32(alpha, aa0Inc),
												beta  = _mm_add_epi32(beta, aa1Inc),
												gama  = _mm_add_epi32(gama, aa2Inc),
												depth = _mm_add_ps(depth, zx))
				{
					//Test Pixel inside triangle
					__m128i mask = _mm_or_si128(_mm_or_si128(alpha, beta), gama);
					
					__m128 previousDepthValue = _mm_load_ps(&pDepthBuffer[index]);
					__m128 mergedDepth = _mm_max_ps(depth, previousDepthValue);
					__m128 finalDepth = _mm_blendv_ps(mergedDepth, previousDepthValue, _mm_castsi128_ps(mask));
					_mm_store_ps(&pDepthBuffer[index], finalDepth);
				}//for each column											
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles
}
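The rasterizer above evaluates the edge function Fab(x, y) = A*x + B*y + C at each pixel; a pixel is covered when all three edge values carry the triangle's winding sign (the SIMD loop ORs alpha, beta, and gama together and rejects on the sign bit). A minimal scalar sketch of that inside test, standalone and for illustration only:

#include <stdio.h>

/* Edge function for the segment (ax,ay)->(bx,by) evaluated at (x,y):
   F(x,y) = (ay - by)*x + (bx - ax)*y + (ax*by - bx*ay) */
static int edge(int ax, int ay, int bx, int by, int x, int y) {
    return (ay - by) * x + (bx - ax) * y + (ax * by - bx * ay);
}

int main(void) {
    /* Counter-clockwise triangle (0,0)-(8,0)-(0,8), test point (2,2) */
    int e0 = edge(8, 0, 0, 8, 2, 2);
    int e1 = edge(0, 8, 0, 0, 2, 2);
    int e2 = edge(0, 0, 8, 0, 2, 2);
    /* Inside when no edge value is negative (for this winding). */
    printf("inside = %d\n", e0 >= 0 && e1 >= 0 && e2 >= 0); /* prints 1 */
    return 0;
}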
Code Example #5
void flip_false_zero(__m128i& vec) {

//	printf("vec: \t\t");
//	print128_bit(vec);

	//For not crossing bits
	__m128i *boundary= (__m128i *) MASK_7F;
//	printf("MASK_7F: \t");
//	print128_bit(*boundary);

	__m128i shift = _mm_and_si128(*boundary, vec);
	
//	printf("After and: \t");
//	print128_bit(shift);

	__m128i *mask = (__m128i *) MASK_0TO1;

	shift = _mm_shuffle_epi8(*mask, shift);
	vec = _mm_or_si128(vec, shift);
	
//	printf("Last cases %d: \t", 0);
//	print128_bit(vec);

	int i;
	for (i = 1; i < 4; i++) {
		shift = _mm_srli_epi16(vec, i);
		shift = _mm_and_si128(*boundary, shift);
//		printf("shift %d: \t", i);
//		print128_bit(shift);
		shift = _mm_shuffle_epi8(*mask, shift);
//		printf("shuffle %d: \t", i);
//		print128_bit(shift);
		shift = _mm_slli_epi16(shift, i);
		vec = _mm_or_si128(vec, shift);
//		printf("Last cases %d: \t", i);
//		print128_bit(vec);
	}

	//For the crossing bits
	__m128i shifted_vec = shift_right_sse1(vec, 4);
//	printf("shifted_vec: \t");
//	print128_bit(shifted_vec);

	shift = _mm_and_si128(*boundary, shifted_vec);
//	printf("After and: \t");
//	print128_bit(shift);
	
	shift = _mm_shuffle_epi8(*mask, shift);
	shifted_vec = _mm_or_si128(shifted_vec, shift);
//	printf("Cross cases %d: \t", 0);
//	print128_bit(shifted_vec);

	for (i = 1; i < 4; i++) {
		shift = _mm_srli_epi16(shifted_vec, i);
		shift = _mm_and_si128(*boundary, shift);
		shift = _mm_shuffle_epi8(*mask, shift);
		shift = _mm_slli_epi16(shift, i);
		shifted_vec = _mm_or_si128(shifted_vec, shift);
//		printf("Cross cases %d: \t", i);
//		print128_bit(shifted_vec);
	}

	shifted_vec = shift_left_sse1(shifted_vec, 4);
	vec = _mm_or_si128(shifted_vec, vec);
//	printf("Final case: \t");
//	print128_bit(vec);

	
}
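The core idiom in this routine is _mm_shuffle_epi8 used as a 16-entry per-byte lookup table, with MASK_7F-style masking keeping every index inside the low nibble. A standalone sketch of the idiom (the table contents here are illustrative, not the project's MASK_0TO1 data):

#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 */

int main(void) {
    /* 16-entry table: entry i holds a precomputed value for nibble i.
       Here it is the popcount of i, purely for illustration. */
    const __m128i lut = _mm_setr_epi8(0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4);
    const __m128i low_nibble = _mm_set1_epi8(0x0F);

    __m128i v   = _mm_setr_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15);
    __m128i idx = _mm_and_si128(v, low_nibble);  /* keep indices 0..15 */
    __m128i out = _mm_shuffle_epi8(lut, idx);    /* per-byte table lookup */

    uint8_t r[16];
    _mm_storeu_si128((__m128i *)r, out);
    for (int i = 0; i < 16; i++) printf("%d ", r[i]);  /* 0 1 1 2 1 2 ... */
    printf("\n");
    return 0;
}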
Code Example #6
int bit_vec_filter_no_flipping_m128_sse1(uint8_t *read_vec0, uint8_t *read_vec1, uint8_t
				*ref_vec0, uint8_t *ref_vec1, __m128i mask, int max_error) {

	int total_difference = 0;

	//Start iteration
	int j;
	//read data
	__m128i read_XMM0 = *((__m128i *) (read_vec0));
	__m128i read_XMM1 = *((__m128i *) (read_vec1));
	//ref data
	__m128i ref_XMM0 = *((__m128i *) (ref_vec0));
	__m128i ref_XMM1 = *((__m128i *) (ref_vec1));

	__m128i shift_XMM;
	__m128i diff_XMM;
	__m128i temp_diff_XMM;
	__m128i temp_shift_XMM;
	__m128i temp_mask;

	diff_XMM = _mm_xor_si128(read_XMM0, ref_XMM0);
	temp_diff_XMM = _mm_xor_si128(read_XMM1, ref_XMM1);
	diff_XMM = _mm_or_si128(diff_XMM, temp_diff_XMM);

	//printf("diff_XMM: \n");
	//print128_bit_twice(diff_XMM);

	for (j = 1; j <= max_error; j++) {
		temp_mask = _mm_load_si128( (__m128i *) (MASK_SSE_BEG1 + (j - 1) *
								SSE_BYTE_NUM));
		temp_mask = _mm_and_si128(temp_mask, mask);
		
		//Right shift read
		shift_XMM = shift_right_sse1(read_XMM0, j);
		temp_diff_XMM = _mm_xor_si128(shift_XMM, ref_XMM0);
		shift_XMM = shift_right_sse1(read_XMM1, j);
		temp_shift_XMM = _mm_xor_si128(shift_XMM, ref_XMM1);
		temp_diff_XMM = _mm_or_si128(temp_shift_XMM, temp_diff_XMM);
		temp_diff_XMM = _mm_and_si128(temp_diff_XMM, temp_mask);
//		printf("Before flip: \t");
//		print128_bit(temp_diff_XMM);
//		flip_false_zero(temp_diff_XMM); //No flipping
//		printf("After flip: \t");
//		print128_bit(temp_diff_XMM);
		diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM);

		//printf("read shift %d diff_XMM: \n", j);
		//print128_bit_twice(diff_XMM);

		//Right shift ref
		shift_XMM = shift_right_sse1(ref_XMM0, j);
		temp_diff_XMM = _mm_xor_si128(shift_XMM, read_XMM0);
		shift_XMM = shift_right_sse1(ref_XMM1, j);
		temp_shift_XMM = _mm_xor_si128(shift_XMM, read_XMM1);
		temp_diff_XMM = _mm_or_si128(temp_shift_XMM, temp_diff_XMM);
		temp_diff_XMM = _mm_and_si128(temp_diff_XMM, temp_mask);
//		printf("Before flip: \t");
//		print128_bit(temp_diff_XMM);
//		flip_false_zero(temp_diff_XMM); //No flipping
//		printf("After flip: \t");
//		print128_bit(temp_diff_XMM);
		diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM);
		
		//printf("ref shift %d diff_XMM: \n", j);
		//print128_bit_twice(diff_XMM);
	}

	total_difference = popcount1_m128i_sse(diff_XMM);

	if (total_difference > max_error)
		return 0;
	else
		return 1;
}
Code Example #7
File: test_intrinsics.c Project: lglin/EC527Project
void InvSubBytes_sse(BYTE state[][4])
{
        BYTE aes_invsbox[4][4];
        aes_invsbox[0][0] = 'a';
        aes_invsbox[0][1] = 'b';
        aes_invsbox[0][2] = 'c';
        aes_invsbox[0][3] = 'd';
        aes_invsbox[1][0] = 'e';
        aes_invsbox[1][1] = 'f';
        aes_invsbox[1][2] = 'g';
        aes_invsbox[1][3] = 'h';
        aes_invsbox[2][0] = 'i';
        aes_invsbox[2][1] = 'j';
        aes_invsbox[2][2] = 'k';
        aes_invsbox[2][3] = 'l';
        aes_invsbox[3][0] = 'm';
        aes_invsbox[3][1] = 'n';
        aes_invsbox[3][2] = 'o';
        aes_invsbox[3][3] = 'p';

/*      __m128i stateDiv16 = _mm_set_epi8(state[3][3] >> 4, 
                                        state[3][2] >> 4, 
                                        state[3][1] >> 4, 
                                        state[3][0] >> 4, 
                                        state[2][3] >> 4,
                                        state[2][2] >> 4,
                                        state[2][1] >> 4,
                                        state[2][0] >> 4,
                                        state[1][3] >> 4,
                                        state[1][2] >> 4,
                                        state[1][1] >> 4,
                                        state[1][0] >> 4,
                                        state[0][3] >> 4,
                                        state[0][2] >> 4,
                                        state[0][1] >> 4,
                                        state[0][0] >> 4); 
*/
        __m128i stateSse = _mm_set_epi8(state[3][3],
                                        state[3][2],
                                        state[3][1],
                                        state[3][0],
                                        state[2][3],
                                        state[2][2],
                                        state[2][1],
                                        state[2][0],
                                        state[1][3],
                                        state[1][2],
                                        state[1][1],
                                        state[1][0],
                                        state[0][3],
                                        state[0][2],
                                        state[0][1],
                                        state[0][0]);

        __m128i andFSse = _mm_set_epi8(0x0F,
                                        0x0F,
                                        0x0F,
                                        0x0F,
                                        0x0F,
                                        0x0F,
                                        0x0F,
                                        0x0F,
                                        0x0F,
                                        0x0F,
                                        0x0F,
                                        0x0F,
                                        0x0F,
                                        0x0F,
                                        0x0F,
                                        0x0F);

        __m128i stateAndFSse = _mm_and_si128(stateSse, andFSse);

        BYTE stateAndF[4][4];
        _mm_storeu_si128((__m128i *)stateAndF, stateAndFSse);

        state[0][0] = aes_invsbox[state[0][0] >> 4][stateAndF[0][0]];
        state[0][1] = aes_invsbox[state[0][1] >> 4][stateAndF[0][1]];
        state[0][2] = aes_invsbox[state[0][2] >> 4][stateAndF[0][2]];
        state[0][3] = aes_invsbox[state[0][3] >> 4][stateAndF[0][3]];
        state[1][0] = aes_invsbox[state[1][0] >> 4][stateAndF[1][0]];
        state[1][1] = aes_invsbox[state[1][1] >> 4][stateAndF[1][1]];
        state[1][2] = aes_invsbox[state[1][2] >> 4][stateAndF[1][2]];
        state[1][3] = aes_invsbox[state[1][3] >> 4][stateAndF[1][3]];
        state[2][0] = aes_invsbox[state[2][0] >> 4][stateAndF[2][0]];
        state[2][1] = aes_invsbox[state[2][1] >> 4][stateAndF[2][1]];
        state[2][2] = aes_invsbox[state[2][2] >> 4][stateAndF[2][2]];
        state[2][3] = aes_invsbox[state[2][3] >> 4][stateAndF[2][3]];
        state[3][0] = aes_invsbox[state[3][0] >> 4][stateAndF[3][0]];
        state[3][1] = aes_invsbox[state[3][1] >> 4][stateAndF[3][1]];
        state[3][2] = aes_invsbox[state[3][2] >> 4][stateAndF[3][2]];
        state[3][3] = aes_invsbox[state[3][3] >> 4][stateAndF[3][3]];
} 
Code Example #8
static int HafCpu_Histogram16Bins_DATA_U8
	(
		vx_uint32   * dstHist,
		vx_uint8      distOffset, 
		vx_uint8      distWindow,
		vx_uint32     srcWidth,
		vx_uint32     srcHeight,
		vx_uint8    * pSrcImage,
		vx_uint32     srcImageStrideInBytes
	)
{
	// offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes
	// thresh: source threshold in -128..127 range
	__m128i offset = _mm_set1_epi8((char)0x80);
	__m128i T0 = _mm_set1_epi8((char)(((distOffset ? distOffset : distWindow) - 1) ^ 0x80));
	__m128i dT = _mm_set1_epi8((char)distWindow);
	__m128i onemask = _mm_set1_epi8((char)1);
	// process one pixel row at a time that counts "pixel < srcThreshold"
	vx_uint32 count[16] = { 0 };
	vx_uint8 * srcRow = pSrcImage;
	vx_uint32 width = (srcWidth + 15) >> 4;
	for (unsigned int y = 0; y < srcHeight; y++) {
		__m128i * src = (__m128i *)srcRow;
		__m128i count0 = _mm_set1_epi8((char)0);
		__m128i count1 = _mm_set1_epi8((char)0);
		__m128i count2 = _mm_set1_epi8((char)0);
		__m128i count3 = _mm_set1_epi8((char)0);
		for (unsigned int x = 0; x < width; x++) {
			__m128i pixels = _mm_load_si128(src++);
			pixels = _mm_xor_si128(pixels, offset);
			__m128i cmpout, Tnext = T0;
			// 0..3
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count0 = _mm_add_epi32(count0, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 16);
			count0 = _mm_add_epi32(count0, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 32);
			count0 = _mm_add_epi32(count0, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 48);
			count0 = _mm_add_epi32(count0, cmpout);
			// 4..7
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count1 = _mm_add_epi32(count1, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 16);
			count1 = _mm_add_epi32(count1, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 32);
			count1 = _mm_add_epi32(count1, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 48);
			count1 = _mm_add_epi32(count1, cmpout);
			// 8..11
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count2 = _mm_add_epi32(count2, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 16);
			count2 = _mm_add_epi32(count2, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 32);
			count2 = _mm_add_epi32(count2, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 48);
			count2 = _mm_add_epi32(count2, cmpout);
			// 12..15
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count3 = _mm_add_epi32(count3, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 16);
			count3 = _mm_add_epi32(count3, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 32);
			count3 = _mm_add_epi32(count3, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 48);
			count3 = _mm_add_epi32(count3, cmpout);
		}
		srcRow += srcImageStrideInBytes;
		// move counts from count0..3 into count[]
		for (int i = 0; i < 4; i++) {
			count[ 0 + i] += M128I(count0).m128i_u16[i] + M128I(count0).m128i_u16[4 + i];
			count[ 4 + i] += M128I(count1).m128i_u16[i] + M128I(count1).m128i_u16[4 + i];
			count[ 8 + i] += M128I(count2).m128i_u16[i] + M128I(count2).m128i_u16[4 + i];
			count[12 + i] += M128I(count3).m128i_u16[i] + M128I(count3).m128i_u16[4 + i];
		}
	}
	// extract histogram from count
	if (distOffset == 0) {
		vx_uint32 last = (distWindow >= 16) ? srcWidth * srcHeight : count[15];
		for (int i = 14; i >= 0; i--) {
			count[i] = last - count[i];
			last -= count[i];
		}
		dstHist[0] = last;
		for (int i = 1; i < 16; i++)
			dstHist[i] = count[i - 1];
	}
	else {
		vx_uint32 last = srcWidth * srcHeight;
		for (int i = 15; i >= 0; i--) {
			count[i] = last - count[i];
			last -= count[i];
			dstHist[i] = count[i];
		}
	}
	return AGO_SUCCESS;
}
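The counting idiom above is worth unpacking: after the compare and the AND with onemask, each byte is 1 where the pixel exceeds the current threshold, and _mm_sad_epu8(cmpout, onemask) sums |byte - 1| over each 8-byte half, so it actually counts the bytes at or below the threshold (hence the "counts pixel < srcThreshold" comment). A minimal standalone sketch of the idiom:

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>  /* SSE2 */

int main(void) {
    const __m128i onemask = _mm_set1_epi8(1);
    const __m128i offset  = _mm_set1_epi8((char)0x80);
    /* threshold 100, moved into the signed -128..127 domain as in the code above */
    const __m128i T = _mm_set1_epi8((char)((100 - 1) ^ 0x80));

    uint8_t pix[16] = { 0, 50, 99, 100, 101, 150, 200, 255,
                        10, 20, 30, 120, 130, 140, 250, 90 };
    __m128i p = _mm_loadu_si128((const __m128i *)pix);
    p = _mm_xor_si128(p, offset);              /* unsigned -> signed domain */

    __m128i gt  = _mm_cmpgt_epi8(p, T);        /* 0xFF where pix >= 100 */
    __m128i one = _mm_and_si128(gt, onemask);  /* 1 where pix >= 100 */
    /* sad(one, 1) per byte: 0 where pix >= 100, 1 otherwise */
    __m128i cnt = _mm_sad_epu8(one, onemask);

    int below = _mm_cvtsi128_si32(cnt) + _mm_cvtsi128_si32(_mm_srli_si128(cnt, 8));
    printf("pixels below 100: %d\n", below);   /* prints 7 */
    return 0;
}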
Code Example #9
File: denoising_sse2.c Project: Corax26/libvpx
int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
                             int mc_avg_y_stride, unsigned char *running_avg_y,
                             int avg_y_stride, unsigned char *sig,
                             int sig_stride, unsigned int motion_magnitude,
                             int increase_denoising) {
  unsigned char *running_avg_y_start = running_avg_y;
  unsigned char *sig_start = sig;
  unsigned int sum_diff_thresh;
  int r;
  int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
          ? 1
          : 0;
  __m128i acc_diff = _mm_setzero_si128();
  const __m128i k_0 = _mm_setzero_si128();
  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
  const __m128i k_8 = _mm_set1_epi8(8);
  const __m128i k_16 = _mm_set1_epi8(16);
  /* Modify each level's adjustment according to motion_magnitude. */
  const __m128i l3 = _mm_set1_epi8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
  /* Difference between level 3 and level 2 is 2. */
  const __m128i l32 = _mm_set1_epi8(2);
  /* Difference between level 2 and level 1 is 1. */
  const __m128i l21 = _mm_set1_epi8(1);

  for (r = 0; r < 16; ++r) {
    /* Calculate differences */
    const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
    const __m128i v_mc_running_avg_y =
        _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
    __m128i v_running_avg_y;
    const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
    const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
    /* Obtain the sign. FF if diff is negative. */
    const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
    /* Clamp absolute difference to 16 to be used to get mask. Doing this
     * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
    const __m128i clamped_absdiff =
        _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16);
    /* Get masks for l2 l1 and l0 adjustments */
    const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
    const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
    const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
    /* Get adjustments for l2, l1, and l0 */
    __m128i adj2 = _mm_and_si128(mask2, l32);
    const __m128i adj1 = _mm_and_si128(mask1, l21);
    const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
    __m128i adj, padj, nadj;

    /* Combine the adjustments and get absolute adjustments. */
    adj2 = _mm_add_epi8(adj2, adj1);
    adj = _mm_sub_epi8(l3, adj2);
    adj = _mm_andnot_si128(mask0, adj);
    adj = _mm_or_si128(adj, adj0);

    /* Restore the sign and get positive and negative adjustments. */
    padj = _mm_andnot_si128(diff_sign, adj);
    nadj = _mm_and_si128(diff_sign, adj);

    /* Calculate filtered value. */
    v_running_avg_y = _mm_adds_epu8(v_sig, padj);
    v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
    _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

    /* Adjustments <=7, and each element in acc_diff can fit in signed
     * char.
     */
    acc_diff = _mm_adds_epi8(acc_diff, padj);
    acc_diff = _mm_subs_epi8(acc_diff, nadj);

    /* Update pointers for next iteration. */
    sig += sig_stride;
    mc_running_avg_y += mc_avg_y_stride;
    running_avg_y += avg_y_stride;
  }

  {
    /* Compute the sum of all pixel differences of this MB. */
    unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
    sum_diff_thresh = SUM_DIFF_THRESHOLD;
    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
    if (abs_sum_diff > sum_diff_thresh) {
      // Before returning to copy the block (i.e., apply no denoising),
      // check if we can still apply some (weaker) temporal filtering to
      // this block, that would otherwise not be denoised at all. Simplest
      // is to apply an additional adjustment to running_avg_y to bring it
      // closer to sig. The adjustment is capped by a maximum delta, and
      // chosen such that in most cases the resulting sum_diff will be
      // within the acceptable range given by sum_diff_thresh.

      // The delta is set by the excess of absolute pixel diff over the
      // threshold.
      int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
      // Only apply the adjustment for max delta up to 3.
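      // Worked example with illustrative numbers: if abs_sum_diff = 1000 and
      // sum_diff_thresh = 448, then delta = ((1000 - 448) >> 8) + 1 = 2 + 1 = 3,
      // which is < 4, so the weaker adjustment pass below runs with k_delta = 3.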
      if (delta < 4) {
        const __m128i k_delta = _mm_set1_epi8(delta);
        sig -= sig_stride * 16;
        mc_running_avg_y -= mc_avg_y_stride * 16;
        running_avg_y -= avg_y_stride * 16;
        for (r = 0; r < 16; ++r) {
          __m128i v_running_avg_y =
              _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
          // Calculate differences.
          const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
          const __m128i v_mc_running_avg_y =
              _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
          const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
          const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
          // Obtain the sign. FF if diff is negative.
          const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
          // Clamp absolute difference to delta to get the adjustment.
          const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
          // Restore the sign and get positive and negative adjustments.
          __m128i padj, nadj;
          padj = _mm_andnot_si128(diff_sign, adj);
          nadj = _mm_and_si128(diff_sign, adj);
          // Calculate filtered value.
          v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
          v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
          _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

          // Accumulate the adjustments.
          acc_diff = _mm_subs_epi8(acc_diff, padj);
          acc_diff = _mm_adds_epi8(acc_diff, nadj);

          // Update pointers for next iteration.
          sig += sig_stride;
          mc_running_avg_y += mc_avg_y_stride;
          running_avg_y += avg_y_stride;
        }
        abs_sum_diff = abs_sum_diff_16x1(acc_diff);
        if (abs_sum_diff > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }

  vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
  return FILTER_BLOCK;
}
Code Example #10
File: sse_operators.cpp Project: Eppie/sse-popcount
__m128i operator&(sse_vector a, sse_vector b) {

    return _mm_and_si128(a.v, b.v);
}
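For context, a minimal wrapper type and call site might look like the sketch below; the struct layout is an assumption for illustration, since the project's own sse_vector definition is not shown here:

#include <cstdio>
#include <cstdint>
#include <emmintrin.h>

// Hypothetical minimal wrapper; the real sse_vector may differ.
struct sse_vector {
    __m128i v;
};

__m128i operator&(sse_vector a, sse_vector b) {
    return _mm_and_si128(a.v, b.v);
}

int main() {
    sse_vector a{ _mm_set1_epi8(0x0F) };
    sse_vector b{ _mm_set1_epi8(0x35) };
    __m128i r = a & b;
    std::uint8_t out[16];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(out), r);
    std::printf("0x%02X\n", out[0]);  // 0x0F & 0x35 = 0x05
    return 0;
}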
Code Example #11
static inline __m128i _mm_blendv_epi8_rpl(__m128i a, __m128i b, __m128i mask) {
    a = _mm_andnot_si128(mask, a);
    a = _mm_or_si128(a, _mm_and_si128(mask, b));
    return a;
}
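This emulates SSE4.1's _mm_blendv_epi8 with SSE2 ops: where mask bits are set, bytes come from b, otherwise from a. Note it blends individual bits, while the hardware instruction keys off each mask byte's top bit, so the two agree for the usual all-zeros/all-ones comparison masks. A quick standalone check:

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

static inline __m128i _mm_blendv_epi8_rpl(__m128i a, __m128i b, __m128i mask) {
    a = _mm_andnot_si128(mask, a);
    a = _mm_or_si128(a, _mm_and_si128(mask, b));
    return a;
}

int main(void) {
    __m128i a = _mm_set1_epi8(1);
    __m128i b = _mm_set1_epi8(2);
    /* comparison-style mask: alternating 0xFF / 0x00 bytes */
    __m128i mask = _mm_setr_epi8(-1,0,-1,0, -1,0,-1,0, -1,0,-1,0, -1,0,-1,0);
    uint8_t out[16];
    _mm_storeu_si128((__m128i *)out, _mm_blendv_epi8_rpl(a, b, mask));
    for (int i = 0; i < 16; i++) printf("%d ", out[i]);  /* 2 1 2 1 ... */
    printf("\n");
    return 0;
}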
Code Example #12
/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could replace both of them.
   It is almost as fast, and gives you a free cosine with your sine */
void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {

  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  v8si imm0, imm2, imm4;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
  v4si imm4_1, imm4_2;
#endif

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);
  
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

#ifdef __AVX2__    
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);

  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;

  /* get the swap sign flag for the sine */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  //v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);

  /* get the polynomial selection mask for the sine */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
  //v8sf poly_mask = _mm256_castsi256_ps(imm2);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);
  
  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm4_1 = imm2_1;
  imm4_2 = imm2_2;

  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);
  
  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

#ifdef __AVX2__
  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
  imm4 = _mm256_andnot_si256(imm4, *(v8si*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);
#else
  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si*)_pi32avx_2);
  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si*)_pi32avx_2);

  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si*)_pi32avx_4);
  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si*)_pi32avx_4);
  
  imm4_1 = _mm_slli_epi32(imm4_1, 29);
  imm4_2 = _mm_slli_epi32(imm4_2, 29);

  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
#endif

  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
  
  /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
  v8sf z = _mm256_mul_ps(x,x);
  y = *(v8sf*)_ps256_coscof_p0;

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
  
  /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2,ysin2);
  y = _mm256_sub_ps(y, ysin1);

  xmm1 = _mm256_add_ps(ysin1,ysin2);
  xmm2 = _mm256_add_ps(y,y2);
 
  /* update the sign */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
Code Example #13
/* evaluation of 8 sines at once using AVX intrinsics

   The code is the exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

*/
v8sf sin256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
  v8si imm0, imm2;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
#endif

  sign_bit = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);
  
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /*
    Here we start a series of integer operations, which are in the
    realm of AVX2.
    If we don't have AVX, let's perform them using SSE2 directives
  */

#ifdef __AVX2__
  /* store the integer part of y in mm0 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  // another two AVX2 instruction
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2,*(v8si*)_pi32_256_0);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
 
  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
  
  /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}
Code Example #14
static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    __m128i tmp1, tmp2, tmp3;

    // int m = da ? dc * 256 / da : 0;
    __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128());
    __m128i m = _mm_slli_epi32(dc, 8);
    __m128 x = _mm_cvtepi32_ps(m);
    __m128 y = _mm_cvtepi32_ps(da);
    m = _mm_cvttps_epi32(_mm_div_ps(x, y));
    m = _mm_andnot_si128(cmp, m);

    // if (2 * sc <= sa)
    tmp1 = _mm_slli_epi32(sc, 1);                      // 2 * sc
    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
    tmp1 = _mm_sub_epi32(tmp1, sa);                    // 2 * sc - sa
    tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m);      // 256 - m
    tmp1 = Multiply32_SSE2(tmp1, tmp2);
    tmp1 = _mm_srai_epi32(tmp1, 8);
    tmp1 = _mm_add_epi32(sa, tmp1);
    tmp1 = Multiply32_SSE2(dc, tmp1);
    __m128i rc1 = _mm_andnot_si128(cmp1, tmp1);

    // else if (4 * dc <= da)
    tmp2 = _mm_slli_epi32(dc, 2);                      // dc * 4
    __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da);
    __m128i i = _mm_slli_epi32(m, 2);                  // 4 * m
    __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256)); // 4 * m + 256
    __m128i k = Multiply32_SSE2(i, j);                 // 4 * m * (4 * m + 256)
    __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256)); // m - 256
    i = Multiply32_SSE2(k, t);                         // 4 * m * (4 * m + 256) * (m - 256)
    i = _mm_srai_epi32(i, 16);                         // >> 16
    j = Multiply32_SSE2(_mm_set1_epi32(7), m);         // 7 * m
    tmp2 = _mm_add_epi32(i, j);
    i = Multiply32_SSE2(dc, sa);                       // dc * sa
    j = _mm_slli_epi32(sc, 1);                         // 2 * sc
    j = _mm_sub_epi32(j, sa);                          // 2 * sc - sa
    j = Multiply32_SSE2(da, j);                        // da * (2 * sc - sa)
    tmp2 = Multiply32_SSE2(j, tmp2);                   // * tmp
    tmp2 = _mm_srai_epi32(tmp2, 8);                    // >> 8
    tmp2 = _mm_add_epi32(i, tmp2);
    cmp = _mm_andnot_si128(cmp2, cmp1);
    __m128i rc2 = _mm_and_si128(cmp, tmp2);
    __m128i rc = _mm_or_si128(rc1, rc2);

    // else
    tmp3 = sqrt_unit_byte_SSE2(m);
    tmp3 = _mm_sub_epi32(tmp3, m);
    tmp3 = Multiply32_SSE2(j, tmp3);                   // j = da * (2 * sc - sa)
    tmp3 = _mm_srai_epi32(tmp3, 8);
    tmp3 = _mm_add_epi32(i, tmp3);                     // i = dc * sa
    cmp = _mm_and_si128(cmp1, cmp2);
    __m128i rc3 = _mm_and_si128(cmp, tmp3);
    rc = _mm_or_si128(rc, rc3);

    tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da);     // 255 - da
    tmp1 = _mm_mullo_epi16(sc, tmp1);
    tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa);     // 255 - sa
    tmp2 = _mm_mullo_epi16(dc, tmp2);
    rc = _mm_add_epi32(rc, tmp1);
    rc = _mm_add_epi32(rc, tmp2);
    return clamp_div255round_SSE2(rc);
}
Code Example #15
File: AEUtil.cpp Project: AndyPeterman/mrmc
    MEMALIGN(16, __m128i mod_mask);
    MEMALIGN(16, __m128 res);
    MEMALIGN(16, static const unsigned int mult  [4]) = {214013, 17405, 214013, 69069};
    MEMALIGN(16, static const unsigned int gadd  [4]) = {2531011, 10395331, 13737667, 1};
    MEMALIGN(16, static const unsigned int mask  [4]) = {0xFFFFFFFF, 0, 0xFFFFFFFF, 0};

    adder          = _mm_load_si128((__m128i*)gadd);
    multiplier     = _mm_load_si128((__m128i*)mult);
    mod_mask       = _mm_load_si128((__m128i*)mask);
    cur_seed_split = _mm_shuffle_epi32(m_sseSeed, _MM_SHUFFLE(2, 3, 0, 1));

    m_sseSeed      = _mm_mul_epu32(m_sseSeed, multiplier);
    multiplier     = _mm_shuffle_epi32(multiplier, _MM_SHUFFLE(2, 3, 0, 1));
    cur_seed_split = _mm_mul_epu32(cur_seed_split, multiplier);

    m_sseSeed      = _mm_and_si128(m_sseSeed, mod_mask);
    cur_seed_split = _mm_and_si128(cur_seed_split, mod_mask);
    cur_seed_split = _mm_shuffle_epi32(cur_seed_split, _MM_SHUFFLE(2, 3, 0, 1));
    m_sseSeed      = _mm_or_si128(m_sseSeed, cur_seed_split);
    m_sseSeed      = _mm_add_epi32(m_sseSeed, adder);

    /* adjust the value to the range requested */
    res = _mm_cvtepi32_ps(m_sseSeed);
    if (sseresult)
      *sseresult = _mm_mul_ps(res, f);
    else
    {
      res = _mm_mul_ps(res, f);
      _mm_storeu_ps(result, res);

      /* returning a float array, so cleanup */
Code Example #16
static inline __m128i SkMin32_SSE2(const __m128i& a, const __m128i& b) {
    __m128i cmp = _mm_cmplt_epi32(a, b);
    return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, b));
}
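The comparison mask keeps a where a < b and b elsewhere, giving a branchless per-lane minimum. A quick standalone check:

#include <cstdio>
#include <cstdint>
#include <emmintrin.h>

static inline __m128i SkMin32_SSE2(const __m128i& a, const __m128i& b) {
    __m128i cmp = _mm_cmplt_epi32(a, b);
    return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, b));
}

int main() {
    __m128i a = _mm_setr_epi32(1, 50, -3, 7);
    __m128i b = _mm_setr_epi32(2, 40, -9, 7);
    std::int32_t out[4];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(out), SkMin32_SSE2(a, b));
    std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 1 40 -9 7
    return 0;
}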
Code Example #17
File: enc_sse2.c Project: 0871087123/godot
// Simple quantization
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i sign0, sign8;
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so that
  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  sign0 = _mm_srai_epi16(in0, 15);
  sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

  // if (coeff > 2047) coeff = 2047
  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);

  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // expand bias from 16b to 32b
    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
    // out = (coeff * iQ + B)
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = (coeff * iQ + B) >> QFIX;
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);
    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  {
    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  {
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      tmp[0] &= ~0xff;
    }
    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
  }
}
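For reference, a scalar sketch of the per-coefficient arithmetic the SIMD code implements (before the zigzag; the int16 saturation of the final pack is omitted), using QFIX = 17 as noted in the comments above:

#include <stdint.h>

/* Scalar sketch of one coefficient's path through QuantizeBlockSSE2. */
static int quantize_one_coeff(int in, int sharpen, int iq, int bias,
                              int q, int zthresh, int16_t *rec, int16_t *out) {
  const int sign = (in < 0);
  int coeff = (sign ? -in : in) + sharpen;  /* coeff = abs(in) + sharpen */
  if (coeff > 2047) coeff = 2047;           /* clamp to 2047 */
  if (coeff <= zthresh) {                   /* zero out small coefficients */
    *rec = 0;
    *out = 0;
    return 0;
  }
  int level = (coeff * iq + bias) >> 17;    /* out = (coeff * iQ + B) >> QFIX */
  if (sign) level = -level;                 /* restore the sign */
  *rec = (int16_t)(level * q);              /* in = out * Q */
  *out = (int16_t)level;
  return level != 0;
}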