コード例 #1
0
FORCE_INLINE void convolve_block(double* im, int ii, int jj, int N, uint32_t r, uint32_t c, double* dst){
    const double rw = 0.2989;
    const double gw = 0.5870;
    const double bw = 0.1140;
    const __m256d rgb0W = _mm256_setr_pd (rw,gw,bw,0); //rgb0
    const __m256d onehalf = _mm256_set1_pd(0.5);
    const __m256d minustwelvehalf = _mm256_set1_pd(-12.5);
    int64_t hi_bit_set = ((int64_t)1) << 63;
    const __m256i mask1110 = _mm256_setr_epi64x(hi_bit_set,hi_bit_set, hi_bit_set, 0);

    //determine non-special case boundaries
    int i_start = ii;
    int i_end = ii+N;
    int j_start = jj;
    int j_end = jj+N;
    if( ii == 0){
        i_start = 1;
    }
    if( ii + N == r){
        i_end = r-1;
    }
    if( jj == 0){
        j_start = 1;
    }
    if( jj + N == c){
        j_end = c-1;
    }

    if(ii == 0){
        if( jj == 0){
            //top left
            int i = 0;
            int j = 0;
            int center =  i*c+j;
            int bottom = (i+1)*c+j;
            int right   = i*c+j+1;
            dst[center] = single_pixel(im,
                                       3*center,
                                       3*center,3*center,3*right,3*bottom,
                                       mask1110,rgb0W,onehalf,minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
        if( jj + N == c){
            //top right
            int i = 0;
            int j = c-1;
            int center =  i*c+j;
            int bottom = (i+1)*c+j;
            int left   = i*c+j-1;
            dst[center] = single_pixel(im,
                                       3*center,
                                       3*center,3*left,3*center,3*bottom,
                                       mask1110,rgb0W,onehalf,minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
        //need to fill in top row
        int i = 0;
        for(int j = j_start; j < j_end; j++){
            int center =  i*c+j;
            int bottom = (i+1)*c+j;
            int left   = i*c+j-1;
            int right   = i*c+j+1;
            dst[center] = single_pixel(im,
                                       3*center,
                                       3*center,3*left,3*right,3*bottom,
                                       mask1110,rgb0W,onehalf,minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
    }
    if(ii + N == r){
        if( jj == 0){
            //bottom left
            int i = r-1;
            int j = 0;
            int center =  i*c+j;
            int top    = (i-1)*c+j;
            int right   = i*c+j+1;
            dst[center] = single_pixel(im,
                                       3*center,
                                       3*top,3*center,3*right,3*center,
                                       mask1110,rgb0W,onehalf,minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
        if( jj + N == c){
            //bottom right
            int i = r-1;
            int j = c-1;
            int center =  i*c+j;
            int top    = (i-1)*c+j;
            int left   = i*c+j-1;
            dst[center] = single_pixel(im,
                                       3*center,
                                       3*top,3*left,3*center,3*center,
                                       mask1110,rgb0W,onehalf,minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
        //need to fill in bottom row
        int i = r-1;
        for(int j = j_start; j < j_end; j++){
            int center =  i*c+j;
            int top    = (i-1)*c+j;
            int left   = i*c+j-1;
            int right   = i*c+j+1;
            dst[center] = single_pixel(im,
                                       3*center,
                                       3*top,3*left,3*right,3*center,
                                       mask1110,rgb0W,onehalf,minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
    }
    if( jj == 0){
        //need to fill in left edge
        int j = 0;
        for(int i = i_start; i < i_end; i++){
            int center =  i*c+j;
            int top    = (i-1)*c+j;
            int bottom = (i+1)*c+j;
            int right   = i*c+j+1;
            dst[center] = single_pixel(im,
                                       3*center,
                                       3*top,3*center,3*right,3*bottom,
                                       mask1110,rgb0W,onehalf,minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
    }
    if( jj + N == c){
        //need to fill in right edge
        int j = c-1;
        for(int i = i_start; i < i_end; i++){
            int center =  i*c+j;
            int top    = (i-1)*c+j;
            int bottom = (i+1)*c+j;
            int left   = i*c+j-1;
            dst[center] = single_pixel(im,
                                       3*center,
                                       3*top,3*left,3*center,3*bottom,
                                       mask1110,rgb0W,onehalf,minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
    }

    //now we can safely process everything inside the _start and _end margins
    //read pixels from _start-1 until _end+1 and process all inside

    for(int i = i_start; i < i_end; i++){
        for(int j = j_start; j < j_end; j++){
            int center =  i*c+j;
            int top    = (i-1)*c+j;
            int bottom = (i+1)*c+j;
            int left   = i*c+j-1;
            int right   = i*c+j+1;
            dst[center] = single_pixel(im,
                                       3*center,
                                       3*top,3*left,3*right,3*bottom,
                                       mask1110,rgb0W,onehalf,minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
    }

}
コード例 #2
0
ファイル: Primate.c プロジェクト: opolo/Bitsliced-AEAD
void mixcolumns_inv_80bit(__m256i (*state)[2]) {
	/*
	6       22      31      1       15
	15      19      8       1       20
	20      5       30      5       11
	11      3       19      8       18
	18      2       2       18      1
	*/

	__m256i T2_regs[5][2];
	__m256i T3_regs[5][2];
	__m256i T4_regs[5][2];
	__m256i T5_regs[5];
	__m256i T6_regs[5];
	__m256i T8_regs[5][2];
	__m256i T11_regs[5][2];
	__m256i T15_regs[5][2];
	__m256i T16_regs[5][2];
	__m256i T18_regs[5][2];
	__m256i T19_regs[5];
	__m256i T20_regs[5][2];
	__m256i T22_regs[5];
	__m256i T30_regs[5];
	__m256i T31_regs[5];

	T2(state, T2_regs); //T2
	T2(T2_regs, T4_regs); //T4
	T2(T4_regs, T8_regs); //T8
	T2(T8_regs, T16_regs); //T16

	for (int i = 0; i < 5; i++) {
		T3_regs[i][0] = XOR(state[i][0], T2_regs[i][0]);
		T3_regs[i][1] = XOR(state[i][1], T2_regs[i][1]);

		T5_regs[i] = XOR(state[i][0], T4_regs[i][0]);

		T6_regs[i] = XOR(T2_regs[i][0], T4_regs[i][0]);

		T11_regs[i][0] = XOR(T3_regs[i][0], T8_regs[i][0]);
		T11_regs[i][1] = XOR(T3_regs[i][1], T8_regs[i][1]);

		T15_regs[i][0] = XOR(T11_regs[i][0], T4_regs[i][0]);
		T15_regs[i][1] = XOR(T11_regs[i][1], T4_regs[i][1]);

		T18_regs[i][0] = XOR(T16_regs[i][0], T2_regs[i][0]);
		T18_regs[i][1] = XOR(T16_regs[i][1], T2_regs[i][1]);

		T19_regs[i] = XOR(T16_regs[i][0], T3_regs[i][0]);

		T20_regs[i][0] = XOR(T16_regs[i][0], T4_regs[i][0]);
		T20_regs[i][1] = XOR(T16_regs[i][1], T4_regs[i][1]);

		T22_regs[i] = XOR(T16_regs[i][0], T6_regs[i]);

		T30_regs[i] = XOR(T22_regs[i], T8_regs[i][0]);

		T31_regs[i] = XOR(T30_regs[i], state[i][0]);
	}

	for (int i = 0; i < 5; i++) {

		state[i][0] = XOR5(
			_mm256_setr_epi64x(_mm256_extract_epi64(T6_regs[i], 0), _mm256_extract_epi64(T15_regs[i][0], 0), _mm256_extract_epi64(T20_regs[i][0], 0), _mm256_extract_epi64(T11_regs[i][0], 0)),
			_mm256_setr_epi64x(_mm256_extract_epi64(T22_regs[i], 1), _mm256_extract_epi64(T19_regs[i], 1), _mm256_extract_epi64(T5_regs[i], 1), _mm256_extract_epi64(T3_regs[i][0], 1)),
			_mm256_setr_epi64x(_mm256_extract_epi64(T31_regs[i], 2), _mm256_extract_epi64(T8_regs[i][0], 2), _mm256_extract_epi64(T30_regs[i], 2), _mm256_extract_epi64(T19_regs[i], 2)),
			_mm256_setr_epi64x(_mm256_extract_epi64(state[i][0], 3), _mm256_extract_epi64(state[i][0], 3), _mm256_extract_epi64(T5_regs[i], 3), _mm256_extract_epi64(T8_regs[i][0], 3)),
			_mm256_setr_epi64x(_mm256_extract_epi64(T15_regs[i][1], 0), _mm256_extract_epi64(T20_regs[i][1], 0), _mm256_extract_epi64(T11_regs[i][1], 0), _mm256_extract_epi64(T18_regs[i][1], 0)));

		state[i][1] = XOR5(
			_mm256_setr_epi64x(_mm256_extract_epi64(T18_regs[i][0], 0), 0, 0, 0),
			_mm256_setr_epi64x(_mm256_extract_epi64(T2_regs[i][0], 1), 0, 0, 0),
			_mm256_setr_epi64x(_mm256_extract_epi64(T2_regs[i][0], 2), 0, 0, 0),
			_mm256_setr_epi64x(_mm256_extract_epi64(T18_regs[i][0], 3), 0, 0, 0),
			_mm256_setr_epi64x(_mm256_extract_epi64(state[i][1], 0), 0, 0, 0));
	}
}