0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,
                            0x4,0x5,0x6,0x7,0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
                            0x0,0x1,0x2,0x3,0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
                            0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
                            0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
                            0x4,0x5,0x6,0x7,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
                            0x0,0x1,0x2,0x3,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
                            0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF
                           };


// Compacts the four 32-bit lanes of `new`, dropping every lane equal to the lane
// that precedes it in the stream (lane 0 is compared against the last lane of `old`).
// A full 16 bytes are always written to `output`: kept lanes are packed to the front
// and the remaining bytes are zero-filled by the shuffle. Returns how many lanes were kept.
static int store_unique(__m128i old,__m128i new, uint32_t *  output) {
    // predecessor of each lane of `new`: { old[3], new[0], new[1], new[2] }
    const __m128i predecessors = _mm_alignr_epi8(new, old, 12);
    const __m128i is_repeat = _mm_cmpeq_epi32(predecessors, new);

    // movemask yields 4 identical bits per lane; keep one bit per lane -> value in 0..15
    const int repeat_bits = _pext_u32(_mm_movemask_epi8(is_repeat), 0x1111);

    // one 16-byte shuffle control per possible repeat pattern
    const __m128i *shuffle_table = (const __m128i *)uniqshuf;
    const __m128i control = _mm_lddqu_si128(shuffle_table + repeat_bits);

    _mm_storeu_si128((__m128i *)output, _mm_shuffle_epi8(new, control));
    return 4 - _mm_popcnt_u32(repeat_bits);
}

// working in-place, this function overwrites the repeated values
static uint32_t unique(uint32_t *  out, uint32_t len) {
    uint32_t pos = 1;
    for(uint32_t i = 1; i < len; ++i) {
        if(out[i] != out[i-1]) {
            out[pos++] = out[i];
        }
    }
Пример #2
0
/* Adds one worker's weight deltas into `weights`, table by table.
 * Index pairs whose white and black bit masks overlap are skipped. */
static void add_section_delta(Weights *weights, const Weights *delta){
  uint32_t white, black;

  for(white = 0; white < 256; white++){
    for(black = 0; black < 256; black++){
      if(white & black){
        continue;
      }
      weights->row_1[white][black] += delta->row_1[white][black];
      weights->row_2[white][black] += delta->row_2[white][black];
      weights->row_3[white][black] += delta->row_3[white][black];
      weights->row_4[white][black] += delta->row_4[white][black];

      weights->diag_8[white][black] += delta->diag_8[white][black];
    }
  }
  for(white = 0; white < 128; white++){
    for(black = 0; black < 128; black++){
      if(white & black){
        continue;
      }
      weights->diag_7[white][black] += delta->diag_7[white][black];
    }
  }
  for(white = 0; white < 64; white++){
    for(black = 0; black < 64; black++){
      if(white & black){
        continue;
      }
      weights->diag_6[white][black] += delta->diag_6[white][black];
    }
  }
  for(white = 0; white < 32; white++){
    for(black = 0; black < 32; black++){
      if(white & black){
        continue;
      }
      weights->diag_5[white][black] += delta->diag_5[white][black];
    }
  }
  for(white = 0; white < 16; white++){
    for(black = 0; black < 16; black++){
      if(white & black){
        continue;
      }
      weights->diag_4[white][black] += delta->diag_4[white][black];
    }
  }
  /* The flat tables are indexed by a per-white-mask offset plus the black bits
   * packed over the positions not occupied by white. */
  for(white = 0; white < 512; white++){
    for(black = 0; black < 512; black++){
      if(white & black){
        continue;
      }
      int index = offset_19683[white] + _pext_u32(black, ~white);
      weights->corner_33[index] += delta->corner_33[index];
    }
  }
  for(white = 0; white < 1024; white++){
    for(black = 0; black < 1024; black++){
      if(white & black){
        continue;
      }
      int index = offset_59049[white] + _pext_u32(black, ~white);
      weights->corner_25[index] += delta->corner_25[index];
      weights->edge_xx[index] += delta->edge_xx[index];
    }
  }
}

/*
 * Runs one multi-threaded gradient-descent step over `datapoints`.
 *
 * The n_dp datapoints are split into TH_NUM contiguous slices (the last slice
 * absorbs the remainder). The inputs are published through the global_*
 * variables, a zeroed ConfigCounter is filled from every datapoint's board, and
 * one grad_descent_step_thread worker is started per slice with a pointer to
 * its lower bound. Each worker's result is taken to be a heap-allocated Weights
 * delta: it is added into `weights` and then freed here.
 *
 * datapoints: array of n_dp datapoints.
 * n_dp:       number of datapoints.
 * weights:    updated in place with the sum of all worker deltas.
 * alpha:      stored in global_alpha for the workers (presumably the step size —
 *             its use is not visible here).
 *
 * Returns the square root of the sum of the per-thread deriv_sqr entries, or
 * NAN when the ConfigCounter cannot be allocated (no work is done in that case).
 * A worker that cannot be started, cannot be joined, or returns NULL contributes
 * nothing to `weights`.
 *
 * NOTE(review): global_confcount is allocated on every call and not freed in
 * this function — confirm that ownership is handled elsewhere.
 * NOTE(review): global_bounds and global_deriv_sqr point at locals of this
 * function and must not be used after it returns.
 */
double grad_descent_step_mt(DataPoint *datapoints, long int n_dp, Weights *weights, double alpha){

  long int bounds[TH_NUM + 1];

  int th;
  /* Slice width follows TH_NUM and keeps the full width of n_dp. */
  long int interval = n_dp / TH_NUM;
  for(th = 0; th < TH_NUM; th++){
    bounds[th] = interval * th;
  }
  bounds[TH_NUM] = n_dp;

  double deriv_sqr[TH_NUM];
  /* Zeroed so that a slice whose worker never ran adds nothing to the total. */
  for(th = 0; th < TH_NUM; th++){
    deriv_sqr[th] = 0;
  }

  global_datapoints = datapoints;
  global_bounds = bounds;
  global_deriv_sqr = deriv_sqr;
  global_n_dp = n_dp;
  global_alpha = alpha;

  global_confcount = calloc(1, sizeof(ConfigCounter));
  if(global_confcount == NULL){
    return NAN;
  }
  long int i;
  for(i = 0; i < n_dp; i++){
    increment_confcount(global_confcount, datapoints[i].board);
  }

  pthread_t tids[TH_NUM];
  int started[TH_NUM];
  Weights *section_delta[TH_NUM];
  for(th = 0; th < TH_NUM; th++){
    section_delta[th] = NULL;
    started[th] = (pthread_create(&(tids[th]), NULL, grad_descent_step_thread, &bounds[th]) == 0);
  }
  for(th = 0; th < TH_NUM; th++){
    /* Join through a real void * instead of aliasing the Weights * slot. */
    void *result = NULL;
    if(started[th] && pthread_join(tids[th], &result) == 0){
      section_delta[th] = result;
    }
  }

  for(th = 0; th < TH_NUM; th++){
    if(section_delta[th] == NULL){
      continue;
    }
    add_section_delta(weights, section_delta[th]);
    free(section_delta[th]);
  }

  double total_deriv = 0;
  for(th = 0; th < TH_NUM; th++){
    total_deriv += global_deriv_sqr[th];
  }
  total_deriv = sqrt(total_deriv);

  return total_deriv;
}
Пример #3
0
// Thin wrapper that forwards both arguments, unchanged and in order, to the
// _pext_u32 intrinsic and returns its result.
// NOTE(review): the comment inside the body names llvm.x86.bmi.pext.32 and looks
// like an expected-output pattern for a compiler codegen test — confirm before
// editing it or reshaping the function body.
unsigned int test_pext_u32(unsigned int __X, unsigned int __Y) {
  // CHECK: @llvm.x86.bmi.pext.32
  return _pext_u32(__X, __Y);
}