0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF, 0x4,0x5,0x6,0x7,0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0x0,0x1,0x2,0x3,0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0x4,0x5,0x6,0x7,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0x0,0x1,0x2,0x3,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF }; // write vector new, while omitting repeated values assuming that previously written vector was "old" static int store_unique(__m128i old,__m128i new, uint32_t * output) { __m128i vecTmp = _mm_alignr_epi8(new, old, 16-4); int M = _mm_movemask_epi8(_mm_cmpeq_epi32(vecTmp,new));//_pdep_u32(,0x1111); M=_pext_u32(M,0x1111); int numberofnewvalues = 4 - _mm_popcnt_u32(M); __m128i key = _mm_lddqu_si128((const __m128i* )uniqshuf + M); __m128i val = _mm_shuffle_epi8(new,key); _mm_storeu_si128((__m128i* )output,val); return numberofnewvalues; } // working in-place, this function overwrites the repeated values static uint32_t unique(uint32_t * out, uint32_t len) { uint32_t pos = 1; for(uint32_t i = 1; i < len; ++i) { if(out[i] != out[i-1]) { out[pos++] = out[i]; } }
/*
 * Add one worker's weight delta into `weights`, table by table.
 *
 * Entries whose white and black bit patterns overlap are skipped, exactly as
 * in the per-table loops this helper replaces (presumably those patterns
 * cannot occur on a board -- confirm against the feature encoding).
 */
static void add_section_delta(Weights *weights, const Weights *delta)
{
#define ADD_PATTERN_TABLE(field, size)                                        \
    do {                                                                      \
        for (uint32_t white = 0; white < (size); white++) {                   \
            for (uint32_t black = 0; black < (size); black++) {               \
                if (white & black) {                                          \
                    continue;                                                 \
                }                                                             \
                weights->field[white][black] += delta->field[white][black];   \
            }                                                                 \
        }                                                                     \
    } while (0)

    ADD_PATTERN_TABLE(row_1, 256);
    ADD_PATTERN_TABLE(row_2, 256);
    ADD_PATTERN_TABLE(row_3, 256);
    ADD_PATTERN_TABLE(row_4, 256);
    ADD_PATTERN_TABLE(diag_8, 256);
    ADD_PATTERN_TABLE(diag_7, 128);
    ADD_PATTERN_TABLE(diag_6, 64);
    ADD_PATTERN_TABLE(diag_5, 32);
    ADD_PATTERN_TABLE(diag_4, 16);

#undef ADD_PATTERN_TABLE

    /* The remaining tables are flat: the slot is the per-white offset plus
     * the black bits packed over the squares white does not occupy. */
    for (uint32_t white = 0; white < 512; white++) {
        for (uint32_t black = 0; black < 512; black++) {
            if (white & black) {
                continue;
            }
            int index = offset_19683[white] + _pext_u32(black, ~white);
            weights->corner_33[index] += delta->corner_33[index];
        }
    }

    for (uint32_t white = 0; white < 1024; white++) {
        for (uint32_t black = 0; black < 1024; black++) {
            if (white & black) {
                continue;
            }
            int index = offset_59049[white] + _pext_u32(black, ~white);
            weights->corner_25[index] += delta->corner_25[index];
            weights->edge_xx[index] += delta->edge_xx[index];
        }
    }
}

/*
 * Run one multi-threaded gradient-descent step over `datapoints`.
 *
 * The data is split into TH_NUM contiguous ranges (the last range absorbs the
 * remainder). The shared inputs are published through the global_* variables,
 * a configuration counter is built over every datapoint's board, and one
 * worker per range runs grad_descent_step_thread. Each worker receives a
 * pointer to its entry in the bounds array and hands back a heap-allocated
 * Weights delta, which is added into `weights` and then freed here.
 *
 * Parameters:
 *   datapoints - array of n_dp training samples
 *   n_dp       - number of samples
 *   weights    - weight tables updated in place
 *   alpha      - step size, passed to the workers via global_alpha
 *
 * Returns the square root of the sum of the per-thread values in
 * global_deriv_sqr, or -1.0 if an allocation, thread creation or thread
 * result failed; in that case `weights` is left untouched.
 *
 * NOTE(review): global_confcount is allocated on every call and not released
 * here; confirm whether another part of the program owns and frees it.
 */
double grad_descent_step_mt(DataPoint *datapoints, long int n_dp, Weights *weights, double alpha){
    long int bounds[TH_NUM + 1];
    double deriv_sqr[TH_NUM];
    pthread_t tids[TH_NUM];
    Weights *section_delta[TH_NUM];
    int th;

    /* Divide by the actual thread count (not a fixed 4) and keep the width
     * in the same type as n_dp so large counts are not truncated. */
    long int interval = n_dp / TH_NUM;
    for(th = 0; th < TH_NUM; th++){
        bounds[th] = interval * th;
    }
    bounds[TH_NUM] = n_dp;

    global_datapoints = datapoints;
    global_bounds = bounds;
    global_deriv_sqr = deriv_sqr;
    global_n_dp = n_dp;
    global_alpha = alpha;

    /* calloc gives the zeroed counter the malloc+memset pair produced. */
    global_confcount = calloc(1, sizeof(ConfigCounter));
    if(global_confcount == NULL){
        return -1.0;
    }
    for(long int i = 0; i < n_dp; i++){
        increment_confcount(global_confcount, datapoints[i].board);
    }

    int started = 0;
    int failed = 0;
    for(th = 0; th < TH_NUM; th++){
        if(pthread_create(&tids[th], NULL, grad_descent_step_thread, &bounds[th]) != 0){
            failed = 1;
            break;
        }
        started++;
    }

    /* Always join whatever was started, even on the failure path, so no
     * worker is left running against this function's stack arrays. */
    for(th = 0; th < started; th++){
        void *result = NULL;
        if(pthread_join(tids[th], &result) != 0){
            result = NULL;
        }
        section_delta[th] = result;
        if(result == NULL){
            failed = 1;
        }
    }

    if(failed){
        for(th = 0; th < started; th++){
            free(section_delta[th]);
        }
        return -1.0;
    }

    for(th = 0; th < TH_NUM; th++){
        add_section_delta(weights, section_delta[th]);
    }
    for(th = 0; th < TH_NUM; th++){
        free(section_delta[th]);
    }

    double total_deriv = 0;
    for(th = 0; th < TH_NUM; th++){
        total_deriv += global_deriv_sqr[th];
    }
    return sqrt(total_deriv);
}
// Code-generation test wrapper: forwards both arguments unchanged to the
// _pext_u32 intrinsic (parallel bit extract; __X is the source value and __Y
// the bit mask) and returns its result.
unsigned int test_pext_u32(unsigned int __X, unsigned int __Y) {
  // The next line is presumably a FileCheck-style directive matched against
  // the compiler's output; its text must stay exactly as written.
  // CHECK: @llvm.x86.bmi.pext.32
  return _pext_u32(__X, __Y);
}