// Descends a multi-level search tree whose per-node delimiters are scanned
// four at a time with SSE2, and returns the index selected at the last level.
//
// Parameters:
//   tree    - tree[level] points at the delimiters of that level; the
//             delimiters of node k start at index k * (fanout[level] - 1).
//             Every group of four is read with an aligned 128-bit load, so
//             each level's storage must be 16-byte aligned.
//   fanout  - fanout[level] is the number of children per node at that level;
//             fanout[level] / 4 groups of four delimiters are inspected.
//   levels  - number of levels to descend; 0 (or less) returns 0.
//   value   - the key being searched for.
//
// Returns: at each level the child chosen is the position of the first
// delimiter d with value <= d (or one past the inspected delimiters when the
// key is greater than all of them); the returned index is
// previous_index * fanout + child, accumulated over all levels.
int searchSIMDTree(int32_t **tree, int *fanout, int levels, int32_t value) {
    // Broadcast the key into all four 32-bit lanes.
    const __m128i key = _mm_set1_epi32(value);

    int node = 0;  // index selected at the previous level (0 at the root)
    for (int level = 0; level < levels; ++level) {
        const int f = fanout[level];
        const int base = node * (f - 1);  // first delimiter of the current node
        const int groups = f / 4;

        int group = 0;
        int lane = 0;
        for (; group < groups; ++group) {
            const __m128i delimiters = _mm_load_si128(
                reinterpret_cast<const __m128i*>(&tree[level][base + group * 4]));
            // One bit per lane, set where key > delimiter.
            const int greater =
                _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(key, delimiters)));
            // Flip the four lane bits: set where key <= delimiter.
            const int notGreater = greater ^ 0x0F;
            if (notGreater) {
                lane = _bit_scan_forward(notGreater);
                break;
            }
        }

        // When no delimiter matched, group == groups and lane == 0, i.e. the
        // child just past the inspected delimiters.
        node = base + group * 4 + lane + node;
    }
    return node;
}
// Returns the value produced by the _bit_scan_forward intrinsic for `mask`,
// i.e. the zero-based index of its least significant set bit.
// The second argument is an unnamed tag: it only takes part in overload
// selection and is never read.
// NOTE(review): the intrinsic's result is not defined for mask == 0;
// presumably callers guarantee a non-zero mask - confirm at the call sites.
BOOST_FORCEINLINE unsigned find_lsb(unsigned mask, const mpl::int_<1>&)
{
   const unsigned lowest_set_index = static_cast<unsigned>(_bit_scan_forward(mask));
   return lowest_set_index;
}
void run_graph_program(GraphProgram<T,U,V>* gp, Graph<V>& g, int iterations=1, struct run_graph_program_temp_structure<T,U,V>* rgpts=NULL) { //iterations = -1 ==> until convergence int it = 0; int converged = 1; unsigned long long int init_start = __rdtsc(); auto act = gp->getActivity(); SparseInVector<T>* px; SparseOutVector<U>* py; if (rgpts == NULL) { px = new SparseInVector<T>(g.nvertices); py = new SparseOutVector<U>(g.nvertices); } SparseInVector<T>&x = (rgpts==NULL)?(*px):*(rgpts->px); SparseOutVector<U>& y = (rgpts==NULL)?(*py):*(rgpts->py); #ifdef __TIMING printf("Nvertices = %d numints = %d \n", g.nvertices, y.numInts); #endif unsigned long long int start, end; int* start_vertex = new int[nthreads+1]; //divide numInts to start_vertex //divide the active vertices in each into start_index start_vertex[nthreads] = g.nvertices; #pragma omp parallel num_threads(nthreads) { int tid = omp_get_thread_num(); int ints_per_th = (y.numInts/nthreads)*32; int sv = ints_per_th*tid; sv = (((sv/32)/4)*4)*32; //sv is multiple of 32 and sv/32 is a multiple of 4 sv = (((sv/32)/SIMD_WIDTH)*SIMD_WIDTH)*32; //sv is multiple of 32 and sv/32 is a multiple of SIMD_WIDTH if (sv >= g.nvertices) sv = g.nvertices; if (sv == 0) sv = 0; start_vertex[tid] = sv; } unsigned long long int init_end = __rdtsc(); #ifdef __TIMING printf("GraphMat init time = %f ms \n", (init_end-init_start)/(CPU_FREQ)*1e3); #endif while(1) { unsigned long long int iteration_start = __rdtsc(); x.clear(); y.clear(); converged = 1; start = __rdtsc(); //check active vector and set message vector int count = 0; #pragma omp parallel num_threads(nthreads) reduction(+:count) { int tid = omp_get_thread_num(); for (int i = start_vertex[tid]; i < start_vertex[tid+1]; i++){ if (g.active[i]) { T message; bool msg_opt = gp->send_message(g.vertexproperty[i], message); if (msg_opt) { x.set(i, message); count++; } } } } x.length = count; #ifdef __TIMING printf("x.length = %d \n", x.length); #endif end = __rdtsc(); #ifdef 
__TIMING printf("Send message time = %.3f ms \n", (end-start)/(CPU_FREQ)*1e3); #endif start = __rdtsc(); //do SpMV if (gp->getOrder() == OUT_EDGES) { SpMTSpV(g, gp, x, y); } else if (gp->getOrder() == IN_EDGES) { SpMSpV(g, gp, x, y); } else if (gp->getOrder() == ALL_EDGES) { SpMTSpV(g, gp, x, y); SpMSpV(g, gp, x, y); } else { printf("Unrecognized option \n"); exit(1); } end = __rdtsc(); #ifdef __TIMING printf("SPMV time = %.3f ms \n", (end-start)/(CPU_FREQ)*1e3); #endif start = __rdtsc(); g.setAllInactive(); //update state and activity and check for convergence if needed int nout = 0; int total_search = 0; converged = 1; #pragma omp parallel num_threads(nthreads) reduction(+:nout) reduction(&:converged) reduction(+:total_search) //schedule(static) { int zero = 0; SIMDINTTYPE xmm_zero = _MM_SET1(zero); int tid = omp_get_thread_num(); int count_ones = 0; int end_of_numInts = start_vertex[tid+1]/32; if (tid == nthreads-1) end_of_numInts = y.numInts; for (int ii = start_vertex[tid]/32; ii < end_of_numInts; ii+=SIMD_WIDTH) { __m128i xmm_local_bitvec = _mm_loadu_si128((__m128i*)(y.bitvector + ii)); __m128 xmm_cmp_mask = _mm_castsi128_ps(_mm_cmpeq_epi32((xmm_local_bitvec), (xmm_zero))); int mask_value_0 = _mm_movemask_ps(xmm_cmp_mask); if(mask_value_0 == 15) { continue; } for(int i = ii; i < ii+SIMD_WIDTH; i++) { unsigned int value = y.bitvector[i]; while (value != 0) { int last_bit = _bit_scan_forward(value); int idx = i*32 + last_bit; V old_prop; old_prop = g.vertexproperty[idx]; gp->apply(y.value[idx], g.vertexproperty[idx]); nout++; if (old_prop != g.vertexproperty[idx]) { g.setActive(idx); count_ones++; converged = 0; total_search++; } value &= (~(1<<last_bit)); } } } } if (act == ALL_VERTICES) { g.setAllActive(); } #ifdef __TIMING printf("Number of vertices that changed state = %d \n", total_search); #endif end = __rdtsc(); #ifdef __TIMING printf("Apply time = %.3f ms \n", (end-start)/(CPU_FREQ)*1e3); #endif gp->do_every_iteration(it); unsigned long long int 
iteration_end = __rdtsc(); #ifdef __TIMING printf("Iteration %d :: %f msec :: updated %d vertices \n", it, (iteration_end-iteration_start)/(CPU_FREQ)*1e3, nout); #endif it++; if (it == iterations) { break; } if (iterations <= 0 && converged == 1) { break; } } unsigned long long int clear_start = __rdtsc(); delete [] start_vertex; if (rgpts == NULL) { delete px; delete py; } unsigned long long int clear_end = __rdtsc(); #ifdef __TIMING printf("GraphMat clear time = %f msec \n", (clear_end-clear_start)/(CPU_FREQ)*1e3); #endif printf("Completed %d iterations \n", it); }