/**
 * Walks a multi-level search tree with SSE comparisons and returns the
 * index accumulated after the last level.
 *
 * On every level the delimiters of the current node are examined four at a
 * time. The scan stops at the first delimiter that is not smaller than
 * `value`; if there is none, the slot just past the last scanned group is
 * taken. The resulting slot is combined with the incoming index to form the
 * index used on the next level.
 *
 * @param tree    tree[l] points to the delimiters of level l. Groups are read
 *                with _mm_load_si128, so every accessed group must be
 *                16-byte aligned.
 * @param fanout  fanout[l] is the fanout f of level l; fanout[l]/4 groups are
 *                scanned per node and a node starts at index * (f - 1).
 * @param levels  number of levels to descend.
 * @param value   key to search for.
 * @return the index reached after the last level (0 when levels <= 0).
 */
int searchSIMDTree(int32_t **tree, int *fanout, int levels, int32_t value) {
    // Broadcast the key into all four 32-bit lanes.
    const __m128i needle = _mm_set1_epi32(value);

    int index = 0;
    for (int level = 0; level < levels; ++level) {
        const int width = fanout[level];
        const int groups = width / 4;
        const int parent = index;
        const int base = index * (width - 1);  // first delimiter of the node

        int group = 0;
        int lane = 0;
        for (; group < groups; ++group) {
            const __m128i keys = _mm_load_si128(
                (__m128i const*)&tree[level][base + group * 4]);
            // Bits are set where needle > key; flipping the low four bits
            // leaves the lanes whose key is >= needle.
            const int notSmaller =
                _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(needle, keys))) ^ 0x0F;
            if (notSmaller != 0) {
                lane = _bit_scan_forward(notSmaller);
                break;
            }
        }

        const int slot = base + group * 4 + lane;
        index = slot + parent;
    }
    return index;
}
Пример #2
0
// Returns the result of _bit_scan_forward(mask), i.e. the zero-based index of
// the least significant set bit of `mask`. The unnamed mpl::int_<1> argument
// only selects this overload and is never read.
// NOTE(review): the result for mask == 0 is whatever the intrinsic yields —
// presumably undefined; confirm callers never pass 0.
BOOST_FORCEINLINE unsigned find_lsb(unsigned mask, mpl::int_<1> const&)
{
   const unsigned lowest_set = static_cast<unsigned>(_bit_scan_forward(mask));
   return lowest_set;
}
Пример #3
0
// Runs the graph program `gp` on graph `g`, either for a fixed number of
// iterations or until an iteration changes no vertex property.
//
// Every iteration consists of three phases:
//   1. Send:  for each active vertex gp->send_message() may produce a message,
//             which is stored in the sparse input vector x.
//   2. SpMV:  SpMTSpV and/or SpMSpV is called, chosen by gp->getOrder(), with
//             x as input and the sparse output vector y as output.
//   3. Apply: for each bit set in y.bitvector, gp->apply() is called on the
//             matching vertex property; vertices whose property changed are
//             re-activated via g.setActive().
//
// Parameters:
//   gp         - graph program supplying send_message/apply/getOrder/
//                getActivity/do_every_iteration.
//   g          - graph that is read and updated in place.
//   iterations - the loop stops once this many iterations have run; a value
//                <= 0 runs until an iteration changes no vertex property.
//   rgpts      - optional pre-allocated vectors (rgpts->px, rgpts->py). When
//                NULL, temporary vectors are allocated here and freed on exit.
//
// Uses nthreads, SIMD_WIDTH, SIMDINTTYPE, _MM_SET1 and CPU_FREQ, which are
// defined outside this function. Calls exit(1) on an unrecognized edge order.
//
// NOTE(review): the per-thread partitioning below indexes start_vertex by
// omp_get_thread_num(); it presumably assumes OpenMP really provides
// `nthreads` threads — confirm.
void run_graph_program(GraphProgram<T,U,V>* gp, Graph<V>& g, int iterations=1, struct run_graph_program_temp_structure<T,U,V>* rgpts=NULL) { //iterations = -1 ==> until convergence
  int it = 0;
  int converged = 1;

  unsigned long long int init_start = __rdtsc();

  auto act = gp->getActivity();

  // Only allocated when the caller did not hand in its own vectors; kept null
  // otherwise so the pointers never hold indeterminate values.
  SparseInVector<T>* px = nullptr;
  SparseOutVector<U>* py = nullptr;

  if (rgpts == nullptr) {
    px  = new SparseInVector<T>(g.nvertices);
    py  = new SparseOutVector<U>(g.nvertices);
  }

  SparseInVector<T>& x = (rgpts == nullptr) ? (*px) : *(rgpts->px);
  SparseOutVector<U>& y = (rgpts == nullptr) ? (*py) : *(rgpts->py);

  #ifdef __TIMING
  printf("Nvertices = %d numints = %d \n", g.nvertices, y.numInts);
  #endif

  unsigned long long int start, end;

  // start_vertex[t] .. start_vertex[t+1] is the vertex range owned by thread t.
  int* start_vertex = new int[nthreads+1];

  start_vertex[nthreads] = g.nvertices;
  #pragma omp parallel num_threads(nthreads)
  {
    int tid = omp_get_thread_num();
    int ints_per_th = (y.numInts/nthreads)*32;
    int sv  = ints_per_th*tid;
    sv = (((sv/32)/4)*4)*32; //sv is multiple of 32 and sv/32 is a multiple of 4
    sv = (((sv/32)/SIMD_WIDTH)*SIMD_WIDTH)*32; //sv is multiple of 32 and sv/32 is a multiple of SIMD_WIDTH
    if (sv >= g.nvertices) sv = g.nvertices;
    start_vertex[tid] = sv;
  }

  unsigned long long int init_end = __rdtsc();
  #ifdef __TIMING
  printf("GraphMat init time = %f ms \n", (init_end-init_start)/(CPU_FREQ)*1e3);
  #endif

  while(1) {
    unsigned long long int iteration_start = __rdtsc();
    x.clear();
    y.clear();

    start = __rdtsc();

    // Phase 1: collect messages from the active vertices into x.
    int count = 0;
    #pragma omp parallel num_threads(nthreads) reduction(+:count)
    {
      int tid = omp_get_thread_num();
      for (int i = start_vertex[tid]; i < start_vertex[tid+1]; i++) {
        if (g.active[i]) {
          T message;
          bool msg_opt = gp->send_message(g.vertexproperty[i], message);
          if (msg_opt) {
            x.set(i, message);
            count++;
          }
        }
      }
    }
    x.length = count;

    #ifdef __TIMING
    printf("x.length = %d \n", x.length);
    #endif
    end = __rdtsc();
    #ifdef __TIMING
    printf("Send message time = %.3f ms \n", (end-start)/(CPU_FREQ)*1e3);
    #endif

    start = __rdtsc();

    // Phase 2: sparse matrix - sparse vector product(s), chosen by edge order.
    if (gp->getOrder() == OUT_EDGES) {

      SpMTSpV(g, gp, x, y);

    } else if (gp->getOrder() == IN_EDGES) {

      SpMSpV(g, gp, x, y);

    } else if (gp->getOrder() == ALL_EDGES) {

      SpMTSpV(g, gp, x, y);
      SpMSpV(g, gp, x, y);

    } else {
      printf("Unrecognized option \n");
      exit(1);
    }
    end = __rdtsc();
    #ifdef __TIMING
    printf("SPMV time = %.3f ms \n", (end-start)/(CPU_FREQ)*1e3);
    #endif

    start = __rdtsc();
    g.setAllInactive();

    // Phase 3: apply results, re-activate changed vertices, detect convergence.
    int nout = 0;
    int total_search = 0;
    converged = 1;  // must be 1 going in: the & reduction folds it with each thread's flag
    #pragma omp parallel num_threads(nthreads) reduction(+:nout) reduction(&:converged) reduction(+:total_search) //schedule(static)
    {
      int zero = 0;
      SIMDINTTYPE xmm_zero = _MM_SET1(zero);
      int tid = omp_get_thread_num();
      int end_of_numInts = start_vertex[tid+1]/32;
      if (tid == nthreads-1) end_of_numInts = y.numInts;
      for (int ii = start_vertex[tid]/32; ii < end_of_numInts; ii+=SIMD_WIDTH) {

        // Skip the whole group quickly when its bit-vector words are all zero.
        // NOTE(review): this test inspects four 32-bit words (one __m128i)
        // while the loops step by SIMD_WIDTH — presumably SIMD_WIDTH == 4;
        // confirm. It also reads up to ii+SIMD_WIDTH words, so y.bitvector
        // presumably is padded to a multiple of SIMD_WIDTH — confirm.
        __m128i xmm_local_bitvec = _mm_loadu_si128((__m128i*)(y.bitvector + ii));
        __m128 xmm_cmp_mask = _mm_castsi128_ps(_mm_cmpeq_epi32((xmm_local_bitvec), (xmm_zero)));
        int mask_value_0 = _mm_movemask_ps(xmm_cmp_mask);
        if (mask_value_0 == 15) {
          continue;
        }

        for (int i = ii; i < ii+SIMD_WIDTH; i++) {
          unsigned int value = y.bitvector[i];
          // Visit every set bit of this word, lowest first.
          while (value != 0) {
            int last_bit = _bit_scan_forward(value);
            int idx = i*32 + last_bit;

            V old_prop;
            old_prop = g.vertexproperty[idx];

            gp->apply(y.value[idx], g.vertexproperty[idx]);
            nout++;

            if (old_prop != g.vertexproperty[idx]) {
              g.setActive(idx);
              converged = 0;
              total_search++;
            }

            // Unsigned literal: shifting a signed 1 into bit 31 is not portable.
            value &= (~(1u<<last_bit));
          }
        }
      }

    }
    if (act == ALL_VERTICES) {
      g.setAllActive();
    }

    #ifdef __TIMING
    printf("Number of vertices that changed state = %d \n", total_search);
    #endif

    end = __rdtsc();
    #ifdef __TIMING
    printf("Apply time = %.3f ms \n", (end-start)/(CPU_FREQ)*1e3);
    #endif

    gp->do_every_iteration(it);

    unsigned long long int iteration_end = __rdtsc();
    #ifdef __TIMING
    printf("Iteration %d :: %f msec :: updated %d vertices \n", it, (iteration_end-iteration_start)/(CPU_FREQ)*1e3, nout);
    #endif

    it++;
    if (it == iterations) {
      break;
    }
    if (iterations <= 0 && converged == 1) {
      break;
    }
  }

  unsigned long long int clear_start = __rdtsc();
  delete [] start_vertex;

  // px/py are non-null only when they were allocated above.
  if (rgpts == nullptr) {
    delete px;
    delete py;
  }

  unsigned long long int clear_end = __rdtsc();
  #ifdef __TIMING
  printf("GraphMat clear time = %f msec \n", (clear_end-clear_start)/(CPU_FREQ)*1e3);
  #endif

  printf("Completed %d iterations \n", it);

}