/*
 * Allocate a predicate numerator for the array `preds` of `sz` predicates
 * and build its reverse index, which maps (arity, data) to the position of
 * that predicate in `preds`.
 *
 * The `preds` pointer is stored as-is (not copied), so the array must stay
 * valid for as long as the returned object is used.
 *
 * Only predicates of arity 0, 1 and 2 are supported. For each arity `ar`
 * the reverse index has int_pow2(int_pow(K, ar)) entries; entries that do
 * not correspond to any predicate in `preds` keep the all-ones byte
 * pattern written by the memset below.
 *
 * Allocation failures and malformed predicates are caught by assert().
 */
predicate_numerator *predicate_numerator_alloc(pred *preds, size_t sz) {
  predicate_numerator *pred_num = malloc(sizeof(predicate_numerator));
  assert(pred_num);

  pred_num->uniq_sz    = sz;
  pred_num->uniq_preds = preds;
  
  /* alloc mem for reverse index; every byte is set to 0xFF so that unused
     slots are distinguishable from valid indices */
  for(uint32_t ar = 0; ar <= 2; ++ar) {
    uint64_t num = int_pow2(int_pow(K, ar));
    pred_num->uniq_pred_idx[ar] = malloc(num * sizeof(pred_idx_t)); 
    assert(pred_num->uniq_pred_idx[ar] != NULL);
    memset(pred_num->uniq_pred_idx[ar], 0xFF, num * sizeof(pred_idx_t));
  }
  
  /* construct reverse index */  
  for(size_t pidx = 0; pidx < pred_num->uniq_sz; ++pidx) {
    pred *p = pred_num->uniq_preds + pidx;
    assert(p->arity <= 2 && "predicate numerator supports predicates of arity <= 2 only");
    /* the table for this arity has exactly int_pow2(int_pow(K, arity))
       slots, so the largest valid index is one less than that (strict <) */
    assert(p->data < int_pow2(int_pow(K, p->arity)) && "predicate_numerator: broken predicate");
    pred_num->uniq_pred_idx[p->arity][p->data] = pidx;
  }
  
  return pred_num;
}
/*
 * Create an empty hash table.
 *
 *   capacity - requested capacity; the actual bucket count is
 *              int_pow2(int_log(2, 2*capacity)) (presumably 2*capacity
 *              rounded to a power of two -- confirm against int_log)
 *   hash     - hash function applied to keys
 *   eq       - key equality predicate
 *
 * All buckets start out as NULL. Allocation failures are caught by assert().
 */
hashtable *hashtable_alloc(size_t capacity,
                           uint32_t (*hash) (const void *),
                           int (*eq) (const void *, const void *)) {
  hashtable *tbl = malloc(sizeof(hashtable));
  assert(tbl);

  /* callbacks and element counter */
  tbl->hash = hash;
  tbl->eq   = eq;
  tbl->size = 0;

  /* bucket count must be a power of 2 */
  tbl->capacity = int_pow2(int_log(2, 2*capacity));

  /* bucket array, cleared so that every chain starts empty */
  tbl->table = malloc(tbl->capacity * sizeof(hash_elem *));
  assert(tbl->table);
  memset(tbl->table, 0, tbl->capacity * sizeof(hash_elem *));

  return tbl;
}
Example #3
0
/* *****************************************************************
   compute locations of "marked" occurrences. This procedure does
   the following: 
     1) compute the desired # of marked chars (=desired_marked_chars)
     2) compute the best pair i,j such that (occ[i]/2^j) is as close
        as possible (but <= ) to desired_marked_chars
        write  i in s->chosen_char and 2^j in s->skip 
     3) scan s->bwt[] and "select" one out of s->skip occurrences of 
        s->chosen_char. For each selected occurrence write in s->loc_occ
       its position in the original text.

   I think this procedure could be improved (that is, simplified 
   and faster in doing the search with bwhuffw) using the ideas
   introduced in compute_locations_dict() and compute_locations_huffword()
   more precisely
    1) remove s->chosen_char and mark simply one row every s->skip
       (this would make the marked chars more evenly distributed in the
        text).
    2) consider the row starting with a "marked char" rather than ending
       (this would simplify the code)
   **************************************************************** */
/*
 * Choose the marked character and skip factor, then record the text
 * position of every marked occurrence.
 *
 * On return s->chosen_char and s->skip are set and s->loc_occ points to a
 * newly malloc'ed array holding one text position per marked occurrence.
 * If Marked_char_freq is 0 nothing is marked: s->skip and s->chosen_char
 * are set to 0, s->loc_occ is left untouched and 0 is returned.
 *
 * Returns the number of entries written to s->loc_occ.
 * Prints a message and exits if the array cannot be allocated.
 */
int compute_locations(bwi_input *s)
{
  int i,max,j,count,chosen_occ;
  int ch_occ[256],rescaled,skip;
  int exponent, desired_marked_chars, marked_chars;
  
  if(Marked_char_freq==0) {
    s->skip=0; s->chosen_char = 0; 
    return 0;
  }
  /* ------ compute the desired number of marked chars ------ */
  desired_marked_chars =  (int) (s->text_size * Marked_char_freq);
  if(desired_marked_chars==0)
    desired_marked_chars=1;
      
  // ---- Count occurrences for each character
  // (differences of consecutive prefix counts; the last char gets the rest)
  ch_occ[s->alpha_size-1]= s->text_size-s->pfx_char_occ[s->alpha_size-1];
  for(i=0;i<s->alpha_size-1;i++)
    ch_occ[i]=s->pfx_char_occ[i+1]-s->pfx_char_occ[i];
  
  // ----- select best (char,skip) pair
  for(i=0, max=-1; i<s->alpha_size; i++){
    if(i==s->bwt[0]) continue;  // Exclude bwt-first-char (see below)
    /* --- determine the number of skipped char for i */
    if (ch_occ[i] > desired_marked_chars) {
      exponent = int_log2(ch_occ[i]/desired_marked_chars);
      assert(exponent > 0);
      skip = int_pow2(exponent);
    }
    else
      skip = 1;
    /* --- check if this is the best choice seen so far --- */
    rescaled = ch_occ[i] / skip;
    if(rescaled>max && rescaled <= desired_marked_chars) {
      max = rescaled;
      s->chosen_char = i;
      s->skip = skip;
    }
  }
  assert(max > 0);
  assert(s->skip>0);
  
  if(Verbose>1) {
    // map the remapped chosen char back to its ascii code for the report
    for(i=0;i<256;i++)
      if(s->char_map[i]==s->chosen_char) break;
    fprintf(stderr,"Marked char is ascii %d; ", i); 
    fprintf(stderr,"one occ every %d is marked; ",s->skip);
  }

  // ------- compute number of marked chars = ceil(chosen_occ / skip)
  chosen_occ = ch_occ[s->chosen_char];
  if(chosen_occ % s->skip)
    marked_chars = chosen_occ/s->skip + 1;
  else
    marked_chars = chosen_occ/s->skip;
  // -------- alloc s->loc_occ
  s->loc_occ = malloc(sizeof(int) * (marked_chars));
  if(s->loc_occ == NULL) {
    // the fill loop below would otherwise write through a NULL pointer
    fprintf(stderr,"compute_locations: out of memory\n");
    exit(1);
  }

  // write the text location of the ROWS ending with ch
  for(i=1,j=0,count=0; i<s->text_size; i++) 
    {                         // bwt[0] is not the marked char (see above) 
      if (s->bwt[i] == s->chosen_char) {
        if ((count % s->skip) == 0) {
          // rows up to bwt_eof_pos read sa[] shifted by one position
          if (i <= s->bwt_eof_pos) { 
            s->loc_occ[j] = (int) s->sa[i-1];
            assert(s->text[s->loc_occ[j]-1] == s->chosen_char);
          }
          else {
            s->loc_occ[j] = s->sa[i]; 
            assert(s->text[s->loc_occ[j]-1] == s->chosen_char);
          }
          j++;
        } 
        count++;
      }
    }
  // j is the number of marked chars
  if(Verbose>1) 
    fprintf(stderr,"%d chars marked.\n", j); 
  assert(j == marked_chars);
  assert(count == chosen_occ);  
  return j;
}