/*
 * Allocate a predicate numerator over the array `preds` of `sz` predicates.
 *
 * The numerator stores the caller's `preds` pointer as-is (the array is not
 * copied) and builds, for every arity 0..2, a reverse index that maps a
 * predicate's `data` value to its position inside `preds`.  Slots that do
 * not correspond to any predicate in `preds` stay filled with 0xFF bytes.
 *
 * Allocation failures and malformed predicates are reported via assert().
 * Returns the newly allocated numerator.
 */
predicate_numerator *predicate_numerator_alloc(pred *preds, size_t sz) {
  predicate_numerator *pred_num = malloc(sizeof(predicate_numerator));
  assert(pred_num);

  pred_num->uniq_sz    = sz;
  pred_num->uniq_preds = preds;

  /* alloc mem for reverse index */
  for(uint32_t ar = 0; ar <= 2; ++ar) {
    /* number of table entries for this arity
     * (presumably 2^(K^ar) -- confirm against int_pow2/int_pow) */
    uint64_t num = int_pow2(int_pow(K, ar));
    pred_num->uniq_pred_idx[ar] = malloc(num * sizeof(pred_idx_t));
    assert(pred_num->uniq_pred_idx[ar] != NULL);
    /* mark every slot with an all-ones byte pattern */
    memset(pred_num->uniq_pred_idx[ar], 0xFF, num * sizeof(pred_idx_t));
  }

  /* construct reverse index */
  for(size_t pidx = 0; pidx < pred_num->uniq_sz; ++pidx) {
    pred *p = pred_num->uniq_preds + pidx;
    assert(p->arity <= 2 && "predicate numerator supports predicates of arity <= 2 only");
    /* The table for this arity has exactly int_pow2(int_pow(K, arity))
     * entries, so the valid indices are strictly below that bound. */
    assert(p->data < int_pow2(int_pow(K, p->arity)) && "predicate_numerator: broken predicate");
    pred_num->uniq_pred_idx[p->arity][p->data] = pidx;
  }

  return pred_num;
}
hashtable *hashtable_alloc(size_t capacity, uint32_t (*hash) (const void *), int (*eq) (const void *, const void *)) { hashtable *ht = malloc(sizeof(hashtable)); assert(ht); /* capacity must be power of 2 */ ht->size = 0; ht->capacity = int_pow2(int_log(2, 2*capacity)); ht->table = malloc(ht->capacity * sizeof(hash_elem *)); assert(ht->table); memset(ht->table, 0, ht->capacity * sizeof(hash_elem *)); ht->hash = hash; ht->eq = eq; return ht; }
/* *****************************************************************
   compute locations of "marked" occurrences.
   This procedure does the following:
    1) compute the desired # of marked chars (=desired_marked_chars)
    2) compute the best pair i,j such that (occ[i]/2^j) is as close
       as possible (but <= ) to desired_marked_chars.
       Write i in s->chosen_char and 2^j in s->skip
    3) scan s->bwt[] and "select" one out of s->skip occurrences of
       s->chosen_char. For each selected occurrence write in s->loc_occ
       its position in the original text.

   Returns the number of marked chars written to s->loc_occ, or 0
   (with s->skip and s->chosen_char set to 0) when Marked_char_freq
   is 0. s->loc_occ is allocated here with malloc(); if the allocation
   fails an error is printed on stderr and the program exits.

   I think this procedure could be improved (that is, simplified and
   made faster in doing the search with bwhuffw) using the ideas
   introduced in compute_locations_dict() and
   compute_locations_huffword(). More precisely:
    1) remove s->chosen_char and simply mark one row every s->skip
       (this would make the marked chars more evenly distributed
       in the text).
    2) consider the row starting with a "marked char" rather than
       ending (this would simplify the code)
   **************************************************************** */
int compute_locations(bwi_input *s)
{
  int i,max,j,count,chosen_occ;
  int ch_occ[256],rescaled,skip;
  int exponent, desired_marked_chars, marked_chars;

  /* marking disabled: nothing to compute */
  if(Marked_char_freq==0) {
    s->skip=0;
    s->chosen_char = 0;
    return 0;
  }

  /* ------ compute the desired number of marked chars ------ */
  desired_marked_chars = (int) (s->text_size * Marked_char_freq);
  if(desired_marked_chars==0)
    desired_marked_chars=1;

  // ---- Count occurrences for each character
  // (differences of consecutive entries of pfx_char_occ[]; the last
  //  character gets whatever remains up to text_size)
  ch_occ[s->alpha_size-1]= s->text_size-s->pfx_char_occ[s->alpha_size-1];
  for(i=0;i<s->alpha_size-1;i++)
    ch_occ[i]=s->pfx_char_occ[i+1]-s->pfx_char_occ[i];

  // ----- select best (char,skip) pair
  for(i=0, max=-1; i<s->alpha_size; i++){
    if(i==s->bwt[0]) continue;   // Exclude bwt-first-char (see below)
    /* --- determine the number of skipped char for i */
    if (ch_occ[i] > desired_marked_chars) {
      exponent = int_log2(ch_occ[i]/desired_marked_chars);
      assert(exponent > 0);
      skip = int_pow2(exponent);
    }
    else
      skip = 1;
    /* --- check if this is the best choice seen so far --- */
    rescaled = ch_occ[i] / skip;
    if(rescaled>max && rescaled <= desired_marked_chars) {
      max = rescaled;
      s->chosen_char = i;
      s->skip = skip;
    }
  }
  assert(max > 0);
  assert(s->skip>0);
  if(Verbose>1) {
    /* find a byte value that char_map[] maps to the chosen char
       (i is 256 if there is none) */
    for(i=0;i<256;i++)
      if(s->char_map[i]==s->chosen_char) break;
    fprintf(stderr,"Marked char is ascii %d; ", i);
    fprintf(stderr,"one occ every %d is marked; ",s->skip);
  }

  // ------- compute number of marked chars: ceil(chosen_occ/skip)
  chosen_occ = ch_occ[s->chosen_char];
  if(chosen_occ % s->skip)
    marked_chars = chosen_occ/s->skip + 1;
  else
    marked_chars = chosen_occ/s->skip;

  // -------- alloc s->loc_occ
  s->loc_occ = malloc(sizeof(int) * (marked_chars));
  if(s->loc_occ == NULL) {
    /* without this check a failed allocation would be written through
       in the loop below */
    fprintf(stderr,"compute_locations: out of memory\n");
    exit(1);
  }

  // write the text location of the ROWS ending with ch
  for(i=1,j=0,count=0; i<s->text_size; i++) { // bwt[0] is not the marked char (see above)
    if (s->bwt[i] == s->chosen_char) {
      if ((count % s->skip) == 0) {
        /* rows up to bwt_eof_pos take their location from sa[i-1],
           later rows from sa[i].
           NOTE(review): presumably because bwt[] does not store the
           EOF symbol -- confirm against the code that builds s->bwt */
        if (i <= s->bwt_eof_pos)
          s->loc_occ[j] = (int) s->sa[i-1];
        else
          s->loc_occ[j] = s->sa[i];
        /* the char preceding the stored location must be the marked one */
        assert(s->text[s->loc_occ[j]-1] == s->chosen_char);
        j++;
      }
      count++;
    }
  }

  // j is the number of marked chars
  if(Verbose>1)
    fprintf(stderr,"%d chars marked.\n", j);
  assert(j == marked_chars);
  assert(count == chosen_occ);
  return j;
}