Exemple #1
0
// Map a composite doc id onto the index of the sub-reader that holds it.
//
// Returns the largest index into `offsets` whose value is strictly less than
// `doc_id`, or 0 if there is none (this includes an empty `offsets`).  The
// bisection assumes `offsets` is sorted ascending -- TODO confirm with
// callers.
uint32_t
PolyReader_sub_tick(I32Array *offsets, int32_t doc_id) {
    const int32_t num_offsets = I32Arr_Get_Size(offsets);
    if (num_offsets == 0) {
        return 0;
    }

    // Bisect for the first slot whose offset is >= doc_id.  `below` always
    // sits on a slot known to be too small (or -1), `tick` on a candidate.
    int32_t below = -1;
    int32_t tick  = num_offsets;
    while (tick - below > 1) {
        const int32_t probe = below + ((tick - below) / 2);
        if (I32Arr_Get(offsets, probe) >= doc_id) {
            tick = probe;
        }
        else {
            below = probe;
        }
    }

    // Every offset was smaller than doc_id: clamp to the last slot.
    if (tick == num_offsets) {
        tick = num_offsets - 1;
    }

    // Step back until the slot's offset is strictly below doc_id, stopping
    // at slot 0 regardless.
    while (tick > 0 && I32Arr_Get(offsets, tick) >= doc_id) {
        tick--;
    }

    return tick;
}
Exemple #2
0
// Build a SeriesMatcher from a flat list of doc ids.  The ids are split into
// one BitVector per entry in `offsets`, each wrapped in a BitVecMatcher.  The
// caller owns the returned SeriesMatcher.
static SeriesMatcher*
S_make_series_matcher(I32Array *doc_ids, I32Array *offsets, int32_t doc_max) {
    const int32_t total_ids    = I32Arr_Get_Size(doc_ids);
    const int32_t num_segments = I32Arr_Get_Size(offsets);
    VArray       *sub_matchers = VA_new(num_segments);
    int32_t       cursor       = 0;   // next unread slot in doc_ids

    for (int32_t seg = 0; seg < num_segments; seg++) {
        const int32_t base = I32Arr_Get(offsets, seg);

        // The upper bound is the next segment's offset, or doc_max + 1 for
        // the final segment.
        int32_t ceiling;
        if (seg == num_segments - 1) {
            ceiling = doc_max + 1;
        }
        else {
            ceiling = I32Arr_Get(offsets, seg + 1);
        }

        // Consume doc ids up to and including the ceiling, storing each one
        // relative to this segment's base.
        BitVector *bits = BitVec_new(ceiling - base);
        for (; cursor < total_ids; cursor++) {
            const int32_t doc_id = I32Arr_Get(doc_ids, cursor);
            if (doc_id > ceiling) {
                break;
            }
            BitVec_Set(bits, doc_id - base);
        }

        VA_Push(sub_matchers, (Obj*)BitVecMatcher_new(bits));
        DECREF(bits);
    }

    SeriesMatcher *result = SeriesMatcher_new(sub_matchers, offsets);
    DECREF(sub_matchers);
    return result;
}
Exemple #3
0
// Decide how many of the leading entries in `doc_counts` should be merged.
//
// Walks the counts while keeping a running total; each time the total is
// below a Fibonacci-derived limit, the cut-off moves past the current entry.
// Returns the number of leading segments selected.
uint32_t
IxManager_Choose_Sparse_IMP(IndexManager *self, I32Array *doc_counts) {
    UNUSED_VAR(self);
    const uint32_t seg_count     = (uint32_t)I32Arr_Get_Size(doc_counts);
    uint32_t       cutoff        = 0;
    int32_t        running_total = 0;

    // Find sparsely populated segments.
    uint32_t tick = 0;
    while (tick < seg_count) {
        // The limit depends on how many segments would remain if we merged
        // everything up to the current cut-off.
        const uint32_t segs_after_merge = seg_count - cutoff + 1;
        running_total += I32Arr_Get(doc_counts, tick);
        tick++;
        if (running_total < (int32_t)S_fibonacci(segs_after_merge + 5)) {
            cutoff = tick;
        }
    }

    // When only the first segment was chosen, avoid re-merging that same
    // large segment on every small commit: pull in the second segment too,
    // provided it is less than double the size of the first.
    if (cutoff == 1 && seg_count > 2) {
        const int32_t first_count  = I32Arr_Get(doc_counts, 0);
        const int32_t second_count = I32Arr_Get(doc_counts, 1);
        if (first_count > second_count / 2) {
            cutoff = 2;
        }
    }

    return cutoff;
}
Exemple #4
0
// Produce one Span per occurrence of the query term in `doc_vec`.
//
// Returns an empty Vector when `field` is not the query's field or when the
// document has no term vector for the term.  `searcher` is unused.
Vector*
TermCompiler_Highlight_Spans_IMP(TermCompiler *self, Searcher *searcher,
                                 DocVector *doc_vec, String *field) {

    TermCompilerIVARS *const ivars = TermCompiler_IVARS(self);
    TermQueryIVARS *const query_ivars
        = TermQuery_IVARS((TermQuery*)ivars->parent);
    Vector *spans = Vec_new(0);
    UNUSED_VAR(searcher);

    if (Str_Equals(query_ivars->field, (Obj*)field)) {
        TermVector *tv
            = DocVec_Term_Vector(doc_vec, field, (String*)query_ivars->term);

        if (tv) {
            // Pair up each start offset with its matching end offset.
            I32Array *begin_offsets = TV_Get_Start_Offsets(tv);
            I32Array *end_offsets   = TV_Get_End_Offsets(tv);
            const size_t num_hits   = I32Arr_Get_Size(begin_offsets);

            size_t hit = 0;
            while (hit < num_hits) {
                const int32_t begin = I32Arr_Get(begin_offsets, hit);
                const int32_t width = I32Arr_Get(end_offsets, hit) - begin;
                Span *span
                    = Span_new(begin, width, TermCompiler_Get_Weight(self));
                Vec_Push(spans, (Obj*)span);
                hit++;
            }

            DECREF(tv);
        }
    }

    return spans;
}
Exemple #5
0
// Check that BitVec_To_Array returns exactly the set bits, in ascending
// order, for a random collection of distinct integers.
static void
test_To_Array(TestBatchRunner *runner) {
    uint64_t  *values = TestUtils_random_u64s(NULL, 20, 0, 200);
    BitVector *bits   = BitVec_new(0);
    unsigned   kept   = 0;

    // Sort, then compact in place so the first `kept` slots are distinct.
    // Only an element that differs from its successor is copied, so the
    // final element of the sorted array is never kept.
    qsort(values, 20, sizeof(uint64_t), S_compare_u64s);
    for (unsigned next = 1; next < 20; next++) {
        if (values[next - 1] != values[next]) {
            values[kept] = values[next - 1];
            kept++;
        }
    }

    // Set one bit per distinct value.
    for (unsigned k = 0; k < kept; k++) {
        BitVec_Set(bits, (size_t)values[k]);
    }

    // Count how many leading array entries agree with the source values.
    I32Array *as_array = BitVec_To_Array(bits);
    unsigned  matched  = 0;
    while (matched < kept
           && I32Arr_Get(as_array, (size_t)matched) == (int32_t)values[matched]
          ) {
        matched++;
    }
    TEST_UINT_EQ(runner, matched, kept, "To_Array (%u == %u)", matched,
                 kept);

    DECREF(as_array);
    DECREF(bits);
    FREEMEM(values);
}
Exemple #6
0
// Check that BitVec_To_Array returns exactly the set bits, in ascending
// order, for a random collection of distinct integers.
static void
test_To_Array(TestBatch *batch) {
    uint64_t  *values  = TestUtils_random_u64s(NULL, 20, 0, 200);
    BitVector *bits    = BitVec_new(0);
    I32Array  *as_array;
    long       kept    = 0;
    long       matched = 0;
    long       k;

    // Sort, then compact in place so the first `kept` slots are distinct.
    // Only an element that differs from its successor is copied, so the
    // final element of the sorted array is never kept.
    Sort_quicksort(values, 20, sizeof(uint64_t),
                   S_compare_u64s, NULL);
    for (k = 1; k < 20; k++) {
        if (values[k - 1] != values[k]) {
            values[kept] = values[k - 1];
            kept++;
        }
    }

    // Set one bit per distinct value.
    for (k = 0; k < kept; k++) {
        BitVec_Set(bits, (uint32_t)values[k]);
    }

    // Count how many leading array entries agree with the source values.
    as_array = BitVec_To_Array(bits);
    while (matched < kept
           && I32Arr_Get(as_array, matched) == (int32_t)values[matched]
          ) {
        matched++;
    }
    TEST_INT_EQ(batch, matched, kept, "To_Array (%ld == %ld)", matched,
                kept);

    DECREF(as_array);
    DECREF(bits);
    FREEMEM(values);
}
/* Initialise a PolyPostingList for `field`.
 *
 * Asks each PostingsReader in `readers` for a posting list on `field`.  Each
 * non-NULL result gets the matching entry of `starts` as its doc base and is
 * stored in self->sub_plists.  Returns `self`.
 */
PolyPostingList*
PolyPList_init(PolyPostingList *self, const CharBuf *field, 
                VArray *readers, I32Array *starts)
{
    const u32_t reader_count = VA_Get_Size(readers);
    u32_t tick;

    /* Iteration state starts out empty. */
    self->tick    = 0;
    self->current = NULL;

    /* Keep a private copy of the field name. */
    self->field = CB_Clone(field);

    /* Collect the sub-posting-lists, skipping readers that have none. */
    self->sub_plists = VA_new(reader_count);
    for (tick = 0; tick < reader_count; tick++) {
        PostingsReader *const post_reader = (PostingsReader*)ASSERT_IS_A(
            VA_Fetch(readers, tick), POSTINGSREADER);
        i32_t doc_base = I32Arr_Get(starts, tick);
        SegPostingList *seg_plist = (SegPostingList*)PostReader_Posting_List(
            post_reader, field, NULL);

        if (!seg_plist) {
            continue;
        }
        ASSERT_IS_A(seg_plist, SEGPOSTINGLIST);
        SegPList_Set_Doc_Base(seg_plist, doc_base);
        VA_Push(self->sub_plists, (Obj*)seg_plist);
    }
    self->num_subs = VA_Get_Size(self->sub_plists);

    return self;
}
Exemple #8
0
// Bump the segment's doc count by the number of surviving documents in
// `reader`.  The count is derived here -- the reader's doc max minus every
// zero entry in `doc_map` from slot 1 onwards -- instead of being taken from
// the SegReader, because the DeletionsWriter and the SegReader are probably
// out of sync.
static void
S_adjust_doc_id(SegWriter *self, SegReader *reader, I32Array *doc_map) {
    SegWriterIVARS *const ivars = SegWriter_IVARS(self);
    int32_t      surviving = SegReader_Doc_Max(reader);
    const size_t map_size  = I32Arr_Get_Size(doc_map);

    // Slot 0 is skipped; a zero at any later slot is subtracted.
    size_t slot = 1;
    while (slot < map_size) {
        if (I32Arr_Get(doc_map, slot) == 0) {
            surviving--;
        }
        slot++;
    }

    Seg_Increment_Count(ivars->segment, surviving);
}
Exemple #9
0
// Fetch the DocVector for a composite doc id by delegating to the
// sub-searcher that holds it.  Throws if that slot has no searcher.
DocVector*
PolySearcher_Fetch_Doc_Vec_IMP(PolySearcher *self, int32_t doc_id) {
    PolySearcherIVARS *const ivars = PolySearcher_IVARS(self);
    const uint32_t slot = PolyReader_sub_tick(ivars->starts, doc_id);
    Searcher *sub_searcher = (Searcher*)VA_Fetch(ivars->searchers, slot);
    const int32_t base = I32Arr_Get(ivars->starts, slot);

    if (sub_searcher == NULL) {
        THROW(ERR, "Invalid doc id: %i32", doc_id);
    }

    // The sub-searcher expects an id relative to its own start.
    const int32_t local_id = doc_id - base;
    return Searcher_Fetch_Doc_Vec(sub_searcher, local_id);
}
Exemple #10
0
// Step to the next doc id.  Returns the doc id at the new position, or 0
// once the end is reached (the tick is then left where it was).
int32_t
MockMatcher_next(MockMatcher* self) {
    MockMatcherIVARS *const ivars = MockMatcher_IVARS(self);

    ivars->tick++;
    if (ivars->tick < (int32_t)ivars->size) {
        return I32Arr_Get(ivars->doc_ids, ivars->tick);
    }

    // Ran off the end: undo the increment and report exhaustion.
    ivars->tick--;
    return 0;
}
Exemple #11
0
// Fetch the DocVector for a composite doc id by delegating to the
// sub-reader that holds it.  Throws if that slot has no reader.
DocVector*
PolyHLReader_fetch_doc_vec(PolyHighlightReader *self, int32_t doc_id) {
    const uint32_t slot = PolyReader_sub_tick(self->offsets, doc_id);
    const int32_t  base = I32Arr_Get(self->offsets, slot);
    HighlightReader *seg_reader
        = (HighlightReader*)VA_Fetch(self->readers, slot);

    if (seg_reader == NULL) {
        THROW(ERR, "Invalid doc_id: %i32", doc_id);
    }

    // The sub-reader expects an id relative to its own offset.
    const int32_t local_id = doc_id - base;
    return HLReader_Fetch_Doc_Vec(seg_reader, local_id);
}
Exemple #12
0
// Fetch the DocVector for a composite doc id by delegating to the
// sub-reader that holds it.  Throws if that slot has no reader.
DocVector*
PolyHLReader_Fetch_Doc_Vec_IMP(PolyHighlightReader *self, int32_t doc_id) {
    PolyHighlightReaderIVARS *const ivars = PolyHLReader_IVARS(self);
    const uint32_t slot = PolyReader_sub_tick(ivars->offsets, doc_id);
    const int32_t  base = I32Arr_Get(ivars->offsets, slot);
    HighlightReader *seg_reader
        = (HighlightReader*)Vec_Fetch(ivars->readers, slot);

    if (seg_reader == NULL) {
        THROW(ERR, "Invalid doc_id: %i32", doc_id);
    }

    // The sub-reader expects an id relative to its own offset.
    const int32_t local_id = doc_id - base;
    return HLReader_Fetch_Doc_Vec(seg_reader, local_id);
}
Exemple #13
0
// Fetch the HitDoc for a composite doc id from the sub-searcher that holds
// it, then stamp the result with the composite id.  Throws if that slot has
// no searcher.
HitDoc*
PolySearcher_Fetch_Doc_IMP(PolySearcher *self, int32_t doc_id) {
    PolySearcherIVARS *const ivars = PolySearcher_IVARS(self);
    const uint32_t slot = PolyReader_sub_tick(ivars->starts, doc_id);
    Searcher *sub_searcher = (Searcher*)VA_Fetch(ivars->searchers, slot);
    const int32_t base = I32Arr_Get(ivars->starts, slot);

    if (sub_searcher == NULL) {
        THROW(ERR, "Invalid doc id: %i32", doc_id);
    }

    // Query with the local id, then overwrite the doc id on the result with
    // the composite one.
    HitDoc *fetched = Searcher_Fetch_Doc(sub_searcher, doc_id - base);
    HitDoc_Set_Doc_ID(fetched, doc_id);
    return fetched;
}
// Mark a composite doc id as deleted.
//
// Locates the segment that owns `doc_id`, converts the id to a
// segment-relative one, and sets the corresponding bit in that segment's
// deletions BitVector.  The segment is flagged as updated only when the bit
// was not already set.
void
DefDelWriter_Delete_By_Doc_ID_IMP(DefaultDeletionsWriter *self, int32_t doc_id) {
    DefaultDeletionsWriterIVARS *const ivars = DefDelWriter_IVARS(self);
    uint32_t   sub_tick   = PolyReader_sub_tick(ivars->seg_starts, doc_id);
    BitVector *bit_vec    = (BitVector*)VA_Fetch(ivars->bit_vecs, sub_tick);
    // Keep the offset signed so the subtraction below is plain int32_t
    // arithmetic rather than an unsigned-to-signed round trip.
    int32_t    offset     = I32Arr_Get(ivars->seg_starts, sub_tick);
    int32_t    seg_doc_id = doc_id - offset;

    if (!BitVec_Get(bit_vec, seg_doc_id)) {
        ivars->updated[sub_tick] = true;
        BitVec_Set(bit_vec, seg_doc_id);
    }
}
Exemple #15
0
// Run `query` against every sub-searcher in turn.  Each sub-searcher collects
// into an OffsetCollector built from `collector` and that sub-searcher's
// start.
void
PolySearcher_Collect_IMP(PolySearcher *self, Query *query,
                         Collector *collector) {
    PolySearcherIVARS *const ivars = PolySearcher_IVARS(self);
    VArray *const sub_searchers = ivars->searchers;
    I32Array *bases = ivars->starts;
    const uint32_t num_searchers = VA_Get_Size(sub_searchers);

    uint32_t tick = 0;
    while (tick < num_searchers) {
        const int32_t base = I32Arr_Get(bases, tick);
        Searcher *sub_searcher = (Searcher*)VA_Fetch(sub_searchers, tick);

        // The wrapper lives only for the duration of this sub-search.
        OffsetCollector *shifted = OffsetColl_new(collector, base);
        Searcher_Collect(sub_searcher, query, (Collector*)shifted);
        DECREF(shifted);

        tick++;
    }
}
// Reload the buffer with values from the sort cache.
//
// Reads from the current run position until the run is finished or the
// counter reaches the memory threshold.  Docs whose ordinal is the null
// ordinal, or which `doc_map` maps to 0, are skipped.  Once the run is
// finished the sort cache and the sorted id list are released.
//
// Returns the number of items added, or 0 when there is no sort cache.
// Throws if the buffer still holds items on entry.
uint32_t
SortFieldWriter_Refill_IMP(SortFieldWriter *self) {
    SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self);
    if (ivars->sort_cache == NULL) {
        return 0;
    }

    // Sanity check, then reset the buffer and prepare to start loading items.
    const uint32_t leftover = SortFieldWriter_Buffer_Count(self);
    if (leftover != 0) {
        THROW(ERR, "Refill called but buffer contains %u32 items",
              leftover);
    }
    SortFieldWriter_Clear_Buffer(self);
    Counter_Reset(ivars->counter);
    S_lazy_init_sorted_ids(self);

    SortCache *const cache      = ivars->sort_cache;
    I32Array *const  id_map     = ivars->doc_map;
    const int32_t    no_value   = ivars->null_ord;
    uint32_t         num_added  = 0;

    for (;
         ivars->run_tick <= ivars->run_max
         && Counter_Get_Value(ivars->counter) < ivars->mem_thresh;
         ivars->run_tick++
        ) {
        const int32_t old_doc_id = ivars->sorted_ids[ivars->run_tick];
        const int32_t ord        = SortCache_Ordinal(cache, old_doc_id);
        if (ord == no_value) {
            continue;
        }

        // Without a doc map the id is used unchanged.
        int32_t new_doc_id = old_doc_id;
        if (id_map) {
            new_doc_id = I32Arr_Get(id_map, old_doc_id);
        }
        if (!new_doc_id) {
            continue;
        }

        Obj *value = SortCache_Value(cache, ord);
        SortFieldWriter_Add(self, new_doc_id, value);
        num_added++;
        DECREF(value);
    }

    // Run finished: release the source data.
    if (ivars->run_tick > ivars->run_max) {
        DECREF(ivars->sort_cache);
        ivars->sort_cache = NULL;
        FREEMEM(ivars->sorted_ids);
        ivars->sorted_ids = NULL;
    }

    return num_added;
}
Exemple #17
0
// Fetch the HitDoc for a composite doc id from the segment reader that holds
// it, then stamp the result with the composite id.  Throws if that slot has
// no reader.
HitDoc*
PolyDocReader_Fetch_Doc_IMP(PolyDocReader *self, int32_t doc_id) {
    PolyDocReaderIVARS *const ivars = PolyDocReader_IVARS(self);
    const uint32_t slot = PolyReader_sub_tick(ivars->offsets, doc_id);
    const int32_t  base = I32Arr_Get(ivars->offsets, slot);
    DocReader *seg_reader = (DocReader*)Vec_Fetch(ivars->readers, slot);
    HitDoc *result = NULL;

    if (seg_reader != NULL) {
        // Query with the local id, then overwrite the doc id on the result
        // with the composite one.
        result = DocReader_Fetch_Doc(seg_reader, doc_id - base);
        HitDoc_Set_Doc_ID(result, doc_id);
    }
    else {
        THROW(ERR, "Invalid doc_id: %i32", doc_id);
    }

    return result;
}
/* Fetch the hit for a composite doc id from the segment reader that holds
 * it.  The segment's own offset is subtracted from `doc_id` and added to
 * `offset` before delegating.  Throws if that slot has no reader.
 */
Obj*
PolyDocReader_fetch(PolyDocReader *self, i32_t doc_id, float score, 
                    i32_t offset)
{
    const u32_t slot = PolyReader_sub_tick(self->offsets, doc_id);
    const i32_t base = I32Arr_Get(self->offsets, slot);
    DocReader *seg_reader = (DocReader*)VA_Fetch(self->readers, slot);
    Obj *result = NULL;

    if (seg_reader != NULL) {
        result = DocReader_Fetch(seg_reader, doc_id - base, score,
            offset + base);
    }
    else {
        THROW("Invalid doc_id: %i32", doc_id);
    }

    return result;
}
Exemple #19
0
// Gather the top `num_wanted` hits for `query` across all sub-searchers.
//
// Each sub-searcher is asked for its own top docs, and their doc ids are
// adjusted by that sub-searcher's start.  The results go through a shared
// HitQueue, which is built with the schema and `sort_spec` only when a
// `sort_spec` is supplied.  The returned TopDocs carries the summed total
// hit count.
TopDocs*
PolySearcher_Top_Docs_IMP(PolySearcher *self, Query *query,
                          uint32_t num_wanted, SortSpec *sort_spec) {
    PolySearcherIVARS *const ivars = PolySearcher_IVARS(self);
    Schema   *schema        = PolySearcher_Get_Schema(self);
    VArray   *sub_searchers = ivars->searchers;
    I32Array *bases         = ivars->starts;
    uint32_t  hit_total     = 0;

    HitQueue *queue;
    if (sort_spec) {
        queue = HitQ_new(schema, sort_spec, num_wanted);
    }
    else {
        queue = HitQ_new(NULL, NULL, num_wanted);
    }

    // Reuse the query if it is already a Compiler; otherwise compile it once
    // up front so every sub-searcher shares the same Compiler.
    Compiler *compiler;
    if (Query_Is_A(query, COMPILER)) {
        compiler = ((Compiler*)INCREF(query));
    }
    else {
        compiler = Query_Make_Compiler(query, (Searcher*)self,
                                       Query_Get_Boost(query),
                                       false);
    }

    const uint32_t num_searchers = VA_Get_Size(sub_searchers);
    for (uint32_t tick = 0; tick < num_searchers; tick++) {
        Searcher *sub_searcher = (Searcher*)VA_Fetch(sub_searchers, tick);
        int32_t   base         = I32Arr_Get(bases, tick);
        TopDocs  *sub_top      = Searcher_Top_Docs(sub_searcher,
                                                   (Query*)compiler,
                                                   num_wanted, sort_spec);
        VArray   *sub_matches  = TopDocs_Get_Match_Docs(sub_top);

        hit_total += TopDocs_Get_Total_Hits(sub_top);

        // Adjust the doc ids by this sub-searcher's start, then feed them to
        // the queue.  The first rejection ends this sub-searcher's
        // contribution.
        S_modify_doc_ids(sub_matches, base);
        const uint32_t num_matches = VA_Get_Size(sub_matches);
        uint32_t m = 0;
        while (m < num_matches) {
            MatchDoc *match_doc = (MatchDoc*)VA_Fetch(sub_matches, m);
            if (!HitQ_Insert(queue, INCREF(match_doc))) {
                break;
            }
            m++;
        }

        DECREF(sub_top);
    }

    VArray  *winners = HitQ_Pop_All(queue);
    TopDocs *result  = TopDocs_new(winners, hit_total);

    DECREF(winners);
    DECREF(compiler);
    DECREF(queue);
    return result;
}
/* Advance the series to `target` and return the resulting doc id, or 0 when
 * the series is exhausted.
 *
 * While `target` is at or beyond the current matcher's range, the function
 * steps to the next non-NULL matcher (skipping empty slots) and retries.
 * Otherwise it delegates to the current matcher with the target made
 * relative to that matcher's offset; if the matcher reports no hit, the
 * search resumes at the start of the next matcher's range.
 */
i32_t
SeriesMatcher_advance(SeriesMatcher *self, i32_t target) 
{
    if (target >= self->next_offset) {
        /* Proceed to next matcher or bail. */
        if (self->tick < self->num_matchers) {
            while (1) {
                /* The last matcher's range is open-ended. */
                u32_t next_offset = self->tick + 1 == self->num_matchers
                    ? I32_MAX 
                    : I32Arr_Get(self->offsets, self->tick + 1);
                self->current_matcher = (Matcher*)VA_Fetch(self->matchers,
                    self->tick);
                self->current_offset = self->next_offset;
                self->next_offset = next_offset;
                self->doc_id = next_offset - 1;
                self->tick++;
                /* Stop on a usable matcher, or when slots run out. */
                if (   self->current_matcher != NULL 
                    || self->tick >= self->num_matchers
                ) {
                    break;
                }
            } 
            return SeriesMatcher_advance(self, target); /* Recurse. */
        }
        else {
            /* We're done. */
            self->doc_id = 0;
            return 0;
        }
    }
    else if (self->current_matcher == NULL) {
        /* The skip loop above can leave current_matcher NULL when the
         * trailing slots were all empty.  There is nothing left to consult,
         * so report exhaustion instead of dispatching on a NULL matcher. */
        self->doc_id = 0;
        return 0;
    }
    else {
        i32_t target_minus_offset = target - self->current_offset;
        i32_t found 
            = Matcher_Advance(self->current_matcher, target_minus_offset);
        if (found) {
            self->doc_id = found + self->current_offset;
            return self->doc_id;
        }
        else {
            return SeriesMatcher_advance(self, self->next_offset); /* Recurse. */
        }
    }
}
// Copy the highlight records of an existing segment into this writer's
// output.
//
// For every doc id from 1 to the reader's doc max, the raw record is appended
// to the data stream and its starting file position is written to the index
// stream.  When `doc_map` is supplied, docs it maps to 0 (deleted) are
// skipped.  An empty segment is a no-op.
void
HLWriter_Add_Segment_IMP(HighlightWriter *self, SegReader *reader,
                         I32Array *doc_map) {
    HighlightWriterIVARS *const ivars = HLWriter_IVARS(self);
    const int32_t doc_max = SegReader_Doc_Max(reader);

    // Bail if the supplied segment is empty.
    if (doc_max == 0) {
        return;
    }

    DefaultHighlightReader *hl_reader
        = (DefaultHighlightReader*)CERTIFY(
              SegReader_Obtain(reader, Class_Get_Name(HIGHLIGHTREADER)),
              DEFAULTHIGHLIGHTREADER);
    OutStream *dat_out = S_lazy_init(self);
    OutStream *ix_out  = ivars->ix_out;
    ByteBuf   *record  = BB_new(0);

    for (int32_t doc_id = 1; doc_id <= doc_max; doc_id++) {
        // Without a doc map every doc is kept; otherwise keep only docs that
        // map to a non-zero id.
        if (doc_map == NULL || I32Arr_Get(doc_map, doc_id)) {
            // Write file pointer.
            OutStream_Write_I64(ix_out, OutStream_Tell(dat_out));

            // Copy the raw record.
            DefHLReader_Read_Record(hl_reader, doc_id, record);
            OutStream_Write_Bytes(dat_out, BB_Get_Buf(record),
                                  BB_Get_Size(record));

            // Empty the scratch buffer for the next record.
            BB_Set_Size(record, 0);
        }
    }

    DECREF(record);
}
Exemple #22
0
// Copy the stored-document records of an existing segment into this writer's
// output.
//
// For every doc id from 1 to the reader's doc max, the raw record is appended
// to the data stream and its starting file position is written to the index
// stream.  When `doc_map` is supplied, docs it maps to 0 (deleted) are
// skipped; a NULL `doc_map` keeps every doc.  An empty segment is a no-op.
void
DocWriter_Add_Segment_IMP(DocWriter *self, SegReader *reader,
                          I32Array *doc_map) {
    DocWriterIVARS *const ivars = DocWriter_IVARS(self);
    const int32_t doc_max = SegReader_Doc_Max(reader);

    if (doc_max == 0) {
        // Bail if the supplied segment is empty.
        return;
    }
    else {
        OutStream *const dat_out = S_lazy_init(self);
        OutStream *const ix_out  = ivars->ix_out;
        ByteBuf   *const buffer  = BB_new(0);
        DefaultDocReader *const doc_reader
            = (DefaultDocReader*)CERTIFY(
                  SegReader_Obtain(reader, VTable_Get_Name(DOCREADER)),
                  DEFAULTDOCREADER);

        for (int32_t i = 1; i <= doc_max; i++) {
            // Skip deleted docs.  A NULL doc_map means nothing was deleted.
            if (doc_map && !I32Arr_Get(doc_map, i)) {
                continue;
            }

            // Remember where this record starts before writing it.
            int64_t start = OutStream_Tell(dat_out);

            // Copy record over.
            DefDocReader_Read_Record(doc_reader, buffer, i);
            char  *buf  = BB_Get_Buf(buffer);
            size_t size = BB_Get_Size(buffer);
            OutStream_Write_Bytes(dat_out, buf, size);

            // Write file pointer.
            OutStream_Write_I64(ix_out, start);
        }

        DECREF(buffer);
    }
}
/* Copy the highlight records of an existing segment into this writer's
 * output.
 *
 * For every doc id from 1 to the reader's doc max, the raw record is
 * appended to the data stream and its starting file position is written to
 * the index stream.  When `doc_map` is supplied, docs it maps to 0 (deleted)
 * are skipped.  An empty segment is a no-op.
 */
void
HLWriter_add_segment(HighlightWriter *self, SegReader *reader, 
                     I32Array *doc_map)
{
    i32_t doc_max = SegReader_Doc_Max(reader);

    /* Nothing to do for an empty segment. */
    if (doc_max != 0) {
        DefaultHighlightReader *hl_reader = (DefaultHighlightReader*)
            ASSERT_IS_A(SegReader_Obtain(reader, HIGHLIGHTREADER.name),
            DEFAULTHIGHLIGHTREADER);
        OutStream *dat_out = S_lazy_init(self);
        OutStream *ix_out  = self->ix_out;
        ByteBuf   *record  = BB_new(0);
        i32_t      doc_id;

        for (doc_id = 1; doc_id <= doc_max; doc_id++) {
            /* Without a doc map every doc is kept; otherwise keep only docs
             * that map to a non-zero id. */
            if (doc_map == NULL || I32Arr_Get(doc_map, doc_id)) {
                /* Write file pointer. */
                OutStream_Write_U64( ix_out, OutStream_Tell(dat_out) );

                /* Copy the raw record. */
                DefHLReader_Read_Record(hl_reader, doc_id, record);
                OutStream_Write_Bytes(dat_out, record->ptr, record->size);

                /* Empty the scratch buffer for the next record. */
                record->size = 0;
            }
        }
        DECREF(record);
    }
}
Exemple #24
0
// Run one SeriesMatcher scenario: generate doc ids and segment offsets with
// the given increments, iterate the matcher, and check that it yields exactly
// the generated doc ids in order.
static void
S_do_test_matrix(TestBatch *batch, int32_t doc_max, int32_t first_doc_id,
                 int32_t doc_inc, int32_t offset_inc) {
    I32Array *doc_ids
        = S_generate_match_list(first_doc_id, doc_max, doc_inc);
    I32Array *offsets
        = S_generate_match_list(0, doc_max, offset_inc);
    SeriesMatcher *series_matcher
        = S_make_series_matcher(doc_ids, offsets, doc_max);
    uint32_t agreed = 0;

    // Count matches until the matcher is exhausted (returns 0) or yields an
    // id that differs from the expected one.
    for (;;) {
        const int32_t got = SeriesMatcher_Next(series_matcher);
        if (got == 0) {
            break;
        }
        if (got != I32Arr_Get(doc_ids, agreed)) {
            break;
        }
        agreed++;
    }

    TEST_INT_EQ(batch, agreed, I32Arr_Get_Size(doc_ids),
                "doc_max=%d first_doc_id=%d doc_inc=%d offset_inc=%d",
                doc_max, first_doc_id, doc_inc, offset_inc);

    DECREF(doc_ids);
    DECREF(offsets);
    DECREF(series_matcher);
}
Exemple #25
0
// Return the doc id stored at the matcher's current tick.
int32_t
MockMatcher_get_doc_id(MockMatcher* self) {
    MockMatcherIVARS *const ivars = MockMatcher_IVARS(self);
    const int32_t current = I32Arr_Get(ivars->doc_ids, ivars->tick);
    return current;
}
Exemple #26
0
// Carry forward deletions that were applied to merged-away segments while
// the background merge was running.
//
// Compares each old segment's deletion count with the count for the segment
// of the same name in a freshly opened PolyReader.  For segments that appear
// in ivars->doc_maps and whose counts differ, the new deletions are remapped
// through the segment's doc map and applied to the merged segment by writing
// a new segment via a SegWriter.
//
// Returns false when no merged-away segment has a changed deletion count,
// true after the new segment has been finished.  Throws if the merged
// segment cannot be located or a merged-away segment is missing from the
// fresh reader (via CERTIFY).
static bool
S_merge_updated_deletions(BackgroundMerger *self) {
    BackgroundMergerIVARS *const ivars = BGMerger_IVARS(self);
    Hash *updated_deletions = NULL;

    PolyReader *new_polyreader
        = PolyReader_open((Obj*)ivars->folder, NULL, NULL);
    Vector *new_seg_readers
        = PolyReader_Get_Seg_Readers(new_polyreader);
    Vector *old_seg_readers
        = PolyReader_Get_Seg_Readers(ivars->polyreader);
    Hash *new_segs = Hash_new(Vec_Get_Size(new_seg_readers));

    // Index the fresh seg readers by segment name for lookup below.
    for (uint32_t i = 0, max = Vec_Get_Size(new_seg_readers); i < max; i++) {
        SegReader *seg_reader = (SegReader*)Vec_Fetch(new_seg_readers, i);
        String    *seg_name   = SegReader_Get_Seg_Name(seg_reader);
        Hash_Store(new_segs, seg_name, INCREF(seg_reader));
    }

    // Collect a deletions iterator for every merged-away segment whose
    // deletion count has changed.
    for (uint32_t i = 0, max = Vec_Get_Size(old_seg_readers); i < max; i++) {
        SegReader *seg_reader = (SegReader*)Vec_Fetch(old_seg_readers, i);
        String    *seg_name   = SegReader_Get_Seg_Name(seg_reader);

        // If this segment was merged away...
        if (Hash_Fetch(ivars->doc_maps, seg_name)) {
            SegReader *new_seg_reader
                = (SegReader*)CERTIFY(
                      Hash_Fetch(new_segs, seg_name),
                      SEGREADER);
            int32_t old_del_count = SegReader_Del_Count(seg_reader);
            int32_t new_del_count = SegReader_Del_Count(new_seg_reader);
            // ... were any new deletions applied against it?
            if (old_del_count != new_del_count) {
                DeletionsReader *del_reader
                    = (DeletionsReader*)SegReader_Obtain(
                          new_seg_reader,
                          Class_Get_Name(DELETIONSREADER));
                // Created lazily so that NULL means "nothing changed".
                if (!updated_deletions) {
                    updated_deletions = Hash_new(max);
                }
                Hash_Store(updated_deletions, seg_name,
                           (Obj*)DelReader_Iterator(del_reader));
            }
        }
    }

    DECREF(new_polyreader);
    DECREF(new_segs);

    if (!updated_deletions) {
        return false;
    }
    else {
        PolyReader *merge_polyreader
            = PolyReader_open((Obj*)ivars->folder, ivars->snapshot, NULL);
        Vector *merge_seg_readers
            = PolyReader_Get_Seg_Readers(merge_polyreader);
        Snapshot *latest_snapshot
            = Snapshot_Read_File(Snapshot_new(), ivars->folder, NULL);
        // The new segment is numbered one past the highest in the latest
        // snapshot.
        int64_t new_seg_num
            = IxManager_Highest_Seg_Num(ivars->manager, latest_snapshot) + 1;
        Segment   *new_segment = Seg_new(new_seg_num);
        SegWriter *seg_writer  = SegWriter_new(ivars->schema, ivars->snapshot,
                                               new_segment, merge_polyreader);
        DeletionsWriter *del_writer = SegWriter_Get_Del_Writer(seg_writer);
        int64_t  merge_seg_num = Seg_Get_Number(ivars->segment);
        // INT32_MAX doubles as the "not found" sentinel for both values.
        uint32_t seg_tick      = INT32_MAX;
        int32_t  offset        = INT32_MAX;

        SegWriter_Prep_Seg_Dir(seg_writer);

        // Find the merged segment within the merge reader and look up its
        // doc id offset.
        for (uint32_t i = 0, max = Vec_Get_Size(merge_seg_readers); i < max; i++) {
            SegReader *seg_reader
                = (SegReader*)Vec_Fetch(merge_seg_readers, i);
            if (SegReader_Get_Seg_Num(seg_reader) == merge_seg_num) {
                I32Array *offsets = PolyReader_Offsets(merge_polyreader);
                seg_tick = i;
                offset = I32Arr_Get(offsets, seg_tick);
                DECREF(offsets);
            }
        }
        if (offset == INT32_MAX) { THROW(ERR, "Failed sanity check"); }

        // Replay each changed segment's deletions against the merged segment.
        HashIterator *iter = HashIter_new(updated_deletions);
        while (HashIter_Next(iter)) {
            String  *seg_name  = HashIter_Get_Key(iter);
            Matcher *deletions = (Matcher*)HashIter_Get_Value(iter);

            I32Array *doc_map
                = (I32Array*)CERTIFY(
                      Hash_Fetch(ivars->doc_maps, seg_name),
                      I32ARRAY);
            int32_t del;
            while (0 != (del = Matcher_Next(deletions))) {
                // Find the slot where the deleted doc resides in the
                // rewritten segment. If the doc was already deleted when we
                // were merging, do nothing.
                int32_t remapped = I32Arr_Get(doc_map, del);
                if (remapped) {
                    // It's a new deletion, so carry it forward and zap it in
                    // the rewritten segment.
                    DelWriter_Delete_By_Doc_ID(del_writer, remapped + offset);
                }
            }
        }
        DECREF(iter);

        // Finish the segment and clean up.
        DelWriter_Finish(del_writer);
        SegWriter_Finish(seg_writer);
        DECREF(seg_writer);
        DECREF(new_segment);
        DECREF(latest_snapshot);
        DECREF(merge_polyreader);
        DECREF(updated_deletions);
    }

    return true;
}
Exemple #27
0
// Reload this pool's buffer with RawPostings read from its lexicon and
// posting list.
//
// Reads postings term by term until the lexicon is exhausted or the fresh
// MemoryPool has consumed at least `mem_thresh` with at least one item
// buffered.  When a doc map is present, postings whose doc maps to 0 are
// dropped and the others get their remapped doc id.
//
// Returns the number of postings buffered, or 0 when the pool has no
// lexicon.  Throws if the buffer still holds unread items on entry.
uint32_t
PostPool_Refill_IMP(PostingPool *self) {
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);
    Lexicon *const     lexicon     = ivars->lexicon;
    PostingList *const plist       = ivars->plist;
    I32Array    *const doc_map     = ivars->doc_map;
    const uint32_t     mem_thresh  = ivars->mem_thresh;
    const int32_t      doc_base    = ivars->doc_base;
    uint32_t           num_elems   = 0; // number of items recovered
    String            *term_text   = NULL;

    // Resume with the lexicon's current term; post_count may still be
    // non-zero from a previous call.
    if (ivars->lexicon == NULL) { return 0; }
    else { term_text = (String*)Lex_Get_Term(lexicon); }

    // Make sure buffer is empty.
    if (ivars->buf_max - ivars->buf_tick > 0) {
        THROW(ERR, "Refill called but buffer contains %u32 items",
              ivars->buf_max - ivars->buf_tick);
    }
    ivars->buf_max  = 0;
    ivars->buf_tick = 0;

    // Ditch old MemoryPool and get another.
    DECREF(ivars->mem_pool);
    ivars->mem_pool = MemPool_new(0);
    MemoryPool *const mem_pool = ivars->mem_pool;
    MemoryPoolIVARS *const mem_pool_ivars = MemPool_IVARS(mem_pool);


    while (1) {
        // post_count holds the number of postings still unread for the
        // current term.
        if (ivars->post_count == 0) {
            // Read a term.
            if (Lex_Next(lexicon)) {
                ivars->post_count = Lex_Doc_Freq(lexicon);
                term_text = (String*)Lex_Get_Term(lexicon);
                if (term_text && !Obj_Is_A((Obj*)term_text, STRING)) {
                    THROW(ERR, "Only String terms are supported for now");
                }
                // Reset the doc id on the posting and in last_doc_id to
                // doc_base at the start of each term.
                Posting *posting = PList_Get_Posting(plist);
                Post_Set_Doc_ID(posting, doc_base);
                ivars->last_doc_id = doc_base;
            }
            // Bail if we've read everything in this run.
            else {
                break;
            }
        }

        // Bail if we've hit the ceiling for this run's buffer.
        // (Requiring num_elems > 0 guarantees at least one item per call.)
        if (mem_pool_ivars->consumed >= mem_thresh && num_elems > 0) {
            break;
        }

        // Read a posting from the input stream.
        RawPosting *rawpost
            = PList_Read_Raw(plist, ivars->last_doc_id, term_text, mem_pool);
        RawPostingIVARS *const rawpost_ivars = RawPost_IVARS(rawpost);
        ivars->last_doc_id = rawpost_ivars->doc_id;
        ivars->post_count--;

        // Skip deletions.
        if (doc_map != NULL) {
            // The doc map is indexed by doc id relative to doc_base.
            const int32_t remapped
                = I32Arr_Get(doc_map, rawpost_ivars->doc_id - doc_base);
            if (!remapped) {
                continue;
            }
            rawpost_ivars->doc_id = remapped;
        }

        // Add to the run's buffer.
        if (num_elems >= ivars->buf_cap) {
            size_t new_cap = Memory_oversize(num_elems + 1, sizeof(Obj*));
            PostPool_Grow_Buffer(self, new_cap);
        }
        ivars->buffer[num_elems] = (Obj*)rawpost;
        num_elems++;
    }

    // Reset the buffer array position and length; remember file pos.
    ivars->buf_max   = num_elems;
    ivars->buf_tick  = 0;

    return num_elems;
}
Exemple #28
0
// Produce one Span per occurrence of the whole phrase in `doc_vec`.
//
// Phase 1 intersects term positions: the first term's positions seed a
// BitVector, and each later term's positions, shifted back by the term's
// index in the phrase, are ANDed in.  A bit that survives marks a position
// where every term appears at the right distance from the phrase start.
//
// Phase 2 turns each surviving position into a Span running from the first
// term's start offset to the last term's end offset.
//
// Returns an empty array when the phrase has no terms, `field` does not
// match the query's field, or any term has no term vector.  `searcher` is
// unused.
VArray*
PhraseCompiler_highlight_spans(PhraseCompiler *self, Searcher *searcher, 
                               DocVector *doc_vec, const CharBuf *field)
{
    PhraseQuery *const parent = (PhraseQuery*)self->parent;
    VArray      *const terms  = parent->terms;
    VArray      *const spans  = VA_new(0);
    VArray      *term_vectors;
    BitVector   *posit_vec;
    BitVector   *other_posit_vec;
    uint32_t     i;
    const uint32_t  num_terms = VA_Get_Size(terms);
    uint32_t     num_tvs;
    UNUSED_VAR(searcher);

    // Bail if no terms or field doesn't match.
    if (!num_terms) { return spans; }
    if (!CB_Equals(field, (Obj*)parent->field)) { return spans; }

    term_vectors    = VA_new(num_terms);
    posit_vec       = BitVec_new(0);
    other_posit_vec = BitVec_new(0);
    for (i = 0; i < num_terms; i++) {
        Obj *term = VA_Fetch(terms, i);
        TermVector *term_vector 
            = DocVec_Term_Vector(doc_vec, field, (CharBuf*)term);

        // Bail if any term is missing.
        if (!term_vector)
            break;

        VA_Push(term_vectors, (Obj*)term_vector);

        if (i == 0) {
            // Set initial positions from first term.
            uint32_t j;
            I32Array *positions = TV_Get_Positions(term_vector);
            for (j = I32Arr_Get_Size(positions); j > 0; j--) {
                BitVec_Set(posit_vec, I32Arr_Get(positions, j - 1));
            }
        }
        else {
            // Filter positions using logical "and".
            uint32_t j;
            I32Array *positions = TV_Get_Positions(term_vector);

            BitVec_Clear_All(other_posit_vec);
            for (j = I32Arr_Get_Size(positions); j > 0; j--) {
                // Shift by the term's index so the position lines up with
                // the phrase start; negative results are discarded.
                int32_t pos = I32Arr_Get(positions, j - 1) - i;
                if (pos >= 0) {
                    BitVec_Set(other_posit_vec, pos);
                }
            }
            BitVec_And(posit_vec, other_posit_vec);
        }
    }

    // Proceed only if all terms are present.
    num_tvs = VA_Get_Size(term_vectors);
    if (num_tvs == num_terms) {
        TermVector *first_tv = (TermVector*)VA_Fetch(term_vectors, 0);
        TermVector *last_tv  
            = (TermVector*)VA_Fetch(term_vectors, num_tvs - 1);
        I32Array *tv_start_positions = TV_Get_Positions(first_tv);
        I32Array *tv_end_positions   = TV_Get_Positions(last_tv);
        I32Array *tv_start_offsets   = TV_Get_Start_Offsets(first_tv);
        I32Array *tv_end_offsets     = TV_Get_End_Offsets(last_tv);
        uint32_t  terms_max          = num_terms - 1;
        I32Array *valid_posits       = BitVec_To_Array(posit_vec);
        uint32_t  num_valid_posits   = I32Arr_Get_Size(valid_posits);
        uint32_t j = 0;
        uint32_t posit_tick;
        float weight = PhraseCompiler_Get_Weight(self);
        // i and j are cursors into the first and last term's position
        // arrays.  They are NOT reset per valid position: each scan resumes
        // where the previous one stopped.
        i = 0;

        // Add only those starts/ends that belong to a valid position.
        for (posit_tick = 0; posit_tick < num_valid_posits; posit_tick++) {
            int32_t valid_start_posit = I32Arr_Get(valid_posits, posit_tick);
            int32_t valid_end_posit   = valid_start_posit + terms_max;
            // Offsets stay 0 if the scan below finds no matching position.
            int32_t start_offset = 0, end_offset = 0;
            uint32_t max;

            for (max = I32Arr_Get_Size(tv_start_positions); i < max; i++) {
                if (I32Arr_Get(tv_start_positions, i) == valid_start_posit) {
                    start_offset = I32Arr_Get(tv_start_offsets, i);
                    break;
                }
            }
            for (max = I32Arr_Get_Size(tv_end_positions); j < max; j++) {
                if (I32Arr_Get(tv_end_positions, j) == valid_end_posit) {
                    end_offset = I32Arr_Get(tv_end_offsets, j);
                    break;
                }
            }

            VA_Push(spans, (Obj*)Span_new(start_offset, 
                end_offset - start_offset, weight) );

            // Step past the entries just consumed.
            i++, j++;
        }

        DECREF(valid_posits);
    }

    DECREF(other_posit_vec);
    DECREF(posit_vec);
    DECREF(term_vectors);
    return spans;
}