Ejemplo n.º 1
0
static SeriesMatcher*
S_make_series_matcher(I32Array *doc_ids, I32Array *offsets, int32_t doc_max) {
    int32_t  num_doc_ids  = I32Arr_Get_Size(doc_ids);
    int32_t  num_matchers = I32Arr_Get_Size(offsets);
    VArray  *matchers     = VA_new(num_matchers);
    int32_t  tick         = 0;
    int32_t  i;

    // Divvy up doc_ids by segment into BitVectors.
    for (i = 0; i < num_matchers; i++) {
        int32_t offset = I32Arr_Get(offsets, i);
        int32_t max    = i == num_matchers - 1
                         ? doc_max + 1
                         : I32Arr_Get(offsets, i + 1);
        BitVector *bit_vec = BitVec_new(max - offset);
        while (tick < num_doc_ids) {
            int32_t doc_id = I32Arr_Get(doc_ids, tick);
            if (doc_id > max) { break; }
            else               { tick++; }
            BitVec_Set(bit_vec, doc_id - offset);
        }
        VA_Push(matchers, (Obj*)BitVecMatcher_new(bit_vec));
        DECREF(bit_vec);
    }

    SeriesMatcher *series_matcher = SeriesMatcher_new(matchers, offsets);
    DECREF(matchers);
    return series_matcher;
}
Ejemplo n.º 2
0
uint32_t
PolyReader_sub_tick(I32Array *offsets, int32_t doc_id) {
    int32_t size = I32Arr_Get_Size(offsets);
    if (size == 0) {
        return 0;
    }

    int32_t lo = -1;
    int32_t hi = size;
    while (hi - lo > 1) {
        int32_t mid = lo + ((hi - lo) / 2);
        int32_t offset = I32Arr_Get(offsets, mid);
        if (doc_id <= offset) {
            hi = mid;
        }
        else {
            lo = mid;
        }
    }
    if (hi == size) {
        hi--;
    }

    while (hi > 0) {
        int32_t offset = I32Arr_Get(offsets, hi);
        if (doc_id <= offset) {
            hi--;
        }
        else {
            break;
        }
    }

    return hi;
}
Ejemplo n.º 3
0
uint32_t
IxManager_Choose_Sparse_IMP(IndexManager *self, I32Array *doc_counts) {
    UNUSED_VAR(self);
    uint32_t threshold  = 0;
    int32_t total_docs = 0;
    const uint32_t num_candidates = (uint32_t)I32Arr_Get_Size(doc_counts);

    // Find sparsely populated segments.
    for (uint32_t i = 0; i < num_candidates; i++) {
        uint32_t num_segs_when_done = num_candidates - threshold + 1;
        total_docs += I32Arr_Get(doc_counts, i);
        if (total_docs < (int32_t)S_fibonacci(num_segs_when_done + 5)) {
            threshold = i + 1;
        }
    }

    // If recycling, try not to get stuck merging the same big segment over
    // and over on small commits.
    if (threshold == 1 && num_candidates > 2) {
        int32_t this_seg_doc_count = I32Arr_Get(doc_counts, 0);
        int32_t next_seg_doc_count = I32Arr_Get(doc_counts, 1);
        // Try to merge 2 segments worth of stuff, so long as the next segment
        // is less than double the size.
        if (next_seg_doc_count / 2 < this_seg_doc_count) {
            threshold = 2;
        }
    }

    return threshold;
}
Ejemplo n.º 4
0
Vector*
TermCompiler_Highlight_Spans_IMP(TermCompiler *self, Searcher *searcher,
                                 DocVector *doc_vec, String *field) {

    TermCompilerIVARS *const ivars = TermCompiler_IVARS(self);
    TermQueryIVARS *const parent_ivars
        = TermQuery_IVARS((TermQuery*)ivars->parent);
    Vector *spans = Vec_new(0);
    TermVector *term_vector;
    I32Array *starts, *ends;
    UNUSED_VAR(searcher);

    if (!Str_Equals(parent_ivars->field, (Obj*)field)) { return spans; }

    // Add all starts and ends.
    term_vector
        = DocVec_Term_Vector(doc_vec, field, (String*)parent_ivars->term);
    if (!term_vector) { return spans; }

    starts = TV_Get_Start_Offsets(term_vector);
    ends   = TV_Get_End_Offsets(term_vector);
    for (size_t i = 0, max = I32Arr_Get_Size(starts); i < max; i++) {
        int32_t start  = I32Arr_Get(starts, i);
        int32_t length = I32Arr_Get(ends, i) - start;
        Vec_Push(spans,
                (Obj*)Span_new(start, length, TermCompiler_Get_Weight(self)));
    }

    DECREF(term_vector);
    return spans;
}
Ejemplo n.º 5
0
// Adjust current doc id. We create our own doc_count rather than rely on
// SegReader's number because the DeletionsWriter and the SegReader are
// probably out of sync.
static void
S_adjust_doc_id(SegWriter *self, SegReader *reader, I32Array *doc_map) {
    SegWriterIVARS *const ivars = SegWriter_IVARS(self);
    int32_t doc_count = SegReader_Doc_Max(reader);
    for (size_t i = 1, max = I32Arr_Get_Size(doc_map); i < max; i++) {
        if (I32Arr_Get(doc_map, i) == 0) { doc_count--; }
    }
    Seg_Increment_Count(ivars->segment, doc_count);
}
Ejemplo n.º 6
0
MockMatcher*
MockMatcher_init(MockMatcher *self, I32Array *doc_ids, ByteBuf *scores) {
    Matcher_init((Matcher*)self);
    MockMatcherIVARS *const ivars = MockMatcher_IVARS(self);
    ivars->tick    = -1;
    ivars->size    = I32Arr_Get_Size(doc_ids);
    ivars->doc_ids = (I32Array*)INCREF(doc_ids);
    ivars->scores  = (ByteBuf*)INCREF(scores);
    return self;
}
Ejemplo n.º 7
0
TermVector*
TV_init(TermVector *self, const CharBuf *field, const CharBuf *text, 
        I32Array *positions, I32Array *start_offsets, I32Array *end_offsets)
{
    /* Assign. */
    self->field          = CB_Clone(field);
    self->text           = CB_Clone(text);
    self->num_pos        = I32Arr_Get_Size(positions);
    self->positions      = (I32Array*)INCREF(positions);
    self->start_offsets  = (I32Array*)INCREF(start_offsets);
    self->end_offsets    = (I32Array*)INCREF(end_offsets);

    if (   I32Arr_Get_Size(start_offsets) != self->num_pos
        || I32Arr_Get_Size(end_offsets)   != self->num_pos
    ) {
        THROW("Unbalanced arrays: %u32 %u32 %u32", self->num_pos,
           I32Arr_Get_Size(start_offsets), I32Arr_Get_Size(end_offsets));
    }
    
    return self;
}
Ejemplo n.º 8
0
TermVector*
TV_init(TermVector *self, String *field, String *text,
        I32Array *positions, I32Array *start_offsets, I32Array *end_offsets) {
    TermVectorIVARS *const ivars = TV_IVARS(self);

    // Assign.
    ivars->field          = Str_Clone(field);
    ivars->text           = Str_Clone(text);
    ivars->num_pos        = I32Arr_Get_Size(positions);
    ivars->positions      = (I32Array*)INCREF(positions);
    ivars->start_offsets  = (I32Array*)INCREF(start_offsets);
    ivars->end_offsets    = (I32Array*)INCREF(end_offsets);

    if (I32Arr_Get_Size(start_offsets) != ivars->num_pos
        || I32Arr_Get_Size(end_offsets) != ivars->num_pos
       ) {
        THROW(ERR, "Unbalanced arrays: %u64 %u64 %u64",
              (uint64_t)ivars->num_pos,
              (uint64_t)I32Arr_Get_Size(start_offsets),
              (uint64_t)I32Arr_Get_Size(end_offsets));
    }

    return self;
}
Ejemplo n.º 9
0
SeriesMatcher*
SeriesMatcher_init(SeriesMatcher *self, VArray *matchers, I32Array *offsets)
{
    Matcher_init((Matcher*)self);

    /* Init. */
    self->current_matcher = NULL;
    self->current_offset  = 0;
    self->next_offset     = 0;
    self->doc_id          = 0;
    self->tick            = 0;

    /* Assign. */
    self->matchers        = (VArray*)INCREF(matchers);
    self->offsets         = (I32Array*)INCREF(offsets);

    /* Derive. */
    self->num_matchers    = (i32_t)I32Arr_Get_Size(offsets);

    return self;
}
Ejemplo n.º 10
0
static void
S_do_test_matrix(TestBatch *batch, int32_t doc_max, int32_t first_doc_id,
                 int32_t doc_inc, int32_t offset_inc) {
    I32Array *doc_ids
        = S_generate_match_list(first_doc_id, doc_max, doc_inc);
    I32Array *offsets
        = S_generate_match_list(0, doc_max, offset_inc);
    SeriesMatcher *series_matcher
        = S_make_series_matcher(doc_ids, offsets, doc_max);
    uint32_t num_in_agreement = 0;
    int32_t got;

    while (0 != (got = SeriesMatcher_Next(series_matcher))) {
        if (got != I32Arr_Get(doc_ids, num_in_agreement)) { break; }
        num_in_agreement++;
    }
    TEST_INT_EQ(batch, num_in_agreement, I32Arr_Get_Size(doc_ids),
                "doc_max=%d first_doc_id=%d doc_inc=%d offset_inc=%d",
                doc_max, first_doc_id, doc_inc, offset_inc);

    DECREF(doc_ids);
    DECREF(offsets);
    DECREF(series_matcher);
}
Ejemplo n.º 11
0
VArray*
PhraseCompiler_highlight_spans(PhraseCompiler *self, Searcher *searcher, 
                               DocVector *doc_vec, const CharBuf *field)
{
    PhraseQuery *const parent = (PhraseQuery*)self->parent;
    VArray      *const terms  = parent->terms;
    VArray      *const spans  = VA_new(0);
    VArray      *term_vectors;
    BitVector   *posit_vec;
    BitVector   *other_posit_vec;
    uint32_t     i;
    const uint32_t  num_terms = VA_Get_Size(terms);
    uint32_t     num_tvs;
    UNUSED_VAR(searcher);

    // Bail if no terms or field doesn't match. 
    if (!num_terms) { return spans; }
    if (!CB_Equals(field, (Obj*)parent->field)) { return spans; }

    term_vectors    = VA_new(num_terms);
    posit_vec       = BitVec_new(0);
    other_posit_vec = BitVec_new(0);
    for (i = 0; i < num_terms; i++) {
        Obj *term = VA_Fetch(terms, i);
        TermVector *term_vector 
            = DocVec_Term_Vector(doc_vec, field, (CharBuf*)term);

        // Bail if any term is missing. 
        if (!term_vector)
            break;

        VA_Push(term_vectors, (Obj*)term_vector);

        if (i == 0) {
            // Set initial positions from first term. 
            uint32_t j;
            I32Array *positions = TV_Get_Positions(term_vector);
            for (j = I32Arr_Get_Size(positions); j > 0; j--) {
                BitVec_Set(posit_vec, I32Arr_Get(positions, j - 1));
            }
        }
        else {
            // Filter positions using logical "and". 
            uint32_t j;
            I32Array *positions = TV_Get_Positions(term_vector);

            BitVec_Clear_All(other_posit_vec);
            for (j = I32Arr_Get_Size(positions); j > 0; j--) {
                int32_t pos = I32Arr_Get(positions, j - 1) - i;
                if (pos >= 0) {
                    BitVec_Set(other_posit_vec, pos);
                }
            }
            BitVec_And(posit_vec, other_posit_vec);
        }
    }

    // Proceed only if all terms are present. 
    num_tvs = VA_Get_Size(term_vectors);
    if (num_tvs == num_terms) {
        TermVector *first_tv = (TermVector*)VA_Fetch(term_vectors, 0);
        TermVector *last_tv  
            = (TermVector*)VA_Fetch(term_vectors, num_tvs - 1);
        I32Array *tv_start_positions = TV_Get_Positions(first_tv);
        I32Array *tv_end_positions   = TV_Get_Positions(last_tv);
        I32Array *tv_start_offsets   = TV_Get_Start_Offsets(first_tv);
        I32Array *tv_end_offsets     = TV_Get_End_Offsets(last_tv);
        uint32_t  terms_max          = num_terms - 1;
        I32Array *valid_posits       = BitVec_To_Array(posit_vec);
        uint32_t  num_valid_posits   = I32Arr_Get_Size(valid_posits);
        uint32_t j = 0;
        uint32_t posit_tick;
        float weight = PhraseCompiler_Get_Weight(self);
        i = 0;

        // Add only those starts/ends that belong to a valid position. 
        for (posit_tick = 0; posit_tick < num_valid_posits; posit_tick++) {
            int32_t valid_start_posit = I32Arr_Get(valid_posits, posit_tick);
            int32_t valid_end_posit   = valid_start_posit + terms_max;
            int32_t start_offset = 0, end_offset = 0;
            uint32_t max;

            for (max = I32Arr_Get_Size(tv_start_positions); i < max; i++) {
                if (I32Arr_Get(tv_start_positions, i) == valid_start_posit) {
                    start_offset = I32Arr_Get(tv_start_offsets, i);
                    break;
                }
            }
            for (max = I32Arr_Get_Size(tv_end_positions); j < max; j++) {
                if (I32Arr_Get(tv_end_positions, j) == valid_end_posit) {
                    end_offset = I32Arr_Get(tv_end_offsets, j);
                    break;
                }
            }

            VA_Push(spans, (Obj*)Span_new(start_offset, 
                end_offset - start_offset, weight) );

            i++, j++;
        }

        DECREF(valid_posits);
    }

    DECREF(other_posit_vec);
    DECREF(posit_vec);
    DECREF(term_vectors);
    return spans;
}