Exemplo n.º 1
0
TermVector*
TV_deserialize(TermVector *self, InStream *instream)
{
    u32_t i;
    CharBuf *field = (CharBuf*)CB_deserialize(NULL, instream);
    CharBuf *text  = (CharBuf*)CB_deserialize(NULL, instream);
    u32_t num_pos  = InStream_Read_C32(instream);
    i32_t *posits, *starts, *ends;
    I32Array *positions, *start_offsets, *end_offsets;

    /* Read positional data. */
    posits    = MALLOCATE(num_pos, i32_t);
    starts    = MALLOCATE(num_pos, i32_t);
    ends      = MALLOCATE(num_pos, i32_t);
    for (i = 0; i < num_pos; i++) {
        posits[i] = InStream_Read_C32(instream);
        starts[i] = InStream_Read_C32(instream);
        ends[i]   = InStream_Read_C32(instream);
    }
    positions     = I32Arr_new_steal(posits, num_pos);
    start_offsets = I32Arr_new_steal(starts, num_pos);
    end_offsets   = I32Arr_new_steal(ends, num_pos);
    
    self = self ? self : (TermVector*)VTable_Make_Obj(&TERMVECTOR);
    self = TV_init(self, field, text, positions, start_offsets, end_offsets);

    DECREF(positions);
    DECREF(start_offsets);
    DECREF(end_offsets);
    DECREF(text);
    DECREF(field);

    return self;
}
Exemplo n.º 2
0
TermVector*
TV_Deserialize_IMP(TermVector *self, InStream *instream) {
    String *field = Freezer_read_string(instream);
    String *text  = Freezer_read_string(instream);
    size_t  num_pos = InStream_Read_C64(instream);

    // Read positional data.
    int32_t *posits = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
    int32_t *starts = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
    int32_t *ends   = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
    for (size_t i = 0; i < num_pos; i++) {
        posits[i] = InStream_Read_C32(instream);
        starts[i] = InStream_Read_C32(instream);
        ends[i]   = InStream_Read_C32(instream);
    }
    I32Array *positions     = I32Arr_new_steal(posits, num_pos);
    I32Array *start_offsets = I32Arr_new_steal(starts, num_pos);
    I32Array *end_offsets   = I32Arr_new_steal(ends, num_pos);

    TV_init(self, field, text, positions, start_offsets, end_offsets);

    DECREF(positions);
    DECREF(start_offsets);
    DECREF(end_offsets);
    DECREF(text);
    DECREF(field);

    return self;
}
Exemplo n.º 3
0
PolySearcher*
PolySearcher_init(PolySearcher *self, Schema *schema, VArray *searchers) {
    const uint32_t num_searchers = VA_Get_Size(searchers);
    int32_t *starts_array = (int32_t*)MALLOCATE(num_searchers * sizeof(int32_t));
    int32_t  doc_max      = 0;

    Searcher_init((Searcher*)self, schema);
    PolySearcherIVARS *const ivars = PolySearcher_IVARS(self);
    ivars->searchers = (VArray*)INCREF(searchers);
    ivars->starts = NULL; // Safe cleanup.

    for (uint32_t i = 0; i < num_searchers; i++) {
        Searcher *searcher
            = (Searcher*)CERTIFY(VA_Fetch(searchers, i), SEARCHER);
        Schema *candidate       = Searcher_Get_Schema(searcher);
        Class  *orig_class      = Schema_Get_Class(schema);
        Class  *candidate_class = Schema_Get_Class(candidate);

        // Confirm that searchers all use the same schema.
        if (orig_class != candidate_class) {
            THROW(ERR, "Conflicting schemas: '%o', '%o'",
                  Schema_Get_Class_Name(schema),
                  Schema_Get_Class_Name(candidate));
        }

        // Derive doc_max and relative start offsets.
        starts_array[i] = (int32_t)doc_max;
        doc_max += Searcher_Doc_Max(searcher);
    }

    ivars->doc_max = doc_max;
    ivars->starts  = I32Arr_new_steal(starts_array, num_searchers);

    return self;
}
Exemplo n.º 4
0
I32Array*
SegReader_offsets(SegReader *self)
{
    i32_t *ints = CALLOCATE(1, i32_t);
    UNUSED_VAR(self);
    return I32Arr_new_steal(ints, 1);
}
Exemplo n.º 5
0
PolyReader*
PolyReader_init(PolyReader *self, Schema *schema, Folder *folder,
                Snapshot *snapshot, IndexManager *manager,
                VArray *sub_readers) {
    PolyReaderIVARS *const ivars = PolyReader_IVARS(self);
    ivars->doc_max    = 0;
    ivars->del_count  = 0;

    if (sub_readers) {
        uint32_t num_segs = VA_Get_Size(sub_readers);
        VArray *segments = VA_new(num_segs);
        for (uint32_t i = 0; i < num_segs; i++) {
            SegReader *seg_reader
                = (SegReader*)CERTIFY(VA_Fetch(sub_readers, i), SEGREADER);
            VA_Push(segments, INCREF(SegReader_Get_Segment(seg_reader)));
        }
        IxReader_init((IndexReader*)self, schema, folder, snapshot,
                      segments, -1, manager);
        DECREF(segments);
        S_init_sub_readers(self, sub_readers);
    }
    else {
        IxReader_init((IndexReader*)self, schema, folder, snapshot,
                      NULL, -1, manager);
        ivars->sub_readers = VA_new(0);
        ivars->offsets = I32Arr_new_steal(NULL, 0);
    }

    return self;
}
Exemplo n.º 6
0
Vector*
IxManager_Recycle_IMP(IndexManager *self, PolyReader *reader,
                      DeletionsWriter *del_writer, int64_t cutoff,
                      bool optimize) {
    Vector *seg_readers = PolyReader_Get_Seg_Readers(reader);
    size_t num_seg_readers = Vec_Get_Size(seg_readers);
    SegReader **candidates
        = (SegReader**)MALLOCATE(num_seg_readers * sizeof(SegReader*));
    size_t num_candidates = 0;
    for (size_t i = 0; i < num_seg_readers; i++) {
        SegReader *seg_reader = (SegReader*)Vec_Fetch(seg_readers, i);
        if (SegReader_Get_Seg_Num(seg_reader) > cutoff) {
            candidates[num_candidates++] = seg_reader;
        }
    }

    Vector *recyclables = Vec_new(num_candidates);

    if (optimize) {
        for (size_t i = 0; i < num_candidates; i++) {
            Vec_Push(recyclables, INCREF(candidates[i]));
        }
        FREEMEM(candidates);
        return recyclables;
    }

    // Sort by ascending size in docs, choose sparsely populated segments.
    qsort(candidates, num_candidates, sizeof(SegReader*), S_compare_doc_count);
    int32_t *counts = (int32_t*)MALLOCATE(num_candidates * sizeof(int32_t));
    for (uint32_t i = 0; i < num_candidates; i++) {
        counts[i] = SegReader_Doc_Count(candidates[i]);
    }
    I32Array *doc_counts = I32Arr_new_steal(counts, num_candidates);
    uint32_t threshold = IxManager_Choose_Sparse(self, doc_counts);
    DECREF(doc_counts);

    // Move SegReaders to be recycled.
    for (uint32_t i = 0; i < threshold; i++) {
        Vec_Store(recyclables, i, INCREF(candidates[i]));
    }

    // Find segments where at least 10% of all docs have been deleted.
    for (uint32_t i = threshold; i < num_candidates; i++) {
        SegReader *seg_reader = candidates[i];
        String    *seg_name   = SegReader_Get_Seg_Name(seg_reader);
        double doc_max = SegReader_Doc_Max(seg_reader);
        double num_deletions = DelWriter_Seg_Del_Count(del_writer, seg_name);
        double del_proportion = num_deletions / doc_max;
        if (del_proportion >= 0.1) {
            Vec_Push(recyclables, INCREF(seg_reader));
        }
    }

    FREEMEM(candidates);
    return recyclables;
}
Exemplo n.º 7
0
static TermVector*
S_extract_tv_from_tv_buf(String *field, String *term_text,
                         ByteBuf *tv_buf) {
    TermVector *retval      = NULL;
    const char *posdata     = BB_Get_Buf(tv_buf);
    const char *posdata_end = posdata + BB_Get_Size(tv_buf);
    int32_t    *positions   = NULL;
    int32_t    *starts      = NULL;
    int32_t    *ends        = NULL;
    uint32_t    num_pos     = 0;

    if (posdata != posdata_end) {
        num_pos   = NumUtil_decode_c32(&posdata);
        positions = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
        starts    = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
        ends      = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
    }

    // Expand C32s.
    for (uint32_t i = 0; i < num_pos; i++) {
        positions[i] = NumUtil_decode_c32(&posdata);
        starts[i]    = NumUtil_decode_c32(&posdata);
        ends[i]      = NumUtil_decode_c32(&posdata);
    }

    if (posdata != posdata_end) {
        THROW(ERR, "Bad encoding of posdata");
    }
    else {
        I32Array *posits_map = I32Arr_new_steal(positions, num_pos);
        I32Array *starts_map = I32Arr_new_steal(starts, num_pos);
        I32Array *ends_map   = I32Arr_new_steal(ends, num_pos);
        retval = TV_new(field, term_text, posits_map, starts_map, ends_map);
        DECREF(posits_map);
        DECREF(starts_map);
        DECREF(ends_map);
    }

    return retval;
}
Exemplo n.º 8
0
static void
S_init_sub_readers(PolyReader *self, VArray *sub_readers) {
    PolyReaderIVARS *const ivars = PolyReader_IVARS(self);
    uint32_t  num_sub_readers = VA_Get_Size(sub_readers);
    int32_t *starts = (int32_t*)MALLOCATE(num_sub_readers * sizeof(int32_t));
    Hash  *data_readers = Hash_new(0);

    DECREF(ivars->sub_readers);
    DECREF(ivars->offsets);
    ivars->sub_readers       = (VArray*)INCREF(sub_readers);

    // Accumulate doc_max, subreader start offsets, and DataReaders.
    ivars->doc_max = 0;
    for (uint32_t i = 0; i < num_sub_readers; i++) {
        SegReader *seg_reader = (SegReader*)VA_Fetch(sub_readers, i);
        Hash *components = SegReader_Get_Components(seg_reader);
        CharBuf *api;
        DataReader *component;
        starts[i] = ivars->doc_max;
        ivars->doc_max += SegReader_Doc_Max(seg_reader);
        Hash_Iterate(components);
        while (Hash_Next(components, (Obj**)&api, (Obj**)&component)) {
            VArray *readers = (VArray*)Hash_Fetch(data_readers, (Obj*)api);
            if (!readers) {
                readers = VA_new(num_sub_readers);
                Hash_Store(data_readers, (Obj*)api, (Obj*)readers);
            }
            VA_Store(readers, i, INCREF(component));
        }
    }
    ivars->offsets = I32Arr_new_steal(starts, num_sub_readers);

    CharBuf *api;
    VArray  *readers;
    Hash_Iterate(data_readers);
    while (Hash_Next(data_readers, (Obj**)&api, (Obj**)&readers)) {
        DataReader *datareader
            = (DataReader*)CERTIFY(S_first_non_null(readers), DATAREADER);
        DataReader *aggregator
            = DataReader_Aggregator(datareader, readers, ivars->offsets);
        if (aggregator) {
            CERTIFY(aggregator, DATAREADER);
            Hash_Store(ivars->components, (Obj*)api, (Obj*)aggregator);
        }
    }
    DECREF(data_readers);

    DeletionsReader *del_reader
        = (DeletionsReader*)Hash_Fetch(
              ivars->components, (Obj*)VTable_Get_Name(DELETIONSREADER));
    ivars->del_count = del_reader ? DelReader_Del_Count(del_reader) : 0;
}
Exemplo n.º 9
0
static I32Array*
S_generate_match_list(int32_t first, int32_t max, int32_t doc_inc) {
    int32_t  count     = (int32_t)ceil(((float)max - first) / doc_inc);
    int32_t *doc_ids   = (int32_t*)MALLOCATE(count * sizeof(int32_t));
    int32_t  doc_id    = first;
    int32_t  i         = 0;

    for (; doc_id < max; doc_id += doc_inc, i++) {
        doc_ids[i] = doc_id;
    }
    if (i != count) { THROW(ERR, "Screwed up somehow: %i32 %i32", i, count); }

    return I32Arr_new_steal(doc_ids, count);
}
Exemplo n.º 10
0
VArray*
IxManager_recycle(IndexManager *self, PolyReader *reader, 
                  DeletionsWriter *del_writer, int64_t cutoff, bool_t optimize)
{
    VArray *seg_readers = PolyReader_Get_Seg_Readers(reader);
    VArray *candidates  = VA_Gather(seg_readers, S_check_cutoff, &cutoff);
    VArray *recyclables = VA_new(VA_Get_Size(candidates));
    const uint32_t num_candidates = VA_Get_Size(candidates);

    if (optimize) { 
        DECREF(recyclables);
        return candidates; 
    }

    // Sort by ascending size in docs, choose sparsely populated segments.
    VA_Sort(candidates, S_compare_doc_count, NULL);
    int32_t *counts = (int32_t*)MALLOCATE(num_candidates * sizeof(int32_t));
    for (uint32_t i = 0; i < num_candidates; i++) {
        SegReader *seg_reader = (SegReader*)CERTIFY(
            VA_Fetch(candidates, i), SEGREADER);
        counts[i] = SegReader_Doc_Count(seg_reader);
    }
    I32Array *doc_counts = I32Arr_new_steal(counts, num_candidates);
    uint32_t threshold = IxManager_Choose_Sparse(self, doc_counts);
    DECREF(doc_counts);

    // Move SegReaders to be recycled.
    for (uint32_t i = 0; i < threshold; i++) {
        VA_Store(recyclables, i, VA_Delete(candidates, i));
    }

    // Find segments where at least 10% of all docs have been deleted. 
    for (uint32_t i = threshold; i < num_candidates; i++) {
        SegReader *seg_reader = (SegReader*)VA_Delete(candidates, i);
        CharBuf   *seg_name   = SegReader_Get_Seg_Name(seg_reader);
        double doc_max = SegReader_Doc_Max(seg_reader);
        double num_deletions = DelWriter_Seg_Del_Count(del_writer, seg_name);
        double del_proportion = num_deletions / doc_max;
        if (del_proportion >= 0.1) {
            VA_Push(recyclables, (Obj*)seg_reader);
        }
        else {
            DECREF(seg_reader);
        }
    }

    DECREF(candidates);
    return recyclables;
}
Exemplo n.º 11
0
I32Array*
DelWriter_Generate_Doc_Map_IMP(DeletionsWriter *self, Matcher *deletions,
                               int32_t doc_max, int32_t offset) {
    int32_t *doc_map = (int32_t*)CALLOCATE(doc_max + 1, sizeof(int32_t));
    int32_t  next_deletion = deletions ? Matcher_Next(deletions) : INT32_MAX;
    UNUSED_VAR(self);

    // 0 for a deleted doc, a new number otherwise
    for (int32_t i = 1, new_doc_id = 1; i <= doc_max; i++) {
        if (i == next_deletion) {
            next_deletion = Matcher_Next(deletions);
        }
        else {
            doc_map[i] = offset + new_doc_id++;
        }
    }

    return I32Arr_new_steal(doc_map, doc_max + 1);
}
Exemplo n.º 12
0
I32Array*
SegReader_Offsets_IMP(SegReader *self) {
    int32_t *ints = (int32_t*)CALLOCATE(1, sizeof(int32_t));
    UNUSED_VAR(self);
    return I32Arr_new_steal(ints, 1);
}