Ejemplo n.º 1
0
void
PostPool_Add_Segment_IMP(PostingPool *self, SegReader *reader,
                         I32Array *doc_map, int32_t doc_base) {
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);
    LexiconReader *lex_reader = (LexiconReader*)SegReader_Fetch(
                                    reader, Class_Get_Name(LEXICONREADER));
    Lexicon *lexicon = lex_reader
                       ? LexReader_Lexicon(lex_reader, ivars->field, NULL)
                       : NULL;

    if (lexicon) {
        PostingListReader *plist_reader
            = (PostingListReader*)SegReader_Fetch(
                  reader, Class_Get_Name(POSTINGLISTREADER));
        PostingList *plist = plist_reader
                             ? PListReader_Posting_List(plist_reader, ivars->field, NULL)
                             : NULL;
        if (!plist) {
            THROW(ERR, "Got a Lexicon but no PostingList for '%o' in '%o'",
                  ivars->field, SegReader_Get_Seg_Name(reader));
        }
        PostingPool *run
            = PostPool_new(ivars->schema, ivars->snapshot, ivars->segment,
                           ivars->polyreader, ivars->field, ivars->lex_writer,
                           ivars->mem_pool, ivars->lex_temp_out,
                           ivars->post_temp_out, ivars->skip_out);
        PostingPoolIVARS *const run_ivars = PostPool_IVARS(run);
        run_ivars->lexicon  = lexicon;
        run_ivars->plist    = plist;
        run_ivars->doc_base = doc_base;
        run_ivars->doc_map  = (I32Array*)INCREF(doc_map);
        PostPool_Add_Run(self, (SortExternal*)run);
    }
}
Ejemplo n.º 2
0
Vector*
IxManager_Recycle_IMP(IndexManager *self, PolyReader *reader,
                      DeletionsWriter *del_writer, int64_t cutoff,
                      bool optimize) {
    Vector *seg_readers = PolyReader_Get_Seg_Readers(reader);
    size_t num_seg_readers = Vec_Get_Size(seg_readers);
    SegReader **candidates
        = (SegReader**)MALLOCATE(num_seg_readers * sizeof(SegReader*));
    size_t num_candidates = 0;
    for (size_t i = 0; i < num_seg_readers; i++) {
        SegReader *seg_reader = (SegReader*)Vec_Fetch(seg_readers, i);
        if (SegReader_Get_Seg_Num(seg_reader) > cutoff) {
            candidates[num_candidates++] = seg_reader;
        }
    }

    Vector *recyclables = Vec_new(num_candidates);

    if (optimize) {
        for (size_t i = 0; i < num_candidates; i++) {
            Vec_Push(recyclables, INCREF(candidates[i]));
        }
        FREEMEM(candidates);
        return recyclables;
    }

    // Sort by ascending size in docs, choose sparsely populated segments.
    qsort(candidates, num_candidates, sizeof(SegReader*), S_compare_doc_count);
    int32_t *counts = (int32_t*)MALLOCATE(num_candidates * sizeof(int32_t));
    for (uint32_t i = 0; i < num_candidates; i++) {
        counts[i] = SegReader_Doc_Count(candidates[i]);
    }
    I32Array *doc_counts = I32Arr_new_steal(counts, num_candidates);
    uint32_t threshold = IxManager_Choose_Sparse(self, doc_counts);
    DECREF(doc_counts);

    // Move SegReaders to be recycled.
    for (uint32_t i = 0; i < threshold; i++) {
        Vec_Store(recyclables, i, INCREF(candidates[i]));
    }

    // Find segments where at least 10% of all docs have been deleted.
    for (uint32_t i = threshold; i < num_candidates; i++) {
        SegReader *seg_reader = candidates[i];
        String    *seg_name   = SegReader_Get_Seg_Name(seg_reader);
        double doc_max = SegReader_Doc_Max(seg_reader);
        double num_deletions = DelWriter_Seg_Del_Count(del_writer, seg_name);
        double del_proportion = num_deletions / doc_max;
        if (del_proportion >= 0.1) {
            Vec_Push(recyclables, INCREF(seg_reader));
        }
    }

    FREEMEM(candidates);
    return recyclables;
}
Ejemplo n.º 3
0
VArray*
IxManager_recycle(IndexManager *self, PolyReader *reader, 
                  DeletionsWriter *del_writer, int64_t cutoff, bool_t optimize)
{
    VArray *seg_readers = PolyReader_Get_Seg_Readers(reader);
    VArray *candidates  = VA_Gather(seg_readers, S_check_cutoff, &cutoff);
    VArray *recyclables = VA_new(VA_Get_Size(candidates));
    const uint32_t num_candidates = VA_Get_Size(candidates);

    if (optimize) { 
        DECREF(recyclables);
        return candidates; 
    }

    // Sort by ascending size in docs, choose sparsely populated segments.
    VA_Sort(candidates, S_compare_doc_count, NULL);
    int32_t *counts = (int32_t*)MALLOCATE(num_candidates * sizeof(int32_t));
    for (uint32_t i = 0; i < num_candidates; i++) {
        SegReader *seg_reader = (SegReader*)CERTIFY(
            VA_Fetch(candidates, i), SEGREADER);
        counts[i] = SegReader_Doc_Count(seg_reader);
    }
    I32Array *doc_counts = I32Arr_new_steal(counts, num_candidates);
    uint32_t threshold = IxManager_Choose_Sparse(self, doc_counts);
    DECREF(doc_counts);

    // Move SegReaders to be recycled.
    for (uint32_t i = 0; i < threshold; i++) {
        VA_Store(recyclables, i, VA_Delete(candidates, i));
    }

    // Find segments where at least 10% of all docs have been deleted. 
    for (uint32_t i = threshold; i < num_candidates; i++) {
        SegReader *seg_reader = (SegReader*)VA_Delete(candidates, i);
        CharBuf   *seg_name   = SegReader_Get_Seg_Name(seg_reader);
        double doc_max = SegReader_Doc_Max(seg_reader);
        double num_deletions = DelWriter_Seg_Del_Count(del_writer, seg_name);
        double del_proportion = num_deletions / doc_max;
        if (del_proportion >= 0.1) {
            VA_Push(recyclables, (Obj*)seg_reader);
        }
        else {
            DECREF(seg_reader);
        }
    }

    DECREF(candidates);
    return recyclables;
}
Ejemplo n.º 4
0
static uint32_t
S_maybe_merge(BackgroundMerger *self) {
    BackgroundMergerIVARS *const ivars = BGMerger_IVARS(self);
    Vector *to_merge = IxManager_Recycle(ivars->manager, ivars->polyreader,
                                         ivars->del_writer, 0, ivars->optimize);
    int32_t num_to_merge = Vec_Get_Size(to_merge);

    // There's no point in merging one segment if it has no deletions, because
    // we'd just be rewriting it. */
    if (num_to_merge == 1) {
        SegReader *seg_reader = (SegReader*)Vec_Fetch(to_merge, 0);
        if (!SegReader_Del_Count(seg_reader)) {
            DECREF(to_merge);
            return 0;
        }
    }
    else if (num_to_merge == 0) {
        DECREF(to_merge);
        return 0;
    }

    // Now that we're sure we're writing a new segment, prep the seg dir.
    SegWriter_Prep_Seg_Dir(ivars->seg_writer);

    // Consolidate segments.
    for (uint32_t i = 0, max = num_to_merge; i < max; i++) {
        SegReader *seg_reader = (SegReader*)Vec_Fetch(to_merge, i);
        String    *seg_name   = SegReader_Get_Seg_Name(seg_reader);
        int64_t    doc_count  = Seg_Get_Count(ivars->segment);
        Matcher *deletions
            = DelWriter_Seg_Deletions(ivars->del_writer, seg_reader);
        I32Array *doc_map = DelWriter_Generate_Doc_Map(
                                ivars->del_writer, deletions,
                                SegReader_Doc_Max(seg_reader),
                                (int32_t)doc_count);

        Hash_Store(ivars->doc_maps, seg_name, (Obj*)doc_map);
        SegWriter_Merge_Segment(ivars->seg_writer, seg_reader, doc_map);
        DECREF(deletions);
    }

    DECREF(to_merge);
    return num_to_merge;
}
Ejemplo n.º 5
0
DefaultDeletionsWriter*
DefDelWriter_init(DefaultDeletionsWriter *self, Schema *schema,
                  Snapshot *snapshot, Segment *segment,
                  PolyReader *polyreader) {

    DataWriter_init((DataWriter*)self, schema, snapshot, segment, polyreader);
    DefaultDeletionsWriterIVARS *const ivars = DefDelWriter_IVARS(self);
    ivars->seg_readers          = PolyReader_Seg_Readers(polyreader);
    uint32_t num_seg_readers    = VA_Get_Size(ivars->seg_readers);
    ivars->seg_starts           = PolyReader_Offsets(polyreader);
    ivars->bit_vecs             = VA_new(num_seg_readers);
    ivars->updated              = (bool*)CALLOCATE(num_seg_readers, sizeof(bool));
    ivars->searcher             = IxSearcher_new((Obj*)polyreader);
    ivars->name_to_tick         = Hash_new(num_seg_readers);

    // Materialize a BitVector of deletions for each segment.
    for (uint32_t i = 0; i < num_seg_readers; i++) {
        SegReader *seg_reader = (SegReader*)VA_Fetch(ivars->seg_readers, i);
        BitVector *bit_vec    = BitVec_new(SegReader_Doc_Max(seg_reader));
        DeletionsReader *del_reader
            = (DeletionsReader*)SegReader_Fetch(
                  seg_reader, Class_Get_Name(DELETIONSREADER));
        Matcher *seg_dels = del_reader
                            ? DelReader_Iterator(del_reader)
                            : NULL;

        if (seg_dels) {
            int32_t del;
            while (0 != (del = Matcher_Next(seg_dels))) {
                BitVec_Set(bit_vec, del);
            }
            DECREF(seg_dels);
        }
        VA_Store(ivars->bit_vecs, i, (Obj*)bit_vec);
        Hash_Store(ivars->name_to_tick,
                   (Obj*)SegReader_Get_Seg_Name(seg_reader),
                   (Obj*)Int32_new(i));
    }

    return self;
}
Ejemplo n.º 6
0
static bool_t
S_maybe_merge(Indexer *self, VArray *seg_readers)
{
    bool_t    merge_happened  = false;
    uint32_t  num_seg_readers = VA_Get_Size(seg_readers);
    Lock     *merge_lock      = IxManager_Make_Merge_Lock(self->manager);
    bool_t    got_merge_lock  = Lock_Obtain(merge_lock);
    int64_t   cutoff;
    VArray   *to_merge;
    uint32_t  i, max;

    if (got_merge_lock) {
        self->merge_lock = merge_lock;
        cutoff = 0;
    }
    else {
        // If something else holds the merge lock, don't interfere. 
        Hash *merge_data = IxManager_Read_Merge_Data(self->manager);
        if (merge_data) {
            Obj *cutoff_obj = Hash_Fetch_Str(merge_data, "cutoff", 6);
            if (cutoff_obj) {
                cutoff = Obj_To_I64(cutoff_obj);
            }
            else {
                cutoff = I64_MAX;
            }
            DECREF(merge_data);
        }
        else {
            cutoff = I64_MAX;
        }
        DECREF(merge_lock);
    }

    // Get a list of segments to recycle.  Validate and confirm that there are
    // no dupes in the list.
    to_merge = IxManager_Recycle(self->manager, self->polyreader, 
        self->del_writer, cutoff, self->optimize);
    {
        Hash *seen = Hash_new(VA_Get_Size(to_merge));
        for (i = 0, max = VA_Get_Size(to_merge); i < max; i++) {
            SegReader *seg_reader = (SegReader*)CERTIFY(
                VA_Fetch(to_merge, i), SEGREADER);
            CharBuf *seg_name = SegReader_Get_Seg_Name(seg_reader);
            if (Hash_Fetch(seen, (Obj*)seg_name)) {
                DECREF(seen);
                DECREF(to_merge);
                THROW(ERR, "Recycle() tried to merge segment '%o' twice",
                    seg_name);
            }
            Hash_Store(seen, (Obj*)seg_name, INCREF(&EMPTY));
        }
        DECREF(seen);
    }

    // Consolidate segments if either sparse or optimizing forced. 
    for (i = 0, max = VA_Get_Size(to_merge); i < max; i++) {
        SegReader *seg_reader = (SegReader*)VA_Fetch(to_merge, i);
        int64_t seg_num = SegReader_Get_Seg_Num(seg_reader);
        Matcher *deletions 
            = DelWriter_Seg_Deletions(self->del_writer, seg_reader);
        I32Array *doc_map = DelWriter_Generate_Doc_Map(self->del_writer,
            deletions, SegReader_Doc_Max(seg_reader),
            (int32_t)Seg_Get_Count(self->segment) 
        );
        if (seg_num <= cutoff) {
            THROW(ERR, "Segment %o violates cutoff (%i64 <= %i64)",
                SegReader_Get_Seg_Name(seg_reader), seg_num, cutoff);
        }
        SegWriter_Merge_Segment(self->seg_writer, seg_reader, doc_map);
        merge_happened = true;
        DECREF(deletions);
        DECREF(doc_map);
    }

    // Write out new deletions. 
    if (DelWriter_Updated(self->del_writer)) {
        // Only write out if they haven't all been applied. 
        if (VA_Get_Size(to_merge) != num_seg_readers) {
            DelWriter_Finish(self->del_writer);
        }
    }

    DECREF(to_merge);
    return merge_happened;
}
Ejemplo n.º 7
0
static bool
S_merge_updated_deletions(BackgroundMerger *self) {
    BackgroundMergerIVARS *const ivars = BGMerger_IVARS(self);
    Hash *updated_deletions = NULL;

    PolyReader *new_polyreader
        = PolyReader_open((Obj*)ivars->folder, NULL, NULL);
    Vector *new_seg_readers
        = PolyReader_Get_Seg_Readers(new_polyreader);
    Vector *old_seg_readers
        = PolyReader_Get_Seg_Readers(ivars->polyreader);
    Hash *new_segs = Hash_new(Vec_Get_Size(new_seg_readers));

    for (uint32_t i = 0, max = Vec_Get_Size(new_seg_readers); i < max; i++) {
        SegReader *seg_reader = (SegReader*)Vec_Fetch(new_seg_readers, i);
        String    *seg_name   = SegReader_Get_Seg_Name(seg_reader);
        Hash_Store(new_segs, seg_name, INCREF(seg_reader));
    }

    for (uint32_t i = 0, max = Vec_Get_Size(old_seg_readers); i < max; i++) {
        SegReader *seg_reader = (SegReader*)Vec_Fetch(old_seg_readers, i);
        String    *seg_name   = SegReader_Get_Seg_Name(seg_reader);

        // If this segment was merged away...
        if (Hash_Fetch(ivars->doc_maps, seg_name)) {
            SegReader *new_seg_reader
                = (SegReader*)CERTIFY(
                      Hash_Fetch(new_segs, seg_name),
                      SEGREADER);
            int32_t old_del_count = SegReader_Del_Count(seg_reader);
            int32_t new_del_count = SegReader_Del_Count(new_seg_reader);
            // ... were any new deletions applied against it?
            if (old_del_count != new_del_count) {
                DeletionsReader *del_reader
                    = (DeletionsReader*)SegReader_Obtain(
                          new_seg_reader,
                          Class_Get_Name(DELETIONSREADER));
                if (!updated_deletions) {
                    updated_deletions = Hash_new(max);
                }
                Hash_Store(updated_deletions, seg_name,
                           (Obj*)DelReader_Iterator(del_reader));
            }
        }
    }

    DECREF(new_polyreader);
    DECREF(new_segs);

    if (!updated_deletions) {
        return false;
    }
    else {
        PolyReader *merge_polyreader
            = PolyReader_open((Obj*)ivars->folder, ivars->snapshot, NULL);
        Vector *merge_seg_readers
            = PolyReader_Get_Seg_Readers(merge_polyreader);
        Snapshot *latest_snapshot
            = Snapshot_Read_File(Snapshot_new(), ivars->folder, NULL);
        int64_t new_seg_num
            = IxManager_Highest_Seg_Num(ivars->manager, latest_snapshot) + 1;
        Segment   *new_segment = Seg_new(new_seg_num);
        SegWriter *seg_writer  = SegWriter_new(ivars->schema, ivars->snapshot,
                                               new_segment, merge_polyreader);
        DeletionsWriter *del_writer = SegWriter_Get_Del_Writer(seg_writer);
        int64_t  merge_seg_num = Seg_Get_Number(ivars->segment);
        uint32_t seg_tick      = INT32_MAX;
        int32_t  offset        = INT32_MAX;

        SegWriter_Prep_Seg_Dir(seg_writer);

        for (uint32_t i = 0, max = Vec_Get_Size(merge_seg_readers); i < max; i++) {
            SegReader *seg_reader
                = (SegReader*)Vec_Fetch(merge_seg_readers, i);
            if (SegReader_Get_Seg_Num(seg_reader) == merge_seg_num) {
                I32Array *offsets = PolyReader_Offsets(merge_polyreader);
                seg_tick = i;
                offset = I32Arr_Get(offsets, seg_tick);
                DECREF(offsets);
            }
        }
        if (offset == INT32_MAX) { THROW(ERR, "Failed sanity check"); }

        HashIterator *iter = HashIter_new(updated_deletions);
        while (HashIter_Next(iter)) {
            String  *seg_name  = HashIter_Get_Key(iter);
            Matcher *deletions = (Matcher*)HashIter_Get_Value(iter);

            I32Array *doc_map
                = (I32Array*)CERTIFY(
                      Hash_Fetch(ivars->doc_maps, seg_name),
                      I32ARRAY);
            int32_t del;
            while (0 != (del = Matcher_Next(deletions))) {
                // Find the slot where the deleted doc resides in the
                // rewritten segment. If the doc was already deleted when we
                // were merging, do nothing.
                int32_t remapped = I32Arr_Get(doc_map, del);
                if (remapped) {
                    // It's a new deletion, so carry it forward and zap it in
                    // the rewritten segment.
                    DelWriter_Delete_By_Doc_ID(del_writer, remapped + offset);
                }
            }
        }
        DECREF(iter);

        // Finish the segment and clean up.
        DelWriter_Finish(del_writer);
        SegWriter_Finish(seg_writer);
        DECREF(seg_writer);
        DECREF(new_segment);
        DECREF(latest_snapshot);
        DECREF(merge_polyreader);
        DECREF(updated_deletions);
    }

    return true;
}