Exemplo n.º 1
0
/* Initialize a PolyPostingList for `field`.
 *
 * Each element of `readers` is asked for a posting list on `field`; every
 * posting list that comes back non-NULL has its doc base set from the entry
 * of `starts` at the same index and is appended to self->sub_plists.
 * Returns `self`.
 */
PolyPostingList*
PolyPList_init(PolyPostingList *self, const CharBuf *field,
                VArray *readers, I32Array *starts)
{
    const u32_t reader_count = VA_Get_Size(readers);
    u32_t tick;

    /* Reset iteration state and clone the field name. */
    self->current    = NULL;
    self->tick       = 0;
    self->field      = CB_Clone(field);

    /* Gather the sub-posting-lists, skipping readers that supply none. */
    self->sub_plists = VA_new(reader_count);
    for (tick = 0; tick < reader_count; tick++) {
        PostingsReader *const reader = (PostingsReader*)ASSERT_IS_A(
            VA_Fetch(readers, tick), POSTINGSREADER);
        i32_t doc_base = I32Arr_Get(starts, tick);
        SegPostingList *plist = (SegPostingList*)PostReader_Posting_List(
            reader, field, NULL);

        if (!plist) { continue; }
        ASSERT_IS_A(plist, SEGPOSTINGLIST);
        SegPList_Set_Doc_Base(plist, doc_base);
        VA_Push(self->sub_plists, (Obj*)plist);
    }

    /* Only the lists actually collected count as subs. */
    self->num_subs = VA_Get_Size(self->sub_plists);

    return self;
}
Exemplo n.º 2
0
// Absorb another index into this Indexer.
//
// `index` must be either a Folder or a CharBuf (which is passed to
// FSFolder_new); any other type raises an error.  The other index's schema
// is merged via Schema_Eat, its fields are added to our segment, and each of
// its segment readers is handed to the SegWriter together with a doc map
// generated by the DeletionsWriter.
void
Indexer_add_index(Indexer *self, Obj *index)
{
    Folder *source_folder      = NULL;
    IndexReader *source_reader = NULL;

    // Resolve `index` to a Folder we hold a reference to.
    if (Obj_Is_A(index, FOLDER)) {
        source_folder = (Folder*)INCREF(index);
    }
    else if (Obj_Is_A(index, CHARBUF)) {
        source_folder = (Folder*)FSFolder_new((CharBuf*)index);
    }
    else {
        THROW(ERR, "Invalid type for 'index': %o", Obj_Get_Class_Name(index));
    }

    source_reader = IxReader_open((Obj*)source_folder, NULL, NULL);
    if (source_reader == NULL) {
        THROW(ERR, "Index doesn't seem to contain any data");
    }
    else {
        Schema *own_schema    = self->schema;
        Schema *source_schema = IxReader_Get_Schema(source_reader);
        VArray *source_fields = Schema_All_Fields(source_schema);
        VArray *source_segs   = IxReader_Seg_Readers(source_reader);
        uint32_t tick;
        uint32_t count;

        // Validate schema compatibility and add fields.
        Schema_Eat(own_schema, source_schema);

        // Register every field of the other index with our Segment.
        count = VA_Get_Size(source_fields);
        for (tick = 0; tick < count; tick++) {
            CharBuf *field_name = (CharBuf*)VA_Fetch(source_fields, tick);
            Seg_Add_Field(self->segment, field_name);
        }
        DECREF(source_fields);

        // Feed each segment of the other index to the SegWriter.
        count = VA_Get_Size(source_segs);
        for (tick = 0; tick < count; tick++) {
            SegReader *seg_reader = (SegReader*)VA_Fetch(source_segs, tick);
            DeletionsReader *del_reader = (DeletionsReader*)SegReader_Fetch(
                seg_reader, VTable_Get_Name(DELETIONSREADER));
            Matcher *deletions = NULL;
            I32Array *doc_map;

            // A segment without a DeletionsReader contributes no deletions.
            if (del_reader) {
                deletions = DelReader_Iterator(del_reader);
            }
            doc_map = DelWriter_Generate_Doc_Map(
                self->del_writer, deletions,
                SegReader_Doc_Max(seg_reader),
                (int32_t)Seg_Get_Count(self->segment));
            SegWriter_Add_Segment(self->seg_writer, seg_reader, doc_map);
            DECREF(deletions);
            DECREF(doc_map);
        }
        DECREF(source_segs);
    }

    DECREF(source_reader);
    DECREF(source_folder);
}
Exemplo n.º 3
0
// Flatten a set of spans into the array produced by
// S_flattened_but_empty_spans(), accumulate the weight of every source span
// into the flattened spans it covers, and drop flattened spans whose weight
// is zero.  An empty input yields a new empty array.
VArray*
HeatMap_Flatten_Spans_IMP(HeatMap *self, VArray *spans) {
    const uint32_t span_count = VA_Get_Size(spans);
    UNUSED_VAR(self);

    if (!span_count) {
        return VA_new(0);
    }

    VArray *flat = S_flattened_but_empty_spans(spans);
    const uint32_t flat_count = VA_Get_Size(flat);

    // Walk the source spans; `cursor` only ever moves forward through the
    // flattened array.
    uint32_t cursor = 0;
    for (uint32_t s = 0; s < span_count; s++) {
        Span *src = (Span*)VA_Fetch(spans, s);
        int32_t src_start = Span_Get_Offset(src);
        int32_t src_len   = Span_Get_Length(src);
        int32_t src_end   = src_start + src_len;

        // Advance to the flattened span that begins where the source begins.
        while (cursor < flat_count) {
            Span *probe = (Span*)VA_Fetch(flat, cursor);
            if (Span_Get_Offset(probe) == src_start) {
                break;
            }
            cursor++;
        }

        // Add the source weight to each flattened span up to (but not
        // including) the one starting at the source span's end.
        for (uint32_t d = cursor; d < flat_count; d++) {
            Span *dest = (Span*)VA_Fetch(flat, d);
            if (Span_Get_Offset(dest) == src_end) {
                break;
            }
            float summed = Span_Get_Weight(dest) + Span_Get_Weight(src);
            Span_Set_Weight(dest, summed);
        }
    }

    // Compact in place: keep only spans that received some weight, leaving
    // holes where unscored spans used to be.
    uint32_t kept = 0;
    for (uint32_t i = 0; i < flat_count; i++) {
        Span *candidate = (Span*)VA_Fetch(flat, i);
        if (Span_Get_Weight(candidate)) {
            VA_Store(flat, kept++, INCREF(candidate));
        }
    }
    VA_Excise(flat, kept, flat_count - kept);

    return flat;
}
Exemplo n.º 4
0
/**
 * Delete the entry at `path`; when the entry is a directory, first delete
 * everything beneath it.
 *
 * @param self  Folder that `path` is resolved against.
 * @param path  Entry to delete.  NULL or empty is rejected so that a Folder
 *              can never delete itself.
 * @return the result of the final Folder_Local_Delete() on the entry, or
 *         false when `path` is NULL/empty or no enclosing folder is found.
 */
bool_t
Folder_delete_tree(Folder *self, const CharBuf *path) {
    // Don't allow Folder to delete itself.  Checked before anything else so
    // that a NULL path is never handed to Folder_Enclosing_Folder().
    if (!path || !CB_Get_Size(path)) {
        return false;
    }

    Folder *enclosing_folder = Folder_Enclosing_Folder(self, path);
    if (!enclosing_folder) {
        // Return failure if the entry wasn't there in the first place.
        return false;
    }

    ZombieCharBuf *local = IxFileNames_local_part(path, ZCB_BLANK());
    if (Folder_Local_Is_Directory(enclosing_folder, (CharBuf*)local)) {
        Folder *inner_folder
            = Folder_Local_Find_Folder(enclosing_folder, (CharBuf*)local);
        DirHandle *dh = Folder_Local_Open_Dir(inner_folder);
        if (dh) {
            VArray *files = VA_new(20);
            VArray *dirs  = VA_new(20);
            CharBuf *entry = DH_Get_Entry(dh);

            // Snapshot the listing before deleting anything.  Every entry
            // goes into `files`; non-symlink directories also go into
            // `dirs`.
            while (DH_Next(dh)) {
                VA_Push(files, (Obj*)CB_Clone(entry));
                if (DH_Entry_Is_Dir(dh) && !DH_Entry_Is_Symlink(dh)) {
                    VA_Push(dirs, (Obj*)CB_Clone(entry));
                }
            }

            // Recurse into sub-directories.  Names must come from `dirs`:
            // the loop is bounded by the size of `dirs`, and indexing
            // `files` here would recurse into the wrong entries.
            for (uint32_t i = 0, max = VA_Get_Size(dirs); i < max; i++) {
                CharBuf *name = (CharBuf*)VA_Fetch(dirs, i);
                bool_t success = Folder_Delete_Tree(inner_folder, name);
                if (!success && Folder_Local_Exists(inner_folder, name)) {
                    break;
                }
            }

            // Delete every entry directly inside the directory, stopping at
            // the first one that fails and still exists.
            for (uint32_t i = 0, max = VA_Get_Size(files); i < max; i++) {
                CharBuf *name = (CharBuf*)VA_Fetch(files, i);
                bool_t success = Folder_Local_Delete(inner_folder, name);
                if (!success && Folder_Local_Exists(inner_folder, name)) {
                    break;
                }
            }
            DECREF(dirs);
            DECREF(files);
            DECREF(dh);
        }
    }

    // Finally remove the entry itself from its enclosing folder.
    return Folder_Local_Delete(enclosing_folder, (CharBuf*)local);
}
Exemplo n.º 5
0
/**
 * Choose which segment readers should be recycled (merged away).
 *
 * Candidates are the readers selected by VA_Gather() with S_check_cutoff
 * and `cutoff`.  When `optimize` is true every candidate is returned.
 * Otherwise the candidates are sorted with S_compare_doc_count, the first
 * IxManager_Choose_Sparse() of them are taken unconditionally, and each
 * remaining one is taken only if at least 10% of its docs are deleted.
 *
 * @return a VArray of SegReaders to recycle.
 */
VArray*
IxManager_recycle(IndexManager *self, PolyReader *reader, 
                  DeletionsWriter *del_writer, int64_t cutoff, bool_t optimize)
{
    VArray *seg_readers = PolyReader_Get_Seg_Readers(reader);
    VArray *candidates  = VA_Gather(seg_readers, S_check_cutoff, &cutoff);
    const uint32_t num_candidates = VA_Get_Size(candidates);

    // When optimizing, everything that passed the cutoff gets recycled; no
    // second array is needed, so none is allocated.
    if (optimize) { 
        return candidates; 
    }

    VArray *recyclables = VA_new(num_candidates);

    // Sort by ascending size in docs, choose sparsely populated segments.
    VA_Sort(candidates, S_compare_doc_count, NULL);
    int32_t *counts = (int32_t*)MALLOCATE(num_candidates * sizeof(int32_t));
    for (uint32_t i = 0; i < num_candidates; i++) {
        SegReader *seg_reader = (SegReader*)CERTIFY(
            VA_Fetch(candidates, i), SEGREADER);
        counts[i] = SegReader_Doc_Count(seg_reader);
    }
    // `counts` is handed over to the I32Array and released with it.
    I32Array *doc_counts = I32Arr_new_steal(counts, num_candidates);
    uint32_t threshold = IxManager_Choose_Sparse(self, doc_counts);
    DECREF(doc_counts);

    // Move SegReaders to be recycled.
    for (uint32_t i = 0; i < threshold; i++) {
        VA_Store(recyclables, i, VA_Delete(candidates, i));
    }

    // Find segments where at least 10% of all docs have been deleted. 
    for (uint32_t i = threshold; i < num_candidates; i++) {
        SegReader *seg_reader = (SegReader*)VA_Delete(candidates, i);
        CharBuf   *seg_name   = SegReader_Get_Seg_Name(seg_reader);
        double doc_max = SegReader_Doc_Max(seg_reader);
        double num_deletions = DelWriter_Seg_Del_Count(del_writer, seg_name);
        double del_proportion = num_deletions / doc_max;
        if (del_proportion >= 0.1) {
            VA_Push(recyclables, (Obj*)seg_reader);
        }
        else {
            // Not recycled: drop the reference taken out of `candidates`.
            DECREF(seg_reader);
        }
    }

    DECREF(candidates);
    return recyclables;
}
Exemplo n.º 6
0
// Add the sort data of an existing segment.  For every field in the schema
// for which the segment's SortReader yields a sort cache, the matching
// SortFieldWriter is given the segment, and a flush is scheduled for finish
// time.
void
SortWriter_add_segment(SortWriter *self, SegReader *reader,
                       I32Array *doc_map) {
    SortWriterIVARS *const ivars = SortWriter_IVARS(self);
    VArray *all_fields = Schema_All_Fields(ivars->schema);
    const uint32_t field_count = VA_Get_Size(all_fields);

    // Proceed field-at-a-time, rather than doc-at-a-time.
    for (uint32_t tick = 0; tick < field_count; tick++) {
        CharBuf *field = (CharBuf*)VA_Fetch(all_fields, tick);
        SortReader *sort_reader = (SortReader*)SegReader_Fetch(
                                      reader, VTable_Get_Name(SORTREADER));

        // No SortReader, or no cache for this field: nothing to add.
        if (!sort_reader) { continue; }
        SortCache *cache = SortReader_Fetch_Sort_Cache(sort_reader, field);
        if (!cache) { continue; }

        int32_t field_num = Seg_Field_Num(ivars->segment, field);
        SortFieldWriter *writer = S_lazy_init_field_writer(self, field_num);
        SortFieldWriter_Add_Segment(writer, reader, doc_map, cache);
        ivars->flush_at_finish = true;
    }

    DECREF(all_fields);
}
Exemplo n.º 7
0
// Feed one inverted document to the per-field sort writers.  Each sortable
// field's value is added to its SortFieldWriter; afterwards, if the memory
// pool has consumed more than the threshold, every existing field writer is
// flushed and the pool is released.
void
SortWriter_add_inverted_doc(SortWriter *self, Inverter *inverter,
                            int32_t doc_id) {
    SortWriterIVARS *const ivars = SortWriter_IVARS(self);

    // Inverter_Next() returning 0 ends the iteration.
    Inverter_Iterate(inverter);
    for (int32_t field_num = Inverter_Next(inverter);
         field_num != 0;
         field_num = Inverter_Next(inverter)
        ) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (!FType_Sortable(type)) { continue; }

        SortFieldWriter *writer = S_lazy_init_field_writer(self, field_num);
        SortFieldWriter_Add(writer, doc_id, Inverter_Get_Value(inverter));
    }

    // If our SortFieldWriters have collectively passed the memory threshold,
    // flush all of them, then release all unique values with a single action.
    if (MemPool_Get_Consumed(ivars->mem_pool) > ivars->mem_thresh) {
        uint32_t tick = 0;
        while (tick < VA_Get_Size(ivars->field_writers)) {
            SortFieldWriter *const writer
                = (SortFieldWriter*)VA_Fetch(ivars->field_writers, tick);
            // Slots may be empty, so skip NULL entries.
            if (writer) {
                SortFieldWriter_Flush(writer);
            }
            tick++;
        }
        MemPool_Release_All(ivars->mem_pool);
        ivars->flush_at_finish = true;
    }
}
Exemplo n.º 8
0
/* Fill a RAMFolder by copying every file listed by an FSFolder opened at
 * self->path.  Throws if a file cannot be opened on either side. */
static void
S_read_fsfolder(RAMFolder *self) 
{
    /* Open an FSFolder for reading. */
    FSFolder *disk_folder = FSFolder_new(self->path);
    VArray   *listing     = FSFolder_List(disk_folder);
    const u32_t num_files = VA_Get_Size(listing);
    u32_t tick;

    /* Copy every file in the FSFolder into RAM. */
    for (tick = 0; tick < num_files; tick++) {
        CharBuf   *name = (CharBuf*)VA_Fetch(listing, tick);
        InStream  *in   = FSFolder_Open_In(disk_folder, name);
        OutStream *out  = RAMFolder_Open_Out(self, name);

        /* Both streams are opened before either one is checked. */
        if (!in)  { THROW("Can't open %o", name); }
        if (!out) { THROW("Can't open %o", name); }

        OutStream_Absorb(out, in);
        OutStream_Close(out);
        InStream_Close(in);
        DECREF(out);
        DECREF(in);
    }

    DECREF(listing);
    FSFolder_Close(disk_folder);
    DECREF(disk_folder);
}
Exemplo n.º 9
0
// Run the stemming fixtures in tests.json: for each language key, stem every
// entry of "words" and check the first resulting token against the entry of
// "stems" at the same index.
static void
test_stemming(TestBatchRunner *runner) {
    FSFolder *modules = TestUtils_modules_folder();
    String *json_path = Str_newf("analysis/snowstem/source/test/tests.json");
    Hash *fixtures = (Hash*)Json_slurp_json((Folder*)modules, json_path);
    if (!fixtures) { RETHROW(Err_get_error()); }

    String *lang;
    Hash *entry;
    Hash_Iterate(fixtures);
    while (Hash_Next(fixtures, (Obj**)&lang, (Obj**)&entry)) {
        VArray *inputs   = (VArray*)Hash_Fetch_Utf8(entry, "words", 5);
        VArray *expected = (VArray*)Hash_Fetch_Utf8(entry, "stems", 5);
        SnowballStemmer *stemmer = SnowStemmer_new(lang);
        const uint32_t num_words = VA_Get_Size(inputs);

        for (uint32_t w = 0; w < num_words; w++) {
            String *word   = (String*)VA_Fetch(inputs, w);
            VArray *tokens = SnowStemmer_Split(stemmer, word);
            String *stem   = (String*)VA_Fetch(tokens, 0);
            TEST_TRUE(runner,
                      stem
                      && Str_Is_A(stem, STRING)
                      && Str_Equals(stem, VA_Fetch(expected, w)),
                      "Stem %s: %s", Str_Get_Ptr8(lang), Str_Get_Ptr8(word)
                     );
            DECREF(tokens);
        }
        DECREF(stemmer);
    }

    DECREF(fixtures);
    DECREF(modules);
    DECREF(json_path);
}
Exemplo n.º 10
0
// Initialize a ProximityCompiler: record `within`, run the base Compiler
// initializer with the field's Similarity (falling back to the schema-wide
// one), sum the IDF of every term of the parent query, and derive the raw
// weight as idf * boost.  Returns `self`.
ProximityCompiler*
ProximityCompiler_init(ProximityCompiler *self, ProximityQuery *parent,
                       Searcher *searcher, float boost, uint32_t within) {
    ProximityCompilerIVARS *const ivars = ProximityCompiler_IVARS(self);
    ProximityQueryIVARS *const query_ivars = ProximityQuery_IVARS(parent);
    Schema     *schema     = Searcher_Get_Schema(searcher);
    Similarity *similarity = Schema_Fetch_Sim(schema, query_ivars->field);
    VArray     *terms      = query_ivars->terms;

    ivars->within = within;

    // Try harder to find a Similarity if necessary.
    if (similarity == NULL) {
        similarity = Schema_Get_Similarity(schema);
    }

    // Init.
    Compiler_init((Compiler*)self, (Query*)parent, searcher, similarity,
                  boost);

    // Store IDF for the phrase: the sum of each term's IDF.
    ivars->idf = 0;
    const uint32_t num_terms = VA_Get_Size(terms);
    for (uint32_t tick = 0; tick < num_terms; tick++) {
        Obj *term = VA_Fetch(terms, tick);
        int32_t doc_max  = Searcher_Doc_Max(searcher);
        int32_t doc_freq
            = Searcher_Doc_Freq(searcher, query_ivars->field, term);
        ivars->idf += Sim_IDF(similarity, doc_freq, doc_max);
    }

    // Calculate raw weight.
    ivars->raw_weight = ivars->idf * ivars->boost;

    return self;
}
Exemplo n.º 11
0
// Release as much memory as possible: trim (or free) the element buffer,
// free the scratch space, then shrink every run recursively.
void
SortEx_Shrink_IMP(SortExternal *self) {
    SortExternalIVARS *const ivars = SortEx_IVARS(self);

    if (ivars->buf_max - ivars->buf_tick > 0) {
        size_t remaining       = SortEx_Buffer_Count(self);
        size_t remaining_bytes = remaining * sizeof(Obj*);

        // Slide the elements that start at buf_tick down to the front of
        // the buffer before trimming the allocation.
        if (ivars->buf_tick > 0) {
            memmove(ivars->buffer, ivars->buffer + ivars->buf_tick,
                    remaining_bytes);
        }
        ivars->buffer   = (Obj**)REALLOCATE(ivars->buffer, remaining_bytes);
        ivars->buf_tick = 0;
        ivars->buf_max  = remaining;
        ivars->buf_cap  = remaining;
    }
    else {
        // Nothing left between buf_tick and buf_max: drop the buffer.
        FREEMEM(ivars->buffer);
        ivars->buffer   = NULL;
        ivars->buf_tick = 0;
        ivars->buf_max  = 0;
        ivars->buf_cap  = 0;
    }

    // The scratch area is always discarded.
    ivars->scratch_cap = 0;
    FREEMEM(ivars->scratch);
    ivars->scratch = NULL;

    const uint32_t num_runs = VA_Get_Size(ivars->runs);
    uint32_t tick = 0;
    while (tick < num_runs) {
        SortExternal *run = (SortExternal*)VA_Fetch(ivars->runs, tick);
        SortEx_Shrink(run);
        tick++;
    }
}
/* Build a test index in a fresh RAMFolder.
 *
 * One Doc is added per element of TestUtils_doc_set(), with the element
 * stored under the "content" field; the Indexer is then committed.
 * Returns the folder, cast to Folder*.
 */
static Folder*
S_create_index(void)
{
    Schema     *schema  = (Schema*)TestSchema_new();
    RAMFolder  *folder  = RAMFolder_new(NULL);
    VArray     *doc_set = TestUtils_doc_set();
    Indexer    *indexer = Indexer_new(schema, (Obj*)folder, NULL, NULL, 0);
    u32_t i, max;

    for (i = 0, max = VA_Get_Size(doc_set); i < max; i++) {
        static CharBuf field = ZCB_LITERAL("content");
        Doc *doc = Doc_new(NULL, 0);
        Doc_Store(doc, &field, VA_Fetch(doc_set, i));
        Indexer_Add_Doc(indexer, doc, 1.0f);
        DECREF(doc);
    }

    Indexer_Commit(indexer);

    /* Only the folder outlives this function; release everything else. */
    DECREF(doc_set);
    DECREF(indexer);
    DECREF(schema);
        
    return (Folder*)folder;
}
Exemplo n.º 13
0
// Shared initializer for ORMatcher: runs the PolyMatcher initializer, sizes
// the heap and the HeapedMatcherDoc pool from the number of children, and
// adds every non-NULL child to the queue.  Returns `self`.
static ORMatcher*
S_ormatcher_init2(ORMatcher *self, ORMatcherIVARS *ivars, VArray *children,
                  Similarity *sim) {
    // Init.
    PolyMatcher_init((PolyMatcher*)self, children, sim);
    ivars->size     = 0;
    ivars->max_size = VA_Get_Size(children);

    // One slot more than the child count; slot 0 of the pool is left
    // unassigned below.
    const size_t slots = ivars->max_size + 1;

    // Allocate.
    ivars->heap
        = (HeapedMatcherDoc**)CALLOCATE(slots, sizeof(HeapedMatcherDoc*));

    // Create a pool of HMDs.  Encourage CPU cache hits by using a single
    // allocation for all of them.
    ivars->blob = (char*)MALLOCATE(slots * sizeof(HeapedMatcherDoc));
    ivars->pool
        = (HeapedMatcherDoc**)CALLOCATE(slots, sizeof(HeapedMatcherDoc*));
    HeapedMatcherDoc *const docs = (HeapedMatcherDoc*)ivars->blob;
    for (uint32_t slot = 1; slot <= ivars->max_size; slot++) {
        ivars->pool[slot] = docs + slot;
    }

    // Prime queue.
    for (uint32_t tick = 0; tick < ivars->max_size; tick++) {
        Matcher *child = (Matcher*)VA_Fetch(children, tick);
        if (!child) { continue; }
        S_add_element(self, ivars, (Matcher*)INCREF(child), 0);
    }

    return self;
}
Exemplo n.º 14
0
// Produce extra "boost" spans for pairs of spans that lie close together.
// Each span is paired with the spans after it until
// HeatMap_Calc_Proximity_Boost() yields 0; every scoring pair contributes a
// new Span running from the first span's offset to the end of the second.
VArray*
HeatMap_generate_proximity_boosts(HeatMap *self, VArray *spans) 
{
    VArray *boosts = VA_new(0);
    const uint32_t num_spans = VA_Get_Size(spans);

    // Fewer than two spans means there are no pairs to score.
    if (num_spans <= 1) {
        return boosts;
    }

    const uint32_t last = num_spans - 1;
    for (uint32_t lead_tick = 0; lead_tick < last; lead_tick++) {
        Span *lead = (Span*)VA_Fetch(spans, lead_tick);

        for (uint32_t trail_tick = lead_tick + 1;
             trail_tick <= last;
             trail_tick++
            ) {
            Span *trail = (Span*)VA_Fetch(spans, trail_tick);
            float score = HeatMap_Calc_Proximity_Boost(self, lead, trail);

            // Stop pairing `lead` at the first partner that scores 0.
            if (score == 0) {
                break;
            }

            int32_t covered = (trail->offset - lead->offset)
                + trail->length;
            VA_Push(boosts,
                (Obj*)Span_new(lead->offset, covered, score));
        }
    }

    return boosts;
}
Exemplo n.º 15
0
// Initialize a PolySearcher over `searchers`.  Every sub-searcher must use a
// schema of the same class as `schema`, otherwise an error is thrown.  The
// doc_max of the whole is the sum of the parts, and `starts` records the
// running total in front of each sub-searcher.  Returns `self`.
PolySearcher*
PolySearcher_init(PolySearcher *self, Schema *schema, VArray *searchers) {
    const uint32_t count = VA_Get_Size(searchers);
    int32_t *offsets = (int32_t*)MALLOCATE(count * sizeof(int32_t));
    int32_t  total   = 0;

    Searcher_init((Searcher*)self, schema);
    PolySearcherIVARS *const ivars = PolySearcher_IVARS(self);
    ivars->searchers = (VArray*)INCREF(searchers);
    ivars->starts = NULL; // Safe cleanup.

    for (uint32_t tick = 0; tick < count; tick++) {
        Searcher *sub
            = (Searcher*)CERTIFY(VA_Fetch(searchers, tick), SEARCHER);
        Schema *sub_schema = Searcher_Get_Schema(sub);
        Class  *want_class = Schema_Get_Class(schema);
        Class  *have_class = Schema_Get_Class(sub_schema);

        // Confirm that searchers all use the same schema.
        if (want_class != have_class) {
            THROW(ERR, "Conflicting schemas: '%o', '%o'",
                  Schema_Get_Class_Name(schema),
                  Schema_Get_Class_Name(sub_schema));
        }

        // This sub-searcher starts where the previous ones end.
        offsets[tick] = (int32_t)total;
        total += Searcher_Doc_Max(sub);
    }

    ivars->doc_max = total;
    // `offsets` is handed over to the I32Array.
    ivars->starts  = I32Arr_new_steal(offsets, count);

    return self;
}
Exemplo n.º 16
0
// Analyze `text` by running it through the chain of analyzers.  With no
// analyzers, the whole text becomes a single token.  Otherwise the first
// analyzer transforms the text and each later analyzer transforms the
// previous analyzer's Inversion.
Inversion*
PolyAnalyzer_Transform_Text_IMP(PolyAnalyzer *self, String *text) {
    VArray *const   chain     = PolyAnalyzer_IVARS(self)->analyzers;
    const uint32_t  chain_len = VA_Get_Size(chain);

    if (chain_len == 0) {
        // Empty chain: wrap the entire text in one seed token.
        size_t      byte_len = Str_Get_Size(text);
        const char *bytes    = Str_Get_Ptr8(text);
        Token *seed = Token_new(bytes, byte_len, 0, byte_len, 1.0f, 1);
        Inversion *single = Inversion_new(seed);
        DECREF(seed);
        return single;
    }

    Analyzer *head = (Analyzer*)VA_Fetch(chain, 0);
    Inversion *current = Analyzer_Transform_Text(head, text);

    // Each stage consumes the previous Inversion, which is then released.
    for (uint32_t stage = 1; stage < chain_len; stage++) {
        Analyzer *analyzer = (Analyzer*)VA_Fetch(chain, stage);
        Inversion *next = Analyzer_Transform(analyzer, current);
        DECREF(current);
        current = next;
    }

    return current;
}
Exemplo n.º 17
0
// Switch the pool to its next arena, able to hold at least `amount` bytes.
//
// The arena index (self->tick) is advanced.  If an arena already exists at
// that index it is reused, growing it when `amount` is not smaller than its
// current size; otherwise a new one is allocated and appended to
// self->arenas.  Afterwards self->consumed is recomputed from all earlier
// arenas, and self->buf / self->limit are pointed at the current arena.
static void
S_init_arena(MemoryPool *self, size_t amount) {
    ByteBuf *bb;

    // Indicate which arena we're using at present.
    self->tick++;

    if (self->tick < (int32_t)VA_Get_Size(self->arenas)) {
        // In recycle mode, use previously acquired memory.
        bb = (ByteBuf*)VA_Fetch(self->arenas, self->tick);
        if (amount >= BB_Get_Size(bb)) {
            BB_Grow(bb, amount);
            BB_Set_Size(bb, amount);
        }
    }
    else {
        // In add mode, get more mem from system.  The buffer is one byte
        // larger than the size recorded for the ByteBuf.
        size_t buf_size = (amount + 1) > self->arena_size
                          ? (amount + 1)
                          : self->arena_size;
        char *ptr = (char*)MALLOCATE(buf_size);
        bb = BB_new_steal_bytes(ptr, buf_size - 1, buf_size);
        VA_Push(self->arenas, (Obj*)bb);
    }

    // Recalculate consumption to take into account blocked off space.  The
    // loop variable has its own name so that it does not shadow `bb`, which
    // is still needed below.
    self->consumed = 0;
    for (int32_t i = 0; i < self->tick; i++) {
        ByteBuf *earlier = (ByteBuf*)VA_Fetch(self->arenas, i);
        self->consumed += BB_Get_Size(earlier);
    }

    self->buf   = BB_Get_Buf(bb);
    self->limit = self->buf + BB_Get_Size(bb);
}
Exemplo n.º 18
0
// Mark every document matching `query` as deleted.  A compiler is built once
// for the query; for each segment reader a matcher is made from it, each
// matching doc's bit is set in the segment's bit vector, and the segment is
// flagged as updated if any bit was newly set.
void
DefDelWriter_Delete_By_Query_IMP(DefaultDeletionsWriter *self, Query *query) {
    DefaultDeletionsWriterIVARS *const ivars = DefDelWriter_IVARS(self);
    Compiler *compiler = Query_Make_Compiler(query, (Searcher*)ivars->searcher,
                                             Query_Get_Boost(query), false);
    const uint32_t num_segs = VA_Get_Size(ivars->seg_readers);

    for (uint32_t tick = 0; tick < num_segs; tick++) {
        SegReader *seg_reader = (SegReader*)VA_Fetch(ivars->seg_readers, tick);
        BitVector *deleted    = (BitVector*)VA_Fetch(ivars->bit_vecs, tick);
        Matcher   *matcher    = Compiler_Make_Matcher(compiler, seg_reader,
                                                      false);

        // No matcher means nothing matches in this segment.
        if (!matcher) { continue; }

        // Iterate through matches, marking each doc as deleted and counting
        // only the docs that were not already marked.
        int32_t newly_deleted = 0;
        for (int32_t doc_id = Matcher_Next(matcher);
             doc_id != 0;
             doc_id = Matcher_Next(matcher)
            ) {
            if (!BitVec_Get(deleted, doc_id)) {
                newly_deleted++;
            }
            BitVec_Set(deleted, doc_id);
        }
        if (newly_deleted) {
            ivars->updated[tick] = true;
        }

        DECREF(matcher);
    }

    DECREF(compiler);
}
Exemplo n.º 19
0
// Initialize a PhraseCompiler: run the base Compiler initializer with the
// field's Similarity (falling back to the schema-wide one), sum the IDF of
// every term in the parent query, derive the raw weight as idf * boost, and
// finish by normalizing.  Returns `self`.
PhraseCompiler*
PhraseCompiler_init(PhraseCompiler *self, PhraseQuery *parent, 
                    Searcher *searcher, float boost)
{
    Schema     *schema     = Searcher_Get_Schema(searcher);
    Similarity *similarity = Schema_Fetch_Sim(schema, parent->field);
    VArray     *terms      = parent->terms;
    uint32_t tick;
    uint32_t num_terms;

    // Try harder to find a Similarity if necessary.
    if (similarity == NULL) {
        similarity = Schema_Get_Similarity(schema);
    }

    // Init.
    Compiler_init((Compiler*)self, (Query*)parent, searcher, similarity,
                  boost);

    // Store IDF for the phrase: the sum of each term's IDF.
    self->idf = 0;
    num_terms = VA_Get_Size(terms);
    for (tick = 0; tick < num_terms; tick++) {
        Obj *term = VA_Fetch(terms, tick);
        int32_t doc_max  = Searcher_Doc_Max(searcher);
        int32_t doc_freq = Searcher_Doc_Freq(searcher, parent->field, term);
        self->idf += Sim_IDF(similarity, doc_freq, doc_max);
    }

    // Calculate raw weight.
    self->raw_weight = self->idf * self->boost;

    // Make final preparations.
    PhraseCompiler_Normalize(self);

    return self;
}
Exemplo n.º 20
0
// Build this writer's metadata: the superclass metadata plus a "files" hash
// which, for every updated segment, maps the segment name to a small hash
// holding the deletion "count" and the deletions "filename".
Hash*
DefDelWriter_Metadata_IMP(DefaultDeletionsWriter *self) {
    DefaultDeletionsWriterIVARS *const ivars = DefDelWriter_IVARS(self);
    DefDelWriter_Metadata_t parent_impl
        = (DefDelWriter_Metadata_t)SUPER_METHOD_PTR(DEFAULTDELETIONSWRITER,
                                                    LUCY_DefDelWriter_Metadata);
    Hash *const metadata = parent_impl(self);
    Hash *const files    = Hash_new(0);
    const uint32_t num_segs = VA_Get_Size(ivars->seg_readers);

    for (uint32_t tick = 0; tick < num_segs; tick++) {
        SegReader *seg_reader = (SegReader*)VA_Fetch(ivars->seg_readers, tick);

        // Only segments whose deletions changed are reported.
        if (!ivars->updated[tick]) { continue; }

        BitVector *deldocs = (BitVector*)VA_Fetch(ivars->bit_vecs, tick);
        Segment   *segment = SegReader_Get_Segment(seg_reader);
        Hash      *entry   = Hash_new(2);
        Hash_Store_Utf8(entry, "count", 5,
                        (Obj*)Str_newf("%u32", (uint32_t)BitVec_Count(deldocs)));
        Hash_Store_Utf8(entry, "filename", 8,
                        (Obj*)S_del_filename(self, seg_reader));
        Hash_Store(files, (Obj*)Seg_Get_Name(segment), (Obj*)entry);
    }
    Hash_Store_Utf8(metadata, "files", 5, (Obj*)files);

    return metadata;
}
Exemplo n.º 21
0
// Mark every document containing `term` in `field` as deleted.  For each
// segment with a PostingListReader that yields a posting list, every posted
// doc's bit is set in the segment's bit vector, and the segment is flagged
// as updated if any bit was newly set.
void
DefDelWriter_Delete_By_Term_IMP(DefaultDeletionsWriter *self,
                                String *field, Obj *term) {
    DefaultDeletionsWriterIVARS *const ivars = DefDelWriter_IVARS(self);
    const uint32_t num_segs = VA_Get_Size(ivars->seg_readers);

    for (uint32_t tick = 0; tick < num_segs; tick++) {
        SegReader *seg_reader = (SegReader*)VA_Fetch(ivars->seg_readers, tick);
        PostingListReader *plist_reader
            = (PostingListReader*)SegReader_Fetch(
                  seg_reader, Class_Get_Name(POSTINGLISTREADER));
        BitVector *deleted = (BitVector*)VA_Fetch(ivars->bit_vecs, tick);
        PostingList *postings = NULL;

        if (plist_reader) {
            postings = PListReader_Posting_List(plist_reader, field, term);
        }
        if (!postings) { continue; }

        // Iterate through postings, marking each doc as deleted and counting
        // only the docs that were not already marked.
        int32_t newly_deleted = 0;
        for (int32_t doc_id = PList_Next(postings);
             doc_id != 0;
             doc_id = PList_Next(postings)
            ) {
            if (!BitVec_Get(deleted, doc_id)) {
                newly_deleted++;
            }
            BitVec_Set(deleted, doc_id);
        }
        if (newly_deleted) {
            ivars->updated[tick] = true;
        }
        DECREF(postings);
    }
}
Exemplo n.º 22
0
/* Return a new snapshot file name of the form "snapshot_<gen>.json", where
 * <gen> is the base-36 form of one more than the highest generation found
 * among the folder's existing "snapshot_*.json" files (or 1 if none). */
CharBuf*
IxManager_make_snapshot_filename(IndexManager *self)
{
    VArray *listing = Folder_List(self->folder);
    const u32_t num_entries = VA_Get_Size(listing);
    u32_t tick;
    i32_t highest_gen = 0;
    i32_t next_gen;
    CharBuf *gen_base36;
    CharBuf *filename;

    /* Scan the folder for the highest existing snapshot generation. */
    for (tick = 0; tick < num_entries; tick++) {
        CharBuf *entry = (CharBuf*)VA_Fetch(listing, tick);
        i32_t gen;

        if (!CB_Starts_With_Str(entry, "snapshot_", 9)
            || !CB_Ends_With_Str(entry, ".json", 5)
           ) {
            continue;
        }
        gen = IxFileNames_extract_gen(entry);
        if (gen > highest_gen) {
            highest_gen = gen;
        }
    }
    DECREF(listing);

    /* The new snapshot is one generation past the highest. */
    next_gen   = highest_gen + 1;
    gen_base36 = StrHelp_to_base36(next_gen);
    filename   = CB_newf("snapshot_%o.json", gen_base36);
    DECREF(gen_base36);
    return filename;
}
Exemplo n.º 23
0
// Write out the deletions of every updated segment, one file each, and
// store this writer's metadata on the segment under "deletions".
void
DefDelWriter_Finish_IMP(DefaultDeletionsWriter *self) {
    DefaultDeletionsWriterIVARS *const ivars = DefDelWriter_IVARS(self);
    Folder *const folder = ivars->folder;
    const uint32_t num_segs = VA_Get_Size(ivars->seg_readers);

    for (uint32_t tick = 0; tick < num_segs; tick++) {
        SegReader *seg_reader = (SegReader*)VA_Fetch(ivars->seg_readers, tick);

        // Segments without new deletions need no file.
        if (!ivars->updated[tick]) { continue; }

        BitVector *deldocs = (BitVector*)VA_Fetch(ivars->bit_vecs, tick);
        int32_t    doc_max = SegReader_Doc_Max(seg_reader);

        // Round doc_max + 1 bits up to whole bytes.
        double     bytes_needed = (doc_max + 1) / 8.0;
        uint32_t   byte_size    = (uint32_t)ceil(bytes_needed);
        uint32_t   top_bit      = byte_size * 8 - 1;

        String    *filename  = S_del_filename(self, seg_reader);
        OutStream *outstream = Folder_Open_Out(folder, filename);
        if (!outstream) { RETHROW(INCREF(Err_get_error())); }

        // Ensure that we have 1 bit for each doc in segment.
        BitVec_Grow(deldocs, top_bit);

        // Write deletions data and clean up.
        OutStream_Write_Bytes(outstream,
                              (char*)BitVec_Get_Raw_Bits(deldocs),
                              byte_size);
        OutStream_Close(outstream);
        DECREF(outstream);
        DECREF(filename);
    }

    Seg_Store_Metadata_Utf8(ivars->segment, "deletions", 9,
                            (Obj*)DefDelWriter_Metadata(self));
}
Exemplo n.º 24
0
/* Return the segment readers that should be merged.
 *
 * Starts from a shallow copy of the PolyReader's segment readers.  When
 * `all` is true the copy is returned untouched.  Otherwise it is sorted with
 * S_compare_doc_count and everything from the computed threshold onwards is
 * spliced out; the threshold is one past the last position whose running doc
 * total is below Math_fibonacci(position + 5). */
VArray*
IxManager_segreaders_to_merge(IndexManager *self, PolyReader *reader,
                              bool_t all)
{
    VArray *seg_readers = VA_Shallow_Copy(PolyReader_Get_Seg_Readers(reader));
    UNUSED_VAR(self);

    if (!all) {
        const u32_t reader_count = VA_Get_Size(seg_readers);
        u32_t running_total = 0;
        u32_t cut_at        = 0;
        u32_t tick          = 0;

        /* Sort by ascending size in docs. */
        VA_Sort(seg_readers, S_compare_doc_count);

        /* Find sparsely populated segments. */
        while (tick < reader_count) {
            SegReader *seg_reader = (SegReader*)VA_Fetch(seg_readers, tick);
            running_total += SegReader_Doc_Count(seg_reader);
            if (running_total < Math_fibonacci(tick + 5)) {
                cut_at = tick + 1;
            }
            tick++;
        }

        /* Drop everything from the threshold onwards. */
        VA_Splice(seg_readers, cut_at, reader_count);
    }

    return seg_readers;
}
Exemplo n.º 25
0
/* Create a new Segment whose number is one greater than the highest number
 * found among the snapshot entries that start with "seg_" (or 1 if there
 * are none). */
Segment*
IxManager_make_new_segment(IndexManager *self, Snapshot *snapshot)
{
    VArray *entries = Snapshot_List(snapshot);
    const u32_t num_entries = VA_Get_Size(entries);
    u32_t tick;
    i32_t top_seg_num = 0;
    CharBuf *new_name = CB_new(20);
    Segment *new_segment;

    /* Find highest seg num. */
    for (tick = 0; tick < num_entries; tick++) {
        CharBuf *entry = (CharBuf*)VA_Fetch(entries, tick);
        i32_t seg_num;

        if (!CB_Starts_With_Str(entry, "seg_", 4)) {
            continue;
        }
        seg_num = IxFileNames_extract_gen(entry);
        if (seg_num > top_seg_num) {
            top_seg_num = seg_num;
        }
    }

    /* Create segment with num one greater than current max. */
    S_cat_seg_name(new_name, top_seg_num + 1);
    new_segment = Seg_new(new_name, self->folder);

    DECREF(new_name);
    DECREF(entries);

    return new_segment;
}
Exemplo n.º 26
0
// Look at the last cached item of every run and return a pointer to the one
// that compares lowest according to SortEx_Compare().  The returned pointer
// aims into a run's cache; it is NULL when there are no runs.
//
// Throws if a run's cache is empty (cache_max < 1) or if the index of its
// last item is not below cache_cap.
static uint8_t*
S_find_endpost(SortExternal *self, SortExternalIVARS *ivars) {
    const size_t   item_size = ivars->width;
    const uint32_t num_runs  = VA_Get_Size(ivars->runs);
    uint8_t       *lowest    = NULL;

    for (uint32_t run_tick = 0; run_tick < num_runs; run_tick++) {
        SortExternal *const run
            = (SortExternal*)VA_Fetch(ivars->runs, run_tick);
        SortExternalIVARS *const run_ivars = SortEx_IVARS(run);

        // Index of the final item in this run's cache.  Wraps around to a
        // huge value when cache_max is 0, which the check below rejects.
        const uint32_t last_tick = run_ivars->cache_max - 1;

        if (run_ivars->cache_max < 1 || last_tick >= run_ivars->cache_cap) {
            THROW(ERR, "Invalid SortExternal cache access: %u32 %u32 %u32", last_tick,
                  run_ivars->cache_max, run_ivars->cache_cap);
        }
        else {
            // Last item this run currently holds in its cache.
            uint8_t *tail_item = run_ivars->cache + last_tick * item_size;

            // The first run always seeds the result; after that an item
            // only replaces it when it compares strictly lower.
            if (run_tick == 0
                || SortEx_Compare(self, tail_item, lowest) < 0
               ) {
                lowest = tail_item;
            }
        }
    }

    return lowest;
}
Exemplo n.º 27
0
// Initialize a PolyReader.
//
// With `sub_readers` supplied, each element is certified as a SegReader,
// its Segment is collected (with an added refcount) into a temporary array
// that is handed to IxReader_init(), and S_init_sub_readers() is then called
// with the sub readers.  With `sub_readers` NULL, IxReader_init() gets NULL
// for its segments argument and the reader is given an empty sub_readers
// array plus an I32Array built from (NULL, 0).
//
// Returns `self`.
PolyReader*
PolyReader_init(PolyReader *self, Schema *schema, Folder *folder,
                Snapshot *snapshot, IndexManager *manager,
                VArray *sub_readers) {
    PolyReaderIVARS *const ivars = PolyReader_IVARS(self);
    ivars->doc_max   = 0;
    ivars->del_count = 0;

    // No sub readers: set up empty members and finish early.
    if (!sub_readers) {
        IxReader_init((IndexReader*)self, schema, folder, snapshot,
                      NULL, -1, manager);
        ivars->sub_readers = VA_new(0);
        ivars->offsets     = I32Arr_new_steal(NULL, 0);
        return self;
    }

    // Gather one Segment per sub reader for the parent initializer.
    const uint32_t seg_count = VA_Get_Size(sub_readers);
    VArray *segments = VA_new(seg_count);
    uint32_t tick = 0;
    while (tick < seg_count) {
        SegReader *seg_reader
            = (SegReader*)CERTIFY(VA_Fetch(sub_readers, tick), SEGREADER);
        VA_Push(segments, INCREF(SegReader_Get_Segment(seg_reader)));
        tick++;
    }

    IxReader_init((IndexReader*)self, schema, folder, snapshot,
                  segments, -1, manager);
    DECREF(segments);
    S_init_sub_readers(self, sub_readers);

    return self;
}
Exemplo n.º 28
0
/* Initialize a PhraseCompiler for the given PhraseQuery.
 *
 * The Similarity is looked up by the query's field, falling back to the
 * schema's general Similarity when that lookup yields NULL.  After the
 * parent Compiler is initialized, the phrase's IDF is computed as the sum
 * of Sim_IDF() over every term, the raw weight is set to idf * boost, and
 * PhraseCompiler_Normalize() is invoked.
 *
 * @param self       Object being initialized.
 * @param parent     The PhraseQuery; supplies the field and terms.
 * @param searchable Supplies the Schema and is forwarded to Sim_IDF().
 * @param boost      Forwarded to Compiler_init().
 * @return `self`.
 */
PhraseCompiler*
PhraseCompiler_init(PhraseCompiler *self, PhraseQuery *parent, 
                    Searchable *searchable, float boost)
{
    Schema     *schema = Searchable_Get_Schema(searchable);
    Similarity *sim    = Schema_Fetch_Sim(schema, parent->field);
    VArray     *terms  = parent->terms;
    u32_t       num_terms;
    u32_t       tick;

    /* Field has no Similarity of its own: use the schema-wide one. */
    if (sim == NULL) {
        sim = Schema_Get_Similarity(schema);
    }

    /* Parent class setup. */
    Compiler_init((Compiler*)self, (Query*)parent, searchable, sim, boost);

    /* The phrase's IDF is the sum of its terms' IDFs. */
    self->idf = 0;
    num_terms = VA_Get_Size(terms);
    for (tick = 0; tick < num_terms; tick++) {
        self->idf += Sim_IDF(sim, searchable, parent->field,
                             VA_Fetch(terms, tick));
    }

    /* Raw weight, prior to normalization. */
    self->raw_weight = self->idf * self->boost;

    /* Finish up by normalizing. */
    PhraseCompiler_Normalize(self);

    return self;
}
Exemplo n.º 29
0
// Create all the spans needed by HeatMap_Flatten_Spans, based on the source
// offsets and lengths... but leave the scores at 0.
//
// Every span contributes two boundaries: its offset and its offset + length.
// The boundaries are sorted and de-duplicated, and one zero-score Span is
// created for each gap between two consecutive unique boundaries.
//
// Returns a new VArray of Spans.  The array is empty when `spans` is empty
// or when all boundaries coincide.
static VArray*
S_flattened_but_empty_spans(VArray *spans)
{
    const uint32_t num_spans = VA_Get_Size(spans);

    // Without any spans there are no boundaries and hence no zones.  Return
    // early so that the unsigned "num_bounds - 1" arithmetic further down
    // can never wrap around to a huge value.
    if (num_spans == 0) {
        return VA_new(0);
    }

    const uint32_t total_bounds = num_spans * 2;
    int32_t *bounds = (int32_t*)MALLOCATE(total_bounds * sizeof(int32_t));

    // Assemble a list of all start/end boundaries: starts fill the first
    // half of the array, ends fill the second half.
    for (uint32_t i = 0; i < num_spans; i++) {
        Span *span            = (Span*)VA_Fetch(spans, i);
        bounds[i]             = span->offset;
        bounds[i + num_spans] = span->offset + span->length;
    }
    Sort_quicksort(bounds, total_bounds, sizeof(int32_t),
        S_compare_i32, NULL);

    // Compact the sorted array in place so that only unique boundaries
    // remain.  Comparing against the previously kept value (rather than a
    // sentinel) ensures no legitimate boundary value is ever discarded.
    uint32_t num_bounds = 0;
    for (uint32_t i = 0; i < total_bounds; i++) {
        if (num_bounds == 0 || bounds[i] != bounds[num_bounds - 1]) {
            bounds[num_bounds++] = bounds[i];
        }
    }

    // Create one Span for each zone between two bounds.  num_bounds is at
    // least 1 here because at least two boundaries were collected.
    const uint32_t num_zones = num_bounds - 1;
    VArray *flattened = VA_new(num_zones);
    for (uint32_t i = 0; i < num_zones; i++) {
        int32_t  start   = bounds[i];
        int32_t  length  = bounds[i + 1] - start;
        VA_Push(flattened, (Obj*)Span_new(start, length, 0.0f));
    }

    FREEMEM(bounds);
    return flattened;
}
Exemplo n.º 30
0
// Perform the first phase of a commit.
//
// Throws if the Indexer holds no write lock or has already been prepared.
// Otherwise, existing segments are offered to S_maybe_merge() when there are
// any.  If the current segment reports a non-zero count, a merge happened,
// the snapshot has no entries, or the deletions writer reports an update,
// then: the segment writer is finished, the schema is written under a new
// "schema_<gen>.json" name (replacing the old entry in the snapshot, if
// one was found), and a snapshot file with a ".temp" suffix is written.
// Finally the PolyReader is closed and the Indexer is marked as prepared.
void
Indexer_prepare_commit(Indexer *self)
{
    VArray   *seg_readers = PolyReader_Get_Seg_Readers(self->polyreader);
    uint32_t  seg_count   = VA_Get_Size(seg_readers);
    bool_t    merged      = false;

    if (!self->write_lock || self->prepared) {
        THROW(ERR, "Can't call Prepare_Commit() more than once");
    }

    // Give existing segments a chance to be merged.
    if (seg_count > 0) {
        merged = S_maybe_merge(self, seg_readers);
    }

    // A new segment and snapshot are written only when something changed:
    // docs/segs were added, segs were merged, the index is being
    // initialized, or deletions were updated.
    if (   Seg_Get_Count(self->segment)
        || merged
        || !Snapshot_Num_Entries(self->snapshot)
        || DelWriter_Updated(self->del_writer)
    ) {
        Snapshot *snapshot        = self->snapshot;
        Folder   *folder          = self->folder;
        CharBuf  *old_schema_name = S_find_schema_file(snapshot);
        CharBuf  *new_schema_name;
        uint64_t  schema_gen      = 1;
        char      gen_b36[StrHelp_MAX_BASE36_BYTES];

        // The new schema file's generation follows the old one's, or
        // starts at 1 when the snapshot lists no schema file.
        if (old_schema_name) {
            schema_gen = IxFileNames_extract_gen(old_schema_name) + 1;
        }
        StrHelp_to_base36(schema_gen, &gen_b36);
        new_schema_name = CB_newf("schema_%s.json", gen_b36);

        // Finish the segment, then write the schema file and swap the
        // snapshot's schema entry over to it.
        SegWriter_Finish(self->seg_writer);
        Schema_Write(self->schema, folder, new_schema_name);
        if (old_schema_name) {
            Snapshot_Delete_Entry(snapshot, old_schema_name);
        }
        Snapshot_Add_Entry(snapshot, new_schema_name);
        DECREF(new_schema_name);

        // Write the snapshot under a temporary name, deleting any file
        // already present at that name first.
        DECREF(self->snapfile);
        self->snapfile = IxManager_Make_Snapshot_Filename(self->manager);
        CB_Cat_Trusted_Str(self->snapfile, ".temp", 5);
        Folder_Delete(folder, self->snapfile);
        Snapshot_Write_File(snapshot, folder, self->snapfile);

        self->needs_commit = true;
    }

    // Close reader, so that we can delete its files if appropriate.
    PolyReader_Close(self->polyreader);

    self->prepared = true;
}