Code example #1
LexiconWriter*
LexWriter_init(LexiconWriter *self, Snapshot *snapshot, Segment *segment, 
               PolyReader *polyreader)
{
    Schema       *schema = PolyReader_Get_Schema(polyreader);
    Architecture *arch   = Schema_Get_Architecture(schema);

    DataWriter_init((DataWriter*)self, snapshot, segment, polyreader);

    /* Assign. */
    self->index_interval = Arch_Index_Interval(arch);
    self->skip_interval  = Arch_Skip_Interval(arch);

    /* Init. */
    self->ix_out             = NULL;
    self->ixix_out           = NULL;
    self->dat_out            = NULL;
    self->count              = 0;
    self->ix_count           = 0;
    self->last_tinfo         = TInfo_new(0,0,0,0);
    self->last_text          = CB_new(40);
    self->dat_file           = CB_new(30);
    self->ix_file            = CB_new(30);
    self->ixix_file          = CB_new(30);
    self->counts             = Hash_new(0);
    self->ix_counts          = Hash_new(0);
    self->stepper            = NULL;
    self->temp_mode          = false;

    /* Derive. */
    self->stepper = LexStepper_new((CharBuf*)&EMPTY, self->skip_interval);

    return self;
}
Code example #2
File: TestTermInfo.c  Project: pavansondur/lucy
void 
test_freqfilepos(TestBatch *batch) {
    TermInfo* tinfo = TInfo_new(10);
    TInfo_Set_Post_FilePos(tinfo, 20);
    TInfo_Set_Skip_FilePos(tinfo, 40);
    TInfo_Set_Lex_FilePos(tinfo, 50);

    TermInfo* cloned_tinfo = TInfo_Clone(tinfo);

    TEST_FALSE(batch, Lucy_TInfo_Equals(tinfo, (lucy_Obj*)cloned_tinfo),
               "the clone should be a separate C struct");
    TEST_INT_EQ(batch, TInfo_Get_Doc_Freq(tinfo), 10, "new sets doc_freq correctly");
    TEST_INT_EQ(batch, TInfo_Get_Doc_Freq(cloned_tinfo), 10, "... doc_freq cloned");
    TEST_INT_EQ(batch, TInfo_Get_Post_FilePos(cloned_tinfo), 20, "... post_filepos cloned");
    TEST_INT_EQ(batch, TInfo_Get_Skip_FilePos(cloned_tinfo), 40, "... skip_filepos cloned");
    TEST_INT_EQ(batch, TInfo_Get_Lex_FilePos(cloned_tinfo),  50, "... lex_filepos cloned");

    TInfo_Set_Doc_Freq(tinfo, 5);
    TEST_INT_EQ(batch, TInfo_Get_Doc_Freq(tinfo), 5,  "set/get doc_freq" );
    TEST_INT_EQ(batch, TInfo_Get_Doc_Freq(cloned_tinfo), 10, "setting orig doesn't affect clone" );

    TInfo_Set_Post_FilePos(tinfo, 15);
    TEST_INT_EQ(batch, TInfo_Get_Post_FilePos(tinfo), 15, "set/get post_filepos" );

    TInfo_Set_Skip_FilePos(tinfo, 35);
    TEST_INT_EQ(batch, TInfo_Get_Skip_FilePos(tinfo), 35, "set/get skip_filepos" );

    TInfo_Set_Lex_FilePos(tinfo, 45);
    TEST_INT_EQ(batch, TInfo_Get_Lex_FilePos(tinfo), 45, "set/get lex_filepos" );

    DECREF(tinfo);
    DECREF(cloned_tinfo);
}
Code example #3
File: MatchPosting.c  Project: pavansondur/lucy
MatchTermInfoStepper*
MatchTInfoStepper_init(MatchTermInfoStepper *self, Schema *schema) {
    Architecture *arch = Schema_Get_Architecture(schema);
    TermStepper_init((TermStepper*)self);
    // Derive the skip interval from the schema's Architecture.
    self->skip_interval = Arch_Skip_Interval(arch);
    // Start with an empty TermInfo (doc_freq 0) as the stepper's value.
    self->value = (Obj*)TInfo_new(0);
    return self;
}
Code example #4
File: TermInfo.c  Project: gitpan/KinoSearch
TermInfo*
TInfo_clone(TermInfo *self) 
{
    TermInfo *evil_twin = TInfo_new(self->doc_freq);
    evil_twin->post_filepos = self->post_filepos;
    evil_twin->skip_filepos = self->skip_filepos;
    evil_twin->lex_filepos  = self->lex_filepos;
    return evil_twin;
}
Code example #5
File: LexIndex.c  Project: kidaa/lucy
LexIndex*
LexIndex_init(LexIndex *self, Schema *schema, Folder *folder,
              Segment *segment, String *field) {
    int32_t  field_num = Seg_Field_Num(segment, field);
    String  *seg_name  = Seg_Get_Name(segment);
    String  *ixix_file = Str_newf("%o/lexicon-%i32.ixix", seg_name, field_num);
    String  *ix_file   = Str_newf("%o/lexicon-%i32.ix", seg_name, field_num);
    Architecture *arch = Schema_Get_Architecture(schema);

    // Init.
    Lex_init((Lexicon*)self, field);
    LexIndexIVARS *const ivars = LexIndex_IVARS(self);
    ivars->tinfo        = TInfo_new(0);
    ivars->tick         = 0;

    // Derive
    ivars->field_type = Schema_Fetch_Type(schema, field);
    if (!ivars->field_type) {
        String *mess = MAKE_MESS("Unknown field: '%o'", field);
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        Err_throw_mess(ERR, mess);
    }
    ivars->field_type = (FieldType*)INCREF(ivars->field_type);
    ivars->term_stepper = FType_Make_Term_Stepper(ivars->field_type);
    ivars->ixix_in = Folder_Open_In(folder, ixix_file);
    if (!ivars->ixix_in) {
        Err *error = (Err*)INCREF(Err_get_error());
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        RETHROW(error);
    }
    ivars->ix_in = Folder_Open_In(folder, ix_file);
    if (!ivars->ix_in) {
        Err *error = (Err*)INCREF(Err_get_error());
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        RETHROW(error);
    }
    ivars->index_interval = Arch_Index_Interval(arch);
    ivars->skip_interval  = Arch_Skip_Interval(arch);
    ivars->size    = (int32_t)(InStream_Length(ivars->ixix_in) / sizeof(int64_t));
    ivars->offsets = (const int64_t*)InStream_Buf(ivars->ixix_in,
            (size_t)InStream_Length(ivars->ixix_in));

    DECREF(ixix_file);
    DECREF(ix_file);

    return self;
}
Code example #6
File: LexIndex.c  Project: robertkrimen/Search-Kino03
LexIndex*
LexIndex_init(LexIndex *self, Schema *schema, Folder *folder, 
              Segment *segment, const CharBuf *field)
{
    i32_t    field_num = Seg_Field_Num(segment, field);
    CharBuf *seg_name  = Seg_Get_Name(segment);
    CharBuf *ixix_file = CB_newf("%o/lexicon-%i32.ixix", seg_name, field_num);
    CharBuf *ix_file   = CB_newf("%o/lexicon-%i32.ix", seg_name, field_num);
    Architecture *arch = Schema_Get_Architecture(schema);

    /* Init. */
    self->term  = ViewCB_new_from_trusted_utf8(NULL, 0);
    self->tinfo = TInfo_new(0,0,0,0);
    self->tick  = 0;

    /* Derive */
    self->field_type = Schema_Fetch_Type(schema, field);
    if (!self->field_type) {
        CharBuf *mess = MAKE_MESS("Unknown field: '%o'", field);
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        Err_throw_mess(mess);
    }
    INCREF(self->field_type);
    self->ixix_in = Folder_Open_In(folder, ixix_file);
    self->ix_in   = Folder_Open_In(folder, ix_file);
    if (!self->ixix_in || !self->ix_in) {
        CharBuf *mess =
             MAKE_MESS("Can't open either %o or %o", ix_file, ixix_file);
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        Err_throw_mess(mess);
    }
    self->index_interval = Arch_Index_Interval(arch);
    self->skip_interval  = Arch_Skip_Interval(arch);
    self->size    = (i32_t)(InStream_Length(self->ixix_in) / sizeof(i64_t));
    self->offsets = (i64_t*)InStream_Buf(self->ixix_in,
        (size_t)InStream_Length(self->ixix_in));
    self->data = InStream_Buf(self->ix_in, InStream_Length(self->ix_in));
    self->limit = self->data + InStream_Length(self->ix_in);

    DECREF(ixix_file);
    DECREF(ix_file);

    return self;
}
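
Note: both LexIndex_init versions above treat the .ixix file as a flat array of int64_t offsets into the .ix file, which is why the entry count is simply the file length divided by sizeof(int64_t). The following is a minimal standalone sketch of that idea, not Lucy's actual I/O code: the file name is hypothetical and byte-order handling is omitted (the real InStream takes care of it).

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

int main(void) {
    FILE *fp = fopen("lexicon-0.ixix", "rb");   /* hypothetical path */
    if (!fp) { perror("fopen"); return 1; }

    fseek(fp, 0, SEEK_END);
    long length = ftell(fp);
    rewind(fp);

    /* The whole file is packed int64_t offsets, one per indexed term. */
    int32_t size = (int32_t)(length / (long)sizeof(int64_t));
    int64_t *offsets = malloc((size_t)length);
    if (!offsets || fread(offsets, sizeof(int64_t), (size_t)size, fp) != (size_t)size) {
        perror("read");
        free(offsets);
        fclose(fp);
        return 1;
    }
    /* offsets[tick] is the .ix file position of the tick'th indexed term
     * (byte order ignored here for brevity). */
    printf("%d index entries\n", (int)size);

    free(offsets);
    fclose(fp);
    return 0;
}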
Code example #7
File: PostingPool.c  Project: github188/SimpleCode
static void
S_write_terms_and_postings(PostingPool *self, PostingWriter *post_writer,
                           OutStream *skip_stream) {
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);
    TermInfo      *const tinfo            = TInfo_new(0);
    TermInfo      *const skip_tinfo       = TInfo_new(0);
    TermInfoIVARS *const tinfo_ivars      = TInfo_IVARS(tinfo);
    TermInfoIVARS *const skip_tinfo_ivars = TInfo_IVARS(skip_tinfo);
    LexiconWriter *const lex_writer       = ivars->lex_writer;
    SkipStepper   *const skip_stepper     = ivars->skip_stepper;
    SkipStepperIVARS *const skip_stepper_ivars
        = SkipStepper_IVARS(skip_stepper);
    int32_t        last_skip_doc          = 0;
    int64_t        last_skip_filepos      = 0;
    const int32_t  skip_interval
        = Arch_Skip_Interval(Schema_Get_Architecture(ivars->schema));

    // Prime heldover variables.
    RawPosting *posting
        = (RawPosting*)CERTIFY(PostPool_Fetch(self), RAWPOSTING);
    RawPostingIVARS *post_ivars = RawPost_IVARS(posting);
    CharBuf *last_term_text
        = CB_new_from_trusted_utf8(post_ivars->blob, post_ivars->content_len);
    const char *last_text_buf  = CB_Get_Ptr8(last_term_text);
    uint32_t    last_text_size = CB_Get_Size(last_term_text);
    SkipStepper_Set_ID_And_Filepos(skip_stepper, 0, 0);

    // Initialize sentinel to be used on the last iter, using an empty string
    // in order to make LexiconWriter Do The Right Thing.
    size_t sentinel_size = Class_Get_Obj_Alloc_Size(RAWPOSTING)
                           + 20;  // blob length + cushion
    char empty_string[] = "";
    RawPosting *sentinel = RawPost_new(alloca(sentinel_size), 0, 1,
                                       empty_string, 0);

    while (1) {
        bool same_text_as_last = true;

        if (posting == NULL) {
            // On the last iter, use an empty string to make LexiconWriter
            // DTRT.
            posting = sentinel;
            post_ivars = RawPost_IVARS(posting);
            same_text_as_last = false;
        }
        else {
            // Compare once.
            if (post_ivars->content_len != last_text_size
                || memcmp(&post_ivars->blob, last_text_buf, last_text_size) != 0
               ) {
                same_text_as_last = false;
            }
        }

        // If the term text changes, process the last term.
        if (!same_text_as_last) {
            // Hand off to LexiconWriter.
            LexWriter_Add_Term(lex_writer, (Obj*)last_term_text, tinfo);

            // Start each term afresh.
            TInfo_Reset(tinfo);
            PostWriter_Start_Term(post_writer, tinfo);

            // Init skip data in preparation for the next term.
            skip_stepper_ivars->doc_id  = 0;
            skip_stepper_ivars->filepos = tinfo_ivars->post_filepos;
            last_skip_doc         = 0;
            last_skip_filepos     = tinfo_ivars->post_filepos;

            // Remember the term_text so we can write string diffs.
            CB_Mimic_Utf8(last_term_text, post_ivars->blob,
                          post_ivars->content_len);
            last_text_buf  = CB_Get_Ptr8(last_term_text);
            last_text_size = CB_Get_Size(last_term_text);
        }

        // Bail on last iter before writing invalid posting data.
        if (posting == sentinel) { break; }

        // Write posting data.
        PostWriter_Write_Posting(post_writer, posting);

        // Doc freq lags by one iter.
        tinfo_ivars->doc_freq++;

        //  Write skip data.
        if (skip_stream != NULL
            && same_text_as_last
            && tinfo_ivars->doc_freq % skip_interval == 0
            && tinfo_ivars->doc_freq != 0
           ) {
            // If first skip group, save skip stream pos for term info.
            if (tinfo_ivars->doc_freq == skip_interval) {
                tinfo_ivars->skip_filepos = OutStream_Tell(skip_stream);
            }
            // Write deltas.
            last_skip_doc               = skip_stepper_ivars->doc_id;
            last_skip_filepos           = skip_stepper_ivars->filepos;
            skip_stepper_ivars->doc_id  = post_ivars->doc_id;
            PostWriter_Update_Skip_Info(post_writer, skip_tinfo);
            skip_stepper_ivars->filepos = skip_tinfo_ivars->post_filepos;
            SkipStepper_Write_Record(skip_stepper, skip_stream,
                                     last_skip_doc, last_skip_filepos);
        }

        // Retrieve the next posting from the sort pool.
        // DECREF(posting);  // No!!  DON'T destroy!!!

        posting = (RawPosting*)PostPool_Fetch(self);
        post_ivars = RawPost_IVARS(posting);
    }

    // Clean up.
    DECREF(last_term_text);
    DECREF(skip_tinfo);
    DECREF(tinfo);
}
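
Note: the skip-data logic in S_write_terms_and_postings above amounts to emitting one skip record every skip_interval postings of the current term, with the doc id and posting-file position encoded as deltas against the previous skip point. Below is a minimal standalone sketch of that pattern only; the types and write_skip_records function are hypothetical, and printf stands in for SkipStepper_Write_Record.

#include <stdio.h>
#include <stdint.h>

typedef struct { int32_t doc_id; int64_t filepos; } Posting;   /* hypothetical */

static void
write_skip_records(const Posting *postings, int count, int skip_interval) {
    int32_t last_skip_doc     = 0;
    int64_t last_skip_filepos = 0;
    for (int i = 0; i < count; i++) {
        int doc_freq = i + 1;                 /* doc freq lags by one posting */
        if (doc_freq % skip_interval == 0) {
            /* Emit deltas relative to the previous skip point, then advance it. */
            printf("skip: doc_delta=%d filepos_delta=%lld\n",
                   postings[i].doc_id - last_skip_doc,
                   (long long)(postings[i].filepos - last_skip_filepos));
            last_skip_doc     = postings[i].doc_id;
            last_skip_filepos = postings[i].filepos;
        }
    }
}

int main(void) {
    Posting postings[] = {
        {1, 0}, {3, 12}, {7, 25}, {8, 40}, {12, 58}, {15, 77}
    };
    write_skip_records(postings, 6, 3);       /* expect two skip records */
    return 0;
}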