예제 #1
0
파일: OutStream.c 프로젝트: kidaa/lucy
// Pad the stream with zero bytes until its position is a multiple of
// `modulus`, then return the position reported by OutStream_Tell().
//
// A modulus below 1 cannot describe an alignment: 0 would divide by zero and
// a negative value would produce a negative filler count.  In that case no
// padding is written and the current position is returned unchanged.
int64_t
OutStream_Align_IMP(OutStream *self, int64_t modulus) {
    if (modulus > 0) {
        const int64_t len = OutStream_Tell(self);
        int64_t filler_bytes = (modulus - (len % modulus)) % modulus;
        while (filler_bytes-- > 0) { OutStream_Write_U8(self, 0); }
    }
    return OutStream_Tell(self);
}
예제 #2
0
/* Append an index entry for the most recently added term.
 *
 * The entry's starting offset within the ix stream goes to the ixix stream;
 * the entry itself (term text, doc_freq, file pointers) goes to the ix
 * stream.  `last_text` / `last_size` give the term's bytes and byte count.
 */
static void
S_add_last_term_to_ix(LexiconWriter *self, char *last_text, size_t last_size)
{
    OutStream *const index_out = self->ix_out;
    TermInfo  *const term_info = self->last_tinfo;

    /* Where this entry starts in the ix stream. */
    OutStream_Write_U64(self->ixix_out, OutStream_Tell(index_out));

    /* Term text: length first, then the raw bytes. */
    OutStream_Write_C32(index_out, last_size);
    OutStream_Write_Bytes(index_out, last_text, last_size);

    /* Document frequency, followed by the postings file pointer. */
    OutStream_Write_C32(index_out, term_info->doc_freq);
    OutStream_Write_C64(index_out, term_info->post_filepos);

    /* The skip file pointer is only present once doc_freq has reached the
     * skip interval. */
    if (self->skip_interval <= term_info->doc_freq) {
        OutStream_Write_C64(index_out, term_info->skip_filepos);
    }

    /* Current position of the main (dat) stream. */
    OutStream_Write_C64(index_out, OutStream_Tell(self->dat_out));

    /* One more term recorded in the index. */
    self->ix_count += 1;
}
// Sort the buffered elements, write them out to the temp streams as a new
// run, and register that run with this writer.  Returns immediately when the
// buffer holds nothing.
void
SortFieldWriter_Flush_IMP(SortFieldWriter *self) {
    SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self);

    // A run without data would be pointless.
    if (SortFieldWriter_Buffer_Count(self) == 0) {
        return;
    }

    OutStream *const ord_stream = ivars->temp_ord_out;
    OutStream *const ix_stream  = ivars->temp_ix_out;
    OutStream *const dat_stream = ivars->temp_dat_out;

    SortFieldWriter_Sort_Buffer(self);

    SortFieldWriter *flushed
        = SortFieldWriter_new(ivars->schema, ivars->snapshot, ivars->segment,
                              ivars->polyreader, ivars->field, ivars->counter,
                              ivars->mem_thresh, NULL, NULL, NULL);
    SortFieldWriterIVARS *const flushed_ivars = SortFieldWriter_IVARS(flushed);

    // Align each stream to sizeof(int64_t) and remember where the run
    // begins.  The ix stream is only touched for variable-width fields.
    flushed_ivars->ord_start
        = OutStream_Align(ord_stream, sizeof(int64_t));
    if (ivars->var_width) {
        flushed_ivars->ix_start
            = OutStream_Align(ix_stream, sizeof(int64_t));
    }
    flushed_ivars->dat_start
        = OutStream_Align(dat_stream, sizeof(int64_t));

    // Lend our element buffer to the run for the duration of the write.
    flushed_ivars->buffer   = ivars->buffer;
    flushed_ivars->buf_max  = ivars->buf_max;
    flushed_ivars->buf_tick = ivars->buf_tick;
    flushed_ivars->buf_cap  = ivars->buf_cap;

    // Write the run's files and keep the resulting stats.
    flushed_ivars->run_max = (int32_t)Seg_Get_Count(ivars->segment);
    flushed_ivars->run_cardinality
        = S_write_files(flushed, ord_stream, ix_stream, dat_stream);

    // Take the buffer back from the run, then empty it on our side.
    flushed_ivars->buffer   = NULL;
    flushed_ivars->buf_max  = 0;
    flushed_ivars->buf_tick = 0;
    flushed_ivars->buf_cap  = 0;
    ivars->buf_tick = ivars->buf_max;
    SortFieldWriter_Clear_Buffer(self);

    // Remember where the run ends in each stream.
    flushed_ivars->ord_end = OutStream_Tell(ord_stream);
    if (ivars->var_width) {
        flushed_ivars->ix_end = OutStream_Tell(ix_stream);
    }
    flushed_ivars->dat_end = OutStream_Tell(dat_stream);

    // Hand the finished run over to the run array.
    SortFieldWriter_Add_Run(self, (SortExternal*)flushed);
}
예제 #4
0
// Append an index entry for the current term: the entry's offset in the ix
// stream goes to the ixix stream, then key frames from both steppers and the
// current dat stream position go to the ix stream.
static void
S_add_last_term_to_ix(LexiconWriter *self)
{
    OutStream *const index_out = self->ix_out;

    // Offset of this entry within the ix stream.
    OutStream_Write_I64(self->ixix_out, OutStream_Tell(index_out));

    // Key frame for the term, then for its term info.
    TermStepper_Write_Key_Frame(self->term_stepper, index_out,
        TermStepper_Get_Value(self->term_stepper));
    TermStepper_Write_Key_Frame(self->tinfo_stepper, index_out,
        TermStepper_Get_Value(self->tinfo_stepper));

    // Position of the main record stream.
    OutStream_Write_C64(index_out, OutStream_Tell(self->dat_out));

    // Keep count of the terms added to ix.
    self->ix_count += 1;
}
예제 #5
0
// Write the buffered postings to the temp streams as a new run and add that
// run to this pool.  Returns immediately when the buffer holds nothing.
void
PostPool_Flush_IMP(PostingPool *self) {
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);

    // A run without data would be pointless.
    if (PostPool_Buffer_Count(self) == 0) {
        return;
    }

    PostingPool *flushed
        = PostPool_new(ivars->schema, ivars->snapshot, ivars->segment,
                       ivars->polyreader, ivars->field, ivars->lex_writer,
                       ivars->mem_pool, ivars->lex_temp_out,
                       ivars->post_temp_out, ivars->skip_out);
    PostingPoolIVARS *const flushed_ivars = PostPool_IVARS(flushed);
    PostingWriter *raw_writer
        = (PostingWriter*)RawPostWriter_new(ivars->schema, ivars->snapshot,
                                            ivars->segment, ivars->polyreader,
                                            ivars->post_temp_out);

    // Lend our buffer to the run while it writes.
    flushed_ivars->buffer   = ivars->buffer;
    flushed_ivars->buf_tick = ivars->buf_tick;
    flushed_ivars->buf_max  = ivars->buf_max;
    flushed_ivars->buf_cap  = ivars->buf_cap;

    // Switch the lexicon writer to temp mode and note where the run begins
    // in each temp stream.
    LexWriter_Enter_Temp_Mode(ivars->lex_writer, ivars->field,
                              ivars->lex_temp_out);
    flushed_ivars->lex_start  = OutStream_Tell(ivars->lex_temp_out);
    flushed_ivars->post_start = OutStream_Tell(ivars->post_temp_out);

    // Sort, then write terms and postings with no skip stream.
    PostPool_Sort_Buffer(self);
    S_write_terms_and_postings(flushed, raw_writer, NULL);

    // Note where the run ends, then leave temp mode.
    flushed_ivars->lex_end  = OutStream_Tell(ivars->lex_temp_out);
    flushed_ivars->post_end = OutStream_Tell(ivars->post_temp_out);
    LexWriter_Leave_Temp_Mode(ivars->lex_writer);

    // Take the buffer back from the run and empty it on our side.
    flushed_ivars->buffer   = NULL;
    flushed_ivars->buf_tick = 0;
    flushed_ivars->buf_max  = 0;
    flushed_ivars->buf_cap  = 0;
    PostPool_Clear_Buffer(self);

    // Register the finished run, then release the temporary writer.
    PostPool_Add_Run(self, (SortExternal*)flushed);
    DECREF(raw_writer);
}
예제 #6
0
// Begin a new term: store the output stream's current position in the
// TermInfo's post_filepos and reset the writer's last_doc_id to 0.
void
MatchPostWriter_Start_Term_IMP(MatchPostingWriter *self, TermInfo *tinfo) {
    MatchPostingWriterIVARS *const writer_ivars = MatchPostWriter_IVARS(self);
    TermInfoIVARS *const term_ivars = TInfo_IVARS(tinfo);

    term_ivars->post_filepos  = OutStream_Tell(writer_ivars->outstream);
    writer_ivars->last_doc_id = 0;
}
예제 #7
0
// Write the highlight record for one inverted document.
//
// The record's start offset in the data stream goes to the index stream.
// The record itself is a count of highlightable full-text fields, followed
// by a serialized (field name, term-vector buffer) pair for each.  Throws if
// `doc_id` does not match the id implied by the index stream's position
// (position / 8).
void
HLWriter_Add_Inverted_Doc_IMP(HighlightWriter *self, Inverter *inverter,
                              int32_t doc_id) {
    HighlightWriterIVARS *const ivars = HLWriter_IVARS(self);
    OutStream *data_stream  = S_lazy_init(self);
    OutStream *index_stream = ivars->ix_out;
    int64_t    record_start = OutStream_Tell(data_stream);
    uint32_t   field_total  = 0;
    int32_t    next_doc_id  = (int32_t)(OutStream_Tell(index_stream) / 8);

    // The caller must supply doc ids in sequence.
    if (doc_id != next_doc_id) {
        THROW(ERR, "Expected doc id %i32 but got %i32", next_doc_id, doc_id);
    }

    // Index entry: where this doc's record starts.
    OutStream_Write_I64(index_stream, record_start);

    // First pass: count the highlightable fields and write the count.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (!FType_Is_A(type, FULLTEXTTYPE)) { continue; }
        if (!FullTextType_Highlightable((FullTextType*)type)) { continue; }
        field_total++;
    }
    OutStream_Write_C32(data_stream, field_total);

    // Second pass: serialize each highlightable field's name and buffer.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (!FType_Is_A(type, FULLTEXTTYPE)) { continue; }
        if (!FullTextType_Highlightable((FullTextType*)type)) { continue; }

        String    *field_name = Inverter_Get_Field_Name(inverter);
        Inversion *inversion  = Inverter_Get_Inversion(inverter);
        ByteBuf   *tv_bytes   = HLWriter_TV_Buf(self, inversion);
        Freezer_serialize_string(field_name, data_stream);
        Freezer_serialize_bytebuf(tv_bytes, data_stream);
        DECREF(tv_bytes);
    }
}
예제 #8
0
/* Write the highlight record for one inverted document.
 *
 * The record's start offset in the data stream goes to the index stream.
 * The record itself is a count of highlightable full-text fields, followed
 * by a serialized (field name, term-vector buffer) pair for each.  Throws if
 * `doc_id` does not match the id implied by the index stream's position
 * (position / 8).
 */
void
HLWriter_add_inverted_doc(HighlightWriter *self, Inverter *inverter, 
                          i32_t doc_id)
{
    OutStream *data_stream  = S_lazy_init(self);
    OutStream *index_stream = self->ix_out;
    i64_t      record_start = OutStream_Tell(data_stream);
    u32_t      field_total  = 0;
    i32_t      next_doc_id  = (i32_t)(OutStream_Tell(index_stream) / 8);

    /* The caller must supply doc ids in sequence. */
    if (doc_id != next_doc_id) {
        THROW("Expected doc id %i32 but got %i32", next_doc_id, doc_id);
    }

    /* Index entry: where this doc's record starts. */
    OutStream_Write_U64(index_stream, record_start);

    /* First pass: count the highlightable fields and write the count. */
    Inverter_Iter_Init(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (!OBJ_IS_A(type, FULLTEXTTYPE)) { continue; }
        if (!FullTextType_Highlightable(type)) { continue; }
        field_total++;
    }
    OutStream_Write_C32(data_stream, field_total);

    /* Second pass: serialize each highlightable field's name and buffer. */
    Inverter_Iter_Init(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (!OBJ_IS_A(type, FULLTEXTTYPE)) { continue; }
        if (!FullTextType_Highlightable(type)) { continue; }
        {
            CharBuf   *field_name = Inverter_Get_Field_Name(inverter);
            Inversion *inversion  = Inverter_Get_Inversion(inverter);
            ByteBuf   *tv_bytes   = HLWriter_TV_Buf(self, inversion);
            CB_Serialize(field_name, data_stream);
            BB_Serialize(tv_bytes, data_stream);
            DECREF(tv_bytes);
        }
    }
}
예제 #9
0
// Write the stored-fields record for one inverted document.
//
// The record is a count of stored fields followed by a serialized
// (field name, value) pair for each.  Once the record is written, its start
// offset in the data stream is appended to the index stream.  Throws if
// `doc_id` does not match the id implied by the index stream's position
// (position / 8).
void
DocWriter_add_inverted_doc(DocWriter *self, Inverter *inverter,
                           int32_t doc_id) {
    OutStream *data_stream  = S_lazy_init(self);
    OutStream *index_stream = self->ix_out;
    uint32_t   stored_total = 0;
    int64_t    record_start = OutStream_Tell(data_stream);
    int64_t    next_doc_id  = OutStream_Tell(index_stream) / 8;

    // The caller must supply doc ids in sequence.
    if (doc_id != next_doc_id) {
        THROW(ERR, "Expected doc id %i64 but got %i32", next_doc_id, doc_id);
    }

    // First pass: count the stored fields and write the count.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (FType_Stored(type)) {
            stored_total++;
        }
    }
    OutStream_Write_C32(data_stream, stored_total);

    // Second pass: serialize name and value of every stored field.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (!FType_Stored(type)) { continue; }

        CharBuf *field_name  = Inverter_Get_Field_Name(inverter);
        Obj     *field_value = Inverter_Get_Value(inverter);
        CB_Serialize(field_name, data_stream);
        Obj_Serialize(field_value, data_stream);
    }

    // Index entry: where this doc's record starts.
    OutStream_Write_I64(index_stream, record_start);
}
예제 #10
0
// Finish writing stored documents: append a closing file pointer to the
// index stream, close both streams, and store this writer's metadata in the
// segment under "documents".
void
DocWriter_finish(DocWriter *self) {
    // Nothing to do when there is no data stream.
    if (!self->dat_out) {
        return;
    }

    // The extra trailing file pointer lets the length of the last record be
    // derived.
    OutStream_Write_I64(self->ix_out, OutStream_Tell(self->dat_out));

    // Shut both output streams, then record the metadata.
    OutStream_Close(self->dat_out);
    OutStream_Close(self->ix_out);
    Seg_Store_Metadata_Str(self->segment, "documents", 9,
                           (Obj*)DocWriter_Metadata(self));
}
예제 #11
0
/* Finish writing highlight data: append a closing file pointer to the index
 * stream, close both streams, and store this writer's metadata in the
 * segment under "highlight".
 */
void
HLWriter_finish(HighlightWriter *self)
{
    /* Nothing to do when there is no data stream. */
    if (!self->dat_out) {
        return;
    }

    /* The extra trailing file pointer lets the length of the last record be
     * derived. */
    OutStream_Write_U64(self->ix_out, OutStream_Tell(self->dat_out));

    /* Shut both output streams, then record the metadata. */
    OutStream_Close(self->dat_out);
    OutStream_Close(self->ix_out);
    Seg_Store_Metadata_Str(self->segment, "highlight", 9,
                           (Obj*)HLWriter_Metadata(self));
}
예제 #12
0
// Finish writing highlight data: append a closing file pointer to the index
// stream, close both streams, and store this writer's metadata in the
// segment under "highlight".
void
HLWriter_Finish_IMP(HighlightWriter *self) {
    HighlightWriterIVARS *const ivars = HLWriter_IVARS(self);

    // Nothing to do when there is no data stream.
    if (!ivars->dat_out) {
        return;
    }

    // The extra trailing file pointer lets the length of the last record be
    // derived.
    OutStream_Write_I64(ivars->ix_out, OutStream_Tell(ivars->dat_out));

    // Shut both output streams, then record the metadata.
    OutStream_Close(ivars->dat_out);
    OutStream_Close(ivars->ix_out);
    Seg_Store_Metadata_Utf8(ivars->segment, "highlight", 9,
                            (Obj*)HLWriter_Metadata(self));
}
예제 #13
0
파일: OutStream.c 프로젝트: kidaa/lucy
// Append InStream_Length(instream) bytes read from `instream` to this
// stream, going through a stack buffer of IO_STREAM_BUF_SIZE bytes.
void
OutStream_Absorb_IMP(OutStream *self, InStream *instream) {
    OutStreamIVARS *const ivars = OutStream_IVARS(self);
    char    chunk[IO_STREAM_BUF_SIZE];
    int64_t remaining = InStream_Length(instream);

    // Grow the stream to its final size before copying.
    OutStream_Grow(self, OutStream_Tell(self) + remaining);

    // Read blocks of content into the intermediate buffer, then write them
    // to the OutStream.
    //
    // TODO: optimize by utilizing OutStream's buffer directly, while still
    // not flushing too frequently and keeping code complexity under control.
    while (remaining != 0) {
        size_t chunk_len;
        if (remaining < IO_STREAM_BUF_SIZE) {
            chunk_len = (size_t)remaining;
        }
        else {
            chunk_len = IO_STREAM_BUF_SIZE;
        }
        InStream_Read_Bytes(instream, chunk, chunk_len);
        SI_write_bytes(self, ivars, chunk, chunk_len);
        remaining -= chunk_len;
    }
}
예제 #14
0
// Copy the raw highlight records of another segment into this writer.
//
// For each doc id from 1 through the reader's doc max, the record is copied
// unless `doc_map` is non-NULL and its entry for that doc is zero.  A NULL
// `doc_map` therefore copies every record.
void
HLWriter_Add_Segment_IMP(HighlightWriter *self, SegReader *reader,
                         I32Array *doc_map) {
    HighlightWriterIVARS *const ivars = HLWriter_IVARS(self);
    const int32_t doc_max = SegReader_Doc_Max(reader);

    // Bail if the supplied segment is empty.
    if (doc_max == 0) {
        return;
    }

    DefaultHighlightReader *source
        = (DefaultHighlightReader*)CERTIFY(
              SegReader_Obtain(reader, Class_Get_Name(HIGHLIGHTREADER)),
              DEFAULTHIGHLIGHTREADER);
    OutStream *data_stream  = S_lazy_init(self);
    OutStream *index_stream = ivars->ix_out;
    ByteBuf   *record       = BB_new(0);

    for (int32_t doc = 1; doc <= doc_max; doc++) {
        // Only docs that survive the map (or all docs, without a map) are
        // copied.
        if (doc_map == NULL || I32Arr_Get(doc_map, doc)) {
            // Index entry: where the copied record will start.
            OutStream_Write_I64(index_stream, OutStream_Tell(data_stream));

            // Copy the raw record, then empty the scratch buffer.
            DefHLReader_Read_Record(source, doc, record);
            OutStream_Write_Bytes(data_stream, BB_Get_Buf(record),
                                  BB_Get_Size(record));
            BB_Set_Size(record, 0);
        }
    }

    DECREF(record);
}
예제 #15
0
파일: DocWriter.c 프로젝트: hernan604/lucy
// Copy the raw stored-document records of another segment into this writer.
//
// For each doc id from 1 through the reader's doc max, the record is read
// from the segment's DefaultDocReader and appended to the data stream, and
// its start offset is appended to the index stream.
//
// `doc_map` may be NULL, in which case every record is copied.  Otherwise
// docs whose `doc_map` entry is zero are skipped (presumably deleted docs --
// confirm against the caller).
void
DocWriter_Add_Segment_IMP(DocWriter *self, SegReader *reader,
                          I32Array *doc_map) {
    DocWriterIVARS *const ivars = DocWriter_IVARS(self);
    const int32_t doc_max = SegReader_Doc_Max(reader);

    if (doc_max == 0) {
        // Bail if the supplied segment is empty.
        return;
    }
    else {
        OutStream *const dat_out = S_lazy_init(self);
        OutStream *const ix_out  = ivars->ix_out;
        ByteBuf   *const buffer  = BB_new(0);
        DefaultDocReader *const doc_reader
            = (DefaultDocReader*)CERTIFY(
                  SegReader_Obtain(reader, VTable_Get_Name(DOCREADER)),
                  DEFAULTDOCREADER);

        for (int32_t i = 1; i <= doc_max; i++) {
            // Never dereference a missing map: without one, keep every doc.
            if (doc_map != NULL && !I32Arr_Get(doc_map, i)) {
                continue;
            }

            int64_t start = OutStream_Tell(dat_out);

            // Copy record over.
            DefDocReader_Read_Record(doc_reader, buffer, i);
            char *buf   = BB_Get_Buf(buffer);
            size_t size = BB_Get_Size(buffer);
            OutStream_Write_Bytes(dat_out, buf, size);

            // Write file pointer.
            OutStream_Write_I64(ix_out, start);
        }

        DECREF(buffer);
    }
}
예제 #16
0
/* Copy the raw highlight records of another segment into this writer.
 *
 * For each doc id from 1 through the reader's doc max, the record is copied
 * unless `doc_map` is non-NULL and its entry for that doc is zero.  A NULL
 * `doc_map` therefore copies every record.
 */
void
HLWriter_add_segment(HighlightWriter *self, SegReader *reader, 
                     I32Array *doc_map)
{
    i32_t doc_max = SegReader_Doc_Max(reader);

    /* An empty segment contributes nothing. */
    if (doc_max != 0) {
        DefaultHighlightReader *source = (DefaultHighlightReader*)
            ASSERT_IS_A(SegReader_Obtain(reader, HIGHLIGHTREADER.name),
            DEFAULTHIGHLIGHTREADER);
        OutStream *data_stream  = S_lazy_init(self);
        OutStream *index_stream = self->ix_out;
        ByteBuf   *record       = BB_new(0);
        i32_t      doc;

        for (doc = 1; doc <= doc_max; doc++) {
            /* Only docs that survive the map (or all docs, without a map)
             * are copied. */
            if (doc_map == NULL || I32Arr_Get(doc_map, doc)) {
                /* Index entry: where the copied record will start. */
                OutStream_Write_U64(index_stream,
                                    OutStream_Tell(data_stream));

                /* Copy the raw record, then empty the scratch buffer. */
                DefHLReader_Read_Record(source, doc, record);
                OutStream_Write_Bytes(data_stream, record->ptr,
                                      record->size);
                record->size = 0;
            }
        }
        DECREF(record);
    }
}
예제 #17
0
파일: DocWriter.c 프로젝트: hernan604/lucy
// Write the stored-fields record for one inverted document.
//
// The record is a count of stored fields followed, for each stored field, by
// its serialized name and a value encoded according to the field type's
// primitive id.  Once the record is written, its start offset in the data
// stream is appended to the index stream.  Throws if `doc_id` does not match
// the id implied by the index stream's position (position / 8), or if a
// field has an unrecognized primitive type.
void
DocWriter_Add_Inverted_Doc_IMP(DocWriter *self, Inverter *inverter,
                               int32_t doc_id) {
    DocWriterIVARS *const ivars = DocWriter_IVARS(self);
    OutStream *data_stream  = S_lazy_init(self);
    OutStream *index_stream = ivars->ix_out;
    uint32_t   stored_total = 0;
    int64_t    record_start = OutStream_Tell(data_stream);
    int64_t    next_doc_id  = OutStream_Tell(index_stream) / 8;

    // The caller must supply doc ids in sequence.
    if (doc_id != next_doc_id) {
        THROW(ERR, "Expected doc id %i64 but got %i32", next_doc_id, doc_id);
    }

    // First pass: count the stored fields and write the count.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (FType_Stored(type)) {
            stored_total++;
        }
    }
    OutStream_Write_C32(data_stream, stored_total);

    // Second pass: write name and value of every stored field.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (!FType_Stored(type)) { continue; }

        String *field_name  = Inverter_Get_Field_Name(inverter);
        Obj    *field_value = Inverter_Get_Value(inverter);
        Freezer_serialize_string(field_name, data_stream);

        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT: {
                // Text: byte length, then the bytes.
                String     *text = (String*)field_value;
                const char *ptr  = Str_Get_Ptr8(text);
                size_t      len  = Str_Get_Size(text);
                OutStream_Write_C32(data_stream, len);
                OutStream_Write_Bytes(data_stream, ptr, len);
                break;
            }
            case FType_BLOB: {
                // Blob: byte length, then the bytes.
                ByteBuf *blob = (ByteBuf*)field_value;
                char    *ptr  = BB_Get_Buf(blob);
                size_t   len  = BB_Get_Size(blob);
                OutStream_Write_C32(data_stream, len);
                OutStream_Write_Bytes(data_stream, ptr, len);
                break;
            }
            case FType_INT32: {
                int32_t number = Int32_Get_Value((Integer32*)field_value);
                OutStream_Write_C32(data_stream, number);
                break;
            }
            case FType_INT64: {
                int64_t number = Int64_Get_Value((Integer64*)field_value);
                OutStream_Write_C64(data_stream, number);
                break;
            }
            case FType_FLOAT32: {
                float number = Float32_Get_Value((Float32*)field_value);
                OutStream_Write_F32(data_stream, number);
                break;
            }
            case FType_FLOAT64: {
                double number = Float64_Get_Value((Float64*)field_value);
                OutStream_Write_F64(data_stream, number);
                break;
            }
            default:
                THROW(ERR, "Unrecognized type: %o", type);
        }
    }

    // Index entry: where this doc's record starts.
    OutStream_Write_I64(index_stream, record_start);
}
예제 #18
0
// Store the output stream's current position in the TermInfo's
// post_filepos.
void
MatchPostWriter_Update_Skip_Info_IMP(MatchPostingWriter *self, TermInfo *tinfo) {
    MatchPostingWriterIVARS *const writer_ivars = MatchPostWriter_IVARS(self);
    TermInfoIVARS *const term_ivars = TInfo_IVARS(tinfo);
    OutStream *const out = writer_ivars->outstream;

    term_ivars->post_filepos = OutStream_Tell(out);
}
예제 #19
0
// Drain every posting from the pool, writing each through `post_writer` and
// handing each completed term (text plus TermInfo) to the pool's
// LexiconWriter.
//
// Postings are fetched with PostPool_Fetch() until it returns NULL.  Each
// time the term text differs from the previous posting's, the previous term
// is passed to LexWriter_Add_Term(), `tinfo` is reset, and
// PostWriter_Start_Term() is called for the new term.  After the last real
// posting, a stack-allocated sentinel with empty text forces the final term
// to be handed off.
//
// `skip_stream` may be NULL, in which case no skip data is written.
// Otherwise a skip record is written whenever a term's doc_freq reaches a
// multiple of the architecture's skip interval.
//
// NOTE(review): the first fetched posting is read without a NULL check --
// this presumably relies on callers never invoking this with an empty pool;
// confirm.
static void
S_write_terms_and_postings(PostingPool *self, PostingWriter *post_writer,
                           OutStream *skip_stream) {
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);
    TermInfo      *const tinfo            = TInfo_new(0);
    TermInfo      *const skip_tinfo       = TInfo_new(0);
    TermInfoIVARS *const tinfo_ivars      = TInfo_IVARS(tinfo);
    TermInfoIVARS *const skip_tinfo_ivars = TInfo_IVARS(skip_tinfo);
    LexiconWriter *const lex_writer       = ivars->lex_writer;
    SkipStepper   *const skip_stepper     = ivars->skip_stepper;
    SkipStepperIVARS *const skip_stepper_ivars
        = SkipStepper_IVARS(skip_stepper);
    int32_t        last_skip_doc          = 0;
    int64_t        last_skip_filepos      = 0;
    const int32_t  skip_interval
        = Arch_Skip_Interval(Schema_Get_Architecture(ivars->schema));

    // Prime heldover variables: the first posting's text becomes the initial
    // "last term".
    RawPosting *posting
        = (RawPosting*)CERTIFY(PostPool_Fetch(self), RAWPOSTING);
    RawPostingIVARS *post_ivars = RawPost_IVARS(posting);
    CharBuf *last_term_text
        = CB_new_from_trusted_utf8(post_ivars->blob, post_ivars->content_len);
    const char *last_text_buf  = CB_Get_Ptr8(last_term_text);
    uint32_t    last_text_size = CB_Get_Size(last_term_text);
    SkipStepper_Set_ID_And_Filepos(skip_stepper, 0, 0);

    // Initialize sentinel to be used on the last iter, using an empty string
    // in order to make LexiconWriter Do The Right Thing.  The sentinel lives
    // in alloca() memory, so it is never DECREF'd.
    size_t sentinel_size = Class_Get_Obj_Alloc_Size(RAWPOSTING)
                           + 20;  // blob length + cushion
    char empty_string[] = "";
    RawPosting *sentinel = RawPost_new(alloca(sentinel_size), 0, 1,
                                       empty_string, 0);

    while (1) {
        bool same_text_as_last = true;

        if (posting == NULL) {
            // On the last iter, use an empty string to make LexiconWriter
            // DTRT.
            posting = sentinel;
            post_ivars = RawPost_IVARS(posting);
            same_text_as_last = false;
        }
        else {
            // Compare once: the text differs if either the length or the
            // bytes differ.
            if (post_ivars->content_len != last_text_size
                || memcmp(&post_ivars->blob, last_text_buf, last_text_size) != 0
               ) {
                same_text_as_last = false;
            }
        }

        // If the term text changes, process the last term.
        if (!same_text_as_last) {
            // Hand off to LexiconWriter.
            LexWriter_Add_Term(lex_writer, (Obj*)last_term_text, tinfo);

            // Start each term afresh.
            TInfo_Reset(tinfo);
            PostWriter_Start_Term(post_writer, tinfo);

            // Init skip data in preparation for the next term.
            skip_stepper_ivars->doc_id  = 0;
            skip_stepper_ivars->filepos = tinfo_ivars->post_filepos;
            last_skip_doc         = 0;
            last_skip_filepos     = tinfo_ivars->post_filepos;

            // Remember the term_text so we can write string diffs.  The
            // cached pointer and size are refreshed after the copy.
            CB_Mimic_Utf8(last_term_text, post_ivars->blob,
                          post_ivars->content_len);
            last_text_buf  = CB_Get_Ptr8(last_term_text);
            last_text_size = CB_Get_Size(last_term_text);
        }

        // Bail on last iter before writing invalid posting data.
        if (posting == sentinel) { break; }

        // Write posting data.
        PostWriter_Write_Posting(post_writer, posting);

        // Doc freq lags by one iter.
        tinfo_ivars->doc_freq++;

        // Write skip data: only with a skip stream, never on the first
        // posting of a term, and only when doc_freq is a non-zero multiple
        // of the skip interval.
        if (skip_stream != NULL
            && same_text_as_last
            && tinfo_ivars->doc_freq % skip_interval == 0
            && tinfo_ivars->doc_freq != 0
           ) {
            // If first skip group, save skip stream pos for term info.
            if (tinfo_ivars->doc_freq == skip_interval) {
                tinfo_ivars->skip_filepos = OutStream_Tell(skip_stream);
            }
            // Write deltas: keep the previous skip point, move the stepper
            // to the current doc and file position, then write the record.
            last_skip_doc               = skip_stepper_ivars->doc_id;
            last_skip_filepos           = skip_stepper_ivars->filepos;
            skip_stepper_ivars->doc_id  = post_ivars->doc_id;
            PostWriter_Update_Skip_Info(post_writer, skip_tinfo);
            skip_stepper_ivars->filepos = skip_tinfo_ivars->post_filepos;
            SkipStepper_Write_Record(skip_stepper, skip_stream,
                                     last_skip_doc, last_skip_filepos);
        }

        // Retrieve the next posting from the sort pool.  The current posting
        // is deliberately NOT DECREF'd here.
        //
        // When the fetch returns NULL, post_ivars is replaced with the
        // sentinel's ivars at the top of the next iteration before it is
        // read.
        posting = (RawPosting*)PostPool_Fetch(self);
        post_ivars = RawPost_IVARS(posting);
    }

    // Clean up.
    DECREF(last_term_text);
    DECREF(skip_tinfo);
    DECREF(tinfo);
}
예제 #20
0
// Merge the files of a folder into a single compound file.
//
// Every entry returned by Folder_List() whose name does not end in ".json"
// is appended to "cf.dat"; after each file the stream is padded to a
// multiple of 8 bytes.  The offset and length of each merged file are
// recorded in a metadata hash under "files", alongside a "format" entry.
// The metadata is written to "cfmeta.json.temp" and then renamed to
// "cfmeta.json".  Finally the merged source files are deleted from the
// folder.
//
// Failures to open a stream or to rename the metadata file rethrow the
// current error; a failed delete throws a "Can't delete" error.
//
// NOTE(review): "cfmeta.json" is renamed into place before "cf.dat" is
// closed -- confirm that ordering is intentional.
static void
S_do_consolidate(CompoundFileWriter *self, CompoundFileWriterIVARS *ivars) {
    UNUSED_VAR(self);
    Folder    *folder       = ivars->folder;
    Hash      *metadata     = Hash_new(0);
    Hash      *sub_files    = Hash_new(0);
    // The listing is taken before "cf.dat" is opened for output below.
    Vector    *files        = Folder_List(folder, NULL);
    Vector    *merged       = Vec_new(Vec_Get_Size(files));
    String    *cf_file      = (String*)SSTR_WRAP_UTF8("cf.dat", 6);
    OutStream *outstream    = Folder_Open_Out(folder, (String*)cf_file);
    bool       rename_success;

    if (!outstream) { RETHROW(INCREF(Err_get_error())); }

    // Start metadata.  `sub_files` is stored with an extra reference because
    // this function keeps using it and releases its own reference later.
    Hash_Store_Utf8(metadata, "files", 5, INCREF(sub_files));
    Hash_Store_Utf8(metadata, "format", 6,
                    (Obj*)Str_newf("%i32", CFWriter_current_file_format));

    // Process the files in sorted order.
    Vec_Sort(files);
    for (uint32_t i = 0, max = Vec_Get_Size(files); i < max; i++) {
        String *infilename = (String*)Vec_Fetch(files, i);

        // Files ending in ".json" are left out of the compound file.
        if (!Str_Ends_With_Utf8(infilename, ".json", 5)) {
            InStream *instream   = Folder_Open_In(folder, infilename);
            Hash     *file_data  = Hash_new(2);
            int64_t   offset, len;

            if (!instream) { RETHROW(INCREF(Err_get_error())); }

            // Absorb the file, measuring its length as the change in the
            // output position.
            offset = OutStream_Tell(outstream);
            OutStream_Absorb(outstream, instream);
            len = OutStream_Tell(outstream) - offset;

            // Record offset and length (as strings), and remember the file
            // for deletion once the metadata has been written.
            Hash_Store_Utf8(file_data, "offset", 6,
                            (Obj*)Str_newf("%i64", offset));
            Hash_Store_Utf8(file_data, "length", 6,
                            (Obj*)Str_newf("%i64", len));
            Hash_Store(sub_files, infilename, (Obj*)file_data);
            Vec_Push(merged, INCREF(infilename));

            // Add filler NULL bytes so that every sub-file begins on a file
            // position multiple of 8.
            OutStream_Align(outstream, 8);

            InStream_Close(instream);
            DECREF(instream);
        }
    }

    // Write metadata to a temp file first, then rename it to cfmeta.json.
    String *cfmeta_temp = (String*)SSTR_WRAP_UTF8("cfmeta.json.temp", 16);
    String *cfmeta_file = (String*)SSTR_WRAP_UTF8("cfmeta.json", 11);
    Json_spew_json((Obj*)metadata, (Folder*)ivars->folder, cfmeta_temp);
    rename_success = Folder_Rename(ivars->folder, cfmeta_temp, cfmeta_file);
    if (!rename_success) { RETHROW(INCREF(Err_get_error())); }

    // Clean up.
    OutStream_Close(outstream);
    DECREF(outstream);
    DECREF(files);
    DECREF(metadata);
    DECREF(sub_files);

    // Delete the files that now live inside the compound file.  `merged` is
    // released before throwing so it is not left behind on failure.
    for (uint32_t i = 0, max = Vec_Get_Size(merged); i < max; i++) {
        String *merged_file = (String*)Vec_Fetch(merged, i);
        if (!Folder_Delete(folder, merged_file)) {
            String *mess = MAKE_MESS("Can't delete '%o'", merged_file);
            DECREF(merged);
            Err_throw_mess(ERR, mess);
        }
    }
    DECREF(merged);
}
예제 #21
0
// Store the output stream's current position in the TermInfo's
// post_filepos.
void
MatchPostWriter_update_skip_info(MatchPostingWriter *self, TermInfo *tinfo) {
    OutStream *const out = self->outstream;

    tinfo->post_filepos = OutStream_Tell(out);
}
예제 #22
0
// Begin a new term: store the output stream's current position in the
// TermInfo's post_filepos and reset the writer's last_doc_id to 0.
void
MatchPostWriter_start_term(MatchPostingWriter *self, TermInfo *tinfo) {
    OutStream *const out = self->outstream;

    tinfo->post_filepos = OutStream_Tell(out);
    self->last_doc_id   = 0;
}
// Write one sort value to the output streams, encoded by primitive type.
//
// TEXT and BLOB values record the current data-stream offset (relative to
// `dat_start`) in `ix_out` and then append their bytes to `dat_out`.
// Numeric values are written to `dat_out` only.  A NULL `val` writes just
// the offset for TEXT/BLOB and a zero for numeric types.  Throws on an
// unrecognized primitive id.
static void
S_write_val(Obj *val, int8_t prim_id, OutStream *ix_out, OutStream *dat_out,
            int64_t dat_start) {
    switch (prim_id & FType_PRIMITIVE_ID_MASK) {
        case FType_TEXT: {
                // Offset first; the bytes follow only for a real value.
                int64_t dat_pos = OutStream_Tell(dat_out) - dat_start;
                OutStream_Write_I64(ix_out, dat_pos);
                if (val) {
                    String *string = (String*)val;
                    OutStream_Write_Bytes(dat_out, Str_Get_Ptr8(string),
                                          Str_Get_Size(string));
                }
                break;
            }
        case FType_BLOB: {
                // Offset first; the bytes follow only for a real value.
                int64_t dat_pos = OutStream_Tell(dat_out) - dat_start;
                OutStream_Write_I64(ix_out, dat_pos);
                if (val) {
                    Blob *blob = (Blob*)val;
                    OutStream_Write_Bytes(dat_out, Blob_Get_Buf(blob),
                                          Blob_Get_Size(blob));
                }
                break;
            }
        case FType_INT32: {
                int32_t i32 = 0;
                if (val) {
                    i32 = (int32_t)Int_Get_Value((Integer*)val);
                }
                OutStream_Write_I32(dat_out, i32);
                break;
            }
        case FType_INT64: {
                int64_t i64 = 0;
                if (val) {
                    i64 = Int_Get_Value((Integer*)val);
                }
                OutStream_Write_I64(dat_out, i64);
                break;
            }
        case FType_FLOAT32: {
                float f32 = 0.0f;
                if (val) {
                    f32 = (float)Float_Get_Value((Float*)val);
                }
                OutStream_Write_F32(dat_out, f32);
                break;
            }
        case FType_FLOAT64: {
                double f64 = 0.0;
                if (val) {
                    f64 = Float_Get_Value((Float*)val);
                }
                OutStream_Write_F64(dat_out, f64);
                break;
            }
        default:
            THROW(ERR, "Unrecognized primitive id: %i32", (int32_t)prim_id);
    }
}
// Write the sorted values of a run to `dat_out` (and `ix_out`), assign an
// ordinal to every document, and write the packed ordinals to `ord_out`.
//
// Elements are pulled with SortFieldWriter_Fetch() until it returns NULL.
// Each distinct value (per FType_Compare_Values) gets the next ordinal.
// When not every document has a value, one NULL value is written at the end
// and its ordinal is stored in ivars->null_ord.  Also sets
// ivars->ord_width.
//
// Returns the cardinality, i.e. the highest ordinal plus one.  Throws if an
// element's doc_id exceeds the segment's doc count.
//
// NOTE(review): the first fetched element is dereferenced without a NULL
// check -- presumably callers guarantee a non-empty buffer; confirm.
// NOTE(review): only the upper bound of doc_id is checked before indexing
// `ords`; confirm that doc ids can never be negative.
static int32_t
S_write_files(SortFieldWriter *self, OutStream *ord_out, OutStream *ix_out,
              OutStream *dat_out) {
    SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self);
    int8_t    prim_id   = ivars->prim_id;
    int32_t   doc_max   = (int32_t)Seg_Get_Count(ivars->segment);
    // Some docs lack a value whenever the element count differs from the
    // doc count.
    bool      has_nulls = ivars->count == doc_max ? false : true;
    // One slot per doc id from 0 through doc_max inclusive.
    size_t    size      = (doc_max + 1) * sizeof(int32_t);
    int32_t  *ords      = (int32_t*)MALLOCATE(size);
    int32_t   ord       = 0;
    // Offsets written to ix_out are relative to this starting position.
    int64_t   dat_start = OutStream_Tell(dat_out);

    // Assign -1 as a stand-in for the NULL ord.
    for (int32_t i = 0; i <= doc_max; i++) {
        ords[i] = -1;
    }

    // Grab the first item and record its ord.  Add a dummy ord for invalid
    // doc id 0.
    SFWriterElem *elem = (SFWriterElem*)SortFieldWriter_Fetch(self);
    SFWriterElemIVARS *elem_ivars = SFWriterElem_IVARS(elem);
    if (elem_ivars->doc_id > doc_max) {
        THROW(ERR, "doc_id %i32 greater than doc_max %i32",
              elem_ivars->doc_id, doc_max);
    }
    ords[elem_ivars->doc_id] = ord;
    ords[0] = 0;

    // Build array of ords, write non-NULL sorted values.  `last_val` holds
    // its own reference to the most recently seen value.
    Obj *last_val = INCREF(elem_ivars->value);
    S_write_val(elem_ivars->value, prim_id, ix_out, dat_out, dat_start);
    DECREF(elem);
    while (NULL != (elem = (SFWriterElem*)SortFieldWriter_Fetch(self))) {
        elem_ivars = SFWriterElem_IVARS(elem);
        // The pointer comparison skips the value comparison when the very
        // same object is seen again.
        if (elem_ivars->value != last_val) {
            int32_t comparison
                = FType_Compare_Values(ivars->type, elem_ivars->value,
                                       last_val);
            // Only a value that compares unequal starts a new ordinal and
            // gets written.
            if (comparison != 0) {
                ord++;
                S_write_val(elem_ivars->value, prim_id, ix_out, dat_out,
                            dat_start);
            }
            DECREF(last_val);
            last_val = INCREF(elem_ivars->value);
        }
        if (elem_ivars->doc_id > doc_max) {
            THROW(ERR, "doc_id %i32 greater than doc_max %i32",
                  elem_ivars->doc_id, doc_max);
        }
        ords[elem_ivars->doc_id] = ord;
        DECREF(elem);
    }
    DECREF(last_val);

    // If there are NULL values, write one now and record the NULL ord, which
    // is one past the last non-NULL ord.
    if (has_nulls) {
        S_write_val(NULL, prim_id, ix_out, dat_out, dat_start);
        ord++;
        ivars->null_ord = ord;
    }
    // Without NULL values, this is whatever ivars->null_ord already held.
    int32_t null_ord = ivars->null_ord;

    // Write one extra file pointer so that we can always derive length.
    if (ivars->var_width) {
        OutStream_Write_I64(ix_out, OutStream_Tell(dat_out) - dat_start);
    }

    // Calculate cardinality and ord width.
    int32_t cardinality = ord + 1;
    ivars->ord_width     = S_calc_width(cardinality);
    int32_t ord_width   = ivars->ord_width;

    // Write ords.  The packed array needs ord_width bits per doc, rounded up
    // to whole bytes.
    const double BITS_PER_BYTE = 8.0;
    double bytes_per_doc = ord_width / BITS_PER_BYTE;
    double byte_count = ceil((doc_max + 1) * bytes_per_doc);
    char *compressed_ords
        = (char*)CALLOCATE((size_t)byte_count, sizeof(char));
    for (int32_t i = 0; i <= doc_max; i++) {
        // Docs still marked -1 never received a value and take the NULL ord.
        int32_t real_ord = ords[i] == -1 ? null_ord : ords[i];
        S_write_ord(compressed_ords, ord_width, i, real_ord);
    }
    OutStream_Write_Bytes(ord_out, compressed_ords, (size_t)byte_count);
    FREEMEM(compressed_ords);

    FREEMEM(ords);
    return cardinality;
}