// Initialize a SortFieldWriter for a single field.
//
// Runs SortEx_init on `self`, stores a clone of `field` plus a new reference
// to each object argument, resets the bookkeeping ivars, and derives the
// field number, the certified FieldType, its primitive id, and the
// per-entry memory estimate.  Returns `self`.
SortFieldWriter*
SortFieldWriter_init(SortFieldWriter *self, Schema *schema,
                     Snapshot *snapshot, Segment *segment,
                     PolyReader *polyreader, String *field,
                     Counter *counter, size_t mem_thresh,
                     OutStream *temp_ord_out, OutStream *temp_ix_out,
                     OutStream *temp_dat_out) {
    // SortExternal-level setup first, then fetch this class's ivars.
    SortEx_init((SortExternal*)self);
    SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self);

    // Keep a private copy of the field name and a reference to every
    // object handed in by the caller.
    ivars->field        = Str_Clone(field);
    ivars->schema       = (Schema*)INCREF(schema);
    ivars->snapshot     = (Snapshot*)INCREF(snapshot);
    ivars->segment      = (Segment*)INCREF(segment);
    ivars->polyreader   = (PolyReader*)INCREF(polyreader);
    ivars->counter      = (Counter*)INCREF(counter);
    ivars->temp_ord_out = (OutStream*)INCREF(temp_ord_out);
    ivars->temp_ix_out  = (OutStream*)INCREF(temp_ix_out);
    ivars->temp_dat_out = (OutStream*)INCREF(temp_dat_out);
    ivars->mem_thresh   = mem_thresh;

    // Start/end offsets for the ord, ix and dat data all begin at zero.
    ivars->ord_start = 0;
    ivars->ord_end   = 0;
    ivars->ix_start  = 0;
    ivars->ix_end    = 0;
    ivars->dat_start = 0;
    ivars->dat_end   = 0;

    // Counters and widths.
    ivars->count     = 0;
    ivars->ord_width = 0;
    ivars->run_tick  = 1;

    // Values that start out as -1.
    ivars->null_ord        = -1;
    ivars->run_cardinality = -1;
    ivars->run_max         = -1;

    // Buffers that do not exist yet.
    ivars->sort_cache = NULL;
    ivars->doc_map    = NULL;
    ivars->sorted_ids = NULL;

    // Look up the field in the segment and the schema; CERTIFY checks that
    // the schema entry really is a FieldType before we keep a reference.
    ivars->field_num = Seg_Field_Num(segment, field);
    FieldType *const field_type
        = (FieldType*)CERTIFY(Schema_Fetch_Type(ivars->schema, field),
                              FIELDTYPE);
    ivars->type    = (FieldType*)INCREF(field_type);
    ivars->prim_id = FType_Primitive_ID(field_type);

    // Per-entry memory estimate: one SFWRITERELEM plus one value object
    // whose class depends on the primitive type.  TEXT and BLOB fields are
    // flagged as variable width; everything else is sized as a FLOAT and
    // flagged as fixed width.
    const size_t elem_size = Class_Get_Obj_Alloc_Size(SFWRITERELEM);
    size_t value_size;
    if (ivars->prim_id == FType_TEXT) {
        value_size       = Class_Get_Obj_Alloc_Size(STRING);
        ivars->var_width = true;
    }
    else if (ivars->prim_id == FType_BLOB) {
        value_size       = Class_Get_Obj_Alloc_Size(BLOB);
        ivars->var_width = true;
    }
    else {
        value_size       = Class_Get_Obj_Alloc_Size(FLOAT);
        ivars->var_width = false;
    }
    ivars->mem_per_entry = elem_size + value_size;

    return self;
}
// Example no. 2
// 0
// Turn each token cluster of `inversion` into a RawPosting and feed it to
// `post_pool`.
//
// A single norm byte is computed once per call by encoding
// `doc_boost * FType_Get_Boost(type) * length_norm` with the posting's
// Similarity.  For every cluster returned by Inversion_Next_Cluster, a
// RawPosting is allocated from the pool's MemoryPool using the first
// token's text and length, followed by:
//   - the norm byte, then
//   - `freq` positions, each written as a C32-encoded delta from the
//     previous position (the first delta is relative to 0).
// The allocation is made at the MAX_RAW_POSTING_LEN upper bound and then
// shrunk with MemPool_Resize to the bytes actually written.
//
// @param self        Posting whose Similarity encodes the norm byte.
// @param post_pool   Destination pool; also supplies the MemoryPool.
// @param inversion   Token source; it is reset before iteration.
// @param type        FieldType supplying the field boost.
// @param doc_id      Document id stored in every RawPosting created.
// @param doc_boost   Document-level boost factor.
// @param length_norm Length normalization factor.
void
ScorePost_Add_Inversion_To_Pool_IMP(ScorePosting *self,
                                    PostingPool *post_pool,
                                    Inversion *inversion, FieldType *type,
                                    int32_t doc_id, float doc_boost,
                                    float length_norm) {
    ScorePostingIVARS *const ivars = ScorePost_IVARS(self);
    MemoryPool     *mem_pool = PostPool_Get_Mem_Pool(post_pool);
    Similarity     *sim = ivars->sim;
    float           field_boost = doc_boost * FType_Get_Boost(type) * length_norm;
    const uint8_t   field_boost_byte  = Sim_Encode_Norm(sim, field_boost);
    const size_t    base_size = Class_Get_Obj_Alloc_Size(RAWPOSTING);
    Token         **tokens;
    uint32_t        freq;

    Inversion_Reset(inversion);
    while ((tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        TokenIVARS *const token_ivars = Token_IVARS(*tokens);

        // Keep the byte count in size_t: base_size is a size_t, and
        // narrowing the upper bound to 32 bits could request a smaller
        // allocation than the loop below may write.
        size_t raw_post_bytes
            = MAX_RAW_POSTING_LEN(base_size, token_ivars->len, freq);
        RawPosting *raw_posting
            = RawPost_new(MemPool_Grab(mem_pool, raw_post_bytes), doc_id,
                          freq, token_ivars->text, token_ivars->len);
        RawPostingIVARS *const raw_post_ivars = RawPost_IVARS(raw_posting);

        // Auxiliary data starts right after the term text inside blob.
        char *const start  = raw_post_ivars->blob + token_ivars->len;
        char *dest         = start;
        uint32_t last_prox = 0;

        // Field_boost.
        *((uint8_t*)dest) = field_boost_byte;
        dest++;

        // Positions, delta-encoded against the previous position.
        for (uint32_t i = 0; i < freq; i++) {
            TokenIVARS *const t_ivars = Token_IVARS(tokens[i]);
            const uint32_t prox_delta = t_ivars->pos - last_prox;
            NumUtil_encode_c32(prox_delta, &dest);
            last_prox = t_ivars->pos;
        }

        // Resize raw posting memory allocation down to what was used.
        raw_post_ivars->aux_len = dest - start;
        raw_post_bytes = (size_t)(dest - (char*)raw_posting);
        MemPool_Resize(mem_pool, raw_posting, raw_post_bytes);
        PostPool_Feed(post_pool, (Obj*)raw_posting);
    }
}
// Example no. 3
// 0
// Read one posting from `instream` into a RawPosting allocated from
// `mem_pool`.
//
// The stream is consumed in this order: a C32 doc code (low bit set means
// freq is 1, otherwise a C32 freq follows; the remaining bits are the delta
// added to `last_doc_id`), one norm byte, then `freq` positions copied with
// InStream_Read_Raw_C64.  The RawPosting is allocated at the
// MAX_RAW_POSTING_LEN upper bound and shrunk to the bytes actually used
// before being returned.  `self` is not used.
RawPosting*
ScorePost_Read_Raw_IMP(ScorePosting *self, InStream *instream,
                       int32_t last_doc_id, String *term_text,
                       MemoryPool *mem_pool) {
    UNUSED_VAR(self);

    // Term text that gets copied into the posting.
    const char *const term_ptr = Str_Get_Ptr8(term_text);
    const size_t      term_len = Str_Get_Size(term_text);

    // Decode doc id and frequency.
    const uint32_t code   = InStream_Read_C32(instream);
    const int32_t  doc_id = last_doc_id + (code >> 1);
    uint32_t freq = 1;
    if ((code & 1) == 0) {
        freq = InStream_Read_C32(instream);
    }

    // Allocate for the worst case; trimmed below.
    const size_t header_size = Class_Get_Obj_Alloc_Size(RAWPOSTING);
    const size_t max_bytes
        = MAX_RAW_POSTING_LEN(header_size, term_len, freq);
    void *const memory = MemPool_Grab(mem_pool, max_bytes);
    RawPosting *const raw_posting
        = RawPost_new(memory, doc_id, freq, term_ptr, term_len);
    RawPostingIVARS *const rp_ivars = RawPost_IVARS(raw_posting);

    // Auxiliary data begins immediately after the term text.
    char *const aux_start = rp_ivars->blob + term_len;
    char *cursor          = aux_start;

    // Norm byte.
    *((uint8_t*)cursor) = InStream_Read_U8(instream);
    cursor += 1;

    // One raw-copied position per occurrence.
    for (uint32_t remaining = freq; remaining > 0; remaining--) {
        cursor += InStream_Read_Raw_C64(instream, cursor);
    }

    // Record the aux length and give back the unused tail.
    rp_ivars->aux_len = cursor - aux_start;
    const size_t used_bytes = cursor - (char*)raw_posting;
    MemPool_Resize(mem_pool, raw_posting, used_bytes);

    return raw_posting;
}
// Example no. 4
// 0
// Drain `self` of its RawPostings and write them out.
//
// Each fetched posting is handed to `post_writer`.  Whenever the term text
// of the current posting differs from the previous one, the previous term
// and its accumulated TermInfo are handed to the pool's LexiconWriter and
// per-term state (TermInfo, skip bookkeeping) is reset.  When `skip_stream`
// is non-NULL, a skip record is written each time the running doc_freq of
// the current term is a multiple of the architecture's skip interval and
// the posting did not just start a new term.
//
// The loop relies on a stack-allocated sentinel posting: once
// PostPool_Fetch returns NULL, the sentinel stands in as a "different term"
// so that the final real term is flushed, and then the loop exits.
//
// NOTE(review): presumably PostPool_Fetch yields postings ordered so that
// equal term texts are adjacent -- confirm against PostingPool.
static void
S_write_terms_and_postings(PostingPool *self, PostingWriter *post_writer,
                           OutStream *skip_stream) {
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);
    TermInfo      *const tinfo            = TInfo_new(0);
    TermInfo      *const skip_tinfo       = TInfo_new(0);
    TermInfoIVARS *const tinfo_ivars      = TInfo_IVARS(tinfo);
    TermInfoIVARS *const skip_tinfo_ivars = TInfo_IVARS(skip_tinfo);
    LexiconWriter *const lex_writer       = ivars->lex_writer;
    SkipStepper   *const skip_stepper     = ivars->skip_stepper;
    SkipStepperIVARS *const skip_stepper_ivars
        = SkipStepper_IVARS(skip_stepper);
    int32_t        last_skip_doc          = 0;
    int64_t        last_skip_filepos      = 0;
    const int32_t  skip_interval
        = Arch_Skip_Interval(Schema_Get_Architecture(ivars->schema));

    // Prime heldover variables.  The first posting is fetched up front and
    // its term text seeds last_term_text, so the first loop iteration sees
    // "same text as last".
    // NOTE(review): CERTIFY presumably rejects a NULL fetch, i.e. callers
    // must not invoke this on an empty pool -- confirm.
    RawPosting *posting
        = (RawPosting*)CERTIFY(PostPool_Fetch(self), RAWPOSTING);
    RawPostingIVARS *post_ivars = RawPost_IVARS(posting);
    CharBuf *last_term_text
        = CB_new_from_trusted_utf8(post_ivars->blob, post_ivars->content_len);
    const char *last_text_buf  = CB_Get_Ptr8(last_term_text);
    uint32_t    last_text_size = CB_Get_Size(last_term_text);
    SkipStepper_Set_ID_And_Filepos(skip_stepper, 0, 0);

    // Initialize sentinel to be used on the last iter, using an empty string
    // in order to make LexiconWriter Do The Right Thing.  It lives in
    // alloca'd stack memory, so there is nothing to release afterwards.
    size_t sentinel_size = Class_Get_Obj_Alloc_Size(RAWPOSTING)
                           + 20;  // blob length + cushion
    char empty_string[] = "";
    RawPosting *sentinel = RawPost_new(alloca(sentinel_size), 0, 1,
                                       empty_string, 0);

    while (1) {
        bool same_text_as_last = true;

        if (posting == NULL) {
            // On the last iter, use an empty string to make LexiconWriter
            // DTRT.  post_ivars is recomputed here, replacing the value
            // derived from the NULL posting at the bottom of the loop.
            posting = sentinel;
            post_ivars = RawPost_IVARS(posting);
            same_text_as_last = false;
        }
        else {
            // Compare once: the lengths must match and the bytes must be
            // equal for the term text to count as unchanged.
            // NOTE(review): this passes `&post_ivars->blob` while other
            // call sites pass `post_ivars->blob`; the two are the same
            // address only if blob is an inline array member -- confirm.
            if (post_ivars->content_len != last_text_size
                || memcmp(&post_ivars->blob, last_text_buf, last_text_size) != 0
               ) {
                same_text_as_last = false;
            }
        }

        // If the term text changes, process the last term.
        if (!same_text_as_last) {
            // Hand off to LexiconWriter.
            LexWriter_Add_Term(lex_writer, (Obj*)last_term_text, tinfo);

            // Start each term afresh.
            TInfo_Reset(tinfo);
            PostWriter_Start_Term(post_writer, tinfo);

            // Init skip data in preparation for the next term.
            skip_stepper_ivars->doc_id  = 0;
            skip_stepper_ivars->filepos = tinfo_ivars->post_filepos;
            last_skip_doc         = 0;
            last_skip_filepos     = tinfo_ivars->post_filepos;

            // Remember the term_text so we can write string diffs.  The
            // cached pointer and size are refreshed after the CharBuf's
            // content is replaced.
            CB_Mimic_Utf8(last_term_text, post_ivars->blob,
                          post_ivars->content_len);
            last_text_buf  = CB_Get_Ptr8(last_term_text);
            last_text_size = CB_Get_Size(last_term_text);
        }

        // Bail on last iter before writing invalid posting data.
        if (posting == sentinel) { break; }

        // Write posting data.
        PostWriter_Write_Posting(post_writer, posting);

        // Doc freq lags by one iter.
        tinfo_ivars->doc_freq++;

        //  Write skip data: only when a skip stream was supplied, the
        //  posting continues the current term, and doc_freq has reached a
        //  non-zero multiple of skip_interval.
        if (skip_stream != NULL
            && same_text_as_last
            && tinfo_ivars->doc_freq % skip_interval == 0
            && tinfo_ivars->doc_freq != 0
           ) {
            // If first skip group, save skip stream pos for term info.
            if (tinfo_ivars->doc_freq == skip_interval) {
                tinfo_ivars->skip_filepos = OutStream_Tell(skip_stream);
            }
            // Write deltas: stash the stepper's previous doc id / file
            // position, advance the stepper to the current posting, then
            // emit a record against the stashed values.
            last_skip_doc               = skip_stepper_ivars->doc_id;
            last_skip_filepos           = skip_stepper_ivars->filepos;
            skip_stepper_ivars->doc_id  = post_ivars->doc_id;
            PostWriter_Update_Skip_Info(post_writer, skip_tinfo);
            skip_stepper_ivars->filepos = skip_tinfo_ivars->post_filepos;
            SkipStepper_Write_Record(skip_stepper, skip_stream,
                                     last_skip_doc, last_skip_filepos);
        }

        // Retrieve the next posting from the sort pool.
        // Do NOT DECREF(posting) here -- the posting must not be destroyed.
        // NOTE(review): presumably its memory is owned by a MemoryPool --
        // confirm.

        // `posting` may be NULL after this fetch; in that case post_ivars
        // is not used before being reassigned at the top of the loop.
        // NOTE(review): confirm RawPost_IVARS is safe to call with NULL.
        posting = (RawPosting*)PostPool_Fetch(self);
        post_ivars = RawPost_IVARS(posting);
    }

    // Clean up.
    DECREF(last_term_text);
    DECREF(skip_tinfo);
    DECREF(tinfo);
}