Exemplo n.º 1
0
int
PostPool_Compare_IMP(PostingPool *self, void *va, void *vb) {
    RawPostingIVARS *const a     = RawPost_IVARS(*(RawPosting**)va);
    RawPostingIVARS *const b     = RawPost_IVARS(*(RawPosting**)vb);
    const size_t      a_len = a->content_len;
    const size_t      b_len = b->content_len;
    const size_t      len   = a_len < b_len ? a_len : b_len;
    int comparison = memcmp(a->blob, b->blob, len);
    UNUSED_VAR(self);

    if (comparison == 0) {
        // If a is a substring of b, it's less than b, so return a neg num.
        comparison = a_len - b_len;

        // Break ties by doc id.
        if (comparison == 0) {
            comparison = a->doc_id - b->doc_id;
        }
    }

    return comparison;
}
Exemplo n.º 2
0
RawPosting*
RawPost_new(void *pre_allocated_memory, int32_t doc_id, uint32_t freq,
            const char *term_text, size_t term_text_len) {
    RawPosting *self
        = (RawPosting*)VTable_Init_Obj(RAWPOSTING, pre_allocated_memory);
    RawPostingIVARS *const ivars = RawPost_IVARS(self);
    ivars->doc_id      = doc_id;
    ivars->freq        = freq;
    ivars->content_len = term_text_len;
    ivars->aux_len     = 0;
    memcpy(&ivars->blob, term_text, term_text_len);

    return self;
}
Exemplo n.º 3
0
void
ScorePost_Add_Inversion_To_Pool_IMP(ScorePosting *self,
                                    PostingPool *post_pool,
                                    Inversion *inversion, FieldType *type,
                                    int32_t doc_id, float doc_boost,
                                    float length_norm) {
    ScorePostingIVARS *const ivars = ScorePost_IVARS(self);
    MemoryPool     *mem_pool = PostPool_Get_Mem_Pool(post_pool);
    Similarity     *sim = ivars->sim;
    float           field_boost = doc_boost * FType_Get_Boost(type) * length_norm;
    const uint8_t   field_boost_byte  = Sim_Encode_Norm(sim, field_boost);
    const size_t    base_size = Class_Get_Obj_Alloc_Size(RAWPOSTING);
    Token         **tokens;
    uint32_t        freq;

    Inversion_Reset(inversion);
    while ((tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        TokenIVARS *const token_ivars = Token_IVARS(*tokens);
        uint32_t raw_post_bytes
            = MAX_RAW_POSTING_LEN(base_size, token_ivars->len, freq);
        RawPosting *raw_posting
            = RawPost_new(MemPool_Grab(mem_pool, raw_post_bytes), doc_id,
                          freq, token_ivars->text, token_ivars->len);
        RawPostingIVARS *const raw_post_ivars = RawPost_IVARS(raw_posting);
        char *const start  = raw_post_ivars->blob + token_ivars->len;
        char *dest         = start;
        uint32_t last_prox = 0;

        // Field_boost.
        *((uint8_t*)dest) = field_boost_byte;
        dest++;

        // Positions.
        for (uint32_t i = 0; i < freq; i++) {
            TokenIVARS *const t_ivars = Token_IVARS(tokens[i]);
            const uint32_t prox_delta = t_ivars->pos - last_prox;
            NumUtil_encode_c32(prox_delta, &dest);
            last_prox = t_ivars->pos;
        }

        // Resize raw posting memory allocation.
        raw_post_ivars->aux_len = dest - start;
        raw_post_bytes = dest - (char*)raw_posting;
        MemPool_Resize(mem_pool, raw_posting, raw_post_bytes);
        PostPool_Feed(post_pool, (Obj*)raw_posting);
    }
}
Exemplo n.º 4
0
void
MatchPostWriter_Write_Posting_IMP(MatchPostingWriter *self, RawPosting *posting) {
    MatchPostingWriterIVARS *const ivars = MatchPostWriter_IVARS(self);
    RawPostingIVARS *const posting_ivars = RawPost_IVARS(posting);
    OutStream *const outstream   = ivars->outstream;
    const int32_t    doc_id      = posting_ivars->doc_id;
    const uint32_t   delta_doc   = doc_id - ivars->last_doc_id;
    char  *const     aux_content = posting_ivars->blob
                                   + posting_ivars->content_len;
    if (posting_ivars->freq == 1) {
        const uint32_t doc_code = (delta_doc << 1) | 1;
        OutStream_Write_C32(outstream, doc_code);
    }
    else {
        const uint32_t doc_code = delta_doc << 1;
        OutStream_Write_C32(outstream, doc_code);
        OutStream_Write_C32(outstream, posting_ivars->freq);
    }
    OutStream_Write_Bytes(outstream, aux_content, posting_ivars->aux_len);
    ivars->last_doc_id = doc_id;
}
Exemplo n.º 5
0
RawPosting*
ScorePost_Read_Raw_IMP(ScorePosting *self, InStream *instream,
                       int32_t last_doc_id, String *term_text,
                       MemoryPool *mem_pool) {
    const char *const text_buf  = Str_Get_Ptr8(term_text);
    const size_t      text_size = Str_Get_Size(term_text);
    const uint32_t    doc_code  = InStream_Read_C32(instream);
    const uint32_t    delta_doc = doc_code >> 1;
    const int32_t     doc_id    = last_doc_id + delta_doc;
    const uint32_t    freq      = (doc_code & 1)
                                  ? 1
                                  : InStream_Read_C32(instream);
    const size_t base_size = Class_Get_Obj_Alloc_Size(RAWPOSTING);
    size_t raw_post_bytes  = MAX_RAW_POSTING_LEN(base_size, text_size, freq);
    void *const allocation = MemPool_Grab(mem_pool, raw_post_bytes);
    RawPosting *const raw_posting
        = RawPost_new(allocation, doc_id, freq, text_buf, text_size);
    RawPostingIVARS *const raw_post_ivars = RawPost_IVARS(raw_posting);
    uint32_t num_prox = freq;
    char *const start = raw_post_ivars->blob + text_size;
    char *dest        = start;
    UNUSED_VAR(self);

    // Field_boost.
    *((uint8_t*)dest) = InStream_Read_U8(instream);
    dest++;

    // Read positions.
    while (num_prox--) {
        dest += InStream_Read_Raw_C64(instream, dest);
    }

    // Resize raw posting memory allocation.
    raw_post_ivars->aux_len = dest - start;
    raw_post_bytes       = dest - (char*)raw_posting;
    MemPool_Resize(mem_pool, raw_posting, raw_post_bytes);

    return raw_posting;
}
Exemplo n.º 6
0
uint32_t
PostPool_Refill_IMP(PostingPool *self) {
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);
    Lexicon *const     lexicon     = ivars->lexicon;
    PostingList *const plist       = ivars->plist;
    I32Array    *const doc_map     = ivars->doc_map;
    const uint32_t     mem_thresh  = ivars->mem_thresh;
    const int32_t      doc_base    = ivars->doc_base;
    uint32_t           num_elems   = 0; // number of items recovered
    String            *term_text   = NULL;

    if (ivars->lexicon == NULL) { return 0; }
    else { term_text = (String*)Lex_Get_Term(lexicon); }

    // Make sure buffer is empty.
    if (ivars->buf_max - ivars->buf_tick > 0) {
        THROW(ERR, "Refill called but buffer contains %u32 items",
              ivars->buf_max - ivars->buf_tick);
    }
    ivars->buf_max  = 0;
    ivars->buf_tick = 0;

    // Ditch old MemoryPool and get another.
    DECREF(ivars->mem_pool);
    ivars->mem_pool = MemPool_new(0);
    MemoryPool *const mem_pool = ivars->mem_pool;
    MemoryPoolIVARS *const mem_pool_ivars = MemPool_IVARS(mem_pool);


    while (1) {
        if (ivars->post_count == 0) {
            // Read a term.
            if (Lex_Next(lexicon)) {
                ivars->post_count = Lex_Doc_Freq(lexicon);
                term_text = (String*)Lex_Get_Term(lexicon);
                if (term_text && !Obj_Is_A((Obj*)term_text, STRING)) {
                    THROW(ERR, "Only String terms are supported for now");
                }
                Posting *posting = PList_Get_Posting(plist);
                Post_Set_Doc_ID(posting, doc_base);
                ivars->last_doc_id = doc_base;
            }
            // Bail if we've read everything in this run.
            else {
                break;
            }
        }

        // Bail if we've hit the ceiling for this run's buffer.
        if (mem_pool_ivars->consumed >= mem_thresh && num_elems > 0) {
            break;
        }

        // Read a posting from the input stream.
        RawPosting *rawpost
            = PList_Read_Raw(plist, ivars->last_doc_id, term_text, mem_pool);
        RawPostingIVARS *const rawpost_ivars = RawPost_IVARS(rawpost);
        ivars->last_doc_id = rawpost_ivars->doc_id;
        ivars->post_count--;

        // Skip deletions.
        if (doc_map != NULL) {
            const int32_t remapped
                = I32Arr_Get(doc_map, rawpost_ivars->doc_id - doc_base);
            if (!remapped) {
                continue;
            }
            rawpost_ivars->doc_id = remapped;
        }

        // Add to the run's buffer.
        if (num_elems >= ivars->buf_cap) {
            size_t new_cap = Memory_oversize(num_elems + 1, sizeof(Obj*));
            PostPool_Grow_Buffer(self, new_cap);
        }
        ivars->buffer[num_elems] = (Obj*)rawpost;
        num_elems++;
    }

    // Reset the buffer array position and length; remember file pos.
    ivars->buf_max   = num_elems;
    ivars->buf_tick  = 0;

    return num_elems;
}
Exemplo n.º 7
0
static void
S_write_terms_and_postings(PostingPool *self, PostingWriter *post_writer,
                           OutStream *skip_stream) {
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);
    TermInfo      *const tinfo            = TInfo_new(0);
    TermInfo      *const skip_tinfo       = TInfo_new(0);
    TermInfoIVARS *const tinfo_ivars      = TInfo_IVARS(tinfo);
    TermInfoIVARS *const skip_tinfo_ivars = TInfo_IVARS(skip_tinfo);
    LexiconWriter *const lex_writer       = ivars->lex_writer;
    SkipStepper   *const skip_stepper     = ivars->skip_stepper;
    SkipStepperIVARS *const skip_stepper_ivars
        = SkipStepper_IVARS(skip_stepper);
    int32_t        last_skip_doc          = 0;
    int64_t        last_skip_filepos      = 0;
    const int32_t  skip_interval
        = Arch_Skip_Interval(Schema_Get_Architecture(ivars->schema));

    // Prime heldover variables.
    RawPosting *posting
        = (RawPosting*)CERTIFY(PostPool_Fetch(self), RAWPOSTING);
    RawPostingIVARS *post_ivars = RawPost_IVARS(posting);
    CharBuf *last_term_text
        = CB_new_from_trusted_utf8(post_ivars->blob, post_ivars->content_len);
    const char *last_text_buf  = CB_Get_Ptr8(last_term_text);
    uint32_t    last_text_size = CB_Get_Size(last_term_text);
    SkipStepper_Set_ID_And_Filepos(skip_stepper, 0, 0);

    // Initialize sentinel to be used on the last iter, using an empty string
    // in order to make LexiconWriter Do The Right Thing.
    size_t sentinel_size = Class_Get_Obj_Alloc_Size(RAWPOSTING)
                           + 20;  // blob length + cushion
    char empty_string[] = "";
    RawPosting *sentinel = RawPost_new(alloca(sentinel_size), 0, 1,
                                       empty_string, 0);

    while (1) {
        bool same_text_as_last = true;

        if (posting == NULL) {
            // On the last iter, use an empty string to make LexiconWriter
            // DTRT.
            posting = sentinel;
            post_ivars = RawPost_IVARS(posting);
            same_text_as_last = false;
        }
        else {
            // Compare once.
            if (post_ivars->content_len != last_text_size
                || memcmp(&post_ivars->blob, last_text_buf, last_text_size) != 0
               ) {
                same_text_as_last = false;
            }
        }

        // If the term text changes, process the last term.
        if (!same_text_as_last) {
            // Hand off to LexiconWriter.
            LexWriter_Add_Term(lex_writer, (Obj*)last_term_text, tinfo);

            // Start each term afresh.
            TInfo_Reset(tinfo);
            PostWriter_Start_Term(post_writer, tinfo);

            // Init skip data in preparation for the next term.
            skip_stepper_ivars->doc_id  = 0;
            skip_stepper_ivars->filepos = tinfo_ivars->post_filepos;
            last_skip_doc         = 0;
            last_skip_filepos     = tinfo_ivars->post_filepos;

            // Remember the term_text so we can write string diffs.
            CB_Mimic_Utf8(last_term_text, post_ivars->blob,
                          post_ivars->content_len);
            last_text_buf  = CB_Get_Ptr8(last_term_text);
            last_text_size = CB_Get_Size(last_term_text);
        }

        // Bail on last iter before writing invalid posting data.
        if (posting == sentinel) { break; }

        // Write posting data.
        PostWriter_Write_Posting(post_writer, posting);

        // Doc freq lags by one iter.
        tinfo_ivars->doc_freq++;

        //  Write skip data.
        if (skip_stream != NULL
            && same_text_as_last
            && tinfo_ivars->doc_freq % skip_interval == 0
            && tinfo_ivars->doc_freq != 0
           ) {
            // If first skip group, save skip stream pos for term info.
            if (tinfo_ivars->doc_freq == skip_interval) {
                tinfo_ivars->skip_filepos = OutStream_Tell(skip_stream);
            }
            // Write deltas.
            last_skip_doc               = skip_stepper_ivars->doc_id;
            last_skip_filepos           = skip_stepper_ivars->filepos;
            skip_stepper_ivars->doc_id  = post_ivars->doc_id;
            PostWriter_Update_Skip_Info(post_writer, skip_tinfo);
            skip_stepper_ivars->filepos = skip_tinfo_ivars->post_filepos;
            SkipStepper_Write_Record(skip_stepper, skip_stream,
                                     last_skip_doc, last_skip_filepos);
        }

        // Retrieve the next posting from the sort pool.
        // DECREF(posting);  // No!!  DON'T destroy!!!

        posting = (RawPosting*)PostPool_Fetch(self);
        post_ivars = RawPost_IVARS(posting);
    }

    // Clean up.
    DECREF(last_term_text);
    DECREF(skip_tinfo);
    DECREF(tinfo);
}