예제 #1
0
/* Round-trip test for compressed 32-bit integers ("c32").
 *
 * For each of several value ranges, a batch of random integers is encoded
 * with NumUtil_encode_c32() and decoded again, checking that every value
 * survives and that NumUtil_skip_cint() advances exactly as far as the
 * decoder does.  The same checks are repeated for
 * NumUtil_encode_padded_c32(), which is additionally expected to fill the
 * whole scratch buffer (count * C32_MAX_BYTES bytes).  Finally U32_MAX is
 * round-tripped on its own.
 *
 * @param batch Test batch that records the individual test results.
 */
static void
test_c32(TestBatch *batch) {
    // Lower and upper bounds of the random ranges under test; the sets
    // straddle 0x4000, I32_MAX and the top of the 32-bit range.
    uint64_t  mins[]   = { 0,   0x4000 - 100, (uint32_t)I32_MAX - 100, U32_MAX - 10 };
    uint64_t  limits[] = { 500, 0x4000 + 100, (uint32_t)I32_MAX + 100, U32_MAX      };
    uint32_t  set_num;
    uint32_t  num_sets  = sizeof(mins) / sizeof(mins[0]);
    size_t    count     = 64;
    uint64_t *ints      = NULL;
    // Scratch buffer sized for the worst case: every value at maximum width.
    size_t    amount    = count * C32_MAX_BYTES;
    char     *encoded   = (char*)CALLOCATE(amount, sizeof(char));
    char     *target    = encoded;
    char     *limit     = target + amount;

    for (set_num = 0; set_num < num_sets; set_num++) {
        char *skip;
        // The previous array is handed back in so it can be reused.
        ints = TestUtils_random_u64s(ints, count,
                                     mins[set_num], limits[set_num]);

        // Variable-width encoding.
        target = encoded;
        for (size_t i = 0; i < count; i++) {
            NumUtil_encode_c32((uint32_t)ints[i], &target);
        }
        target = encoded;
        skip   = encoded;
        for (size_t i = 0; i < count; i++) {
            // The message argument is cast to unsigned long so that it
            // matches the %lu conversion.
            TEST_INT_EQ(batch, NumUtil_decode_c32(&target), (long)ints[i],
                        "c32 %lu", (unsigned long)ints[i]);
            NumUtil_skip_cint(&skip);
            if (target > limit) { THROW(ERR, "overrun"); }
        }
        TEST_TRUE(batch, skip == target, "skip %lu == %lu",
                  (unsigned long)skip, (unsigned long)target);

        // Padded encoding: every value should occupy C32_MAX_BYTES, so the
        // write pointer must land exactly on the end of the buffer.
        target = encoded;
        for (size_t i = 0; i < count; i++) {
            NumUtil_encode_padded_c32((uint32_t)ints[i], &target);
        }
        TEST_TRUE(batch, target == limit,
                  "padded c32 uses 5 bytes (%lu == %lu)", (unsigned long)target,
                  (unsigned long)limit);
        target = encoded;
        skip   = encoded;
        for (size_t i = 0; i < count; i++) {
            TEST_INT_EQ(batch, NumUtil_decode_c32(&target), (long)ints[i],
                        "padded c32 %lu", (unsigned long)ints[i]);
            NumUtil_skip_cint(&skip);
            if (target > limit) { THROW(ERR, "overrun"); }
        }
        TEST_TRUE(batch, skip == target, "skip padded %lu == %lu",
                  (unsigned long)skip, (unsigned long)target);
    }

    // Largest representable value, checked explicitly.
    target = encoded;
    NumUtil_encode_c32(U32_MAX, &target);
    target = encoded;
    TEST_INT_EQ(batch, NumUtil_decode_c32(&target), U32_MAX, "c32 U32_MAX");

    FREEMEM(encoded);
    FREEMEM(ints);
}
예제 #2
0
/* Turn every token cluster of `inversion` into a RawPosting and feed it to
 * `post_pool`.
 *
 * Each RawPosting is carved out of the pool's MemoryPool at a worst-case
 * size, filled in, and then resized to the number of bytes actually used.
 * The auxiliary data written after the first `len` bytes of the blob is one
 * encoded norm byte followed by `freq` c32-encoded position deltas.
 *
 * @param self        Posting whose Similarity encodes the norm byte.
 * @param post_pool   Pool that supplies memory and receives the postings.
 * @param inversion   Source of token clusters; it is reset before iterating.
 * @param type        Field type supplying the field's boost.
 * @param doc_id      Document id stored in each RawPosting.
 * @param doc_boost   Document boost factor.
 * @param length_norm Length normalization factor.
 */
void
ScorePost_Add_Inversion_To_Pool_IMP(ScorePosting *self,
                                    PostingPool *post_pool,
                                    Inversion *inversion, FieldType *type,
                                    int32_t doc_id, float doc_boost,
                                    float length_norm) {
    ScorePostingIVARS *const ivars = ScorePost_IVARS(self);
    MemoryPool *pool       = PostPool_Get_Mem_Pool(post_pool);
    Similarity *similarity = ivars->sim;
    // The combined boost is identical for every posting of this field, so
    // it is encoded once up front.
    float         boost     = doc_boost * FType_Get_Boost(type) * length_norm;
    const uint8_t norm_byte = Sim_Encode_Norm(similarity, boost);
    const size_t  obj_size  = Class_Get_Obj_Alloc_Size(RAWPOSTING);
    uint32_t      freq;

    Inversion_Reset(inversion);
    for (;;) {
        Token **cluster = Inversion_Next_Cluster(inversion, &freq);
        if (cluster == NULL) { break; }

        // The first token of the cluster supplies the text for the posting.
        TokenIVARS *const first = Token_IVARS(*cluster);
        uint32_t alloc_bytes
            = MAX_RAW_POSTING_LEN(obj_size, first->len, freq);
        RawPosting *posting
            = RawPost_new(MemPool_Grab(pool, alloc_bytes), doc_id,
                          freq, first->text, first->len);
        RawPostingIVARS *const posting_ivars = RawPost_IVARS(posting);
        char *const aux_start = posting_ivars->blob + first->len;
        char *cursor          = aux_start;
        uint32_t prev_pos     = 0;

        // Norm byte comes first.
        *((uint8_t*)cursor) = norm_byte;
        cursor++;

        // Then each position, stored as the delta from the previous one.
        for (uint32_t n = 0; n < freq; n++) {
            TokenIVARS *const tok = Token_IVARS(cluster[n]);
            const uint32_t delta = tok->pos - prev_pos;
            NumUtil_encode_c32(delta, &cursor);
            prev_pos = tok->pos;
        }

        // Record how much auxiliary data was written and shrink the
        // allocation down to the bytes in use before handing it off.
        posting_ivars->aux_len = cursor - aux_start;
        alloc_bytes = cursor - (char*)posting;
        MemPool_Resize(pool, posting, alloc_bytes);
        PostPool_Feed(post_pool, (Obj*)posting);
    }
}
예제 #3
0
/* Turn every token cluster of `inversion` into a RawPosting and feed it to
 * `post_pool`.
 *
 * Each RawPosting is allocated from the pool's MemoryPool at a worst-case
 * size and resized afterwards to the bytes actually written.  For every
 * token in a cluster, the data written after the first `len` bytes of the
 * blob is a c32-encoded position delta followed by one encoded norm byte
 * derived from the field boost multiplied by the token's own boost.
 *
 * @param self        Posting whose Similarity encodes the norm bytes.
 * @param post_pool   Pool that supplies memory and receives the postings.
 * @param inversion   Source of token clusters; it is reset before iterating.
 * @param type        Field type supplying the field's boost.
 * @param doc_id      Document id stored in each RawPosting.
 * @param doc_boost   Document boost factor.
 * @param length_norm Length normalization factor.
 */
void
RichPost_add_inversion_to_pool(RichPosting *self, PostingPool *post_pool, 
                               Inversion *inversion, FieldType *type, 
                               int32_t doc_id, float doc_boost,
                               float length_norm)
{
    MemoryPool *pool       = PostPool_Get_Mem_Pool(post_pool);
    Similarity *similarity = self->sim;
    float       base_boost = doc_boost * FType_Get_Boost(type) * length_norm;
    uint32_t    freq;

    Inversion_Reset(inversion);
    for (;;) {
        Token **cluster = Inversion_Next_Cluster(inversion, &freq);
        if (cluster == NULL) { break; }

        // The first token of the cluster supplies the text for the posting.
        Token   *first       = cluster[0];
        uint32_t alloc_bytes = MAX_RAW_POSTING_LEN(first->len, freq);
        RawPosting *posting  = RawPost_new(
            MemPool_Grab(pool, alloc_bytes), doc_id, freq,
            first->text, first->len
        );
        char *const aux_start = posting->blob + first->len;
        char *cursor          = aux_start;
        uint32_t prev_pos     = 0;

        // One (position delta, norm byte) pair per token.
        for (uint32_t n = 0; n < freq; n++) {
            Token *const tok = cluster[n];
            const uint32_t delta     = tok->pos - prev_pos;
            const float    tok_boost = base_boost * tok->boost;

            NumUtil_encode_c32(delta, &cursor);
            prev_pos = tok->pos;

            *((uint8_t*)cursor) = Sim_Encode_Norm(similarity, tok_boost);
            cursor++;
        }

        // Record how much auxiliary data was written and shrink the
        // allocation down to the bytes in use before handing it off.
        posting->aux_len = cursor - aux_start;
        alloc_bytes = cursor - (char*)posting;
        MemPool_Resize(pool, posting, alloc_bytes);
        PostPool_Feed(post_pool, &posting);
    }
}
예제 #4
0
/* Serialize the token clusters of `inversion` into a newly created ByteBuf.
 *
 * Layout, as written by this function:
 *   - a padded c32 holding the number of clusters ("postings"), filled in
 *     last once the count is known;
 *   - per cluster: c32 overlap with the previous cluster's text, c32 length
 *     of the remaining text, the remaining text bytes, c32 number of
 *     positions, then for each token three c32 values (position, start
 *     offset, end offset).
 *
 * @param self      Unused.
 * @param inversion Source of token clusters; it is reset before iterating.
 * @return The new ByteBuf.
 */
ByteBuf*
HLWriter_TV_Buf_IMP(HighlightWriter *self, Inversion *inversion) {
    const char *prev_text = "";
    size_t      prev_len  = 0;
    ByteBuf    *tv_buf    = BB_new(20 + Inversion_Get_Size(inversion) * 8);
    uint32_t    posting_count = 0;
    Token     **tokens;
    uint32_t    freq;
    UNUSED_VAR(self);

    // Reserve room at the front for the posting count.
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    for (;;) {
        tokens = Inversion_Next_Cluster(inversion, &freq);
        if (tokens == NULL) { break; }

        Token *token = *tokens;
        char *const   text     = Token_Get_Text(token);
        const int32_t text_len = Token_Get_Len(token);

        // Only the part of the text not shared with the previous cluster's
        // text is stored.
        int32_t overlap = StrHelp_overlap(prev_text, text,
                                          prev_len, text_len);
        const int32_t diff_len = text_len - overlap;

        // Worst-case size for this cluster's record.
        size_t used   = BB_Get_Size(tv_buf);
        size_t needed = used
                        + C32_MAX_BYTES                // overlap
                        + C32_MAX_BYTES                // length of string diff
                        + diff_len                     // diff char data
                        + C32_MAX_BYTES                // num prox
                        + (C32_MAX_BYTES * freq * 3);  // pos data

        char *const base = BB_Grow(tv_buf, needed);
        char *ptr        = base + used;

        posting_count += 1;

        // String diff: overlap, remaining length, remaining bytes.
        NumUtil_encode_c32(overlap, &ptr);
        NumUtil_encode_c32(diff_len, &ptr);
        memcpy(ptr, (text + overlap), diff_len);
        ptr += diff_len;

        // Remember this text for the next cluster's overlap computation.
        prev_text = text;
        prev_len  = text_len;

        // Number of positions for this term.
        NumUtil_encode_c32(freq, &ptr);

        // Position, start offset and end offset for each token.  The walk
        // ends when the count runs out or a NULL token is encountered.
        for (;;) {
            NumUtil_encode_c32(Token_Get_Pos(token), &ptr);
            NumUtil_encode_c32(Token_Get_Start_Offset(token), &ptr);
            NumUtil_encode_c32(Token_Get_End_Offset(token), &ptr);

            if (--freq == 0) { break; }
            token = *++tokens;
            if (token == NULL) { break; }
        }

        // Commit the bytes actually written.
        BB_Set_Size(tv_buf, ptr - base);
    }

    // Fill in the reserved slot at the front with the posting count.
    char *head = BB_Get_Buf(tv_buf);
    NumUtil_encode_padded_c32(posting_count, &head);

    return tv_buf;
}