Пример #1
0
// Check that a ByteBuf reports its capacity in multiples of 8 bytes, both
// right after construction and after an explicit grow request.
static void
test_Grow(TestBatchRunner *runner) {
    ByteBuf *buf = BB_new(1);

    // A 1-byte request is expected to report a capacity of 8.
    TEST_INT_EQ(runner, BB_Get_Capacity(buf), 8,
                "Allocate in 8-byte increments");

    // A 9-byte request is expected to report a capacity of 16.
    BB_Grow(buf, 9);
    TEST_INT_EQ(runner, BB_Get_Capacity(buf), 16,
                "Grow in 8-byte increments");

    DECREF(buf);
}
Пример #2
0
// Exercise BB_Mimic() and BB_Mimic_Bytes(): after mimicking, the target
// buffer must compare equal to its source.
static void
test_Mimic(TestBatchRunner *runner) {
    ByteBuf *source = BB_new_bytes("foo", 3);
    ByteBuf *copy   = BB_new(0);

    // An empty buffer takes on the content of a populated one.
    BB_Mimic(copy, (Obj*)source);
    TEST_TRUE(runner, BB_Equals(source, (Obj*)copy), "Mimic");

    // Four bytes are supplied so that the literal's NUL terminator is
    // included; that is what allows the raw buffer to be handed to strcmp().
    BB_Mimic_Bytes(source, "bar", 4);
    TEST_TRUE(runner, 0 == strcmp(BB_Get_Buf(source), "bar"),
              "Mimic_Bytes content");
    TEST_INT_EQ(runner, BB_Get_Size(source), 4, "Mimic_Bytes size");

    // Mimic again now that the source has changed.
    BB_Mimic(copy, (Obj*)source);
    TEST_TRUE(runner, BB_Equals(source, (Obj*)copy), "Mimic");

    DECREF(source);
    DECREF(copy);
}
Пример #3
0
// Append the highlight records of an existing segment to this writer's
// output.
//
// For every doc id from 1 through the reader's doc max, an index entry (the
// current position in the data stream) is written to ix_out and the raw
// record is copied to the data stream.  When doc_map is non-NULL, doc ids
// that it maps to 0 are skipped.
void
HLWriter_Add_Segment_IMP(HighlightWriter *self, SegReader *reader,
                         I32Array *doc_map) {
    HighlightWriterIVARS *const ivars = HLWriter_IVARS(self);
    const int32_t doc_max = SegReader_Doc_Max(reader);

    // An empty segment has nothing to contribute.
    if (doc_max == 0) {
        return;
    }

    DefaultHighlightReader *hl_reader
        = (DefaultHighlightReader*)CERTIFY(
              SegReader_Obtain(reader, Class_Get_Name(HIGHLIGHTREADER)),
              DEFAULTHIGHLIGHTREADER);

    // ix_out is fetched only after S_lazy_init() has run (presumably that
    // call is what sets up the output streams -- confirm against its
    // definition).
    OutStream *dat_out = S_lazy_init(self);
    OutStream *ix_out  = ivars->ix_out;
    ByteBuf   *record  = BB_new(0);

    for (int32_t doc_id = 1; doc_id <= doc_max; doc_id++) {
        // Process the doc unless a doc_map is present and maps it to 0.
        if (doc_map == NULL || I32Arr_Get(doc_map, doc_id) != 0) {
            // Index entry: where this record starts in the data stream.
            OutStream_Write_I64(ix_out, OutStream_Tell(dat_out));

            // Transfer the raw record bytes unchanged.
            DefHLReader_Read_Record(hl_reader, doc_id, record);
            OutStream_Write_Bytes(dat_out, BB_Get_Buf(record),
                                  BB_Get_Size(record));

            // Empty the scratch buffer for the next record.
            BB_Set_Size(record, 0);
        }
    }

    DECREF(record);
}
Пример #4
0
// Decode one field's serialized term-vector data into a Hash.
//
// Each key is a term's text (as a String); each value is a Blob holding that
// term's positions/offsets data, still in its compressed form.  The returned
// Hash is created here with Hash_new().
static Hash*
S_extract_tv_cache(Blob *field_buf) {
    Hash       *cache  = Hash_new(0);
    const char *cursor = Blob_Get_Buf(field_buf);

    // The data starts with the number of highlightable terms in the field.
    const int32_t term_count = NumUtil_decode_ci32(&cursor);
    ByteBuf      *term_text  = BB_new(0);

    for (int32_t term = 0; term < term_count; term++) {
        const size_t shared_len = NumUtil_decode_cu32(&cursor);
        const size_t suffix_len = NumUtil_decode_cu32(&cursor);

        // Rebuild the term text: keep the first shared_len bytes left in the
        // buffer by the previous iteration, then append the new bytes.
        BB_Set_Size(term_text, shared_len);
        BB_Cat_Bytes(term_text, cursor, suffix_len);
        cursor += suffix_len;

        // Find the extent of the positions/offsets data.  The numbers are
        // only skipped, not decoded, so the stored copy stays compressed.
        const char *const posdata = cursor;
        int32_t remaining = NumUtil_decode_ci32(&cursor);
        for (; remaining != 0; remaining--) {
            // Three compressed integers are stored per position.
            NumUtil_skip_cint(&cursor);
            NumUtil_skip_cint(&cursor);
            NumUtil_skip_cint(&cursor);
        }
        const size_t posdata_len = cursor - posdata;

        // Record the text => posdata pair.
        String *key = BB_Trusted_Utf8_To_String(term_text);
        Hash_Store(cache, key, (Obj*)Blob_new(posdata, posdata_len));
        DECREF(key);
    }

    DECREF(term_text);
    return cache;
}
Пример #5
0
// Append the stored-document records of an existing segment to this
// writer's output.
//
// For every doc id from 1 through the reader's doc max, the raw record is
// copied to the data stream and the position at which it starts is written
// to ix_out.
//
// doc_map may be NULL, in which case every document is copied.  When it is
// non-NULL, doc ids that it maps to 0 are skipped.  This matches the way
// HLWriter_Add_Segment_IMP() treats its doc_map argument; previously a NULL
// doc_map was dereferenced unconditionally here.
void
DocWriter_Add_Segment_IMP(DocWriter *self, SegReader *reader,
                          I32Array *doc_map) {
    DocWriterIVARS *const ivars = DocWriter_IVARS(self);
    const int32_t doc_max = SegReader_Doc_Max(reader);

    // Bail if the supplied segment is empty.
    if (doc_max == 0) {
        return;
    }

    // ix_out is fetched only after S_lazy_init() has run (presumably that
    // call is what sets up the output streams -- confirm against its
    // definition).
    OutStream *const dat_out = S_lazy_init(self);
    OutStream *const ix_out  = ivars->ix_out;
    ByteBuf   *const buffer  = BB_new(0);
    DefaultDocReader *const doc_reader
        = (DefaultDocReader*)CERTIFY(
              SegReader_Obtain(reader, VTable_Get_Name(DOCREADER)),
              DEFAULTDOCREADER);

    // doc_max is reused as the loop bound rather than querying the reader a
    // second time.
    for (int32_t i = 1; i <= doc_max; i++) {
        // Skip deleted docs; a NULL doc_map means nothing is deleted.
        if (doc_map != NULL && !I32Arr_Get(doc_map, i)) {
            continue;
        }

        // Remember where this record starts before writing it.
        const int64_t start = OutStream_Tell(dat_out);

        // Copy record over.
        DefDocReader_Read_Record(doc_reader, buffer, i);
        char  *buf  = BB_Get_Buf(buffer);
        size_t size = BB_Get_Size(buffer);
        OutStream_Write_Bytes(dat_out, buf, size);

        // Write file pointer.
        OutStream_Write_I64(ix_out, start);
    }

    DECREF(buffer);
}
Пример #6
0
/* Append the highlight records of an existing segment to this writer's
 * output.
 *
 * For every doc id from 1 through the reader's doc max, an index entry (the
 * current position in the data stream) is written to ix_out and the raw
 * record is copied to the data stream.  When doc_map is non-NULL, doc ids
 * that it maps to 0 are skipped.
 */
void
HLWriter_add_segment(HighlightWriter *self, SegReader *reader,
                     I32Array *doc_map)
{
    const i32_t doc_max = SegReader_Doc_Max(reader);

    /* An empty segment has nothing to contribute. */
    if (doc_max != 0) {
        DefaultHighlightReader *hl_reader = (DefaultHighlightReader*)
            ASSERT_IS_A(SegReader_Obtain(reader, HIGHLIGHTREADER.name),
            DEFAULTHIGHLIGHTREADER);
        /* ix_out is read only after S_lazy_init() has run. */
        OutStream *dat_out = S_lazy_init(self);
        OutStream *ix_out  = self->ix_out;
        ByteBuf   *record  = BB_new(0);
        i32_t      doc_id;

        for (doc_id = 1; doc_id <= doc_max; doc_id++) {
            /* Process the doc unless a doc_map is present and maps it
             * to 0. */
            if (doc_map == NULL || I32Arr_Get(doc_map, doc_id) != 0) {
                /* Index entry: where this record starts in the data
                 * stream. */
                OutStream_Write_U64(ix_out, OutStream_Tell(dat_out));

                /* Transfer the raw record bytes unchanged. */
                DefHLReader_Read_Record(hl_reader, doc_id, record);
                OutStream_Write_Bytes(dat_out, record->ptr, record->size);

                /* Empty the scratch buffer for the next record. */
                record->size = 0;
            }
        }

        DECREF(record);
    }
}
Пример #7
0
// Serialize the term-vector data for an Inversion into a new ByteBuf.
//
// Layout written, as visible in the code below:
//   - C32_MAX_BYTES reserved at the front, filled in last with the padded
//     c32 count of postings (one posting per token cluster);
//   - for each cluster: c32 overlap with the previous cluster's text, c32
//     length of the remaining text, the remaining text bytes, c32 freq, and
//     then for each token of the cluster its c32 position, start offset and
//     end offset.
//
// `self` is unused.  Returns the ByteBuf created here via BB_new().
ByteBuf*
HLWriter_TV_Buf_IMP(HighlightWriter *self, Inversion *inversion) {
    const char *last_text = "";
    size_t      last_len = 0;
    // Initial capacity is only an estimate; the buffer is grown per cluster
    // inside the loop.
    ByteBuf    *tv_buf = BB_new(20 + Inversion_Get_Size(inversion) * 8);
    uint32_t    num_postings = 0;
    Token     **tokens;
    uint32_t    freq;
    UNUSED_VAR(self);

    // Leave space for a c32 indicating the number of postings.
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    // Each call yields a run of tokens and stores its length in freq.
    while ((tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        Token *token = *tokens;
        char *const   token_text = Token_Get_Text(token);
        const int32_t token_len  = Token_Get_Len(token);

        // Presumably the number of leading bytes this text shares with the
        // previous cluster's text -- confirm against StrHelp_overlap().
        int32_t overlap = StrHelp_overlap(last_text, token_text,
                                          last_len, token_len);
        char *ptr;
        char *orig;
        size_t old_size = BB_Get_Size(tv_buf);
        size_t new_size = old_size
                          + C32_MAX_BYTES      // overlap
                          + C32_MAX_BYTES      // length of string diff
                          + (token_len - overlap)        // diff char data
                          + C32_MAX_BYTES                // num prox
                          + (C32_MAX_BYTES * freq * 3);  // pos data

        // Allocate for worst-case scenario.  The value returned by BB_Grow()
        // is used as the buffer's base address; writing resumes at the end
        // of the existing content.
        ptr  = BB_Grow(tv_buf, new_size);
        orig = ptr;
        ptr += old_size;

        // Track number of postings.
        num_postings += 1;

        // Append the string diff to the tv_buf.
        NumUtil_encode_c32(overlap, &ptr);
        NumUtil_encode_c32((token_len - overlap), &ptr);
        memcpy(ptr, (token_text + overlap), (token_len - overlap));
        ptr += token_len - overlap;

        // Save text and text_len for comparison next loop.
        last_text = token_text;
        last_len  = token_len;

        // Append the number of positions for this term.
        NumUtil_encode_c32(freq, &ptr);

        // The token pointer is advanced only while freq has not run out
        // (short-circuit &&); the loop also ends if the next entry is NULL.
        do {
            // Add position, start_offset, and end_offset to tv_buf.
            NumUtil_encode_c32(Token_Get_Pos(token), &ptr);
            NumUtil_encode_c32(Token_Get_Start_Offset(token), &ptr);
            NumUtil_encode_c32(Token_Get_End_Offset(token), &ptr);
        } while (--freq && (token = *++tokens));

        // Set new byte length: the number of bytes actually written, which
        // may be less than the worst-case new_size.
        BB_Set_Size(tv_buf, ptr - orig);
    }

    // Go back and start the term vector string with the posting count.
    char *dest = BB_Get_Buf(tv_buf);
    NumUtil_encode_padded_c32(num_postings, &dest);

    return tv_buf;
}
Пример #8
0
/* Serialize the term-vector data for an Inversion into a new ByteBuf.
 *
 * Layout written, as visible in the code below:
 *   - C32_MAX_BYTES reserved at the front, filled in last with the padded
 *     c32 count of postings (one posting per token cluster);
 *   - for each cluster: the c32 value returned by StrHelp_string_diff()
 *     (stored in `overlap`), c32 length of the remaining text, the remaining
 *     text bytes, c32 freq, and then for each token of the cluster its c32
 *     pos, start_offset and end_offset.
 *
 * `self` is unused.  Returns the ByteBuf created here via BB_new().
 */
ByteBuf*
HLWriter_tv_buf(HighlightWriter *self, Inversion *inversion)
{
    char        *last_text = "";
    size_t       last_len = 0;
    /* Initial capacity is a generous estimate; the buffer is grown per
     * cluster inside the loop. */
    ByteBuf     *tv_buf = BB_new(20 + inversion->size * 8);
    u32_t        num_postings = 0;
    char        *dest;
    Token      **tokens;
    u32_t        freq;
    UNUSED_VAR(self); /* This routine does not touch the writer object. */

    /* Leave space for a c32 indicating the number of postings. */
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    /* Each call yields a run of tokens and stores its length in freq. */
    while ( (tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL ) {
        Token *token = *tokens;
        /* Presumably the number of leading bytes this text shares with the
         * previous cluster's text -- confirm against
         * StrHelp_string_diff(). */
        i32_t overlap = StrHelp_string_diff(last_text, token->text, 
            last_len, token->len);
        char *ptr;
        size_t new_size =   BB_Get_Size(tv_buf)
                          + C32_MAX_BYTES      /* overlap */
                          + C32_MAX_BYTES      /* length of string diff */
                          + (token->len - overlap) /* diff char data */
                          + C32_MAX_BYTES                /* num prox */
                          + (C32_MAX_BYTES * freq * 3);  /* pos data */

        /* Allocate for worst-case scenario.  BBEND() presumably yields the
         * address just past the buffer's current content -- confirm against
         * the macro's definition. */
        BB_Grow(tv_buf, new_size);
        ptr = BBEND(tv_buf);

        /* Track number of postings. */
        num_postings += 1;
        
        /* Append the string diff to the tv_buf. */
        Math_encode_c32(overlap, &ptr);
        Math_encode_c32( (token->len - overlap), &ptr);
        memcpy(ptr, (token->text + overlap), (token->len - overlap));
        ptr += token->len - overlap;

        /* Save text and text_len for comparison next loop. */
        last_text = token->text;
        last_len  = token->len;

        /* Append the number of positions for this term. */
        Math_encode_c32(freq, &ptr);

        /* The token pointer is advanced only while freq has not run out
         * (short-circuit &&); the loop also ends if the next entry is
         * NULL. */
        do {
            /* Add position, start_offset, and end_offset to tv_buf. */
            Math_encode_c32(token->pos, &ptr);
            Math_encode_c32(token->start_offset, &ptr);
            Math_encode_c32(token->end_offset, &ptr);

        } while (--freq && (token = *++tokens));

        /* Set new byte length: the number of bytes actually written, which
         * may be less than the worst-case new_size. */
        BB_Set_Size(tv_buf, ptr - tv_buf->ptr); 
    }
    
    /* Go back and start the term vector string with the number of postings. */
    dest = tv_buf->ptr;
    Math_encode_padded_c32(num_postings, &dest);

    return tv_buf;
}