Beispiel #1
0
static void
test_Equals(TestBatchRunner *runner) {
    ByteBuf *wanted = BB_new_bytes("foo", 4); // Include terminating NULL.
    ByteBuf *got    = BB_new_bytes("foo", 4);

    TEST_TRUE(runner, BB_Equals(wanted, (Obj*)got), "Equals");
    TEST_INT_EQ(runner, BB_Hash_Sum(got), BB_Hash_Sum(wanted), "Hash_Sum");

    TEST_TRUE(runner, BB_Equals_Bytes(got, "foo", 4), "Equals_Bytes");
    TEST_FALSE(runner, BB_Equals_Bytes(got, "foo", 3),
               "Equals_Bytes spoiled by different size");
    TEST_FALSE(runner, BB_Equals_Bytes(got, "bar", 4),
               "Equals_Bytes spoiled by different content");

    BB_Set_Size(got, 3);
    TEST_FALSE(runner, BB_Equals(wanted, (Obj*)got),
               "Different size spoils Equals");
    TEST_FALSE(runner, BB_Hash_Sum(got) == BB_Hash_Sum(wanted),
               "Different size spoils Hash_Sum (probably -- at least this one)");

    BB_Mimic_Bytes(got, "bar", 4);
    TEST_INT_EQ(runner, BB_Get_Size(wanted), BB_Get_Size(got),
                "same length");
    TEST_FALSE(runner, BB_Equals(wanted, (Obj*)got),
               "Different content spoils Equals");

    DECREF(got);
    DECREF(wanted);
}
Beispiel #2
0
static void
S_init_arena(MemoryPool *self, size_t amount) {
    ByteBuf *bb;

    // Indicate which arena we're using at present.
    self->tick++;

    if (self->tick < (int32_t)VA_Get_Size(self->arenas)) {
        // In recycle mode, use previously acquired memory.
        bb = (ByteBuf*)VA_Fetch(self->arenas, self->tick);
        if (amount >= BB_Get_Size(bb)) {
            BB_Grow(bb, amount);
            BB_Set_Size(bb, amount);
        }
    }
    else {
        // In add mode, get more mem from system.
        size_t buf_size = (amount + 1) > self->arena_size
                          ? (amount + 1)
                          : self->arena_size;
        char *ptr = (char*)MALLOCATE(buf_size);
        bb = BB_new_steal_bytes(ptr, buf_size - 1, buf_size);
        VA_Push(self->arenas, (Obj*)bb);
    }

    // Recalculate consumption to take into account blocked off space.
    self->consumed = 0;
    for (int32_t i = 0; i < self->tick; i++) {
        ByteBuf *bb = (ByteBuf*)VA_Fetch(self->arenas, i);
        self->consumed += BB_Get_Size(bb);
    }

    self->buf   = BB_Get_Buf(bb);
    self->limit = self->buf + BB_Get_Size(bb);
}
Beispiel #3
0
static void
test_compare(TestBatchRunner *runner) {
    ByteBuf *a = BB_new_bytes("foo\0a", 5);
    ByteBuf *b = BB_new_bytes("foo\0b", 5);

    BB_Set_Size(a, 4);
    BB_Set_Size(b, 4);
    TEST_INT_EQ(runner, BB_compare(&a, &b), 0,
                "BB_compare returns 0 for equal ByteBufs");

    BB_Set_Size(a, 3);
    TEST_TRUE(runner, BB_compare(&a, &b) < 0, "shorter ByteBuf sorts first");

    BB_Set_Size(a, 5);
    BB_Set_Size(b, 5);
    TEST_TRUE(runner, BB_compare(&a, &b) < 0,
              "NULL doesn't interfere with BB_compare");

    DECREF(a);
    DECREF(b);
}
Beispiel #4
0
void
DefDocReader_Read_Record_IMP(DefaultDocReader *self, ByteBuf *buffer,
                             int32_t doc_id) {
    DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self);

    // Find start and length of variable length record.
    InStream_Seek(ivars->ix_in, (int64_t)doc_id * 8);
    int64_t start = InStream_Read_I64(ivars->ix_in);
    int64_t end   = InStream_Read_I64(ivars->ix_in);
    size_t size  = (size_t)(end - start);

    // Read in the record.
    char *buf = BB_Grow(buffer, size);
    InStream_Seek(ivars->dat_in, start);
    InStream_Read_Bytes(ivars->dat_in, buf, size);
    BB_Set_Size(buffer, size);
}
Beispiel #5
0
void
DefHLReader_read_record(DefaultHighlightReader *self, int32_t doc_id,
                        ByteBuf *target) {
    InStream *dat_in = self->dat_in;
    InStream *ix_in  = self->ix_in;

    InStream_Seek(ix_in, doc_id * 8);

    // Copy the whole record.
    int64_t  filepos = InStream_Read_I64(ix_in);
    int64_t  end     = InStream_Read_I64(ix_in);
    size_t   size    = (size_t)(end - filepos);
    char    *buf     = BB_Grow(target, size);
    InStream_Seek(dat_in, filepos);
    InStream_Read_Bytes(dat_in, buf, size);
    BB_Set_Size(target, size);
}
Beispiel #6
0
void
DefDocReader_read_record(DefaultDocReader *self, ByteBuf *buffer,
                         i32_t doc_id)
{
    i64_t start;
    i64_t end;
    i32_t size;

    /* Find start and length of variable length record. */
    InStream_Seek(self->ix_in, (i64_t)doc_id * 8);
    start = InStream_Read_U64(self->ix_in);
    end   = InStream_Read_U64(self->ix_in);
    size  = end - start;

    /* Read in the record. */
    BB_Grow(buffer, size);
    InStream_Seek(self->dat_in, start);
    InStream_Read_Bytes(self->dat_in, buffer->ptr, size);
    BB_Set_Size(buffer, size);
}
static void
S_init_arena(MemoryPool *self, size_t amount)
{
    ByteBuf *bb;
    i32_t i;

    /* Indicate which arena we're using at present. */
    self->tick++;

    if (self->tick < (i32_t)VA_Get_Size(self->arenas)) {
        /* In recycle mode, use previously acquired memory. */
        bb = (ByteBuf*)VA_Fetch(self->arenas, self->tick);
        if (amount >= BB_Get_Size(bb)) {
            BB_Grow(bb, amount);
            BB_Set_Size(bb, amount);
        }
    }
    else {
        /* In add mode, get more mem from system. */
        size_t buf_size = (amount + 1) > self->arena_size 
            ? (amount + 1)
            : self->arena_size;
        char *ptr       = MALLOCATE(buf_size, char);
        if (ptr == NULL)
            THROW("Failed to allocate memory");
        bb = BB_new_steal_str(ptr, buf_size - 1, buf_size);
        VA_Push(self->arenas, (Obj*)bb);
    }

    /* Recalculate consumption to take into account blocked off space. */
    self->consumed = 0;
    for (i = 0; i < self->tick; i++) {
        ByteBuf *bb = (ByteBuf*)VA_Fetch(self->arenas, i);
        self->consumed += BB_Get_Size(bb);
    }

    self->buf   = bb->ptr;
    self->limit = BBEND(bb);
}
Beispiel #8
0
void
HLWriter_Add_Segment_IMP(HighlightWriter *self, SegReader *reader,
                         I32Array *doc_map) {
    HighlightWriterIVARS *const ivars = HLWriter_IVARS(self);
    int32_t doc_max = SegReader_Doc_Max(reader);

    if (doc_max == 0) {
        // Bail if the supplied segment is empty.
        return;
    }
    else {
        DefaultHighlightReader *hl_reader
            = (DefaultHighlightReader*)CERTIFY(
                  SegReader_Obtain(reader, Class_Get_Name(HIGHLIGHTREADER)),
                  DEFAULTHIGHLIGHTREADER);
        OutStream *dat_out = S_lazy_init(self);
        OutStream *ix_out  = ivars->ix_out;
        int32_t    orig;
        ByteBuf   *bb = BB_new(0);

        for (orig = 1; orig <= doc_max; orig++) {
            // Skip deleted docs.
            if (doc_map && !I32Arr_Get(doc_map, orig)) {
                continue;
            }

            // Write file pointer.
            OutStream_Write_I64(ix_out, OutStream_Tell(dat_out));

            // Copy the raw record.
            DefHLReader_Read_Record(hl_reader, orig, bb);
            OutStream_Write_Bytes(dat_out, BB_Get_Buf(bb), BB_Get_Size(bb));

            BB_Set_Size(bb, 0);
        }
        DECREF(bb);
    }
}
Beispiel #9
0
static Hash*
S_extract_tv_cache(Blob *field_buf) {
    Hash       *tv_cache  = Hash_new(0);
    const char *tv_string = Blob_Get_Buf(field_buf);
    int32_t     num_terms = NumUtil_decode_ci32(&tv_string);
    ByteBuf    *text_buf  = BB_new(0);

    // Read the number of highlightable terms in the field.
    for (int32_t i = 0; i < num_terms; i++) {
        size_t   overlap = NumUtil_decode_cu32(&tv_string);
        size_t   len     = NumUtil_decode_cu32(&tv_string);

        // Decompress the term text.
        BB_Set_Size(text_buf, overlap);
        BB_Cat_Bytes(text_buf, tv_string, len);
        tv_string += len;

        // Get positions & offsets string.
        const char *bookmark_ptr  = tv_string;
        int32_t     num_positions = NumUtil_decode_ci32(&tv_string);
        while (num_positions--) {
            // Leave nums compressed to save a little mem.
            NumUtil_skip_cint(&tv_string);
            NumUtil_skip_cint(&tv_string);
            NumUtil_skip_cint(&tv_string);
        }
        len = tv_string - bookmark_ptr;

        // Store the $text => $posdata pair in the output hash.
        String *text = BB_Trusted_Utf8_To_String(text_buf);
        Hash_Store(tv_cache, text, (Obj*)Blob_new(bookmark_ptr, len));
        DECREF(text);
    }
    DECREF(text_buf);

    return tv_cache;
}
Beispiel #10
0
ByteBuf*
HLWriter_TV_Buf_IMP(HighlightWriter *self, Inversion *inversion) {
    const char *last_text = "";
    size_t      last_len = 0;
    ByteBuf    *tv_buf = BB_new(20 + Inversion_Get_Size(inversion) * 8);
    uint32_t    num_postings = 0;
    Token     **tokens;
    uint32_t    freq;
    UNUSED_VAR(self);

    // Leave space for a c32 indicating the number of postings.
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    while ((tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        Token *token = *tokens;
        char *const   token_text = Token_Get_Text(token);
        const int32_t token_len  = Token_Get_Len(token);

        int32_t overlap = StrHelp_overlap(last_text, token_text,
                                          last_len, token_len);
        char *ptr;
        char *orig;
        size_t old_size = BB_Get_Size(tv_buf);
        size_t new_size = old_size
                          + C32_MAX_BYTES      // overlap
                          + C32_MAX_BYTES      // length of string diff
                          + (token_len - overlap)        // diff char data
                          + C32_MAX_BYTES                // num prox
                          + (C32_MAX_BYTES * freq * 3);  // pos data

        // Allocate for worst-case scenario.
        ptr  = BB_Grow(tv_buf, new_size);
        orig = ptr;
        ptr += old_size;

        // Track number of postings.
        num_postings += 1;

        // Append the string diff to the tv_buf.
        NumUtil_encode_c32(overlap, &ptr);
        NumUtil_encode_c32((token_len - overlap), &ptr);
        memcpy(ptr, (token_text + overlap), (token_len - overlap));
        ptr += token_len - overlap;

        // Save text and text_len for comparison next loop.
        last_text = token_text;
        last_len  = token_len;

        // Append the number of positions for this term.
        NumUtil_encode_c32(freq, &ptr);

        do {
            // Add position, start_offset, and end_offset to tv_buf.
            NumUtil_encode_c32(Token_Get_Pos(token), &ptr);
            NumUtil_encode_c32(Token_Get_Start_Offset(token), &ptr);
            NumUtil_encode_c32(Token_Get_End_Offset(token), &ptr);
        } while (--freq && (token = *++tokens));

        // Set new byte length.
        BB_Set_Size(tv_buf, ptr - orig);
    }

    // Go back and start the term vector string with the posting count.
    char *dest = BB_Get_Buf(tv_buf);
    NumUtil_encode_padded_c32(num_postings, &dest);

    return tv_buf;
}
ByteBuf*
HLWriter_tv_buf(HighlightWriter *self, Inversion *inversion)
{
    char        *last_text = "";
    size_t       last_len = 0;
    ByteBuf     *tv_buf = BB_new(20 + inversion->size * 8); /* generous */
    u32_t        num_postings = 0;
    char        *dest;
    Token      **tokens;
    u32_t        freq;
    UNUSED_VAR(self); /* heh. */

    /* Leave space for a c32 indicating the number of postings. */
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    while ( (tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL ) {
        Token *token = *tokens;
        i32_t overlap = StrHelp_string_diff(last_text, token->text, 
            last_len, token->len);
        char *ptr;
        size_t new_size =   BB_Get_Size(tv_buf)
                          + C32_MAX_BYTES      /* overlap */
                          + C32_MAX_BYTES      /* length of string diff */
                          + (token->len - overlap) /* diff char data */
                          + C32_MAX_BYTES                /* num prox */
                          + (C32_MAX_BYTES * freq * 3);  /* pos data */

        /* Allocate for worst-case scenario. */
        BB_Grow(tv_buf, new_size);
        ptr = BBEND(tv_buf);

        /* Track number of postings. */
        num_postings += 1;
        
        /* Append the string diff to the tv_buf. */
        Math_encode_c32(overlap, &ptr);
        Math_encode_c32( (token->len - overlap), &ptr);
        memcpy(ptr, (token->text + overlap), (token->len - overlap));
        ptr += token->len - overlap;

        /* Save text and text_len for comparison next loop. */
        last_text = token->text;
        last_len  = token->len;

        /* Append the number of positions for this term. */
        Math_encode_c32(freq, &ptr);

        do {
            /* Add position, start_offset, and end_offset to tv_buf. */
            Math_encode_c32(token->pos, &ptr);
            Math_encode_c32(token->start_offset, &ptr);
            Math_encode_c32(token->end_offset, &ptr);

        } while (--freq && (token = *++tokens));

        /* Set new byte length. */
        BB_Set_Size(tv_buf, ptr - tv_buf->ptr); 
    }
    
    /* Go back and start the term vector string with the number of postings. */
    dest = tv_buf->ptr;
    Math_encode_padded_c32(num_postings, &dest);

    return tv_buf;
}