Пример #1
0
static void
test_overlap(TestBatchRunner *runner) {
    size_t result;
    result = StrHelp_overlap("", "", 0, 0);
    TEST_UINT_EQ(runner, result, 0, "two empty strings");
    result = StrHelp_overlap("", "foo", 0, 3);
    TEST_UINT_EQ(runner, result, 0, "first string is empty");
    result = StrHelp_overlap("foo", "", 3, 0);
    TEST_UINT_EQ(runner, result, 0, "second string is empty");
    result = StrHelp_overlap("foo", "foo", 3, 3);
    TEST_UINT_EQ(runner, result, 3, "equal strings");
    result = StrHelp_overlap("foo bar", "foo", 7, 3);
    TEST_UINT_EQ(runner, result, 3, "first string is longer");
    result = StrHelp_overlap("foo", "foo bar", 3, 7);
    TEST_UINT_EQ(runner, result, 3, "second string is longer");
    result = StrHelp_overlap("bar", "baz", 3, 3);
    TEST_UINT_EQ(runner, result, 2, "different byte");
}
Пример #2
0
void
TextTermStepper_Write_Delta_IMP(TextTermStepper *self, OutStream *outstream,
                                Obj *value) {
    TextTermStepperIVARS *const ivars = TextTermStepper_IVARS(self);
    CharBuf    *charbuf   = (CharBuf*)ivars->value;
    const char *last_text = CB_Get_Ptr8(charbuf);
    size_t      last_size = CB_Get_Size(charbuf);
    const char *new_text  = NULL;
    size_t      new_size  = 0;

    if (Obj_is_a(value, STRING)) {
        String *new_string = (String*)value;
        new_text = Str_Get_Ptr8(new_string);
        new_size = Str_Get_Size(new_string);
    }
    else if (Obj_is_a(value, CHARBUF)) {
        CharBuf *new_charbuf = (CharBuf*)value;
        new_text = CB_Get_Ptr8(new_charbuf);
        new_size = CB_Get_Size(new_charbuf);
    }
    else {
        THROW(ERR, "'value' must be a String or CharBuf");
    }

    // Count how many bytes the strings share at the top.
    const int32_t overlap = StrHelp_overlap(last_text, new_text,
                                            last_size, new_size);
    const char *const diff_start_str = new_text + overlap;
    const size_t diff_len            = new_size - overlap;

    // Write number of common bytes and common bytes.
    OutStream_Write_C32(outstream, overlap);
    OutStream_Write_String(outstream, diff_start_str, diff_len);

    // Update value.
    CB_Mimic_Utf8(charbuf, new_text, new_size);

    // Invalidate string.
    DECREF(ivars->string);
    ivars->string = NULL;
}
Пример #3
0
void
TextTermStepper_write_delta(TextTermStepper *self, OutStream *outstream,
                            Obj *value)
{
    CharBuf *new_value  = (CharBuf*)CERTIFY(value, CHARBUF);
    CharBuf *last_value = (CharBuf*)self->value;
    char    *new_text  = (char*)CB_Get_Ptr8(new_value);
    size_t   new_size  = CB_Get_Size(new_value);
    char    *last_text = (char*)CB_Get_Ptr8(last_value);
    size_t   last_size = CB_Get_Size(last_value);

    // Count how many bytes the strings share at the top.  
    const int32_t overlap = StrHelp_overlap(last_text, new_text,
        last_size, new_size);
    const char *const diff_start_str = new_text + overlap;
    const size_t diff_len            = new_size - overlap;

    // Write number of common bytes and common bytes. 
    OutStream_Write_C32(outstream, overlap);
    OutStream_Write_String(outstream, diff_start_str, diff_len);

    // Update value. 
    CB_Mimic((CharBuf*)self->value, value);
}
Пример #4
0
ByteBuf*
HLWriter_TV_Buf_IMP(HighlightWriter *self, Inversion *inversion) {
    const char *last_text = "";
    size_t      last_len = 0;
    ByteBuf    *tv_buf = BB_new(20 + Inversion_Get_Size(inversion) * 8);
    uint32_t    num_postings = 0;
    Token     **tokens;
    uint32_t    freq;
    UNUSED_VAR(self);

    // Leave space for a c32 indicating the number of postings.
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    while ((tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        Token *token = *tokens;
        char *const   token_text = Token_Get_Text(token);
        const int32_t token_len  = Token_Get_Len(token);

        int32_t overlap = StrHelp_overlap(last_text, token_text,
                                          last_len, token_len);
        char *ptr;
        char *orig;
        size_t old_size = BB_Get_Size(tv_buf);
        size_t new_size = old_size
                          + C32_MAX_BYTES      // overlap
                          + C32_MAX_BYTES      // length of string diff
                          + (token_len - overlap)        // diff char data
                          + C32_MAX_BYTES                // num prox
                          + (C32_MAX_BYTES * freq * 3);  // pos data

        // Allocate for worst-case scenario.
        ptr  = BB_Grow(tv_buf, new_size);
        orig = ptr;
        ptr += old_size;

        // Track number of postings.
        num_postings += 1;

        // Append the string diff to the tv_buf.
        NumUtil_encode_c32(overlap, &ptr);
        NumUtil_encode_c32((token_len - overlap), &ptr);
        memcpy(ptr, (token_text + overlap), (token_len - overlap));
        ptr += token_len - overlap;

        // Save text and text_len for comparison next loop.
        last_text = token_text;
        last_len  = token_len;

        // Append the number of positions for this term.
        NumUtil_encode_c32(freq, &ptr);

        do {
            // Add position, start_offset, and end_offset to tv_buf.
            NumUtil_encode_c32(Token_Get_Pos(token), &ptr);
            NumUtil_encode_c32(Token_Get_Start_Offset(token), &ptr);
            NumUtil_encode_c32(Token_Get_End_Offset(token), &ptr);
        } while (--freq && (token = *++tokens));

        // Set new byte length.
        BB_Set_Size(tv_buf, ptr - orig);
    }

    // Go back and start the term vector string with the posting count.
    char *dest = BB_Get_Buf(tv_buf);
    NumUtil_encode_padded_c32(num_postings, &dest);

    return tv_buf;
}