// Exercise StrHelp_overlap, which reports how many leading bytes two
// buffers have in common, across empty, equal, prefix and mismatch inputs.
static void
test_overlap(TestBatchRunner *runner) {
    // Empty inputs on either (or both) sides share nothing.
    const size_t both_empty = StrHelp_overlap("", "", 0, 0);
    TEST_UINT_EQ(runner, both_empty, 0, "two empty strings");

    const size_t first_empty = StrHelp_overlap("", "foo", 0, 3);
    TEST_UINT_EQ(runner, first_empty, 0, "first string is empty");

    const size_t second_empty = StrHelp_overlap("foo", "", 3, 0);
    TEST_UINT_EQ(runner, second_empty, 0, "second string is empty");

    // Identical inputs overlap for their whole length.
    const size_t identical = StrHelp_overlap("foo", "foo", 3, 3);
    TEST_UINT_EQ(runner, identical, 3, "equal strings");

    // When one input is a prefix of the other, the overlap is the length
    // of the shorter one.
    const size_t first_longer = StrHelp_overlap("foo bar", "foo", 7, 3);
    TEST_UINT_EQ(runner, first_longer, 3, "first string is longer");

    const size_t second_longer = StrHelp_overlap("foo", "foo bar", 3, 7);
    TEST_UINT_EQ(runner, second_longer, 3, "second string is longer");

    // Counting stops at the first differing byte ("bar" vs "baz").
    const size_t mismatch = StrHelp_overlap("bar", "baz", 3, 3);
    TEST_UINT_EQ(runner, mismatch, 2, "different byte");
}
/* Write `value` to `outstream` as a delta against the text currently stored
 * in the stepper, then store `value` as the stepper's new text.
 *
 * The record written is the number of leading bytes shared with the stored
 * text (via OutStream_Write_C32) followed by the remaining, non-shared bytes
 * of the new text (via OutStream_Write_String).
 *
 * `value` must be a String or a CharBuf; anything else triggers
 * THROW(ERR, ...).
 */
void
TextTermStepper_Write_Delta_IMP(TextTermStepper *self, OutStream *outstream,
                                Obj *value) {
    TextTermStepperIVARS *const ivars = TextTermStepper_IVARS(self);
    CharBuf    *charbuf   = (CharBuf*)ivars->value;
    const char *last_text = CB_Get_Ptr8(charbuf);
    size_t      last_size = CB_Get_Size(charbuf);
    const char *new_text  = NULL;
    size_t      new_size  = 0;

    // Pull a raw byte pointer and length out of whichever type we were given.
    if (Obj_is_a(value, STRING)) {
        String *new_string = (String*)value;
        new_text = Str_Get_Ptr8(new_string);
        new_size = Str_Get_Size(new_string);
    }
    else if (Obj_is_a(value, CHARBUF)) {
        CharBuf *new_charbuf = (CharBuf*)value;
        new_text = CB_Get_Ptr8(new_charbuf);
        new_size = CB_Get_Size(new_charbuf);
    }
    else {
        THROW(ERR, "'value' must be a String or CharBuf");
    }

    // Count how many bytes the strings share at the top.  Keep the count in
    // a size_t: it is used for pointer arithmetic and subtracted from a
    // size_t, so squeezing it through a signed 32-bit integer could corrupt
    // both for very long terms.
    const size_t overlap = StrHelp_overlap(last_text, new_text,
                                           last_size, new_size);
    const char *const diff_start_str = new_text + overlap;
    const size_t      diff_len       = new_size - overlap;

    // Write the number of common bytes, then the bytes that differ.
    // NOTE(review): the count is stored as a 32-bit value, so an overlap of
    // 2^32 bytes or more would be truncated here.
    OutStream_Write_C32(outstream, (uint32_t)overlap);
    OutStream_Write_String(outstream, diff_start_str, diff_len);

    // Update value.
    CB_Mimic_Utf8(charbuf, new_text, new_size);

    // Invalidate string.
    DECREF(ivars->string);
    ivars->string = NULL;
}
/* Write `value` to `outstream` as a delta against the CharBuf currently held
 * in self->value, then copy `value` into self->value.
 *
 * `value` is passed through CERTIFY(value, CHARBUF) before use.  The record
 * written is the number of leading bytes shared with the stored text (via
 * OutStream_Write_C32) followed by the remaining bytes of the new text (via
 * OutStream_Write_String).
 */
void
TextTermStepper_write_delta(TextTermStepper *self, OutStream *outstream,
                            Obj *value) {
    CharBuf *new_value  = (CharBuf*)CERTIFY(value, CHARBUF);
    CharBuf *last_value = (CharBuf*)self->value;

    // The text is only read here, so hold it through const pointers.
    const char *new_text  = (const char*)CB_Get_Ptr8(new_value);
    size_t      new_size  = CB_Get_Size(new_value);
    const char *last_text = (const char*)CB_Get_Ptr8(last_value);
    size_t      last_size = CB_Get_Size(last_value);

    // Count how many bytes the strings share at the top.  Keep the count in
    // a size_t: it is used for pointer arithmetic and subtracted from a
    // size_t, so squeezing it through a signed 32-bit integer could corrupt
    // both for very long terms.
    const size_t overlap = StrHelp_overlap(last_text, new_text,
                                           last_size, new_size);
    const char *const diff_start_str = new_text + overlap;
    const size_t      diff_len       = new_size - overlap;

    // Write the number of common bytes, then the bytes that differ.
    // NOTE(review): the count is stored as a 32-bit value, so an overlap of
    // 2^32 bytes or more would be truncated here.
    OutStream_Write_C32(outstream, (uint32_t)overlap);
    OutStream_Write_String(outstream, diff_start_str, diff_len);

    // Update value (last_value is self->value).
    CB_Mimic(last_value, value);
}
/* Serialize the token clusters of `inversion` into a freshly created ByteBuf
 * and return it.
 *
 * Layout of the buffer, as written below:
 *   - a padded c32 holding the number of postings (filled in last);
 *   - then, for each cluster returned by Inversion_Next_Cluster:
 *       c32  number of leading bytes shared with the previous cluster's text
 *       c32  length of the non-shared tail
 *       ...  the non-shared tail bytes
 *       c32  freq (number of tokens in the cluster)
 *       then, per token: c32 position, c32 start offset, c32 end offset
 *
 * `self` is not used.
 */
ByteBuf*
HLWriter_TV_Buf_IMP(HighlightWriter *self, Inversion *inversion) {
    const char *last_text    = "";
    size_t      last_len     = 0;
    ByteBuf    *tv_buf       = BB_new(20 + Inversion_Get_Size(inversion) * 8);
    uint32_t    num_postings = 0;
    Token     **tokens;
    uint32_t    freq;
    UNUSED_VAR(self);

    // Leave space for a c32 indicating the number of postings.
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    while ((tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        Token       *token      = *tokens;
        char *const  token_text = Token_Get_Text(token);

        // Lengths stay in size_t: they feed StrHelp_overlap (which takes
        // and yields sizes), pointer arithmetic, memcpy and the allocation
        // size, so a signed 32-bit intermediate could corrupt all of them.
        const size_t token_len = Token_Get_Len(token);
        const size_t overlap   = StrHelp_overlap(last_text, token_text,
                                                 last_len, token_len);
        const size_t diff_len  = token_len - overlap;

        char  *ptr;
        char  *orig;
        size_t old_size = BB_Get_Size(tv_buf);
        size_t new_size = old_size
                          + C32_MAX_BYTES                // overlap
                          + C32_MAX_BYTES                // length of string diff
                          + diff_len                     // diff char data
                          + C32_MAX_BYTES                // num prox
                          + (C32_MAX_BYTES * freq * 3);  // pos data

        // Allocate for worst-case scenario.
        ptr  = BB_Grow(tv_buf, new_size);
        orig = ptr;
        ptr += old_size;

        // Track number of postings.
        num_postings += 1;

        // Append the string diff to the tv_buf.
        // NOTE(review): both values are stored as 32-bit c32s, so lengths of
        // 2^32 bytes or more would be truncated here.
        NumUtil_encode_c32((uint32_t)overlap, &ptr);
        NumUtil_encode_c32((uint32_t)diff_len, &ptr);
        memcpy(ptr, token_text + overlap, diff_len);
        ptr += diff_len;

        // Save text and text_len for comparison next loop.
        last_text = token_text;
        last_len  = token_len;

        // Append the number of positions for this term.
        NumUtil_encode_c32(freq, &ptr);

        do {
            // Add position, start_offset, and end_offset to tv_buf.
            NumUtil_encode_c32(Token_Get_Pos(token), &ptr);
            NumUtil_encode_c32(Token_Get_Start_Offset(token), &ptr);
            NumUtil_encode_c32(Token_Get_End_Offset(token), &ptr);
        } while (--freq && (token = *++tokens));

        // Set new byte length: only the bytes actually written count, not
        // the worst-case allocation.
        BB_Set_Size(tv_buf, ptr - orig);
    }

    // Go back and start the term vector string with the posting count.
    char *dest = BB_Get_Buf(tv_buf);
    NumUtil_encode_padded_c32(num_postings, &dest);

    return tv_buf;
}