// Check that ByteBuf capacity is reported in multiples of 8, both right
// after construction and after an explicit Grow request.
static void
test_Grow(TestBatchRunner *runner) {
    // Ask for a single byte; the reported capacity should be 8.
    ByteBuf *buf = BB_new(1);
    TEST_INT_EQ(runner, BB_Get_Capacity(buf), 8,
                "Allocate in 8-byte increments");

    // Ask for one byte more than 8; the reported capacity should be 16.
    BB_Grow(buf, 9);
    TEST_INT_EQ(runner, BB_Get_Capacity(buf), 16,
                "Grow in 8-byte increments");

    DECREF(buf);
}
// Exercise BB_Mimic (copy from another object) and BB_Mimic_Bytes (copy
// from a raw byte range), checking equality, content and size afterwards.
static void
test_Mimic(TestBatchRunner *runner) {
    ByteBuf *source = BB_new_bytes("foo", 3);
    ByteBuf *target = BB_new(0);

    // An empty buffer mimicking "foo" must compare equal to it.
    BB_Mimic(target, (Obj*)source);
    TEST_TRUE(runner, BB_Equals(source, (Obj*)target), "Mimic");

    // Copy 4 bytes so that the terminating NUL of "bar" is included, which
    // is what makes the strcmp() on the raw buffer below well-defined.
    BB_Mimic_Bytes(source, "bar", 4);
    TEST_TRUE(runner, strcmp(BB_Get_Buf(source), "bar") == 0,
              "Mimic_Bytes content");
    TEST_INT_EQ(runner, BB_Get_Size(source), 4, "Mimic_Bytes size");

    // Mimic again now that the source content has changed.
    BB_Mimic(target, (Obj*)source);
    TEST_TRUE(runner, BB_Equals(source, (Obj*)target), "Mimic");

    DECREF(source);
    DECREF(target);
}
// Copy the raw highlight records of every surviving document in `reader`
// into this writer's output.
//
// For each document id from 1 to the segment's doc max, the current
// position of the data stream is written to the index stream and the
// record is appended to the data stream.  When `doc_map` is non-NULL,
// documents whose map entry is 0 are skipped; a NULL `doc_map` keeps
// every document.
void
HLWriter_Add_Segment_IMP(HighlightWriter *self, SegReader *reader,
                         I32Array *doc_map) {
    HighlightWriterIVARS *const ivars = HLWriter_IVARS(self);
    int32_t doc_max = SegReader_Doc_Max(reader);

    // Nothing to do for an empty segment.
    if (doc_max == 0) {
        return;
    }

    DefaultHighlightReader *hl_reader
        = (DefaultHighlightReader*)CERTIFY(
              SegReader_Obtain(reader, Class_Get_Name(HIGHLIGHTREADER)),
              DEFAULTHIGHLIGHTREADER);
    // ix_out is read from ivars only after S_lazy_init() has run.
    OutStream *dat_out = S_lazy_init(self);
    OutStream *ix_out  = ivars->ix_out;
    ByteBuf   *record  = BB_new(0);

    for (int32_t doc_id = 1; doc_id <= doc_max; doc_id++) {
        // Process the doc unless a doc_map is present and maps it to 0.
        if (!doc_map || I32Arr_Get(doc_map, doc_id)) {
            // Record where this doc's data starts.
            OutStream_Write_I64(ix_out, OutStream_Tell(dat_out));

            // Transfer the record verbatim, then empty the scratch buffer.
            DefHLReader_Read_Record(hl_reader, doc_id, record);
            OutStream_Write_Bytes(dat_out, BB_Get_Buf(record),
                                  BB_Get_Size(record));
            BB_Set_Size(record, 0);
        }
    }

    DECREF(record);
}
// Decode a serialized term-vector field into a Hash that maps each term's
// text to a Blob holding that term's still-compressed position data.
//
// The term texts are stored as (overlap, length, bytes) triples, where
// `overlap` is the number of leading bytes kept from the previous term.
// Returns a new Hash.
static Hash*
S_extract_tv_cache(Blob *field_buf) {
    Hash       *cache  = Hash_new(0);
    const char *cursor = Blob_Get_Buf(field_buf);

    // Read the number of highlightable terms in the field.
    int32_t  term_count = NumUtil_decode_ci32(&cursor);
    ByteBuf *term_text  = BB_new(0);

    for (int32_t term_tick = 0; term_tick < term_count; term_tick++) {
        size_t overlap    = NumUtil_decode_cu32(&cursor);
        size_t suffix_len = NumUtil_decode_cu32(&cursor);

        // Rebuild the term: keep `overlap` bytes of the previous term and
        // append the new suffix.
        BB_Set_Size(term_text, overlap);
        BB_Cat_Bytes(term_text, cursor, suffix_len);
        cursor += suffix_len;

        // Remember where the positions & offsets data begins.
        const char *posdata_start = cursor;
        int32_t     num_positions = NumUtil_decode_ci32(&cursor);

        // Step over three compressed ints per position, leaving them
        // encoded to save a little memory.
        for (int32_t left = num_positions; left != 0; left--) {
            NumUtil_skip_cint(&cursor);
            NumUtil_skip_cint(&cursor);
            NumUtil_skip_cint(&cursor);
        }
        size_t posdata_len = (size_t)(cursor - posdata_start);

        // Store the text => posdata pair in the output hash.
        String *key = BB_Trusted_Utf8_To_String(term_text);
        Hash_Store(cache, key, (Obj*)Blob_new(posdata_start, posdata_len));
        DECREF(key);
    }

    DECREF(term_text);
    return cache;
}
// Copy the raw document records of every surviving document in `reader`
// into this writer's output.
//
// For each document id from 1 to the segment's doc max, the record is
// appended to the data stream and the position at which it starts is
// written to the index stream.
//
// `doc_map` may be NULL, in which case every document is kept; otherwise
// documents whose map entry is 0 are skipped.  This matches how
// HLWriter_Add_Segment_IMP treats its doc_map.
void
DocWriter_Add_Segment_IMP(DocWriter *self, SegReader *reader,
                          I32Array *doc_map) {
    DocWriterIVARS *const ivars = DocWriter_IVARS(self);
    const int32_t doc_max = SegReader_Doc_Max(reader);

    // Bail if the supplied segment is empty.
    if (doc_max == 0) {
        return;
    }

    // ix_out is read from ivars only after S_lazy_init() has run.
    OutStream *const dat_out = S_lazy_init(self);
    OutStream *const ix_out  = ivars->ix_out;
    ByteBuf   *const buffer  = BB_new(0);
    DefaultDocReader *const doc_reader
        = (DefaultDocReader*)CERTIFY(
              SegReader_Obtain(reader, VTable_Get_Name(DOCREADER)),
              DEFAULTDOCREADER);

    // Reuse doc_max rather than querying the reader a second time.
    for (int32_t i = 1; i <= doc_max; i++) {
        // Skip deleted docs.  Guard against a NULL doc_map before
        // consulting it.
        if (doc_map && !I32Arr_Get(doc_map, i)) {
            continue;
        }

        int64_t start = OutStream_Tell(dat_out);

        // Copy record over.
        DefDocReader_Read_Record(doc_reader, buffer, i);
        char   *buf  = BB_Get_Buf(buffer);
        size_t  size = BB_Get_Size(buffer);
        OutStream_Write_Bytes(dat_out, buf, size);

        // Write file pointer.
        OutStream_Write_I64(ix_out, start);
    }

    DECREF(buffer);
}
/* Copy the raw highlight records of every surviving document in `reader`
 * into this writer's output.
 *
 * For each document id from 1 to the segment's doc max, the current
 * position of the data stream is written to the index stream and the
 * record is appended to the data stream.  When `doc_map` is non-NULL,
 * documents whose map entry is 0 are skipped; a NULL `doc_map` keeps
 * every document.
 */
void
HLWriter_add_segment(HighlightWriter *self, SegReader *reader,
                     I32Array *doc_map) {
    i32_t doc_max = SegReader_Doc_Max(reader);

    /* An empty segment contributes nothing. */
    if (doc_max != 0) {
        DefaultHighlightReader *hl_reader = (DefaultHighlightReader*)
            ASSERT_IS_A(SegReader_Obtain(reader, HIGHLIGHTREADER.name),
                        DEFAULTHIGHLIGHTREADER);
        /* self->ix_out is read only after S_lazy_init() has run. */
        OutStream *dat_out = S_lazy_init(self);
        OutStream *ix_out  = self->ix_out;
        i32_t      doc_id;
        ByteBuf   *record  = BB_new(0);

        for (doc_id = 1; doc_id <= doc_max; doc_id++) {
            /* Process the doc unless a doc_map is present and maps it
             * to 0. */
            if (!doc_map || I32Arr_Get(doc_map, doc_id)) {
                /* Record where this doc's data starts. */
                OutStream_Write_U64(ix_out, OutStream_Tell(dat_out));

                /* Transfer the record verbatim, then empty the scratch
                 * buffer. */
                DefHLReader_Read_Record(hl_reader, doc_id, record);
                OutStream_Write_Bytes(dat_out, record->ptr, record->size);
                record->size = 0;
            }
        }

        DECREF(record);
    }
}
// Serialize the token clusters of `inversion` into a new ByteBuf.
//
// Layout written by this function:
//   - a padded c32 holding the number of postings (filled in last);
//   - per cluster: c32 overlap with the previous token text, c32 length of
//     the differing suffix, the suffix bytes, c32 number of positions, and
//     then (position, start_offset, end_offset) as c32s for each token.
//
// `self` is unused.  Returns the new ByteBuf.
ByteBuf*
HLWriter_TV_Buf_IMP(HighlightWriter *self, Inversion *inversion) {
    const char *prev_text     = "";
    size_t      prev_len      = 0;
    ByteBuf    *tv_buf        = BB_new(20 + Inversion_Get_Size(inversion) * 8);
    uint32_t    posting_count = 0;
    Token     **cluster;
    uint32_t    freq;
    UNUSED_VAR(self);

    // Leave space for a c32 indicating the number of postings.
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    while ((cluster = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        Token *token = *cluster;
        char *const   text       = Token_Get_Text(token);
        const int32_t text_len   = Token_Get_Len(token);
        const int32_t overlap    = StrHelp_overlap(prev_text, text,
                                                   prev_len, text_len);
        const int32_t suffix_len = text_len - overlap;
        const size_t  used       = BB_Get_Size(tv_buf);

        // Worst-case number of bytes this cluster can take.
        const size_t worst_case = used
                                  + C32_MAX_BYTES                // overlap
                                  + C32_MAX_BYTES                // suffix length
                                  + suffix_len                   // suffix bytes
                                  + C32_MAX_BYTES                // num prox
                                  + (C32_MAX_BYTES * freq * 3);  // pos data

        // Grow first, then derive the write cursor from the pointer that
        // BB_Grow hands back.
        char *const base   = BB_Grow(tv_buf, worst_case);
        char       *cursor = base + used;

        // Track number of postings.
        posting_count += 1;

        // Append the string diff: shared-prefix length, suffix length,
        // suffix bytes.
        NumUtil_encode_c32(overlap, &cursor);
        NumUtil_encode_c32(suffix_len, &cursor);
        memcpy(cursor, text + overlap, suffix_len);
        cursor += suffix_len;

        // Remember this text for the next iteration's overlap computation.
        prev_text = text;
        prev_len  = text_len;

        // Append the number of positions for this term.
        NumUtil_encode_c32(freq, &cursor);

        // Append position, start_offset and end_offset for each token in
        // the cluster.
        do {
            NumUtil_encode_c32(Token_Get_Pos(token), &cursor);
            NumUtil_encode_c32(Token_Get_Start_Offset(token), &cursor);
            NumUtil_encode_c32(Token_Get_End_Offset(token), &cursor);
        } while (--freq && (token = *++cluster));

        // Set the size to the number of bytes actually written.
        BB_Set_Size(tv_buf, cursor - base);
    }

    // Go back and start the term vector string with the posting count.
    char *count_dest = BB_Get_Buf(tv_buf);
    NumUtil_encode_padded_c32(posting_count, &count_dest);

    return tv_buf;
}
/* Serialize the token clusters of `inversion` into a new ByteBuf.
 *
 * Layout written by this function:
 *   - a padded c32 holding the number of postings (filled in last);
 *   - per cluster: c32 overlap with the previous token text, c32 length of
 *     the differing suffix, the suffix bytes, c32 number of positions, and
 *     then (pos, start_offset, end_offset) as c32s for each token.
 *
 * `self` is unused.  Returns the new ByteBuf.
 */
ByteBuf*
HLWriter_tv_buf(HighlightWriter *self, Inversion *inversion) {
    char     *prev_text     = "";
    size_t    prev_len      = 0;
    ByteBuf  *tv_buf        = BB_new(20 + inversion->size * 8); /* generous */
    u32_t     posting_count = 0;
    char     *count_dest;
    Token   **cluster;
    u32_t     freq;
    UNUSED_VAR(self); /* heh. */

    /* Leave space for a c32 indicating the number of postings. */
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    for (cluster = Inversion_Next_Cluster(inversion, &freq);
         cluster != NULL;
         cluster = Inversion_Next_Cluster(inversion, &freq)
        ) {
        Token *token   = *cluster;
        i32_t  overlap = StrHelp_string_diff(prev_text, token->text,
                                             prev_len, token->len);
        char  *cursor;
        /* Worst-case number of bytes this cluster can take. */
        size_t worst_case = BB_Get_Size(tv_buf)
                            + C32_MAX_BYTES               /* overlap */
                            + C32_MAX_BYTES               /* diff length */
                            + (token->len - overlap)      /* diff char data */
                            + C32_MAX_BYTES               /* num prox */
                            + (C32_MAX_BYTES * freq * 3); /* pos data */

        /* Grow first; only afterwards take the end-of-content pointer. */
        BB_Grow(tv_buf, worst_case);
        cursor = BBEND(tv_buf);

        /* Track number of postings. */
        posting_count += 1;

        /* Append the string diff: shared-prefix length, suffix length,
         * suffix bytes. */
        Math_encode_c32(overlap, &cursor);
        Math_encode_c32((token->len - overlap), &cursor);
        memcpy(cursor, (token->text + overlap), (token->len - overlap));
        cursor += token->len - overlap;

        /* Remember this text for the next iteration's diff. */
        prev_text = token->text;
        prev_len  = token->len;

        /* Append the number of positions for this term. */
        Math_encode_c32(freq, &cursor);

        /* Append pos, start_offset and end_offset for each token in the
         * cluster. */
        do {
            Math_encode_c32(token->pos, &cursor);
            Math_encode_c32(token->start_offset, &cursor);
            Math_encode_c32(token->end_offset, &cursor);
        } while (--freq && (token = *++cluster));

        /* Set the size to the number of bytes actually written. */
        BB_Set_Size(tv_buf, cursor - tv_buf->ptr);
    }

    /* Go back and start the term vector string with the number of
     * postings. */
    count_dest = tv_buf->ptr;
    Math_encode_padded_c32(posting_count, &count_dest);

    return tv_buf;
}