// Check BB_Equals, BB_Equals_Bytes and BB_Hash_Sum against buffers that
// match, differ in size, and differ in content.
static void
test_Equals(TestBatchRunner *runner) {
    const char *const text     = "foo";
    const size_t      with_nul = 4; // Include terminating NULL.
    ByteBuf *expected = BB_new_bytes(text, with_nul);
    ByteBuf *actual   = BB_new_bytes(text, with_nul);

    // Identical content and size.
    TEST_TRUE(runner, BB_Equals(expected, (Obj*)actual), "Equals");
    TEST_INT_EQ(runner, BB_Hash_Sum(actual), BB_Hash_Sum(expected),
                "Hash_Sum");

    // Comparison against raw bytes.
    TEST_TRUE(runner, BB_Equals_Bytes(actual, text, with_nul),
              "Equals_Bytes");
    TEST_FALSE(runner, BB_Equals_Bytes(actual, text, with_nul - 1),
               "Equals_Bytes spoiled by different size");
    TEST_FALSE(runner, BB_Equals_Bytes(actual, "bar", with_nul),
               "Equals_Bytes spoiled by different content");

    // Shrink one buffer by a byte: both equality and hash should diverge.
    BB_Set_Size(actual, with_nul - 1);
    TEST_FALSE(runner, BB_Equals(expected, (Obj*)actual),
               "Different size spoils Equals");
    TEST_FALSE(runner, BB_Hash_Sum(actual) == BB_Hash_Sum(expected),
               "Different size spoils Hash_Sum (probably -- at least this one)");

    // Same length again, but different bytes.
    BB_Mimic_Bytes(actual, "bar", with_nul);
    TEST_INT_EQ(runner, BB_Get_Size(expected), BB_Get_Size(actual),
                "same length");
    TEST_FALSE(runner, BB_Equals(expected, (Obj*)actual),
               "Different content spoils Equals");

    DECREF(actual);
    DECREF(expected);
}
// Advance the pool to its next arena and make sure that arena can hold at
// least `amount` bytes.  Afterwards `self->buf` / `self->limit` delimit the
// active arena and `self->consumed` is the summed size of all earlier arenas.
static void
S_init_arena(MemoryPool *self, size_t amount) {
    ByteBuf *arena;

    // Move on to the next arena slot.
    self->tick++;

    if (self->tick >= (int32_t)VA_Get_Size(self->arenas)) {
        // Add mode: no arena exists for this slot yet, so allocate one that
        // is at least the pool's standard arena size.
        size_t  needed   = amount + 1;
        size_t  capacity = needed > self->arena_size
                           ? needed
                           : self->arena_size;
        char   *mem      = (char*)MALLOCATE(capacity);
        arena = BB_new_steal_bytes(mem, capacity - 1, capacity);
        VA_Push(self->arenas, (Obj*)arena);
    }
    else {
        // Recycle mode: reuse a previously acquired arena, enlarging it when
        // the request would not fit.
        arena = (ByteBuf*)VA_Fetch(self->arenas, self->tick);
        if (amount >= BB_Get_Size(arena)) {
            BB_Grow(arena, amount);
            BB_Set_Size(arena, amount);
        }
    }

    // Recompute consumption: every arena before the current one counts as
    // fully used.
    self->consumed = 0;
    for (int32_t slot = 0; slot < self->tick; slot++) {
        ByteBuf *earlier = (ByteBuf*)VA_Fetch(self->arenas, slot);
        self->consumed += BB_Get_Size(earlier);
    }

    // Point the allocation window at the active arena.
    self->buf   = BB_Get_Buf(arena);
    self->limit = self->buf + BB_Get_Size(arena);
}
// Check BB_compare ordering for equal buffers, a shorter prefix, and
// buffers that differ only after an embedded NUL byte.
static void
test_compare(TestBatchRunner *runner) {
    ByteBuf *a = BB_new_bytes("foo\0a", 5);
    ByteBuf *b = BB_new_bytes("foo\0b", 5);
    int      result;

    // Trim both to "foo\0": the differing last byte is excluded.
    BB_Set_Size(a, 4);
    BB_Set_Size(b, 4);
    result = BB_compare(&a, &b);
    TEST_INT_EQ(runner, result, 0,
                "BB_compare returns 0 for equal ByteBufs");

    // "foo" versus "foo\0".
    BB_Set_Size(a, 3);
    result = BB_compare(&a, &b);
    TEST_TRUE(runner, result < 0, "shorter ByteBuf sorts first");

    // Restore full length so the byte after the NUL takes part again.
    BB_Set_Size(a, 5);
    BB_Set_Size(b, 5);
    result = BB_compare(&a, &b);
    TEST_TRUE(runner, result < 0,
              "NULL doesn't interfere with BB_compare");

    DECREF(a);
    DECREF(b);
}
// Load the raw record for `doc_id` into `buffer`.
//
// Two consecutive 64-bit values are read from the index stream starting at
// byte offset `doc_id * 8`; they are taken as the record's start and end
// offsets in the data stream.  `buffer` is grown as needed and its size is
// set to the record length.
void
DefDocReader_Read_Record_IMP(DefaultDocReader *self, ByteBuf *buffer,
                             int32_t doc_id) {
    DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self);
    InStream *const index = ivars->ix_in;
    InStream *const data  = ivars->dat_in;

    // Look up where the variable-length record begins and ends.
    const int64_t index_pos = (int64_t)doc_id * 8;
    InStream_Seek(index, index_pos);
    const int64_t rec_start = InStream_Read_I64(index);
    const int64_t rec_end   = InStream_Read_I64(index);
    const size_t  rec_len   = (size_t)(rec_end - rec_start);

    // Pull the record bytes into the caller's buffer.
    char *dest = BB_Grow(buffer, rec_len);
    InStream_Seek(data, rec_start);
    InStream_Read_Bytes(data, dest, rec_len);
    BB_Set_Size(buffer, rec_len);
}
// Copy the raw record for `doc_id` into `target`.
//
// Two consecutive 64-bit values are read from the index stream starting at
// byte offset `doc_id * 8`; they are taken as the record's start and end
// offsets in the data stream.  `target` is grown as needed and its size is
// set to the record length.
void
DefHLReader_read_record(DefaultHighlightReader *self, int32_t doc_id,
                        ByteBuf *target) {
    InStream *dat_in = self->dat_in;
    InStream *ix_in  = self->ix_in;

    // Widen before multiplying: `doc_id * 8` in 32-bit arithmetic overflows
    // for doc ids of 2^28 and above, yielding a bogus seek position.
    InStream_Seek(ix_in, (int64_t)doc_id * 8);

    // Locate the record within the data stream.
    int64_t filepos = InStream_Read_I64(ix_in);
    int64_t end     = InStream_Read_I64(ix_in);
    size_t  size    = (size_t)(end - filepos);

    // Copy the whole record.
    char *buf = BB_Grow(target, size);
    InStream_Seek(dat_in, filepos);
    InStream_Read_Bytes(dat_in, buf, size);
    BB_Set_Size(target, size);
}
/* Load the raw record for `doc_id` into `buffer`.
 *
 * Two consecutive 64-bit values are read from the index stream starting at
 * byte offset `doc_id * 8`; they are taken as the record's start and end
 * offsets in the data stream.  `buffer` is grown as needed and its size is
 * set to the record length.
 */
void
DefDocReader_read_record(DefaultDocReader *self, ByteBuf *buffer,
                         i32_t doc_id) {
    const i64_t index_pos = (i64_t)doc_id * 8;
    i64_t       rec_start;
    i64_t       rec_end;
    i32_t       rec_len;

    /* Look up where the variable-length record begins and ends. */
    InStream_Seek(self->ix_in, index_pos);
    rec_start = (i64_t)InStream_Read_U64(self->ix_in);
    rec_end   = (i64_t)InStream_Read_U64(self->ix_in);
    rec_len   = (i32_t)(rec_end - rec_start);

    /* Pull the record bytes into the caller's buffer. */
    BB_Grow(buffer, rec_len);
    InStream_Seek(self->dat_in, rec_start);
    InStream_Read_Bytes(self->dat_in, buffer->ptr, rec_len);
    BB_Set_Size(buffer, rec_len);
}
/* Advance the pool to its next arena and make sure that arena can hold at
 * least `amount` bytes.  Afterwards `self->buf` / `self->limit` delimit the
 * active arena and `self->consumed` is the summed size of all earlier
 * arenas.
 */
static void
S_init_arena(MemoryPool *self, size_t amount) {
    ByteBuf *arena;
    i32_t    slot;

    /* Move on to the next arena slot. */
    self->tick++;

    if (self->tick >= (i32_t)VA_Get_Size(self->arenas)) {
        /* Add mode: no arena exists for this slot yet, so allocate one that
         * is at least the pool's standard arena size. */
        size_t  needed   = amount + 1;
        size_t  capacity = needed > self->arena_size
                           ? needed
                           : self->arena_size;
        char   *mem      = MALLOCATE(capacity, char);
        if (mem == NULL) {
            THROW("Failed to allocate memory");
        }
        arena = BB_new_steal_str(mem, capacity - 1, capacity);
        VA_Push(self->arenas, (Obj*)arena);
    }
    else {
        /* Recycle mode: reuse a previously acquired arena, enlarging it
         * when the request would not fit. */
        arena = (ByteBuf*)VA_Fetch(self->arenas, self->tick);
        if (amount >= BB_Get_Size(arena)) {
            BB_Grow(arena, amount);
            BB_Set_Size(arena, amount);
        }
    }

    /* Recompute consumption: every arena before the current one counts as
     * fully used. */
    self->consumed = 0;
    for (slot = 0; slot < self->tick; slot++) {
        ByteBuf *earlier = (ByteBuf*)VA_Fetch(self->arenas, slot);
        self->consumed += BB_Get_Size(earlier);
    }

    /* Point the allocation window at the active arena. */
    self->buf   = arena->ptr;
    self->limit = BBEND(arena);
}
// Copy the raw highlight records of every surviving document in `reader`
// into this writer's output streams.
//
// For each doc id from 1 to the segment's doc max, an index entry holding the
// current data-stream position is written, followed by the record bytes.
// When `doc_map` is non-NULL, doc ids that it maps to 0 are skipped.
void
HLWriter_Add_Segment_IMP(HighlightWriter *self, SegReader *reader,
                         I32Array *doc_map) {
    HighlightWriterIVARS *const ivars = HLWriter_IVARS(self);
    const int32_t doc_max = SegReader_Doc_Max(reader);

    // Nothing to do for an empty segment.
    if (doc_max == 0) {
        return;
    }

    DefaultHighlightReader *hl_reader
        = (DefaultHighlightReader*)CERTIFY(
              SegReader_Obtain(reader, Class_Get_Name(HIGHLIGHTREADER)),
              DEFAULTHIGHLIGHTREADER);
    // NOTE(review): ix_out is fetched only after S_lazy_init has run, as in
    // the original ordering -- presumably that call opens both streams;
    // confirm before reordering.
    OutStream *dat_out = S_lazy_init(self);
    OutStream *ix_out  = ivars->ix_out;
    ByteBuf   *record  = BB_new(0);

    for (int32_t doc_id = 1; doc_id <= doc_max; doc_id++) {
        // Process the doc unless a doc map is present and marks it deleted.
        if (doc_map == NULL || I32Arr_Get(doc_map, doc_id)) {
            // Index entry: where this record starts in the data stream.
            OutStream_Write_I64(ix_out, OutStream_Tell(dat_out));

            // Transfer the record verbatim, then reset the scratch buffer.
            DefHLReader_Read_Record(hl_reader, doc_id, record);
            OutStream_Write_Bytes(dat_out, BB_Get_Buf(record),
                                  BB_Get_Size(record));
            BB_Set_Size(record, 0);
        }
    }

    DECREF(record);
}
// Decode a serialized term-vector field into a hash that maps each term's
// text to a Blob holding that term's still-compressed position data.
//
// Term texts are prefix-compressed: each entry stores how many leading bytes
// it shares with the previous term plus the differing tail.
static Hash*
S_extract_tv_cache(Blob *field_buf) {
    Hash       *cache  = Hash_new(0);
    const char *cursor = Blob_Get_Buf(field_buf);

    // Read the number of highlightable terms in the field.
    int32_t  num_terms  = NumUtil_decode_ci32(&cursor);
    ByteBuf *term_bytes = BB_new(0);

    for (int32_t remaining = num_terms; remaining > 0; remaining--) {
        size_t shared   = NumUtil_decode_cu32(&cursor);
        size_t tail_len = NumUtil_decode_cu32(&cursor);

        // Rebuild the term text: keep the shared prefix, append the tail.
        BB_Set_Size(term_bytes, shared);
        BB_Cat_Bytes(term_bytes, cursor, tail_len);
        cursor += tail_len;

        // Remember where the positions & offsets data begins.
        const char *posdata       = cursor;
        int32_t     num_positions = NumUtil_decode_ci32(&cursor);
        while (num_positions--) {
            // Step over the three compressed ints of each position entry,
            // leaving them compressed to save a little mem.
            for (int field = 0; field < 3; field++) {
                NumUtil_skip_cint(&cursor);
            }
        }
        size_t posdata_len = (size_t)(cursor - posdata);

        // Store the text => posdata pair in the output hash.
        String *text         = BB_Trusted_Utf8_To_String(term_bytes);
        Blob   *posdata_blob = Blob_new(posdata, posdata_len);
        Hash_Store(cache, text, (Obj*)posdata_blob);
        DECREF(text);
    }

    DECREF(term_bytes);
    return cache;
}
// Serialize the token clusters of `inversion` into a new ByteBuf.
//
// Layout: a padded c32 posting count, then for each cluster the overlap with
// the previous term text, the length of the differing tail, the tail bytes,
// the number of positions, and (position, start offset, end offset) for each
// token of the cluster.  `self` is not used.
ByteBuf*
HLWriter_TV_Buf_IMP(HighlightWriter *self, Inversion *inversion) {
    const char *prev_text    = "";
    size_t      prev_len     = 0;
    ByteBuf    *tv_buf       = BB_new(20 + Inversion_Get_Size(inversion) * 8);
    uint32_t    num_postings = 0;
    Token     **cluster;
    uint32_t    freq;
    UNUSED_VAR(self);

    // Reserve room at the front for the posting count, filled in last.
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    while ((cluster = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        Token        *token      = *cluster;
        char *const   token_text = Token_Get_Text(token);
        const int32_t token_len  = Token_Get_Len(token);
        int32_t       overlap    = StrHelp_overlap(prev_text, token_text,
                                                   prev_len, token_len);
        const int32_t diff_len   = token_len - overlap;
        size_t        used       = BB_Get_Size(tv_buf);
        size_t        worst_case = used
                                   + C32_MAX_BYTES              // overlap
                                   + C32_MAX_BYTES              // length of string diff
                                   + diff_len                   // diff char data
                                   + C32_MAX_BYTES              // num prox
                                   + (C32_MAX_BYTES * freq * 3); // pos data

        // Make room for the worst case, then write past the existing data.
        char *base   = BB_Grow(tv_buf, worst_case);
        char *cursor = base + used;

        // One more posting.
        num_postings += 1;

        // Emit the prefix-compressed term text.
        NumUtil_encode_c32(overlap, &cursor);
        NumUtil_encode_c32(diff_len, &cursor);
        memcpy(cursor, (token_text + overlap), diff_len);
        cursor += diff_len;

        // The next cluster is diffed against this term.
        prev_text = token_text;
        prev_len  = token_len;

        // Emit the position count, then one triple per token.
        NumUtil_encode_c32(freq, &cursor);
        do {
            NumUtil_encode_c32(Token_Get_Pos(token), &cursor);
            NumUtil_encode_c32(Token_Get_Start_Offset(token), &cursor);
            NumUtil_encode_c32(Token_Get_End_Offset(token), &cursor);
        } while (--freq && (token = *++cluster));

        // Record how many bytes are actually in use.
        BB_Set_Size(tv_buf, cursor - base);
    }

    // Go back and write the posting count into the reserved space.
    char *head = BB_Get_Buf(tv_buf);
    NumUtil_encode_padded_c32(num_postings, &head);

    return tv_buf;
}
/* Serialize the token clusters of `inversion` into a new ByteBuf.
 *
 * Layout: a padded c32 posting count, then for each cluster the overlap with
 * the previous term text, the length of the differing tail, the tail bytes,
 * the number of positions, and (position, start offset, end offset) for each
 * token of the cluster.  `self` is not used.
 */
ByteBuf*
HLWriter_tv_buf(HighlightWriter *self, Inversion *inversion) {
    char    *prev_text    = "";
    size_t   prev_len     = 0;
    ByteBuf *tv_buf       = BB_new(20 + inversion->size * 8); /* generous */
    u32_t    num_postings = 0;
    char    *head;
    Token  **cluster;
    u32_t    freq;
    UNUSED_VAR(self);

    /* Reserve room at the front for the posting count, filled in last. */
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    while ((cluster = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        Token  *token      = *cluster;
        i32_t   overlap    = StrHelp_string_diff(prev_text, token->text,
                                                 prev_len, token->len);
        char   *cursor;
        size_t  worst_case = BB_Get_Size(tv_buf)
                             + C32_MAX_BYTES              /* overlap */
                             + C32_MAX_BYTES              /* length of string diff */
                             + (token->len - overlap)     /* diff char data */
                             + C32_MAX_BYTES              /* num prox */
                             + (C32_MAX_BYTES * freq * 3); /* pos data */

        /* Make room for the worst case, then write past the existing data. */
        BB_Grow(tv_buf, worst_case);
        cursor = BBEND(tv_buf);

        /* One more posting. */
        num_postings += 1;

        /* Emit the prefix-compressed term text. */
        Math_encode_c32(overlap, &cursor);
        Math_encode_c32((token->len - overlap), &cursor);
        memcpy(cursor, (token->text + overlap), (token->len - overlap));
        cursor += token->len - overlap;

        /* The next cluster is diffed against this term. */
        prev_text = token->text;
        prev_len  = token->len;

        /* Emit the position count, then one triple per token. */
        Math_encode_c32(freq, &cursor);
        do {
            Math_encode_c32(token->pos, &cursor);
            Math_encode_c32(token->start_offset, &cursor);
            Math_encode_c32(token->end_offset, &cursor);
        } while (--freq && (token = *++cluster));

        /* Record how many bytes are actually in use. */
        BB_Set_Size(tv_buf, cursor - tv_buf->ptr);
    }

    /* Go back and write the posting count into the reserved space. */
    head = tv_buf->ptr;
    Math_encode_padded_c32(num_postings, &head);

    return tv_buf;
}