// Exercise BB_Equals and BB_Equals_Bytes against buffers that match, differ
// in size, and differ in content.  Each comparison buffer is a short-lived
// temporary that is released as soon as its check has run.
static void
test_Equals(TestBatchRunner *runner) {
    // Four bytes: "foo" plus its terminating NUL.
    ByteBuf *subject = BB_new_bytes("foo", 4);

    // An identical buffer must compare equal.
    {
        ByteBuf *twin = BB_new_bytes("foo", 4);
        TEST_TRUE(runner, BB_Equals(subject, (Obj*)twin), "Equals");
        DECREF(twin);
    }

    // Raw-byte comparisons: match, shorter length, different bytes.
    TEST_TRUE(runner, BB_Equals_Bytes(subject, "foo", 4), "Equals_Bytes");
    TEST_FALSE(runner, BB_Equals_Bytes(subject, "foo", 3),
               "Equals_Bytes spoiled by different size");
    TEST_FALSE(runner, BB_Equals_Bytes(subject, "bar", 4),
               "Equals_Bytes spoiled by different content");

    // Same leading bytes, but one byte shorter.
    {
        ByteBuf *shorter = BB_new_bytes("foo", 3);
        TEST_FALSE(runner, BB_Equals(subject, (Obj*)shorter),
                   "Different size spoils Equals");
        DECREF(shorter);
    }

    // Same length, different bytes.
    {
        ByteBuf *different = BB_new_bytes("bar", 4);
        TEST_UINT_EQ(runner, BB_Get_Size(subject), BB_Get_Size(different),
                     "same length");
        TEST_FALSE(runner, BB_Equals(subject, (Obj*)different),
                   "Different content spoils Equals");
        DECREF(different);
    }

    DECREF(subject);
}
// Check BB_Equals, BB_Equals_Bytes and BB_Hash_Sum using one reference
// buffer and one buffer that is progressively mutated (first truncated,
// then overwritten with different content).
static void
test_Equals(TestBatchRunner *runner) {
    // Both buffers hold four bytes: "foo" plus its terminating NUL.
    ByteBuf *expected = BB_new_bytes("foo", 4);
    ByteBuf *actual   = BB_new_bytes("foo", 4);

    // Identical content: equal, and equal hash sums.
    TEST_TRUE(runner, BB_Equals(expected, (Obj*)actual), "Equals");
    TEST_INT_EQ(runner, BB_Hash_Sum(actual), BB_Hash_Sum(expected),
                "Hash_Sum");

    // Raw-byte comparisons.
    TEST_TRUE(runner, BB_Equals_Bytes(actual, "foo", 4), "Equals_Bytes");
    TEST_FALSE(runner, BB_Equals_Bytes(actual, "foo", 3),
               "Equals_Bytes spoiled by different size");
    TEST_FALSE(runner, BB_Equals_Bytes(actual, "bar", 4),
               "Equals_Bytes spoiled by different content");

    // Truncate the mutable buffer by one byte.
    BB_Set_Size(actual, 3);
    TEST_FALSE(runner, BB_Equals(expected, (Obj*)actual),
               "Different size spoils Equals");
    TEST_FALSE(runner, BB_Hash_Sum(actual) == BB_Hash_Sum(expected),
               "Different size spoils Hash_Sum (probably -- at least this one)");

    // Replace the content with four different bytes.
    BB_Mimic_Bytes(actual, "bar", 4);
    TEST_INT_EQ(runner, BB_Get_Size(expected), BB_Get_Size(actual),
                "same length");
    TEST_FALSE(runner, BB_Equals(expected, (Obj*)actual),
               "Different content spoils Equals");

    DECREF(actual);
    DECREF(expected);
}
// Advance the pool to its next arena and point self->buf / self->limit at
// it.  An existing arena is reused (and enlarged when `amount` is at least
// its current size); otherwise a fresh arena is allocated and appended to
// self->arenas.  self->consumed is then recomputed as the total size of all
// arenas preceding the current one.
static void
S_init_arena(MemoryPool *self, size_t amount) {
    ByteBuf *arena;

    // Move on to the next arena slot.
    self->tick++;

    if (self->tick >= (int32_t)VA_Get_Size(self->arenas)) {
        // Add mode: no arena exists at this slot yet, so obtain new memory.
        // The buffer is one byte larger than the ByteBuf's reported size.
        size_t wanted   = amount + 1;
        size_t buf_size = self->arena_size;
        if (wanted > self->arena_size) {
            buf_size = wanted;
        }
        char *mem = (char*)MALLOCATE(buf_size);
        arena = BB_new_steal_bytes(mem, buf_size - 1, buf_size);
        VA_Push(self->arenas, (Obj*)arena);
    }
    else {
        // Recycle mode: reuse memory acquired on an earlier pass.
        arena = (ByteBuf*)VA_Fetch(self->arenas, self->tick);
        if (amount >= BB_Get_Size(arena)) {
            BB_Grow(arena, amount);
            BB_Set_Size(arena, amount);
        }
    }

    // Everything in the arenas before the current one counts as consumed.
    self->consumed = 0;
    for (int32_t slot = 0; slot < self->tick; slot++) {
        ByteBuf *earlier = (ByteBuf*)VA_Fetch(self->arenas, slot);
        self->consumed += BB_Get_Size(earlier);
    }

    // Expose the current arena's byte range.
    self->buf   = BB_Get_Buf(arena);
    self->limit = self->buf + BB_Get_Size(arena);
}
// Feed one element to the sorter.  `data` is read as a pointer to a
// ByteBuf*; the ByteBuf's size is added to self->mem_consumed and the sorter
// is flushed once that total reaches self->mem_thresh.
void
BBSortEx_feed(BBSortEx *self, void *data) {
    // Hand the element to SortEx_feed first.
    SortEx_feed((SortExternal*)self, data);

    // Account for the element's bytes, verifying that it really is a ByteBuf.
    ByteBuf **slot = (ByteBuf**)data;
    ByteBuf  *fed  = (ByteBuf*)CERTIFY(*slot, BYTEBUF);
    self->mem_consumed += BB_Get_Size(fed);

    // Flush() if necessary.
    if (self->mem_consumed >= self->mem_thresh) {
        BBSortEx_Flush(self);
    }
}
/* Advance the pool to its next arena and point self->buf / self->limit at
 * it.  An existing arena is reused (and enlarged when `amount` is at least
 * its current size); otherwise a fresh arena is allocated and appended to
 * self->arenas.  self->consumed is then recomputed as the total size of all
 * arenas preceding the current one.
 */
static void
S_init_arena(MemoryPool *self, size_t amount) {
    ByteBuf *arena;
    i32_t    slot;

    /* Move on to the next arena slot. */
    self->tick++;

    if (self->tick >= (i32_t)VA_Get_Size(self->arenas)) {
        /* Add mode: no arena exists at this slot yet, so obtain new memory.
         * The buffer is one byte larger than the ByteBuf's reported size. */
        size_t  wanted   = amount + 1;
        size_t  buf_size = self->arena_size;
        char   *mem;
        if (wanted > self->arena_size) {
            buf_size = wanted;
        }
        mem = MALLOCATE(buf_size, char);
        if (mem == NULL) {
            THROW("Failed to allocate memory");
        }
        arena = BB_new_steal_str(mem, buf_size - 1, buf_size);
        VA_Push(self->arenas, (Obj*)arena);
    }
    else {
        /* Recycle mode: reuse memory acquired on an earlier pass. */
        arena = (ByteBuf*)VA_Fetch(self->arenas, self->tick);
        if (amount >= BB_Get_Size(arena)) {
            BB_Grow(arena, amount);
            BB_Set_Size(arena, amount);
        }
    }

    /* Everything in the arenas before the current one counts as consumed. */
    self->consumed = 0;
    for (slot = 0; slot < self->tick; slot++) {
        ByteBuf *earlier = (ByteBuf*)VA_Fetch(self->arenas, slot);
        self->consumed += BB_Get_Size(earlier);
    }

    /* Expose the current arena's byte range. */
    self->buf   = arena->ptr;
    self->limit = BBEND(arena);
}
// Feed one item to the sorter: invoke the superclass Feed, then add the
// item's byte size to mem_consumed and flush once mem_thresh is reached.
void
BBSortEx_Feed_IMP(BBSortEx *self, Obj *item) {
    BBSortExIVARS *const ivars = BBSortEx_IVARS(self);

    // Delegate to the superclass implementation first.
    BBSortEx_Feed_t parent_feed
        = SUPER_METHOD_PTR(BBSORTEX, LUCY_BBSortEx_Feed);
    parent_feed(self, item);

    // Account for the item's bytes, verifying that it really is a ByteBuf.
    ByteBuf *fed = (ByteBuf*)CERTIFY(item, BYTEBUF);
    ivars->mem_consumed += BB_Get_Size(fed);

    // Flush() if necessary.
    if (ivars->mem_consumed >= ivars->mem_thresh) {
        BBSortEx_Flush(self);
    }
}
// Verify that BB_Mimic copies another ByteBuf's content and that
// BB_Mimic_Bytes replaces a ByteBuf's content and size with raw bytes.
static void
test_Mimic(TestBatchRunner *runner) {
    ByteBuf *source = BB_new_bytes("foo", 3);
    ByteBuf *mirror = BB_new(0);

    // An empty buffer takes on the source's content.
    BB_Mimic(mirror, (Obj*)source);
    TEST_TRUE(runner, BB_Equals(source, (Obj*)mirror), "Mimic");

    // Overwrite the source with "bar" plus its terminating NUL (4 bytes).
    BB_Mimic_Bytes(source, "bar", 4);
    TEST_TRUE(runner, strcmp(BB_Get_Buf(source), "bar") == 0,
              "Mimic_Bytes content");
    TEST_INT_EQ(runner, BB_Get_Size(source), 4, "Mimic_Bytes size");

    // Mimic again so the mirror follows the changed source.
    BB_Mimic(mirror, (Obj*)source);
    TEST_TRUE(runner, BB_Equals(source, (Obj*)mirror), "Mimic");

    DECREF(source);
    DECREF(mirror);
}
// Decode the position data held in `tv_buf` into a new TermVector for
// `field` / `term_text`.
//
// The buffer is read as a C32 count followed by that many
// (position, start offset, end offset) C32 triples.  An empty buffer yields
// a TermVector with zero positions.
//
// Returns a new TermVector.  Throws "Bad encoding of posdata" if decoding
// does not end exactly at the end of the buffer; the scratch arrays are
// released before throwing so the error path does not leak them.
static TermVector*
S_extract_tv_from_tv_buf(String *field, String *term_text, ByteBuf *tv_buf) {
    TermVector *retval      = NULL;
    const char *posdata     = BB_Get_Buf(tv_buf);
    const char *posdata_end = posdata + BB_Get_Size(tv_buf);
    int32_t    *positions   = NULL;
    int32_t    *starts      = NULL;
    int32_t    *ends        = NULL;
    uint32_t    num_pos     = 0;

    // Only a non-empty buffer carries a count and needs arrays.
    if (posdata != posdata_end) {
        num_pos   = NumUtil_decode_c32(&posdata);
        positions = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
        starts    = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
        ends      = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
    }

    // Expand C32s.
    for (uint32_t i = 0; i < num_pos; i++) {
        positions[i] = NumUtil_decode_c32(&posdata);
        starts[i]    = NumUtil_decode_c32(&posdata);
        ends[i]      = NumUtil_decode_c32(&posdata);
    }

    if (posdata != posdata_end) {
        // Ownership has not been handed to any I32Array yet, so free the
        // arrays here; otherwise they would leak when THROW unwinds.
        FREEMEM(positions);
        FREEMEM(starts);
        FREEMEM(ends);
        THROW(ERR, "Bad encoding of posdata");
    }
    else {
        // The I32Arrays take ownership of the decoded arrays.
        I32Array *posits_map = I32Arr_new_steal(positions, num_pos);
        I32Array *starts_map = I32Arr_new_steal(starts, num_pos);
        I32Array *ends_map   = I32Arr_new_steal(ends, num_pos);
        retval = TV_new(field, term_text, posits_map, starts_map, ends_map);
        // Drop the local references now that TV_new has been given the maps.
        DECREF(posits_map);
        DECREF(starts_map);
        DECREF(ends_map);
    }

    return retval;
}
// Refill the in-memory buffer from ivars->external.  Elements are pulled
// until either the memory threshold is reached (in which case mem_consumed
// is reset to 0) or the external array is exhausted.
//
// Throws if the buffer still holds unconsumed items.  Returns the number of
// items now in the buffer.
uint32_t
BBSortEx_Refill_IMP(BBSortEx *self) {
    BBSortExIVARS *const ivars = BBSortEx_IVARS(self);

    // The buffer must be fully drained before it can be refilled.
    if (ivars->buf_max - ivars->buf_tick > 0) {
        THROW(ERR, "Refill called but buffer contains %u32 items",
              ivars->buf_max - ivars->buf_tick);
    }
    ivars->buf_tick = 0;
    ivars->buf_max  = 0;

    // Read in elements.
    for (;;) {
        // Stop once enough memory has been accounted for.
        if (ivars->mem_consumed >= ivars->mem_thresh) {
            ivars->mem_consumed = 0;
            break;
        }
        // Stop when there is nothing left to pull.
        if (ivars->external_tick >= VA_Get_Size(ivars->external)) {
            break;
        }

        ByteBuf *next
            = (ByteBuf*)VA_Fetch(ivars->external, ivars->external_tick);
        ivars->external_tick++;
        // Should be + sizeof(ByteBuf), but that's ok.
        ivars->mem_consumed += BB_Get_Size(next);

        // Make room for one more slot when the buffer is full.
        if (ivars->buf_max == ivars->buf_cap) {
            BBSortEx_Grow_Buffer(
                self, Memory_oversize(ivars->buf_max + 1, sizeof(Obj*)));
        }

        // The buffer holds its own reference to the element.
        ivars->buffer[ivars->buf_max] = INCREF(next);
        ivars->buf_max++;
    }

    return ivars->buf_max;
}
// Refill the in-memory cache from self->external.  Elements are pulled
// until either the memory threshold is reached (in which case mem_consumed
// is reset to 0) or the external array is exhausted.
//
// Throws if the cache still holds unconsumed items.  Returns the number of
// items now in the cache.
uint32_t
BBSortEx_refill(BBSortEx *self) {
    // The cache must be fully drained before it can be refilled.
    if (self->cache_max - self->cache_tick > 0) {
        THROW(ERR, "Refill called but cache contains %u32 items",
              self->cache_max - self->cache_tick);
    }
    self->cache_tick = 0;
    self->cache_max  = 0;

    // Read in elements.
    for (;;) {
        // Stop once enough memory has been accounted for.
        if (self->mem_consumed >= self->mem_thresh) {
            self->mem_consumed = 0;
            break;
        }
        // Stop when there is nothing left to pull.
        if (self->external_tick >= VA_Get_Size(self->external)) {
            break;
        }

        ByteBuf *next
            = (ByteBuf*)VA_Fetch(self->external, self->external_tick);
        self->external_tick++;
        // Should be + sizeof(ByteBuf), but that's ok.
        self->mem_consumed += BB_Get_Size(next);

        // Make room for one more slot when the cache is full.
        if (self->cache_max == self->cache_cap) {
            BBSortEx_Grow_Cache(
                self, Memory_oversize(self->cache_max + 1, self->width));
        }

        // Re-read the cache pointer after any growth, then store a new
        // reference to the element in the next free slot.
        Obj **slots = (Obj**)self->cache;
        slots[self->cache_max] = INCREF(next);
        self->cache_max++;
    }

    return self->cache_max;
}
// Copy the highlight records of an existing segment into the segment being
// written.  For every doc id from 1 to the reader's doc max that is not
// mapped to 0 in `doc_map`, the current data-stream position is written to
// the index stream and the raw record is appended to the data stream.  When
// `doc_map` is NULL no documents are skipped.
void
HLWriter_Add_Segment_IMP(HighlightWriter *self, SegReader *reader,
                         I32Array *doc_map) {
    HighlightWriterIVARS *const ivars = HLWriter_IVARS(self);
    int32_t doc_max = SegReader_Doc_Max(reader);

    // Bail if the supplied segment is empty.
    if (doc_max == 0) {
        return;
    }

    DefaultHighlightReader *hl_reader
        = (DefaultHighlightReader*)CERTIFY(
              SegReader_Obtain(reader, Class_Get_Name(HIGHLIGHTREADER)),
              DEFAULTHIGHLIGHTREADER);
    OutStream *dat_out = S_lazy_init(self);
    OutStream *ix_out  = ivars->ix_out;
    ByteBuf   *record  = BB_new(0);

    for (int32_t doc_id = 1; doc_id <= doc_max; doc_id++) {
        // Keep the doc unless the map marks it as deleted.
        if (!doc_map || I32Arr_Get(doc_map, doc_id)) {
            // Write file pointer.
            OutStream_Write_I64(ix_out, OutStream_Tell(dat_out));

            // Copy the raw record, then empty the scratch buffer for reuse.
            DefHLReader_Read_Record(hl_reader, doc_id, record);
            OutStream_Write_Bytes(dat_out, BB_Get_Buf(record),
                                  BB_Get_Size(record));
            BB_Set_Size(record, 0);
        }
    }

    DECREF(record);
}
// Copy the stored-document records of an existing segment into the segment
// being written.
//
// For every doc id from 1 to the reader's doc max that `doc_map` does not
// map to 0, the raw record is appended to the data stream and the record's
// starting offset is written to the index stream.
//
// A NULL `doc_map` is treated as "no deletions" (every record is copied)
// instead of being dereferenced, matching how HLWriter_Add_Segment_IMP
// handles a missing map.
void
DocWriter_Add_Segment_IMP(DocWriter *self, SegReader *reader,
                          I32Array *doc_map) {
    DocWriterIVARS *const ivars = DocWriter_IVARS(self);
    const int32_t doc_max = SegReader_Doc_Max(reader);

    // Bail if the supplied segment is empty.
    if (doc_max == 0) {
        return;
    }

    OutStream *const dat_out = S_lazy_init(self);
    OutStream *const ix_out  = ivars->ix_out;
    DefaultDocReader *const doc_reader
        = (DefaultDocReader*)CERTIFY(
              SegReader_Obtain(reader, VTable_Get_Name(DOCREADER)),
              DEFAULTDOCREADER);
    // Allocate the scratch buffer only after CERTIFY has succeeded, so a
    // failed certification cannot leak it.
    ByteBuf *const buffer = BB_new(0);

    for (int32_t i = 1; i <= doc_max; i++) {
        // Skip deleted docs; with no map, nothing is skipped.
        if (doc_map && !I32Arr_Get(doc_map, i)) {
            continue;
        }

        int64_t start = OutStream_Tell(dat_out);

        // Copy record over.
        DefDocReader_Read_Record(doc_reader, buffer, i);
        char   *buf  = BB_Get_Buf(buffer);
        size_t  size = BB_Get_Size(buffer);
        OutStream_Write_Bytes(dat_out, buf, size);

        // Write file pointer.
        OutStream_Write_I64(ix_out, start);
    }

    DECREF(buffer);
}
// Serialize an Inversion into a new term-vector ByteBuf.
//
// The first C32_MAX_BYTES bytes hold the padded posting count.  Each token
// cluster then contributes: the overlap with the previous token's text, the
// length of the differing tail, the tail bytes themselves, the number of
// positions, and a (position, start offset, end offset) triple for each
// token in the cluster -- all numbers C32-encoded.
//
// Returns the new ByteBuf.
ByteBuf*
HLWriter_TV_Buf_IMP(HighlightWriter *self, Inversion *inversion) {
    const char *last_text    = "";
    size_t      last_len     = 0;
    ByteBuf    *tv_buf       = BB_new(20 + Inversion_Get_Size(inversion) * 8);
    uint32_t    num_postings = 0;
    Token     **cluster;
    uint32_t    freq;
    UNUSED_VAR(self);

    // Leave space for a c32 indicating the number of postings.
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    while ((cluster = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        Token        *token      = *cluster;
        char *const   token_text = Token_Get_Text(token);
        const int32_t token_len  = Token_Get_Len(token);
        int32_t       overlap    = StrHelp_overlap(last_text, token_text,
                                                   last_len, token_len);
        // Number of bytes by which this token differs from the previous one.
        const int32_t diff_len   = token_len - overlap;
        size_t        old_size   = BB_Get_Size(tv_buf);
        size_t        new_size   = old_size
                                   + C32_MAX_BYTES                  // overlap
                                   + C32_MAX_BYTES                  // length of string diff
                                   + diff_len                       // diff char data
                                   + C32_MAX_BYTES                  // num prox
                                   + (C32_MAX_BYTES * freq * 3);    // pos data

        // Allocate for worst-case scenario, then write from the old end.
        char *base   = BB_Grow(tv_buf, new_size);
        char *cursor = base + old_size;

        // Track number of postings.
        num_postings++;

        // Append the string diff to the tv_buf.
        NumUtil_encode_c32(overlap, &cursor);
        NumUtil_encode_c32(diff_len, &cursor);
        memcpy(cursor, (token_text + overlap), diff_len);
        cursor += diff_len;

        // Save text and text_len for comparison next loop.
        last_text = token_text;
        last_len  = token_len;

        // Append the number of positions for this term.
        NumUtil_encode_c32(freq, &cursor);

        // Add position, start_offset, and end_offset for each token in the
        // cluster.  The first token is always written; the loop ends when
        // the count runs out or the next cluster entry is NULL.
        for (;;) {
            NumUtil_encode_c32(Token_Get_Pos(token), &cursor);
            NumUtil_encode_c32(Token_Get_Start_Offset(token), &cursor);
            NumUtil_encode_c32(Token_Get_End_Offset(token), &cursor);
            if (--freq == 0) {
                break;
            }
            token = *++cluster;
            if (token == NULL) {
                break;
            }
        }

        // Set new byte length.
        BB_Set_Size(tv_buf, cursor - base);
    }

    // Go back and start the term vector string with the posting count.
    char *dest = BB_Get_Buf(tv_buf);
    NumUtil_encode_padded_c32(num_postings, &dest);

    return tv_buf;
}
// Write the stored fields of one inverted document.
//
// The record in the data stream is: a C32 count of stored fields, then for
// each stored field its serialized name followed by its value in a
// type-specific encoding.  Afterwards the record's starting offset is
// appended to the index stream.
//
// Throws if `doc_id` is not the id implied by the index stream's current
// position (position / 8), or if a stored field has an unrecognized
// primitive type.
void
DocWriter_Add_Inverted_Doc_IMP(DocWriter *self, Inverter *inverter,
                               int32_t doc_id) {
    DocWriterIVARS *const ivars = DocWriter_IVARS(self);
    OutStream *dat_out     = S_lazy_init(self);
    OutStream *ix_out      = ivars->ix_out;
    uint32_t   stored_count = 0;
    int64_t    start       = OutStream_Tell(dat_out);
    int64_t    expected    = OutStream_Tell(ix_out) / 8;

    // Verify doc id.
    if (doc_id != expected) {
        THROW(ERR, "Expected doc id %i64 but got %i32", expected, doc_id);
    }

    // First pass: count the stored fields and write that count.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *counted_type = Inverter_Get_Type(inverter);
        if (FType_Stored(counted_type)) {
            stored_count++;
        }
    }
    OutStream_Write_C32(dat_out, stored_count);

    // Second pass: write name and value of each stored field.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);

        // Only store fields marked as "stored".
        if (!FType_Stored(type)) {
            continue;
        }

        String *field = Inverter_Get_Field_Name(inverter);
        Obj    *value = Inverter_Get_Value(inverter);
        Freezer_serialize_string(field, dat_out);

        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT: {
                // Length-prefixed string bytes.
                String     *text     = (String*)value;
                const char *text_ptr = Str_Get_Ptr8(text);
                size_t      text_len = Str_Get_Size(text);
                OutStream_Write_C32(dat_out, text_len);
                OutStream_Write_Bytes(dat_out, text_ptr, text_len);
                break;
            }
            case FType_BLOB: {
                // Length-prefixed raw bytes.
                ByteBuf *blob     = (ByteBuf*)value;
                char    *blob_ptr = BB_Get_Buf(blob);
                size_t   blob_len = BB_Get_Size(blob);
                OutStream_Write_C32(dat_out, blob_len);
                OutStream_Write_Bytes(dat_out, blob_ptr, blob_len);
                break;
            }
            case FType_INT32: {
                int32_t i32 = Int32_Get_Value((Integer32*)value);
                OutStream_Write_C32(dat_out, i32);
                break;
            }
            case FType_INT64: {
                int64_t i64 = Int64_Get_Value((Integer64*)value);
                OutStream_Write_C64(dat_out, i64);
                break;
            }
            case FType_FLOAT32: {
                float f32 = Float32_Get_Value((Float32*)value);
                OutStream_Write_F32(dat_out, f32);
                break;
            }
            case FType_FLOAT64: {
                double f64 = Float64_Get_Value((Float64*)value);
                OutStream_Write_F64(dat_out, f64);
                break;
            }
            default:
                THROW(ERR, "Unrecognized type: %o", type);
        }
    }

    // Write file pointer.
    OutStream_Write_I64(ix_out, start);
}
/* Serialize an Inversion into a new term-vector ByteBuf.
 *
 * The first C32_MAX_BYTES bytes hold the padded posting count.  Each token
 * cluster then contributes: the overlap with the previous token's text, the
 * length of the differing tail, the tail bytes themselves, the number of
 * positions, and a (position, start offset, end offset) triple for each
 * token in the cluster -- all numbers C32-encoded.
 *
 * Returns the new ByteBuf.
 */
ByteBuf*
HLWriter_tv_buf(HighlightWriter *self, Inversion *inversion) {
    char     *last_text    = "";
    size_t    last_len     = 0;
    ByteBuf  *tv_buf       = BB_new(20 + inversion->size * 8); /* generous */
    u32_t     num_postings = 0;
    char     *dest;
    Token   **cluster;
    u32_t     freq;
    UNUSED_VAR(self);

    /* Leave space for a c32 indicating the number of postings. */
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    while ((cluster = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        Token  *token   = *cluster;
        i32_t   overlap = StrHelp_string_diff(last_text, token->text,
                                              last_len, token->len);
        char   *cursor;
        size_t  new_size = BB_Get_Size(tv_buf)
                           + C32_MAX_BYTES                /* overlap */
                           + C32_MAX_BYTES                /* length of string diff */
                           + (token->len - overlap)       /* diff char data */
                           + C32_MAX_BYTES                /* num prox */
                           + (C32_MAX_BYTES * freq * 3);  /* pos data */

        /* Allocate for worst-case scenario, then write from the current
         * end of the buffer. */
        BB_Grow(tv_buf, new_size);
        cursor = BBEND(tv_buf);

        /* Track number of postings. */
        num_postings++;

        /* Append the string diff to the tv_buf. */
        Math_encode_c32(overlap, &cursor);
        Math_encode_c32((token->len - overlap), &cursor);
        memcpy(cursor, (token->text + overlap), (token->len - overlap));
        cursor += token->len - overlap;

        /* Save text and text_len for comparison next loop. */
        last_text = token->text;
        last_len  = token->len;

        /* Append the number of positions for this term. */
        Math_encode_c32(freq, &cursor);

        /* Add position, start_offset, and end_offset for each token in the
         * cluster.  The first token is always written; the loop ends when
         * the count runs out or the next cluster entry is NULL. */
        for (;;) {
            Math_encode_c32(token->pos, &cursor);
            Math_encode_c32(token->start_offset, &cursor);
            Math_encode_c32(token->end_offset, &cursor);
            if (--freq == 0) {
                break;
            }
            token = *++cluster;
            if (token == NULL) {
                break;
            }
        }

        /* Set new byte length. */
        BB_Set_Size(tv_buf, cursor - tv_buf->ptr);
    }

    /* Go back and start the term vector string with the number of
     * postings. */
    dest = tv_buf->ptr;
    Math_encode_padded_c32(num_postings, &dest);

    return tv_buf;
}