/* Round-trip a Segment through Seg_Write_File / Seg_Read_File using an
 * in-memory folder, then check the count, a field number, and a metadata
 * entry on the segment that was read back.
 */
static void
test_Write_File_and_Read_File(TestBatch *batch) {
    RAMFolder *ram_folder = RAMFolder_new(NULL);
    /* Both segments are built with the same Seg_new() argument. */
    Segment   *original   = Seg_new(100);
    Segment   *loaded     = Seg_new(100);
    CharBuf   *fetched;
    CharBuf   *flotsam    = (CharBuf*)ZCB_WRAP_STR("flotsam", 7);
    CharBuf   *jetsam     = (CharBuf*)ZCB_WRAP_STR("jetsam", 6);

    /* Populate the segment which will be written out. */
    Seg_Set_Count(original, 111);
    Seg_Store_Metadata_Str(original, "foo", 3, (Obj*)CB_newf("bar"));
    Seg_Add_Field(original, flotsam);
    Seg_Add_Field(original, jetsam);

    /* Write into a directory named after the segment, then read back. */
    RAMFolder_MkDir(ram_folder, Seg_Get_Name(original));
    Seg_Write_File(original, (Folder*)ram_folder);
    Seg_Read_File(loaded, (Folder*)ram_folder);

    TEST_TRUE(batch, Seg_Get_Count(loaded) == Seg_Get_Count(original),
              "Round-trip count through file");
    TEST_TRUE(batch,
              Seg_Field_Num(loaded, jetsam)
              == Seg_Field_Num(original, jetsam),
              "Round trip field names through file");

    fetched = (CharBuf*)Seg_Fetch_Metadata_Str(loaded, "foo", 3);
    TEST_TRUE(batch,
              fetched
              && CB_Is_A(fetched, CHARBUF)
              && CB_Equals_Str(fetched, "bar", 3),
              "Round trip metadata through file");

    DECREF(loaded);
    DECREF(original);
    DECREF(ram_folder);
}
// Exercise Segment's field bookkeeping: Seg_Add_Field, Seg_Field_Name and
// Seg_Field_Num, for both known and unknown fields.
static void
test_fields(TestBatchRunner *runner) {
    Segment     *seg = Seg_new(1);
    StackString *foo = SSTR_WRAP_UTF8("foo", 3);
    StackString *bar = SSTR_WRAP_UTF8("bar", 3);
    StackString *baz = SSTR_WRAP_UTF8("baz", 3);

    int32_t first = Seg_Add_Field(seg, (String*)foo);
    TEST_TRUE(runner, first == 1,
              "Add_Field returns field number, and field numbers start at 1");

    int32_t second = Seg_Add_Field(seg, (String*)bar);
    TEST_TRUE(runner, second == 2, "add a second field");

    // Adding "foo" again must not allocate a new number.
    int32_t repeat = Seg_Add_Field(seg, (String*)foo);
    TEST_TRUE(runner, repeat == 1,
              "Add_Field returns existing field number if field is already known");

    TEST_TRUE(runner, SStr_Equals(bar, (Obj*)Seg_Field_Name(seg, 2)),
              "Field_Name");
    TEST_TRUE(runner, Seg_Field_Name(seg, 3) == NULL,
              "Field_Name returns NULL for unknown field number");

    // "baz" is never added, so it serves as the unknown field name.
    TEST_TRUE(runner, Seg_Field_Num(seg, (String*)bar) == 2,
              "Field_Num");
    TEST_TRUE(runner, Seg_Field_Num(seg, (String*)baz) == 0,
              "Field_Num returns 0 for unknown field name");

    DECREF(seg);
}
// Round-trip a Segment through Seg_Write_File / Seg_Read_File using an
// in-memory folder, then check the count, a field number, and a metadata
// entry on the segment that was read back.
static void
test_Write_File_and_Read_File(TestBatchRunner *runner) {
    RAMFolder *ram_folder = RAMFolder_new(NULL);
    // Both segments are built with the same Seg_new() argument.
    Segment   *original   = Seg_new(100);
    Segment   *loaded     = Seg_new(100);
    String    *flotsam    = (String*)SSTR_WRAP_UTF8("flotsam", 7);
    String    *jetsam     = (String*)SSTR_WRAP_UTF8("jetsam", 6);

    // Populate the segment which will be written out.
    Seg_Set_Count(original, 111);
    Seg_Store_Metadata_Utf8(original, "foo", 3, (Obj*)Str_newf("bar"));
    Seg_Add_Field(original, flotsam);
    Seg_Add_Field(original, jetsam);

    // Write into a directory named after the segment, then read back.
    RAMFolder_MkDir(ram_folder, Seg_Get_Name(original));
    Seg_Write_File(original, (Folder*)ram_folder);
    Seg_Read_File(loaded, (Folder*)ram_folder);

    TEST_TRUE(runner, Seg_Get_Count(loaded) == Seg_Get_Count(original),
              "Round-trip count through file");
    TEST_TRUE(runner,
              Seg_Field_Num(loaded, jetsam)
              == Seg_Field_Num(original, jetsam),
              "Round trip field names through file");

    String *fetched = (String*)Seg_Fetch_Metadata_Utf8(loaded, "foo", 3);
    TEST_TRUE(runner,
              fetched
              && Str_Is_A(fetched, STRING)
              && Str_Equals_Utf8(fetched, "bar", 3),
              "Round trip metadata through file");

    DECREF(loaded);
    DECREF(original);
    DECREF(ram_folder);
}
/* Exercise Segment's field bookkeeping: Seg_Add_Field, Seg_Field_Name and
 * Seg_Field_Num, for both known and unknown fields.
 */
static void
test_fields(TestBatch *batch) {
    Segment       *seg = Seg_new(1);
    ZombieCharBuf *foo = ZCB_WRAP_STR("foo", 3);
    ZombieCharBuf *bar = ZCB_WRAP_STR("bar", 3);
    ZombieCharBuf *baz = ZCB_WRAP_STR("baz", 3);
    int32_t        assigned;

    assigned = Seg_Add_Field(seg, (CharBuf*)foo);
    TEST_TRUE(batch, assigned == 1,
              "Add_Field returns field number, and field numbers start at 1");

    assigned = Seg_Add_Field(seg, (CharBuf*)bar);
    TEST_TRUE(batch, assigned == 2, "add a second field");

    /* Adding "foo" again must not allocate a new number. */
    assigned = Seg_Add_Field(seg, (CharBuf*)foo);
    TEST_TRUE(batch, assigned == 1,
              "Add_Field returns existing field number if field is already known");

    TEST_TRUE(batch, ZCB_Equals(bar, (Obj*)Seg_Field_Name(seg, 2)),
              "Field_Name");
    TEST_TRUE(batch, Seg_Field_Name(seg, 3) == NULL,
              "Field_Name returns NULL for unknown field number");

    /* "baz" is never added, so it serves as the unknown field name. */
    TEST_TRUE(batch, Seg_Field_Num(seg, (CharBuf*)bar) == 2,
              "Field_Num");
    TEST_TRUE(batch, Seg_Field_Num(seg, (CharBuf*)baz) == 0,
              "Field_Num returns 0 for unknown field name");

    DECREF(seg);
}
// Fold the sort data of an existing segment into this writer.
//
// For every field in the schema for which `reader`'s SortReader supplies a
// SortCache, the matching SortFieldWriter is obtained via
// S_lazy_init_field_writer() and handed the segment via
// SortFieldWriter_Add_Segment().  The writer's `flush_at_finish` flag is set
// once at least one cache has been added.
//
// @param self    The SortWriter.
// @param reader  Reader for the segment being added.
// @param doc_map Passed through unchanged to SortFieldWriter_Add_Segment().
void
SortWriter_add_segment(SortWriter *self, SegReader *reader,
                       I32Array *doc_map) {
    SortWriterIVARS *const ivars = SortWriter_IVARS(self);
    VArray *fields = Schema_All_Fields(ivars->schema);

    // The SortReader depends only on `reader`, not on the field, so fetch it
    // once rather than once per loop iteration.
    SortReader *sort_reader
        = (SortReader*)SegReader_Fetch(reader, VTable_Get_Name(SORTREADER));

    // Without a SortReader no field can have a cache; nothing to do.
    if (sort_reader) {
        // Proceed field-at-a-time, rather than doc-at-a-time.
        for (uint32_t i = 0, max = VA_Get_Size(fields); i < max; i++) {
            CharBuf   *field = (CharBuf*)VA_Fetch(fields, i);
            SortCache *cache
                = SortReader_Fetch_Sort_Cache(sort_reader, field);
            if (!cache) { continue; }

            int32_t field_num = Seg_Field_Num(ivars->segment, field);
            SortFieldWriter *field_writer
                = S_lazy_init_field_writer(self, field_num);
            SortFieldWriter_Add_Segment(field_writer, reader, doc_map,
                                        cache);
            ivars->flush_at_finish = true;
        }
    }

    DECREF(fields);
}
// Return the InverterEntry for `field`, creating and pooling it on first use.
//
// If the segment has no number for the field yet (Seg_Field_Num() returns 0),
// the field is registered with the segment provided that the schema has a
// type for it; otherwise an "Unknown field name" error is thrown.
static InverterEntry*
S_fetch_entry(InverterIVARS *ivars, CharBuf *field) {
    Schema *const schema = ivars->schema;
    int32_t field_num = Seg_Field_Num(ivars->segment, field);

    if (field_num == 0) {
        // Not in the segment yet -- consult the Schema.
        if (!Schema_Fetch_Type(schema, field)) {
            // We've truly failed to find the field.  The user must
            // not have spec'd it.
            THROW(ERR, "Unknown field name: '%o'", field);
        }
        else {
            // The field is in the Schema.  Get a field num from the Segment.
            field_num = Seg_Add_Field(ivars->segment, field);
        }
    }

    // Entries are pooled by field number.
    InverterEntry *entry
        = (InverterEntry*)VA_Fetch(ivars->entry_pool, field_num);
    if (entry) {
        return entry;
    }

    entry = InvEntry_new(schema, field, field_num);
    VA_Store(ivars->entry_pool, field_num, (Obj*)entry);
    return entry;
}
SortFieldWriter* SortFieldWriter_init(SortFieldWriter *self, Schema *schema, Snapshot *snapshot, Segment *segment, PolyReader *polyreader, String *field, Counter *counter, size_t mem_thresh, OutStream *temp_ord_out, OutStream *temp_ix_out, OutStream *temp_dat_out) { // Init. SortEx_init((SortExternal*)self); SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self); ivars->null_ord = -1; ivars->count = 0; ivars->ord_start = 0; ivars->ord_end = 0; ivars->ix_start = 0; ivars->ix_end = 0; ivars->dat_start = 0; ivars->dat_end = 0; ivars->run_cardinality = -1; ivars->run_max = -1; ivars->sort_cache = NULL; ivars->doc_map = NULL; ivars->sorted_ids = NULL; ivars->run_tick = 1; ivars->ord_width = 0; // Assign. ivars->field = Str_Clone(field); ivars->schema = (Schema*)INCREF(schema); ivars->snapshot = (Snapshot*)INCREF(snapshot); ivars->segment = (Segment*)INCREF(segment); ivars->polyreader = (PolyReader*)INCREF(polyreader); ivars->counter = (Counter*)INCREF(counter); ivars->temp_ord_out = (OutStream*)INCREF(temp_ord_out); ivars->temp_ix_out = (OutStream*)INCREF(temp_ix_out); ivars->temp_dat_out = (OutStream*)INCREF(temp_dat_out); ivars->mem_thresh = mem_thresh; // Derive. ivars->field_num = Seg_Field_Num(segment, field); FieldType *type = (FieldType*)CERTIFY( Schema_Fetch_Type(ivars->schema, field), FIELDTYPE); ivars->type = (FieldType*)INCREF(type); ivars->prim_id = FType_Primitive_ID(type); ivars->mem_per_entry = Class_Get_Obj_Alloc_Size(SFWRITERELEM); if (ivars->prim_id == FType_TEXT) { ivars->mem_per_entry += Class_Get_Obj_Alloc_Size(STRING); ivars->var_width = true; } else if (ivars->prim_id == FType_BLOB) { ivars->mem_per_entry += Class_Get_Obj_Alloc_Size(BLOB); ivars->var_width = true; } else { ivars->mem_per_entry += Class_Get_Obj_Alloc_Size(FLOAT); ivars->var_width = false; } return self; }
LexIndex* LexIndex_init(LexIndex *self, Schema *schema, Folder *folder, Segment *segment, String *field) { int32_t field_num = Seg_Field_Num(segment, field); String *seg_name = Seg_Get_Name(segment); String *ixix_file = Str_newf("%o/lexicon-%i32.ixix", seg_name, field_num); String *ix_file = Str_newf("%o/lexicon-%i32.ix", seg_name, field_num); Architecture *arch = Schema_Get_Architecture(schema); // Init. Lex_init((Lexicon*)self, field); LexIndexIVARS *const ivars = LexIndex_IVARS(self); ivars->tinfo = TInfo_new(0); ivars->tick = 0; // Derive ivars->field_type = Schema_Fetch_Type(schema, field); if (!ivars->field_type) { String *mess = MAKE_MESS("Unknown field: '%o'", field); DECREF(ix_file); DECREF(ixix_file); DECREF(self); Err_throw_mess(ERR, mess); } ivars->field_type = (FieldType*)INCREF(ivars->field_type); ivars->term_stepper = FType_Make_Term_Stepper(ivars->field_type); ivars->ixix_in = Folder_Open_In(folder, ixix_file); if (!ivars->ixix_in) { Err *error = (Err*)INCREF(Err_get_error()); DECREF(ix_file); DECREF(ixix_file); DECREF(self); RETHROW(error); } ivars->ix_in = Folder_Open_In(folder, ix_file); if (!ivars->ix_in) { Err *error = (Err*)INCREF(Err_get_error()); DECREF(ix_file); DECREF(ixix_file); DECREF(self); RETHROW(error); } ivars->index_interval = Arch_Index_Interval(arch); ivars->skip_interval = Arch_Skip_Interval(arch); ivars->size = (int32_t)(InStream_Length(ivars->ixix_in) / sizeof(int64_t)); ivars->offsets = (const int64_t*)InStream_Buf(ivars->ixix_in, (size_t)InStream_Length(ivars->ixix_in)); DECREF(ixix_file); DECREF(ix_file); return self; }
LexIndex* LexIndex_init(LexIndex *self, Schema *schema, Folder *folder, Segment *segment, const CharBuf *field) { i32_t field_num = Seg_Field_Num(segment, field); CharBuf *seg_name = Seg_Get_Name(segment); CharBuf *ixix_file = CB_newf("%o/lexicon-%i32.ixix", seg_name, field_num); CharBuf *ix_file = CB_newf("%o/lexicon-%i32.ix", seg_name, field_num); Architecture *arch = Schema_Get_Architecture(schema); /* Init. */ self->term = ViewCB_new_from_trusted_utf8(NULL, 0); self->tinfo = TInfo_new(0,0,0,0); self->tick = 0; /* Derive */ self->field_type = Schema_Fetch_Type(schema, field); if (!self->field_type) { CharBuf *mess = MAKE_MESS("Unknown field: '%o'", field); DECREF(ix_file); DECREF(ixix_file); DECREF(self); Err_throw_mess(mess); } INCREF(self->field_type); self->ixix_in = Folder_Open_In(folder, ixix_file); self->ix_in = Folder_Open_In(folder, ix_file); if (!self->ixix_in || !self->ix_in) { CharBuf *mess = MAKE_MESS("Can't open either %o or %o", ix_file, ixix_file); DECREF(ix_file); DECREF(ixix_file); DECREF(self); Err_throw_mess(mess); } self->index_interval = Arch_Index_Interval(arch); self->skip_interval = Arch_Skip_Interval(arch); self->size = (i32_t)(InStream_Length(self->ixix_in) / sizeof(i64_t)); self->offsets = (i64_t*)InStream_Buf(self->ixix_in, (size_t)InStream_Length(self->ixix_in)); self->data = InStream_Buf(self->ix_in, InStream_Length(self->ix_in)); self->limit = self->data + InStream_Length(self->ix_in); DECREF(ixix_file); DECREF(ix_file); return self; }
// Build a fresh SegLexicon for `field` and seek it to `term`.
//
// Returns NULL when this reader holds no lexicon at the field's number in
// its `lexicons` array (i.e. the field has no data).
Lexicon*
DefLexReader_Lexicon_IMP(DefaultLexiconReader *self, String *field,
                         Obj *term) {
    DefaultLexiconReaderIVARS *const ivars = DefLexReader_IVARS(self);
    int32_t field_num = Seg_Field_Num(ivars->segment, field);

    // The stored lexicon is only consulted as a "has data" marker.
    if (VA_Fetch(ivars->lexicons, field_num) == NULL) {
        return NULL;
    }

    SegLexicon *fresh = SegLex_new(ivars->schema, ivars->folder,
                                   ivars->segment, field);
    SegLex_Seek(fresh, term);
    return (Lexicon*)fresh;
}
/* Build a fresh SegLexicon for `field` and seek it to `term`.
 *
 * Returns NULL when this reader holds no lexicon at the field's number in
 * its `lexicons` array (i.e. the field has no data).
 */
Lexicon*
DefLexReader_lexicon(DefaultLexiconReader *self, const CharBuf *field,
                     Obj *term) {
    i32_t       field_num = Seg_Field_Num(self->segment, field);
    SegLexicon *fresh;

    /* The stored lexicon is only consulted as a "has data" marker. */
    if (VA_Fetch(self->lexicons, field_num) == NULL) {
        return NULL;
    }

    fresh = SegLex_new(self->schema, self->folder, self->segment, field);
    SegLex_Seek(fresh, term);
    return (Lexicon*)fresh;
}
// Indicate whether it is safe to build a SegLexicon using the given // parameters. Will return false if the field is not indexed or if no terms // are present for this field in this segment. static bool S_has_data(Schema *schema, Folder *folder, Segment *segment, String *field) { FieldType *type = Schema_Fetch_Type(schema, field); if (!type || !FType_Indexed(type)) { // If the field isn't indexed, bail out. return false; } else { // Bail out if there are no terms for this field in this segment. int32_t field_num = Seg_Field_Num(segment, field); String *seg_name = Seg_Get_Name(segment); String *file = Str_newf("%o/lexicon-%i32.dat", seg_name, field_num); bool retval = Folder_Exists(folder, file); DECREF(file); return retval; } }
PostingPool* PostPool_init(PostingPool *self, Schema *schema, Snapshot *snapshot, Segment *segment, PolyReader *polyreader, String *field, LexiconWriter *lex_writer, MemoryPool *mem_pool, OutStream *lex_temp_out, OutStream *post_temp_out, OutStream *skip_out) { // Init. SortEx_init((SortExternal*)self); PostingPoolIVARS *const ivars = PostPool_IVARS(self); ivars->doc_base = 0; ivars->last_doc_id = 0; ivars->doc_map = NULL; ivars->post_count = 0; ivars->lexicon = NULL; ivars->plist = NULL; ivars->lex_temp_in = NULL; ivars->post_temp_in = NULL; ivars->lex_start = INT64_MAX; ivars->post_start = INT64_MAX; ivars->lex_end = 0; ivars->post_end = 0; ivars->skip_stepper = SkipStepper_new(); // Assign. ivars->schema = (Schema*)INCREF(schema); ivars->snapshot = (Snapshot*)INCREF(snapshot); ivars->segment = (Segment*)INCREF(segment); ivars->polyreader = (PolyReader*)INCREF(polyreader); ivars->lex_writer = (LexiconWriter*)INCREF(lex_writer); ivars->mem_pool = (MemoryPool*)INCREF(mem_pool); ivars->field = Str_Clone(field); ivars->lex_temp_out = (OutStream*)INCREF(lex_temp_out); ivars->post_temp_out = (OutStream*)INCREF(post_temp_out); ivars->skip_out = (OutStream*)INCREF(skip_out); // Derive. Similarity *sim = Schema_Fetch_Sim(schema, field); ivars->posting = Sim_Make_Posting(sim); ivars->type = (FieldType*)INCREF(Schema_Fetch_Type(schema, field)); ivars->field_num = Seg_Field_Num(segment, field); return self; }
// Look up the TermInfo for an exact term in a field.
//
// Returns NULL if `field` or `target` is NULL, if this reader holds no
// lexicon for the field, or if the term the lexicon lands on after seeking
// does not equal `target`.
static TermInfo*
S_find_tinfo(DefaultLexiconReader *self, String *field, Obj *target) {
    DefaultLexiconReaderIVARS *const ivars = DefLexReader_IVARS(self);

    if (field == NULL || target == NULL) {
        return NULL;
    }

    int32_t field_num = Seg_Field_Num(ivars->segment, field);
    SegLexicon *lexicon
        = (SegLexicon*)VA_Fetch(ivars->lexicons, field_num);
    if (lexicon == NULL) {
        return NULL;
    }

    // Iterate until the result is ge the term.
    SegLex_Seek(lexicon, target);

    // Only an exact match yields a TermInfo.
    Obj *found = SegLex_Get_Term(lexicon);
    if (found == NULL || !Obj_Equals(target, found)) {
        return NULL;
    }

    return SegLex_Get_Term_Info(lexicon);
}
/* Dedicate this PostingPool to reading one field's lexicon and postings from
 * an existing segment.
 *
 * The pool must be unused: it throws if it has already been assigned a
 * segment, holds cached items (cache_max > 0), or has a non-zero lex_end.
 * If the other segment has a "lexicon-N.dat" file for this pool's field, the
 * lexicon and postings files are opened, their lengths recorded in lex_end /
 * post_end, and doc_base / doc_map stored (doc_map gains a reference when
 * non-NULL).  If the file is absent the pool is simply left empty.
 *
 * The "other content" check runs before any string is allocated, so that
 * throwing from it cannot leak the lexicon file path.
 */
void
PostPool_assign_seg(PostingPool *self, Folder *other_folder,
                    Segment *other_segment, i32_t doc_base,
                    I32Array *doc_map) {
    i32_t    field_num;
    CharBuf *other_seg_name;
    CharBuf *lex_file;

    /* Dedicate pool to this task alone. */
    if (self->from_seg || self->cache_max > 0 || self->lex_end != 0) {
        THROW("Can't Assign_Segment to PostingPool with other content");
    }
    self->from_seg = true;

    field_num      = Seg_Field_Num(other_segment, self->field);
    other_seg_name = Seg_Get_Name(other_segment);
    lex_file       = CB_newf("%o/lexicon-%i32.dat", other_seg_name,
                             field_num);

    /* Prepare to read from existing files. */
    if (Folder_Exists(other_folder, lex_file)) {
        CharBuf *post_file = CB_newf("%o/postings-%i32.dat", other_seg_name,
                                     field_num);

        /* Open lexicon and postings files. */
        self->lex_instream  = Folder_Open_In(other_folder, lex_file);
        self->post_instream = Folder_Open_In(other_folder, post_file);
        /* NOTE(review): the path strings are needed to format these
         * messages, so they are not released before throwing here. */
        if (!self->lex_instream)  { THROW("Can't open %o", lex_file); }
        if (!self->post_instream) { THROW("Can't open %o", post_file); }
        self->lex_end  = InStream_Length(self->lex_instream);
        self->post_end = InStream_Length(self->post_instream);

        /* Assign doc base and doc map. */
        self->doc_base = doc_base;
        self->doc_map  = doc_map ? (I32Array*)INCREF(doc_map) : NULL;

        DECREF(post_file);
    }
    else {
        /* This posting pool will be empty. */
    }

    /* Clean up. */
    DECREF(lex_file);
}
/* Look up the TermInfo for an exact term in a field.
 *
 * Returns NULL if `field` or `target` is NULL, if this reader holds no
 * lexicon for the field, or if the term the lexicon lands on after seeking
 * does not equal `target`.
 */
TermInfo*
DefLexReader_fetch_term_info(DefaultLexiconReader *self,
                             const CharBuf *field, Obj *target) {
    i32_t       field_num;
    SegLexicon *lexicon;
    Obj        *found;

    if (field == NULL || target == NULL) {
        return NULL;
    }

    field_num = Seg_Field_Num(self->segment, field);
    lexicon   = (SegLexicon*)VA_Fetch(self->lexicons, field_num);
    if (lexicon == NULL) {
        return NULL;
    }

    /* Iterate until the result is ge the term. */
    SegLex_Seek(lexicon, target);

    /* Only an exact match yields a TermInfo. */
    found = SegLex_Get_Term(lexicon);
    if (found == NULL || !Obj_Equals(target, found)) {
        return NULL;
    }

    return SegLex_Get_Term_Info(lexicon);
}
// Build the SortCache for `field`, store it in the reader's `caches` hash and
// return it.
//
// Returns NULL when the reader's `counts` hash has no (or a zero) entry for
// the field.  Throws if the field is not sortable, if any of the
// "sort-N.ord" / "sort-N.ix" / "sort-N.dat" files cannot be opened, or if
// there is no SortCache class for the field's primitive type.  The ".ix"
// file is only opened for TEXT and BLOB fields.
static SortCache*
S_lazy_init_sort_cache(DefaultSortReader *self, String *field) {
    DefaultSortReaderIVARS *const ivars = DefSortReader_IVARS(self);

    // See if we have any values.
    Obj *count_obj = Hash_Fetch(ivars->counts, (Obj*)field);
    int32_t count = 0;
    if (count_obj) {
        count = (int32_t)Obj_To_I64(count_obj);
    }
    if (count == 0) {
        return NULL;
    }

    // Get a FieldType and sanity check that the field is sortable.
    Schema    *schema = DefSortReader_Get_Schema(self);
    FieldType *type   = Schema_Fetch_Type(schema, field);
    if (!(type && FType_Sortable(type))) {
        THROW(ERR, "'%o' isn't a sortable field", field);
    }

    // Gather what is needed to locate the files.
    Folder  *folder    = DefSortReader_Get_Folder(self);
    Segment *segment   = DefSortReader_Get_Segment(self);
    String  *seg_name  = Seg_Get_Name(segment);
    int32_t  field_num = Seg_Field_Num(segment, field);
    int8_t   prim_id   = FType_Primitive_ID(type);
    bool     var_width = (prim_id == FType_TEXT || prim_id == FType_BLOB);

    // Ords.
    String   *ord_path = Str_newf("%o/sort-%i32.ord", seg_name, field_num);
    InStream *ord_in   = Folder_Open_In(folder, ord_path);
    DECREF(ord_path);
    if (ord_in == NULL) {
        THROW(ERR, "Error building sort cache for '%o': %o",
              field, Err_get_error());
    }

    // Index file, for variable-width types only.
    InStream *ix_in = NULL;
    if (var_width) {
        String *ix_path = Str_newf("%o/sort-%i32.ix", seg_name, field_num);
        ix_in = Folder_Open_In(folder, ix_path);
        DECREF(ix_path);
        if (ix_in == NULL) {
            THROW(ERR, "Error building sort cache for '%o': %o",
                  field, Err_get_error());
        }
    }

    // Values.
    String   *dat_path = Str_newf("%o/sort-%i32.dat", seg_name, field_num);
    InStream *dat_in   = Folder_Open_In(folder, dat_path);
    DECREF(dat_path);
    if (dat_in == NULL) {
        THROW(ERR, "Error building sort cache for '%o': %o",
              field, Err_get_error());
    }

    // Null ord defaults to -1 when none was recorded.
    Obj *null_ord_obj = Hash_Fetch(ivars->null_ords, (Obj*)field);
    int32_t null_ord = -1;
    if (null_ord_obj) {
        null_ord = (int32_t)Obj_To_I64(null_ord_obj);
    }

    // Ord width is calculated from the count when none was recorded.
    Obj *ord_width_obj = Hash_Fetch(ivars->ord_widths, (Obj*)field);
    int32_t ord_width;
    if (ord_width_obj) {
        ord_width = (int32_t)Obj_To_I64(ord_width_obj);
    }
    else {
        ord_width = S_calc_ord_width(count);
    }

    int32_t doc_max = (int32_t)Seg_Get_Count(segment);

    // Pick the SortCache class matching the primitive type.
    SortCache *cache = NULL;
    switch (prim_id & FType_PRIMITIVE_ID_MASK) {
        case FType_TEXT:
            cache = (SortCache*)TextSortCache_new(field, type, count,
                                                  doc_max, null_ord,
                                                  ord_width, ord_in,
                                                  ix_in, dat_in);
            break;
        case FType_INT32:
            cache = (SortCache*)I32SortCache_new(field, type, count,
                                                 doc_max, null_ord,
                                                 ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_INT64:
            cache = (SortCache*)I64SortCache_new(field, type, count,
                                                 doc_max, null_ord,
                                                 ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_FLOAT32:
            cache = (SortCache*)F32SortCache_new(field, type, count,
                                                 doc_max, null_ord,
                                                 ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_FLOAT64:
            cache = (SortCache*)F64SortCache_new(field, type, count,
                                                 doc_max, null_ord,
                                                 ord_width, ord_in,
                                                 dat_in);
            break;
        default:
            THROW(ERR, "No SortCache class for %o", type);
    }
    Hash_Store(ivars->caches, (Obj*)field, (Obj*)cache);

    if (ivars->format == 2) { // bug compatibility
        SortCache_Set_Native_Ords(cache, true);
    }

    // Drop our own references to the streams.
    DECREF(ord_in);
    DECREF(ix_in);
    DECREF(dat_in);

    return cache;
}