// Verify that a Segment's state (doc count, metadata, field roster)
// survives a Write_File / Read_File round trip through a RAMFolder.
static void
test_Write_File_and_Read_File(TestBatch *batch) {
    RAMFolder *folder    = RAMFolder_new(NULL);
    Segment   *segment   = Seg_new(100);
    Segment   *read_back = Seg_new(100);
    CharBuf   *flotsam   = (CharBuf*)ZCB_WRAP_STR("flotsam", 7);
    CharBuf   *jetsam    = (CharBuf*)ZCB_WRAP_STR("jetsam", 6);

    // Populate the source segment: a count, one metadata entry, two fields.
    Seg_Set_Count(segment, 111);
    Seg_Store_Metadata_Str(segment, "foo", 3, (Obj*)CB_newf("bar"));
    Seg_Add_Field(segment, flotsam);
    Seg_Add_Field(segment, jetsam);

    // Serialize, then deserialize into the fresh Segment.
    RAMFolder_MkDir(folder, Seg_Get_Name(segment));
    Seg_Write_File(segment, (Folder*)folder);
    Seg_Read_File(read_back, (Folder*)folder);

    TEST_TRUE(batch, Seg_Get_Count(read_back) == Seg_Get_Count(segment),
              "Round-trip count through file");
    TEST_TRUE(batch,
              Seg_Field_Num(read_back, jetsam) == Seg_Field_Num(segment, jetsam),
              "Round trip field names through file");
    CharBuf *meta = (CharBuf*)Seg_Fetch_Metadata_Str(read_back, "foo", 3);
    TEST_TRUE(batch,
              meta
              && CB_Is_A(meta, CHARBUF)
              && CB_Equals_Str(meta, "bar", 3),
              "Round trip metadata through file");

    DECREF(read_back);
    DECREF(segment);
    DECREF(folder);
}
// Verify that a Segment's state (doc count, metadata, field roster)
// survives a Write_File / Read_File round trip through a RAMFolder.
static void
test_Write_File_and_Read_File(TestBatchRunner *runner) {
    RAMFolder *folder    = RAMFolder_new(NULL);
    Segment   *segment   = Seg_new(100);
    Segment   *read_back = Seg_new(100);
    String    *flotsam   = (String*)SSTR_WRAP_UTF8("flotsam", 7);
    String    *jetsam    = (String*)SSTR_WRAP_UTF8("jetsam", 6);

    // Populate the source segment: a count, one metadata entry, two fields.
    Seg_Set_Count(segment, 111);
    Seg_Store_Metadata_Utf8(segment, "foo", 3, (Obj*)Str_newf("bar"));
    Seg_Add_Field(segment, flotsam);
    Seg_Add_Field(segment, jetsam);

    // Serialize, then deserialize into the fresh Segment.
    RAMFolder_MkDir(folder, Seg_Get_Name(segment));
    Seg_Write_File(segment, (Folder*)folder);
    Seg_Read_File(read_back, (Folder*)folder);

    TEST_TRUE(runner, Seg_Get_Count(read_back) == Seg_Get_Count(segment),
              "Round-trip count through file");
    TEST_TRUE(runner,
              Seg_Field_Num(read_back, jetsam) == Seg_Field_Num(segment, jetsam),
              "Round trip field names through file");
    String *meta = (String*)Seg_Fetch_Metadata_Utf8(read_back, "foo", 3);
    TEST_TRUE(runner,
              meta
              && Str_Is_A(meta, STRING)
              && Str_Equals_Utf8(meta, "bar", 3),
              "Round trip metadata through file");

    DECREF(read_back);
    DECREF(segment);
    DECREF(folder);
}
// Exercise the Segment doc-count accessors: initial value, Set_Count,
// and the return value of Increment_Count.
static void
test_count(TestBatchRunner *runner) {
    Segment *segment = Seg_new(100);

    // A fresh Segment holds no documents.
    int64_t initial = Seg_Get_Count(segment);
    TEST_TRUE(runner, initial == 0, "count starts off at 0");

    // Set_Count overwrites the stored count outright.
    Seg_Set_Count(segment, 120);
    int64_t after_set = Seg_Get_Count(segment);
    TEST_TRUE(runner, after_set == 120, "Set_Count");

    // Increment_Count adds to the count and returns the new total.
    int64_t after_inc = Seg_Increment_Count(segment, 10);
    TEST_TRUE(runner, after_inc == 130, "Increment_Count");

    DECREF(segment);
}
// Initialize a SegReader: chain to the IndexReader initializer, cache
// per-segment stats (doc_max, name, number), then spin up sub-components.
// On component failure, destroys self and rethrows the trapped error, so
// callers must not touch self after an exception.
SegReader*
SegReader_init(SegReader *self, Schema *schema, Folder *folder,
               Snapshot *snapshot, Vector *segments, int32_t seg_tick) {
    Segment *segment;
    IxReader_init((IndexReader*)self, schema, folder, snapshot, segments,
                  seg_tick, NULL);
    SegReaderIVARS *const ivars = SegReader_IVARS(self);
    segment = SegReader_Get_Segment(self);
    // Cache segment stats in instance vars.  The name is INCREF'd because
    // the Segment owns the original.
    ivars->doc_max  = (int32_t)Seg_Get_Count(segment);
    ivars->seg_name = (String*)INCREF(Seg_Get_Name(segment));
    ivars->seg_num  = Seg_Get_Number(segment);
    // Trap exceptions thrown while opening sub-readers so that we can
    // release self before propagating.
    Err *error = Err_trap(S_try_init_components, self);
    if (error) {
        // An error occurred, so clean up self and rethrow the exception.
        DECREF(self);
        RETHROW(error);
    }
    // Cache the deletion count; a missing DeletionsReader means no deletions.
    DeletionsReader *del_reader = (DeletionsReader*)Hash_Fetch(
        ivars->components, Class_Get_Name(DELETIONSREADER));
    ivars->del_count = del_reader ? DelReader_Del_Count(del_reader) : 0;
    return self;
}
// First phase of a two-phase commit: merge eligible segments, finish the
// new segment, write an updated schema file, and write a *temporary*
// snapshot file (".temp" suffix) for Commit() to rename later.  Throws if
// called twice or without the write lock.
void
Indexer_prepare_commit(Indexer *self) {
    VArray *seg_readers = PolyReader_Get_Seg_Readers(self->polyreader);
    uint32_t num_seg_readers = VA_Get_Size(seg_readers);
    bool_t merge_happened = false;

    if ( !self->write_lock || self->prepared ) {
        THROW(ERR, "Can't call Prepare_Commit() more than once");
    }

    // Merge existing index data.
    if (num_seg_readers) {
        merge_happened = S_maybe_merge(self, seg_readers);
    }

    // Add a new segment and write a new snapshot file if...
    if ( Seg_Get_Count(self->segment)            // Docs/segs added.
         || merge_happened                       // Some segs merged.
         || !Snapshot_Num_Entries(self->snapshot) // Initializing index.
         || DelWriter_Updated(self->del_writer)
       ) {
        Folder *folder     = self->folder;
        Schema *schema     = self->schema;
        Snapshot *snapshot = self->snapshot;
        CharBuf *old_schema_name = S_find_schema_file(snapshot);
        // Schema generation bumps past the old schema file's, or starts at 1.
        uint64_t schema_gen = old_schema_name
                              ? IxFileNames_extract_gen(old_schema_name) + 1
                              : 1;
        char base36[StrHelp_MAX_BASE36_BYTES];
        CharBuf *new_schema_name;
        StrHelp_to_base36(schema_gen, &base36);
        new_schema_name = CB_newf("schema_%s.json", base36);

        // Finish the segment, write schema file.
        SegWriter_Finish(self->seg_writer);
        Schema_Write(schema, folder, new_schema_name);
        if (old_schema_name) {
            Snapshot_Delete_Entry(snapshot, old_schema_name);
        }
        Snapshot_Add_Entry(snapshot, new_schema_name);
        DECREF(new_schema_name);

        // Write temporary snapshot file.  The ".temp" suffix marks it as
        // uncommitted until Commit() renames it.
        DECREF(self->snapfile);
        self->snapfile = IxManager_Make_Snapshot_Filename(self->manager);
        CB_Cat_Trusted_Str(self->snapfile, ".temp", 5);
        Folder_Delete(folder, self->snapfile);
        Snapshot_Write_File(snapshot, folder, self->snapfile);

        self->needs_commit = true;
    }

    // Close reader, so that we can delete its files if appropriate.
    PolyReader_Close(self->polyreader);

    self->prepared = true;
}
// Absorb another index into this one.  Accepts either a Folder or a
// CharBuf path (opened as an FSFolder); throws on any other type or when
// the index holds no data.  Field definitions are merged into the current
// Schema/Segment, then every source segment is appended with a doc-id map
// that squeezes out its deletions.
void
Indexer_add_index(Indexer *self, Obj *index) {
    Folder *other_folder = NULL;
    IndexReader *reader  = NULL;

    if (Obj_Is_A(index, FOLDER)) {
        other_folder = (Folder*)INCREF(index);
    }
    else if (Obj_Is_A(index, CHARBUF)) {
        other_folder = (Folder*)FSFolder_new((CharBuf*)index);
    }
    else {
        THROW(ERR, "Invalid type for 'index': %o", Obj_Get_Class_Name(index));
    }

    reader = IxReader_open((Obj*)other_folder, NULL, NULL);
    if (reader == NULL) {
        // NOTE(review): THROW is non-local, so other_folder is not DECREF'd
        // on this path — presumably acceptable since the process is aborting
        // the operation anyway; confirm against the project's error policy.
        THROW(ERR, "Index doesn't seem to contain any data");
    }
    else {
        Schema *schema = self->schema;
        Schema *other_schema = IxReader_Get_Schema(reader);
        VArray *other_fields = Schema_All_Fields(other_schema);
        VArray *seg_readers = IxReader_Seg_Readers(reader);
        uint32_t i, max;

        // Validate schema compatibility and add fields.
        Schema_Eat(schema, other_schema);

        // Add fields to Segment.
        for (i = 0, max = VA_Get_Size(other_fields); i < max; i++) {
            CharBuf *other_field = (CharBuf*)VA_Fetch(other_fields, i);
            Seg_Add_Field(self->segment, other_field);
        }
        DECREF(other_fields);

        // Add all segments.
        for (i = 0, max = VA_Get_Size(seg_readers); i < max; i++) {
            SegReader *seg_reader = (SegReader*)VA_Fetch(seg_readers, i);
            DeletionsReader *del_reader
                = (DeletionsReader*)SegReader_Fetch(
                      seg_reader, VTable_Get_Name(DELETIONSREADER));
            // A missing DeletionsReader means the segment has no deletions.
            Matcher *deletions = del_reader
                                 ? DelReader_Iterator(del_reader)
                                 : NULL;
            // Remap doc ids past deletions, offset by our current doc count.
            I32Array *doc_map = DelWriter_Generate_Doc_Map(self->del_writer,
                                                           deletions,
                                                           SegReader_Doc_Max(seg_reader),
                                                           (int32_t)Seg_Get_Count(self->segment)
                                                          );
            SegWriter_Add_Segment(self->seg_writer, seg_reader, doc_map);
            DECREF(deletions);
            DECREF(doc_map);
        }
        DECREF(seg_readers);
    }

    DECREF(reader);
    DECREF(other_folder);
}
// Absorb another index into this one.  Accepts either a Folder or a
// String path (opened as an FSFolder); throws on any other type or when
// the index holds no data.  Field definitions are merged into the current
// Schema/Segment, then every source segment is appended with a doc-id map
// that squeezes out its deletions.
void
Indexer_Add_Index_IMP(Indexer *self, Obj *index) {
    IndexerIVARS *const ivars = Indexer_IVARS(self);
    Folder *other_folder = NULL;
    IndexReader *reader  = NULL;

    if (Obj_is_a(index, FOLDER)) {
        other_folder = (Folder*)INCREF(index);
    }
    else if (Obj_is_a(index, STRING)) {
        other_folder = (Folder*)FSFolder_new((String*)index);
    }
    else {
        THROW(ERR, "Invalid type for 'index': %o", Obj_get_class_name(index));
    }

    reader = IxReader_open((Obj*)other_folder, NULL, NULL);
    if (reader == NULL) {
        // NOTE(review): THROW is non-local, so other_folder is not DECREF'd
        // on this path — presumably acceptable since the process is aborting
        // the operation anyway; confirm against the project's error policy.
        THROW(ERR, "Index doesn't seem to contain any data");
    }
    else {
        Schema *schema = ivars->schema;
        Schema *other_schema = IxReader_Get_Schema(reader);
        Vector *other_fields = Schema_All_Fields(other_schema);
        Vector *seg_readers = IxReader_Seg_Readers(reader);

        // Validate schema compatibility and add fields.
        Schema_Eat(schema, other_schema);

        // Add fields to Segment.
        for (size_t i = 0, max = Vec_Get_Size(other_fields); i < max; i++) {
            String *other_field = (String*)Vec_Fetch(other_fields, i);
            Seg_Add_Field(ivars->segment, other_field);
        }
        DECREF(other_fields);

        // Add all segments.
        for (size_t i = 0, max = Vec_Get_Size(seg_readers); i < max; i++) {
            SegReader *seg_reader = (SegReader*)Vec_Fetch(seg_readers, i);
            DeletionsReader *del_reader
                = (DeletionsReader*)SegReader_Fetch(
                      seg_reader, Class_Get_Name(DELETIONSREADER));
            // A missing DeletionsReader means the segment has no deletions.
            Matcher *deletions = del_reader
                                 ? DelReader_Iterator(del_reader)
                                 : NULL;
            // Remap doc ids past deletions, offset by our current doc count.
            I32Array *doc_map = DelWriter_Generate_Doc_Map(
                                    ivars->del_writer, deletions,
                                    SegReader_Doc_Max(seg_reader),
                                    (int32_t)Seg_Get_Count(ivars->segment));
            SegWriter_Add_Segment(ivars->seg_writer, seg_reader, doc_map);
            DECREF(deletions);
            DECREF(doc_map);
        }
        DECREF(seg_readers);
    }

    DECREF(reader);
    DECREF(other_folder);
}
// First phase of a two-phase commit: merge eligible segments, finish the
// new segment, write an updated schema file, and write a *temporary*
// snapshot file (".temp" suffix) for Commit() to rename later.  Throws if
// called twice or without the write lock.
void
Indexer_Prepare_Commit_IMP(Indexer *self) {
    IndexerIVARS *const ivars = Indexer_IVARS(self);
    Vector *seg_readers = PolyReader_Get_Seg_Readers(ivars->polyreader);
    size_t num_seg_readers = Vec_Get_Size(seg_readers);
    bool merge_happened = false;

    if (!ivars->write_lock || ivars->prepared) {
        THROW(ERR, "Can't call Prepare_Commit() more than once");
    }

    // Merge existing index data.
    if (num_seg_readers) {
        merge_happened = S_maybe_merge(self, seg_readers);
    }

    // Add a new segment and write a new snapshot file if...
    if (Seg_Get_Count(ivars->segment)             // Docs/segs added.
        || merge_happened                         // Some segs merged.
        || !Snapshot_Num_Entries(ivars->snapshot) // Initializing index.
        || DelWriter_Updated(ivars->del_writer)
       ) {
        Folder   *folder   = ivars->folder;
        Schema   *schema   = ivars->schema;
        Snapshot *snapshot = ivars->snapshot;

        // Derive snapshot and schema file names.  The schema generation is
        // extracted from the new snapshot filename itself.
        DECREF(ivars->snapfile);
        String *snapfile = IxManager_Make_Snapshot_Filename(ivars->manager);
        ivars->snapfile = Str_Cat_Trusted_Utf8(snapfile, ".temp", 5);
        DECREF(snapfile);
        uint64_t schema_gen = IxFileNames_extract_gen(ivars->snapfile);
        char base36[StrHelp_MAX_BASE36_BYTES];
        StrHelp_to_base36(schema_gen, &base36);
        String *new_schema_name = Str_newf("schema_%s.json", base36);

        // Finish the segment, write schema file.
        SegWriter_Finish(ivars->seg_writer);
        Schema_Write(schema, folder, new_schema_name);
        // Swap the old schema entry for the new one in the snapshot.
        String *old_schema_name = S_find_schema_file(snapshot);
        if (old_schema_name) {
            Snapshot_Delete_Entry(snapshot, old_schema_name);
        }
        Snapshot_Add_Entry(snapshot, new_schema_name);
        DECREF(new_schema_name);

        // Write temporary snapshot file.  The ".temp" suffix marks it as
        // uncommitted until Commit() renames it.
        Folder_Delete(folder, ivars->snapfile);
        Snapshot_Write_File(snapshot, folder, ivars->snapfile);

        ivars->needs_commit = true;
    }

    // Close reader, so that we can delete its files if appropriate.
    PolyReader_Close(ivars->polyreader);

    ivars->prepared = true;
}
// Flush the in-memory buffer of sort-field values to the temporary
// ord/ix/dat streams as a new sorted "run".  The run object temporarily
// *borrows* this writer's buffer (no copy) while S_write_files drains it,
// then the borrow is revoked before the buffer is cleared — the ordering
// of those steps is load-bearing.
void
SortFieldWriter_Flush_IMP(SortFieldWriter *self) {
    SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self);

    // Don't add a run unless we have data to put in it.
    if (SortFieldWriter_Buffer_Count(self) == 0) { return; }

    OutStream *const temp_ord_out = ivars->temp_ord_out;
    OutStream *const temp_ix_out  = ivars->temp_ix_out;
    OutStream *const temp_dat_out = ivars->temp_dat_out;

    SortFieldWriter_Sort_Buffer(self);
    SortFieldWriter *run
        = SortFieldWriter_new(ivars->schema, ivars->snapshot, ivars->segment,
                              ivars->polyreader, ivars->field, ivars->counter,
                              ivars->mem_thresh, NULL, NULL, NULL);
    SortFieldWriterIVARS *const run_ivars = SortFieldWriter_IVARS(run);

    // Record stream starts and align.
    run_ivars->ord_start = OutStream_Align(temp_ord_out, sizeof(int64_t));
    if (ivars->var_width) {
        run_ivars->ix_start = OutStream_Align(temp_ix_out, sizeof(int64_t));
    }
    run_ivars->dat_start = OutStream_Align(temp_dat_out, sizeof(int64_t));

    // Have the run borrow the array of elems.
    run_ivars->buffer   = ivars->buffer;
    run_ivars->buf_max  = ivars->buf_max;
    run_ivars->buf_tick = ivars->buf_tick;
    run_ivars->buf_cap  = ivars->buf_cap;

    // Write files, record stats.
    run_ivars->run_max = (int32_t)Seg_Get_Count(ivars->segment);
    run_ivars->run_cardinality = S_write_files(run, temp_ord_out, temp_ix_out,
                                               temp_dat_out);

    // Reclaim the buffer from the run and empty it.  Zeroing the run's
    // fields prevents it from freeing the shared storage.
    run_ivars->buffer   = NULL;
    run_ivars->buf_max  = 0;
    run_ivars->buf_tick = 0;
    run_ivars->buf_cap  = 0;
    ivars->buf_tick = ivars->buf_max;
    SortFieldWriter_Clear_Buffer(self);

    // Record stream ends.
    run_ivars->ord_end = OutStream_Tell(temp_ord_out);
    if (ivars->var_width) {
        run_ivars->ix_end = OutStream_Tell(temp_ix_out);
    }
    run_ivars->dat_end = OutStream_Tell(temp_dat_out);

    // Add the run to the array.
    SortFieldWriter_Add_Run(self, (SortExternal*)run);
}
// Consolidate recyclable segments into the BackgroundMerger's new segment.
// Returns the number of segments merged (0 when nothing qualified).  A
// lone segment with no deletions is skipped, since merging it would just
// rewrite it unchanged.  Side effect: records each source segment's doc
// map in ivars->doc_maps.
static uint32_t
S_maybe_merge(BackgroundMerger *self) {
    BackgroundMergerIVARS *const ivars = BGMerger_IVARS(self);
    Vector *to_merge = IxManager_Recycle(ivars->manager, ivars->polyreader,
                                         ivars->del_writer, 0,
                                         ivars->optimize);
    // Vec_Get_Size() returns an unsigned size; keep it in the function's
    // unsigned return type rather than truncating through int32_t.
    uint32_t num_to_merge = (uint32_t)Vec_Get_Size(to_merge);

    // There's no point in merging one segment if it has no deletions,
    // because we'd just be rewriting it.
    if (num_to_merge == 1) {
        SegReader *seg_reader = (SegReader*)Vec_Fetch(to_merge, 0);
        if (!SegReader_Del_Count(seg_reader)) {
            DECREF(to_merge);
            return 0;
        }
    }
    else if (num_to_merge == 0) {
        DECREF(to_merge);
        return 0;
    }

    // Now that we're sure we're writing a new segment, prep the seg dir.
    SegWriter_Prep_Seg_Dir(ivars->seg_writer);

    // Consolidate segments.
    for (uint32_t i = 0, max = num_to_merge; i < max; i++) {
        SegReader *seg_reader = (SegReader*)Vec_Fetch(to_merge, i);
        String *seg_name = SegReader_Get_Seg_Name(seg_reader);
        int64_t doc_count = Seg_Get_Count(ivars->segment);
        Matcher *deletions = DelWriter_Seg_Deletions(ivars->del_writer,
                                                     seg_reader);
        // Remap doc ids past deletions, offset by our current doc count.
        I32Array *doc_map = DelWriter_Generate_Doc_Map(
                                ivars->del_writer, deletions,
                                SegReader_Doc_Max(seg_reader),
                                (int32_t)doc_count);
        Hash_Store(ivars->doc_maps, seg_name, (Obj*)doc_map);
        SegWriter_Merge_Segment(ivars->seg_writer, seg_reader, doc_map);
        DECREF(deletions);
    }

    DECREF(to_merge);
    return num_to_merge;
}
/* Initialize a SegReader: chain to the IndexReader initializer, cache the
 * segment's doc_max, then open sub-components.  If component init returns
 * an error message, self is destroyed and the message is thrown — callers
 * must not touch self after an exception. */
SegReader*
SegReader_init(SegReader *self, Schema *schema, Folder *folder,
               Snapshot *snapshot, VArray *segments, i32_t seg_tick) {
    CharBuf *mess;
    IxReader_init((IndexReader*)self, schema, folder, snapshot, segments,
                  seg_tick, NULL);
    self->doc_max = Seg_Get_Count(SegReader_Get_Segment(self));
    /* Try_Init_Components reports failure by returning an error message. */
    mess = SegReader_Try_Init_Components(self);
    if (mess) {
        /* An error occurred, so clean up self and throw an exception. */
        DECREF(self);
        Err_throw_mess(mess);
    }
    {
        /* Cache the deletion count; a missing DeletionsReader means no
         * deletions. */
        DeletionsReader *del_reader = (DeletionsReader*)Hash_Fetch(
            self->components, DELETIONSREADER.name);
        self->del_count = del_reader ? DelReader_Del_Count(del_reader) : 0;
    }
    return self;
}
// Build (and memoize in ivars->caches) the SortCache for `field`, reading
// the segment's sort-NNN.{ord,ix,dat} files.  Returns NULL when the field
// has no values; throws when the field isn't sortable or a required file
// can't be opened.  The .ix stream is only opened for variable-width
// (text/blob) fields.
static SortCache*
S_lazy_init_sort_cache(DefaultSortReader *self, String *field) {
    DefaultSortReaderIVARS *const ivars = DefSortReader_IVARS(self);

    // See if we have any values.
    Obj *count_obj = Hash_Fetch(ivars->counts, (Obj*)field);
    int32_t count = count_obj ? (int32_t)Obj_To_I64(count_obj) : 0;
    if (!count) { return NULL; }

    // Get a FieldType and sanity check that the field is sortable.
    Schema *schema = DefSortReader_Get_Schema(self);
    FieldType *type = Schema_Fetch_Type(schema, field);
    if (!type || !FType_Sortable(type)) {
        THROW(ERR, "'%o' isn't a sortable field", field);
    }

    // Open streams.
    Folder *folder = DefSortReader_Get_Folder(self);
    Segment *segment = DefSortReader_Get_Segment(self);
    String *seg_name = Seg_Get_Name(segment);
    int32_t field_num = Seg_Field_Num(segment, field);
    int8_t prim_id = FType_Primitive_ID(type);
    bool var_width = (prim_id == FType_TEXT || prim_id == FType_BLOB)
                     ? true : false;
    String *ord_path = Str_newf("%o/sort-%i32.ord", seg_name, field_num);
    InStream *ord_in = Folder_Open_In(folder, ord_path);
    DECREF(ord_path);
    if (!ord_in) {
        THROW(ERR, "Error building sort cache for '%o': %o", field,
              Err_get_error());
    }
    InStream *ix_in = NULL;
    if (var_width) {
        // Only variable-width types carry an offsets (.ix) file.
        String *ix_path = Str_newf("%o/sort-%i32.ix", seg_name, field_num);
        ix_in = Folder_Open_In(folder, ix_path);
        DECREF(ix_path);
        if (!ix_in) {
            THROW(ERR, "Error building sort cache for '%o': %o", field,
                  Err_get_error());
        }
    }
    String *dat_path = Str_newf("%o/sort-%i32.dat", seg_name, field_num);
    InStream *dat_in = Folder_Open_In(folder, dat_path);
    DECREF(dat_path);
    if (!dat_in) {
        THROW(ERR, "Error building sort cache for '%o': %o", field,
              Err_get_error());
    }

    // Null ord defaults to -1 (no NULL entry); ord width is derived from
    // the unique-value count when not stored explicitly.
    Obj *null_ord_obj = Hash_Fetch(ivars->null_ords, (Obj*)field);
    int32_t null_ord = null_ord_obj ? (int32_t)Obj_To_I64(null_ord_obj) : -1;
    Obj *ord_width_obj = Hash_Fetch(ivars->ord_widths, (Obj*)field);
    int32_t ord_width = ord_width_obj
                        ? (int32_t)Obj_To_I64(ord_width_obj)
                        : S_calc_ord_width(count);
    int32_t doc_max = (int32_t)Seg_Get_Count(segment);

    // Dispatch on the primitive type to pick the concrete SortCache class.
    SortCache *cache = NULL;
    switch (prim_id & FType_PRIMITIVE_ID_MASK) {
        case FType_TEXT:
            cache = (SortCache*)TextSortCache_new(field, type, count, doc_max,
                                                  null_ord, ord_width, ord_in,
                                                  ix_in, dat_in);
            break;
        case FType_INT32:
            cache = (SortCache*)I32SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_INT64:
            cache = (SortCache*)I64SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_FLOAT32:
            cache = (SortCache*)F32SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_FLOAT64:
            cache = (SortCache*)F64SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        default:
            THROW(ERR, "No SortCache class for %o", type);
    }
    // Memoize so subsequent lookups skip the file opens.
    Hash_Store(ivars->caches, (Obj*)field, (Obj*)cache);

    if (ivars->format == 2) { // bug compatibility
        SortCache_Set_Native_Ords(cache, true);
    }

    // The cache constructors take their own refs to the streams.
    DECREF(ord_in);
    DECREF(ix_in);
    DECREF(dat_in);

    return cache;
}
SortCache* SortCache_init(SortCache *self, Schema *schema, Folder *folder, Segment *segment, i32_t field_num) { CharBuf *field = Seg_Field_Name(segment, field_num); CharBuf *seg_name = Seg_Get_Name(segment); CharBuf *ord_file = CB_newf("%o/sort-%i32.ord", seg_name, field_num); CharBuf *ix_file = CB_newf("%o/sort-%i32.ix", seg_name, field_num); CharBuf *dat_file = CB_newf("%o/sort-%i32.dat", seg_name, field_num); i64_t ord_len, ix_len, dat_len; /* Derive. */ self->doc_max = Seg_Get_Count(segment); self->type = Schema_Fetch_Type(schema, field); if (!self->type || !FType_Sortable(self->type)) { THROW("'%o' isn't a sortable field", field); } /* Open instreams. */ self->ord_in = Folder_Open_In(folder, ord_file); self->ix_in = Folder_Open_In(folder, ix_file); self->dat_in = Folder_Open_In(folder, dat_file); if (!self->ix_in || !self->dat_in || !self->ord_in) { CharBuf *mess = MAKE_MESS("Can't open either %o, %o or %o", ord_file, ix_file, dat_file); DECREF(ord_file); DECREF(ix_file); DECREF(dat_file); Err_throw_mess(mess); } ord_len = InStream_Length(self->ord_in); ix_len = InStream_Length(self->ix_in); dat_len = InStream_Length(self->dat_in); /* Calculate the number of unique values and derive the ord bit width. */ self->num_uniq = (i32_t)(ix_len / 8) - 1; self->width = S_calc_width(self->num_uniq); /* Validate file lengths. */ { double bytes_per_doc = self->width / 8.0; double max_ords = ord_len / bytes_per_doc; if (max_ords < self->doc_max + 1) { THROW("Conflict between ord count max %f64 and doc_max %i32", max_ords, self->doc_max); } } /* Mmap ords, offsets and character data. */ self->ords = InStream_Buf(self->ord_in, (size_t)ord_len); self->offsets = (i64_t*)InStream_Buf(self->ix_in, (size_t)ix_len); self->char_data = InStream_Buf(self->dat_in, dat_len); { char *offs = (char*)self->offsets; self->offsets_limit = (i64_t*)(offs + ix_len); self->char_data_limit = self->char_data + dat_len; } DECREF(ord_file); DECREF(ix_file); DECREF(dat_file); return self; }
// Decide which segments to merge into the new segment and merge them.
// If we win the merge lock we may recycle anything (cutoff 0); otherwise
// we defer to the concurrent merger and only touch segments above its
// published cutoff.  Returns true when at least one segment was merged.
static bool_t
S_maybe_merge(Indexer *self, VArray *seg_readers) {
    bool_t    merge_happened  = false;
    uint32_t  num_seg_readers = VA_Get_Size(seg_readers);
    Lock     *merge_lock      = IxManager_Make_Merge_Lock(self->manager);
    bool_t    got_merge_lock  = Lock_Obtain(merge_lock);
    int64_t   cutoff;
    VArray   *to_merge;
    uint32_t  i, max;

    if (got_merge_lock) {
        self->merge_lock = merge_lock;
        cutoff = 0;
    }
    else {
        // If something else holds the merge lock, don't interfere.
        Hash *merge_data = IxManager_Read_Merge_Data(self->manager);
        if (merge_data) {
            Obj *cutoff_obj = Hash_Fetch_Str(merge_data, "cutoff", 6);
            if (cutoff_obj) {
                cutoff = Obj_To_I64(cutoff_obj);
            }
            else {
                // Merge data exists but is unreadable: merge nothing.
                cutoff = I64_MAX;
            }
            DECREF(merge_data);
        }
        else {
            cutoff = I64_MAX;
        }
        DECREF(merge_lock);
    }

    // Get a list of segments to recycle.  Validate and confirm that there
    // are no dupes in the list.
    to_merge = IxManager_Recycle(self->manager, self->polyreader,
                                 self->del_writer, cutoff, self->optimize);
    {
        Hash *seen = Hash_new(VA_Get_Size(to_merge));
        for (i = 0, max = VA_Get_Size(to_merge); i < max; i++) {
            SegReader *seg_reader = (SegReader*)CERTIFY(
                                        VA_Fetch(to_merge, i), SEGREADER);
            CharBuf *seg_name = SegReader_Get_Seg_Name(seg_reader);
            if (Hash_Fetch(seen, (Obj*)seg_name)) {
                DECREF(seen);
                DECREF(to_merge);
                THROW(ERR, "Recycle() tried to merge segment '%o' twice",
                      seg_name);
            }
            Hash_Store(seen, (Obj*)seg_name, INCREF(&EMPTY));
        }
        DECREF(seen);
    }

    // Consolidate segments if either sparse or optimizing forced.
    for (i = 0, max = VA_Get_Size(to_merge); i < max; i++) {
        SegReader *seg_reader = (SegReader*)VA_Fetch(to_merge, i);
        int64_t seg_num = SegReader_Get_Seg_Num(seg_reader);
        Matcher *deletions = DelWriter_Seg_Deletions(self->del_writer,
                                                     seg_reader);
        // Remap doc ids past deletions, offset by our current doc count.
        I32Array *doc_map = DelWriter_Generate_Doc_Map(self->del_writer,
                                                       deletions,
                                                       SegReader_Doc_Max(seg_reader),
                                                       (int32_t)Seg_Get_Count(self->segment)
                                                      );
        // Sanity check: Recycle() must honor the cutoff we passed it.
        if (seg_num <= cutoff) {
            THROW(ERR, "Segment %o violates cutoff (%i64 <= %i64)",
                  SegReader_Get_Seg_Name(seg_reader), seg_num, cutoff);
        }
        SegWriter_Merge_Segment(self->seg_writer, seg_reader, doc_map);
        merge_happened = true;
        DECREF(deletions);
        DECREF(doc_map);
    }

    // Write out new deletions.
    if (DelWriter_Updated(self->del_writer)) {
        // Only write out if they haven't all been applied.
        if (VA_Get_Size(to_merge) != num_seg_readers) {
            DelWriter_Finish(self->del_writer);
        }
    }

    DECREF(to_merge);

    return merge_happened;
}
// Drain the sorted elems from `self` and write the run's three files:
// unique sorted values to dat_out (offsets to ix_out for variable-width
// types), and a bit-packed per-document ordinal array to ord_out.
// Returns the run's cardinality (count of distinct ords, including the
// NULL ord if any).  Documents with no value get the NULL ord.
static int32_t
S_write_files(SortFieldWriter *self, OutStream *ord_out, OutStream *ix_out,
              OutStream *dat_out) {
    SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self);
    int8_t    prim_id   = ivars->prim_id;
    int32_t   doc_max   = (int32_t)Seg_Get_Count(ivars->segment);
    // If fewer values than docs were collected, some docs are NULL.
    bool      has_nulls = ivars->count == doc_max ? false : true;
    size_t    size      = (doc_max + 1) * sizeof(int32_t);
    int32_t  *ords      = (int32_t*)MALLOCATE(size);
    int32_t   ord       = 0;
    int64_t   dat_start = OutStream_Tell(dat_out);

    // Assign -1 as a stand-in for the NULL ord.
    for (int32_t i = 0; i <= doc_max; i++) {
        ords[i] = -1;
    }

    // Grab the first item and record its ord.  Add a dummy ord for invalid
    // doc id 0.
    SFWriterElem *elem = (SFWriterElem*)SortFieldWriter_Fetch(self);
    SFWriterElemIVARS *elem_ivars = SFWriterElem_IVARS(elem);
    if (elem_ivars->doc_id > doc_max) {
        THROW(ERR, "doc_id %i32 greater than doc_max %i32",
              elem_ivars->doc_id, doc_max);
    }
    ords[elem_ivars->doc_id] = ord;
    ords[0] = 0;

    // Build array of ords, write non-NULL sorted values.  Since elems
    // arrive sorted, equal values are adjacent; a new ord is assigned only
    // when the value actually changes.
    Obj *last_val = INCREF(elem_ivars->value);
    S_write_val(elem_ivars->value, prim_id, ix_out, dat_out, dat_start);
    DECREF(elem);
    while (NULL != (elem = (SFWriterElem*)SortFieldWriter_Fetch(self))) {
        elem_ivars = SFWriterElem_IVARS(elem);
        if (elem_ivars->value != last_val) {
            int32_t comparison = FType_Compare_Values(ivars->type,
                                                      elem_ivars->value,
                                                      last_val);
            if (comparison != 0) {
                ord++;
                S_write_val(elem_ivars->value, prim_id, ix_out, dat_out,
                            dat_start);
            }
            DECREF(last_val);
            last_val = INCREF(elem_ivars->value);
        }
        if (elem_ivars->doc_id > doc_max) {
            THROW(ERR, "doc_id %i32 greater than doc_max %i32",
                  elem_ivars->doc_id, doc_max);
        }
        ords[elem_ivars->doc_id] = ord;
        DECREF(elem);
    }
    DECREF(last_val);

    // If there are NULL values, write one now and record the NULL ord.
    if (has_nulls) {
        S_write_val(NULL, prim_id, ix_out, dat_out, dat_start);
        ord++;
        ivars->null_ord = ord;
    }
    int32_t null_ord = ivars->null_ord;

    // Write one extra file pointer so that we can always derive length.
    if (ivars->var_width) {
        OutStream_Write_I64(ix_out, OutStream_Tell(dat_out) - dat_start);
    }

    // Calculate cardinality and ord width.
    int32_t cardinality = ord + 1;
    ivars->ord_width = S_calc_width(cardinality);
    int32_t ord_width = ivars->ord_width;

    // Write ords.  Pack one ord_width-bit ordinal per doc id (0..doc_max),
    // substituting the NULL ord for the -1 placeholders.
    const double BITS_PER_BYTE = 8.0;
    double bytes_per_doc = ord_width / BITS_PER_BYTE;
    double byte_count = ceil((doc_max + 1) * bytes_per_doc);
    char *compressed_ords
        = (char*)CALLOCATE((size_t)byte_count, sizeof(char));
    for (int32_t i = 0; i <= doc_max; i++) {
        int32_t real_ord = ords[i] == -1 ? null_ord : ords[i];
        S_write_ord(compressed_ords, ord_width, i, real_ord);
    }
    OutStream_Write_Bytes(ord_out, compressed_ords, (size_t)byte_count);
    FREEMEM(compressed_ords);

    FREEMEM(ords);
    return cardinality;
}