// Flip the pool from accumulation to consumption mode: open read views of
// the shared temp files, convert any leftover buffered postings into one
// more run, then assign streams and a memory-budget slice to every run.
void PostPool_Flip_IMP(PostingPool *self) {
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);
    uint32_t num_runs = VA_Get_Size(ivars->runs);
    // Split the memory budget evenly among runs (whole budget if none).
    uint32_t sub_thresh = num_runs > 0
                          ? ivars->mem_thresh / num_runs
                          : ivars->mem_thresh;

    if (num_runs) {
        Folder *folder = PolyReader_Get_Folder(ivars->polyreader);
        String *seg_name = Seg_Get_Name(ivars->segment);
        String *lex_temp_path = Str_newf("%o/lextemp", seg_name);
        String *post_temp_path = Str_newf("%o/ptemp", seg_name);
        // Open InStreams over the temp files the runs were flushed to.
        ivars->lex_temp_in = Folder_Open_In(folder, lex_temp_path);
        if (!ivars->lex_temp_in) {
            RETHROW(INCREF(Err_get_error()));
        }
        ivars->post_temp_in = Folder_Open_In(folder, post_temp_path);
        if (!ivars->post_temp_in) {
            RETHROW(INCREF(Err_get_error()));
        }
        DECREF(lex_temp_path);
        DECREF(post_temp_path);
    }

    PostPool_Sort_Buffer(self);
    if (num_runs && (ivars->buf_max - ivars->buf_tick) > 0) {
        uint32_t num_items = PostPool_Buffer_Count(self);
        // Cheap imitation of flush. FIXME.
        PostingPool *run
            = PostPool_new(ivars->schema, ivars->snapshot, ivars->segment,
                           ivars->polyreader, ivars->field, ivars->lex_writer,
                           ivars->mem_pool, ivars->lex_temp_out,
                           ivars->post_temp_out, ivars->skip_out);
        PostPool_Grow_Buffer(run, num_items);
        PostingPoolIVARS *const run_ivars = PostPool_IVARS(run);
        // Move the unconsumed buffer slice into the new run.  Refcounts
        // travel with the raw pointers; the source slots are abandoned by
        // zeroing buf_tick/buf_max below rather than DECREF'd.
        memcpy(run_ivars->buffer, (ivars->buffer) + ivars->buf_tick,
               num_items * sizeof(Obj*));
        run_ivars->buf_max = num_items;
        PostPool_Add_Run(self, (SortExternal*)run);
        ivars->buf_tick = 0;
        ivars->buf_max = 0;
    }

    // Assign.
    for (uint32_t i = 0; i < num_runs; i++) {
        PostingPool *run = (PostingPool*)VA_Fetch(ivars->runs, i);
        if (run != NULL) {
            PostPool_Set_Mem_Thresh(run, sub_thresh);
            // Runs without a lexicon hold in-memory content and need a
            // "fresh" flip against the shared temp streams.
            if (!PostPool_IVARS(run)->lexicon) {
                S_fresh_flip(run, ivars->lex_temp_in, ivars->post_temp_in);
            }
        }
    }

    ivars->flipped = true;
}
// Initialize a DefaultHighlightReader: validate the segment's highlight
// metadata format, then open the index and data InStreams if present.
DefaultHighlightReader*
DefHLReader_init(DefaultHighlightReader *self, Schema *schema,
                 Folder *folder, Snapshot *snapshot, VArray *segments,
                 int32_t seg_tick) {
    Segment *segment;
    Hash *metadata;
    HLReader_init((HighlightReader*)self, schema, folder, snapshot,
                  segments, seg_tick);
    segment = DefHLReader_Get_Segment(self);
    metadata = (Hash*)Seg_Fetch_Metadata_Str(segment, "highlight", 9);
    if (!metadata) {
        // Fall back to the alternate "term_vectors" metadata key.
        metadata = (Hash*)Seg_Fetch_Metadata_Str(segment, "term_vectors", 12);
    }

    // Check format.
    if (metadata) {
        Obj *format = Hash_Fetch_Str(metadata, "format", 6);
        if (!format) {
            THROW(ERR, "Missing 'format' var");
        }
        else {
            if (Obj_To_I64(format) != HLWriter_current_file_format) {
                THROW(ERR, "Unsupported highlight data format: %i64",
                      Obj_To_I64(format));
            }
        }
    }

    // Open instreams.  A missing .ix file just means no highlight data
    // for this segment; only a failed open of an existing file throws.
    {
        CharBuf *seg_name = Seg_Get_Name(segment);
        CharBuf *ix_file = CB_newf("%o/highlight.ix", seg_name);
        CharBuf *dat_file = CB_newf("%o/highlight.dat", seg_name);
        if (Folder_Exists(folder, ix_file)) {
            self->ix_in = Folder_Open_In(folder, ix_file);
            if (!self->ix_in) {
                // Free local paths and self before rethrowing.
                Err *error = (Err*)INCREF(Err_get_error());
                DECREF(ix_file);
                DECREF(dat_file);
                DECREF(self);
                RETHROW(error);
            }
            self->dat_in = Folder_Open_In(folder, dat_file);
            if (!self->dat_in) {
                Err *error = (Err*)INCREF(Err_get_error());
                DECREF(ix_file);
                DECREF(dat_file);
                DECREF(self);
                RETHROW(error);
            }
        }
        DECREF(ix_file);
        DECREF(dat_file);
    }
    return self;
}
// Initialize a DefaultDocReader: validate the segment's document-storage
// format metadata, then open the index and data InStreams if present.
DefaultDocReader*
DefDocReader_init(DefaultDocReader *self, Schema *schema, Folder *folder,
                  Snapshot *snapshot, Vector *segments, int32_t seg_tick) {
    Hash *metadata;
    Segment *segment;
    DocReader_init((DocReader*)self, schema, folder, snapshot, segments,
                   seg_tick);
    DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self);
    segment = DefDocReader_Get_Segment(self);
    metadata = (Hash*)Seg_Fetch_Metadata_Utf8(segment, "documents", 9);
    // No metadata means no stored documents; leave the streams NULL.
    if (metadata) {
        String *seg_name = Seg_Get_Name(segment);
        String *ix_file = Str_newf("%o/documents.ix", seg_name);
        String *dat_file = Str_newf("%o/documents.dat", seg_name);
        Obj *format = Hash_Fetch_Utf8(metadata, "format", 6);

        // Check format.
        if (!format) {
            THROW(ERR, "Missing 'format' var");
        }
        else {
            int64_t format_val = Json_obj_to_i64(format);
            if (format_val < DocWriter_current_file_format) {
                THROW(ERR, "Obsolete doc storage format %i64; "
                      "Index regeneration is required", format_val);
            }
            else if (format_val != DocWriter_current_file_format) {
                THROW(ERR, "Unsupported doc storage format: %i64",
                      format_val);
            }
        }

        // Get streams.  On failure, free local paths and self before
        // rethrowing the pending error.
        if (Folder_Exists(folder, ix_file)) {
            ivars->ix_in = Folder_Open_In(folder, ix_file);
            if (!ivars->ix_in) {
                Err *error = (Err*)INCREF(Err_get_error());
                DECREF(ix_file);
                DECREF(dat_file);
                DECREF(self);
                RETHROW(error);
            }
            ivars->dat_in = Folder_Open_In(folder, dat_file);
            if (!ivars->dat_in) {
                Err *error = (Err*)INCREF(Err_get_error());
                DECREF(ix_file);
                DECREF(dat_file);
                DECREF(self);
                RETHROW(error);
            }
        }
        DECREF(ix_file);
        DECREF(dat_file);
    }
    return self;
}
// Initialize a LexIndex over the lexicon index files for `field` within
// `segment`, memory-mapping the .ixix offsets for random access.
LexIndex*
LexIndex_init(LexIndex *self, Schema *schema, Folder *folder,
              Segment *segment, String *field) {
    int32_t field_num = Seg_Field_Num(segment, field);
    String *seg_name = Seg_Get_Name(segment);
    String *ixix_file = Str_newf("%o/lexicon-%i32.ixix", seg_name, field_num);
    String *ix_file = Str_newf("%o/lexicon-%i32.ix", seg_name, field_num);
    Architecture *arch = Schema_Get_Architecture(schema);

    // Init.
    Lex_init((Lexicon*)self, field);
    LexIndexIVARS *const ivars = LexIndex_IVARS(self);
    ivars->tinfo = TInfo_new(0);
    ivars->tick = 0;

    // Derive
    ivars->field_type = Schema_Fetch_Type(schema, field);
    if (!ivars->field_type) {
        String *mess = MAKE_MESS("Unknown field: '%o'", field);
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        Err_throw_mess(ERR, mess);
    }
    // Take our own refcount on the fetched FieldType (the INCREF here
    // implies Schema_Fetch_Type returned a borrowed reference).
    ivars->field_type = (FieldType*)INCREF(ivars->field_type);
    ivars->term_stepper = FType_Make_Term_Stepper(ivars->field_type);
    ivars->ixix_in = Folder_Open_In(folder, ixix_file);
    if (!ivars->ixix_in) {
        // Free local paths and self before rethrowing.
        Err *error = (Err*)INCREF(Err_get_error());
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        RETHROW(error);
    }
    ivars->ix_in = Folder_Open_In(folder, ix_file);
    if (!ivars->ix_in) {
        Err *error = (Err*)INCREF(Err_get_error());
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        RETHROW(error);
    }
    ivars->index_interval = Arch_Index_Interval(arch);
    ivars->skip_interval = Arch_Skip_Interval(arch);
    // Each .ixix entry is a 64-bit file offset into the .ix file.
    ivars->size = (int32_t)(InStream_Length(ivars->ixix_in) / sizeof(int64_t));
    ivars->offsets
        = (const int64_t*)InStream_Buf(ivars->ixix_in,
                                       (size_t)InStream_Length(ivars->ixix_in));

    DECREF(ixix_file);
    DECREF(ix_file);

    return self;
}
// Initialize a DefaultHighlightReader: validate the highlight metadata
// format (falling back to the alternate "term_vectors" key), then open
// the index and data InStreams if present.
DefaultHighlightReader*
DefHLReader_init(DefaultHighlightReader *self, Schema *schema,
                 Folder *folder, Snapshot *snapshot, Vector *segments,
                 int32_t seg_tick) {
    HLReader_init((HighlightReader*)self, schema, folder, snapshot,
                  segments, seg_tick);
    DefaultHighlightReaderIVARS *const ivars = DefHLReader_IVARS(self);
    Segment *segment = DefHLReader_Get_Segment(self);
    Hash *metadata = (Hash*)Seg_Fetch_Metadata_Utf8(segment, "highlight", 9);
    if (!metadata) {
        metadata = (Hash*)Seg_Fetch_Metadata_Utf8(segment, "term_vectors", 12);
    }

    // Check format.
    if (metadata) {
        Obj *format = Hash_Fetch_Utf8(metadata, "format", 6);
        if (!format) {
            THROW(ERR, "Missing 'format' var");
        }
        else {
            if (Json_obj_to_i64(format) != HLWriter_current_file_format) {
                THROW(ERR, "Unsupported highlight data format: %i64",
                      Json_obj_to_i64(format));
            }
        }
    }

    // Open instreams.  A missing .ix file just means no highlight data
    // for this segment; only a failed open of an existing file throws.
    String *seg_name = Seg_Get_Name(segment);
    String *ix_file = Str_newf("%o/highlight.ix", seg_name);
    String *dat_file = Str_newf("%o/highlight.dat", seg_name);
    if (Folder_Exists(folder, ix_file)) {
        ivars->ix_in = Folder_Open_In(folder, ix_file);
        if (!ivars->ix_in) {
            // Free local paths and self before rethrowing.
            Err *error = (Err*)INCREF(Err_get_error());
            DECREF(ix_file);
            DECREF(dat_file);
            DECREF(self);
            RETHROW(error);
        }
        ivars->dat_in = Folder_Open_In(folder, dat_file);
        if (!ivars->dat_in) {
            Err *error = (Err*)INCREF(Err_get_error());
            DECREF(ix_file);
            DECREF(dat_file);
            DECREF(self);
            RETHROW(error);
        }
    }
    DECREF(ix_file);
    DECREF(dat_file);
    return self;
}
/* Initialize a DefaultDocReader: validate the segment's document-storage
 * format metadata, then open the index and data InStreams if present.
 */
DefaultDocReader*
DefDocReader_init(DefaultDocReader *self, Schema *schema, Folder *folder,
                  Snapshot *snapshot, VArray *segments, i32_t seg_tick) {
    Hash *metadata;
    Segment *segment;
    DocReader_init((DocReader*)self, schema, folder, snapshot, segments,
                   seg_tick);
    segment = DefDocReader_Get_Segment(self);
    metadata = (Hash*)Seg_Fetch_Metadata_Str(segment, "documents", 9);
    /* No metadata means no stored documents; leave the streams NULL. */
    if (metadata) {
        CharBuf *seg_name = Seg_Get_Name(segment);
        CharBuf *ix_file = CB_newf("%o/documents.ix", seg_name);
        CharBuf *dat_file = CB_newf("%o/documents.dat", seg_name);
        Obj *format = Hash_Fetch_Str(metadata, "format", 6);

        /* Check format. */
        if (!format) {
            THROW("Missing 'format' var");
        }
        else {
            i64_t format_val = Obj_To_I64(format);
            if (format_val < DocWriter_current_file_format) {
                THROW("Obsolete doc storage format %i64; "
                      "Index regeneration is required", format_val);
            }
            else if (format_val != DocWriter_current_file_format) {
                THROW("Unsupported doc storage format: %i64", format_val);
            }
        }

        /* Get streams.  Absence of the .ix file means this segment holds
         * no stored documents, which is not an error. */
        if (Folder_Exists(folder, ix_file)) {
            self->ix_in = Folder_Open_In(folder, ix_file);
            self->dat_in = Folder_Open_In(folder, dat_file);
            if (!self->ix_in || !self->dat_in) {
                CharBuf *mess = MAKE_MESS("Can't open either %o or %o",
                                          ix_file, dat_file);
                DECREF(ix_file);
                DECREF(dat_file);
                DECREF(self);
                Err_throw_mess(mess);
            }
        }
        DECREF(ix_file);
        DECREF(dat_file);
    }
    return self;
}
/* Initialize a LexIndex over the lexicon index files for `field`,
 * memory-mapping the .ixix offsets and the raw .ix term data.
 */
LexIndex*
LexIndex_init(LexIndex *self, Schema *schema, Folder *folder,
              Segment *segment, const CharBuf *field) {
    i32_t field_num = Seg_Field_Num(segment, field);
    CharBuf *seg_name = Seg_Get_Name(segment);
    CharBuf *ixix_file = CB_newf("%o/lexicon-%i32.ixix", seg_name, field_num);
    CharBuf *ix_file = CB_newf("%o/lexicon-%i32.ix", seg_name, field_num);
    Architecture *arch = Schema_Get_Architecture(schema);

    /* Init. */
    self->term = ViewCB_new_from_trusted_utf8(NULL, 0);
    self->tinfo = TInfo_new(0,0,0,0);
    self->tick = 0;

    /* Derive */
    self->field_type = Schema_Fetch_Type(schema, field);
    if (!self->field_type) {
        CharBuf *mess = MAKE_MESS("Unknown field: '%o'", field);
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        Err_throw_mess(mess);
    }
    /* Take our own refcount on the fetched FieldType. */
    INCREF(self->field_type);
    self->ixix_in = Folder_Open_In(folder, ixix_file);
    self->ix_in = Folder_Open_In(folder, ix_file);
    if (!self->ixix_in || !self->ix_in) {
        CharBuf *mess = MAKE_MESS("Can't open either %o or %o", ix_file,
                                  ixix_file);
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        Err_throw_mess(mess);
    }
    self->index_interval = Arch_Index_Interval(arch);
    self->skip_interval = Arch_Skip_Interval(arch);
    /* Each .ixix entry is a 64-bit file offset into the .ix file. */
    self->size = (i32_t)(InStream_Length(self->ixix_in) / sizeof(i64_t));
    self->offsets
        = (i64_t*)InStream_Buf(self->ixix_in,
                               (size_t)InStream_Length(self->ixix_in));
    self->data = InStream_Buf(self->ix_in, InStream_Length(self->ix_in));
    self->limit = self->data + InStream_Length(self->ix_in);

    DECREF(ixix_file);
    DECREF(ix_file);

    return self;
}
// Flip the writer from accumulation to consumption mode.  Exactly one of
// two states is legal here: buffered in-memory items with no runs (sort
// the buffer), or flushed runs with an empty buffer (open the temp files
// and hand each run a slice of the memory budget).  Throws if called
// twice or if both runs and buffered items exist.
//
// Fix: the original redundantly assigned `ivars->flipped = true` a second
// time at the end of the function; the flag is already set immediately
// after the double-flip guard.
void SortFieldWriter_Flip_IMP(SortFieldWriter *self) {
    SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self);
    uint32_t num_items = SortFieldWriter_Buffer_Count(self);
    uint32_t num_runs = Vec_Get_Size(ivars->runs);

    if (ivars->flipped) { THROW(ERR, "Can't call Flip() twice"); }
    ivars->flipped = true;

    // Sanity check.
    if (num_runs && num_items) {
        THROW(ERR, "Sanity check failed: num_runs: %u32 num_items: %u32",
              num_runs, num_items);
    }

    if (num_items) {
        SortFieldWriter_Sort_Buffer(self);
    }
    else if (num_runs) {
        Folder *folder = PolyReader_Get_Folder(ivars->polyreader);
        String *seg_name = Seg_Get_Name(ivars->segment);
        String *ord_path = Str_newf("%o/sort_ord_temp", seg_name);
        ivars->ord_in = Folder_Open_In(folder, ord_path);
        DECREF(ord_path);
        if (!ivars->ord_in) { RETHROW(INCREF(Err_get_error())); }
        // The .ix temp file exists only for variable-width field data.
        if (ivars->var_width) {
            String *ix_path = Str_newf("%o/sort_ix_temp", seg_name);
            ivars->ix_in = Folder_Open_In(folder, ix_path);
            DECREF(ix_path);
            if (!ivars->ix_in) { RETHROW(INCREF(Err_get_error())); }
        }
        String *dat_path = Str_newf("%o/sort_dat_temp", seg_name);
        ivars->dat_in = Folder_Open_In(folder, dat_path);
        DECREF(dat_path);
        if (!ivars->dat_in) { RETHROW(INCREF(Err_get_error())); }

        // Assign streams and a slice of mem_thresh, clamped so tiny
        // budgets don't starve individual runs.
        size_t sub_thresh = ivars->mem_thresh / num_runs;
        if (sub_thresh < 65536) { sub_thresh = 65536; }
        for (uint32_t i = 0; i < num_runs; i++) {
            SortFieldWriter *run
                = (SortFieldWriter*)Vec_Fetch(ivars->runs, i);
            S_flip_run(run, sub_thresh, ivars->ord_in, ivars->ix_in,
                       ivars->dat_in);
        }
    }
}
// Read the entire file at `path` into a newly allocated, NUL-terminated
// ByteBuf.  Rethrows if the file can't be opened; throws if its length
// won't fit in a size_t.  Caller takes ownership of the returned ByteBuf.
ByteBuf*
Folder_slurp_file(Folder *self, const CharBuf *path) {
    ByteBuf  *contents = NULL;
    InStream *stream   = Folder_Open_In(self, path);

    if (!stream) {
        RETHROW(INCREF(Err_get_error()));
    }
    else {
        uint64_t byte_count = InStream_Length(stream);
        if (byte_count >= SIZE_MAX) {
            // Too large to address in memory on this platform.
            InStream_Close(stream);
            DECREF(stream);
            THROW(ERR, "File %o is too big to slurp (%u64 bytes)", path,
                  byte_count);
        }
        else {
            size_t amount = (size_t)byte_count;
            // One extra byte for the terminating NUL.
            char *buf = (char*)MALLOCATE(amount + 1);
            InStream_Read_Bytes(stream, buf, amount);
            buf[amount] = '\0';
            contents = BB_new_steal_bytes(buf, amount, amount + 1);
            InStream_Close(stream);
            DECREF(stream);
        }
    }

    return contents;
}
/* Dedicate a PostingPool to reading back one field's postings from an
 * existing segment: open its lexicon and postings files (if present) and
 * record the doc base and optional doc re-mapping.  Throws if the pool
 * already holds content.
 *
 * Fix: the "other content" guard is now checked before lex_file is
 * allocated, so throwing no longer leaks the CharBuf.
 */
void
PostPool_assign_seg(PostingPool *self, Folder *other_folder,
                    Segment *other_segment, i32_t doc_base,
                    I32Array *doc_map) {
    i32_t field_num = Seg_Field_Num(other_segment, self->field);
    CharBuf *other_seg_name = Seg_Get_Name(other_segment);
    CharBuf *lex_file;

    /* Dedicate pool to this task alone. */
    if (self->from_seg || self->cache_max > 0 || self->lex_end != 0)
        THROW("Can't Assign_Segment to PostingPool with other content");
    self->from_seg = true;

    lex_file = CB_newf("%o/lexicon-%i32.dat", other_seg_name, field_num);

    /* Prepare to read from existing files. */
    if (Folder_Exists(other_folder, lex_file)) {
        CharBuf *post_file = CB_newf("%o/postings-%i32.dat", other_seg_name,
                                     field_num);

        /* Open lexicon and postings files. */
        self->lex_instream  = Folder_Open_In(other_folder, lex_file);
        self->post_instream = Folder_Open_In(other_folder, post_file);
        if (!self->lex_instream)  { THROW("Can't open %o", lex_file); }
        if (!self->post_instream) { THROW("Can't open %o", post_file); }
        self->lex_end  = InStream_Length(self->lex_instream);
        self->post_end = InStream_Length(self->post_instream);

        /* Assign doc base and doc map. */
        self->doc_base = doc_base;
        self->doc_map  = doc_map ? (I32Array*)INCREF(doc_map) : NULL;

        DECREF(post_file);
    }
    else {
        /* This posting pool will be empty. */
    }

    /* Clean up. */
    DECREF(lex_file);
}
// Exercise Folder_Open_In: success on flat and nested files; failure
// (with Err_error set) on a directory path and on a path inside a
// directory that doesn't exist.
static void
test_Open_In(TestBatch *batch) {
    Folder *folder = (Folder*)RAMFolder_new(NULL);

    // Lay down fixtures: two dirs and two empty files.
    Folder_MkDir(folder, &foo);
    Folder_MkDir(folder, &foo_bar);
    FileHandle *handle
        = Folder_Open_FileHandle(folder, &boffo, FH_CREATE | FH_WRITE_ONLY);
    DECREF(handle);
    handle = Folder_Open_FileHandle(folder, &foo_boffo,
                                    FH_CREATE | FH_WRITE_ONLY);
    DECREF(handle);

    // Happy path: flat file.
    InStream *stream = Folder_Open_In(folder, &boffo);
    TEST_TRUE(batch, stream && InStream_Is_A(stream, INSTREAM), "Open_In");
    DECREF(stream);

    // Happy path: file inside a subdirectory.
    stream = Folder_Open_In(folder, &foo_boffo);
    TEST_TRUE(batch, stream && InStream_Is_A(stream, INSTREAM),
              "Open_In for nested file");
    DECREF(stream);

    // Opening a directory must fail and set the global error.
    Err_set_error(NULL);
    stream = Folder_Open_In(folder, &foo);
    TEST_TRUE(batch, stream == NULL,
              "Open_InStream on existing dir path fails");
    TEST_TRUE(batch, Err_get_error() != NULL,
              "Open_In on existing dir name sets Err_error");

    // Opening a path under a non-existent dir must also fail.
    Err_set_error(NULL);
    stream = Folder_Open_In(folder, &foo_bar_baz_boffo);
    TEST_TRUE(batch, stream == NULL,
              "Open_In for entry within non-existent dir fails");
    TEST_TRUE(batch, Err_get_error() != NULL,
              "Open_In for entry within non-existent dir sets Err_error");

    DECREF(folder);
}
// Parse the JSON file at `path` into a new Obj.  Returns NULL and adds a
// frame to the pending Err if the file can't be opened or doesn't parse.
Obj*
Json_slurp_json(Folder *folder, const CharBuf *path) {
    InStream *stream = Folder_Open_In(folder, path);
    if (stream == NULL) {
        ERR_ADD_FRAME(Err_get_error());
        return NULL;
    }

    // Parse straight out of the stream's memory-mapped window.
    size_t num_bytes = (size_t)InStream_Length(stream);
    char *window = InStream_Buf(stream, num_bytes);
    Obj *result = S_parse_json(window, num_bytes);
    InStream_Close(stream);
    DECREF(stream);

    if (result == NULL) {
        ERR_ADD_FRAME(Err_get_error());
    }
    return result;
}
// Build (and memoize in ivars->caches) the SortCache for `field`, or
// return NULL if the segment has no values for it.  Throws if the field
// isn't sortable or if any of the sort files can't be opened.
static SortCache*
S_lazy_init_sort_cache(DefaultSortReader *self, String *field) {
    DefaultSortReaderIVARS *const ivars = DefSortReader_IVARS(self);

    // See if we have any values.
    Obj *count_obj = Hash_Fetch(ivars->counts, (Obj*)field);
    int32_t count = count_obj ? (int32_t)Obj_To_I64(count_obj) : 0;
    if (!count) { return NULL; }

    // Get a FieldType and sanity check that the field is sortable.
    Schema *schema = DefSortReader_Get_Schema(self);
    FieldType *type = Schema_Fetch_Type(schema, field);
    if (!type || !FType_Sortable(type)) {
        THROW(ERR, "'%o' isn't a sortable field", field);
    }

    // Open streams.
    Folder *folder = DefSortReader_Get_Folder(self);
    Segment *segment = DefSortReader_Get_Segment(self);
    String *seg_name = Seg_Get_Name(segment);
    int32_t field_num = Seg_Field_Num(segment, field);
    int8_t prim_id = FType_Primitive_ID(type);
    // Text/blob values are variable-width and need the .ix offsets file;
    // fixed-width numerics don't.
    bool var_width = (prim_id == FType_TEXT || prim_id == FType_BLOB)
                     ? true
                     : false;
    String *ord_path = Str_newf("%o/sort-%i32.ord", seg_name, field_num);
    InStream *ord_in = Folder_Open_In(folder, ord_path);
    DECREF(ord_path);
    if (!ord_in) {
        THROW(ERR, "Error building sort cache for '%o': %o", field,
              Err_get_error());
    }
    InStream *ix_in = NULL;
    if (var_width) {
        String *ix_path = Str_newf("%o/sort-%i32.ix", seg_name, field_num);
        ix_in = Folder_Open_In(folder, ix_path);
        DECREF(ix_path);
        if (!ix_in) {
            THROW(ERR, "Error building sort cache for '%o': %o", field,
                  Err_get_error());
        }
    }
    String *dat_path = Str_newf("%o/sort-%i32.dat", seg_name, field_num);
    InStream *dat_in = Folder_Open_In(folder, dat_path);
    DECREF(dat_path);
    if (!dat_in) {
        // NOTE(review): throwing here leaves ord_in/ix_in open -- confirm
        // whether the error unwind reclaims them.
        THROW(ERR, "Error building sort cache for '%o': %o", field,
              Err_get_error());
    }

    Obj *null_ord_obj = Hash_Fetch(ivars->null_ords, (Obj*)field);
    int32_t null_ord = null_ord_obj ? (int32_t)Obj_To_I64(null_ord_obj) : -1;
    Obj *ord_width_obj = Hash_Fetch(ivars->ord_widths, (Obj*)field);
    // Fall back to deriving the ord width from the value count when the
    // index metadata doesn't record it.
    int32_t ord_width = ord_width_obj
                        ? (int32_t)Obj_To_I64(ord_width_obj)
                        : S_calc_ord_width(count);
    int32_t doc_max = (int32_t)Seg_Get_Count(segment);

    // Dispatch on the field's primitive type.
    SortCache *cache = NULL;
    switch (prim_id & FType_PRIMITIVE_ID_MASK) {
        case FType_TEXT:
            cache = (SortCache*)TextSortCache_new(field, type, count,
                                                  doc_max, null_ord,
                                                  ord_width, ord_in, ix_in,
                                                  dat_in);
            break;
        case FType_INT32:
            cache = (SortCache*)I32SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_INT64:
            cache = (SortCache*)I64SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_FLOAT32:
            cache = (SortCache*)F32SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_FLOAT64:
            cache = (SortCache*)F64SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        default:
            THROW(ERR, "No SortCache class for %o", type);
    }
    Hash_Store(ivars->caches, (Obj*)field, (Obj*)cache);

    if (ivars->format == 2) { // bug compatibility
        SortCache_Set_Native_Ords(cache, true);
    }

    // Release our refs; presumably the SortCache constructor took its own
    // refs on the streams -- verify against the constructor contracts.
    DECREF(ord_in);
    DECREF(ix_in);
    DECREF(dat_in);

    return cache;
}
// Open a CompoundFileReader over `folder`: parse and validate
// "cfmeta.json", open the shared "cf.dat" InStream, and (for format 1)
// strip the obsolete directory-name prefix from each virtual-file record.
// Returns NULL and sets Err_error on failure.
//
// Fix: the "Unsupported compound file format" message was missing the
// closing paren after the current-format number.
CompoundFileReader*
CFReader_do_open(CompoundFileReader *self, Folder *folder) {
    CharBuf *cfmeta_file = (CharBuf*)ZCB_WRAP_STR("cfmeta.json", 11);
    Hash *metadata = (Hash*)Json_slurp_json((Folder*)folder, cfmeta_file);
    Err *error = NULL;

    Folder_init((Folder*)self, Folder_Get_Path(folder));

    // Parse metadata file.
    if (!metadata || !Hash_Is_A(metadata, HASH)) {
        error = Err_new(CB_newf("Can't read '%o' in '%o'", cfmeta_file,
                                Folder_Get_Path(folder)));
    }
    else {
        Obj *format = Hash_Fetch_Str(metadata, "format", 6);
        self->format = format ? (int32_t)Obj_To_I64(format) : 0;
        self->records = (Hash*)INCREF(Hash_Fetch_Str(metadata, "files", 5));
        if (self->format < 1) {
            error = Err_new(CB_newf("Corrupt %o file: Missing or invalid 'format'",
                                    cfmeta_file));
        }
        else if (self->format > CFWriter_current_file_format) {
            error = Err_new(CB_newf("Unsupported compound file format: %i32 "
                                    "(current = %i32)", self->format,
                                    CFWriter_current_file_format));
        }
        else if (!self->records) {
            error = Err_new(CB_newf("Corrupt %o file: missing 'files' key",
                                    cfmeta_file));
        }
    }
    DECREF(metadata);
    if (error) {
        Err_set_error(error);
        DECREF(self);
        return NULL;
    }

    // Open an instream which we'll clone over and over.
    CharBuf *cf_file = (CharBuf*)ZCB_WRAP_STR("cf.dat", 6);
    self->instream = Folder_Open_In(folder, cf_file);
    if (!self->instream) {
        ERR_ADD_FRAME(Err_get_error());
        DECREF(self);
        return NULL;
    }

    // Assign.
    self->real_folder = (Folder*)INCREF(folder);

    // Strip directory name from filepaths for old format.
    if (self->format == 1) {
        VArray *files = Hash_Keys(self->records);
        ZombieCharBuf *filename = ZCB_BLANK();
        ZombieCharBuf *folder_name
            = IxFileNames_local_part(Folder_Get_Path(folder), ZCB_BLANK());
        size_t folder_name_len = ZCB_Length(folder_name);

        for (uint32_t i = 0, max = VA_Get_Size(files); i < max; i++) {
            CharBuf *orig = (CharBuf*)VA_Fetch(files, i);
            if (CB_Starts_With(orig, (CharBuf*)folder_name)) {
                Obj *record = Hash_Delete(self->records, (Obj*)orig);
                ZCB_Assign(filename, orig);
                // Skip "<dirname>/" to recover the local file name.
                ZCB_Nip(filename, folder_name_len + sizeof(DIR_SEP) - 1);
                Hash_Store(self->records, (Obj*)filename, (Obj*)record);
            }
        }

        DECREF(files);
    }

    return self;
}
// Merge every non-JSON file in the folder into a single "cf.dat" compound
// file, write offset/length metadata to "cfmeta.json" (via a temp file
// plus rename), then delete the originals.
static void
S_do_consolidate(CompoundFileWriter *self, CompoundFileWriterIVARS *ivars) {
    UNUSED_VAR(self);
    Folder    *folder    = ivars->folder;
    Hash      *metadata  = Hash_new(0);
    Hash      *sub_files = Hash_new(0);
    Vector    *files     = Folder_List(folder, NULL);
    Vector    *merged    = Vec_new(Vec_Get_Size(files));
    String    *cf_file   = (String*)SSTR_WRAP_UTF8("cf.dat", 6);
    OutStream *outstream = Folder_Open_Out(folder, (String*)cf_file);
    bool rename_success;

    if (!outstream) { RETHROW(INCREF(Err_get_error())); }

    // Start metadata.
    Hash_Store_Utf8(metadata, "files", 5, INCREF(sub_files));
    Hash_Store_Utf8(metadata, "format", 6,
                    (Obj*)Str_newf("%i32", CFWriter_current_file_format));

    // Sort for a deterministic merge order.
    Vec_Sort(files);
    for (uint32_t i = 0, max = Vec_Get_Size(files); i < max; i++) {
        String *infilename = (String*)Vec_Fetch(files, i);

        // JSON files stay outside the compound file.
        if (!Str_Ends_With_Utf8(infilename, ".json", 5)) {
            InStream *instream = Folder_Open_In(folder, infilename);
            Hash *file_data = Hash_new(2);
            int64_t offset, len;

            if (!instream) { RETHROW(INCREF(Err_get_error())); }

            // Absorb the file.
            offset = OutStream_Tell(outstream);
            OutStream_Absorb(outstream, instream);
            len = OutStream_Tell(outstream) - offset;

            // Record offset and length.
            Hash_Store_Utf8(file_data, "offset", 6,
                            (Obj*)Str_newf("%i64", offset));
            Hash_Store_Utf8(file_data, "length", 6,
                            (Obj*)Str_newf("%i64", len));
            Hash_Store(sub_files, infilename, (Obj*)file_data);
            Vec_Push(merged, INCREF(infilename));

            // Add filler NULL bytes so that every sub-file begins on a file
            // position multiple of 8.
            OutStream_Align(outstream, 8);

            InStream_Close(instream);
            DECREF(instream);
        }
    }

    // Write metadata to cfmeta file.  Spew to a temp name and rename so a
    // crash can't leave a half-written cfmeta.json behind.
    String *cfmeta_temp = (String*)SSTR_WRAP_UTF8("cfmeta.json.temp", 16);
    String *cfmeta_file = (String*)SSTR_WRAP_UTF8("cfmeta.json", 11);
    Json_spew_json((Obj*)metadata, (Folder*)ivars->folder, cfmeta_temp);
    rename_success = Folder_Rename(ivars->folder, cfmeta_temp, cfmeta_file);
    if (!rename_success) { RETHROW(INCREF(Err_get_error())); }

    // Clean up.
    OutStream_Close(outstream);
    DECREF(outstream);
    DECREF(files);
    DECREF(metadata);
    // Disabled HashIterator-based variant of the deletion loop below,
    // kept as a reference implementation.
    /*
    HashIterator *iter = HashIter_new(sub_files);
    while (HashIter_Next(iter)) {
        String *merged_file = HashIter_Get_Key(iter);
        if (!Folder_Delete(folder, merged_file)) {
            String *mess = MAKE_MESS("Can't delete '%o'", merged_file);
            DECREF(sub_files);
            Err_throw_mess(ERR, mess);
        }
    }
    DECREF(iter);
    */
    DECREF(sub_files);

    // Delete the originals that were absorbed into cf.dat.
    for (uint32_t i = 0, max = Vec_Get_Size(merged); i < max; i++) {
        String *merged_file = (String*)Vec_Fetch(merged, i);
        if (!Folder_Delete(folder, merged_file)) {
            String *mess = MAKE_MESS("Can't delete '%o'", merged_file);
            DECREF(merged);
            Err_throw_mess(ERR, mess);
        }
    }
    DECREF(merged);
}
// Open a CompoundFileReader over `folder`: parse and validate
// "cfmeta.json", open the shared "cf.dat" InStream, and (for format 1)
// strip the obsolete directory-name prefix from each virtual-file record.
// Returns NULL and sets Err_error on failure.
//
// Fix: the "Unsupported compound file format" message was missing the
// closing paren after the current-format number.
CompoundFileReader*
CFReader_do_open(CompoundFileReader *self, Folder *folder) {
    CompoundFileReaderIVARS *const ivars = CFReader_IVARS(self);
    String *cfmeta_file = (String*)SSTR_WRAP_UTF8("cfmeta.json", 11);
    Hash *metadata = (Hash*)Json_slurp_json((Folder*)folder, cfmeta_file);
    Err *error = NULL;

    Folder_init((Folder*)self, Folder_Get_Path(folder));

    // Parse metadata file.
    if (!metadata || !Hash_Is_A(metadata, HASH)) {
        error = Err_new(Str_newf("Can't read '%o' in '%o'", cfmeta_file,
                                 Folder_Get_Path(folder)));
    }
    else {
        Obj *format = Hash_Fetch_Utf8(metadata, "format", 6);
        ivars->format = format ? (int32_t)Obj_To_I64(format) : 0;
        ivars->records = (Hash*)INCREF(Hash_Fetch_Utf8(metadata, "files", 5));
        if (ivars->format < 1) {
            error = Err_new(Str_newf("Corrupt %o file: Missing or invalid 'format'",
                                     cfmeta_file));
        }
        else if (ivars->format > CFWriter_current_file_format) {
            error = Err_new(Str_newf("Unsupported compound file format: %i32 "
                                     "(current = %i32)", ivars->format,
                                     CFWriter_current_file_format));
        }
        else if (!ivars->records) {
            error = Err_new(Str_newf("Corrupt %o file: missing 'files' key",
                                     cfmeta_file));
        }
    }
    DECREF(metadata);
    if (error) {
        Err_set_error(error);
        DECREF(self);
        return NULL;
    }

    // Open an instream which we'll clone over and over.
    String *cf_file = (String*)SSTR_WRAP_UTF8("cf.dat", 6);
    ivars->instream = Folder_Open_In(folder, cf_file);
    if (!ivars->instream) {
        ERR_ADD_FRAME(Err_get_error());
        DECREF(self);
        return NULL;
    }

    // Assign.
    ivars->real_folder = (Folder*)INCREF(folder);

    // Strip directory name from filepaths for old format.
    if (ivars->format == 1) {
        Vector *files = Hash_Keys(ivars->records);
        String *folder_name = IxFileNames_local_part(Folder_Get_Path(folder));
        size_t folder_name_len = Str_Length(folder_name);

        for (uint32_t i = 0, max = Vec_Get_Size(files); i < max; i++) {
            String *orig = (String*)Vec_Fetch(files, i);
            if (Str_Starts_With(orig, folder_name)) {
                Obj *record = Hash_Delete(ivars->records, orig);
                // Skip "<dirname>/" to recover the local file name.
                size_t offset = folder_name_len + sizeof(CHY_DIR_SEP) - 1;
                size_t len = Str_Length(orig) - offset;
                String *filename = Str_SubString(orig, offset, len);
                Hash_Store(ivars->records, filename, (Obj*)record);
                DECREF(filename);
            }
        }

        DECREF(folder_name);
        DECREF(files);
    }

    return self;
}
/* Initialize a SortCache for `field_num`: open the .ord/.ix/.dat files,
 * derive the unique-value count and ord bit width, validate file lengths,
 * and map the three regions into memory.
 */
SortCache*
SortCache_init(SortCache *self, Schema *schema, Folder *folder,
               Segment *segment, i32_t field_num) {
    CharBuf *field = Seg_Field_Name(segment, field_num);
    CharBuf *seg_name = Seg_Get_Name(segment);
    CharBuf *ord_file = CB_newf("%o/sort-%i32.ord", seg_name, field_num);
    CharBuf *ix_file = CB_newf("%o/sort-%i32.ix", seg_name, field_num);
    CharBuf *dat_file = CB_newf("%o/sort-%i32.dat", seg_name, field_num);
    i64_t ord_len, ix_len, dat_len;

    /* Derive. */
    self->doc_max = Seg_Get_Count(segment);
    self->type = Schema_Fetch_Type(schema, field);
    if (!self->type || !FType_Sortable(self->type)) {
        /* NOTE(review): throwing here leaks ord_file/ix_file/dat_file --
         * confirm whether THROW is treated as fatal in this codebase. */
        THROW("'%o' isn't a sortable field", field);
    }

    /* Open instreams. */
    self->ord_in = Folder_Open_In(folder, ord_file);
    self->ix_in = Folder_Open_In(folder, ix_file);
    self->dat_in = Folder_Open_In(folder, dat_file);
    if (!self->ix_in || !self->dat_in || !self->ord_in) {
        CharBuf *mess = MAKE_MESS("Can't open either %o, %o or %o",
                                  ord_file, ix_file, dat_file);
        DECREF(ord_file);
        DECREF(ix_file);
        DECREF(dat_file);
        Err_throw_mess(mess);
    }
    ord_len = InStream_Length(self->ord_in);
    ix_len = InStream_Length(self->ix_in);
    dat_len = InStream_Length(self->dat_in);

    /* Calculate the number of unique values and derive the ord bit width.
     * The "- 1" suggests the .ix file stores one 8-byte offset per unique
     * value plus a sentinel entry -- verify against the writer. */
    self->num_uniq = (i32_t)(ix_len / 8) - 1;
    self->width = S_calc_width(self->num_uniq);

    /* Validate file lengths: the ord file must be able to hold an ord for
     * every document. */
    {
        double bytes_per_doc = self->width / 8.0;
        double max_ords = ord_len / bytes_per_doc;
        if (max_ords < self->doc_max + 1) {
            THROW("Conflict between ord count max %f64 and doc_max %i32",
                  max_ords, self->doc_max);
        }
    }

    /* Mmap ords, offsets and character data. */
    self->ords = InStream_Buf(self->ord_in, (size_t)ord_len);
    self->offsets = (i64_t*)InStream_Buf(self->ix_in, (size_t)ix_len);
    self->char_data = InStream_Buf(self->dat_in, dat_len);
    {
        char *offs = (char*)self->offsets;
        self->offsets_limit = (i64_t*)(offs + ix_len);
        self->char_data_limit = self->char_data + dat_len;
    }

    DECREF(ord_file);
    DECREF(ix_file);
    DECREF(dat_file);

    return self;
}