/* Pad the stream with NUL bytes until its position is an even multiple of
 * `modulus`, then report the resulting position. */
int64_t
OutStream_Align_IMP(OutStream *self, int64_t modulus) {
    int64_t padding
        = (modulus - (OutStream_Tell(self) % modulus)) % modulus;
    for (; padding > 0; padding--) {
        OutStream_Write_U8(self, 0);
    }
    return OutStream_Tell(self);
}
/* Add an entry for the most recently completed term to lexicon.ix (and a
 * matching file pointer to lexicon.ixix), so the lexicon can be searched
 * by binary lookup. */
static void
S_add_last_term_to_ix(LexiconWriter *self, char *last_text, size_t last_size) {
    TermInfo *const tinfo = self->last_tinfo;

    /* Index-of-index record: where this term starts within lexicon.ix. */
    OutStream_Write_U64(self->ixix_out, OutStream_Tell(self->ix_out));

    /* Index record: term text... */
    OutStream_Write_C32(self->ix_out, last_size);
    OutStream_Write_Bytes(self->ix_out, last_text, last_size);

    /* ... doc_freq and postings file pointer... */
    OutStream_Write_C32(self->ix_out, tinfo->doc_freq);
    OutStream_Write_C64(self->ix_out, tinfo->post_filepos);

    /* ... a skip file pointer, but only for sufficiently common terms... */
    if (tinfo->doc_freq >= self->skip_interval) {
        OutStream_Write_C64(self->ix_out, tinfo->skip_filepos);
    }

    /* ... and a file pointer into the main lexicon file. */
    OutStream_Write_C64(self->ix_out, OutStream_Tell(self->dat_out));

    /* Keep track of how many terms have been added to lexicon.ix. */
    self->ix_count++;
}
/* Flush the in-memory buffer of sort-cache elems as one on-disk "run":
 * sort the buffer, lend it to a freshly created child SortFieldWriter,
 * have the child serialize to the shared temp streams while recording the
 * byte range it occupies, then reclaim the (now spent) buffer and register
 * the child as a run for later merging. */
void
SortFieldWriter_Flush_IMP(SortFieldWriter *self) {
    SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self);

    // Don't add a run unless we have data to put in it.
    if (SortFieldWriter_Buffer_Count(self) == 0) { return; }

    OutStream *const temp_ord_out = ivars->temp_ord_out;
    OutStream *const temp_ix_out  = ivars->temp_ix_out;
    OutStream *const temp_dat_out = ivars->temp_dat_out;

    SortFieldWriter_Sort_Buffer(self);
    SortFieldWriter *run
        = SortFieldWriter_new(ivars->schema, ivars->snapshot, ivars->segment,
                              ivars->polyreader, ivars->field, ivars->counter,
                              ivars->mem_thresh, NULL, NULL, NULL);
    SortFieldWriterIVARS *const run_ivars = SortFieldWriter_IVARS(run);

    // Record stream starts and align each stream on an 8-byte boundary.
    run_ivars->ord_start = OutStream_Align(temp_ord_out, sizeof(int64_t));
    if (ivars->var_width) {
        run_ivars->ix_start = OutStream_Align(temp_ix_out, sizeof(int64_t));
    }
    run_ivars->dat_start = OutStream_Align(temp_dat_out, sizeof(int64_t));

    // Have the run borrow the array of elems.  (Ownership is returned
    // below, before the run outlives this call.)
    run_ivars->buffer   = ivars->buffer;
    run_ivars->buf_max  = ivars->buf_max;
    run_ivars->buf_tick = ivars->buf_tick;
    run_ivars->buf_cap  = ivars->buf_cap;

    // Write files, record stats.
    run_ivars->run_max = (int32_t)Seg_Get_Count(ivars->segment);
    run_ivars->run_cardinality
        = S_write_files(run, temp_ord_out, temp_ix_out, temp_dat_out);

    // Reclaim the buffer from the run and empty it.  Setting buf_tick to
    // buf_max marks every elem as consumed before the clear.
    run_ivars->buffer   = NULL;
    run_ivars->buf_max  = 0;
    run_ivars->buf_tick = 0;
    run_ivars->buf_cap  = 0;
    ivars->buf_tick = ivars->buf_max;
    SortFieldWriter_Clear_Buffer(self);

    // Record stream ends.
    run_ivars->ord_end = OutStream_Tell(temp_ord_out);
    if (ivars->var_width) {
        run_ivars->ix_end = OutStream_Tell(temp_ix_out);
    }
    run_ivars->dat_end = OutStream_Tell(temp_dat_out);

    // Add the run to the array.
    SortFieldWriter_Add_Run(self, (SortExternal*)run);
}
/* Add an entry for the most recently completed term to lexicon.ix, plus a
 * pointer to that entry in lexicon.ixix. */
static void
S_add_last_term_to_ix(LexiconWriter *self) {
    OutStream   *const ix_out = self->ix_out;
    TermStepper *const terms  = self->term_stepper;
    TermStepper *const tinfos = self->tinfo_stepper;

    // Index-of-index record: where this entry starts within lexicon.ix.
    OutStream_Write_I64(self->ixix_out, OutStream_Tell(ix_out));

    // Index record: key frames for the current term and term info, plus a
    // file pointer into the main lexicon file.
    TermStepper_Write_Key_Frame(terms, ix_out, TermStepper_Get_Value(terms));
    TermStepper_Write_Key_Frame(tinfos, ix_out,
                                TermStepper_Get_Value(tinfos));
    OutStream_Write_C64(ix_out, OutStream_Tell(self->dat_out));

    // Bump the tally of terms recorded in lexicon.ix.
    self->ix_count++;
}
/* Flush the in-memory buffer of raw postings as one on-disk "run": sort
 * the buffer, lend it to a child PostingPool, serialize terms and postings
 * to the shared lexicon/postings temp files while recording the byte
 * ranges the run occupies, then reclaim the buffer and register the child
 * as a run for later merging. */
void
PostPool_Flush_IMP(PostingPool *self) {
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);

    // Don't add a run unless we have data to put in it.
    if (PostPool_Buffer_Count(self) == 0) { return; }

    PostingPool *run
        = PostPool_new(ivars->schema, ivars->snapshot, ivars->segment,
                       ivars->polyreader, ivars->field, ivars->lex_writer,
                       ivars->mem_pool, ivars->lex_temp_out,
                       ivars->post_temp_out, ivars->skip_out);
    PostingPoolIVARS *const run_ivars = PostPool_IVARS(run);
    PostingWriter *post_writer
        = (PostingWriter*)RawPostWriter_new(ivars->schema, ivars->snapshot,
                                            ivars->segment,
                                            ivars->polyreader,
                                            ivars->post_temp_out);

    // Borrow the buffer.  (Returned below -- the run only needs the elems
    // while S_write_terms_and_postings drains them.)
    run_ivars->buffer   = ivars->buffer;
    run_ivars->buf_tick = ivars->buf_tick;
    run_ivars->buf_max  = ivars->buf_max;
    run_ivars->buf_cap  = ivars->buf_cap;

    // Write to temp files, bracketing the byte ranges this run occupies.
    // Temp mode must be entered before the lex start position is captured
    // and left only after the end position is captured.
    LexWriter_Enter_Temp_Mode(ivars->lex_writer, ivars->field,
                              ivars->lex_temp_out);
    run_ivars->lex_start  = OutStream_Tell(ivars->lex_temp_out);
    run_ivars->post_start = OutStream_Tell(ivars->post_temp_out);
    PostPool_Sort_Buffer(self);
    S_write_terms_and_postings(run, post_writer, NULL);

    run_ivars->lex_end  = OutStream_Tell(ivars->lex_temp_out);
    run_ivars->post_end = OutStream_Tell(ivars->post_temp_out);
    LexWriter_Leave_Temp_Mode(ivars->lex_writer);

    // Return the buffer and empty it.
    run_ivars->buffer   = NULL;
    run_ivars->buf_tick = 0;
    run_ivars->buf_max  = 0;
    run_ivars->buf_cap  = 0;
    PostPool_Clear_Buffer(self);

    // Add the run to the array.
    PostPool_Add_Run(self, (SortExternal*)run);

    DECREF(post_writer);
}
/* Begin a new term: reset the last-seen doc id and anchor the term's
 * postings at the current position of the postings output stream. */
void
MatchPostWriter_Start_Term_IMP(MatchPostingWriter *self, TermInfo *tinfo) {
    MatchPostingWriterIVARS *const ivars = MatchPostWriter_IVARS(self);
    ivars->last_doc_id = 0;
    TInfo_IVARS(tinfo)->post_filepos = OutStream_Tell(ivars->outstream);
}
/* Append the term-vector data for one inverted document and record a file
 * pointer to it in the index stream. */
void
HLWriter_Add_Inverted_Doc_IMP(HighlightWriter *self, Inverter *inverter,
                              int32_t doc_id) {
    HighlightWriterIVARS *const ivars = HLWriter_IVARS(self);
    OutStream *dat_out = S_lazy_init(self);
    OutStream *ix_out  = ivars->ix_out;

    // Each .ix entry is one 8-byte file pointer, so the entry count tells
    // us which doc id we expect next.
    int32_t next_doc_id = (int32_t)(OutStream_Tell(ix_out) / 8);
    if (doc_id != next_doc_id) {
        THROW(ERR, "Expected doc id %i32 but got %i32", next_doc_id, doc_id);
    }

    // Record where this doc's data begins.
    OutStream_Write_I64(ix_out, OutStream_Tell(dat_out));

    // First pass: count the highlightable fields, then write the tally.
    uint32_t highlightable_count = 0;
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        bool eligible = FType_Is_A(type, FULLTEXTTYPE)
                        && FullTextType_Highlightable((FullTextType*)type);
        if (eligible) { highlightable_count++; }
    }
    OutStream_Write_C32(dat_out, highlightable_count);

    // Second pass: serialize field name plus term-vector blob for each
    // highlightable field.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        bool eligible = FType_Is_A(type, FULLTEXTTYPE)
                        && FullTextType_Highlightable((FullTextType*)type);
        if (eligible) {
            String    *field     = Inverter_Get_Field_Name(inverter);
            Inversion *inversion = Inverter_Get_Inversion(inverter);
            ByteBuf   *tv_buf    = HLWriter_TV_Buf(self, inversion);
            Freezer_serialize_string(field, dat_out);
            Freezer_serialize_bytebuf(tv_buf, dat_out);
            DECREF(tv_buf);
        }
    }
}
/* Append the term-vector data for one inverted document and record a file
 * pointer to it in the index stream. */
void
HLWriter_add_inverted_doc(HighlightWriter *self, Inverter *inverter,
                          i32_t doc_id) {
    OutStream *dat_out = S_lazy_init(self);
    OutStream *ix_out  = self->ix_out;
    i64_t filepos = OutStream_Tell(dat_out);
    u32_t num_highlightable = 0;
    /* One 8-byte pointer per doc in the ix file, so its current length
     * implies the doc id we should be handed next. */
    i32_t expected = (i32_t)(OutStream_Tell(ix_out) / 8);

    /* Verify doc id. */
    if (doc_id != expected)
        THROW("Expected doc id %i32 but got %i32", expected, doc_id);

    /* Write index data. */
    OutStream_Write_U64(ix_out, filepos);

    /* Count, then write number of highlightable fields. */
    Inverter_Iter_Init(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (   OBJ_IS_A(type, FULLTEXTTYPE)
            && FullTextType_Highlightable(type)
        ) {
            num_highlightable++;
        }
    }
    OutStream_Write_C32(dat_out, num_highlightable);

    /* Serialize field name plus term-vector blob for each highlightable
     * field. */
    Inverter_Iter_Init(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (   OBJ_IS_A(type, FULLTEXTTYPE)
            && FullTextType_Highlightable(type)
        ) {
            CharBuf   *field     = Inverter_Get_Field_Name(inverter);
            Inversion *inversion = Inverter_Get_Inversion(inverter);
            ByteBuf   *tv_buf    = HLWriter_TV_Buf(self, inversion);
            CB_Serialize(field, dat_out);
            BB_Serialize(tv_buf, dat_out);
            DECREF(tv_buf);
        }
    }
}
/* Serialize one inverted document's stored fields to the data file and
 * append a pointer to the record in the index file. */
void
DocWriter_add_inverted_doc(DocWriter *self, Inverter *inverter,
                           int32_t doc_id) {
    OutStream *dat_out = S_lazy_init(self);
    OutStream *ix_out  = self->ix_out;
    int64_t record_start = OutStream_Tell(dat_out);

    // One 8-byte pointer per doc in the index file, so its length / 8
    // yields the doc id we should be handed next.
    int64_t want = OutStream_Tell(ix_out) / 8;
    if (doc_id != want) {
        THROW(ERR, "Expected doc id %i64 but got %i32", want, doc_id);
    }

    // First pass: count the stored fields, then write the tally.
    uint32_t stored_count = 0;
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        if (FType_Stored(Inverter_Get_Type(inverter))) {
            stored_count++;
        }
    }
    OutStream_Write_C32(dat_out, stored_count);

    // Second pass: serialize name/value pairs, but only for fields whose
    // type is marked "stored".
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        if (FType_Stored(Inverter_Get_Type(inverter))) {
            CB_Serialize(Inverter_Get_Field_Name(inverter), dat_out);
            Obj_Serialize(Inverter_Get_Value(inverter), dat_out);
        }
    }

    // Record where this doc's data begins.
    OutStream_Write_I64(ix_out, record_start);
}
/* Finalize the document storage files: cap the index, close the streams,
 * and register this writer's metadata with the segment.  A no-op if no
 * documents were ever written (dat_out never initialized). */
void
DocWriter_finish(DocWriter *self) {
    if (self->dat_out == NULL) { return; }

    // Write one final file pointer so that the length of the last record
    // can be derived by subtraction.
    OutStream_Write_I64(self->ix_out, OutStream_Tell(self->dat_out));

    // Close down output streams.
    OutStream_Close(self->dat_out);
    OutStream_Close(self->ix_out);

    Seg_Store_Metadata_Str(self->segment, "documents", 9,
                           (Obj*)DocWriter_Metadata(self));
}
/* Finalize the highlight files: cap the index, close the streams, and
 * register this writer's metadata with the segment.  A no-op if nothing
 * was ever written (dat_out never initialized). */
void
HLWriter_finish(HighlightWriter *self) {
    if (self->dat_out == NULL) { return; }

    /* Write one final file pointer so that the length of the last record
     * can be derived by subtraction. */
    i64_t end = OutStream_Tell(self->dat_out);
    OutStream_Write_U64(self->ix_out, end);

    /* Close down the output streams. */
    OutStream_Close(self->dat_out);
    OutStream_Close(self->ix_out);

    Seg_Store_Metadata_Str(self->segment, "highlight", 9,
                           (Obj*)HLWriter_Metadata(self));
}
/* Finalize the highlight files: cap the index, close the streams, and
 * register this writer's metadata with the segment.  A no-op if nothing
 * was ever written (dat_out never initialized). */
void
HLWriter_Finish_IMP(HighlightWriter *self) {
    HighlightWriterIVARS *const ivars = HLWriter_IVARS(self);
    if (ivars->dat_out == NULL) { return; }

    // Write one final file pointer so that the length of the last record
    // can be derived by subtraction.
    OutStream_Write_I64(ivars->ix_out, OutStream_Tell(ivars->dat_out));

    // Close down the output streams.
    OutStream_Close(ivars->dat_out);
    OutStream_Close(ivars->ix_out);

    Seg_Store_Metadata_Utf8(ivars->segment, "highlight", 9,
                            (Obj*)HLWriter_Metadata(self));
}
/* Copy the full contents of `instream` onto the end of this OutStream,
 * chunk by chunk, through a stack buffer. */
void
OutStream_Absorb_IMP(OutStream *self, InStream *instream) {
    OutStreamIVARS *const ivars = OutStream_IVARS(self);
    char chunk[IO_STREAM_BUF_SIZE];
    int64_t remaining = InStream_Length(instream);

    // Pre-extend the stream to its final size, then shuttle blocks of
    // content through an intermediate buffer.
    //
    // TODO: optimize by utilizing OutStream's buffer directly, while still
    // not flushing too frequently and keeping code complexity under
    // control.
    OutStream_Grow(self, OutStream_Tell(self) + remaining);
    while (remaining) {
        size_t chunk_size = remaining < IO_STREAM_BUF_SIZE
                            ? (size_t)remaining
                            : IO_STREAM_BUF_SIZE;
        InStream_Read_Bytes(instream, chunk, chunk_size);
        SI_write_bytes(self, ivars, chunk, chunk_size);
        remaining -= chunk_size;
    }
}
/* Bulk-copy the highlight records of a source segment into this writer's
 * output, skipping docs marked deleted in doc_map. */
void
HLWriter_Add_Segment_IMP(HighlightWriter *self, SegReader *reader,
                         I32Array *doc_map) {
    HighlightWriterIVARS *const ivars = HLWriter_IVARS(self);
    int32_t doc_max = SegReader_Doc_Max(reader);
    if (doc_max == 0) { return; } // Nothing to copy from an empty segment.

    DefaultHighlightReader *hl_reader
        = (DefaultHighlightReader*)CERTIFY(
              SegReader_Obtain(reader, Class_Get_Name(HIGHLIGHTREADER)),
              DEFAULTHIGHLIGHTREADER);
    OutStream *dat_out = S_lazy_init(self);
    OutStream *ix_out  = ivars->ix_out;
    ByteBuf   *record  = BB_new(0);

    for (int32_t old_doc = 1; old_doc <= doc_max; old_doc++) {
        // Skip deleted docs.  (A NULL doc_map means no deletions.)
        if (doc_map && !I32Arr_Get(doc_map, old_doc)) { continue; }

        // Index entry: start of this doc's data.
        OutStream_Write_I64(ix_out, OutStream_Tell(dat_out));

        // Transfer the serialized record verbatim, then reset the buffer.
        DefHLReader_Read_Record(hl_reader, old_doc, record);
        OutStream_Write_Bytes(dat_out, BB_Get_Buf(record),
                              BB_Get_Size(record));
        BB_Set_Size(record, 0);
    }
    DECREF(record);
}
/* Bulk-copy the stored-document records of a source segment into this
 * writer's output, skipping docs marked deleted in doc_map.
 *
 * Fix: the original called I32Arr_Get(doc_map, i) unconditionally, but a
 * NULL doc_map (the "no deletions" case, which the companion function
 * HLWriter_Add_Segment guards against) would then be dereferenced.  Also
 * reuses the already-computed doc_max instead of calling
 * SegReader_Doc_Max a second time. */
void
DocWriter_Add_Segment_IMP(DocWriter *self, SegReader *reader,
                          I32Array *doc_map) {
    DocWriterIVARS *const ivars = DocWriter_IVARS(self);
    int32_t doc_max = SegReader_Doc_Max(reader);

    if (doc_max == 0) {
        // Bail if the supplied segment is empty.
        return;
    }
    else {
        OutStream *const dat_out = S_lazy_init(self);
        OutStream *const ix_out  = ivars->ix_out;
        ByteBuf *const buffer    = BB_new(0);
        DefaultDocReader *const doc_reader
            = (DefaultDocReader*)CERTIFY(
                  SegReader_Obtain(reader, VTable_Get_Name(DOCREADER)),
                  DEFAULTDOCREADER);

        for (int32_t i = 1; i <= doc_max; i++) {
            // A NULL doc_map means no deletions; otherwise a zero entry
            // marks a deleted doc to skip.
            if (doc_map == NULL || I32Arr_Get(doc_map, i)) {
                int64_t start = OutStream_Tell(dat_out);

                // Copy record over.
                DefDocReader_Read_Record(doc_reader, buffer, i);
                char  *buf  = BB_Get_Buf(buffer);
                size_t size = BB_Get_Size(buffer);
                OutStream_Write_Bytes(dat_out, buf, size);

                // Write file pointer.
                OutStream_Write_I64(ix_out, start);
            }
        }

        DECREF(buffer);
    }
}
/* Bulk-copy the highlight (term vector) records of a source segment into
 * this writer's output, skipping docs marked deleted in doc_map. */
void
HLWriter_add_segment(HighlightWriter *self, SegReader *reader,
                     I32Array *doc_map) {
    i32_t doc_max = SegReader_Doc_Max(reader);

    if (doc_max == 0) {
        /* Bail if the supplied segment is empty. */
        return;
    }
    else {
        DefaultHighlightReader *hl_reader = (DefaultHighlightReader*)
            ASSERT_IS_A(SegReader_Obtain(reader, HIGHLIGHTREADER.name),
                        DEFAULTHIGHLIGHTREADER);
        OutStream *dat_out = S_lazy_init(self);
        OutStream *ix_out  = self->ix_out;
        i32_t      orig;
        ByteBuf   *bb = BB_new(0);

        for (orig = 1; orig <= doc_max; orig++) {
            /* Skip deleted docs.  (A NULL doc_map means no deletions.) */
            if (doc_map && !I32Arr_Get(doc_map, orig))
                continue;

            /* Write file pointer. */
            OutStream_Write_U64(ix_out, OutStream_Tell(dat_out));

            /* Copy the raw record verbatim, then reset the buffer. */
            DefHLReader_Read_Record(hl_reader, orig, bb);
            OutStream_Write_Bytes(dat_out, bb->ptr, bb->size);
            bb->size = 0;
        }
        DECREF(bb);
    }
}
/* Serialize one inverted document to the documents data file and add a
 * corresponding file pointer to the index file.  The record is a count of
 * stored fields followed by name/value pairs; only fields whose FieldType
 * is marked "stored" are included, and each value is encoded according to
 * its primitive type id. */
void
DocWriter_Add_Inverted_Doc_IMP(DocWriter *self, Inverter *inverter,
                               int32_t doc_id) {
    DocWriterIVARS *const ivars = DocWriter_IVARS(self);
    OutStream *dat_out = S_lazy_init(self);
    OutStream *ix_out  = ivars->ix_out;
    uint32_t num_stored = 0;
    int64_t  start = OutStream_Tell(dat_out);
    // One 8-byte pointer per doc in the index file, so its length / 8
    // yields the doc id we expect next.
    int64_t  expected = OutStream_Tell(ix_out) / 8;

    // Verify doc id.
    if (doc_id != expected) {
        THROW(ERR, "Expected doc id %i64 but got %i32", expected, doc_id);
    }

    // Write the number of stored fields.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (FType_Stored(type)) { num_stored++; }
    }
    OutStream_Write_C32(dat_out, num_stored);

    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        // Only store fields marked as "stored".
        FieldType *type = Inverter_Get_Type(inverter);
        if (FType_Stored(type)) {
            String *field = Inverter_Get_Field_Name(inverter);
            Obj    *value = Inverter_Get_Value(inverter);
            Freezer_serialize_string(field, dat_out);
            // Dispatch on the field's primitive type and write the value
            // in the matching on-disk encoding.
            switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
                case FType_TEXT: {
                        const char *buf  = Str_Get_Ptr8((String*)value);
                        size_t      size = Str_Get_Size((String*)value);
                        OutStream_Write_C32(dat_out, size);
                        OutStream_Write_Bytes(dat_out, buf, size);
                        break;
                    }
                case FType_BLOB: {
                        char   *buf  = BB_Get_Buf((ByteBuf*)value);
                        size_t  size = BB_Get_Size((ByteBuf*)value);
                        OutStream_Write_C32(dat_out, size);
                        OutStream_Write_Bytes(dat_out, buf, size);
                        break;
                    }
                case FType_INT32: {
                        int32_t val = Int32_Get_Value((Integer32*)value);
                        OutStream_Write_C32(dat_out, val);
                        break;
                    }
                case FType_INT64: {
                        int64_t val = Int64_Get_Value((Integer64*)value);
                        OutStream_Write_C64(dat_out, val);
                        break;
                    }
                case FType_FLOAT32: {
                        float val = Float32_Get_Value((Float32*)value);
                        OutStream_Write_F32(dat_out, val);
                        break;
                    }
                case FType_FLOAT64: {
                        double val = Float64_Get_Value((Float64*)value);
                        OutStream_Write_F64(dat_out, val);
                        break;
                    }
                default:
                    THROW(ERR, "Unrecognized type: %o", type);
            }
        }
    }

    // Write file pointer.
    OutStream_Write_I64(ix_out, start);
}
/* Stash the current postings-file position in `tinfo` so skip records can
 * point back into the postings stream. */
void
MatchPostWriter_Update_Skip_Info_IMP(MatchPostingWriter *self,
                                     TermInfo *tinfo) {
    MatchPostingWriterIVARS *const ivars = MatchPostWriter_IVARS(self);
    TInfo_IVARS(tinfo)->post_filepos = OutStream_Tell(ivars->outstream);
}
/* Drain the sorted PostingPool, writing one lexicon entry per unique term
 * and one serialized posting per RawPosting.  When `skip_stream` is
 * non-NULL, skip records are emitted every `skip_interval` docs within a
 * term.  Postings arrive pre-sorted by term text, so a change in text
 * marks the boundary at which the previous term's accumulated TermInfo is
 * handed to the LexiconWriter. */
static void
S_write_terms_and_postings(PostingPool *self, PostingWriter *post_writer,
                           OutStream *skip_stream) {
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);
    TermInfo *const tinfo      = TInfo_new(0);
    TermInfo *const skip_tinfo = TInfo_new(0);
    TermInfoIVARS *const tinfo_ivars      = TInfo_IVARS(tinfo);
    TermInfoIVARS *const skip_tinfo_ivars = TInfo_IVARS(skip_tinfo);
    LexiconWriter *const lex_writer   = ivars->lex_writer;
    SkipStepper   *const skip_stepper = ivars->skip_stepper;
    SkipStepperIVARS *const skip_stepper_ivars
        = SkipStepper_IVARS(skip_stepper);
    int32_t last_skip_doc     = 0;
    int64_t last_skip_filepos = 0;
    const int32_t skip_interval
        = Arch_Skip_Interval(Schema_Get_Architecture(ivars->schema));

    // Prime heldover variables.  (The pool must be non-empty: a NULL first
    // fetch fails the CERTIFY.)
    RawPosting *posting
        = (RawPosting*)CERTIFY(PostPool_Fetch(self), RAWPOSTING);
    RawPostingIVARS *post_ivars = RawPost_IVARS(posting);
    CharBuf *last_term_text
        = CB_new_from_trusted_utf8(post_ivars->blob,
                                   post_ivars->content_len);
    const char *last_text_buf  = CB_Get_Ptr8(last_term_text);
    uint32_t    last_text_size = CB_Get_Size(last_term_text);
    SkipStepper_Set_ID_And_Filepos(skip_stepper, 0, 0);

    // Initialize sentinel to be used on the last iter, using an empty
    // string in order to make LexiconWriter Do The Right Thing.  The
    // sentinel lives on the stack (alloca), so it must never be DECREF'd.
    size_t sentinel_size = Class_Get_Obj_Alloc_Size(RAWPOSTING)
                           + 20;  // blob length + cushion
    char empty_string[] = "";
    RawPosting *sentinel = RawPost_new(alloca(sentinel_size), 0, 1,
                                       empty_string, 0);

    while (1) {
        bool same_text_as_last = true;

        if (posting == NULL) {
            // On the last iter, use an empty string to make LexiconWriter
            // DTRT.
            posting = sentinel;
            post_ivars = RawPost_IVARS(posting);
            same_text_as_last = false;
        }
        else {
            // Compare once per iteration against the cached previous term.
            if (post_ivars->content_len != last_text_size
                || memcmp(&post_ivars->blob, last_text_buf,
                          last_text_size) != 0
               ) {
                same_text_as_last = false;
            }
        }

        // If the term text changes, process the last term.
        if (!same_text_as_last) {
            // Hand off to LexiconWriter.
            LexWriter_Add_Term(lex_writer, (Obj*)last_term_text, tinfo);

            // Start each term afresh.
            TInfo_Reset(tinfo);
            PostWriter_Start_Term(post_writer, tinfo);

            // Init skip data in preparation for the next term.
            skip_stepper_ivars->doc_id  = 0;
            skip_stepper_ivars->filepos = tinfo_ivars->post_filepos;
            last_skip_doc     = 0;
            last_skip_filepos = tinfo_ivars->post_filepos;

            // Remember the term_text so we can write string diffs.
            CB_Mimic_Utf8(last_term_text, post_ivars->blob,
                          post_ivars->content_len);
            last_text_buf  = CB_Get_Ptr8(last_term_text);
            last_text_size = CB_Get_Size(last_term_text);
        }

        // Bail on last iter before writing invalid posting data.
        if (posting == sentinel) { break; }

        // Write posting data.
        PostWriter_Write_Posting(post_writer, posting);

        // Doc freq lags by one iter.
        tinfo_ivars->doc_freq++;

        // Write skip data once per skip_interval docs within a term.
        if (skip_stream != NULL
            && same_text_as_last
            && tinfo_ivars->doc_freq % skip_interval == 0
            && tinfo_ivars->doc_freq != 0
           ) {
            // If first skip group, save skip stream pos for term info.
            if (tinfo_ivars->doc_freq == skip_interval) {
                tinfo_ivars->skip_filepos = OutStream_Tell(skip_stream);
            }
            // Write deltas.
            last_skip_doc     = skip_stepper_ivars->doc_id;
            last_skip_filepos = skip_stepper_ivars->filepos;
            skip_stepper_ivars->doc_id = post_ivars->doc_id;
            PostWriter_Update_Skip_Info(post_writer, skip_tinfo);
            skip_stepper_ivars->filepos = skip_tinfo_ivars->post_filepos;
            SkipStepper_Write_Record(skip_stepper, skip_stream,
                                     last_skip_doc, last_skip_filepos);
        }

        // Retrieve the next posting from the sort pool.  NULL here is
        // handled at the top of the loop; post_ivars must not be consulted
        // until posting has been re-checked.
        // DECREF(posting);  // No!! DON'T destroy!!!
        posting = (RawPosting*)PostPool_Fetch(self);
        post_ivars = RawPost_IVARS(posting);
    }

    // Clean up.
    DECREF(last_term_text);
    DECREF(skip_tinfo);
    DECREF(tinfo);
}
/* Merge every non-JSON file in the folder into a single compound file
 * "cf.dat", record each sub-file's offset and length in "cfmeta.json",
 * and delete the originals once the metadata is safely in place. */
static void
S_do_consolidate(CompoundFileWriter *self, CompoundFileWriterIVARS *ivars) {
    UNUSED_VAR(self);
    Folder *folder    = ivars->folder;
    Hash   *metadata  = Hash_new(0);
    Hash   *sub_files = Hash_new(0);
    Vector *files     = Folder_List(folder, NULL);
    Vector *merged    = Vec_new(Vec_Get_Size(files));
    String *cf_file   = (String*)SSTR_WRAP_UTF8("cf.dat", 6);
    OutStream *outstream = Folder_Open_Out(folder, (String*)cf_file);
    bool rename_success;

    if (!outstream) { RETHROW(INCREF(Err_get_error())); }

    // Start metadata.
    Hash_Store_Utf8(metadata, "files", 5, INCREF(sub_files));
    Hash_Store_Utf8(metadata, "format", 6,
                    (Obj*)Str_newf("%i32", CFWriter_current_file_format));

    // Sort file names for deterministic ordering of the compound file.
    Vec_Sort(files);

    for (uint32_t i = 0, max = Vec_Get_Size(files); i < max; i++) {
        String *infilename = (String*)Vec_Fetch(files, i);

        // JSON files are left outside the compound file.
        if (!Str_Ends_With_Utf8(infilename, ".json", 5)) {
            InStream *instream = Folder_Open_In(folder, infilename);
            Hash *file_data = Hash_new(2);
            int64_t offset, len;

            if (!instream) { RETHROW(INCREF(Err_get_error())); }

            // Absorb the file.
            offset = OutStream_Tell(outstream);
            OutStream_Absorb(outstream, instream);
            len = OutStream_Tell(outstream) - offset;

            // Record offset and length.
            Hash_Store_Utf8(file_data, "offset", 6,
                            (Obj*)Str_newf("%i64", offset));
            Hash_Store_Utf8(file_data, "length", 6,
                            (Obj*)Str_newf("%i64", len));
            Hash_Store(sub_files, infilename, (Obj*)file_data);
            Vec_Push(merged, INCREF(infilename));

            // Add filler NULL bytes so that every sub-file begins on a file
            // position multiple of 8.
            OutStream_Align(outstream, 8);

            InStream_Close(instream);
            DECREF(instream);
        }
    }

    // Write metadata to cfmeta file.  Spew to a temp file and rename so a
    // crash mid-write can't leave a truncated cfmeta.json behind.
    String *cfmeta_temp = (String*)SSTR_WRAP_UTF8("cfmeta.json.temp", 16);
    String *cfmeta_file = (String*)SSTR_WRAP_UTF8("cfmeta.json", 11);
    Json_spew_json((Obj*)metadata, (Folder*)ivars->folder, cfmeta_temp);
    rename_success = Folder_Rename(ivars->folder, cfmeta_temp, cfmeta_file);
    if (!rename_success) { RETHROW(INCREF(Err_get_error())); }

    // Clean up.
    OutStream_Close(outstream);
    DECREF(outstream);
    DECREF(files);
    DECREF(metadata);
    DECREF(sub_files);

    // Delete the source files now that their contents live in cf.dat.
    for (uint32_t i = 0, max = Vec_Get_Size(merged); i < max; i++) {
        String *merged_file = (String*)Vec_Fetch(merged, i);
        if (!Folder_Delete(folder, merged_file)) {
            String *mess = MAKE_MESS("Can't delete '%o'", merged_file);
            DECREF(merged);
            Err_throw_mess(ERR, mess);
        }
    }
    DECREF(merged);
}
/* Record the current postings-file position in `tinfo` so skip records
 * can point back into the postings stream. */
void
MatchPostWriter_update_skip_info(MatchPostingWriter *self, TermInfo *tinfo) {
    tinfo->post_filepos = OutStream_Tell(self->outstream);
}
/* Begin a new term: anchor the term's postings at the current output
 * position and reset the last-seen doc id. */
void
MatchPostWriter_start_term(MatchPostingWriter *self, TermInfo *tinfo) {
    tinfo->post_filepos = OutStream_Tell(self->outstream);
    self->last_doc_id = 0;
}
static void S_write_val(Obj *val, int8_t prim_id, OutStream *ix_out, OutStream *dat_out, int64_t dat_start) { if (val) { switch (prim_id & FType_PRIMITIVE_ID_MASK) { case FType_TEXT: { String *string = (String*)val; int64_t dat_pos = OutStream_Tell(dat_out) - dat_start; OutStream_Write_I64(ix_out, dat_pos); OutStream_Write_Bytes(dat_out, Str_Get_Ptr8(string), Str_Get_Size(string)); break; } case FType_BLOB: { Blob *blob = (Blob*)val; int64_t dat_pos = OutStream_Tell(dat_out) - dat_start; OutStream_Write_I64(ix_out, dat_pos); OutStream_Write_Bytes(dat_out, Blob_Get_Buf(blob), Blob_Get_Size(blob)); break; } case FType_INT32: { int32_t i32 = (int32_t)Int_Get_Value((Integer*)val); OutStream_Write_I32(dat_out, i32); break; } case FType_INT64: { int64_t i64 = Int_Get_Value((Integer*)val); OutStream_Write_I64(dat_out, i64); break; } case FType_FLOAT32: { float f32 = (float)Float_Get_Value((Float*)val); OutStream_Write_F32(dat_out, f32); break; } case FType_FLOAT64: { double f64 = Float_Get_Value((Float*)val); OutStream_Write_F64(dat_out, f64); break; } default: THROW(ERR, "Unrecognized primitive id: %i32", (int32_t)prim_id); } } else { switch (prim_id & FType_PRIMITIVE_ID_MASK) { case FType_TEXT: case FType_BLOB: { int64_t dat_pos = OutStream_Tell(dat_out) - dat_start; OutStream_Write_I64(ix_out, dat_pos); } break; case FType_INT32: OutStream_Write_I32(dat_out, 0); break; case FType_INT64: OutStream_Write_I64(dat_out, 0); break; case FType_FLOAT64: OutStream_Write_F64(dat_out, 0.0); break; case FType_FLOAT32: OutStream_Write_F32(dat_out, 0.0f); break; default: THROW(ERR, "Unrecognized primitive id: %i32", (int32_t)prim_id); } } }
/* Write the sort-cache files for one field: unique sorted values to
 * dat_out (with per-value offsets in ix_out for variable-width types) and
 * a bit-packed per-doc ordinal array to ord_out.  Returns the cardinality
 * -- the number of distinct ordinals, including the NULL ordinal when at
 * least one doc lacks a value. */
static int32_t
S_write_files(SortFieldWriter *self, OutStream *ord_out, OutStream *ix_out,
              OutStream *dat_out) {
    SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self);
    int8_t    prim_id   = ivars->prim_id;
    int32_t   doc_max   = (int32_t)Seg_Get_Count(ivars->segment);
    // If fewer values were collected than there are docs, some doc has no
    // value and a NULL ordinal will be needed.
    bool      has_nulls = ivars->count == doc_max ? false : true;
    size_t    size      = (doc_max + 1) * sizeof(int32_t);
    int32_t  *ords      = (int32_t*)MALLOCATE(size);
    int32_t   ord       = 0;
    int64_t   dat_start = OutStream_Tell(dat_out);

    // Assign -1 as a stand-in for the NULL ord.
    for (int32_t i = 0; i <= doc_max; i++) {
        ords[i] = -1;
    }

    // Grab the first item and record its ord.  Add a dummy ord for invalid
    // doc id 0.
    SFWriterElem *elem = (SFWriterElem*)SortFieldWriter_Fetch(self);
    SFWriterElemIVARS *elem_ivars = SFWriterElem_IVARS(elem);
    if (elem_ivars->doc_id > doc_max) {
        THROW(ERR, "doc_id %i32 greater than doc_max %i32",
              elem_ivars->doc_id, doc_max);
    }
    ords[elem_ivars->doc_id] = ord;
    ords[0] = 0;

    // Build array of ords, write non-NULL sorted values.  Elems arrive in
    // sorted order, so a new ordinal is assigned whenever a value compares
    // unequal to its predecessor.
    Obj *last_val = INCREF(elem_ivars->value);
    S_write_val(elem_ivars->value, prim_id, ix_out, dat_out, dat_start);
    DECREF(elem);
    while (NULL != (elem = (SFWriterElem*)SortFieldWriter_Fetch(self))) {
        elem_ivars = SFWriterElem_IVARS(elem);
        if (elem_ivars->value != last_val) {
            int32_t comparison
                = FType_Compare_Values(ivars->type, elem_ivars->value,
                                       last_val);
            if (comparison != 0) {
                ord++;
                S_write_val(elem_ivars->value, prim_id, ix_out, dat_out,
                            dat_start);
            }
            DECREF(last_val);
            last_val = INCREF(elem_ivars->value);
        }
        if (elem_ivars->doc_id > doc_max) {
            THROW(ERR, "doc_id %i32 greater than doc_max %i32",
                  elem_ivars->doc_id, doc_max);
        }
        ords[elem_ivars->doc_id] = ord;
        DECREF(elem);
    }
    DECREF(last_val);

    // If there are NULL values, write one now and record the NULL ord.
    if (has_nulls) {
        S_write_val(NULL, prim_id, ix_out, dat_out, dat_start);
        ord++;
        ivars->null_ord = ord;
    }
    int32_t null_ord = ivars->null_ord;

    // Write one extra file pointer so that we can always derive length.
    if (ivars->var_width) {
        OutStream_Write_I64(ix_out, OutStream_Tell(dat_out) - dat_start);
    }

    // Calculate cardinality and ord width.
    int32_t cardinality = ord + 1;
    ivars->ord_width = S_calc_width(cardinality);
    int32_t ord_width = ivars->ord_width;

    // Write ords, bit-packed at ord_width bits per doc.
    const double BITS_PER_BYTE = 8.0;
    double bytes_per_doc = ord_width / BITS_PER_BYTE;
    double byte_count = ceil((doc_max + 1) * bytes_per_doc);
    char *compressed_ords
        = (char*)CALLOCATE((size_t)byte_count, sizeof(char));
    for (int32_t i = 0; i <= doc_max; i++) {
        // Docs that never received a real ordinal map to the NULL ord.
        int32_t real_ord = ords[i] == -1 ? null_ord : ords[i];
        S_write_ord(compressed_ords, ord_width, i, real_ord);
    }
    OutStream_Write_Bytes(ord_out, compressed_ords, (size_t)byte_count);
    FREEMEM(compressed_ords);

    FREEMEM(ords);
    return cardinality;
}