/*
 * Append an entry for the most recently seen term to the lexicon index
 * stream (ix_out), and note in ixix_out where that entry begins.
 *
 * self      - writer owning the ix/ixix/dat streams and the last TermInfo.
 * last_text - bytes of the term text to record.
 * last_size - number of bytes in last_text.
 *
 * Entry layout, in write order: term length (C32), term bytes, doc_freq
 * (C32), postings file pointer (C64), an optional skip file pointer (C64),
 * and finally the current position of dat_out (C64).
 */
static void
S_add_last_term_to_ix(LexiconWriter *self, char *last_text,
                      size_t last_size) {
    OutStream *const index_out   = self->ix_out;
    OutStream *const pointer_out = self->ixix_out;
    TermInfo  *const info        = self->last_tinfo;

    /* Remember where this index record starts. */
    OutStream_Write_U64(pointer_out, OutStream_Tell(index_out));

    /* Term text, length-prefixed. */
    OutStream_Write_C32(index_out, last_size);
    OutStream_Write_Bytes(index_out, last_text, last_size);

    /* Document frequency followed by the postings file pointer. */
    OutStream_Write_C32(index_out, info->doc_freq);
    OutStream_Write_C64(index_out, info->post_filepos);

    /* The skip file pointer is only written once doc_freq reaches the
     * writer's skip interval. */
    if (info->doc_freq >= self->skip_interval) {
        OutStream_Write_C64(index_out, info->skip_filepos);
    }

    /* Point at the corresponding position in the main data stream. */
    OutStream_Write_C64(index_out, OutStream_Tell(self->dat_out));

    /* One more term has been added to lexicon.ix. */
    self->ix_count++;
}
// Serialize a String as a C64 byte count followed by the raw bytes
// obtained from Str_Get_Ptr8().
//
// string    - the String to write.
// outstream - destination stream.
void
Freezer_serialize_string(String *string, OutStream *outstream) {
    size_t      byte_count = Str_Get_Size(string);
    const char *bytes      = Str_Get_Ptr8(string);

    // Length prefix first, then the payload.
    OutStream_Write_C64(outstream, byte_count);
    OutStream_Write_Bytes(outstream, bytes, byte_count);
}
// Serialize an arbitrary object by dispatching on the result of Obj_is_a().
//
// The class checks run in the order listed below and the first match wins;
// the order is kept deliberately in case any of the classes are related.
// Throws ERR when no check matches.
//
// obj       - the object to serialize.
// outstream - destination stream.
void
Freezer_serialize(Obj *obj, OutStream *outstream) {
    // Strings, blobs and containers go through the Freezer helpers.
    if (Obj_is_a(obj, STRING)) {
        Freezer_serialize_string((String*)obj, outstream);
        return;
    }
    if (Obj_is_a(obj, BLOB)) {
        Freezer_serialize_blob((Blob*)obj, outstream);
        return;
    }
    if (Obj_is_a(obj, VECTOR)) {
        Freezer_serialize_varray((Vector*)obj, outstream);
        return;
    }
    if (Obj_is_a(obj, HASH)) {
        Freezer_serialize_hash((Hash*)obj, outstream);
        return;
    }

    // Scalars are written inline: C64 for integers, F64 for floats and a
    // single byte for booleans.
    if (Obj_is_a(obj, INTEGER)) {
        int64_t int_val = Int_Get_Value((Integer*)obj);
        OutStream_Write_C64(outstream, (uint64_t)int_val);
        return;
    }
    if (Obj_is_a(obj, FLOAT)) {
        double float_val = Float_Get_Value((Float*)obj);
        OutStream_Write_F64(outstream, float_val);
        return;
    }
    if (Obj_is_a(obj, BOOLEAN)) {
        bool bool_val = Bool_Get_Value((Boolean*)obj);
        OutStream_Write_U8(outstream, (uint8_t)bool_val);
        return;
    }

    // Remaining supported classes delegate to their own Serialize method.
    if (Obj_is_a(obj, QUERY)) {
        Query_Serialize((Query*)obj, outstream);
        return;
    }
    if (Obj_is_a(obj, DOC)) {
        Doc_Serialize((Doc*)obj, outstream);
        return;
    }
    if (Obj_is_a(obj, DOCVECTOR)) {
        DocVec_Serialize((DocVector*)obj, outstream);
        return;
    }
    if (Obj_is_a(obj, TERMVECTOR)) {
        TV_Serialize((TermVector*)obj, outstream);
        return;
    }
    if (Obj_is_a(obj, SIMILARITY)) {
        Sim_Serialize((Similarity*)obj, outstream);
        return;
    }
    if (Obj_is_a(obj, MATCHDOC)) {
        MatchDoc_Serialize((MatchDoc*)obj, outstream);
        return;
    }
    if (Obj_is_a(obj, TOPDOCS)) {
        TopDocs_Serialize((TopDocs*)obj, outstream);
        return;
    }
    if (Obj_is_a(obj, SORTSPEC)) {
        SortSpec_Serialize((SortSpec*)obj, outstream);
        return;
    }
    if (Obj_is_a(obj, SORTRULE)) {
        SortRule_Serialize((SortRule*)obj, outstream);
        return;
    }

    // Nothing matched.
    THROW(ERR, "Don't know how to serialize a %o", Obj_get_class_name(obj));
}
// Write a complete (non-delta) TermInfo record to outstream and make it the
// stepper's current value.
//
// self      - the stepper; supplies skip_interval and holds the tracked
//             value.
// outstream - destination stream.
// value     - must pass CERTIFY as a TERMINFO.
void
MatchTInfoStepper_write_key_frame(MatchTermInfoStepper *self,
                                  OutStream *outstream, Obj *value) {
    TermInfo *info = (TermInfo*)CERTIFY(value, TERMINFO);
    int32_t   freq = TInfo_Get_Doc_Freq(info);

    // Document frequency, then the postings file pointer.
    OutStream_Write_C32(outstream, freq);
    OutStream_Write_C64(outstream, info->post_filepos);

    // The skip file pointer is present only when the document frequency
    // has reached the skip interval.
    if (freq >= self->skip_interval) {
        OutStream_Write_C64(outstream, info->skip_filepos);
    }

    // Copy the record just written into the stepper's own value.
    TInfo_Mimic((TermInfo*)self->value, (Obj*)info);
}
// Write a complete (non-delta) TermInfo record to outstream and make it the
// stepper's current value.
//
// self      - the stepper; its ivars supply skip_interval and the tracked
//             value.
// outstream - destination stream.
// value     - must pass CERTIFY as a TERMINFO.
void
MatchTInfoStepper_Write_Key_Frame_IMP(MatchTermInfoStepper *self,
                                      OutStream *outstream, Obj *value) {
    MatchTermInfoStepperIVARS *const ivars = MatchTInfoStepper_IVARS(self);

    // Type-check the incoming value before touching its members.
    TermInfo *tinfo = (TermInfo*)CERTIFY(value, TERMINFO);
    TermInfoIVARS *const tinfo_ivars = TInfo_IVARS((TermInfo*)value);
    int32_t doc_freq = TInfo_Get_Doc_Freq(tinfo);

    // Document frequency, then the postings file pointer.
    OutStream_Write_C32(outstream, doc_freq);
    OutStream_Write_C64(outstream, tinfo_ivars->post_filepos);

    // The skip file pointer is present only when the document frequency
    // has reached the skip interval.
    if (doc_freq >= ivars->skip_interval) {
        OutStream_Write_C64(outstream, tinfo_ivars->skip_filepos);
    }

    // Copy the record just written into the stepper's own value.
    TInfo_Mimic((TermInfo*)ivars->value, (Obj*)tinfo);
}
// Write a TermInfo record relative to the stepper's current value, then
// make the new record the current value.
//
// Only the postings file pointer is delta-encoded (new minus previous);
// doc_freq and the optional skip file pointer are written as-is.
//
// self      - the stepper; its ivars hold the previous TermInfo and
//             skip_interval.
// outstream - destination stream.
// value     - must pass CERTIFY as a TERMINFO.
void
MatchTInfoStepper_Write_Delta_IMP(MatchTermInfoStepper *self,
                                  OutStream *outstream, Obj *value) {
    MatchTermInfoStepperIVARS *const ivars = MatchTInfoStepper_IVARS(self);
    TermInfo *current  = (TermInfo*)CERTIFY(value, TERMINFO);
    TermInfo *previous = (TermInfo*)ivars->value;
    int32_t   freq     = TInfo_Get_Doc_Freq(current);

    // Distance in the postings file from the previous record.
    int64_t post_delta
        = TInfo_IVARS(current)->post_filepos
          - TInfo_IVARS(previous)->post_filepos;

    // Document frequency, then the postings pointer delta.
    OutStream_Write_C32(outstream, freq);
    OutStream_Write_C64(outstream, post_delta);

    // The skip file pointer is present only when the document frequency
    // has reached the skip interval.
    if (freq >= ivars->skip_interval) {
        OutStream_Write_C64(outstream, TInfo_IVARS(current)->skip_filepos);
    }

    // The record just written becomes the base for the next delta.
    TInfo_Mimic((TermInfo*)ivars->value, (Obj*)current);
}
// Append an index entry for the value currently held by the writer's
// steppers, and note in ixix_out where that entry begins.
//
// Entry layout, in write order: the term stepper's key frame, the tinfo
// stepper's key frame, then the current position of dat_out (C64).
static void
S_add_last_term_to_ix(LexiconWriter *self) {
    OutStream *const index_out = self->ix_out;

    // Remember where this index record starts.
    OutStream_Write_I64(self->ixix_out, OutStream_Tell(index_out));

    // Full key frames for the term, then for its term info.
    TermStepper_Write_Key_Frame(
        self->term_stepper, index_out,
        TermStepper_Get_Value(self->term_stepper));
    TermStepper_Write_Key_Frame(
        self->tinfo_stepper, index_out,
        TermStepper_Get_Value(self->tinfo_stepper));

    // Point at the corresponding position in the main data stream.
    OutStream_Write_C64(index_out, OutStream_Tell(self->dat_out));

    // Count of terms added to the index.
    self->ix_count++;
}
// Serialize a TermVector to target.
//
// Layout, in write order: field name and term text (each via
// Freezer_serialize_string), the number of positions (C64), then one
// (position, start offset, end offset) triple of C32 values per position.
//
// self   - the TermVector to write.
// target - destination stream.
void
TV_Serialize_IMP(TermVector *self, OutStream *target) {
    TermVectorIVARS *const ivars = TV_IVARS(self);
    int32_t *posits = I32Arr_IVARS(ivars->positions)->ints;
    int32_t *starts = I32Arr_IVARS(ivars->start_offsets)->ints;
    // Fix: end offsets must come from end_offsets. They were previously
    // read from start_offsets, so every end offset was written as a copy
    // of the start offset.
    int32_t *ends   = I32Arr_IVARS(ivars->end_offsets)->ints;

    Freezer_serialize_string(ivars->field, target);
    Freezer_serialize_string(ivars->text, target);
    OutStream_Write_C64(target, ivars->num_pos);

    // The three arrays are indexed in parallel, one entry per position.
    for (size_t i = 0; i < ivars->num_pos; i++) {
        OutStream_Write_C32(target, posits[i]);
        OutStream_Write_C32(target, starts[i]);
        OutStream_Write_C32(target, ends[i]);
    }
}
// Write the stored fields of one inverted document to the data stream and
// record the document's starting file pointer in the index stream.
//
// The expected doc id is the index stream's current position divided by 8;
// this function appends one I64 (presumably 8 bytes) to that stream per
// document. Throws ERR if doc_id differs from the expected id or if a
// stored field has an unrecognized primitive type.
//
// self     - the DocWriter.
// inverter - iterated twice: once to count stored fields, once to write
//            them.
// doc_id   - id the caller believes this document has.
void
DocWriter_Add_Inverted_Doc_IMP(DocWriter *self, Inverter *inverter,
                               int32_t doc_id) {
    DocWriterIVARS *const ivars = DocWriter_IVARS(self);

    // S_lazy_init() is called before ivars->ix_out is read, exactly as in
    // the original ordering (presumably it opens the streams on first use).
    OutStream *dat_out = S_lazy_init(self);
    OutStream *ix_out  = ivars->ix_out;

    uint32_t num_stored = 0;
    int64_t  start      = OutStream_Tell(dat_out);
    int64_t  expected   = OutStream_Tell(ix_out) / 8;

    // Verify doc id.
    if (doc_id != expected) {
        THROW(ERR, "Expected doc id %i64 but got %i32", expected, doc_id);
    }

    // First pass: count the stored fields so the count can be written
    // ahead of the field data.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *field_type = Inverter_Get_Type(inverter);
        if (FType_Stored(field_type)) {
            num_stored++;
        }
    }
    OutStream_Write_C32(dat_out, num_stored);

    // Second pass: write each stored field as its name followed by a
    // type-specific encoding of its value.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);

        // Fields not marked as "stored" are skipped.
        if (!FType_Stored(type)) {
            continue;
        }

        String *field = Inverter_Get_Field_Name(inverter);
        Obj    *value = Inverter_Get_Value(inverter);
        Freezer_serialize_string(field, dat_out);

        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT: {
                // Length-prefixed string bytes.
                const char *text = Str_Get_Ptr8((String*)value);
                size_t text_size = Str_Get_Size((String*)value);
                OutStream_Write_C32(dat_out, text_size);
                OutStream_Write_Bytes(dat_out, text, text_size);
                break;
            }
            case FType_BLOB: {
                // Length-prefixed raw bytes.
                char  *blob      = BB_Get_Buf((ByteBuf*)value);
                size_t blob_size = BB_Get_Size((ByteBuf*)value);
                OutStream_Write_C32(dat_out, blob_size);
                OutStream_Write_Bytes(dat_out, blob, blob_size);
                break;
            }
            case FType_INT32: {
                int32_t i32 = Int32_Get_Value((Integer32*)value);
                OutStream_Write_C32(dat_out, i32);
                break;
            }
            case FType_INT64: {
                int64_t i64 = Int64_Get_Value((Integer64*)value);
                OutStream_Write_C64(dat_out, i64);
                break;
            }
            case FType_FLOAT32: {
                float f32 = Float32_Get_Value((Float32*)value);
                OutStream_Write_F32(dat_out, f32);
                break;
            }
            case FType_FLOAT64: {
                double f64 = Float64_Get_Value((Float64*)value);
                OutStream_Write_F64(dat_out, f64);
                break;
            }
            default:
                THROW(ERR, "Unrecognized type: %o", type);
        }
    }

    // Write file pointer: where this document's record began in dat_out.
    OutStream_Write_I64(ix_out, start);
}