/**
 * Read one delta-encoded term from `instream` into the stepper's value.
 *
 * The stream supplies two C32 values -- how many leading bytes of the
 * current value are kept, and how many new bytes follow -- and then the new
 * bytes themselves, which are written directly after the kept prefix.
 *
 * Throws ERR when the two lengths cannot be summed in 32 bits, or when the
 * resulting text is not valid UTF-8.
 */
void
TextTermStepper_Read_Delta_IMP(TextTermStepper *self, InStream *instream) {
    TextTermStepperIVARS *const ivars = TextTermStepper_IVARS(self);
    const uint32_t text_overlap     = InStream_Read_C32(instream);
    const uint32_t finish_chars_len = InStream_Read_C32(instream);

    // Both lengths come from the stream.  If their sum wrapped around, the
    // buffer would be grown to the small wrapped size while the read below
    // still targets ptr + text_overlap.
    if (finish_chars_len > UINT32_MAX - text_overlap) {
        THROW(ERR, "Term text length overflow in '%o'",
              InStream_Get_Filename(instream));
    }
    const uint32_t total_text_len = text_overlap + finish_chars_len;

    // Allocate space.
    CharBuf *charbuf = (CharBuf*)ivars->value;
    char *ptr = CB_Grow(charbuf, total_text_len);

    // Set the value text and null-terminate.
    InStream_Read_Bytes(instream, ptr + text_overlap, finish_chars_len);
    CB_Set_Size(charbuf, total_text_len);
    ptr[total_text_len] = '\0';

    // Invalidate string.  This happens before validation because the value
    // buffer has already been overwritten at this point.
    DECREF(ivars->string);
    ivars->string = NULL;

    if (!StrHelp_utf8_valid(ptr, total_text_len)) {
        THROW(ERR, "Invalid UTF-8 sequence in '%o' at byte %i64",
              InStream_Get_Filename(instream),
              InStream_Tell(instream) - finish_chars_len);
    }
}
/*
 * Deserialize a TermVector from `instream`.
 *
 * Reads the field name, the term text, a C32 count, and then that many
 * (position, start offset, end offset) triples of C32 values.  When `self`
 * is NULL a new object is made from the TERMVECTOR vtable.  Returns the
 * value produced by TV_init().
 */
TermVector*
TV_deserialize(TermVector *self, InStream *instream) {
    CharBuf  *field     = (CharBuf*)CB_deserialize(NULL, instream);
    CharBuf  *text      = (CharBuf*)CB_deserialize(NULL, instream);
    u32_t     num_pos   = InStream_Read_C32(instream);
    i32_t    *pos_buf   = MALLOCATE(num_pos, i32_t);
    i32_t    *start_buf = MALLOCATE(num_pos, i32_t);
    i32_t    *end_buf   = MALLOCATE(num_pos, i32_t);
    I32Array *positions;
    I32Array *start_offsets;
    I32Array *end_offsets;
    u32_t     tick;

    /* Each entry is stored as three consecutive C32 values. */
    for (tick = 0; tick < num_pos; tick++) {
        pos_buf[tick]   = InStream_Read_C32(instream);
        start_buf[tick] = InStream_Read_C32(instream);
        end_buf[tick]   = InStream_Read_C32(instream);
    }

    /* Hand the raw buffers over to I32Array wrappers. */
    positions     = I32Arr_new_steal(pos_buf, num_pos);
    start_offsets = I32Arr_new_steal(start_buf, num_pos);
    end_offsets   = I32Arr_new_steal(end_buf, num_pos);

    if (!self) {
        self = (TermVector*)VTable_Make_Obj(&TERMVECTOR);
    }
    self = TV_init(self, field, text, positions, start_offsets, end_offsets);

    /* Drop the local references. */
    DECREF(positions);
    DECREF(start_offsets);
    DECREF(end_offsets);
    DECREF(text);
    DECREF(field);

    return self;
}
/**
 * Read one posting from `instream` into a RawPosting carved out of
 * `mem_pool`.
 *
 * The doc code's low bit set means freq is 1; otherwise freq follows as a
 * C32.  The remaining bits are the doc id delta relative to `last_doc_id`.
 * For each of the `freq` positions, the raw C64 bytes and one boost byte
 * are copied after the term text in the posting's blob.  The allocation is
 * then resized to the number of bytes actually used.
 */
RawPosting*
RichPost_read_raw(RichPosting *self, InStream *instream, int32_t last_doc_id,
                  CharBuf *term_text, MemoryPool *mem_pool) {
    UNUSED_VAR(self);

    char *const    text_buf  = (char*)CB_Get_Ptr8(term_text);
    const size_t   text_size = CB_Get_Size(term_text);
    const uint32_t doc_code  = InStream_Read_C32(instream);
    const uint32_t delta_doc = doc_code >> 1;
    const int32_t  doc_id    = last_doc_id + delta_doc;

    uint32_t freq = 1;
    if (!(doc_code & 1)) {
        freq = InStream_Read_C32(instream);
    }

    // Grab the worst-case amount up front; it is resized below.
    size_t raw_post_bytes = MAX_RAW_POSTING_LEN(text_size, freq);
    void *const allocation = MemPool_Grab(mem_pool, raw_post_bytes);
    RawPosting *const raw_posting
        = RawPost_new(allocation, doc_id, freq, text_buf, text_size);

    char *const aux_start = raw_posting->blob + text_size;
    char *cursor = aux_start;

    // Copy each position followed by its boost byte.
    for (uint32_t remaining = freq; remaining > 0; remaining--) {
        cursor += InStream_Read_Raw_C64(instream, cursor);
        *((uint8_t*)cursor) = InStream_Read_U8(instream);
        cursor++;
    }

    // Resize raw posting memory allocation to what was really written.
    raw_posting->aux_len = cursor - aux_start;
    raw_post_bytes = cursor - (char*)raw_posting;
    MemPool_Resize(mem_pool, raw_posting, raw_post_bytes);

    return raw_posting;
}
/**
 * Deserialize a TermVector from `instream` into `self`.
 *
 * Reads the field name, the term text, a C64 count, and then that many
 * (position, start offset, end offset) triples of C32 values.  Returns
 * `self`.
 */
TermVector*
TV_Deserialize_IMP(TermVector *self, InStream *instream) {
    String *field   = Freezer_read_string(instream);
    String *text    = Freezer_read_string(instream);
    size_t  num_pos = InStream_Read_C64(instream);

    // NOTE(review): num_pos is taken from the stream and is not range-checked
    // before this multiplication -- confirm that cannot wrap.
    const size_t buf_bytes = num_pos * sizeof(int32_t);
    int32_t *pos_buf   = (int32_t*)MALLOCATE(buf_bytes);
    int32_t *start_buf = (int32_t*)MALLOCATE(buf_bytes);
    int32_t *end_buf   = (int32_t*)MALLOCATE(buf_bytes);

    // Each entry is stored as three consecutive C32 values.
    size_t tick = 0;
    while (tick < num_pos) {
        pos_buf[tick]   = InStream_Read_C32(instream);
        start_buf[tick] = InStream_Read_C32(instream);
        end_buf[tick]   = InStream_Read_C32(instream);
        tick++;
    }

    // Hand the raw buffers over to I32Array wrappers.
    I32Array *positions     = I32Arr_new_steal(pos_buf, num_pos);
    I32Array *start_offsets = I32Arr_new_steal(start_buf, num_pos);
    I32Array *end_offsets   = I32Arr_new_steal(end_buf, num_pos);

    TV_init(self, field, text, positions, start_offsets, end_offsets);

    // Drop the local references.
    DECREF(positions);
    DECREF(start_offsets);
    DECREF(end_offsets);
    DECREF(text);
    DECREF(field);

    return self;
}
/**
 * Deserialize a Hash from `instream` into `self`.
 *
 * The stream holds the total entry count, the number of entries whose keys
 * are CharBufs, the CharBuf-keyed entries (C32 length, key bytes, thawed
 * value), and finally the remaining entries (thawed key, thawed value).
 * Returns `self`.
 */
Hash*
Hash_deserialize(Hash *self, InStream *instream) {
    const uint32_t size         = InStream_Read_C32(instream);
    const uint32_t num_charbufs = InStream_Read_C32(instream);
    // NOTE(review): assumes num_charbufs <= size; nothing here checks it.
    const uint32_t num_other    = size - num_charbufs;

    // One scratch CharBuf is reused for every CharBuf key.
    CharBuf *key = NULL;
    if (num_charbufs) {
        key = CB_new(0);
    }

    Hash_init(self, size);

    // Read key-value pairs with CharBuf keys.
    for (uint32_t i = 0; i < num_charbufs; i++) {
        uint32_t len = InStream_Read_C32(instream);
        char *key_buf = CB_Grow(key, len);
        InStream_Read_Bytes(instream, key_buf, len);
        key_buf[len] = '\0';
        CB_Set_Size(key, len);
        Hash_Store(self, (Obj*)key, THAW(instream));
    }
    DECREF(key);

    // Read remaining key/value pairs.
    for (uint32_t i = 0; i < num_other; i++) {
        Obj *k = THAW(instream);
        Hash_Store(self, k, THAW(instream));
        DECREF(k);
    }

    return self;
}
/**
 * Read one delta-encoded term from `instream` into `self->value`.
 *
 * The stream supplies two C32 values -- how many leading bytes of the
 * current value are kept, and how many new bytes follow -- and then the new
 * bytes themselves, which are written directly after the kept prefix.  A
 * CharBuf is created on demand when `self->value` is NULL.
 *
 * Throws ERR when the two lengths cannot be summed in 32 bits, or when the
 * resulting text is not valid UTF-8.
 */
void
TextTermStepper_read_delta(TextTermStepper *self, InStream *instream) {
    const uint32_t text_overlap     = InStream_Read_C32(instream);
    const uint32_t finish_chars_len = InStream_Read_C32(instream);
    uint32_t total_text_len;
    CharBuf *value;
    char *ptr;

    // Both lengths come from the stream.  If their sum wrapped around, the
    // buffer would be grown to the small wrapped size while the read below
    // still targets ptr + text_overlap.
    if (finish_chars_len > UINT32_MAX - text_overlap) {
        THROW(ERR, "Term text length overflow in '%o'",
              InStream_Get_Filename(instream));
    }
    total_text_len = text_overlap + finish_chars_len;

    // Allocate space.
    if (self->value == NULL) {
        self->value = (Obj*)CB_new(total_text_len);
    }
    value = (CharBuf*)self->value;
    ptr   = CB_Grow(value, total_text_len);

    // Set the value text and null-terminate.  Terminating before validation
    // keeps the buffer terminated even when the check below throws.
    InStream_Read_Bytes(instream, ptr + text_overlap, finish_chars_len);
    CB_Set_Size(value, total_text_len);
    ptr[total_text_len] = '\0';

    if (!StrHelp_utf8_valid(ptr, total_text_len)) {
        THROW(ERR, "Invalid UTF-8 sequence in '%o' at byte %i64",
              InStream_Get_Filename(instream),
              InStream_Tell(instream) - finish_chars_len);
    }
}
/*
 * Read one posting record from `instream` into `self`.
 *
 * The doc code's upper bits are added to self->doc_id.  When its low bit is
 * set the freq is 1; otherwise the freq follows as a C32.
 */
void
MatchPost_read_record(MatchPosting *self, InStream *instream) {
    const u32_t doc_code = InStream_Read_C32(instream);
    const u32_t freq_is_one = doc_code & 1;

    /* Apply the doc id delta. */
    self->doc_id += doc_code >> 1;

    /* Retrieve freq. */
    if (!freq_is_one) {
        self->freq = InStream_Read_C32(instream);
    }
    else {
        self->freq = 1;
    }
}
/**
 * Deserialize a Vector from `instream` into `array`.
 *
 * After the C32 size, the stream holds the index of the first stored
 * element, and after each thawed element a C32 increment to the next
 * index.  Reading stops once the index reaches `size`.  The vector is
 * finally resized to `size`.  Returns `array`.
 */
Vector*
Freezer_deserialize_varray(Vector *array, InStream *instream) {
    const uint32_t size = InStream_Read_C32(instream);
    Vec_init(array, size);

    uint32_t tick = InStream_Read_C32(instream);
    while (tick < size) {
        Obj *elem = THAW(instream);
        Vec_Store(array, tick, elem);
        // The increment to the next index follows each element.
        tick += InStream_Read_C32(instream);
    }

    Vec_Resize(array, size);
    return array;
}
/**
 * Read one posting record from `instream` into the posting's ivars.
 *
 * The doc code's upper bits are added to doc_id.  When its low bit is set
 * the freq is 1; otherwise the freq follows as a C32.
 */
void
MatchPost_Read_Record_IMP(MatchPosting *self, InStream *instream) {
    MatchPostingIVARS *const ivars = MatchPost_IVARS(self);
    const uint32_t doc_code = InStream_Read_C32(instream);

    // Apply the doc id delta.
    ivars->doc_id += doc_code >> 1;

    // An odd doc code means freq 1; otherwise freq is read from the stream.
    if ((doc_code & 1) == 0) {
        ivars->freq = InStream_Read_C32(instream);
    }
    else {
        ivars->freq = 1;
    }
}
/**
 * Deserialize a Doc from `instream` into `self`: first the fields hash,
 * then the doc id as a C32.  Returns `self`.
 */
Doc*
Doc_Deserialize_IMP(Doc *self, InStream *instream) {
    DocIVARS *const doc_ivars = Doc_IVARS(self);

    // The two reads consume the stream sequentially; keep this order.
    doc_ivars->fields = Freezer_read_hash(instream);
    doc_ivars->doc_id = InStream_Read_C32(instream);

    return self;
}
/**
 * Deserialize a TopDocs from `instream` into `self`: first the array of
 * match docs, then the total hit count as a C32.  Returns `self`.
 */
TopDocs*
TopDocs_Deserialize_IMP(TopDocs *self, InStream *instream) {
    TopDocsIVARS *const td_ivars = TopDocs_IVARS(self);

    // The two reads consume the stream sequentially; keep this order.
    td_ivars->match_docs = Freezer_read_varray(instream);
    td_ivars->total_hits = InStream_Read_C32(instream);

    return self;
}
/**
 * Deserialize a Blob from `instream` into `blob`.
 *
 * Reads a C32 byte count, then that many bytes into a freshly allocated
 * buffer which is passed to Blob_init_steal().  Returns that call's result.
 */
Blob*
Freezer_deserialize_blob(Blob *blob, InStream *instream) {
    const size_t num_bytes = InStream_Read_C32(instream);
    char *bytes = (char*)MALLOCATE(num_bytes);

    InStream_Read_Bytes(instream, bytes, num_bytes);

    return Blob_init_steal(blob, bytes, num_bytes);
}
/*
 * Read one posting from `instream` into a RawPosting carved out of
 * `mem_pool`.
 *
 * The doc code's low bit set means freq is 1; otherwise freq follows as a
 * C32.  The remaining bits are the doc id delta relative to `last_doc_id`.
 */
RawPosting*
MatchPost_read_raw(MatchPosting *self, InStream *instream, i32_t last_doc_id,
                   CharBuf *term_text, MemoryPool *mem_pool) {
    const size_t text_len  = CB_Get_Size(term_text);
    const u32_t  doc_code  = InStream_Read_C32(instream);
    const u32_t  doc_delta = doc_code >> 1;
    const i32_t  doc_id    = last_doc_id + doc_delta;
    u32_t        freq      = 1;
    size_t       alloc_len;
    void        *slot;

    UNUSED_VAR(self);

    /* An even doc code means the freq is stored explicitly. */
    if (!(doc_code & 1)) {
        freq = InStream_Read_C32(instream);
    }

    alloc_len = MAX_RAW_POSTING_LEN(text_len);
    slot      = MemPool_Grab(mem_pool, alloc_len);

    return RawPost_new(slot, doc_id, freq, term_text->ptr, text_len);
}
/**
 * Deserialize a ProximityQuery from `instream` into `self`.
 *
 * Reads, in order: the boost (F32), the field name, the terms array, and
 * the `within` value (C32), then passes them to S_do_init().
 */
ProximityQuery*
ProximityQuery_Deserialize_IMP(ProximityQuery *self, InStream *instream) {
    // These reads consume the stream sequentially; the order matters.
    const float    boost_val  = InStream_Read_F32(instream);
    String        *field_name = Freezer_read_string(instream);
    VArray        *term_list  = Freezer_read_varray(instream);
    const uint32_t within_val = InStream_Read_C32(instream);

    return S_do_init(self, field_name, term_list, boost_val, within_val);
}
/*
 * Deserialize a TopDocs from `instream`: first the array of match docs,
 * then the total hit count as a C32.  When `self` is NULL a new object is
 * made from the TOPDOCS vtable.  Returns the populated object.
 */
TopDocs*
TopDocs_deserialize(TopDocs *self, InStream *instream) {
    if (self == NULL) {
        self = (TopDocs*)VTable_Make_Obj(&TOPDOCS);
    }

    /* The two reads consume the stream sequentially; keep this order. */
    self->match_docs = VA_deserialize(NULL, instream);
    self->total_hits = InStream_Read_C32(instream);

    return self;
}
/**
 * Deserialize a Hash from `instream` into `hash`.
 *
 * The stream holds a C32 entry count, then for each entry a C32 key length,
 * the key bytes, and a thawed value.  Returns `hash`.
 */
Hash*
Freezer_deserialize_hash(Hash *hash, InStream *instream) {
    uint32_t size = InStream_Read_C32(instream);

    Hash_init(hash, size);

    while (size--) {
        uint32_t len = InStream_Read_C32(instream);

        // Widen before adding the terminator byte: in 32-bit arithmetic
        // `len + 1` wraps to 0 when len is UINT32_MAX, which would leave the
        // buffer far smaller than the read that follows.
        char *key_buf = (char*)MALLOCATE((size_t)len + 1);
        InStream_Read_Bytes(instream, key_buf, len);
        key_buf[len] = '\0';

        // The String takes over key_buf.
        String *key = Str_new_steal_utf8(key_buf, len);
        Hash_Store(hash, key, THAW(instream));
        DECREF(key);
    }

    return hash;
}
/**
 * Read one posting record from `instream` into `self`.
 *
 * Decodes the doc id delta and freq, then reads `freq` entries, each a C32
 * position delta followed by one boost byte.  Positions are accumulated
 * into self->prox, and each boost byte is decoded through
 * self->norm_decoder into self->prox_boosts.  self->weight is set to the
 * mean of the decoded boosts.
 */
void
RichPost_read_record(RichPosting *self, InStream *instream) {
    float *const norm_decoder = self->norm_decoder;
    uint32_t  doc_code;
    uint32_t  num_prox = 0;
    uint32_t  position = 0;
    uint32_t *positions;
    float    *prox_boosts;
    float     aggregate_weight = 0.0;

    // Decode delta doc.
    doc_code = InStream_Read_C32(instream);
    self->doc_id += doc_code >> 1;

    // If the stored num was odd, the freq is 1.
    if (doc_code & 1) {
        self->freq = 1;
    }
    // Otherwise, freq was stored as a C32.
    else {
        self->freq = InStream_Read_C32(instream);
    }

    // Grow both per-position buffers when this record needs more room.
    num_prox = self->freq;
    if (num_prox > self->prox_cap) {
        self->prox
            = (uint32_t*)REALLOCATE(self->prox, num_prox * sizeof(uint32_t));
        self->prox_boosts
            = (float*)REALLOCATE(self->prox_boosts, num_prox * sizeof(float));
        // Record the new capacity.  Without this the check above keeps
        // comparing against a stale value and reallocates on every record
        // whose freq exceeds it.
        self->prox_cap = num_prox;
    }
    positions   = self->prox;
    prox_boosts = self->prox_boosts;

    // Read positions, aggregate per-position boost byte into weight.
    while (num_prox--) {
        position += InStream_Read_C32(instream);
        *positions++ = position;
        *prox_boosts = norm_decoder[ InStream_Read_U8(instream) ];
        aggregate_weight += *prox_boosts;
        prox_boosts++;
    }
    self->weight = aggregate_weight / self->freq;
}
/**
 * Read one posting from `instream` into a RawPosting carved out of
 * `mem_pool`.
 *
 * The doc code's low bit set means freq is 1; otherwise freq follows as a
 * C32.  The remaining bits are the doc id delta relative to `last_doc_id`.
 */
RawPosting*
MatchPost_Read_Raw_IMP(MatchPosting *self, InStream *instream,
                       int32_t last_doc_id, String *term_text,
                       MemoryPool *mem_pool) {
    UNUSED_VAR(self);

    const char *const term_ptr = Str_Get_Ptr8(term_text);
    const size_t      term_len = Str_Get_Size(term_text);

    const uint32_t doc_code  = InStream_Read_C32(instream);
    const uint32_t doc_delta = doc_code >> 1;
    const int32_t  doc_id    = last_doc_id + doc_delta;

    // An even doc code means the freq is stored explicitly.
    uint32_t freq = 1;
    if (!(doc_code & 1)) {
        freq = InStream_Read_C32(instream);
    }

    const size_t base_size = VTable_Get_Obj_Alloc_Size(RAWPOSTING);
    size_t alloc_len = MAX_RAW_POSTING_LEN(base_size, term_len);
    void *const slot = MemPool_Grab(mem_pool, alloc_len);

    return RawPost_new(slot, doc_id, freq, term_ptr, term_len);
}
VArray* VA_deserialize(VArray *self, InStream *instream) { u32_t tick; u32_t size = InStream_Read_C32(instream); if (self) { self->size = size; self->cap = size + 1; self->elems = CALLOCATE(self->cap, Obj*); } else self = VA_new(size);
/*
 * Deserialize a MatchDoc from `instream`: the doc id (C32), the score, and
 * a one-byte flag.  Only when the flag is non-zero is a values array read
 * and assigned.  When `self` is NULL a new object is made from the MATCHDOC
 * vtable.  Returns the populated object.
 */
MatchDoc*
MatchDoc_deserialize(MatchDoc *self, InStream *instream) {
    if (self == NULL) {
        self = (MatchDoc*)VTable_Make_Obj(&MATCHDOC);
    }

    self->doc_id = InStream_Read_C32(instream);
    self->score  = InStream_Read_Float(instream);

    /* A non-zero flag byte announces a serialized values array. */
    if (InStream_Read_U8(instream) != 0) {
        self->values = VA_deserialize(NULL, instream);
    }

    return self;
}
/**
 * Read one posting from `instream` into a RawPosting carved out of
 * `mem_pool`.
 *
 * The doc code's low bit set means freq is 1; otherwise freq follows as a
 * C32.  The remaining bits are the doc id delta relative to `last_doc_id`.
 * One field boost byte and then the raw C64 bytes of each of the `freq`
 * positions are copied after the term text in the posting's blob.  The
 * allocation is then resized to the number of bytes actually used.
 */
RawPosting*
ScorePost_Read_Raw_IMP(ScorePosting *self, InStream *instream,
                       int32_t last_doc_id, String *term_text,
                       MemoryPool *mem_pool) {
    UNUSED_VAR(self);

    const char *const term_ptr = Str_Get_Ptr8(term_text);
    const size_t      term_len = Str_Get_Size(term_text);

    const uint32_t doc_code  = InStream_Read_C32(instream);
    const uint32_t doc_delta = doc_code >> 1;
    const int32_t  doc_id    = last_doc_id + doc_delta;

    // An even doc code means the freq is stored explicitly.
    uint32_t freq = 1;
    if (!(doc_code & 1)) {
        freq = InStream_Read_C32(instream);
    }

    // Grab the worst-case amount up front; it is resized below.
    const size_t base_size = Class_Get_Obj_Alloc_Size(RAWPOSTING);
    size_t alloc_len = MAX_RAW_POSTING_LEN(base_size, term_len, freq);
    void *const slot = MemPool_Grab(mem_pool, alloc_len);
    RawPosting *const raw_posting
        = RawPost_new(slot, doc_id, freq, term_ptr, term_len);
    RawPostingIVARS *const raw_ivars = RawPost_IVARS(raw_posting);

    char *const aux_start = raw_ivars->blob + term_len;
    char *cursor = aux_start;

    // Field_boost.
    *((uint8_t*)cursor) = InStream_Read_U8(instream);
    cursor++;

    // Read positions.
    for (uint32_t remaining = freq; remaining > 0; remaining--) {
        cursor += InStream_Read_Raw_C64(instream, cursor);
    }

    // Resize raw posting memory allocation to what was really written.
    raw_ivars->aux_len = cursor - aux_start;
    alloc_len = cursor - (char*)raw_posting;
    MemPool_Resize(mem_pool, raw_posting, alloc_len);

    return raw_posting;
}
/**
 * Deserialize a String from `instream` into `string`.
 *
 * Reads a C32 byte count and then that many bytes into a freshly allocated,
 * null-terminated buffer.  Throws ERR when the count equals SIZE_MAX or
 * when the bytes are not valid UTF-8; otherwise the buffer is handed to
 * Str_init_steal_trusted_utf8() and that call's result is returned.
 */
String*
Freezer_deserialize_string(String *string, InStream *instream) {
    const size_t len = InStream_Read_C32(instream);

    // One extra byte is needed for the terminator below.
    if (len == SIZE_MAX) {
        THROW(ERR, "Can't deserialize SIZE_MAX bytes");
    }

    char *utf8 = (char*)MALLOCATE(len + 1);
    InStream_Read_Bytes(instream, utf8, len);
    utf8[len] = '\0';

    const int is_valid = !!StrHelp_utf8_valid(utf8, len);
    if (!is_valid) {
        THROW(ERR, "Attempt to deserialize invalid UTF-8");
    }

    return Str_init_steal_trusted_utf8(string, utf8, len);
}
/**
 * Deserialize a ProximityCompiler from `instream`.
 *
 * The superclass Deserialize method runs first.  Then four F32 values (idf,
 * raw weight, query norm factor, normalized weight) and one C32 (`within`)
 * are read into the ivars.  Returns the object the superclass method
 * returned.
 */
ProximityCompiler*
ProximityCompiler_Deserialize_IMP(ProximityCompiler *self,
                                  InStream *instream) {
    // Let the parent class consume its part of the stream first.
    ProximityCompiler_Deserialize_t parent_deserialize
        = SUPER_METHOD_PTR(PROXIMITYCOMPILER,
                           LUCY_ProximityCompiler_Deserialize);
    self = parent_deserialize(self, instream);

    // These reads consume the stream sequentially; the order matters.
    ProximityCompilerIVARS *const pc_ivars = ProximityCompiler_IVARS(self);
    pc_ivars->idf               = InStream_Read_F32(instream);
    pc_ivars->raw_weight        = InStream_Read_F32(instream);
    pc_ivars->query_norm_factor = InStream_Read_F32(instream);
    pc_ivars->normalized_weight = InStream_Read_F32(instream);
    pc_ivars->within            = InStream_Read_C32(instream);

    return self;
}
/**
 * Deserialize a SortSpec from `instream` into `self`.
 *
 * Reads a C32 rule count, deserializes that many SortRules into a VArray,
 * and initializes `self` with it.  Returns `self`.
 */
SortSpec*
SortSpec_deserialize(SortSpec *self, InStream *instream) {
    const uint32_t num_rules = InStream_Read_C32(instream);
    VArray *rules = VA_new(num_rules);

    // Add rules: each one is deserialized into a blank SortRule.
    uint32_t remaining = num_rules;
    while (remaining > 0) {
        SortRule *blank = (SortRule*)VTable_Make_Obj(SORTRULE);
        Obj *rule = (Obj*)SortRule_Deserialize(blank, instream);
        VA_Push(rules, rule);
        remaining--;
    }

    SortSpec_init(self, rules);
    DECREF(rules);

    return self;
}
/**
 * Load the entry selected by ivars->tick.
 *
 * Decodes a big-endian 64-bit file offset from ivars->offsets + tick, seeks
 * ix_in there, has the term stepper read a key frame, and then fills the
 * TermInfo with doc freq, postings file position, skip file position (only
 * stored when doc freq is at least skip_interval; 0 otherwise), and lexicon
 * file position.
 */
static void
S_read_entry(LexIndex *self) {
    LexIndexIVARS *const ivars = LexIndex_IVARS(self);
    InStream *const index_in   = ivars->ix_in;
    TermInfo *const tinfo      = ivars->tinfo;

    // Jump to this entry.
    const int64_t entry_offset
        = (int64_t)NumUtil_decode_bigend_u64(ivars->offsets + ivars->tick);
    InStream_Seek(index_in, entry_offset);

    // The term itself comes first.
    TermStepper_Read_Key_Frame(ivars->term_stepper, index_in);

    // Then the term info fields, in stream order.
    int32_t doc_freq = InStream_Read_C32(index_in);
    TInfo_Set_Doc_Freq(tinfo, doc_freq);
    TInfo_Set_Post_FilePos(tinfo, InStream_Read_C64(index_in));

    // A skip pointer is only present for sufficiently frequent terms.
    int64_t skip_filepos = 0;
    if (doc_freq >= ivars->skip_interval) {
        skip_filepos = InStream_Read_C64(index_in);
    }
    TInfo_Set_Skip_FilePos(tinfo, skip_filepos);

    TInfo_Set_Lex_FilePos(tinfo, InStream_Read_C64(index_in));
}
/**
 * Read one delta-encoded TermInfo from `instream` into self->value.
 *
 * The doc freq (C32) is stored outright, the postings file position is
 * advanced by a C64 delta, and a C64 skip file position is read only when
 * doc freq is at least self->skip_interval; otherwise it is set to 0.
 */
void
MatchTInfoStepper_read_delta(MatchTermInfoStepper *self,
                             InStream *instream) {
    TermInfo *const info = (TermInfo*)self->value;

    // Read doc freq.
    info->doc_freq = InStream_Read_C32(instream);

    // Adjust postings file pointer.
    info->post_filepos += InStream_Read_C64(instream);

    // Maybe read skip pointer.
    if (info->doc_freq < self->skip_interval) {
        info->skip_filepos = 0;
    }
    else {
        info->skip_filepos = InStream_Read_C64(instream);
    }
}
/**
 * Read one delta-encoded TermInfo from `instream` into the stepper's value.
 *
 * The doc freq (C32) is stored outright, the postings file position is
 * advanced by a C64 delta, and a C64 skip file position is read only when
 * doc freq is at least the stepper's skip_interval; otherwise it is set
 * to 0.
 */
void
MatchTInfoStepper_Read_Delta_IMP(MatchTermInfoStepper *self,
                                 InStream *instream) {
    MatchTermInfoStepperIVARS *const ivars = MatchTInfoStepper_IVARS(self);
    TermInfoIVARS *const info = TInfo_IVARS((TermInfo*)ivars->value);

    // Read doc freq.
    info->doc_freq = InStream_Read_C32(instream);

    // Adjust postings file pointer.
    info->post_filepos += InStream_Read_C64(instream);

    // Maybe read skip pointer.
    if (info->doc_freq < ivars->skip_interval) {
        info->skip_filepos = 0;
    }
    else {
        info->skip_filepos = InStream_Read_C64(instream);
    }
}
/**
 * Build a DocVector for `doc_id`.
 *
 * The index stream is read at byte offset doc_id * 8 to obtain a 64-bit
 * position in the data stream.  There, a C32 field count is followed by
 * that many (field name, field buffer) pairs, each of which is added to the
 * returned DocVector.
 */
DocVector*
DefHLReader_fetch_doc_vec(DefaultHighlightReader *self, int32_t doc_id) {
    DocVector *doc_vec = DocVec_new();
    int64_t    file_pos;
    uint32_t   num_fields;

    // Widen before multiplying: `doc_id * 8` evaluated in 32-bit signed
    // arithmetic overflows once doc_id reaches 2^28.
    InStream_Seek(self->ix_in, (int64_t)doc_id * 8);
    file_pos = InStream_Read_I64(self->ix_in);
    InStream_Seek(self->dat_in, file_pos);

    num_fields = InStream_Read_C32(self->dat_in);
    while (num_fields--) {
        CharBuf *field     = CB_deserialize(NULL, self->dat_in);
        ByteBuf *field_buf = BB_deserialize(NULL, self->dat_in);
        DocVec_Add_Field_Buf(doc_vec, field, field_buf);
        // Drop the local references.
        DECREF(field_buf);
        DECREF(field);
    }

    return doc_vec;
}
/**
 * Build a DocVector for `doc_id`.
 *
 * The index stream is read at byte offset doc_id * 8 to obtain a 64-bit
 * position in the data stream.  There, a C32 field count is followed by
 * that many (field name, field buffer) pairs, each of which is added to the
 * returned DocVector.
 */
DocVector*
DefHLReader_fetch_doc_vec(DefaultHighlightReader *self, int32_t doc_id) {
    InStream *const ix_in  = self->ix_in;
    InStream *const dat_in = self->dat_in;
    DocVector *doc_vec = DocVec_new();

    // Widen before multiplying: `doc_id * 8` evaluated in 32-bit signed
    // arithmetic overflows once doc_id reaches 2^28.
    InStream_Seek(ix_in, (int64_t)doc_id * 8);
    int64_t file_pos = InStream_Read_I64(ix_in);
    InStream_Seek(dat_in, file_pos);

    uint32_t num_fields = InStream_Read_C32(dat_in);
    while (num_fields--) {
        CharBuf *field
            = CB_Deserialize((CharBuf*)VTable_Make_Obj(CHARBUF), dat_in);
        ByteBuf *field_buf
            = BB_Deserialize((ByteBuf*)VTable_Make_Obj(BYTEBUF), dat_in);
        DocVec_Add_Field_Buf(doc_vec, field, field_buf);
        // Drop the local references.
        DECREF(field_buf);
        DECREF(field);
    }

    return doc_vec;
}
/**
 * Fetch the stored fields of `doc_id` and return them as a new HitDoc.
 *
 * The index stream is read at byte offset doc_id * 8 to obtain a 64-bit
 * position in the data stream.  There, a C32 field count is followed by
 * that many fields, each consisting of a C32 name length, the name bytes,
 * and a value whose encoding depends on the primitive id of the field's
 * FieldType as found in the schema.
 *
 * Throws ERR for a field whose primitive type is not handled here.
 */
HitDoc*
DefDocReader_Fetch_Doc_IMP(DefaultDocReader *self, int32_t doc_id) {
    DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self);
    Schema   *const schema = ivars->schema;
    InStream *const dat_in = ivars->dat_in;
    InStream *const ix_in  = ivars->ix_in;
    Hash     *const fields = Hash_new(1);
    int64_t   start;
    uint32_t  num_fields;
    uint32_t  field_name_cap = 31;
    char     *field_name = (char*)MALLOCATE((size_t)field_name_cap + 1);

    // Get data file pointer from index, read number of fields.
    InStream_Seek(ix_in, (int64_t)doc_id * 8);
    start = InStream_Read_U64(ix_in);
    InStream_Seek(dat_in, start);
    num_fields = InStream_Read_C32(dat_in);

    // Decode stored data and build up the doc field by field.
    while (num_fields--) {
        uint32_t   field_name_len;
        Obj       *value;
        FieldType *type;

        // Read field name, growing the reusable name buffer when needed.
        field_name_len = InStream_Read_C32(dat_in);
        if (field_name_len > field_name_cap) {
            field_name_cap = field_name_len;
            // Widen before adding 1: in 32-bit arithmetic the sum wraps to
            // 0 when the length read from the stream is UINT32_MAX.
            field_name = (char*)REALLOCATE(field_name,
                                           (size_t)field_name_cap + 1);
        }
        InStream_Read_Bytes(dat_in, field_name, field_name_len);

        // Find the Field's FieldType.
        StackString *field_name_str
            = SSTR_WRAP_UTF8(field_name, field_name_len);
        type = Schema_Fetch_Type(schema, (String*)field_name_str);

        // Read the field value.
        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT: {
                    uint32_t value_len = InStream_Read_C32(dat_in);
                    // Widen before adding the terminator byte, for the same
                    // reason as above.
                    char *buf = (char*)MALLOCATE((size_t)value_len + 1);
                    InStream_Read_Bytes(dat_in, buf, value_len);
                    buf[value_len] = '\0';
                    value = (Obj*)Str_new_steal_utf8(buf, value_len);
                    break;
                }
            case FType_BLOB: {
                    uint32_t value_len = InStream_Read_C32(dat_in);
                    char *buf = (char*)MALLOCATE(value_len);
                    InStream_Read_Bytes(dat_in, buf, value_len);
                    value = (Obj*)BB_new_steal_bytes(
                                buf, value_len, value_len);
                    break;
                }
            case FType_FLOAT32:
                value = (Obj*)Float32_new(
                            InStream_Read_F32(dat_in));
                break;
            case FType_FLOAT64:
                value = (Obj*)Float64_new(
                            InStream_Read_F64(dat_in));
                break;
            case FType_INT32:
                value = (Obj*)Int32_new(
                            (int32_t)InStream_Read_C32(dat_in));
                break;
            case FType_INT64:
                value = (Obj*)Int64_new(
                            (int64_t)InStream_Read_C64(dat_in));
                break;
            default:
                value = NULL;
                THROW(ERR, "Unrecognized type: %o", type);
        }

        // Store the value.
        Hash_Store_Utf8(fields, field_name, field_name_len, value);
    }
    FREEMEM(field_name);

    HitDoc *retval = HitDoc_new(fields, doc_id, 0.0);
    DECREF(fields);
    return retval;
}