SortFieldWriter* SortFieldWriter_init(SortFieldWriter *self, Schema *schema, Snapshot *snapshot, Segment *segment, PolyReader *polyreader, String *field, Counter *counter, size_t mem_thresh, OutStream *temp_ord_out, OutStream *temp_ix_out, OutStream *temp_dat_out) { // Init. SortEx_init((SortExternal*)self); SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self); ivars->null_ord = -1; ivars->count = 0; ivars->ord_start = 0; ivars->ord_end = 0; ivars->ix_start = 0; ivars->ix_end = 0; ivars->dat_start = 0; ivars->dat_end = 0; ivars->run_cardinality = -1; ivars->run_max = -1; ivars->sort_cache = NULL; ivars->doc_map = NULL; ivars->sorted_ids = NULL; ivars->run_tick = 1; ivars->ord_width = 0; // Assign. ivars->field = Str_Clone(field); ivars->schema = (Schema*)INCREF(schema); ivars->snapshot = (Snapshot*)INCREF(snapshot); ivars->segment = (Segment*)INCREF(segment); ivars->polyreader = (PolyReader*)INCREF(polyreader); ivars->counter = (Counter*)INCREF(counter); ivars->temp_ord_out = (OutStream*)INCREF(temp_ord_out); ivars->temp_ix_out = (OutStream*)INCREF(temp_ix_out); ivars->temp_dat_out = (OutStream*)INCREF(temp_dat_out); ivars->mem_thresh = mem_thresh; // Derive. ivars->field_num = Seg_Field_Num(segment, field); FieldType *type = (FieldType*)CERTIFY( Schema_Fetch_Type(ivars->schema, field), FIELDTYPE); ivars->type = (FieldType*)INCREF(type); ivars->prim_id = FType_Primitive_ID(type); ivars->mem_per_entry = Class_Get_Obj_Alloc_Size(SFWRITERELEM); if (ivars->prim_id == FType_TEXT) { ivars->mem_per_entry += Class_Get_Obj_Alloc_Size(STRING); ivars->var_width = true; } else if (ivars->prim_id == FType_BLOB) { ivars->mem_per_entry += Class_Get_Obj_Alloc_Size(BLOB); ivars->var_width = true; } else { ivars->mem_per_entry += Class_Get_Obj_Alloc_Size(FLOAT); ivars->var_width = false; } return self; }
// Initialize an InverterEntry for `field`.
//
// Always records the field number, a clone of the field name (or NULL when
// no name is given) and a NULL inversion.  When a schema is supplied, the
// field's analyzer, similarity and type are fetched from it, a reusable
// value holder matching the primitive type is created, and the indexed /
// highlightable flags are filled in.
//
// Throws when the schema has no type for `field`, when the primitive id is
// not recognized, or when a numeric field is marked as indexed.
//
// Returns `self`.
InverterEntry*
InvEntry_init(InverterEntry *self, Schema *schema, String *field,
              int32_t field_num) {
    InverterEntryIVARS *const ivars = InvEntry_IVARS(self);

    ivars->field_num = field_num;
    if (field) {
        ivars->field = Str_Clone(field);
    }
    else {
        ivars->field = NULL;
    }
    ivars->inversion = NULL;

    // Without a schema there is nothing further to look up.
    if (!schema) {
        return self;
    }

    ivars->analyzer = (Analyzer*)INCREF(Schema_Fetch_Analyzer(schema, field));
    ivars->sim      = (Similarity*)INCREF(Schema_Fetch_Sim(schema, field));
    ivars->type     = (FieldType*)INCREF(Schema_Fetch_Type(schema, field));
    if (!ivars->type) {
        THROW(ERR, "Unknown field: '%o'", field);
    }

    // Create the value holder that matches the field's primitive type.
    // Text fields start with no holder at all.
    uint8_t prim_id = FType_Primitive_ID(ivars->type);
    switch (prim_id & FType_PRIMITIVE_ID_MASK) {
        case FType_TEXT:
            ivars->value = NULL;
            break;
        case FType_BLOB:
            ivars->value = (Obj*)ViewBB_new(NULL, 0);
            break;
        case FType_INT32:
            ivars->value = (Obj*)Int32_new(0);
            break;
        case FType_INT64:
            ivars->value = (Obj*)Int64_new(0);
            break;
        case FType_FLOAT32:
            ivars->value = (Obj*)Float32_new(0);
            break;
        case FType_FLOAT64:
            ivars->value = (Obj*)Float64_new(0);
            break;
        default:
            THROW(ERR, "Unrecognized primitive id: %i8", prim_id);
    }

    // Numeric fields may not be indexed.
    ivars->indexed = FType_Indexed(ivars->type);
    if (ivars->indexed && FType_Is_A(ivars->type, NUMERICTYPE)) {
        THROW(ERR, "Field '%o' spec'd as indexed, but numerical types cannot "
              "be indexed yet", field);
    }

    // Only full text types carry a highlightable setting.
    if (FType_Is_A(ivars->type, FULLTEXTTYPE)) {
        FullTextType *full_text = (FullTextType*)ivars->type;
        ivars->highlightable = FullTextType_Highlightable(full_text);
    }

    return self;
}
// Walk every field of `doc`, check that each value's class matches the
// field's primitive type, store the value in the field's InverterEntry and
// register that entry with the inverter via Inverter_Add_Field.
//
// Throws (through CERTIFY or the default branch) when a value has the wrong
// class or the primitive type is not recognized.
void
Inverter_Invert_Doc_IMP(Inverter *self, Doc *doc) {
    InverterIVARS *const ivars = Inverter_IVARS(self);
    Hash *const doc_fields = (Hash*)Doc_Get_Fields(doc);

    // Make `doc` the inverter's current document.
    Inverter_Set_Doc(self, doc);

    HashIterator *field_iter = HashIter_new(doc_fields);
    while (HashIter_Next(field_iter)) {
        String *field_name = HashIter_Get_Key(field_iter);
        Obj    *field_val  = HashIter_Get_Value(field_iter);
        InverterEntry      *entry       = S_fetch_entry(ivars, field_name);
        InverterEntryIVARS *entry_ivars = InvEntry_IVARS(entry);
        FieldType          *type        = entry_ivars->type;

        // Check the value's class against the primitive type.
        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT:
                CERTIFY(field_val, STRING);
                break;
            case FType_BLOB:
                CERTIFY(field_val, BLOB);
                break;
            case FType_INT32:
            case FType_INT64:
                CERTIFY(field_val, INTEGER);
                break;
            case FType_FLOAT32:
            case FType_FLOAT64:
                CERTIFY(field_val, FLOAT);
                break;
            default:
                THROW(ERR, "Unrecognized type: %o", type);
        }

        // Swap in the new value unless the entry already holds this object.
        Obj *held = entry_ivars->value;
        if (held != field_val) {
            DECREF(held);
            entry_ivars->value = INCREF(field_val);
        }

        Inverter_Add_Field(self, entry);
    }
    DECREF(field_iter);
}
// Read the stored fields for `doc_id` and return them as a new HitDoc.
//
// The index stream holds one 8-byte data-file offset per document.  At that
// offset the data stream holds a compressed field count followed, for each
// field, by a length-prefixed field name and a value encoded according to
// the field's primitive type as declared in the schema.
//
// Throws when a stored field name has no type in the schema or when the
// type's primitive id is not recognized.
//
// NOTE(review): `field_name` and `fields` are not released on the THROW
// paths.
HitDoc*
DefDocReader_Fetch_Doc_IMP(DefaultDocReader *self, int32_t doc_id) {
    DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self);
    Schema   *const schema = ivars->schema;
    InStream *const dat_in = ivars->dat_in;
    InStream *const ix_in  = ivars->ix_in;
    Hash     *const fields = Hash_new(1);
    int64_t   start;
    uint32_t  num_fields;
    uint32_t  field_name_cap = 31;
    // Widen before adding 1 so the size cannot wrap in 32-bit arithmetic.
    char     *field_name = (char*)MALLOCATE((size_t)field_name_cap + 1);

    // Get data file pointer from index, read number of fields.
    InStream_Seek(ix_in, (int64_t)doc_id * 8);
    start = InStream_Read_U64(ix_in);
    InStream_Seek(dat_in, start);
    num_fields = InStream_Read_C32(dat_in);

    // Decode stored data and build up the doc field by field.
    while (num_fields--) {
        uint32_t   field_name_len;
        Obj       *value;
        FieldType *type;

        // Read field name, growing the scratch buffer when needed.
        field_name_len = InStream_Read_C32(dat_in);
        if (field_name_len > field_name_cap) {
            field_name_cap = field_name_len;
            field_name = (char*)REALLOCATE(field_name,
                                           (size_t)field_name_cap + 1);
        }
        InStream_Read_Bytes(dat_in, field_name, field_name_len);

        // Find the Field's FieldType.
        StackString *field_name_str
            = SSTR_WRAP_UTF8(field_name, field_name_len);
        type = Schema_Fetch_Type(schema, (String*)field_name_str);
        if (!type) {
            // A stored field the schema does not know about cannot be
            // decoded; fail with a message instead of using a NULL type.
            THROW(ERR, "Unknown field: '%o'", field_name_str);
        }

        // Read the field value.
        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT: {
                    uint32_t value_len = InStream_Read_C32(dat_in);
                    // Widened so that value_len == UINT32_MAX does not
                    // produce a zero-byte allocation.
                    char *buf = (char*)MALLOCATE((size_t)value_len + 1);
                    InStream_Read_Bytes(dat_in, buf, value_len);
                    buf[value_len] = '\0';
                    value = (Obj*)Str_new_steal_utf8(buf, value_len);
                    break;
                }
            case FType_BLOB: {
                    uint32_t value_len = InStream_Read_C32(dat_in);
                    char *buf = (char*)MALLOCATE(value_len);
                    InStream_Read_Bytes(dat_in, buf, value_len);
                    value = (Obj*)BB_new_steal_bytes(buf, value_len,
                                                     value_len);
                    break;
                }
            case FType_FLOAT32:
                value = (Obj*)Float32_new(InStream_Read_F32(dat_in));
                break;
            case FType_FLOAT64:
                value = (Obj*)Float64_new(InStream_Read_F64(dat_in));
                break;
            case FType_INT32:
                value = (Obj*)Int32_new((int32_t)InStream_Read_C32(dat_in));
                break;
            case FType_INT64:
                value = (Obj*)Int64_new((int64_t)InStream_Read_C64(dat_in));
                break;
            default:
                value = NULL;
                THROW(ERR, "Unrecognized type: %o", type);
        }

        // Store the value.
        Hash_Store_Utf8(fields, field_name, field_name_len, value);
    }
    FREEMEM(field_name);

    HitDoc *retval = HitDoc_new(fields, doc_id, 0.0);
    DECREF(fields);
    return retval;
}
// Copy each field value of `doc` into the reusable value holder of the
// field's InverterEntry, then register the entry with the inverter via
// Inverter_Add_Field.
//
// Text and blob values are assigned into view objects; numeric values are
// converted with Obj_To_I64 / Obj_To_F64 and stored in the matching numeric
// holder.  Throws when the primitive type is not recognized.
void
Inverter_invert_doc(Inverter *self, Doc *doc) {
    InverterIVARS *const ivars = Inverter_IVARS(self);
    Hash *const doc_fields = (Hash*)Doc_Get_Fields(doc);
    uint32_t remaining = Hash_Iterate(doc_fields);

    // Make `doc` the inverter's current document.
    Inverter_Set_Doc(self, doc);

    // One pass per key reported by Hash_Iterate.
    for (; remaining > 0; remaining--) {
        Obj *raw_key;
        Obj *raw_val;
        Hash_Next(doc_fields, &raw_key, &raw_val);

        CharBuf *field_name = (CharBuf*)CERTIFY(raw_key, CHARBUF);
        InverterEntry      *entry       = S_fetch_entry(ivars, field_name);
        InverterEntryIVARS *entry_ivars = InvEntry_IVARS(entry);
        FieldType          *type        = entry_ivars->type;

        // Transfer the value into the entry's holder.
        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT: {
                    CharBuf *text = (CharBuf*)CERTIFY(raw_val, CHARBUF);
                    ViewCB_Assign((ViewCharBuf*)entry_ivars->value, text);
                    break;
                }
            case FType_BLOB: {
                    ByteBuf *bytes = (ByteBuf*)CERTIFY(raw_val, BYTEBUF);
                    ViewBB_Assign((ViewByteBuf*)entry_ivars->value, bytes);
                    break;
                }
            case FType_INT32: {
                    int32_t narrowed = (int32_t)Obj_To_I64(raw_val);
                    Int32_Set_Value((Integer32*)entry_ivars->value,
                                    narrowed);
                    break;
                }
            case FType_INT64: {
                    int64_t wide = Obj_To_I64(raw_val);
                    Int64_Set_Value((Integer64*)entry_ivars->value, wide);
                    break;
                }
            case FType_FLOAT32: {
                    float narrowed = (float)Obj_To_F64(raw_val);
                    Float32_Set_Value((Float32*)entry_ivars->value,
                                      narrowed);
                    break;
                }
            case FType_FLOAT64: {
                    double wide = Obj_To_F64(raw_val);
                    Float64_Set_Value((Float64*)entry_ivars->value, wide);
                    break;
                }
            default:
                THROW(ERR, "Unrecognized type: %o", type);
        }

        Inverter_Add_Field(self, entry);
    }
}
// Build the SortCache for `field`, store it in the reader's cache table and
// return it.
//
// Returns NULL when the reader's counts table has no entry, or a zero
// entry, for the field.  Throws when the field is unknown or not sortable,
// when one of the sort files cannot be opened, or when no SortCache class
// exists for the field's primitive type.
static SortCache*
S_lazy_init_sort_cache(DefaultSortReader *self, String *field) {
    DefaultSortReaderIVARS *const ivars = DefSortReader_IVARS(self);

    // Nothing to build unless the field has at least one value.
    Obj *count_obj = Hash_Fetch(ivars->counts, (Obj*)field);
    if (!count_obj) {
        return NULL;
    }
    int32_t count = (int32_t)Obj_To_I64(count_obj);
    if (count == 0) {
        return NULL;
    }

    // The field must exist in the schema and be sortable.
    Schema    *schema = DefSortReader_Get_Schema(self);
    FieldType *type   = Schema_Fetch_Type(schema, field);
    if (!type || !FType_Sortable(type)) {
        THROW(ERR, "'%o' isn't a sortable field", field);
    }

    Folder  *folder    = DefSortReader_Get_Folder(self);
    Segment *segment   = DefSortReader_Get_Segment(self);
    String  *seg_name  = Seg_Get_Name(segment);
    int32_t  field_num = Seg_Field_Num(segment, field);
    int8_t   prim_id   = FType_Primitive_ID(type);
    bool     var_width = (prim_id == FType_TEXT || prim_id == FType_BLOB);

    // Ordinals file.
    String *ord_path = Str_newf("%o/sort-%i32.ord", seg_name, field_num);
    InStream *ord_in = Folder_Open_In(folder, ord_path);
    DECREF(ord_path);
    if (!ord_in) {
        THROW(ERR, "Error building sort cache for '%o': %o", field,
              Err_get_error());
    }

    // Index file, opened only for variable-width types.
    InStream *ix_in = NULL;
    if (var_width) {
        String *ix_path = Str_newf("%o/sort-%i32.ix", seg_name, field_num);
        ix_in = Folder_Open_In(folder, ix_path);
        DECREF(ix_path);
        if (!ix_in) {
            THROW(ERR, "Error building sort cache for '%o': %o", field,
                  Err_get_error());
        }
    }

    // Data file.
    String *dat_path = Str_newf("%o/sort-%i32.dat", seg_name, field_num);
    InStream *dat_in = Folder_Open_In(folder, dat_path);
    DECREF(dat_path);
    if (!dat_in) {
        THROW(ERR, "Error building sort cache for '%o': %o", field,
              Err_get_error());
    }

    // Null ordinal defaults to -1 when none was recorded for the field.
    int32_t null_ord = -1;
    Obj *null_ord_obj = Hash_Fetch(ivars->null_ords, (Obj*)field);
    if (null_ord_obj) {
        null_ord = (int32_t)Obj_To_I64(null_ord_obj);
    }

    // Ordinal width is taken from the table, or calculated from the count
    // when the table has no entry.
    int32_t ord_width;
    Obj *ord_width_obj = Hash_Fetch(ivars->ord_widths, (Obj*)field);
    if (ord_width_obj) {
        ord_width = (int32_t)Obj_To_I64(ord_width_obj);
    }
    else {
        ord_width = S_calc_ord_width(count);
    }

    int32_t doc_max = (int32_t)Seg_Get_Count(segment);

    // Pick the SortCache subclass that matches the primitive type.
    SortCache *cache = NULL;
    switch (prim_id & FType_PRIMITIVE_ID_MASK) {
        case FType_TEXT:
            cache = (SortCache*)TextSortCache_new(field, type, count,
                                                  doc_max, null_ord,
                                                  ord_width, ord_in, ix_in,
                                                  dat_in);
            break;
        case FType_INT32:
            cache = (SortCache*)I32SortCache_new(field, type, count,
                                                 doc_max, null_ord,
                                                 ord_width, ord_in, dat_in);
            break;
        case FType_INT64:
            cache = (SortCache*)I64SortCache_new(field, type, count,
                                                 doc_max, null_ord,
                                                 ord_width, ord_in, dat_in);
            break;
        case FType_FLOAT32:
            cache = (SortCache*)F32SortCache_new(field, type, count,
                                                 doc_max, null_ord,
                                                 ord_width, ord_in, dat_in);
            break;
        case FType_FLOAT64:
            cache = (SortCache*)F64SortCache_new(field, type, count,
                                                 doc_max, null_ord,
                                                 ord_width, ord_in, dat_in);
            break;
        default:
            THROW(ERR, "No SortCache class for %o", type);
    }
    Hash_Store(ivars->caches, (Obj*)field, (Obj*)cache);

    if (ivars->format == 2) { // bug compatibility
        SortCache_Set_Native_Ords(cache, true);
    }

    // Drop this function's references to the streams.
    DECREF(ord_in);
    DECREF(ix_in);
    DECREF(dat_in);

    return cache;
}
// Write the stored fields of the inverter's current document to the data
// stream and record the document's starting offset in the index stream.
//
// The document id must equal the index stream's current position divided
// by 8; otherwise an error is thrown.  The data record is a compressed
// count of stored fields followed, for each stored field, by the serialized
// field name and the value encoded according to the field's primitive type.
void
DocWriter_Add_Inverted_Doc_IMP(DocWriter *self, Inverter *inverter,
                               int32_t doc_id) {
    DocWriterIVARS *const ivars = DocWriter_IVARS(self);
    OutStream *dat_out = S_lazy_init(self);
    OutStream *ix_out  = ivars->ix_out;
    uint32_t   num_stored  = 0;
    int64_t    start       = OutStream_Tell(dat_out);
    int64_t    expected    = OutStream_Tell(ix_out) / 8;

    // Verify doc id.
    if (doc_id != expected) {
        THROW(ERR, "Expected doc id %i64 but got %i32", expected, doc_id);
    }

    // First pass: count the stored fields and write that count.
    for (Inverter_Iterate(inverter); Inverter_Next(inverter); ) {
        if (FType_Stored(Inverter_Get_Type(inverter))) {
            num_stored++;
        }
    }
    OutStream_Write_C32(dat_out, num_stored);

    // Second pass: write name and value for each stored field.
    for (Inverter_Iterate(inverter); Inverter_Next(inverter); ) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (!FType_Stored(type)) {
            continue;
        }

        String *field_name = Inverter_Get_Field_Name(inverter);
        Obj    *field_val  = Inverter_Get_Value(inverter);
        Freezer_serialize_string(field_name, dat_out);

        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT: {
                    const char *text = Str_Get_Ptr8((String*)field_val);
                    size_t      len  = Str_Get_Size((String*)field_val);
                    OutStream_Write_C32(dat_out, len);
                    OutStream_Write_Bytes(dat_out, text, len);
                    break;
                }
            case FType_BLOB: {
                    char   *bytes = BB_Get_Buf((ByteBuf*)field_val);
                    size_t  len   = BB_Get_Size((ByteBuf*)field_val);
                    OutStream_Write_C32(dat_out, len);
                    OutStream_Write_Bytes(dat_out, bytes, len);
                    break;
                }
            case FType_INT32: {
                    int32_t num = Int32_Get_Value((Integer32*)field_val);
                    OutStream_Write_C32(dat_out, num);
                    break;
                }
            case FType_INT64: {
                    int64_t num = Int64_Get_Value((Integer64*)field_val);
                    OutStream_Write_C64(dat_out, num);
                    break;
                }
            case FType_FLOAT32: {
                    float num = Float32_Get_Value((Float32*)field_val);
                    OutStream_Write_F32(dat_out, num);
                    break;
                }
            case FType_FLOAT64: {
                    double num = Float64_Get_Value((Float64*)field_val);
                    OutStream_Write_F64(dat_out, num);
                    break;
                }
            default:
                THROW(ERR, "Unrecognized type: %o", type);
        }
    }

    // Write file pointer.
    OutStream_Write_I64(ix_out, start);
}