PostingPool* PostPool_init(PostingPool *self, Schema *schema, Snapshot *snapshot, Segment *segment, PolyReader *polyreader, String *field, LexiconWriter *lex_writer, MemoryPool *mem_pool, OutStream *lex_temp_out, OutStream *post_temp_out, OutStream *skip_out) { // Init. SortEx_init((SortExternal*)self); PostingPoolIVARS *const ivars = PostPool_IVARS(self); ivars->doc_base = 0; ivars->last_doc_id = 0; ivars->doc_map = NULL; ivars->post_count = 0; ivars->lexicon = NULL; ivars->plist = NULL; ivars->lex_temp_in = NULL; ivars->post_temp_in = NULL; ivars->lex_start = INT64_MAX; ivars->post_start = INT64_MAX; ivars->lex_end = 0; ivars->post_end = 0; ivars->skip_stepper = SkipStepper_new(); // Assign. ivars->schema = (Schema*)INCREF(schema); ivars->snapshot = (Snapshot*)INCREF(snapshot); ivars->segment = (Segment*)INCREF(segment); ivars->polyreader = (PolyReader*)INCREF(polyreader); ivars->lex_writer = (LexiconWriter*)INCREF(lex_writer); ivars->mem_pool = (MemoryPool*)INCREF(mem_pool); ivars->field = Str_Clone(field); ivars->lex_temp_out = (OutStream*)INCREF(lex_temp_out); ivars->post_temp_out = (OutStream*)INCREF(post_temp_out); ivars->skip_out = (OutStream*)INCREF(skip_out); // Derive. Similarity *sim = Schema_Fetch_Sim(schema, field); ivars->posting = Sim_Make_Posting(sim); ivars->type = (FieldType*)INCREF(Schema_Fetch_Type(schema, field)); ivars->field_num = Seg_Field_Num(segment, field); return self; }
Lexicon*
PolyLexReader_Lexicon_IMP(PolyLexiconReader *self, String *field, Obj *term) {
    // Without a field there can be no lexicon.
    if (field == NULL) {
        return NULL;
    }

    // Unknown fields yield no lexicon either.
    Schema *schema = PolyLexReader_Get_Schema(self);
    if (Schema_Fetch_Type(schema, field) == NULL) {
        return NULL;
    }

    // Aggregate the per-segment lexicons for this field.
    PolyLexiconReaderIVARS *const ivars = PolyLexReader_IVARS(self);
    PolyLexicon *lexicon = PolyLex_new(field, ivars->readers);
    if (!PolyLex_Get_Num_Seg_Lexicons(lexicon)) {
        // No segment holds terms for this field.
        DECREF(lexicon);
        return NULL;
    }

    // Optionally pre-seek to the supplied term.
    if (term) {
        PolyLex_Seek(lexicon, term);
    }

    return (Lexicon*)lexicon;
}
Lexicon*
PolyLexReader_lexicon(PolyLexiconReader *self, const CharBuf *field,
                      Obj *term) {
    Schema *schema;
    FieldType *type;
    PolyLexicon *lexicon;

    /* No field, no lexicon. */
    if (field == NULL) { return NULL; }

    /* Bail out on fields the schema doesn't know about. */
    schema = PolyLexReader_Get_Schema(self);
    type   = Schema_Fetch_Type(schema, field);
    if (type == NULL) { return NULL; }

    /* Aggregate per-segment lexicons; give up if no segment has any. */
    lexicon = PolyLex_new(field, self->readers);
    if (!VA_Get_Size(lexicon->seg_lexicons)) {
        DECREF(lexicon);
        return NULL;
    }

    /* Optionally pre-seek to the supplied term. */
    if (term) { Lex_Seek(lexicon, term); }

    return (Lexicon*)lexicon;
}
void
LexWriter_enter_temp_mode(LexiconWriter *self, const CharBuf *field,
                          OutStream *temp_outstream) {
    Schema    *schema = LexWriter_Get_Schema(self);
    FieldType *type   = Schema_Fetch_Type(schema, field);

    // Guard: only one output stream may be active at a time.
    if (self->dat_out != NULL) {
        THROW(ERR, "Can't enter temp mode (filename: %o) ", self->dat_file);
    }

    // Assign outstream.
    self->dat_out = (OutStream*)INCREF(temp_outstream);

    // Reset counters and steppers for the new temp run.
    self->count        = 0;
    self->ix_count     = 0;
    self->term_stepper = FType_Make_Term_Stepper(type);
    TermStepper_Reset(self->tinfo_stepper);

    // Remember that we're in temp mode.
    self->temp_mode = true;
}
/* Initialize a PostingPool for `field`, zeroing all run state and taking
 * refcounted ownership of the supplied schema and memory pool.
 * Returns `self` to allow constructor chaining. */
PostingPool*
PostPool_init(PostingPool *self, Schema *schema, const CharBuf *field,
              MemoryPool *mem_pool) {
    Architecture *arch = Schema_Get_Architecture(schema);

    /* Init. */
    SortExRun_init((SortExRun*)self);
    self->lex_instream  = NULL;
    self->post_instream = NULL;
    /* Start offsets default to "past any real offset" until assigned. */
    self->lex_start     = I64_MAX;
    self->post_start    = I64_MAX;
    self->lex_end       = 0;
    self->post_end      = 0;
    self->flipped       = false;
    self->from_seg      = false;
    self->mem_thresh    = 0;
    self->doc_base      = 0;
    self->last_doc_id   = 0;
    self->doc_map       = NULL;
    self->post_count    = 0;
    self->scratch       = NULL;
    self->scratch_cap   = 0;
    self->lex_stepper   = LexStepper_new(field, Arch_Skip_Interval(arch));

    /* Assign -- take a refcount on shared objects, clone the field name. */
    self->schema   = (Schema*)INCREF(schema);
    self->mem_pool = (MemoryPool*)INCREF(mem_pool);
    self->field    = CB_Clone(field);

    /* Derive.  Clone the schema's Posting so this pool owns its own copy
     * (the fetched original is immediately replaced by the clone). */
    self->posting = Schema_Fetch_Posting(schema, field);
    self->posting = (Posting*)Post_Clone(self->posting);
    self->type    = (FieldType*)INCREF(Schema_Fetch_Type(schema, field));
    self->compare = PostPoolQ_compare_rawp;

    return self;
}
/* Fetch the stored document `doc_id`, decoding each stored field from the
 * data file and assembling them into a new HitDoc (score 0.0). */
HitDoc*
DefDocReader_Fetch_Doc_IMP(DefaultDocReader *self, int32_t doc_id) {
    DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self);
    Schema   *const schema = ivars->schema;
    InStream *const dat_in = ivars->dat_in;
    InStream *const ix_in  = ivars->ix_in;
    Hash     *const fields = Hash_new(1);
    int64_t  start;
    uint32_t num_fields;
    uint32_t field_name_cap = 31;  // initial scratch-buffer capacity
    char    *field_name     = (char*)MALLOCATE(field_name_cap + 1);

    // Get data file pointer from index, read number of fields.
    // Each index entry is a fixed 8-byte offset into the data file.
    InStream_Seek(ix_in, (int64_t)doc_id * 8);
    start = InStream_Read_U64(ix_in);
    InStream_Seek(dat_in, start);
    num_fields = InStream_Read_C32(dat_in);

    // Decode stored data and build up the doc field by field.
    while (num_fields--) {
        uint32_t  field_name_len;
        Obj      *value;
        FieldType *type;

        // Read field name, growing the scratch buffer when needed.
        field_name_len = InStream_Read_C32(dat_in);
        if (field_name_len > field_name_cap) {
            field_name_cap = field_name_len;
            field_name
                = (char*)REALLOCATE(field_name, field_name_cap + 1);
        }
        InStream_Read_Bytes(dat_in, field_name, field_name_len);

        // Find the Field's FieldType.  Wrapping on the stack avoids a
        // heap allocation for the lookup key.
        StackString *field_name_str
            = SSTR_WRAP_UTF8(field_name, field_name_len);
        type = Schema_Fetch_Type(schema, (String*)field_name_str);

        // Read the field value, dispatching on the type's primitive ID.
        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT: {
                // Length-prefixed UTF-8; NUL-terminate the stolen buffer.
                uint32_t value_len = InStream_Read_C32(dat_in);
                char *buf = (char*)MALLOCATE(value_len + 1);
                InStream_Read_Bytes(dat_in, buf, value_len);
                buf[value_len] = '\0';
                value = (Obj*)Str_new_steal_utf8(buf, value_len);
                break;
            }
            case FType_BLOB: {
                // Length-prefixed raw bytes; no terminator needed.
                uint32_t value_len = InStream_Read_C32(dat_in);
                char *buf = (char*)MALLOCATE(value_len);
                InStream_Read_Bytes(dat_in, buf, value_len);
                value = (Obj*)BB_new_steal_bytes(
                            buf, value_len, value_len);
                break;
            }
            case FType_FLOAT32:
                value = (Obj*)Float32_new(
                            InStream_Read_F32(dat_in));
                break;
            case FType_FLOAT64:
                value = (Obj*)Float64_new(
                            InStream_Read_F64(dat_in));
                break;
            case FType_INT32:
                value = (Obj*)Int32_new(
                            (int32_t)InStream_Read_C32(dat_in));
                break;
            case FType_INT64:
                value = (Obj*)Int64_new(
                            (int64_t)InStream_Read_C64(dat_in));
                break;
            default:
                // THROW is non-local; the NULL store just silences
                // "maybe uninitialized" warnings.
                value = NULL;
                THROW(ERR, "Unrecognized type: %o", type);
        }

        // Store the value (presumably the hash absorbs the new ref,
        // since there's no DECREF here -- confirm in Hash_Store_Utf8).
        Hash_Store_Utf8(fields, field_name, field_name_len, value);
    }
    FREEMEM(field_name);

    HitDoc *retval = HitDoc_new(fields, doc_id, 0.0);
    DECREF(fields);
    return retval;
}
/* Lazily build the SortCache for `field` and memoize it in ivars->caches.
 * Returns NULL when the segment holds no values for the field; throws if
 * the field isn't sortable or any of its sort files can't be opened. */
static SortCache*
S_lazy_init_sort_cache(DefaultSortReader *self, String *field) {
    DefaultSortReaderIVARS *const ivars = DefSortReader_IVARS(self);

    // See if we have any values.
    Obj *count_obj = Hash_Fetch(ivars->counts, (Obj*)field);
    int32_t count = count_obj ? (int32_t)Obj_To_I64(count_obj) : 0;
    if (!count) { return NULL; }

    // Get a FieldType and sanity check that the field is sortable.
    Schema *schema = DefSortReader_Get_Schema(self);
    FieldType *type = Schema_Fetch_Type(schema, field);
    if (!type || !FType_Sortable(type)) {
        THROW(ERR, "'%o' isn't a sortable field", field);
    }

    // Open streams.  Text and blob values are variable-width and need an
    // extra .ix offsets file alongside .ord and .dat.
    Folder *folder = DefSortReader_Get_Folder(self);
    Segment *segment = DefSortReader_Get_Segment(self);
    String *seg_name = Seg_Get_Name(segment);
    int32_t field_num = Seg_Field_Num(segment, field);
    int8_t prim_id = FType_Primitive_ID(type);
    bool var_width = (prim_id == FType_TEXT || prim_id == FType_BLOB)
                     ? true
                     : false;
    String *ord_path = Str_newf("%o/sort-%i32.ord", seg_name, field_num);
    InStream *ord_in = Folder_Open_In(folder, ord_path);
    DECREF(ord_path);
    if (!ord_in) {
        THROW(ERR, "Error building sort cache for '%o': %o", field,
              Err_get_error());
    }
    InStream *ix_in = NULL;
    if (var_width) {
        String *ix_path = Str_newf("%o/sort-%i32.ix", seg_name, field_num);
        ix_in = Folder_Open_In(folder, ix_path);
        DECREF(ix_path);
        if (!ix_in) {
            THROW(ERR, "Error building sort cache for '%o': %o", field,
                  Err_get_error());
        }
    }
    String *dat_path = Str_newf("%o/sort-%i32.dat", seg_name, field_num);
    InStream *dat_in = Folder_Open_In(folder, dat_path);
    DECREF(dat_path);
    if (!dat_in) {
        THROW(ERR, "Error building sort cache for '%o': %o", field,
              Err_get_error());
    }

    // Segment metadata overrides, with computed fallbacks.
    Obj *null_ord_obj = Hash_Fetch(ivars->null_ords, (Obj*)field);
    int32_t null_ord = null_ord_obj
                       ? (int32_t)Obj_To_I64(null_ord_obj)
                       : -1;
    Obj *ord_width_obj = Hash_Fetch(ivars->ord_widths, (Obj*)field);
    int32_t ord_width = ord_width_obj
                        ? (int32_t)Obj_To_I64(ord_width_obj)
                        : S_calc_ord_width(count);
    int32_t doc_max = (int32_t)Seg_Get_Count(segment);

    // Create the SortCache subclass matching the primitive type.
    SortCache *cache = NULL;
    switch (prim_id & FType_PRIMITIVE_ID_MASK) {
        case FType_TEXT:
            cache = (SortCache*)TextSortCache_new(field, type, count,
                                                  doc_max, null_ord,
                                                  ord_width, ord_in,
                                                  ix_in, dat_in);
            break;
        case FType_INT32:
            cache = (SortCache*)I32SortCache_new(field, type, count,
                                                 doc_max, null_ord,
                                                 ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_INT64:
            cache = (SortCache*)I64SortCache_new(field, type, count,
                                                 doc_max, null_ord,
                                                 ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_FLOAT32:
            cache = (SortCache*)F32SortCache_new(field, type, count,
                                                 doc_max, null_ord,
                                                 ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_FLOAT64:
            cache = (SortCache*)F64SortCache_new(field, type, count,
                                                 doc_max, null_ord,
                                                 ord_width, ord_in,
                                                 dat_in);
            break;
        default:
            THROW(ERR, "No SortCache class for %o", type);
    }

    // Memoize so subsequent lookups skip all of the above.
    Hash_Store(ivars->caches, (Obj*)field, (Obj*)cache);

    if (ivars->format == 2) { // bug compatibility
        SortCache_Set_Native_Ords(cache, true);
    }

    // Release our stream refs; presumably the SortCache constructors took
    // their own (implied by these DECREFs -- confirm in the constructors).
    DECREF(ord_in);
    DECREF(ix_in);
    DECREF(dat_in);

    return cache;
}
/* Initialize a HitQueue holding up to `wanted` entries.  When a SortSpec
 * is supplied, compile its rules into an array of comparison "actions";
 * otherwise fall back to a two-action score/doc_id comparison.  `schema`
 * is required whenever `sort_spec` is non-NULL.  Throws on an unknown
 * SortRule type or a missing schema. */
HitQueue*
HitQ_init(HitQueue *self, Schema *schema, SortSpec *sort_spec,
          uint32_t wanted) {
    HitQueueIVARS *const ivars = HitQ_IVARS(self);
    if (sort_spec) {
        VArray  *rules      = SortSpec_Get_Rules(sort_spec);
        uint32_t num_rules  = VA_Get_Size(rules);
        uint32_t action_num = 0;
        if (!schema) {
            THROW(ERR, "Can't supply sort_spec without schema");
        }
        ivars->need_values = false;
        ivars->num_actions = num_rules;
        ivars->actions = (uint8_t*)MALLOCATE(num_rules * sizeof(uint8_t));
        ivars->field_types
            = (FieldType**)CALLOCATE(num_rules, sizeof(FieldType*));
        for (uint32_t i = 0; i < num_rules; i++) {
            SortRule *rule      = (SortRule*)VA_Fetch(rules, i);
            int32_t   rule_type = SortRule_Get_Type(rule);
            bool      reverse   = SortRule_Get_Reverse(rule);
            if (rule_type == SortRule_SCORE) {
                ivars->actions[action_num++] = reverse
                                               ? COMPARE_BY_SCORE_REV
                                               : COMPARE_BY_SCORE;
            }
            else if (rule_type == SortRule_DOC_ID) {
                ivars->actions[action_num++] = reverse
                                               ? COMPARE_BY_DOC_ID_REV
                                               : COMPARE_BY_DOC_ID;
            }
            else if (rule_type == SortRule_FIELD) {
                String *field = SortRule_Get_Field(rule);
                FieldType *type = Schema_Fetch_Type(schema, field);
                if (type) {
                    ivars->field_types[action_num]
                        = (FieldType*)INCREF(type);
                    ivars->actions[action_num++] = reverse
                                                   ? COMPARE_BY_VALUE_REV
                                                   : COMPARE_BY_VALUE;
                    ivars->need_values = true;
                }
                else {
                    // Skip over fields we don't know how to sort on.
                    continue;
                }
            }
            else {
                THROW(ERR, "Unknown SortRule type: %i32", rule_type);
            }
        }
        // FIX: if any FIELD rules were skipped above, only `action_num`
        // slots of the MALLOCATE'd actions array were written.  Trim
        // num_actions so comparison code never reads the uninitialized
        // tail (previously num_actions stayed at num_rules).
        ivars->num_actions = action_num;
    }
    else {
        // No SortSpec: compare by score, then break ties by doc id.
        ivars->num_actions = 2;
        ivars->actions
            = (uint8_t*)MALLOCATE(ivars->num_actions * sizeof(uint8_t));
        ivars->actions[0] = COMPARE_BY_SCORE;
        ivars->actions[1] = COMPARE_BY_DOC_ID;
    }
    return (HitQueue*)PriQ_init((PriorityQueue*)self, wanted);
}
/* Initialize a SortCollector that keeps the top `wanted` hits ordered by
 * `sort_spec` (or by score/doc-id when no spec is given).  Throws when a
 * SortSpec is supplied without a Schema, when the rule list is empty, or
 * when a rule names an unsortable field. */
SortCollector*
SortColl_init(SortCollector *self, Schema *schema, SortSpec *sort_spec,
              uint32_t wanted) {
    VArray *rules = sort_spec
                    ? (VArray*)INCREF(SortSpec_Get_Rules(sort_spec))
                    : S_default_sort_rules();
    uint32_t num_rules = VA_Get_Size(rules);

    // Validate.
    if (sort_spec && !schema) {
        THROW(ERR, "Can't supply a SortSpec without a Schema.");
    }
    if (!num_rules) {
        THROW(ERR, "Can't supply a SortSpec with no SortRules.");
    }

    // Init.
    Coll_init((Collector*)self);
    SortCollectorIVARS *const ivars = SortColl_IVARS(self);
    ivars->total_hits   = 0;
    ivars->bubble_doc   = INT32_MAX;
    ivars->bubble_score = F32_NEGINF;
    ivars->seg_doc_max  = 0;

    // Assign.
    ivars->wanted = wanted;

    // Derive.
    ivars->hit_q = HitQ_new(schema, sort_spec, wanted);
    ivars->rules = rules; // absorb refcount.
    ivars->num_rules = num_rules;
    ivars->sort_caches
        = (SortCache**)CALLOCATE(num_rules, sizeof(SortCache*));
    ivars->ord_arrays = (void**)CALLOCATE(num_rules, sizeof(void*));
    ivars->actions = (uint8_t*)CALLOCATE(num_rules, sizeof(uint8_t));

    // Build up an array of "actions" which we will execute during each call
    // to Collect().  Determine whether we need to track scores and field
    // values.
    ivars->need_score  = false;
    ivars->need_values = false;
    for (uint32_t i = 0; i < num_rules; i++) {
        SortRule *rule = (SortRule*)VA_Fetch(rules, i);
        int32_t rule_type = SortRule_Get_Type(rule);
        ivars->actions[i] = S_derive_action(rule, NULL);
        if (rule_type == SortRule_SCORE) {
            ivars->need_score = true;
        }
        else if (rule_type == SortRule_FIELD) {
            CharBuf *field = SortRule_Get_Field(rule);
            FieldType *type = Schema_Fetch_Type(schema, field);
            if (!type || !FType_Sortable(type)) {
                THROW(ERR, "'%o' isn't a sortable field", field);
            }
            ivars->need_values = true;
        }
    }

    // Perform an optimization.  So long as we always collect docs in
    // ascending order, Collect() will favor lower doc numbers -- so we may
    // not need to execute a final COMPARE_BY_DOC_ID action.
    ivars->num_actions = num_rules;
    if (ivars->actions[num_rules - 1] == COMPARE_BY_DOC_ID) {
        ivars->num_actions--;
    }

    // Override our derived actions with an action which will be executed
    // automatically until the queue fills up.  (wanted == 0 means nothing
    // can ever be accepted.)
    ivars->auto_actions = (uint8_t*)MALLOCATE(1);
    ivars->auto_actions[0] = wanted ? AUTO_ACCEPT : AUTO_REJECT;
    ivars->derived_actions = ivars->actions;
    ivars->actions = ivars->auto_actions;

    // Prepare a MatchDoc-in-waiting.  Score is NaN when scores aren't
    // tracked, so an uninitialized score can never compare as "better".
    VArray *values = ivars->need_values ? VA_new(num_rules) : NULL;
    float score = ivars->need_score ? F32_NEGINF : F32_NAN;
    ivars->bumped = MatchDoc_new(INT32_MAX, score, values);
    DECREF(values);

    return self;
}
SortCache* SortCache_init(SortCache *self, Schema *schema, Folder *folder, Segment *segment, i32_t field_num) { CharBuf *field = Seg_Field_Name(segment, field_num); CharBuf *seg_name = Seg_Get_Name(segment); CharBuf *ord_file = CB_newf("%o/sort-%i32.ord", seg_name, field_num); CharBuf *ix_file = CB_newf("%o/sort-%i32.ix", seg_name, field_num); CharBuf *dat_file = CB_newf("%o/sort-%i32.dat", seg_name, field_num); i64_t ord_len, ix_len, dat_len; /* Derive. */ self->doc_max = Seg_Get_Count(segment); self->type = Schema_Fetch_Type(schema, field); if (!self->type || !FType_Sortable(self->type)) { THROW("'%o' isn't a sortable field", field); } /* Open instreams. */ self->ord_in = Folder_Open_In(folder, ord_file); self->ix_in = Folder_Open_In(folder, ix_file); self->dat_in = Folder_Open_In(folder, dat_file); if (!self->ix_in || !self->dat_in || !self->ord_in) { CharBuf *mess = MAKE_MESS("Can't open either %o, %o or %o", ord_file, ix_file, dat_file); DECREF(ord_file); DECREF(ix_file); DECREF(dat_file); Err_throw_mess(mess); } ord_len = InStream_Length(self->ord_in); ix_len = InStream_Length(self->ix_in); dat_len = InStream_Length(self->dat_in); /* Calculate the number of unique values and derive the ord bit width. */ self->num_uniq = (i32_t)(ix_len / 8) - 1; self->width = S_calc_width(self->num_uniq); /* Validate file lengths. */ { double bytes_per_doc = self->width / 8.0; double max_ords = ord_len / bytes_per_doc; if (max_ords < self->doc_max + 1) { THROW("Conflict between ord count max %f64 and doc_max %i32", max_ords, self->doc_max); } } /* Mmap ords, offsets and character data. */ self->ords = InStream_Buf(self->ord_in, (size_t)ord_len); self->offsets = (i64_t*)InStream_Buf(self->ix_in, (size_t)ix_len); self->char_data = InStream_Buf(self->dat_in, dat_len); { char *offs = (char*)self->offsets; self->offsets_limit = (i64_t*)(offs + ix_len); self->char_data_limit = self->char_data + dat_len; } DECREF(ord_file); DECREF(ix_file); DECREF(dat_file); return self; }