Beispiel #1
0
/* Initialize a PostingPool in place and return it.
 *
 * Runs the SortExternal initializer on `self`, resets the pool's bookkeeping
 * members, takes a reference (INCREF) on each collaborator passed in, stores
 * a clone of `field`, and derives the posting, field type and field number
 * for `field` from `schema` and `segment`.
 */
PostingPool*
PostPool_init(PostingPool *self, Schema *schema, Snapshot *snapshot,
              Segment *segment, PolyReader *polyreader, String *field,
              LexiconWriter *lex_writer, MemoryPool *mem_pool,
              OutStream *lex_temp_out, OutStream *post_temp_out,
              OutStream *skip_out) {
    // The parent initializer runs before the ivars are fetched.
    SortEx_init((SortExternal*)self);
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);

    // Start offsets begin at INT64_MAX, end offsets at 0.
    ivars->lex_start    = INT64_MAX;
    ivars->lex_end      = 0;
    ivars->post_start   = INT64_MAX;
    ivars->post_end     = 0;

    // Counters and doc bookkeeping start from zero.
    ivars->post_count   = 0;
    ivars->doc_base     = 0;
    ivars->last_doc_id  = 0;

    // Members that are not set up by this initializer.
    ivars->doc_map      = NULL;
    ivars->lexicon      = NULL;
    ivars->plist        = NULL;
    ivars->lex_temp_in  = NULL;
    ivars->post_temp_in = NULL;

    // The pool gets its own skip stepper.
    ivars->skip_stepper = SkipStepper_new();

    // Hold a reference to every object supplied by the caller.
    ivars->schema        = (Schema*)INCREF(schema);
    ivars->snapshot      = (Snapshot*)INCREF(snapshot);
    ivars->segment       = (Segment*)INCREF(segment);
    ivars->polyreader    = (PolyReader*)INCREF(polyreader);
    ivars->lex_writer    = (LexiconWriter*)INCREF(lex_writer);
    ivars->mem_pool      = (MemoryPool*)INCREF(mem_pool);
    ivars->lex_temp_out  = (OutStream*)INCREF(lex_temp_out);
    ivars->post_temp_out = (OutStream*)INCREF(post_temp_out);
    ivars->skip_out      = (OutStream*)INCREF(skip_out);

    // The field name is cloned rather than shared.
    ivars->field = Str_Clone(field);

    // Per-field state looked up through the schema and the segment.
    Similarity *similarity = Schema_Fetch_Sim(schema, field);
    ivars->posting = Sim_Make_Posting(similarity);
    FieldType *field_type = Schema_Fetch_Type(schema, field);
    ivars->type      = (FieldType*)INCREF(field_type);
    ivars->field_num = Seg_Field_Num(segment, field);

    return self;
}
Beispiel #2
0
/* Build a PolyLexicon for `field`, optionally positioned at `term`.
 *
 * Returns NULL when `field` is NULL, when the schema has no FieldType for
 * `field`, or when the new PolyLexicon reports zero segment lexicons.
 * Otherwise returns the new PolyLexicon, after seeking to `term` if one was
 * supplied.
 */
Lexicon*
PolyLexReader_Lexicon_IMP(PolyLexiconReader *self, String *field,
                          Obj *term) {
    if (field == NULL) { return NULL; }

    // Fields without a FieldType in the schema get no lexicon.
    Schema    *schema     = PolyLexReader_Get_Schema(self);
    FieldType *field_type = Schema_Fetch_Type(schema, field);
    if (field_type == NULL) { return NULL; }

    PolyLexiconReaderIVARS *const ivars = PolyLexReader_IVARS(self);
    PolyLexicon *poly_lex = PolyLex_new(field, ivars->readers);

    // Discard a PolyLexicon that has no segment lexicons.
    if (PolyLex_Get_Num_Seg_Lexicons(poly_lex) == 0) {
        DECREF(poly_lex);
        return NULL;
    }

    if (term != NULL) {
        PolyLex_Seek(poly_lex, term);
    }

    return (Lexicon*)poly_lex;
}
/* Build a PolyLexicon for `field`, optionally positioned at `term`.
 *
 * Returns NULL when `field` is NULL, when the schema has no FieldType for
 * `field`, or when the new PolyLexicon's seg_lexicons array is empty.
 * Otherwise returns the new PolyLexicon, after seeking to `term` if one was
 * supplied.
 */
Lexicon*
PolyLexReader_lexicon(PolyLexiconReader *self, const CharBuf *field,
                      Obj *term)
{
    Schema      *schema;
    PolyLexicon *poly_lex;

    if (field == NULL) { return NULL; }

    /* Fields without a FieldType in the schema get no lexicon. */
    schema = PolyLexReader_Get_Schema(self);
    if (Schema_Fetch_Type(schema, field) == NULL) { return NULL; }

    poly_lex = PolyLex_new(field, self->readers);

    /* Discard a PolyLexicon that has no segment lexicons. */
    if (VA_Get_Size(poly_lex->seg_lexicons) == 0) {
        DECREF(poly_lex);
        return NULL;
    }

    if (term != NULL) {
        Lex_Seek(poly_lex, term);
    }

    return (Lexicon*)poly_lex;
}
Beispiel #4
0
/* Switch the LexiconWriter into temp mode, writing to `temp_outstream`.
 *
 * Throws if the writer already has a `dat_out` stream. Otherwise takes a
 * reference to `temp_outstream`, zeroes the counters, creates a term stepper
 * from the FieldType of `field`, resets the term info stepper and sets the
 * `temp_mode` flag.
 */
void
LexWriter_enter_temp_mode(LexiconWriter *self, const CharBuf *field, 
                          OutStream *temp_outstream)
{
    Schema    *schema     = LexWriter_Get_Schema(self);
    FieldType *field_type = Schema_Fetch_Type(schema, field);

    // Refuse to proceed while an output stream is already assigned.
    if (self->dat_out) {
        THROW(ERR, "Can't enter temp mode (filename: %o) ", self->dat_file);
    }
    self->dat_out = (OutStream*)INCREF(temp_outstream);

    // Fresh counters for the temp run.
    self->ix_count = 0;
    self->count    = 0;

    // New term stepper for this field's type; existing tinfo stepper is
    // reset rather than replaced.
    self->term_stepper = FType_Make_Term_Stepper(field_type);
    TermStepper_Reset(self->tinfo_stepper);

    // Flag temp mode as active.
    self->temp_mode = true;
}
/* Initialize a PostingPool in place and return it.
 *
 * Runs the SortExRun initializer on `self`, resets the pool's bookkeeping
 * members, creates a LexStepper for `field` using the skip interval from the
 * schema's Architecture, takes references to `schema` and `mem_pool`, stores
 * a clone of `field`, and derives the posting (a clone of the schema's
 * posting for `field`), the field type and the comparison callback.
 */
PostingPool*
PostPool_init(PostingPool *self, Schema *schema, 
              const CharBuf *field, MemoryPool *mem_pool)
{
    Architecture *arch = Schema_Get_Architecture(schema);
    Posting      *schema_posting;

    /* Parent initializer first. */
    SortExRun_init((SortExRun*)self);

    /* Start offsets begin at I64_MAX, end offsets at 0. */
    self->lex_start     = I64_MAX;
    self->lex_end       = 0;
    self->post_start    = I64_MAX;
    self->post_end      = 0;

    /* No input streams, doc map or scratch buffer yet. */
    self->lex_instream  = NULL;
    self->post_instream = NULL;
    self->doc_map       = NULL;
    self->scratch       = NULL;
    self->scratch_cap   = 0;

    /* Flags and counters. */
    self->flipped       = false;
    self->from_seg      = false;
    self->mem_thresh    = 0;
    self->doc_base      = 0;
    self->last_doc_id   = 0;
    self->post_count    = 0;

    /* Stepper configured with the architecture's skip interval. */
    self->lex_stepper = LexStepper_new(field, Arch_Skip_Interval(arch));

    /* Keep references to the caller's objects; the field name is cloned. */
    self->schema   = (Schema*)INCREF(schema);
    self->mem_pool = (MemoryPool*)INCREF(mem_pool);
    self->field    = CB_Clone(field);

    /* The pool stores a clone of the schema's posting for this field. */
    schema_posting = Schema_Fetch_Posting(schema, field);
    self->posting  = (Posting*)Post_Clone(schema_posting);
    self->type     = (FieldType*)INCREF(Schema_Fetch_Type(schema, field));
    self->compare  = PostPoolQ_compare_rawp;

    return self;
}
Beispiel #6
0
/* Read the stored fields of document `doc_id` and return them as a HitDoc.
 *
 * The index stream is read at byte offset `doc_id * 8` to obtain the start
 * position in the data stream. The data stream then yields a field count
 * followed, per field, by a length-prefixed field name and a value whose
 * encoding is selected by the primitive id of the field's FieldType.
 *
 * Throws if a stored field name has no FieldType in the schema, or if the
 * FieldType's primitive id is not one of the handled cases.
 *
 * Returns a new HitDoc built from the decoded fields, `doc_id` and a score
 * of 0.0.
 */
HitDoc*
DefDocReader_Fetch_Doc_IMP(DefaultDocReader *self, int32_t doc_id) {
    DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self);
    Schema   *const schema = ivars->schema;
    InStream *const dat_in = ivars->dat_in;
    InStream *const ix_in  = ivars->ix_in;
    Hash     *const fields = Hash_new(1);
    int64_t   start;
    uint32_t  num_fields;
    uint32_t  field_name_cap = 31;
    // Sizes are widened to size_t before adding 1 so that a length of
    // UINT32_MAX cannot wrap around to a zero-byte allocation.
    char     *field_name = (char*)MALLOCATE((size_t)field_name_cap + 1);

    // Get data file pointer from index, read number of fields.
    InStream_Seek(ix_in, (int64_t)doc_id * 8);
    start = InStream_Read_U64(ix_in);
    InStream_Seek(dat_in, start);
    num_fields = InStream_Read_C32(dat_in);

    // Decode stored data and build up the doc field by field.
    while (num_fields--) {
        uint32_t   field_name_len;
        Obj       *value;
        FieldType *type;

        // Read field name, growing the reusable buffer when needed.
        field_name_len = InStream_Read_C32(dat_in);
        if (field_name_len > field_name_cap) {
            field_name_cap = field_name_len;
            field_name     = (char*)REALLOCATE(field_name,
                                               (size_t)field_name_cap + 1);
        }
        InStream_Read_Bytes(dat_in, field_name, field_name_len);

        // Find the Field's FieldType.
        StackString *field_name_str
            = SSTR_WRAP_UTF8(field_name, field_name_len);
        type = Schema_Fetch_Type(schema, (String*)field_name_str);
        if (type == NULL) {
            // Without a FieldType the value encoding is unknown; fail with
            // a message instead of invoking a method on NULL.
            THROW(ERR, "No FieldType in schema for stored field '%o'",
                  field_name_str);
        }

        // Read the field value.
        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT: {
                    uint32_t value_len = InStream_Read_C32(dat_in);
                    char *buf = (char*)MALLOCATE((size_t)value_len + 1);
                    InStream_Read_Bytes(dat_in, buf, value_len);
                    buf[value_len] = '\0'; 
                    // The new String takes over `buf`.
                    value = (Obj*)Str_new_steal_utf8(buf, value_len);
                    break;
                }
            case FType_BLOB: {
                    uint32_t value_len = InStream_Read_C32(dat_in);
                    char *buf = (char*)MALLOCATE(value_len);
                    InStream_Read_Bytes(dat_in, buf, value_len);
                    // The new ByteBuf takes over `buf`.
                    value = (Obj*)BB_new_steal_bytes(
                                buf, value_len, value_len);
                    break;
                }
            case FType_FLOAT32:
                value = (Obj*)Float32_new(
                                InStream_Read_F32(dat_in));
                break;
            case FType_FLOAT64:
                value = (Obj*)Float64_new(
                                InStream_Read_F64(dat_in));
                break;
            case FType_INT32:
                value = (Obj*)Int32_new(
                                (int32_t)InStream_Read_C32(dat_in));
                break;
            case FType_INT64:
                value = (Obj*)Int64_new(
                                (int64_t)InStream_Read_C64(dat_in));
                break;
            default:
                value = NULL;
                THROW(ERR, "Unrecognized type: %o", type);
        }

        // Store the value.
        Hash_Store_Utf8(fields, field_name, field_name_len, value);
    }
    FREEMEM(field_name);

    // Our local reference to `fields` is dropped once the HitDoc exists.
    HitDoc *retval = HitDoc_new(fields, doc_id, 0.0);
    DECREF(fields);
    return retval;
}
Beispiel #7
0
/* Build the SortCache for `field`, store it in the reader's `caches` hash
 * and return it.
 *
 * Returns NULL when the reader's `counts` hash has no entry for `field` or
 * the entry is zero. Throws when the field has no FieldType or is not
 * sortable, when one of the sort files cannot be opened, or when no
 * SortCache class matches the field's primitive id.
 */
static SortCache*
S_lazy_init_sort_cache(DefaultSortReader *self, String *field) {
    DefaultSortReaderIVARS *const ivars = DefSortReader_IVARS(self);

    // Nothing to build when there are no values for this field.
    Obj     *count_entry = Hash_Fetch(ivars->counts, (Obj*)field);
    int32_t  count       = 0;
    if (count_entry) {
        count = (int32_t)Obj_To_I64(count_entry);
    }
    if (count == 0) { return NULL; }

    // The field must have a type, and that type must be sortable.
    Schema    *schema = DefSortReader_Get_Schema(self);
    FieldType *type   = Schema_Fetch_Type(schema, field);
    if (type == NULL || !FType_Sortable(type)) {
        THROW(ERR, "'%o' isn't a sortable field", field);
    }

    Folder  *folder    = DefSortReader_Get_Folder(self);
    Segment *segment   = DefSortReader_Get_Segment(self);
    String  *seg_name  = Seg_Get_Name(segment);
    int32_t  field_num = Seg_Field_Num(segment, field);
    int8_t   prim_id   = FType_Primitive_ID(type);

    // Only text and blob fields get an ".ix" file opened.
    bool var_width = (prim_id == FType_TEXT || prim_id == FType_BLOB);

    // Ordinals file.
    String   *ord_path = Str_newf("%o/sort-%i32.ord", seg_name, field_num);
    InStream *ord_in   = Folder_Open_In(folder, ord_path);
    DECREF(ord_path);
    if (ord_in == NULL) {
        THROW(ERR, "Error building sort cache for '%o': %o",
              field, Err_get_error());
    }

    // Index file, variable-width fields only.
    InStream *ix_in = NULL;
    if (var_width) {
        String *ix_path = Str_newf("%o/sort-%i32.ix", seg_name, field_num);
        ix_in = Folder_Open_In(folder, ix_path);
        DECREF(ix_path);
        if (ix_in == NULL) {
            THROW(ERR, "Error building sort cache for '%o': %o",
                  field, Err_get_error());
        }
    }

    // Data file.
    String   *dat_path = Str_newf("%o/sort-%i32.dat", seg_name, field_num);
    InStream *dat_in   = Folder_Open_In(folder, dat_path);
    DECREF(dat_path);
    if (dat_in == NULL) {
        THROW(ERR, "Error building sort cache for '%o': %o",
              field, Err_get_error());
    }

    // Null ordinal defaults to -1 when the reader has no entry.
    int32_t  null_ord       = -1;
    Obj     *null_ord_entry = Hash_Fetch(ivars->null_ords, (Obj*)field);
    if (null_ord_entry) {
        null_ord = (int32_t)Obj_To_I64(null_ord_entry);
    }

    // Ordinal width is taken from the reader when present, otherwise it is
    // calculated from the value count.
    int32_t  ord_width;
    Obj     *ord_width_entry = Hash_Fetch(ivars->ord_widths, (Obj*)field);
    if (ord_width_entry) {
        ord_width = (int32_t)Obj_To_I64(ord_width_entry);
    }
    else {
        ord_width = S_calc_ord_width(count);
    }

    int32_t doc_max = (int32_t)Seg_Get_Count(segment);

    // Pick the SortCache subclass matching the primitive type.
    SortCache *cache = NULL;
    switch (prim_id & FType_PRIMITIVE_ID_MASK) {
        case FType_TEXT:
            cache = (SortCache*)TextSortCache_new(field, type, count, doc_max,
                                                  null_ord, ord_width, ord_in,
                                                  ix_in, dat_in);
            break;
        case FType_INT32:
            cache = (SortCache*)I32SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_INT64:
            cache = (SortCache*)I64SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_FLOAT32:
            cache = (SortCache*)F32SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_FLOAT64:
            cache = (SortCache*)F64SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        default:
            THROW(ERR, "No SortCache class for %o", type);
    }
    Hash_Store(ivars->caches, (Obj*)field, (Obj*)cache);

    // Format 2 needs native ords switched on (bug compatibility).
    if (ivars->format == 2) {
        SortCache_Set_Native_Ords(cache, true);
    }

    // Drop this function's references to the streams.
    DECREF(ord_in);
    DECREF(ix_in);
    DECREF(dat_in);

    return cache;
}
Beispiel #8
0
/* Initialize a HitQueue that orders hits according to `sort_spec`.
 *
 * With a `sort_spec`, one comparison action is recorded per usable sort
 * rule: score and doc-id rules always produce an action; field rules produce
 * an action only when the schema has a FieldType for the field, and are
 * skipped otherwise. `num_actions` ends up as the number of actions actually
 * recorded. Throws if `sort_spec` is given without `schema`, or if a rule
 * has an unknown type.
 *
 * Without a `sort_spec`, the queue compares by score and then by doc id.
 *
 * Returns the result of PriQ_init() for `self` with capacity `wanted`.
 */
HitQueue*
HitQ_init(HitQueue *self, Schema *schema, SortSpec *sort_spec,
          uint32_t wanted) {
    HitQueueIVARS *const ivars = HitQ_IVARS(self);
    if (sort_spec) {
        VArray   *rules      = SortSpec_Get_Rules(sort_spec);
        uint32_t  num_rules  = VA_Get_Size(rules);
        uint32_t  action_num = 0;

        if (!schema) {
            THROW(ERR, "Can't supply sort_spec without schema");
        }

        // Both arrays are sized for the worst case of one action per rule.
        ivars->need_values = false;
        ivars->actions     = (uint8_t*)MALLOCATE(num_rules * sizeof(uint8_t));
        ivars->field_types = (FieldType**)CALLOCATE(num_rules, sizeof(FieldType*));

        for (uint32_t i = 0; i < num_rules; i++) {
            SortRule *rule      = (SortRule*)VA_Fetch(rules, i);
            int32_t   rule_type = SortRule_Get_Type(rule);
            bool      reverse   = SortRule_Get_Reverse(rule);

            if (rule_type == SortRule_SCORE) {
                ivars->actions[action_num++] = reverse
                                              ? COMPARE_BY_SCORE_REV
                                              : COMPARE_BY_SCORE;
            }
            else if (rule_type == SortRule_DOC_ID) {
                ivars->actions[action_num++] = reverse
                                              ? COMPARE_BY_DOC_ID_REV
                                              : COMPARE_BY_DOC_ID;
            }
            else if (rule_type == SortRule_FIELD) {
                String    *field = SortRule_Get_Field(rule);
                FieldType *type  = Schema_Fetch_Type(schema, field);
                if (type) {
                    ivars->field_types[action_num] = (FieldType*)INCREF(type);
                    ivars->actions[action_num++] = reverse
                                                  ? COMPARE_BY_VALUE_REV
                                                  : COMPARE_BY_VALUE;
                    ivars->need_values = true;
                }
                else {
                    // Skip over fields we don't know how to sort on.
                    continue;
                }
            }
            else {
                THROW(ERR, "Unknown SortRule type: %i32", rule_type);
            }
        }

        // Count only the actions that were written. Skipped rules leave the
        // tail of `actions` uninitialized, so it must not be counted.
        ivars->num_actions = action_num;
    }
    else {
        ivars->num_actions = 2;
        ivars->actions     = (uint8_t*)MALLOCATE(ivars->num_actions * sizeof(uint8_t));
        ivars->actions[0]  = COMPARE_BY_SCORE;
        ivars->actions[1]  = COMPARE_BY_DOC_ID;
    }

    return (HitQueue*)PriQ_init((PriorityQueue*)self, wanted);
}
Beispiel #9
0
/* Initialize a SortCollector that gathers up to `wanted` hits ordered by
 * `sort_spec`, or by the default sort rules when `sort_spec` is NULL.
 *
 * Throws if a `sort_spec` is supplied without a `schema`, if the rule list
 * is empty, or if a field rule names a field that has no FieldType or is
 * not sortable.
 *
 * Returns `self`.
 */
SortCollector*
SortColl_init(SortCollector *self, Schema *schema, SortSpec *sort_spec,
              uint32_t wanted) {
    // Validate the argument combination before acquiring the rules, so that
    // this error path holds no reference that would need releasing.
    if (sort_spec && !schema) {
        THROW(ERR, "Can't supply a SortSpec without a Schema.");
    }

    // Either way we end up owning one reference to `rules`.
    VArray *rules = sort_spec
                    ? (VArray*)INCREF(SortSpec_Get_Rules(sort_spec))
                    : S_default_sort_rules();
    uint32_t num_rules = VA_Get_Size(rules);
    if (!num_rules) {
        // Give back our reference before bailing out.
        DECREF(rules);
        THROW(ERR, "Can't supply a SortSpec with no SortRules.");
    }

    // Init.
    Coll_init((Collector*)self);
    SortCollectorIVARS *const ivars = SortColl_IVARS(self);
    ivars->total_hits    = 0;
    ivars->bubble_doc    = INT32_MAX;
    ivars->bubble_score  = F32_NEGINF;
    ivars->seg_doc_max   = 0;

    // Assign.
    ivars->wanted        = wanted;

    // Derive.
    ivars->hit_q         = HitQ_new(schema, sort_spec, wanted);
    ivars->rules         = rules; // absorb refcount.
    ivars->num_rules     = num_rules;
    ivars->sort_caches   = (SortCache**)CALLOCATE(num_rules, sizeof(SortCache*));
    ivars->ord_arrays    = (void**)CALLOCATE(num_rules, sizeof(void*));
    ivars->actions       = (uint8_t*)CALLOCATE(num_rules, sizeof(uint8_t));

    // Build up an array of "actions" which we will execute during each call
    // to Collect(). Determine whether we need to track scores and field
    // values.
    ivars->need_score  = false;
    ivars->need_values = false;
    for (uint32_t i = 0; i < num_rules; i++) {
        SortRule *rule   = (SortRule*)VA_Fetch(rules, i);
        int32_t rule_type  = SortRule_Get_Type(rule);
        ivars->actions[i] = S_derive_action(rule, NULL);
        if (rule_type == SortRule_SCORE) {
            ivars->need_score = true;
        }
        else if (rule_type == SortRule_FIELD) {
            CharBuf *field = SortRule_Get_Field(rule);
            FieldType *type = Schema_Fetch_Type(schema, field);
            if (!type || !FType_Sortable(type)) {
                THROW(ERR, "'%o' isn't a sortable field", field);
            }
            ivars->need_values = true;
        }
    }

    // Perform an optimization.  So long as we always collect docs in
    // ascending order, Collect() will favor lower doc numbers -- so we may
    // not need to execute a final COMPARE_BY_DOC_ID action.
    ivars->num_actions = num_rules;
    if (ivars->actions[num_rules - 1] == COMPARE_BY_DOC_ID) {
        ivars->num_actions--;
    }

    // Override our derived actions with an action which will be executed
    // automatically until the queue fills up.
    ivars->auto_actions    = (uint8_t*)MALLOCATE(1);
    ivars->auto_actions[0] = wanted ? AUTO_ACCEPT : AUTO_REJECT;
    ivars->derived_actions = ivars->actions;
    ivars->actions         = ivars->auto_actions;


    // Prepare a MatchDoc-in-waiting.
    VArray *values = ivars->need_values ? VA_new(num_rules) : NULL;
    float   score  = ivars->need_score  ? F32_NEGINF : F32_NAN;
    ivars->bumped = MatchDoc_new(INT32_MAX, score, values);
    DECREF(values);

    return self;
}
Beispiel #10
0
/* Initialize a SortCache for field number `field_num` of `segment`.
 *
 * Looks up the field's FieldType in `schema`, opens the segment's
 * "sort-N.ord", "sort-N.ix" and "sort-N.dat" files in `folder`, derives the
 * number of unique values and the ord bit width from the length of the ix
 * file, checks that the ord file is long enough for the segment's doc count,
 * and obtains buffers over the three files via InStream_Buf().
 *
 * Throws if the field has no FieldType or is not sortable, if any of the
 * three files cannot be opened, or if the ord file is too short. The
 * temporary file name strings are released on every path, including the
 * error paths.
 *
 * Returns `self`.
 */
SortCache*
SortCache_init(SortCache *self, Schema *schema, Folder *folder,
               Segment *segment, i32_t field_num)
{
    CharBuf *field    = Seg_Field_Name(segment, field_num);
    CharBuf *seg_name = Seg_Get_Name(segment);
    CharBuf *ord_file;
    CharBuf *ix_file;
    CharBuf *dat_file;
    i64_t ord_len, ix_len, dat_len;

    /* Derive.  The sortability check runs before any file name is
     * allocated, so throwing here leaves nothing to release. */
    self->doc_max = Seg_Get_Count(segment);
    self->type    = Schema_Fetch_Type(schema, field);
    if (!self->type || !FType_Sortable(self->type)) {
        THROW("'%o' isn't a sortable field", field);
    }

    /* Build file names. */
    ord_file = CB_newf("%o/sort-%i32.ord", seg_name, field_num);
    ix_file  = CB_newf("%o/sort-%i32.ix",  seg_name, field_num);
    dat_file = CB_newf("%o/sort-%i32.dat", seg_name, field_num);

    /* Open instreams. */
    self->ord_in  = Folder_Open_In(folder, ord_file);
    self->ix_in   = Folder_Open_In(folder, ix_file);
    self->dat_in  = Folder_Open_In(folder, dat_file);
    if (!self->ix_in || !self->dat_in || !self->ord_in) {
        /* The message is built first because it refers to the names. */
        CharBuf *mess = MAKE_MESS("Can't open either %o, %o or %o", ord_file, 
            ix_file, dat_file);
        DECREF(ord_file);
        DECREF(ix_file);
        DECREF(dat_file);
        Err_throw_mess(mess);
    }
    ord_len = InStream_Length(self->ord_in);
    ix_len  = InStream_Length(self->ix_in);
    dat_len = InStream_Length(self->dat_in);

    /* Calculate the number of unique values and derive the ord bit width. */
    self->num_uniq = (i32_t)(ix_len / 8) - 1; 
    self->width    = S_calc_width(self->num_uniq);

    /* Validate file lengths. */
    {
        double bytes_per_doc = self->width / 8.0;
        double max_ords      = ord_len / bytes_per_doc;
        if (max_ords < self->doc_max + 1) {
            /* The message does not use the file names; release them so
             * this error path does not leak. */
            DECREF(ord_file);
            DECREF(ix_file);
            DECREF(dat_file);
            THROW("Conflict between ord count max %f64 and doc_max %i32", 
                max_ords, self->doc_max);
        }
    }

    /* Mmap ords, offsets and character data. */
    self->ords      = InStream_Buf(self->ord_in, (size_t)ord_len);
    self->offsets   = (i64_t*)InStream_Buf(self->ix_in, (size_t)ix_len);
    self->char_data = InStream_Buf(self->dat_in, (size_t)dat_len);
    {
        char *offs            = (char*)self->offsets;
        self->offsets_limit   = (i64_t*)(offs + ix_len);
        self->char_data_limit = self->char_data + dat_len;
    }

    DECREF(ord_file);
    DECREF(ix_file);
    DECREF(dat_file);

    return self;
}