SortFieldWriter*
SortFieldWriter_init(SortFieldWriter *self, Schema *schema,
                     Snapshot *snapshot, Segment *segment,
                     PolyReader *polyreader, String *field,
                     Counter *counter, size_t mem_thresh,
                     OutStream *temp_ord_out, OutStream *temp_ix_out,
                     OutStream *temp_dat_out) {
    // Init.
    SortEx_init((SortExternal*)self);
    SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self);
    ivars->null_ord        = -1;
    ivars->count           = 0;
    ivars->ord_start       = 0;
    ivars->ord_end         = 0;
    ivars->ix_start        = 0;
    ivars->ix_end          = 0;
    ivars->dat_start       = 0;
    ivars->dat_end         = 0;
    ivars->run_cardinality = -1;
    ivars->run_max         = -1;
    ivars->sort_cache      = NULL;
    ivars->doc_map         = NULL;
    ivars->sorted_ids      = NULL;
    ivars->run_tick        = 1;
    ivars->ord_width       = 0;

    // Assign.
    ivars->field        = Str_Clone(field);
    ivars->schema       = (Schema*)INCREF(schema);
    ivars->snapshot     = (Snapshot*)INCREF(snapshot);
    ivars->segment      = (Segment*)INCREF(segment);
    ivars->polyreader   = (PolyReader*)INCREF(polyreader);
    ivars->counter      = (Counter*)INCREF(counter);
    ivars->temp_ord_out = (OutStream*)INCREF(temp_ord_out);
    ivars->temp_ix_out  = (OutStream*)INCREF(temp_ix_out);
    ivars->temp_dat_out = (OutStream*)INCREF(temp_dat_out);
    ivars->mem_thresh   = mem_thresh;

    // Derive.
    ivars->field_num = Seg_Field_Num(segment, field);
    FieldType *type = (FieldType*)CERTIFY(
                          Schema_Fetch_Type(ivars->schema, field), FIELDTYPE);
    ivars->type    = (FieldType*)INCREF(type);
    ivars->prim_id = FType_Primitive_ID(type);
    ivars->mem_per_entry = Class_Get_Obj_Alloc_Size(SFWRITERELEM);
    if (ivars->prim_id == FType_TEXT) {
        ivars->mem_per_entry += Class_Get_Obj_Alloc_Size(STRING);
        ivars->var_width = true;
    }
    else if (ivars->prim_id == FType_BLOB) {
        ivars->mem_per_entry += Class_Get_Obj_Alloc_Size(BLOB);
        ivars->var_width = true;
    }
    else {
        ivars->mem_per_entry += Class_Get_Obj_Alloc_Size(FLOAT);
        ivars->var_width = false;
    }

    return self;
}
Пример #2
0
InverterEntry*
InvEntry_init(InverterEntry *self, Schema *schema, String *field,
              int32_t field_num) {
    InverterEntryIVARS *const ivars = InvEntry_IVARS(self);
    ivars->field_num  = field_num;
    ivars->field      = field ? Str_Clone(field) : NULL;
    ivars->inversion  = NULL;

    if (schema) {
        ivars->analyzer
            = (Analyzer*)INCREF(Schema_Fetch_Analyzer(schema, field));
        ivars->sim  = (Similarity*)INCREF(Schema_Fetch_Sim(schema, field));
        ivars->type = (FieldType*)INCREF(Schema_Fetch_Type(schema, field));
        if (!ivars->type) { THROW(ERR, "Unknown field: '%o'", field); }

        uint8_t prim_id = FType_Primitive_ID(ivars->type);
        switch (prim_id & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT:
                ivars->value = NULL;
                break;
            case FType_BLOB:
                ivars->value = (Obj*)ViewBB_new(NULL, 0);
                break;
            case FType_INT32:
                ivars->value = (Obj*)Int32_new(0);
                break;
            case FType_INT64:
                ivars->value = (Obj*)Int64_new(0);
                break;
            case FType_FLOAT32:
                ivars->value = (Obj*)Float32_new(0);
                break;
            case FType_FLOAT64:
                ivars->value = (Obj*)Float64_new(0);
                break;
            default:
                THROW(ERR, "Unrecognized primitive id: %i8", prim_id);
        }

        ivars->indexed = FType_Indexed(ivars->type);
        if (ivars->indexed && FType_Is_A(ivars->type, NUMERICTYPE)) {
            THROW(ERR, "Field '%o' spec'd as indexed, but numerical types cannot "
                  "be indexed yet", field);
        }
        if (FType_Is_A(ivars->type, FULLTEXTTYPE)) {
            ivars->highlightable
                = FullTextType_Highlightable((FullTextType*)ivars->type);
        }
    }
    return self;
}
Пример #3
0
void
Inverter_Invert_Doc_IMP(Inverter *self, Doc *doc) {
    InverterIVARS *const ivars = Inverter_IVARS(self);
    Hash *const fields = (Hash*)Doc_Get_Fields(doc);

    // Prepare for the new doc.
    Inverter_Set_Doc(self, doc);

    // Extract and invert the doc's fields.
    HashIterator *iter = HashIter_new(fields);
    while (HashIter_Next(iter)) {
        String *field = HashIter_Get_Key(iter);
        Obj    *obj   = HashIter_Get_Value(iter);

        InverterEntry *inventry = S_fetch_entry(ivars, field);
        InverterEntryIVARS *inventry_ivars = InvEntry_IVARS(inventry);
        FieldType *type = inventry_ivars->type;

        // Get the field value.
        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT: {
                    CERTIFY(obj, STRING);
                    break;
                }
            case FType_BLOB: {
                    CERTIFY(obj, BLOB);
                    break;
                }
            case FType_INT32:
            case FType_INT64: {
                    CERTIFY(obj, INTEGER);
                    break;
                }
            case FType_FLOAT32:
            case FType_FLOAT64: {
                    CERTIFY(obj, FLOAT);
                    break;
                }
            default:
                THROW(ERR, "Unrecognized type: %o", type);
        }

        if (inventry_ivars->value != obj) {
            DECREF(inventry_ivars->value);
            inventry_ivars->value = INCREF(obj);
        }

        Inverter_Add_Field(self, inventry);
    }
    DECREF(iter);
}
Пример #4
0
HitDoc*
DefDocReader_Fetch_Doc_IMP(DefaultDocReader *self, int32_t doc_id) {
    DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self);
    Schema   *const schema = ivars->schema;
    InStream *const dat_in = ivars->dat_in;
    InStream *const ix_in  = ivars->ix_in;
    Hash     *const fields = Hash_new(1);
    int64_t   start;
    uint32_t  num_fields;
    uint32_t  field_name_cap = 31;
    char     *field_name = (char*)MALLOCATE(field_name_cap + 1);

    // Get data file pointer from index, read number of fields.
    InStream_Seek(ix_in, (int64_t)doc_id * 8);
    start = InStream_Read_U64(ix_in);
    InStream_Seek(dat_in, start);
    num_fields = InStream_Read_C32(dat_in);

    // Decode stored data and build up the doc field by field.
    while (num_fields--) {
        uint32_t        field_name_len;
        Obj       *value;
        FieldType *type;

        // Read field name.
        field_name_len = InStream_Read_C32(dat_in);
        if (field_name_len > field_name_cap) {
            field_name_cap = field_name_len;
            field_name     = (char*)REALLOCATE(field_name,
                                                    field_name_cap + 1);
        }
        InStream_Read_Bytes(dat_in, field_name, field_name_len);

        // Find the Field's FieldType.
        StackString *field_name_str
            = SSTR_WRAP_UTF8(field_name, field_name_len);
        type = Schema_Fetch_Type(schema, (String*)field_name_str);

        // Read the field value.
        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT: {
                    uint32_t value_len = InStream_Read_C32(dat_in);
                    char *buf = (char*)MALLOCATE(value_len + 1);
                    InStream_Read_Bytes(dat_in, buf, value_len);
                    buf[value_len] = '\0'; 
                    value = (Obj*)Str_new_steal_utf8(buf, value_len);
                    break;
                }
            case FType_BLOB: {
                    uint32_t value_len = InStream_Read_C32(dat_in);
                    char *buf = (char*)MALLOCATE(value_len);
                    InStream_Read_Bytes(dat_in, buf, value_len);
                    value = (Obj*)BB_new_steal_bytes(
                                buf, value_len, value_len);
                    break;
                }
            case FType_FLOAT32:
                value = (Obj*)Float32_new(
                                InStream_Read_F32(dat_in));
                break;
            case FType_FLOAT64:
                value = (Obj*)Float64_new(
                                InStream_Read_F64(dat_in));
                break;
            case FType_INT32:
                value = (Obj*)Int32_new(
                                (int32_t)InStream_Read_C32(dat_in));
                break;
            case FType_INT64:
                value = (Obj*)Int64_new(
                                (int64_t)InStream_Read_C64(dat_in));
                break;
            default:
                value = NULL;
                THROW(ERR, "Unrecognized type: %o", type);
        }

        // Store the value.
        Hash_Store_Utf8(fields, field_name, field_name_len, value);
    }
    FREEMEM(field_name);

    HitDoc *retval = HitDoc_new(fields, doc_id, 0.0);
    DECREF(fields);
    return retval;
}
Пример #5
0
void
Inverter_invert_doc(Inverter *self, Doc *doc) {
    InverterIVARS *const ivars = Inverter_IVARS(self);
    Hash *const fields = (Hash*)Doc_Get_Fields(doc);
    uint32_t   num_keys     = Hash_Iterate(fields);

    // Prepare for the new doc.
    Inverter_Set_Doc(self, doc);

    // Extract and invert the doc's fields.
    while (num_keys--) {
        Obj *key, *obj;
        Hash_Next(fields, &key, &obj);
        CharBuf *field = (CharBuf*)CERTIFY(key, CHARBUF);
        InverterEntry *inventry = S_fetch_entry(ivars, field);
        InverterEntryIVARS *inventry_ivars = InvEntry_IVARS(inventry);
        FieldType *type = inventry_ivars->type;

        // Get the field value.
        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
            case FType_TEXT: {
                    CharBuf *char_buf
                        = (CharBuf*)CERTIFY(obj, CHARBUF);
                    ViewCharBuf *value
                        = (ViewCharBuf*)inventry_ivars->value;
                    ViewCB_Assign(value, char_buf);
                    break;
                }
            case FType_BLOB: {
                    ByteBuf *byte_buf
                        = (ByteBuf*)CERTIFY(obj, BYTEBUF);
                    ViewByteBuf *value
                        = (ViewByteBuf*)inventry_ivars->value;
                    ViewBB_Assign(value, byte_buf);
                    break;
                }
            case FType_INT32: {
                    int32_t int_val = (int32_t)Obj_To_I64(obj);
                    Integer32* value = (Integer32*)inventry_ivars->value;
                    Int32_Set_Value(value, int_val);
                    break;
                }
            case FType_INT64: {
                    int64_t int_val = Obj_To_I64(obj);
                    Integer64* value = (Integer64*)inventry_ivars->value;
                    Int64_Set_Value(value, int_val);
                    break;
                }
            case FType_FLOAT32: {
                    float float_val = (float)Obj_To_F64(obj);
                    Float32* value = (Float32*)inventry_ivars->value;
                    Float32_Set_Value(value, float_val);
                    break;
                }
            case FType_FLOAT64: {
                    double float_val = Obj_To_F64(obj);
                    Float64* value = (Float64*)inventry_ivars->value;
                    Float64_Set_Value(value, float_val);
                    break;
                }
            default:
                THROW(ERR, "Unrecognized type: %o", type);
        }

        Inverter_Add_Field(self, inventry);
    }
}
Пример #6
0
static SortCache*
S_lazy_init_sort_cache(DefaultSortReader *self, String *field) {
    DefaultSortReaderIVARS *const ivars = DefSortReader_IVARS(self);

    // See if we have any values.
    Obj *count_obj = Hash_Fetch(ivars->counts, (Obj*)field);
    int32_t count = count_obj ? (int32_t)Obj_To_I64(count_obj) : 0;
    if (!count) { return NULL; }

    // Get a FieldType and sanity check that the field is sortable.
    Schema    *schema = DefSortReader_Get_Schema(self);
    FieldType *type   = Schema_Fetch_Type(schema, field);
    if (!type || !FType_Sortable(type)) {
        THROW(ERR, "'%o' isn't a sortable field", field);
    }

    // Open streams.
    Folder    *folder    = DefSortReader_Get_Folder(self);
    Segment   *segment   = DefSortReader_Get_Segment(self);
    String    *seg_name  = Seg_Get_Name(segment);
    int32_t    field_num = Seg_Field_Num(segment, field);
    int8_t     prim_id   = FType_Primitive_ID(type);
    bool       var_width = (prim_id == FType_TEXT || prim_id == FType_BLOB)
                           ? true
                           : false;
    String *ord_path = Str_newf("%o/sort-%i32.ord", seg_name, field_num);
    InStream *ord_in = Folder_Open_In(folder, ord_path);
    DECREF(ord_path);
    if (!ord_in) {
        THROW(ERR, "Error building sort cache for '%o': %o",
              field, Err_get_error());
    }
    InStream *ix_in = NULL;
    if (var_width) {
        String *ix_path = Str_newf("%o/sort-%i32.ix", seg_name, field_num);
        ix_in = Folder_Open_In(folder, ix_path);
        DECREF(ix_path);
        if (!ix_in) {
            THROW(ERR, "Error building sort cache for '%o': %o",
                  field, Err_get_error());
        }
    }
    String *dat_path = Str_newf("%o/sort-%i32.dat", seg_name, field_num);
    InStream *dat_in = Folder_Open_In(folder, dat_path);
    DECREF(dat_path);
    if (!dat_in) {
        THROW(ERR, "Error building sort cache for '%o': %o",
              field, Err_get_error());
    }

    Obj     *null_ord_obj = Hash_Fetch(ivars->null_ords, (Obj*)field);
    int32_t  null_ord = null_ord_obj ? (int32_t)Obj_To_I64(null_ord_obj) : -1;
    Obj     *ord_width_obj = Hash_Fetch(ivars->ord_widths, (Obj*)field);
    int32_t  ord_width = ord_width_obj
                         ? (int32_t)Obj_To_I64(ord_width_obj)
                         : S_calc_ord_width(count);
    int32_t  doc_max = (int32_t)Seg_Get_Count(segment);

    SortCache *cache = NULL;
    switch (prim_id & FType_PRIMITIVE_ID_MASK) {
        case FType_TEXT:
            cache = (SortCache*)TextSortCache_new(field, type, count, doc_max,
                                                  null_ord, ord_width, ord_in,
                                                  ix_in, dat_in);
            break;
        case FType_INT32:
            cache = (SortCache*)I32SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_INT64:
            cache = (SortCache*)I64SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_FLOAT32:
            cache = (SortCache*)F32SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_FLOAT64:
            cache = (SortCache*)F64SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        default:
            THROW(ERR, "No SortCache class for %o", type);
    }
    Hash_Store(ivars->caches, (Obj*)field, (Obj*)cache);

    if (ivars->format == 2) { // bug compatibility
        SortCache_Set_Native_Ords(cache, true);
    }

    DECREF(ord_in);
    DECREF(ix_in);
    DECREF(dat_in);

    return cache;
}
Пример #7
0
void
DocWriter_Add_Inverted_Doc_IMP(DocWriter *self, Inverter *inverter,
                               int32_t doc_id) {
    DocWriterIVARS *const ivars = DocWriter_IVARS(self);
    OutStream *dat_out    = S_lazy_init(self);
    OutStream *ix_out     = ivars->ix_out;
    uint32_t   num_stored = 0;
    int64_t    start      = OutStream_Tell(dat_out);
    int64_t    expected   = OutStream_Tell(ix_out) / 8;

    // Verify doc id.
    if (doc_id != expected) {
        THROW(ERR, "Expected doc id %i64 but got %i32", expected, doc_id);
    }

    // Write the number of stored fields.
    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (FType_Stored(type)) { num_stored++; }
    }
    OutStream_Write_C32(dat_out, num_stored);

    Inverter_Iterate(inverter);
    while (Inverter_Next(inverter)) {
        // Only store fields marked as "stored".
        FieldType *type = Inverter_Get_Type(inverter);
        if (FType_Stored(type)) {
            String *field = Inverter_Get_Field_Name(inverter);
            Obj *value = Inverter_Get_Value(inverter);
            Freezer_serialize_string(field, dat_out);
            switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
                case FType_TEXT: {
                    const char *buf  = Str_Get_Ptr8((String*)value);
                    size_t      size = Str_Get_Size((String*)value);
                    OutStream_Write_C32(dat_out, size);
                    OutStream_Write_Bytes(dat_out, buf, size);
                    break;
                }
                case FType_BLOB: {
                    char   *buf  = BB_Get_Buf((ByteBuf*)value);
                    size_t  size = BB_Get_Size((ByteBuf*)value);
                    OutStream_Write_C32(dat_out, size);
                    OutStream_Write_Bytes(dat_out, buf, size);
                    break;
                }
                case FType_INT32: {
                    int32_t val = Int32_Get_Value((Integer32*)value);
                    OutStream_Write_C32(dat_out, val);
                    break;
                }
                case FType_INT64: {
                    int64_t val = Int64_Get_Value((Integer64*)value);
                    OutStream_Write_C64(dat_out, val);
                    break;
                }
                case FType_FLOAT32: {
                    float val = Float32_Get_Value((Float32*)value);
                    OutStream_Write_F32(dat_out, val);
                    break;
                }
                case FType_FLOAT64: {
                    double val = Float64_Get_Value((Float64*)value);
                    OutStream_Write_F64(dat_out, val);
                    break;
                }
                default:
                    THROW(ERR, "Unrecognized type: %o", type);
            }
        }
    }

    // Write file pointer.
    OutStream_Write_I64(ix_out, start);
}