Beispiel #1
0
/* Exercise the Segment field registry through Seg_Add_Field,
 * Seg_Field_Name and Seg_Field_Num, reporting every check to `batch`.
 */
static void
test_fields(TestBatch *batch)
{
    Segment       *seg      = Seg_new(1);
    ZombieCharBuf *foo_name = ZCB_WRAP_STR("foo", 3);
    ZombieCharBuf *bar_name = ZCB_WRAP_STR("bar", 3);
    ZombieCharBuf *baz_name = ZCB_WRAP_STR("baz", 3);
    int32_t        num;

    /* Two distinct names are expected to be numbered 1 and 2. */
    num = Seg_Add_Field(seg, (CharBuf*)foo_name);
    TEST_TRUE(batch, 1 == num,
        "Add_Field returns field number, and field numbers start at 1");
    num = Seg_Add_Field(seg, (CharBuf*)bar_name);
    TEST_TRUE(batch, 2 == num, "add a second field");

    /* Adding "foo" a second time should yield its original number. */
    num = Seg_Add_Field(seg, (CharBuf*)foo_name);
    TEST_TRUE(batch, 1 == num,
        "Add_Field returns existing field number if field is already known");

    /* Look up by number: a known one, then an unknown one. */
    TEST_TRUE(batch, ZCB_Equals(bar_name, (Obj*)Seg_Field_Name(seg, 2)),
        "Field_Name");
    TEST_TRUE(batch, NULL == Seg_Field_Name(seg, 3),
        "Field_Name returns NULL for unknown field number");

    /* Look up by name: a known one, then one that was never added. */
    TEST_TRUE(batch, 2 == Seg_Field_Num(seg, (CharBuf*)bar_name),
        "Field_Num");
    TEST_TRUE(batch, 0 == Seg_Field_Num(seg, (CharBuf*)baz_name),
        "Field_Num returns 0 for unknown field name");

    DECREF(seg);
}
Beispiel #2
0
// Exercise the Segment field registry through Seg_Add_Field,
// Seg_Field_Name and Seg_Field_Num, reporting every check to `runner`.
static void
test_fields(TestBatchRunner *runner) {
    Segment     *seg      = Seg_new(1);
    StackString *foo_name = SSTR_WRAP_UTF8("foo", 3);
    StackString *bar_name = SSTR_WRAP_UTF8("bar", 3);
    StackString *baz_name = SSTR_WRAP_UTF8("baz", 3);
    int32_t      num;

    // Two distinct names are expected to be numbered 1 and 2.
    num = Seg_Add_Field(seg, (String*)foo_name);
    TEST_TRUE(runner, 1 == num,
              "Add_Field returns field number, and field numbers start at 1");
    num = Seg_Add_Field(seg, (String*)bar_name);
    TEST_TRUE(runner, 2 == num, "add a second field");

    // Adding "foo" a second time should yield its original number.
    num = Seg_Add_Field(seg, (String*)foo_name);
    TEST_TRUE(runner, 1 == num,
              "Add_Field returns existing field number if field is already known");

    // Look up by number: a known one, then an unknown one.
    TEST_TRUE(runner, SStr_Equals(bar_name, (Obj*)Seg_Field_Name(seg, 2)),
              "Field_Name");
    TEST_TRUE(runner, NULL == Seg_Field_Name(seg, 3),
              "Field_Name returns NULL for unknown field number");

    // Look up by name: a known one, then one that was never added.
    TEST_TRUE(runner, 2 == Seg_Field_Num(seg, (String*)bar_name),
              "Field_Num");
    TEST_TRUE(runner, 0 == Seg_Field_Num(seg, (String*)baz_name),
              "Field_Num returns 0 for unknown field name");

    DECREF(seg);
}
Beispiel #3
0
// Wrap up the field identified by `field_num`: record the term count and the
// index count under the field's name, close and drop the three output
// streams, then drop the term stepper.
void
LexWriter_finish_field(LexiconWriter *self, int32_t field_num)
{
    Obj *const key = (Obj*)Seg_Field_Name(self->segment, field_num);
    CharBuf   *tally;

    // Counts are stored as decimal strings keyed by field name.
    tally = CB_newf("%i32", self->count);
    Hash_Store(self->counts, key, (Obj*)tally);
    tally = CB_newf("%i32", self->ix_count);
    Hash_Store(self->ix_counts, key, (Obj*)tally);

    // Close every stream before releasing any of them.
    OutStream_Close(self->dat_out);
    OutStream_Close(self->ix_out);
    OutStream_Close(self->ixix_out);

    // Release each stream and clear its slot.
    DECREF(self->dat_out);
    self->dat_out = NULL;
    DECREF(self->ix_out);
    self->ix_out = NULL;
    DECREF(self->ixix_out);
    self->ixix_out = NULL;

    // The term stepper is created per field; discard it as well.
    DECREF(self->term_stepper);
    self->term_stepper = NULL;
}
Beispiel #4
0
// Get ready to write the lexicon for field `field_num`: point the three file
// names at this segment and field, open an output stream for each one, and
// reset the per-field counters and steppers.  If a stream cannot be opened,
// the current error is rethrown.
void
LexWriter_start_field(LexiconWriter *self, int32_t field_num)
{
    Segment   *const seg        = LexWriter_Get_Segment(self);
    Folder    *const dir        = LexWriter_Get_Folder(self);
    Schema    *const schema     = LexWriter_Get_Schema(self);
    CharBuf   *const seg_name   = Seg_Get_Name(seg);
    CharBuf   *const field_name = Seg_Field_Name(seg, field_num);
    FieldType *const field_type = Schema_Fetch_Type(schema, field_name);

    // Name all three files first ...
    CB_setf(self->dat_file,  "%o/lexicon-%i32.dat",  seg_name, field_num);
    CB_setf(self->ix_file,   "%o/lexicon-%i32.ix",   seg_name, field_num);
    CB_setf(self->ixix_file, "%o/lexicon-%i32.ixix", seg_name, field_num);

    // ... then open them one at a time, bailing out on the first failure.
    self->dat_out = Folder_Open_Out(dir, self->dat_file);
    if (self->dat_out == NULL) {
        RETHROW(INCREF(Err_get_error()));
    }
    self->ix_out = Folder_Open_Out(dir, self->ix_file);
    if (self->ix_out == NULL) {
        RETHROW(INCREF(Err_get_error()));
    }
    self->ixix_out = Folder_Open_Out(dir, self->ixix_file);
    if (self->ixix_out == NULL) {
        RETHROW(INCREF(Err_get_error()));
    }

    // Fresh counters, a new term stepper for this field's type, and a reset
    // term info stepper.
    self->count        = 0;
    self->ix_count     = 0;
    self->term_stepper = FType_Make_Term_Stepper(field_type);
    TermStepper_Reset(self->tinfo_stepper);
}
Beispiel #5
0
// Return the SortFieldWriter stored for `field_num`, creating and storing it
// on first request.  The three shared temp output streams are opened the
// first time any writer is needed; a failed open rethrows the current error.
static SortFieldWriter*
S_lazy_init_field_writer(SortWriter *self, int32_t field_num) {
    SortWriterIVARS *const ivars = SortWriter_IVARS(self);

    SortFieldWriter *writer
        = (SortFieldWriter*)VA_Fetch(ivars->field_writers, field_num);
    if (writer) { return writer; }

    // Open temp files, unless an earlier call already did.
    if (!ivars->temp_ord_out) {
        Folder  *const dir      = ivars->folder;
        CharBuf *const seg_name = Seg_Get_Name(ivars->segment);
        // One scratch buffer is reused for all three file names.
        CharBuf *scratch = CB_newf("%o/sort_ord_temp", seg_name);

        ivars->temp_ord_out = Folder_Open_Out(dir, scratch);
        if (ivars->temp_ord_out == NULL) {
            DECREF(scratch);
            RETHROW(INCREF(Err_get_error()));
        }

        CB_setf(scratch, "%o/sort_ix_temp", seg_name);
        ivars->temp_ix_out = Folder_Open_Out(dir, scratch);
        if (ivars->temp_ix_out == NULL) {
            DECREF(scratch);
            RETHROW(INCREF(Err_get_error()));
        }

        CB_setf(scratch, "%o/sort_dat_temp", seg_name);
        ivars->temp_dat_out = Folder_Open_Out(dir, scratch);
        if (ivars->temp_dat_out == NULL) {
            DECREF(scratch);
            RETHROW(INCREF(Err_get_error()));
        }

        DECREF(scratch);
    }

    // Build the writer for this field and remember it under its number.
    CharBuf *field_name = Seg_Field_Name(ivars->segment, field_num);
    writer = SortFieldWriter_new(ivars->schema, ivars->snapshot,
                                 ivars->segment, ivars->polyreader,
                                 field_name, ivars->mem_pool,
                                 ivars->mem_thresh, ivars->temp_ord_out,
                                 ivars->temp_ix_out, ivars->temp_dat_out);
    VA_Store(ivars->field_writers, field_num, (Obj*)writer);

    return writer;
}
Beispiel #6
0
// Return the SortFieldWriter stored for `field_num`, creating and storing it
// on first request.  The three shared temp output streams are opened the
// first time any writer is needed; a failed open rethrows the current error.
static SortFieldWriter*
S_lazy_init_field_writer(SortWriter *self, int32_t field_num) {
    SortWriterIVARS *const ivars = SortWriter_IVARS(self);
    const size_t slot = (size_t)field_num;

    SortFieldWriter *writer
        = (SortFieldWriter*)Vec_Fetch(ivars->field_writers, slot);
    if (writer) { return writer; }

    // Open temp files, unless an earlier call already did.
    if (!ivars->temp_ord_out) {
        Folder *const dir      = ivars->folder;
        String *const seg_name = Seg_Get_Name(ivars->segment);
        String *temp_path;

        // Each path string is released right after its open attempt, so the
        // failure branches have nothing left to clean up.
        temp_path = Str_newf("%o/sort_ord_temp", seg_name);
        ivars->temp_ord_out = Folder_Open_Out(dir, temp_path);
        DECREF(temp_path);
        if (ivars->temp_ord_out == NULL) {
            RETHROW(INCREF(Err_get_error()));
        }

        temp_path = Str_newf("%o/sort_ix_temp", seg_name);
        ivars->temp_ix_out = Folder_Open_Out(dir, temp_path);
        DECREF(temp_path);
        if (ivars->temp_ix_out == NULL) {
            RETHROW(INCREF(Err_get_error()));
        }

        temp_path = Str_newf("%o/sort_dat_temp", seg_name);
        ivars->temp_dat_out = Folder_Open_Out(dir, temp_path);
        DECREF(temp_path);
        if (ivars->temp_dat_out == NULL) {
            RETHROW(INCREF(Err_get_error()));
        }
    }

    // Build the writer for this field and remember it under its number.
    String *field_name = Seg_Field_Name(ivars->segment, field_num);
    writer = SortFieldWriter_new(ivars->schema, ivars->snapshot,
                                 ivars->segment, ivars->polyreader,
                                 field_name, ivars->counter,
                                 ivars->mem_thresh, ivars->temp_ord_out,
                                 ivars->temp_ix_out, ivars->temp_dat_out);
    Vec_Store(ivars->field_writers, slot, (Obj*)writer);

    return writer;
}
Beispiel #7
0
// Initialize a DefaultLexiconReader: run the parent initializer, then create
// one SegLexicon per field that has a name in the segment and for which
// S_has_data() reports data.  Lexicons are stored at the index equal to
// their field number; other slots are left untouched.  Returns `self`.
DefaultLexiconReader*
DefLexReader_init(DefaultLexiconReader *self, Schema *schema, Folder *folder,
                  Snapshot *snapshot, VArray *segments, int32_t seg_tick) {

    // Parent setup comes first; the segment is fetched from `self` afterwards.
    LexReader_init((LexiconReader*)self, schema, folder, snapshot, segments,
                   seg_tick);
    DefaultLexiconReaderIVARS *const ivars = DefLexReader_IVARS(self);
    Segment *const seg = DefLexReader_Get_Segment(self);

    // Field numbers are probed from 1 up to and including the schema's
    // field count.
    const uint32_t num_fields = Schema_Num_Fields(schema);
    const uint32_t limit      = num_fields + 1;
    ivars->lexicons = VA_new(num_fields);

    for (uint32_t field_num = 1; field_num < limit; field_num++) {
        String *name = Seg_Field_Name(seg, field_num);
        if (!name || !S_has_data(schema, folder, seg, name)) {
            continue;
        }
        VA_Store(ivars->lexicons, field_num,
                 (Obj*)SegLex_new(schema, folder, seg, name));
    }

    return self;
}
/* Initialize a DefaultLexiconReader: run the parent initializer, then create
 * one SegLexicon per field that has a name in the segment and for which
 * S_has_data() reports data.  Lexicons are stored at the index equal to
 * their field number; other slots are left untouched.  Returns `self`.
 */
DefaultLexiconReader*
DefLexReader_init(DefaultLexiconReader *self, Schema *schema, Folder *folder,
               Snapshot *snapshot, VArray *segments, i32_t seg_tick)
{
    Segment *seg;
    u32_t    num_fields;
    u32_t    limit;
    u32_t    field_num;

    /* Parent setup comes first; the segment is fetched from `self` after. */
    LexReader_init((LexiconReader*)self, schema, folder, snapshot, segments,
        seg_tick);
    seg = DefLexReader_Get_Segment(self);

    /* Field numbers are probed from 1 up to and including the schema's
     * field count. */
    num_fields     = Schema_Num_Fields(schema);
    limit          = num_fields + 1;
    self->lexicons = VA_new(num_fields);

    for (field_num = 1; field_num < limit; field_num++) {
        CharBuf *name = Seg_Field_Name(seg, field_num);
        if (!name || !S_has_data(schema, folder, seg, name)) {
            continue;
        }
        VA_Store(self->lexicons, field_num,
            (Obj*)SegLex_new(schema, folder, seg, name));
    }

    return self;
}
Beispiel #9
0
// Finish sort-data writing for the segment: optionally flush every field
// writer, close the temp streams, let each field writer produce its final
// output while collecting per-field metadata, store that metadata on the
// segment under "sort", and delete the temp files.  Does nothing when no
// temp stream was ever opened.
void
SortWriter_finish(SortWriter *self) {
    SortWriterIVARS *const ivars = SortWriter_IVARS(self);
    VArray *const writers = ivars->field_writers;

    // No temp stream means no data was written; nothing to finish.
    if (ivars->temp_ord_out == NULL) { return; }

    // If we've either flushed or added segments, flush everything so that any
    // one field can use the entire margin up to mem_thresh.
    if (ivars->flush_at_finish) {
        const uint32_t flush_limit = VA_Get_Size(writers);
        for (uint32_t slot = 1; slot < flush_limit; slot++) {
            SortFieldWriter *pending
                = (SortFieldWriter*)VA_Fetch(writers, slot);
            if (pending) {
                SortFieldWriter_Flush(pending);
            }
        }
    }

    // Close down temp streams.
    OutStream_Close(ivars->temp_ord_out);
    OutStream_Close(ivars->temp_ix_out);
    OutStream_Close(ivars->temp_dat_out);

    // Take each writer out of the array, finish it, and record its metadata
    // under the field's name.  Slot 0 is skipped, as in the flush pass.
    const uint32_t num_slots = VA_Get_Size(writers);
    for (uint32_t field_num = 1; field_num < num_slots; field_num++) {
        SortFieldWriter *writer
            = (SortFieldWriter*)VA_Delete(writers, field_num);
        if (!writer) { continue; }

        Obj *const key = (Obj*)Seg_Field_Name(ivars->segment, field_num);

        SortFieldWriter_Flip(writer);
        const int32_t count = SortFieldWriter_Finish(writer);
        Hash_Store(ivars->counts, key, (Obj*)CB_newf("%i32", count));

        // A null ord of -1 is not recorded.
        const int32_t null_ord = SortFieldWriter_Get_Null_Ord(writer);
        if (null_ord != -1) {
            Hash_Store(ivars->null_ords, key,
                       (Obj*)CB_newf("%i32", null_ord));
        }

        const int32_t ord_width = SortFieldWriter_Get_Ord_Width(writer);
        Hash_Store(ivars->ord_widths, key,
                   (Obj*)CB_newf("%i32", ord_width));

        // Release the writer taken out of the array by VA_Delete above.
        DECREF(writer);
    }
    VA_Clear(writers);

    // Store metadata.
    Seg_Store_Metadata_Str(ivars->segment, "sort", 4,
                           (Obj*)SortWriter_Metadata(self));

    // Clean up: remove the three temp files, reusing one name buffer.
    Folder  *const dir      = ivars->folder;
    CharBuf *const seg_name = Seg_Get_Name(ivars->segment);
    CharBuf *scratch        = CB_newf("%o/sort_ord_temp", seg_name);
    Folder_Delete(dir, scratch);
    CB_setf(scratch, "%o/sort_ix_temp", seg_name);
    Folder_Delete(dir, scratch);
    CB_setf(scratch, "%o/sort_dat_temp", seg_name);
    Folder_Delete(dir, scratch);
    DECREF(scratch);
}
Beispiel #10
0
/* Initialize a SortCache for field `field_num` of `segment`.
 *
 * Looks up the field's type in `schema`, opens the segment's
 * "sort-<field_num>.ord", ".ix" and ".dat" files from `folder`, derives the
 * number of unique values and the ord bit width from the .ix length, checks
 * that the .ord file is long enough for the segment's doc count, and maps
 * the three files' contents into `self`.
 *
 * Throws if the field has no type or is not sortable, if any of the three
 * files cannot be opened, or if the .ord file is too short.
 *
 * Returns `self`.
 */
SortCache*
SortCache_init(SortCache *self, Schema *schema, Folder *folder,
               Segment *segment, i32_t field_num)
{
    CharBuf *field    = Seg_Field_Name(segment, field_num);
    CharBuf *seg_name = Seg_Get_Name(segment);
    CharBuf *ord_file;
    CharBuf *ix_file;
    CharBuf *dat_file;
    i64_t ord_len, ix_len, dat_len;

    /* Derive. */
    self->doc_max = Seg_Get_Count(segment);
    self->type    = Schema_Fetch_Type(schema, field);
    if (!self->type || !FType_Sortable(self->type)) {
        THROW("'%o' isn't a sortable field", field);
    }

    /* Build the file names only after the field has been validated, so the
     * THROW above cannot strand freshly allocated strings. */
    ord_file = CB_newf("%o/sort-%i32.ord", seg_name, field_num);
    ix_file  = CB_newf("%o/sort-%i32.ix",  seg_name, field_num);
    dat_file = CB_newf("%o/sort-%i32.dat", seg_name, field_num);

    /* Open instreams. */
    self->ord_in  = Folder_Open_In(folder, ord_file);
    self->ix_in   = Folder_Open_In(folder, ix_file);
    self->dat_in  = Folder_Open_In(folder, dat_file);
    if (!self->ix_in || !self->dat_in || !self->ord_in) {
        CharBuf *mess = MAKE_MESS("Can't open either %o, %o or %o", ord_file, 
            ix_file, dat_file);
        DECREF(ord_file);
        DECREF(ix_file);
        DECREF(dat_file);
        Err_throw_mess(mess);
    }

    /* The names are not needed once the streams are open.  Releasing them
     * here keeps the length-validation THROW below from leaking them. */
    DECREF(ord_file);
    DECREF(ix_file);
    DECREF(dat_file);

    ord_len = InStream_Length(self->ord_in);
    ix_len  = InStream_Length(self->ix_in);
    dat_len = InStream_Length(self->dat_in);

    /* Calculate the number of unique values and derive the ord bit width.
     * The .ix file is read as 8-byte entries, one more than the number of
     * unique values. */
    self->num_uniq = (i32_t)(ix_len / 8) - 1; 
    self->width    = S_calc_width(self->num_uniq);

    /* Validate file lengths: the .ord file must hold at least doc_max + 1
     * ords at `width` bits apiece. */
    {
        double bytes_per_doc = self->width / 8.0;
        double max_ords      = ord_len / bytes_per_doc;
        if (max_ords < self->doc_max + 1) {
            THROW("Conflict between ord count max %f64 and doc_max %i32", 
                max_ords, self->doc_max);
        }
    }

    /* Mmap ords, offsets and character data. */
    self->ords      = InStream_Buf(self->ord_in, (size_t)ord_len);
    self->offsets   = (i64_t*)InStream_Buf(self->ix_in, (size_t)ix_len);
    self->char_data = InStream_Buf(self->dat_in, (size_t)dat_len);
    {
        /* Limits point one byte past the end of each buffer. */
        char *offs            = (char*)self->offsets;
        self->offsets_limit   = (i64_t*)(offs + ix_len);
        self->char_data_limit = self->char_data + dat_len;
    }

    return self;
}
// Switch a single run from writing to reading.
//
// Marks the run as flipped (throwing if it already was), gives it a fresh
// Counter and `sub_thresh` as its memory threshold, and -- unless the run
// already has a SortCache -- reopens this run's slice of each temp stream
// and builds the SortCache subclass matching the field's primitive type.
//
// The index stream is only reopened when the run is variable-width, and only
// the text cache receives it.  The local references to the reopened streams
// are dropped before leaving, including on the unsupported-type error path.
// NOTE(review): this presumes the cache constructors keep their own
// references to the streams -- confirm against the SortCache classes.
static void
S_flip_run(SortFieldWriter *run, size_t sub_thresh, InStream *ord_in,
           InStream *ix_in, InStream *dat_in) {
    SortFieldWriterIVARS *const run_ivars = SortFieldWriter_IVARS(run);

    if (run_ivars->flipped) { THROW(ERR, "Can't Flip twice"); }
    run_ivars->flipped = true;

    // Get our own slice of mem_thresh.
    DECREF(run_ivars->counter);
    run_ivars->counter    = Counter_new();
    run_ivars->mem_thresh = sub_thresh;

    // Done if we already have a SortCache to read from.
    if (run_ivars->sort_cache) { return; }

    // Open the temp files for reading.  Each alias names the byte range the
    // run occupies within the shared temp file.
    String *seg_name  = Seg_Get_Name(run_ivars->segment);
    String *ord_alias = Str_newf("%o/sort_ord_temp-%i64-to-%i64", seg_name,
                                 run_ivars->ord_start, run_ivars->ord_end);
    InStream *ord_in_dupe
        = InStream_Reopen(ord_in, ord_alias, run_ivars->ord_start,
                          run_ivars->ord_end - run_ivars->ord_start);
    DECREF(ord_alias);
    InStream *ix_in_dupe = NULL;
    if (run_ivars->var_width) {
        String *ix_alias = Str_newf("%o/sort_ix_temp-%i64-to-%i64", seg_name,
                                    run_ivars->ix_start, run_ivars->ix_end);
        ix_in_dupe = InStream_Reopen(ix_in, ix_alias, run_ivars->ix_start,
                                     run_ivars->ix_end - run_ivars->ix_start);
        DECREF(ix_alias);
    }
    String *dat_alias = Str_newf("%o/sort_dat_temp-%i64-to-%i64", seg_name,
                                 run_ivars->dat_start, run_ivars->dat_end);
    InStream *dat_in_dupe
        = InStream_Reopen(dat_in, dat_alias, run_ivars->dat_start,
                          run_ivars->dat_end - run_ivars->dat_start);
    DECREF(dat_alias);

    // Get a SortCache.
    String *field = Seg_Field_Name(run_ivars->segment, run_ivars->field_num);
    switch (run_ivars->prim_id & FType_PRIMITIVE_ID_MASK) {
        case FType_TEXT:
            run_ivars->sort_cache = (SortCache*)TextSortCache_new(
                                  field, run_ivars->type, run_ivars->run_cardinality,
                                  run_ivars->run_max, run_ivars->null_ord,
                                  run_ivars->ord_width, ord_in_dupe,
                                  ix_in_dupe, dat_in_dupe);
            break;
        case FType_INT32:
            run_ivars->sort_cache = (SortCache*)I32SortCache_new(
                                  field, run_ivars->type, run_ivars->run_cardinality,
                                  run_ivars->run_max, run_ivars->null_ord,
                                  run_ivars->ord_width, ord_in_dupe,
                                  dat_in_dupe);
            break;
        case FType_INT64:
            run_ivars->sort_cache = (SortCache*)I64SortCache_new(
                                  field, run_ivars->type, run_ivars->run_cardinality,
                                  run_ivars->run_max, run_ivars->null_ord,
                                  run_ivars->ord_width, ord_in_dupe,
                                  dat_in_dupe);
            break;
        case FType_FLOAT32:
            run_ivars->sort_cache = (SortCache*)F32SortCache_new(
                                  field, run_ivars->type, run_ivars->run_cardinality,
                                  run_ivars->run_max, run_ivars->null_ord,
                                  run_ivars->ord_width, ord_in_dupe,
                                  dat_in_dupe);
            break;
        case FType_FLOAT64:
            run_ivars->sort_cache = (SortCache*)F64SortCache_new(
                                  field, run_ivars->type, run_ivars->run_cardinality,
                                  run_ivars->run_max, run_ivars->null_ord,
                                  run_ivars->ord_width, ord_in_dupe,
                                  dat_in_dupe);
            break;
        default:
            // Drop the reopened streams before throwing; the releases at
            // the bottom of the function are not reached from here.
            DECREF(ord_in_dupe);
            DECREF(ix_in_dupe);
            DECREF(dat_in_dupe);
            THROW(ERR, "No SortCache class for %o", run_ivars->type);
    }

    // ix_in_dupe is NULL for fixed-width runs, so DECREF is handed NULL here
    // in that case.
    DECREF(ord_in_dupe);
    DECREF(ix_in_dupe);
    DECREF(dat_in_dupe);
}