Beispiel #1
0
/**
 * Initialize a LexIndex for one field of one segment.
 *
 * Opens "<segment name>/lexicon-<field number>.ixix" and
 * "<segment name>/lexicon-<field number>.ix" inside `folder`, records the
 * index and skip intervals reported by the schema's Architecture, and
 * exposes the contents of the .ixix stream as an array of int64_t offsets.
 *
 * @param self     Object being initialized.
 * @param schema   Supplies the field's type and the Architecture.
 * @param folder   Folder the lexicon index files are opened from.
 * @param segment  Supplies the segment name and the field number.
 * @param field    Name of the field.
 * @return `self`.  If `schema` has no type for `field`, or either file
 *         cannot be opened, the temporary file names and `self` are
 *         released and an error is thrown instead.
 */
LexIndex*
LexIndex_init(LexIndex *self, Schema *schema, Folder *folder,
              Segment *segment, String *field) {
    int32_t       field_id     = Seg_Field_Num(segment, field);
    String       *segment_name = Seg_Get_Name(segment);
    String       *ixix_path
        = Str_newf("%o/lexicon-%i32.ixix", segment_name, field_id);
    String       *ix_path
        = Str_newf("%o/lexicon-%i32.ix", segment_name, field_id);
    Architecture *architecture = Schema_Get_Architecture(schema);

    // Lex_init() runs on self (viewed as a Lexicon) before the LexIndex
    // instance variables are touched.
    Lex_init((Lexicon*)self, field);
    LexIndexIVARS *const ivars = LexIndex_IVARS(self);
    ivars->tick  = 0;
    ivars->tinfo = TInfo_new(0);

    // Look up the field's type; a field without one cannot be indexed here.
    ivars->field_type = Schema_Fetch_Type(schema, field);
    if (ivars->field_type == NULL) {
        // Build the message first: it formats `field`, and nothing below
        // may run after the throw.
        String *mess = MAKE_MESS("Unknown field: '%o'", field);
        DECREF(ix_path);
        DECREF(ixix_path);
        DECREF(self);
        Err_throw_mess(ERR, mess);
    }
    // Keep our own reference to the type now that we know it exists.
    ivars->field_type   = (FieldType*)INCREF(ivars->field_type);
    ivars->term_stepper = FType_Make_Term_Stepper(ivars->field_type);

    // Open the .ixix stream; on failure rethrow the stored error.
    ivars->ixix_in = Folder_Open_In(folder, ixix_path);
    if (ivars->ixix_in == NULL) {
        Err *error = (Err*)INCREF(Err_get_error());
        DECREF(ix_path);
        DECREF(ixix_path);
        DECREF(self);
        RETHROW(error);
    }

    // Open the .ix stream; same failure handling as above.
    ivars->ix_in = Folder_Open_In(folder, ix_path);
    if (ivars->ix_in == NULL) {
        Err *error = (Err*)INCREF(Err_get_error());
        DECREF(ix_path);
        DECREF(ixix_path);
        DECREF(self);
        RETHROW(error);
    }

    ivars->index_interval = Arch_Index_Interval(architecture);
    ivars->skip_interval  = Arch_Skip_Interval(architecture);

    // The .ixix stream is treated as a packed array of int64_t values:
    // one entry per sizeof(int64_t) bytes, buffered in its entirety.
    ivars->size
        = (int32_t)(InStream_Length(ivars->ixix_in) / sizeof(int64_t));
    ivars->offsets
        = (const int64_t*)InStream_Buf(ivars->ixix_in,
                                       (size_t)InStream_Length(ivars->ixix_in));

    // The path strings were only needed to open the streams.
    DECREF(ixix_path);
    DECREF(ix_path);

    return self;
}
Beispiel #2
0
/**
 * Exercise InStream_Clone() and InStream_Reopen() against a 26-byte RAM
 * file containing the bytes 'a' + 0 through 'a' + 25.
 *
 * @param runner  Test runner that records each assertion.
 */
static void
test_Clone_and_Reopen(TestBatchRunner *runner) {
    String    *orig_name = SSTR_WRAP_C("foo");
    String    *new_name  = SSTR_WRAP_C("bar");
    RAMFile   *ram_file  = RAMFile_new(NULL, false);
    OutStream *out       = OutStream_open((Obj*)ram_file);

    // Fill the file with 26 consecutive byte values starting at 'a'.
    for (uint8_t offset = 0; offset < 26; offset++) {
        OutStream_Write_U8(out, 'a' + offset);
    }
    OutStream_Close(out);

    // Open the file for reading under the first name and move off byte 0
    // so that the clone check below does not start at the default position.
    RAMFileHandle *handle = RAMFH_open(orig_name, FH_READ_ONLY, ram_file);
    InStream      *source = InStream_open((Obj*)handle);
    InStream_Seek(source, 1);
    TEST_TRUE(runner,
              Str_Equals(InStream_Get_Filename(source), (Obj*)orig_name),
              "Get_Filename");

    // A clone must agree with its source on name, length and position.
    InStream *dupe = InStream_Clone(source);
    TEST_TRUE(runner,
              Str_Equals(InStream_Get_Filename(dupe), (Obj*)orig_name),
              "Clones have same filename");
    TEST_TRUE(runner, InStream_Length(source) == InStream_Length(dupe),
              "Clones have same length");
    TEST_TRUE(runner, InStream_Read_U8(source) == InStream_Read_U8(dupe),
              "Clones start at same file position");

    // Reopen with offset 25 and length 1: a window over the final byte.
    InStream *window = InStream_Reopen(source, new_name, 25, 1);
    TEST_TRUE(runner,
              Str_Equals(InStream_Get_Filename(window), (Obj*)new_name),
              "Reopened InStreams take new filename");
    TEST_TRUE(runner, InStream_Read_U8(window) == 'z',
              "Reopened stream starts at supplied offset");
    TEST_TRUE(runner, InStream_Length(window) == 1,
              "Reopened stream uses supplied length");
    TEST_TRUE(runner, InStream_Tell(window) == 1,
              "Tell() uses supplied offset for reopened stream");
    InStream_Seek(window, 0);
    TEST_TRUE(runner, InStream_Read_U8(window) == 'z',
              "Seek() uses supplied offset for reopened stream");

    // Release everything created by this test.
    DECREF(window);
    DECREF(dupe);
    DECREF(source);
    DECREF(out);
    DECREF(handle);
    DECREF(ram_file);
}
Beispiel #3
0
/**
 * Initialize a LexIndex for one field of one segment.
 *
 * Opens "<segment name>/lexicon-<field number>.ixix" and
 * "<segment name>/lexicon-<field number>.ix" inside `folder`, buffers both
 * streams in full, and records the index and skip intervals reported by the
 * schema's Architecture.
 *
 * @param self     Object being initialized.
 * @param schema   Supplies the field's type and the Architecture.
 * @param folder   Folder the lexicon index files are opened from.
 * @param segment  Supplies the segment name and the field number.
 * @param field    Name of the field.
 * @return `self`.  If `schema` has no type for `field`, or either file
 *         cannot be opened, the temporary file names and `self` are
 *         released and an error is thrown instead.
 */
LexIndex*
LexIndex_init(LexIndex *self, Schema *schema, Folder *folder, 
              Segment *segment, const CharBuf *field)
{
    i32_t         field_id     = Seg_Field_Num(segment, field);
    CharBuf      *segment_name = Seg_Get_Name(segment);
    CharBuf      *ixix_path
        = CB_newf("%o/lexicon-%i32.ixix", segment_name, field_id);
    CharBuf      *ix_path
        = CB_newf("%o/lexicon-%i32.ix", segment_name, field_id);
    Architecture *architecture = Schema_Get_Architecture(schema);

    /* Start from an empty term, a fresh TermInfo and tick zero. */
    self->tick  = 0;
    self->term  = ViewCB_new_from_trusted_utf8(NULL, 0);
    self->tinfo = TInfo_new(0,0,0,0);

    /* Look up the field's type; without one there is nothing to index. */
    self->field_type = Schema_Fetch_Type(schema, field);
    if (self->field_type == NULL) {
        /* Format the message before releasing anything. */
        CharBuf *mess = MAKE_MESS("Unknown field: '%o'", field);
        DECREF(ix_path);
        DECREF(ixix_path);
        DECREF(self);
        Err_throw_mess(mess);
    }
    /* Hold our own reference to the type. */
    INCREF(self->field_type);

    /* Open both streams, then report a single error if either is missing. */
    self->ixix_in = Folder_Open_In(folder, ixix_path);
    self->ix_in   = Folder_Open_In(folder, ix_path);
    if (!(self->ixix_in && self->ix_in)) {
        /* The message formats both names, so build it before releasing
         * them. */
        CharBuf *mess =
             MAKE_MESS("Can't open either %o or %o", ix_path, ixix_path);
        DECREF(ix_path);
        DECREF(ixix_path);
        DECREF(self);
        Err_throw_mess(mess);
    }

    self->index_interval = Arch_Index_Interval(architecture);
    self->skip_interval  = Arch_Skip_Interval(architecture);

    /* The .ixix stream is treated as a packed array of i64_t values: one
     * entry per sizeof(i64_t) bytes, buffered in its entirety. */
    self->size
        = (i32_t)(InStream_Length(self->ixix_in) / sizeof(i64_t));
    self->offsets
        = (i64_t*)InStream_Buf(self->ixix_in,
                               (size_t)InStream_Length(self->ixix_in));

    /* Buffer the whole .ix stream and remember where it ends. */
    self->data  = InStream_Buf(self->ix_in, InStream_Length(self->ix_in));
    self->limit = self->data + InStream_Length(self->ix_in);

    /* The path strings were only needed to open the streams. */
    DECREF(ixix_path);
    DECREF(ix_path);

    return self;
}
Beispiel #4
0
/**
 * Initialize a NumericSortCache.
 *
 * Validates that `type` is a sortable NumericType, buffers the whole ords
 * stream, delegates the shared setup to SortCache_init(), takes references
 * to both streams, and finally checks that the ords stream is long enough
 * to hold an ord for every document up to `doc_max`.
 *
 * @param self         Object being initialized.
 * @param field        Field name; only used here for error messages and
 *                     passed through to SortCache_init().
 * @param type         Field type; must be sortable and a NUMERICTYPE.
 * @param cardinality  Passed through to SortCache_init().
 * @param doc_max      Passed through to SortCache_init().
 * @param null_ord     Passed through to SortCache_init().
 * @param ord_width    Passed through to SortCache_init(); the value read
 *                     back from `self->ord_width` is treated as a bit
 *                     width per document by the length check.
 * @param ord_in       Stream holding the ords; a new reference is taken.
 * @param dat_in       Stream holding the values; a new reference is taken.
 * @return `self`.  On any validation failure `self` is released and an
 *         error is thrown instead.
 */
NumericSortCache*
NumSortCache_init(NumericSortCache *self, const CharBuf *field,
                  FieldType *type, int32_t cardinality, int32_t doc_max,
                  int32_t null_ord, int32_t ord_width, InStream *ord_in,
                  InStream *dat_in) {
    // Validate.
    if (!type || !FType_Sortable(type) || !FType_Is_A(type, NUMERICTYPE)) {
        DECREF(self);
        THROW(ERR, "'%o' isn't a sortable NumericType field", field);
    }

    // Mmap ords and super-init.
    int64_t  ord_len = InStream_Length(ord_in);
    void    *ords    = InStream_Buf(ord_in, (size_t)ord_len);
    SortCache_init((SortCache*)self, field, type, ords, cardinality, doc_max,
                   null_ord, ord_width);

    // Assign.
    self->ord_in = (InStream*)INCREF(ord_in);
    self->dat_in = (InStream*)INCREF(dat_in);

    // Validate ord file length: with ord_width bits per document, ord_len
    // bytes can hold at most ord_len * 8 / ord_width ords, and one ord is
    // required for each of doc_max + 1 slots.
    double BITS_PER_BYTE = 8.0;
    double docs_per_byte = BITS_PER_BYTE / self->ord_width;
    double max_ords      = ord_len * docs_per_byte;
    if (max_ords < self->doc_max + 1) {
        // Copy doc_max out of the object BEFORE releasing it.  The previous
        // code evaluated self->doc_max inside the THROW argument list, i.e.
        // after DECREF(self), which reads released memory whenever that
        // DECREF drops the last reference.
        const int32_t stored_doc_max = self->doc_max;
        DECREF(self);
        THROW(ERR, "Conflict between ord count max %f64 and doc_max %i32 for "
              "field %o", max_ords, stored_doc_max, field);
    }

    ABSTRACT_CLASS_CHECK(self, NUMERICSORTCACHE);
    return self;
}
Beispiel #5
0
/**
 * Read the entire contents of `path` within this Folder into a ByteBuf.
 *
 * The bytes are read into a freshly allocated buffer that is one byte
 * longer than the file and NUL-terminated; the buffer is then handed to
 * BB_new_steal_bytes() with size = file length and capacity = length + 1.
 *
 * @param self  Folder to read from.
 * @param path  Path of the file to read.
 * @return A new ByteBuf holding the file's bytes.  If the file cannot be
 *         opened the stored error is rethrown; if its length is
 *         >= SIZE_MAX an error is thrown.
 */
ByteBuf*
Folder_slurp_file(Folder *self, const CharBuf *path) {
    InStream *instream = Folder_Open_In(self, path);

    // Could not open: propagate the error recorded by Folder_Open_In().
    if (instream == NULL) {
        RETHROW(INCREF(Err_get_error()));
        // Only reached if RETHROW returns; mirrors the NULL result the
        // fall-through path has always produced.
        return NULL;
    }

    uint64_t length = InStream_Length(instream);

    // Refuse files whose length (plus the terminator) cannot be expressed
    // as a size_t.
    if (length >= SIZE_MAX) {
        InStream_Close(instream);
        DECREF(instream);
        THROW(ERR, "File %o is too big to slurp (%u64 bytes)", path,
              length);
        // Only reached if THROW returns.
        return NULL;
    }

    // Read everything into a buffer with room for a trailing NUL.
    size_t  size = (size_t)length;
    char   *ptr  = (char*)MALLOCATE(size + 1);
    InStream_Read_Bytes(instream, ptr, size);
    ptr[size] = '\0';

    // Wrap the buffer: `size` bytes of content, `size + 1` bytes allocated.
    ByteBuf *retval = BB_new_steal_bytes(ptr, size, size + 1);

    InStream_Close(instream);
    DECREF(instream);

    return retval;
}
Beispiel #6
0
/**
 * Initialize a TextSortCache.
 *
 * Validates that `type` is sortable, buffers the whole ords stream,
 * delegates the shared setup to SortCache_init(), checks that the ords
 * stream is long enough to hold an ord for every document up to `doc_max`,
 * and finally takes references to the three streams.
 *
 * @param self         Object being initialized.
 * @param field        Field name; only used here for error messages and
 *                     passed through to SortCache_init().
 * @param type         Field type; must be sortable.
 * @param cardinality  Passed through to SortCache_init().
 * @param doc_max      Passed through to SortCache_init().
 * @param null_ord     Passed through to SortCache_init().
 * @param ord_width    Passed through to SortCache_init(); the value read
 *                     back from the ivars is treated as a bit width per
 *                     document by the length check.
 * @param ord_in       Stream holding the ords; a new reference is taken.
 * @param ix_in        Index stream; a new reference is taken.
 * @param dat_in       Data stream; a new reference is taken.
 * @return `self`.  On any validation failure `self` is released and an
 *         error is thrown instead.
 */
TextSortCache*
TextSortCache_init(TextSortCache *self, String *field,
                   FieldType *type, int32_t cardinality,
                   int32_t doc_max, int32_t null_ord, int32_t ord_width,
                   InStream *ord_in, InStream *ix_in, InStream *dat_in) {
    // Validate.
    if (!type || !FType_Sortable(type)) {
        DECREF(self);
        THROW(ERR, "'%o' isn't a sortable field", field);
    }

    // Memory map ords and super-init.
    int64_t ord_len = InStream_Length(ord_in);
    const void *ords = InStream_Buf(ord_in, (size_t)ord_len);
    SortCache_init((SortCache*)self, field, type, ords, cardinality, doc_max,
                   null_ord, ord_width);
    TextSortCacheIVARS *const ivars = TextSortCache_IVARS(self);

    // Validate ords file length: each document occupies ord_width / 8
    // bytes, so ord_len bytes hold at most ord_len / bytes_per_doc ords,
    // and one ord is required for each of doc_max + 1 slots.
    double  bytes_per_doc = ivars->ord_width / 8.0;
    double  max_ords      = ord_len / bytes_per_doc;
    if (max_ords < ivars->doc_max + 1) {
        // The warning reads ivars, so it must run before self is released.
        WARN("ORD WIDTH: %i32 %i32", ord_width, ivars->ord_width);
        // Release self before throwing, matching the first validation
        // above; previously this path threw while still holding the
        // reference, leaking the object.  The THROW arguments below are
        // locals and parameters only, so nothing is read from self after
        // the release.
        DECREF(self);
        THROW(ERR, "Conflict between ord count max %f64 and doc_max %i32 for "
              "field %o", max_ords, doc_max, field);
    }

    // Assign.
    ivars->ord_in = (InStream*)INCREF(ord_in);
    ivars->ix_in  = (InStream*)INCREF(ix_in);
    ivars->dat_in = (InStream*)INCREF(dat_in);

    return self;
}
/**
 * Dedicate this PostingPool to reading the lexicon and postings data that
 * another segment already holds for the pool's field.
 *
 * If "<segment name>/lexicon-<field number>.dat" exists in `other_folder`,
 * that file and "<segment name>/postings-<field number>.dat" are opened,
 * their lengths are recorded as `lex_end` / `post_end`, and `doc_base` /
 * `doc_map` are stored.  If the lexicon file does not exist the pool is
 * left empty (apart from being marked `from_seg`).
 *
 * @param self           Pool to dedicate; must not already have content.
 * @param other_folder   Folder containing the other segment's files.
 * @param other_segment  Supplies the segment name and the field number.
 * @param doc_base       Stored in `self->doc_base`.
 * @param doc_map        Optional; when non-NULL a new reference is stored
 *                       in `self->doc_map`, otherwise NULL is stored.
 *
 * Throws if the pool already has content or if either file cannot be
 * opened.
 */
void
PostPool_assign_seg(PostingPool *self, Folder *other_folder, 
                    Segment *other_segment, i32_t doc_base, I32Array *doc_map)
{
    i32_t    field_num;
    CharBuf *other_seg_name;
    CharBuf *lex_file;

    /* Dedicate pool to this task alone.  This check runs before any
     * temporary string is allocated so that the throw cannot leak one. */
    if (self->from_seg || self->cache_max > 0 || self->lex_end != 0) {
        THROW("Can't Assign_Segment to PostingPool with other content");
    }
    self->from_seg = true;

    field_num      = Seg_Field_Num(other_segment, self->field);
    other_seg_name = Seg_Get_Name(other_segment);
    lex_file = CB_newf("%o/lexicon-%i32.dat", other_seg_name, field_num);

    /* Prepare to read from existing files. */
    if (Folder_Exists(other_folder, lex_file)) {
        CharBuf *post_file
            = CB_newf("%o/postings-%i32.dat", other_seg_name, field_num);

        /* Open lexicon and postings files. */
        self->lex_instream  = Folder_Open_In(other_folder, lex_file);
        self->post_instream = Folder_Open_In(other_folder, post_file);

        /* On failure, release both temporary names before throwing.  The
         * message is rebuilt from the same format and components that
         * produced the file name, so its text is unchanged. */
        if (!self->lex_instream) {
            DECREF(post_file);
            DECREF(lex_file);
            THROW("Can't open %o/lexicon-%i32.dat", other_seg_name,
                  field_num);
        }
        if (!self->post_instream) {
            DECREF(post_file);
            DECREF(lex_file);
            THROW("Can't open %o/postings-%i32.dat", other_seg_name,
                  field_num);
        }
        self->lex_end       = InStream_Length(self->lex_instream);
        self->post_end      = InStream_Length(self->post_instream);

        /* Assign doc base and doc map. */
        self->doc_base = doc_base;
        self->doc_map  = doc_map ? (I32Array*)INCREF(doc_map) : NULL;

        DECREF(post_file);
    }
    else {
        /* This posting pool will be empty. */
    }

    /* Clean up. */
    DECREF(lex_file);
}
Beispiel #8
0
/**
 * Read the file at `path` within `folder` and parse it as JSON.
 *
 * The whole file is buffered via InStream_Buf() and handed to
 * S_parse_json() together with its length.
 *
 * @param folder  Folder to read from.
 * @param path    Path of the JSON file.
 * @return The parsed object, or NULL if the file cannot be opened or
 *         S_parse_json() returns NULL.  In both NULL cases a frame is
 *         added to the stored error via ERR_ADD_FRAME.
 */
Obj*
Json_slurp_json(Folder *folder, const CharBuf *path) {
    InStream *json_in = Folder_Open_In(folder, path);
    if (json_in == NULL) {
        // Open failed: annotate the stored error and give up.
        ERR_ADD_FRAME(Err_get_error());
        return NULL;
    }

    // Buffer the full file contents and parse them in place.
    size_t  json_len  = (size_t)InStream_Length(json_in);
    char   *json_text = InStream_Buf(json_in, json_len);
    Obj    *parsed    = S_parse_json(json_text, json_len);

    // The stream is no longer needed once parsing has finished.
    InStream_Close(json_in);
    DECREF(json_in);

    if (parsed == NULL) {
        // Parse failed: annotate the stored error; NULL is returned below.
        ERR_ADD_FRAME(Err_get_error());
    }
    return parsed;
}
Beispiel #9
0
/**
 * Append the content of `instream` to this OutStream.
 *
 * The number of bytes copied is InStream_Length(instream); they are read
 * starting from the InStream's current read position.
 *
 * @param self      Destination stream.
 * @param instream  Source stream.
 */
void
OutStream_Absorb_IMP(OutStream *self, InStream *instream) {
    OutStreamIVARS *const ivars = OutStream_IVARS(self);
    char    scratch[IO_STREAM_BUF_SIZE];
    int64_t remaining = InStream_Length(instream);

    // Ask the OutStream to grow to its final length up front.
    OutStream_Grow(self, OutStream_Tell(self) + remaining);

    // Copy in blocks: read each block into an intermediate buffer, then
    // write it to the OutStream.
    //
    // TODO: optimize by utilizing OutStream's buffer directly, while still
    // not flushing too frequently and keeping code complexity under control.
    while (remaining != 0) {
        // Move a full scratch buffer unless fewer bytes are left.
        size_t chunk;
        if (remaining < IO_STREAM_BUF_SIZE) {
            chunk = (size_t)remaining;
        }
        else {
            chunk = IO_STREAM_BUF_SIZE;
        }

        InStream_Read_Bytes(instream, scratch, chunk);
        SI_write_bytes(self, ivars, scratch, chunk);
        remaining -= chunk;
    }
}
Beispiel #10
0
/**
 * Exercise InStream_Buf() and InStream_Advance_Buf() on a RAM file of
 * IO_STREAM_BUF_SIZE * 2 + 5 bytes, checking how many buffered bytes are
 * available after requests of various sizes.
 *
 * @param runner  Test runner that records each assertion.
 */
static void
test_Buf(TestBatchRunner *runner) {
    RAMFile   *ram_file    = RAMFile_new(NULL, false);
    OutStream *out         = OutStream_open((Obj*)ram_file);
    size_t     total_bytes = IO_STREAM_BUF_SIZE * 2 + 5;

    // Fill the file with 'a' bytes: two full buffers plus five extra.
    for (uint32_t written = 0; written < total_bytes; written++) {
        OutStream_Write_U8(out, 'a');
    }
    OutStream_Close(out);

    InStream *in = InStream_open((Obj*)ram_file);
    InStreamIVARS *const ivars = InStream_IVARS(in);

    // A small request should still yield a full buffer's worth of bytes.
    char *cursor = InStream_Buf(in, 5);
    TEST_INT_EQ(runner, ivars->limit - cursor, IO_STREAM_BUF_SIZE,
                "Small request bumped up");

    // Consume all but the last 10 bytes of the current buffer.
    cursor += IO_STREAM_BUF_SIZE - 10;
    InStream_Advance_Buf(in, cursor);

    // Asking for exactly what is left must not refill.
    cursor = InStream_Buf(in, 10);
    TEST_INT_EQ(runner, ivars->limit - cursor, 10,
                "Exact request doesn't trigger refill");

    // Asking for one byte more than what is left must refill.
    cursor = InStream_Buf(in, 11);
    TEST_INT_EQ(runner, ivars->limit - cursor, IO_STREAM_BUF_SIZE,
                "Requesting over limit triggers refill");

    // A request larger than the rest of the file is capped at what
    // remains between the current position and the end of the file.
    int64_t  want_remaining = InStream_Length(in) - InStream_Tell(in);
    char    *tail           = InStream_Buf(in, 100000);
    int64_t  got_remaining  = PTR_TO_I64(ivars->limit) - PTR_TO_I64(tail);
    TEST_TRUE(runner, got_remaining == want_remaining,
              "Requests greater than file size get pared down");

    // Release everything created by this test.
    DECREF(in);
    DECREF(out);
    DECREF(ram_file);
}
Beispiel #11
0
/**
 * Initialize a SortCache for one field of one segment.
 *
 * Opens "<segment name>/sort-<field_num>.ord", ".ix" and ".dat" inside
 * `folder`, derives the number of unique values from the length of the .ix
 * file and the ord bit width from that count, validates that the .ord file
 * is long enough for `doc_max + 1` entries, and buffers all three files in
 * full.
 *
 * @param self       Object being initialized.
 * @param schema     Supplies the field's type.
 * @param folder     Folder the sort files are opened from.
 * @param segment    Supplies the field name, segment name and doc count.
 * @param field_num  Number of the field within `segment`.
 * @return `self`.  Throws if the field is not sortable, if any of the three
 *         files cannot be opened, or if the .ord file is too short.
 *
 * NOTE(review): none of the failure paths release `self`; whether the
 * caller is expected to do so cannot be determined from this function.
 */
SortCache*
SortCache_init(SortCache *self, Schema *schema, Folder *folder,
               Segment *segment, i32_t field_num)
{
    CharBuf *field    = Seg_Field_Name(segment, field_num);
    CharBuf *seg_name = Seg_Get_Name(segment);
    CharBuf *ord_file = CB_newf("%o/sort-%i32.ord", seg_name, field_num);
    CharBuf *ix_file  = CB_newf("%o/sort-%i32.ix",  seg_name, field_num);
    CharBuf *dat_file = CB_newf("%o/sort-%i32.dat", seg_name, field_num);
    i64_t ord_len, ix_len, dat_len;

    /* Derive. */
    self->doc_max = Seg_Get_Count(segment);
    self->type    = Schema_Fetch_Type(schema, field);
    if (!self->type || !FType_Sortable(self->type)) {
        /* Release the temporary file names before throwing; this path
         * previously leaked all three.  The message only formats `field`,
         * which is not one of the released strings. */
        DECREF(ord_file);
        DECREF(ix_file);
        DECREF(dat_file);
        THROW("'%o' isn't a sortable field", field);
    }

    /* Open instreams. */
    self->ord_in  = Folder_Open_In(folder, ord_file);
    self->ix_in   = Folder_Open_In(folder, ix_file);
    self->dat_in  = Folder_Open_In(folder, dat_file);
    if (!self->ix_in || !self->dat_in || !self->ord_in) {
        /* Build the message first: it formats the names released below. */
        CharBuf *mess = MAKE_MESS("Can't open either %o, %o or %o", ord_file, 
            ix_file, dat_file);
        DECREF(ord_file);
        DECREF(ix_file);
        DECREF(dat_file);
        Err_throw_mess(mess);
    }
    ord_len = InStream_Length(self->ord_in);
    ix_len  = InStream_Length(self->ix_in);
    dat_len = InStream_Length(self->dat_in);

    /* Calculate the number of unique values and derive the ord bit width.
     * The .ix file is read as 8-byte entries, one more than the number of
     * unique values. */
    self->num_uniq = (i32_t)(ix_len / 8) - 1; 
    self->width    = S_calc_width(self->num_uniq);

    /* Validate file lengths: each document occupies width / 8 bytes of the
     * .ord file, and doc_max + 1 slots are required. */
    {
        double bytes_per_doc = self->width / 8.0;
        double max_ords      = ord_len / bytes_per_doc;
        if (max_ords < self->doc_max + 1) {
            /* Release the temporary file names before throwing; this path
             * previously leaked all three. */
            DECREF(ord_file);
            DECREF(ix_file);
            DECREF(dat_file);
            THROW("Conflict between ord count max %f64 and doc_max %i32", 
                max_ords, self->doc_max);
        }
    }

    /* Mmap ords, offsets and character data. */
    self->ords      = InStream_Buf(self->ord_in, (size_t)ord_len);
    self->offsets   = (i64_t*)InStream_Buf(self->ix_in, (size_t)ix_len);
    self->char_data = InStream_Buf(self->dat_in, (size_t)dat_len);
    {
        /* Compute the end of the offsets in bytes, since ix_len is a byte
         * count rather than an element count. */
        char *offs            = (char*)self->offsets;
        self->offsets_limit   = (i64_t*)(offs + ix_len);
        self->char_data_limit = self->char_data + dat_len;
    }

    DECREF(ord_file);
    DECREF(ix_file);
    DECREF(dat_file);

    return self;
}