示例#1
0
void
PostPool_Flip_IMP(PostingPool *self) {
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);
    uint32_t num_runs   = VA_Get_Size(ivars->runs);
    uint32_t sub_thresh = num_runs > 0
                          ? ivars->mem_thresh / num_runs
                          : ivars->mem_thresh;

    if (num_runs) {
        Folder  *folder = PolyReader_Get_Folder(ivars->polyreader);
        String *seg_name = Seg_Get_Name(ivars->segment);
        String *lex_temp_path  = Str_newf("%o/lextemp", seg_name);
        String *post_temp_path = Str_newf("%o/ptemp", seg_name);
        ivars->lex_temp_in = Folder_Open_In(folder, lex_temp_path);
        if (!ivars->lex_temp_in) {
            RETHROW(INCREF(Err_get_error()));
        }
        ivars->post_temp_in = Folder_Open_In(folder, post_temp_path);
        if (!ivars->post_temp_in) {
            RETHROW(INCREF(Err_get_error()));
        }
        DECREF(lex_temp_path);
        DECREF(post_temp_path);
    }

    PostPool_Sort_Buffer(self);
    if (num_runs && (ivars->buf_max - ivars->buf_tick) > 0) {
        uint32_t num_items = PostPool_Buffer_Count(self);
        // Cheap imitation of flush. FIXME.
        PostingPool *run
            = PostPool_new(ivars->schema, ivars->snapshot, ivars->segment,
                           ivars->polyreader, ivars->field, ivars->lex_writer,
                           ivars->mem_pool, ivars->lex_temp_out,
                           ivars->post_temp_out, ivars->skip_out);
        PostPool_Grow_Buffer(run, num_items);
        PostingPoolIVARS *const run_ivars = PostPool_IVARS(run);

        memcpy(run_ivars->buffer, (ivars->buffer) + ivars->buf_tick,
               num_items * sizeof(Obj*));
        run_ivars->buf_max = num_items;
        PostPool_Add_Run(self, (SortExternal*)run);
        ivars->buf_tick = 0;
        ivars->buf_max = 0;
    }

    // Assign.
    for (uint32_t i = 0; i < num_runs; i++) {
        PostingPool *run = (PostingPool*)VA_Fetch(ivars->runs, i);
        if (run != NULL) {
            PostPool_Set_Mem_Thresh(run, sub_thresh);
            if (!PostPool_IVARS(run)->lexicon) {
                S_fresh_flip(run, ivars->lex_temp_in, ivars->post_temp_in);
            }
        }
    }

    ivars->flipped = true;
}
示例#2
0
DefaultHighlightReader*
DefHLReader_init(DefaultHighlightReader *self, Schema *schema,
                 Folder *folder, Snapshot *snapshot, VArray *segments,
                 int32_t seg_tick)
{
    Segment *segment;
    Hash    *metadata; 
    HLReader_init((HighlightReader*)self, schema, folder, snapshot,
        segments, seg_tick);
    segment  = DefHLReader_Get_Segment(self);
    metadata = (Hash*)Seg_Fetch_Metadata_Str(segment, "highlight", 9);
    if (!metadata) {
        metadata = (Hash*)Seg_Fetch_Metadata_Str(segment, "term_vectors", 12);
    }
    
    // Check format. 
    if (metadata) {
        Obj     *format    = Hash_Fetch_Str(metadata, "format", 6);
        if (!format) { THROW(ERR, "Missing 'format' var"); }
        else {
            if (Obj_To_I64(format) != HLWriter_current_file_format) {
                THROW(ERR, "Unsupported highlight data format: %i64", 
                    Obj_To_I64(format));
            }
        }
    }


    // Open instreams. 
    {
        CharBuf *seg_name = Seg_Get_Name(segment);
        CharBuf *ix_file  = CB_newf("%o/highlight.ix", seg_name);
        CharBuf *dat_file = CB_newf("%o/highlight.dat", seg_name);
        if (Folder_Exists(folder, ix_file)) {
            self->ix_in = Folder_Open_In(folder, ix_file);
            if (!self->ix_in) {
                Err *error = (Err*)INCREF(Err_get_error());
                DECREF(ix_file);
                DECREF(dat_file);
                DECREF(self);
                RETHROW(error);
            }   
            self->dat_in = Folder_Open_In(folder, dat_file);
            if (!self->dat_in) {
                Err *error = (Err*)INCREF(Err_get_error());
                DECREF(ix_file);
                DECREF(dat_file);
                DECREF(self);
                RETHROW(error);
            }   
        }   
        DECREF(ix_file);
        DECREF(dat_file);
    }

    return self;
}
示例#3
0
文件: DocReader.c 项目: kidaa/lucy
DefaultDocReader*
DefDocReader_init(DefaultDocReader *self, Schema *schema, Folder *folder,
                  Snapshot *snapshot, Vector *segments, int32_t seg_tick) {
    Hash *metadata;
    Segment *segment;
    DocReader_init((DocReader*)self, schema, folder, snapshot, segments,
                   seg_tick);
    DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self);
    segment = DefDocReader_Get_Segment(self);
    metadata = (Hash*)Seg_Fetch_Metadata_Utf8(segment, "documents", 9);

    if (metadata) {
        String *seg_name  = Seg_Get_Name(segment);
        String *ix_file   = Str_newf("%o/documents.ix", seg_name);
        String *dat_file  = Str_newf("%o/documents.dat", seg_name);
        Obj     *format   = Hash_Fetch_Utf8(metadata, "format", 6);

        // Check format.
        if (!format) { THROW(ERR, "Missing 'format' var"); }
        else {
            int64_t format_val = Json_obj_to_i64(format);
            if (format_val < DocWriter_current_file_format) {
                THROW(ERR, "Obsolete doc storage format %i64; "
                      "Index regeneration is required", format_val);
            }
            else if (format_val != DocWriter_current_file_format) {
                THROW(ERR, "Unsupported doc storage format: %i64", format_val);
            }
        }

        // Get streams.
        if (Folder_Exists(folder, ix_file)) {
            ivars->ix_in = Folder_Open_In(folder, ix_file);
            if (!ivars->ix_in) {
                Err *error = (Err*)INCREF(Err_get_error());
                DECREF(ix_file);
                DECREF(dat_file);
                DECREF(self);
                RETHROW(error);
            }
            ivars->dat_in = Folder_Open_In(folder, dat_file);
            if (!ivars->dat_in) {
                Err *error = (Err*)INCREF(Err_get_error());
                DECREF(ix_file);
                DECREF(dat_file);
                DECREF(self);
                RETHROW(error);
            }
        }
        DECREF(ix_file);
        DECREF(dat_file);
    }

    return self;
}
示例#4
0
文件: LexIndex.c 项目: kidaa/lucy
LexIndex*
LexIndex_init(LexIndex *self, Schema *schema, Folder *folder,
              Segment *segment, String *field) {
    int32_t  field_num = Seg_Field_Num(segment, field);
    String  *seg_name  = Seg_Get_Name(segment);
    String  *ixix_file = Str_newf("%o/lexicon-%i32.ixix", seg_name, field_num);
    String  *ix_file   = Str_newf("%o/lexicon-%i32.ix", seg_name, field_num);
    Architecture *arch = Schema_Get_Architecture(schema);

    // Init.
    Lex_init((Lexicon*)self, field);
    LexIndexIVARS *const ivars = LexIndex_IVARS(self);
    ivars->tinfo        = TInfo_new(0);
    ivars->tick         = 0;

    // Derive
    ivars->field_type = Schema_Fetch_Type(schema, field);
    if (!ivars->field_type) {
        String *mess = MAKE_MESS("Unknown field: '%o'", field);
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        Err_throw_mess(ERR, mess);
    }
    ivars->field_type = (FieldType*)INCREF(ivars->field_type);
    ivars->term_stepper = FType_Make_Term_Stepper(ivars->field_type);
    ivars->ixix_in = Folder_Open_In(folder, ixix_file);
    if (!ivars->ixix_in) {
        Err *error = (Err*)INCREF(Err_get_error());
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        RETHROW(error);
    }
    ivars->ix_in = Folder_Open_In(folder, ix_file);
    if (!ivars->ix_in) {
        Err *error = (Err*)INCREF(Err_get_error());
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        RETHROW(error);
    }
    ivars->index_interval = Arch_Index_Interval(arch);
    ivars->skip_interval  = Arch_Skip_Interval(arch);
    ivars->size    = (int32_t)(InStream_Length(ivars->ixix_in) / sizeof(int64_t));
    ivars->offsets = (const int64_t*)InStream_Buf(ivars->ixix_in,
            (size_t)InStream_Length(ivars->ixix_in));

    DECREF(ixix_file);
    DECREF(ix_file);

    return self;
}
示例#5
0
DefaultHighlightReader*
DefHLReader_init(DefaultHighlightReader *self, Schema *schema,
                 Folder *folder, Snapshot *snapshot, Vector *segments,
                 int32_t seg_tick) {
    HLReader_init((HighlightReader*)self, schema, folder, snapshot,
                  segments, seg_tick);
    DefaultHighlightReaderIVARS *const ivars = DefHLReader_IVARS(self);
    Segment *segment    = DefHLReader_Get_Segment(self);
    Hash *metadata      = (Hash*)Seg_Fetch_Metadata_Utf8(segment, "highlight", 9);
    if (!metadata) {
        metadata = (Hash*)Seg_Fetch_Metadata_Utf8(segment, "term_vectors", 12);
    }

    // Check format.
    if (metadata) {
        Obj *format = Hash_Fetch_Utf8(metadata, "format", 6);
        if (!format) { THROW(ERR, "Missing 'format' var"); }
        else {
            if (Json_obj_to_i64(format) != HLWriter_current_file_format) {
                THROW(ERR, "Unsupported highlight data format: %i64",
                      Json_obj_to_i64(format));
            }
        }
    }

    // Open instreams.
    String *seg_name = Seg_Get_Name(segment);
    String *ix_file  = Str_newf("%o/highlight.ix", seg_name);
    String *dat_file = Str_newf("%o/highlight.dat", seg_name);
    if (Folder_Exists(folder, ix_file)) {
        ivars->ix_in = Folder_Open_In(folder, ix_file);
        if (!ivars->ix_in) {
            Err *error = (Err*)INCREF(Err_get_error());
            DECREF(ix_file);
            DECREF(dat_file);
            DECREF(self);
            RETHROW(error);
        }
        ivars->dat_in = Folder_Open_In(folder, dat_file);
        if (!ivars->dat_in) {
            Err *error = (Err*)INCREF(Err_get_error());
            DECREF(ix_file);
            DECREF(dat_file);
            DECREF(self);
            RETHROW(error);
        }
    }
    DECREF(ix_file);
    DECREF(dat_file);

    return self;
}
示例#6
0
DefaultDocReader*
DefDocReader_init(DefaultDocReader *self, Schema *schema, Folder *folder, 
                  Snapshot *snapshot, VArray *segments, i32_t seg_tick)
{
    Hash *metadata; 
    Segment *segment;
    DocReader_init((DocReader*)self, schema, folder, snapshot, segments,
        seg_tick);
    segment = DefDocReader_Get_Segment(self);
    metadata = (Hash*)Seg_Fetch_Metadata_Str(segment, "documents", 9);

    if (metadata) {
        CharBuf *seg_name  = Seg_Get_Name(segment);
        CharBuf *ix_file   = CB_newf("%o/documents.ix", seg_name);
        CharBuf *dat_file  = CB_newf("%o/documents.dat", seg_name);
        Obj     *format    = Hash_Fetch_Str(metadata, "format", 6);

        /* Check format. */
        if (!format) { THROW("Missing 'format' var"); }
        else {
            i64_t format_val = Obj_To_I64(format);
            if (format_val < DocWriter_current_file_format) {
                THROW("Obsolete doc storage format %i64; "
                    "Index regeneration is required", format_val);
            }
            else if (format_val != DocWriter_current_file_format) {
                THROW("Unsupported doc storage format: %i64", format_val);
            }
        }

        /* Get streams. */
        if (Folder_Exists(folder, ix_file)) {
            self->ix_in  = Folder_Open_In(folder, ix_file);
            self->dat_in = Folder_Open_In(folder, dat_file);
            if (!self->ix_in || !self->dat_in) {
                CharBuf *mess = MAKE_MESS("Can't open either %o or %o",
                    ix_file, dat_file);
                DECREF(ix_file);
                DECREF(dat_file);
                DECREF(self);
                Err_throw_mess(mess);
            }
        }
        DECREF(ix_file);
        DECREF(dat_file);
    }
    
    return self;
}
示例#7
0
LexIndex*
LexIndex_init(LexIndex *self, Schema *schema, Folder *folder, 
              Segment *segment, const CharBuf *field)
{
    i32_t    field_num = Seg_Field_Num(segment, field);
    CharBuf *seg_name  = Seg_Get_Name(segment);
    CharBuf *ixix_file = CB_newf("%o/lexicon-%i32.ixix", seg_name, field_num);
    CharBuf *ix_file   = CB_newf("%o/lexicon-%i32.ix", seg_name, field_num);
    Architecture *arch = Schema_Get_Architecture(schema);

    /* Init. */
    self->term  = ViewCB_new_from_trusted_utf8(NULL, 0);
    self->tinfo = TInfo_new(0,0,0,0);
    self->tick  = 0;

    /* Derive */
    self->field_type = Schema_Fetch_Type(schema, field);
    if (!self->field_type) {
        CharBuf *mess = MAKE_MESS("Unknown field: '%o'", field);
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        Err_throw_mess(mess);
    }
    INCREF(self->field_type);
    self->ixix_in = Folder_Open_In(folder, ixix_file);
    self->ix_in   = Folder_Open_In(folder, ix_file);
    if (!self->ixix_in || !self->ix_in) {
        CharBuf *mess =
             MAKE_MESS("Can't open either %o or %o", ix_file, ixix_file);
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        Err_throw_mess(mess);
    }
    self->index_interval = Arch_Index_Interval(arch);
    self->skip_interval  = Arch_Skip_Interval(arch);
    self->size    = (i32_t)(InStream_Length(self->ixix_in) / sizeof(i64_t));
    self->offsets = (i64_t*)InStream_Buf(self->ixix_in,
        (size_t)InStream_Length(self->ixix_in));
    self->data = InStream_Buf(self->ix_in, InStream_Length(self->ix_in));
    self->limit = self->data + InStream_Length(self->ix_in);

    DECREF(ixix_file);
    DECREF(ix_file);

    return self;
}
void
SortFieldWriter_Flip_IMP(SortFieldWriter *self) {
    SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self);
    uint32_t num_items = SortFieldWriter_Buffer_Count(self);
    uint32_t num_runs = Vec_Get_Size(ivars->runs);

    if (ivars->flipped) { THROW(ERR, "Can't call Flip() twice"); }
    ivars->flipped = true;

    // Sanity check.
    if (num_runs && num_items) {
        THROW(ERR, "Sanity check failed: num_runs: %u32 num_items: %u32",
              num_runs, num_items);
    }

    if (num_items) {
        SortFieldWriter_Sort_Buffer(self);
    }
    else if (num_runs) {
        Folder  *folder = PolyReader_Get_Folder(ivars->polyreader);
        String *seg_name = Seg_Get_Name(ivars->segment);
        String *ord_path = Str_newf("%o/sort_ord_temp", seg_name);
        ivars->ord_in = Folder_Open_In(folder, ord_path);
        DECREF(ord_path);
        if (!ivars->ord_in) { RETHROW(INCREF(Err_get_error())); }
        if (ivars->var_width) {
            String *ix_path = Str_newf("%o/sort_ix_temp", seg_name);
            ivars->ix_in = Folder_Open_In(folder, ix_path);
            DECREF(ix_path);
            if (!ivars->ix_in) { RETHROW(INCREF(Err_get_error())); }
        }
        String *dat_path = Str_newf("%o/sort_dat_temp", seg_name);
        ivars->dat_in = Folder_Open_In(folder, dat_path);
        DECREF(dat_path);
        if (!ivars->dat_in) { RETHROW(INCREF(Err_get_error())); }

        // Assign streams and a slice of mem_thresh.
        size_t sub_thresh = ivars->mem_thresh / num_runs;
        if (sub_thresh < 65536) { sub_thresh = 65536; }
        for (uint32_t i = 0; i < num_runs; i++) {
            SortFieldWriter *run = (SortFieldWriter*)Vec_Fetch(ivars->runs, i);
            S_flip_run(run, sub_thresh, ivars->ord_in, ivars->ix_in,
                       ivars->dat_in);
        }
    }

    ivars->flipped = true;
}
示例#9
0
文件: Folder.c 项目: theory/lucy
ByteBuf*
Folder_slurp_file(Folder *self, const CharBuf *path) {
    InStream *instream = Folder_Open_In(self, path);
    ByteBuf  *retval   = NULL;

    if (!instream) {
        RETHROW(INCREF(Err_get_error()));
    }
    else {
        uint64_t length = InStream_Length(instream);

        if (length >= SIZE_MAX) {
            InStream_Close(instream);
            DECREF(instream);
            THROW(ERR, "File %o is too big to slurp (%u64 bytes)", path,
                  length);
        }
        else {
            size_t size = (size_t)length;
            char *ptr = (char*)MALLOCATE((size_t)size + 1);
            InStream_Read_Bytes(instream, ptr, size);
            ptr[size] = '\0';
            retval = BB_new_steal_bytes(ptr, size, size + 1);
            InStream_Close(instream);
            DECREF(instream);
        }
    }

    return retval;
}
示例#10
0
void
PostPool_assign_seg(PostingPool *self, Folder *other_folder, 
                    Segment *other_segment, i32_t doc_base, I32Array *doc_map)
{
    i32_t    field_num = Seg_Field_Num(other_segment, self->field);
    CharBuf *other_seg_name = Seg_Get_Name(other_segment);
    CharBuf *lex_file 
        = CB_newf("%o/lexicon-%i32.dat", other_seg_name, field_num);

    /* Dedicate pool to this task alone. */
    if (self->from_seg || self->cache_max > 0 || self->lex_end != 0)
        THROW("Can't Assign_Segment to PostingPool with other content");
    self->from_seg = true;

    /* Prepare to read from existing files. */
    if (Folder_Exists(other_folder, lex_file)) {
        CharBuf *post_file
            = CB_newf("%o/postings-%i32.dat", other_seg_name, field_num);

        /* Open lexicon and postings files. */
        self->lex_instream  = Folder_Open_In(other_folder, lex_file);
        self->post_instream = Folder_Open_In(other_folder, post_file);
        if (!self->lex_instream)  { THROW("Can't open %o", lex_file); }
        if (!self->post_instream) { THROW("Can't open %o", post_file); }
        self->lex_end       = InStream_Length(self->lex_instream);
        self->post_end      = InStream_Length(self->post_instream);

        /* Assign doc base and doc map. */
        self->doc_base = doc_base;
        self->doc_map  = doc_map ? (I32Array*)INCREF(doc_map) : NULL;

        DECREF(post_file);
    }
    else {
        /* This posting pool will be empty. */
    }

    /* Clean up. */
    DECREF(lex_file);
}
示例#11
0
static void
test_Open_In(TestBatch *batch)
{
    Folder     *folder = (Folder*)RAMFolder_new(NULL);
    FileHandle *fh;
    InStream   *instream;

    Folder_MkDir(folder, &foo);
    Folder_MkDir(folder, &foo_bar);
    fh = Folder_Open_FileHandle(folder, &boffo, FH_CREATE | FH_WRITE_ONLY);
    DECREF(fh);
    fh = Folder_Open_FileHandle(folder, &foo_boffo, FH_CREATE | FH_WRITE_ONLY);
    DECREF(fh);

    instream = Folder_Open_In(folder, &boffo);
    TEST_TRUE(batch, instream && InStream_Is_A(instream, INSTREAM), 
        "Open_In");
    DECREF(instream);

    instream = Folder_Open_In(folder, &foo_boffo);
    TEST_TRUE(batch, instream && InStream_Is_A(instream, INSTREAM), 
        "Open_In for nested file");
    DECREF(instream);

    Err_set_error(NULL);
    instream = Folder_Open_In(folder, &foo);
    TEST_TRUE(batch, instream == NULL,
        "Open_InStream on existing dir path fails");
    TEST_TRUE(batch, Err_get_error() != NULL,
        "Open_In on existing dir name sets Err_error");

    Err_set_error(NULL);
    instream = Folder_Open_In(folder, &foo_bar_baz_boffo);
    TEST_TRUE(batch, instream == NULL,
        "Open_In for entry within non-existent dir fails");
    TEST_TRUE(batch, Err_get_error() != NULL,
        "Open_In for entry within non-existent dir sets Err_error");

    DECREF(folder);
}
示例#12
0
文件: Json.c 项目: pavansondur/lucy
Obj*
Json_slurp_json(Folder *folder, const CharBuf *path) {
    InStream *instream = Folder_Open_In(folder, path);
    if (!instream) {
        ERR_ADD_FRAME(Err_get_error());
        return NULL;
    }
    size_t len = (size_t)InStream_Length(instream);
    char *buf = InStream_Buf(instream, len);
    Obj *dump = S_parse_json(buf, len);
    InStream_Close(instream);
    DECREF(instream);
    if (!dump) {
        ERR_ADD_FRAME(Err_get_error());
    }
    return dump;
}
示例#13
0
static SortCache*
S_lazy_init_sort_cache(DefaultSortReader *self, String *field) {
    DefaultSortReaderIVARS *const ivars = DefSortReader_IVARS(self);

    // See if we have any values.
    Obj *count_obj = Hash_Fetch(ivars->counts, (Obj*)field);
    int32_t count = count_obj ? (int32_t)Obj_To_I64(count_obj) : 0;
    if (!count) { return NULL; }

    // Get a FieldType and sanity check that the field is sortable.
    Schema    *schema = DefSortReader_Get_Schema(self);
    FieldType *type   = Schema_Fetch_Type(schema, field);
    if (!type || !FType_Sortable(type)) {
        THROW(ERR, "'%o' isn't a sortable field", field);
    }

    // Open streams.
    Folder    *folder    = DefSortReader_Get_Folder(self);
    Segment   *segment   = DefSortReader_Get_Segment(self);
    String    *seg_name  = Seg_Get_Name(segment);
    int32_t    field_num = Seg_Field_Num(segment, field);
    int8_t     prim_id   = FType_Primitive_ID(type);
    bool       var_width = (prim_id == FType_TEXT || prim_id == FType_BLOB)
                           ? true
                           : false;
    String *ord_path = Str_newf("%o/sort-%i32.ord", seg_name, field_num);
    InStream *ord_in = Folder_Open_In(folder, ord_path);
    DECREF(ord_path);
    if (!ord_in) {
        THROW(ERR, "Error building sort cache for '%o': %o",
              field, Err_get_error());
    }
    InStream *ix_in = NULL;
    if (var_width) {
        String *ix_path = Str_newf("%o/sort-%i32.ix", seg_name, field_num);
        ix_in = Folder_Open_In(folder, ix_path);
        DECREF(ix_path);
        if (!ix_in) {
            THROW(ERR, "Error building sort cache for '%o': %o",
                  field, Err_get_error());
        }
    }
    String *dat_path = Str_newf("%o/sort-%i32.dat", seg_name, field_num);
    InStream *dat_in = Folder_Open_In(folder, dat_path);
    DECREF(dat_path);
    if (!dat_in) {
        THROW(ERR, "Error building sort cache for '%o': %o",
              field, Err_get_error());
    }

    Obj     *null_ord_obj = Hash_Fetch(ivars->null_ords, (Obj*)field);
    int32_t  null_ord = null_ord_obj ? (int32_t)Obj_To_I64(null_ord_obj) : -1;
    Obj     *ord_width_obj = Hash_Fetch(ivars->ord_widths, (Obj*)field);
    int32_t  ord_width = ord_width_obj
                         ? (int32_t)Obj_To_I64(ord_width_obj)
                         : S_calc_ord_width(count);
    int32_t  doc_max = (int32_t)Seg_Get_Count(segment);

    SortCache *cache = NULL;
    switch (prim_id & FType_PRIMITIVE_ID_MASK) {
        case FType_TEXT:
            cache = (SortCache*)TextSortCache_new(field, type, count, doc_max,
                                                  null_ord, ord_width, ord_in,
                                                  ix_in, dat_in);
            break;
        case FType_INT32:
            cache = (SortCache*)I32SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_INT64:
            cache = (SortCache*)I64SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_FLOAT32:
            cache = (SortCache*)F32SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        case FType_FLOAT64:
            cache = (SortCache*)F64SortCache_new(field, type, count, doc_max,
                                                 null_ord, ord_width, ord_in,
                                                 dat_in);
            break;
        default:
            THROW(ERR, "No SortCache class for %o", type);
    }
    Hash_Store(ivars->caches, (Obj*)field, (Obj*)cache);

    if (ivars->format == 2) { // bug compatibility
        SortCache_Set_Native_Ords(cache, true);
    }

    DECREF(ord_in);
    DECREF(ix_in);
    DECREF(dat_in);

    return cache;
}
示例#14
0
CompoundFileReader*
CFReader_do_open(CompoundFileReader *self, Folder *folder) {
    CharBuf *cfmeta_file = (CharBuf*)ZCB_WRAP_STR("cfmeta.json", 11);
    Hash *metadata = (Hash*)Json_slurp_json((Folder*)folder, cfmeta_file);
    Err *error = NULL;

    Folder_init((Folder*)self, Folder_Get_Path(folder));

    // Parse metadata file.
    if (!metadata || !Hash_Is_A(metadata, HASH)) {
        error = Err_new(CB_newf("Can't read '%o' in '%o'", cfmeta_file,
                                Folder_Get_Path(folder)));
    }
    else {
        Obj *format = Hash_Fetch_Str(metadata, "format", 6);
        self->format = format ? (int32_t)Obj_To_I64(format) : 0;
        self->records = (Hash*)INCREF(Hash_Fetch_Str(metadata, "files", 5));
        if (self->format < 1) {
            error = Err_new(CB_newf("Corrupt %o file: Missing or invalid 'format'",
                                    cfmeta_file));
        }
        else if (self->format > CFWriter_current_file_format) {
            error = Err_new(CB_newf("Unsupported compound file format: %i32 "
                                    "(current = %i32", self->format,
                                    CFWriter_current_file_format));
        }
        else if (!self->records) {
            error = Err_new(CB_newf("Corrupt %o file: missing 'files' key",
                                    cfmeta_file));
        }
    }
    DECREF(metadata);
    if (error) {
        Err_set_error(error);
        DECREF(self);
        return NULL;
    }

    // Open an instream which we'll clone over and over.
    CharBuf *cf_file = (CharBuf*)ZCB_WRAP_STR("cf.dat", 6);
    self->instream = Folder_Open_In(folder, cf_file);
    if (!self->instream) {
        ERR_ADD_FRAME(Err_get_error());
        DECREF(self);
        return NULL;
    }

    // Assign.
    self->real_folder = (Folder*)INCREF(folder);

    // Strip directory name from filepaths for old format.
    if (self->format == 1) {
        VArray *files = Hash_Keys(self->records);
        ZombieCharBuf *filename = ZCB_BLANK();
        ZombieCharBuf *folder_name
            = IxFileNames_local_part(Folder_Get_Path(folder), ZCB_BLANK());
        size_t folder_name_len = ZCB_Length(folder_name);

        for (uint32_t i = 0, max = VA_Get_Size(files); i < max; i++) {
            CharBuf *orig = (CharBuf*)VA_Fetch(files, i);
            if (CB_Starts_With(orig, (CharBuf*)folder_name)) {
                Obj *record = Hash_Delete(self->records, (Obj*)orig);
                ZCB_Assign(filename, orig);
                ZCB_Nip(filename, folder_name_len + sizeof(DIR_SEP) - 1);
                Hash_Store(self->records, (Obj*)filename, (Obj*)record);
            }
        }

        DECREF(files);
    }

    return self;
}
示例#15
0
static void
S_do_consolidate(CompoundFileWriter *self, CompoundFileWriterIVARS *ivars) {
    UNUSED_VAR(self);
    Folder    *folder       = ivars->folder;
    Hash      *metadata     = Hash_new(0);
    Hash      *sub_files    = Hash_new(0);
    Vector    *files        = Folder_List(folder, NULL);
    Vector    *merged       = Vec_new(Vec_Get_Size(files));
    String    *cf_file      = (String*)SSTR_WRAP_UTF8("cf.dat", 6);
    OutStream *outstream    = Folder_Open_Out(folder, (String*)cf_file);
    bool       rename_success;

    if (!outstream) { RETHROW(INCREF(Err_get_error())); }

    // Start metadata.
    Hash_Store_Utf8(metadata, "files", 5, INCREF(sub_files));
    Hash_Store_Utf8(metadata, "format", 6,
                    (Obj*)Str_newf("%i32", CFWriter_current_file_format));

    Vec_Sort(files);
    for (uint32_t i = 0, max = Vec_Get_Size(files); i < max; i++) {
        String *infilename = (String*)Vec_Fetch(files, i);

        if (!Str_Ends_With_Utf8(infilename, ".json", 5)) {
            InStream *instream   = Folder_Open_In(folder, infilename);
            Hash     *file_data  = Hash_new(2);
            int64_t   offset, len;

            if (!instream) { RETHROW(INCREF(Err_get_error())); }

            // Absorb the file.
            offset = OutStream_Tell(outstream);
            OutStream_Absorb(outstream, instream);
            len = OutStream_Tell(outstream) - offset;

            // Record offset and length.
            Hash_Store_Utf8(file_data, "offset", 6,
                            (Obj*)Str_newf("%i64", offset));
            Hash_Store_Utf8(file_data, "length", 6,
                            (Obj*)Str_newf("%i64", len));
            Hash_Store(sub_files, infilename, (Obj*)file_data);
            Vec_Push(merged, INCREF(infilename));

            // Add filler NULL bytes so that every sub-file begins on a file
            // position multiple of 8.
            OutStream_Align(outstream, 8);

            InStream_Close(instream);
            DECREF(instream);
        }
    }

    // Write metadata to cfmeta file.
    String *cfmeta_temp = (String*)SSTR_WRAP_UTF8("cfmeta.json.temp", 16);
    String *cfmeta_file = (String*)SSTR_WRAP_UTF8("cfmeta.json", 11);
    Json_spew_json((Obj*)metadata, (Folder*)ivars->folder, cfmeta_temp);
    rename_success = Folder_Rename(ivars->folder, cfmeta_temp, cfmeta_file);
    if (!rename_success) { RETHROW(INCREF(Err_get_error())); }

    // Clean up.
    OutStream_Close(outstream);
    DECREF(outstream);
    DECREF(files);
    DECREF(metadata);
    /*
    HashIterator *iter = HashIter_new(sub_files);
    while (HashIter_Next(iter)) {
        String *merged_file = HashIter_Get_Key(iter);
        if (!Folder_Delete(folder, merged_file)) {
            String *mess = MAKE_MESS("Can't delete '%o'", merged_file);
            DECREF(sub_files);
            Err_throw_mess(ERR, mess);
        }
    }
    DECREF(iter);
    */
    DECREF(sub_files);
    for (uint32_t i = 0, max = Vec_Get_Size(merged); i < max; i++) {
        String *merged_file = (String*)Vec_Fetch(merged, i);
        if (!Folder_Delete(folder, merged_file)) {
            String *mess = MAKE_MESS("Can't delete '%o'", merged_file);
            DECREF(merged);
            Err_throw_mess(ERR, mess);
        }
    }
    DECREF(merged);
}
示例#16
0
CompoundFileReader*
CFReader_do_open(CompoundFileReader *self, Folder *folder) {
    CompoundFileReaderIVARS *const ivars = CFReader_IVARS(self);
    String *cfmeta_file = (String*)SSTR_WRAP_UTF8("cfmeta.json", 11);
    Hash *metadata = (Hash*)Json_slurp_json((Folder*)folder, cfmeta_file);
    Err *error = NULL;

    Folder_init((Folder*)self, Folder_Get_Path(folder));

    // Parse metadata file.
    if (!metadata || !Hash_Is_A(metadata, HASH)) {
        error = Err_new(Str_newf("Can't read '%o' in '%o'", cfmeta_file,
                                 Folder_Get_Path(folder)));
    }
    else {
        Obj *format = Hash_Fetch_Utf8(metadata, "format", 6);
        ivars->format = format ? (int32_t)Obj_To_I64(format) : 0;
        ivars->records = (Hash*)INCREF(Hash_Fetch_Utf8(metadata, "files", 5));
        if (ivars->format < 1) {
            error = Err_new(Str_newf("Corrupt %o file: Missing or invalid 'format'",
                                     cfmeta_file));
        }
        else if (ivars->format > CFWriter_current_file_format) {
            error = Err_new(Str_newf("Unsupported compound file format: %i32 "
                                     "(current = %i32", ivars->format,
                                     CFWriter_current_file_format));
        }
        else if (!ivars->records) {
            error = Err_new(Str_newf("Corrupt %o file: missing 'files' key",
                                     cfmeta_file));
        }
    }
    DECREF(metadata);
    if (error) {
        Err_set_error(error);
        DECREF(self);
        return NULL;
    }

    // Open an instream which we'll clone over and over.
    String *cf_file = (String*)SSTR_WRAP_UTF8("cf.dat", 6);
    ivars->instream = Folder_Open_In(folder, cf_file);
    if (!ivars->instream) {
        ERR_ADD_FRAME(Err_get_error());
        DECREF(self);
        return NULL;
    }

    // Assign.
    ivars->real_folder = (Folder*)INCREF(folder);

    // Strip directory name from filepaths for old format.
    if (ivars->format == 1) {
        Vector *files = Hash_Keys(ivars->records);
        String *folder_name = IxFileNames_local_part(Folder_Get_Path(folder));
        size_t folder_name_len = Str_Length(folder_name);

        for (uint32_t i = 0, max = Vec_Get_Size(files); i < max; i++) {
            String *orig = (String*)Vec_Fetch(files, i);
            if (Str_Starts_With(orig, folder_name)) {
                Obj *record = Hash_Delete(ivars->records, orig);
                size_t offset = folder_name_len + sizeof(CHY_DIR_SEP) - 1;
                size_t len    = Str_Length(orig) - offset;
                String *filename = Str_SubString(orig, offset, len);
                Hash_Store(ivars->records, filename, (Obj*)record);
                DECREF(filename);
            }
        }

        DECREF(folder_name);
        DECREF(files);
    }

    return self;
}
示例#17
0
SortCache*
SortCache_init(SortCache *self, Schema *schema, Folder *folder,
               Segment *segment, i32_t field_num)
{
    CharBuf *field    = Seg_Field_Name(segment, field_num);
    CharBuf *seg_name = Seg_Get_Name(segment);
    CharBuf *ord_file = CB_newf("%o/sort-%i32.ord", seg_name, field_num);
    CharBuf *ix_file  = CB_newf("%o/sort-%i32.ix",  seg_name, field_num);
    CharBuf *dat_file = CB_newf("%o/sort-%i32.dat", seg_name, field_num);
    i64_t ord_len, ix_len, dat_len;

    /* Derive. */
    self->doc_max = Seg_Get_Count(segment);
    self->type    = Schema_Fetch_Type(schema, field);
    if (!self->type || !FType_Sortable(self->type)) {
        THROW("'%o' isn't a sortable field", field);
    }

    /* Open instreams. */
    self->ord_in  = Folder_Open_In(folder, ord_file);
    self->ix_in   = Folder_Open_In(folder, ix_file);
    self->dat_in  = Folder_Open_In(folder, dat_file);
    if (!self->ix_in || !self->dat_in || !self->ord_in) {
        CharBuf *mess = MAKE_MESS("Can't open either %o, %o or %o", ord_file, 
            ix_file, dat_file);
        DECREF(ord_file);
        DECREF(ix_file);
        DECREF(dat_file);
        Err_throw_mess(mess);
    }
    ord_len = InStream_Length(self->ord_in);
    ix_len  = InStream_Length(self->ix_in);
    dat_len = InStream_Length(self->dat_in);

    /* Calculate the number of unique values and derive the ord bit width. */
    self->num_uniq = (i32_t)(ix_len / 8) - 1; 
    self->width    = S_calc_width(self->num_uniq);

    /* Validate file lengths. */
    {
        double bytes_per_doc = self->width / 8.0;
        double max_ords      = ord_len / bytes_per_doc;
        if (max_ords < self->doc_max + 1) {
            THROW("Conflict between ord count max %f64 and doc_max %i32", 
                max_ords, self->doc_max);
        }
    }

    /* Mmap ords, offsets and character data. */
    self->ords      = InStream_Buf(self->ord_in, (size_t)ord_len);
    self->offsets   = (i64_t*)InStream_Buf(self->ix_in, (size_t)ix_len);
    self->char_data = InStream_Buf(self->dat_in, dat_len);
    {
        char *offs            = (char*)self->offsets;
        self->offsets_limit   = (i64_t*)(offs + ix_len);
        self->char_data_limit = self->char_data + dat_len;
    }

    DECREF(ord_file);
    DECREF(ix_file);
    DECREF(dat_file);

    return self;
}