Example #1
0
/*
 * Finalize a constructor handle.
 *
 * Flushes the items arena and closes its file. If a tempfile name is set,
 * the tempfile is memory-mapped (only when there are both events and
 * fields), the lexicons, uuids and version are stored, and the mapped
 * items are handed to tdb_encode(). On every path the mapping is released
 * and the tempfile is unlinked. When everything succeeded and the build
 * has HAVE_ARCHIVE_H, the package output format additionally runs
 * cons_package().
 *
 * Returns 0 on success, otherwise the first non-zero error code hit
 * (TDB_ERR_IO_CLOSE, TDB_ERR_IO_READ, or whatever a helper returned).
 */
TDB_EXPORT tdb_error tdb_cons_finalize(tdb_cons *cons)
{
    const uint64_t event_count = cons->events.next;
    struct tdb_file item_map;
    int ret = 0;
    TDB_TIMER_DEF

    memset(&item_map, 0, sizeof(item_map));

    /* finalize event items: flush the arena, then close its file */
    ret = arena_flush(&cons->items);
    if (ret)
        goto done;

    if (cons->items.fd){
        const int close_failed = fclose(cons->items.fd);

        /* the handle is dropped whether or not fclose() succeeded */
        cons->items.fd = NULL;
        if (close_failed){
            ret = TDB_ERR_IO_CLOSE;
            goto done;
        }
    }

    /* without a tempfile there is nothing to store or encode */
    if (!cons->tempfile[0])
        goto done;

    /* the items are mapped only if there are events AND fields;
       otherwise item_map.data stays NULL for tdb_encode() */
    if (event_count &&
        cons->num_ofields &&
        file_mmap(cons->tempfile, NULL, &item_map, NULL)){
        ret = TDB_ERR_IO_READ;
        goto done;
    }

    TDB_TIMER_START
    ret = store_lexicons(cons);
    if (ret)
        goto done;
    TDB_TIMER_END("encoder/store_lexicons")

    TDB_TIMER_START
    ret = store_uuids(cons);
    if (ret)
        goto done;
    TDB_TIMER_END("encoder/store_uuids")

    TDB_TIMER_START
    ret = store_version(cons);
    if (ret)
        goto done;
    TDB_TIMER_END("encoder/store_version")

    TDB_TIMER_START
    ret = tdb_encode(cons, (const tdb_item*)item_map.data);
    if (ret)
        goto done;
    TDB_TIMER_END("encoder/encode")

done:
    if (item_map.ptr)
        munmap(item_map.ptr, item_map.mmap_size);

    if (cons->tempfile[0])
        unlink(cons->tempfile);

#ifdef HAVE_ARCHIVE_H
    /* packaging is attempted only after a fully successful finalize */
    if (!ret && cons->output_format == TDB_OPT_CONS_OUTPUT_FORMAT_PACKAGE)
        ret = cons_package(cons);
#endif
    return ret;
}
Example #2
0
/*
 * Encode the events collected in `cons` and write the results under
 * cons->root.
 *
 * cons  - constructor state; its events buffer and trails map are freed
 *         by this function once events have been grouped (see step 1).
 * items - item array passed through to the unigram/gram/encoding helpers;
 *         the caller may pass NULL when nothing was mapped.
 *
 * Steps: group events into a temporary "tmp.grouped.XXXXXX" file, store
 * "info", collect unigram frequencies, build grams, build a huffman
 * codemap plus field stats, write "trails.data"/"trails.toc", and write
 * "trails.codebook". The temporary grouped file is removed before
 * returning, on success and on failure.
 *
 * Returns 0 on success, otherwise a non-zero error code
 * (TDB_ERR_NOMEM, TDB_ERR_IO_OPEN, or whatever a helper/macro stored
 * in `ret`).
 */
tdb_error tdb_encode(tdb_cons *cons, const tdb_item *items)
{
    char path[TDB_MAX_PATH_SIZE];
    char grouped_path[TDB_MAX_PATH_SIZE];
    char toc_path[TDB_MAX_PATH_SIZE];
    char *root = cons->root;
    char *read_buf = NULL;
    struct field_stats *fstats = NULL;
    uint64_t num_trails = 0;
    uint64_t num_events = cons->events.next;
    uint64_t num_fields = cons->num_ofields + 1;
    uint64_t max_timestamp = 0;
    uint64_t max_timedelta = 0;
    uint64_t *field_cardinalities = NULL;
    uint64_t i;
    Pvoid_t unigram_freqs = NULL;
    struct judy_128_map gram_freqs;
    struct judy_128_map codemap;
    Word_t tmp;
    FILE *grouped_w = NULL;
    FILE *grouped_r = NULL;
    int fd, ret = 0;
    /* set once mkstemp() has actually created the temporary file, so the
       cleanup code never unlinks an uninitialized or template path */
    int grouped_created = 0;
    TDB_TIMER_DEF

    j128m_init(&gram_freqs);
    j128m_init(&codemap);

    if (!(field_cardinalities = calloc(cons->num_ofields,
                                       sizeof(*field_cardinalities)))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }

    for (i = 0; i < cons->num_ofields; i++)
        field_cardinalities[i] = jsm_num_keys(&cons->lexicons[i]);

    /* 1. group events by trail, sort events of each trail by time,
          and delta-encode timestamps */
    TDB_TIMER_START

    TDB_PATH(grouped_path, "%s/tmp.grouped.XXXXXX", root);
    if ((fd = mkstemp(grouped_path)) == -1){
        ret = TDB_ERR_IO_OPEN;
        goto done;
    }
    grouped_created = 1;

    if (!(grouped_w = fdopen(fd, "w"))){
        /* fdopen() does not take ownership of fd when it fails */
        close(fd);
        ret = TDB_ERR_IO_OPEN;
        goto done;
    }

    if (cons->events.data)
        if ((ret = groupby_uuid(grouped_w,
                                (struct tdb_cons_event*)cons->events.data,
                                cons,
                                &num_trails,
                                &max_timestamp,
                                &max_timedelta)))
            goto done;

    /*
    not the cleanest separation of ownership here, but these objects
    can be huge so keeping them around unnecessarily is expensive
    */
    free(cons->events.data);
    cons->events.data = NULL;
    j128m_free(&cons->trails);

    TDB_CLOSE(grouped_w);
    grouped_w = NULL;

    TDB_OPEN(grouped_r, grouped_path, "r");
    if (!(read_buf = malloc(READ_BUFFER_SIZE))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }

    /* read_buf must outlive grouped_r; both are released at done: */
    setvbuf(grouped_r, read_buf, _IOFBF, READ_BUFFER_SIZE);
    TDB_TIMER_END("trail/groupby_uuid");

    /* 2. store metadata */
    TDB_TIMER_START
    TDB_PATH(path, "%s/info", root);
    if ((ret = store_info(path,
                          num_trails,
                          num_events,
                          cons->min_timestamp,
                          max_timestamp,
                          max_timedelta)))
        goto done;
    TDB_TIMER_END("trail/info");

    /* 3. collect value (unigram) freqs, including delta-encoded timestamps */
    TDB_TIMER_START
    unigram_freqs = collect_unigrams(grouped_r, num_events, items, num_fields);
    /* a NULL result is only treated as an error when there were events */
    if (num_events > 0 && !unigram_freqs){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    TDB_TIMER_END("trail/collect_unigrams");

    /* 4. construct uni/bi-grams */
    /* NOTE(review): the return value of tdb_cons_get_opt() is not checked */
    tdb_opt_value dont_build_bigrams;
    tdb_cons_get_opt(cons, TDB_OPT_CONS_NO_BIGRAMS, &dont_build_bigrams);

    TDB_TIMER_START
    if ((ret = make_grams(grouped_r,
                          num_events,
                          items,
                          num_fields,
                          unigram_freqs,
                          &gram_freqs,
                          dont_build_bigrams.value)))
        goto done;
    TDB_TIMER_END("trail/gram_freqs");

    /* 5. build a huffman codebook and stats struct for encoding grams */
    TDB_TIMER_START
    if ((ret = huff_create_codemap(&gram_freqs, &codemap)))
        goto done;
    if (!(fstats = huff_field_stats(field_cardinalities,
                                    num_fields,
                                    max_timedelta))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    TDB_TIMER_END("trail/huff_create_codemap");

    /* 6. encode and write trails to disk */
    TDB_TIMER_START
    TDB_PATH(path, "%s/trails.data", root);
    TDB_PATH(toc_path, "%s/trails.toc", root);
    if ((ret = encode_trails(items,
                             grouped_r,
                             num_events,
                             num_trails,
                             num_fields,
                             &codemap,
                             &gram_freqs,
                             fstats,
                             path,
                             toc_path)))
        goto done;
    TDB_TIMER_END("trail/encode_trails");

    /* 7. write huffman codebook to disk */
    TDB_TIMER_START
    /* built with TDB_PATH like every other path in this function, instead
       of calling tdb_path() directly and discarding its result */
    TDB_PATH(path, "%s/trails.codebook", root);
    if ((ret = store_codebook(&codemap, path)))
        goto done;
    TDB_TIMER_END("trail/store_codebook");

done:
    TDB_CLOSE_FINAL(grouped_w);
    TDB_CLOSE_FINAL(grouped_r);
    j128m_free(&gram_freqs);
    j128m_free(&codemap);
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsign-compare"
    JLFA(tmp, unigram_freqs);
#pragma GCC diagnostic pop

    /* only remove the temporary file if mkstemp() really created it */
    if (grouped_created)
        unlink(grouped_path);

    free(field_cardinalities);
    free(read_buf);
    free(fstats);

    return ret;

    /* No goto in this function targets this label directly. It is
       presumably reached from the Judy error hook used inside JLFA —
       confirm against the project's JUDYERROR definition before removing. */
out_of_memory:
    return TDB_ERR_NOMEM;
}