Beispiel #1
0
/*
 * Write the summary statistics of a database to the file at `path` as a
 * single text line of five space-separated unsigned decimal numbers, in
 * this order: num_trails, num_events, min_timestamp, max_timestamp,
 * max_timedelta.
 *
 * Returns the value of `ret`, which starts at 0 and is only changed by
 * the TDB_* I/O macros. Those macros are defined outside this block;
 * they appear to rely on the local `ret` and on the `done` label.
 */
static tdb_error store_info(const char *path,
                            uint64_t num_trails,
                            uint64_t num_events,
                            uint64_t min_timestamp,
                            uint64_t max_timestamp,
                            uint64_t max_timedelta)
{
    int ret = 0;
    FILE *info = NULL;

    /*
    NOTE - keep this file at 512 bytes or less, so that it always
    takes a constant amount of space in a tar package.
    */
    TDB_OPEN(info, path, "w");
    TDB_FPRINTF(info,
                "%"PRIu64" %"PRIu64" %"PRIu64" %"PRIu64" %"PRIu64"\n",
                num_trails, num_events,
                min_timestamp, max_timestamp,
                max_timedelta);

done:
    TDB_CLOSE_FINAL(info);
    return ret;
}
Beispiel #2
0
/*
 * Write TDB_VERSION_LATEST, formatted with "%llu" and without a trailing
 * newline, to the file "<cons->root>/version".
 *
 * Returns the value of `ret`, which starts at 0 and is only changed by
 * the TDB_* macros (defined outside this block).
 */
static tdb_error store_version(tdb_cons *cons)
{
    int ret = 0;
    char path[TDB_MAX_PATH_SIZE];
    FILE *version_file = NULL;

    TDB_PATH(path, "%s/version", cons->root);
    TDB_OPEN(version_file, path, "w");
    TDB_FPRINTF(version_file, "%llu", TDB_VERSION_LATEST);

done:
    TDB_CLOSE_FINAL(version_file);
    return ret;
}
Beispiel #3
0
static tdb_error store_codebook(const struct judy_128_map *codemap,
                                const char *path)
{
    FILE *out = NULL;
    uint32_t size;
    struct huff_codebook *book = huff_create_codebook(codemap, &size);
    int ret = 0;

    TDB_OPEN(out, path, "w");
    TDB_WRITE(out, book, size);

done:
    TDB_CLOSE_FINAL(out);
    free(book);
    return ret;
}
Beispiel #4
0
/*
 * Write the lexicon (the set of values of one field) to the file at
 * `path`.
 *
 * The file is first extended to its final size, the number of values is
 * written at the start, and then jsm_fold() visits every key with
 * lexicon_store_fun (defined elsewhere; presumably it writes the offset
 * table and the values and advances state.offset -- confirm). Finally
 * the last offset is written to slot count + 1.
 *
 * Returns 0 on success, TDB_ERR_LEXICON_TOO_LARGE if the file would
 * exceed TDB_MAX_LEXICON_SIZE, the error reported by the fold callback
 * through state.ret, or the error left in `ret` by the TDB_* macros.
 */
static tdb_error lexicon_store(const struct judy_str_map *lexicon,
                               const char *path)
{
    /*
    Lexicon format:
    [ number of values N ] 4 or 8 bytes
    [ value offsets ...  ] N * (4 or 8 bytes)
    [ last value offset  ] 4 or 8 bytes
    [ values ...         ] X bytes
    */

    struct jm_fold_state state;
    int ret = 0;
    uint64_t count = jsm_num_keys(lexicon);
    /* total file size assuming 4-byte header entries */
    uint64_t size = (count + 2) * 4 + jsm_values_size(lexicon);

    if (size > UINT32_MAX){
        /* offsets would not fit in 32 bits: use 8-byte entries */
        size = (count + 2) * 8 + jsm_values_size(lexicon);
        state.offset = (count + 2) * 8;
        state.width = 8;
    }else{
        state.offset = (count + 2) * 4;
        state.width = 4;
    }

    if (size > TDB_MAX_LEXICON_SIZE)
        return TDB_ERR_LEXICON_TOO_LARGE;

    state.ret = 0;
    state.out = NULL;

    TDB_OPEN(state.out, path, "w");
    TDB_TRUNCATE(state.out, (off_t)size);
    /* only the first state.width bytes of the 64-bit count are written */
    TDB_WRITE(state.out, &count, state.width);

    jsm_fold(lexicon, lexicon_store_fun, &state);
    ret = state.ret;
    if (ret)
        goto done;

    /* the final offset goes to the slot right after the N value offsets */
    TDB_SEEK(state.out, (count + 1) * state.width);
    TDB_WRITE(state.out, &state.offset, state.width);

done:
    TDB_CLOSE_FINAL(state.out);
    return ret;
}
Beispiel #5
0
/*
 * Store the lexicon of every field and the list of field names.
 *
 * For each of the cons->num_ofields fields, the lexicon is written to
 * "<root>/lexicon.<field name>" and the field name is appended, one per
 * line, to "<root>/fields". The fields file ends with an empty line.
 *
 * Returns 0 on success, the first error returned by lexicon_store(), or
 * the error left in `ret` by the TDB_* macros (defined outside this
 * block).
 */
static tdb_error store_lexicons(tdb_cons *cons)
{
    char path[TDB_MAX_PATH_SIZE];
    FILE *fields_file = NULL;
    tdb_field field;
    int ret = 0;

    TDB_PATH(path, "%s/fields", cons->root);
    TDB_OPEN(fields_file, path, "w");

    /* `path` is reused below: the fields file is already open */
    for (field = 0; field < cons->num_ofields; field++){
        TDB_PATH(path,
                 "%s/lexicon.%s",
                 cons->root,
                 cons->ofield_names[field]);

        ret = lexicon_store(&cons->lexicons[field], path);
        if (ret)
            goto done;

        TDB_FPRINTF(fields_file, "%s\n", cons->ofield_names[field]);
    }

    /* empty line terminates the list of names */
    TDB_FPRINTF(fields_file, "\n");

done:
    TDB_CLOSE_FINAL(fields_file);
    return ret;
}
Beispiel #6
0
/*
 * Write the UUIDs of all trails to "<cons->root>/uuids".
 *
 * The file is first extended to 16 bytes per trail, after which
 * j128m_fold() visits every key of cons->trails with store_uuids_fun
 * (defined elsewhere; presumably it writes each UUID into the file
 * through state.out -- confirm).
 *
 * Returns 0 on success, TDB_ERR_TOO_MANY_TRAILS if there are more than
 * TDB_MAX_NUM_TRAILS trails, the error reported by the fold callback
 * through state.ret, or the error left in `ret` by the TDB_* macros.
 */
static tdb_error store_uuids(tdb_cons *cons)
{
    struct jm_fold_state state = {.ret = 0};
    char path[TDB_MAX_PATH_SIZE];
    uint64_t num_trails = j128m_num_keys(&cons->trails);
    uint64_t uuids_size;
    int ret = 0;

    /* this is why num_trails <= TDB_MAX_NUM_TRAILS < 2^59:
       (2^59 - 1) * 16 < LONG_MAX (off_t) */
    if (num_trails > TDB_MAX_NUM_TRAILS)
        return TDB_ERR_TOO_MANY_TRAILS;

    /* each UUID takes 16 bytes */
    uuids_size = num_trails * 16;

    TDB_PATH(path, "%s/uuids", cons->root);
    TDB_OPEN(state.out, path, "w");
    TDB_TRUNCATE(state.out, ((off_t)uuids_size));

    j128m_fold(&cons->trails, store_uuids_fun, &state);
    ret = state.ret;

done:
    TDB_CLOSE_FINAL(state.out);
    return ret;
}

/*
 * Check whether `field` is unusable as a field name.
 *
 * A name is rejected when it
 *   - equals "time",
 *   - is empty,
 *   - has TDB_MAX_FIELDNAME_LENGTH or more characters, or
 *   - contains a character that is not listed in TDB_FIELDNAME_CHARS.
 *
 * `field` must be a valid NUL-terminated string.
 *
 * Returns 1 if the name is invalid, 0 if it is acceptable.
 */
int is_fieldname_invalid(const char* field)
{
    uint64_t i;

    if (!strcmp(field, "time"))
        return 1;

    /* strchr() is the ISO C equivalent of the legacy BSD index(), which
       was removed from POSIX.1-2008. The loop stops at the terminating
       NUL, so strchr() is never asked to look up '\0'. */
    for (i = 0; i < TDB_MAX_FIELDNAME_LENGTH && field[i]; i++){
        if (!strchr(TDB_FIELDNAME_CHARS, field[i]))
            return 1;
    }

    /* i == 0: empty name; i == limit: the scan ran into the length cap */
    if (i == 0 || i == TDB_MAX_FIELDNAME_LENGTH)
        return 1;

    return 0;
}
Beispiel #7
0
/*
 * Encode the events collected in `cons` and write the trail files under
 * cons->root.
 *
 * Steps:
 *   1. group events by trail into a temporary file "tmp.grouped.XXXXXX"
 *      (groupby_uuid), which is removed again before returning
 *   2. write "<root>/info"
 *   3. collect unigram frequencies
 *   4. build uni/bi-gram frequencies
 *   5. build the Huffman code map and the field statistics
 *   6. write "<root>/trails.data" and "<root>/trails.toc"
 *   7. write "<root>/trails.codebook"
 *
 * Side effects on `cons`: cons->events.data is freed and set to NULL and
 * cons->trails is released with j128m_free() after step 1, whether or
 * not the later steps succeed.
 *
 * Returns 0 on success or a tdb_error code. The TDB_* macros used here
 * are defined outside this block; they appear to set `ret` and jump to
 * `done` on failure.
 */
tdb_error tdb_encode(tdb_cons *cons, const tdb_item *items)
{
    char path[TDB_MAX_PATH_SIZE];
    char grouped_path[TDB_MAX_PATH_SIZE];
    char toc_path[TDB_MAX_PATH_SIZE];
    char *root = cons->root;
    char *read_buf = NULL;
    struct field_stats *fstats = NULL;
    uint64_t num_trails = 0;
    uint64_t num_events = cons->events.next;
    uint64_t num_fields = cons->num_ofields + 1;
    uint64_t max_timestamp = 0;
    uint64_t max_timedelta = 0;
    uint64_t *field_cardinalities = NULL;
    uint64_t i;
    Pvoid_t unigram_freqs = NULL;
    struct judy_128_map gram_freqs;
    struct judy_128_map codemap;
    Word_t tmp;
    FILE *grouped_w = NULL;
    FILE *grouped_r = NULL;
    FILE *closing = NULL;
    int fd = -1, ret = 0;
    /* set once mkstemp() has created the temporary file, so that the
       cleanup code only unlinks a file this call actually created */
    int grouped_created = 0;
    TDB_TIMER_DEF

    j128m_init(&gram_freqs);
    j128m_init(&codemap);

    /* allocate num_fields (= num_ofields + 1) slots: this is never a
       zero-sized request, for which calloc() may legally return NULL
       and would be mistaken for an out-of-memory condition */
    if (!(field_cardinalities = calloc(num_fields,
                                       sizeof *field_cardinalities))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }

    for (i = 0; i < cons->num_ofields; i++)
        field_cardinalities[i] = jsm_num_keys(&cons->lexicons[i]);

    /* 1. group events by trail, sort events of each trail by time,
          and delta-encode timestamps */
    TDB_TIMER_START

    TDB_PATH(grouped_path, "%s/tmp.grouped.XXXXXX", root);
    if ((fd = mkstemp(grouped_path)) == -1){
        ret = TDB_ERR_IO_OPEN;
        goto done;
    }
    grouped_created = 1;

    if (!(grouped_w = fdopen(fd, "w"))){
        /* fdopen() does not close the descriptor when it fails */
        close(fd);
        ret = TDB_ERR_IO_OPEN;
        goto done;
    }

    if (cons->events.data)
        if ((ret = groupby_uuid(grouped_w,
                                (struct tdb_cons_event*)cons->events.data,
                                cons,
                                &num_trails,
                                &max_timestamp,
                                &max_timedelta)))
            goto done;

    /*
    not the cleanest separation of ownership here, but these objects
    can be huge so keeping them around unnecessarily is expensive
    */
    free(cons->events.data);
    cons->events.data = NULL;
    j128m_free(&cons->trails);

    /* forget grouped_w before closing it, so that the cleanup code at
       `done` can never close the same stream a second time if the
       close below fails */
    closing = grouped_w;
    grouped_w = NULL;
    TDB_CLOSE(closing);

    TDB_OPEN(grouped_r, grouped_path, "r");
    if (!(read_buf = malloc(READ_BUFFER_SIZE))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }

    /* read_buf must stay allocated until grouped_r has been closed */
    setvbuf(grouped_r, read_buf, _IOFBF, READ_BUFFER_SIZE);
    TDB_TIMER_END("trail/groupby_uuid");

    /* 2. store metadata */
    TDB_TIMER_START
    TDB_PATH(path, "%s/info", root);
    if ((ret = store_info(path,
                          num_trails,
                          num_events,
                          cons->min_timestamp,
                          max_timestamp,
                          max_timedelta)))
        goto done;
    TDB_TIMER_END("trail/info");

    /* 3. collect value (unigram) freqs, including delta-encoded timestamps */
    TDB_TIMER_START
    unigram_freqs = collect_unigrams(grouped_r, num_events, items, num_fields);
    if (num_events > 0 && !unigram_freqs){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    TDB_TIMER_END("trail/collect_unigrams");

    /* 4. construct uni/bi-grams */
    tdb_opt_value dont_build_bigrams;
    tdb_cons_get_opt(cons, TDB_OPT_CONS_NO_BIGRAMS, &dont_build_bigrams);

    TDB_TIMER_START
    if ((ret = make_grams(grouped_r,
                          num_events,
                          items,
                          num_fields,
                          unigram_freqs,
                          &gram_freqs,
                          dont_build_bigrams.value)))
        goto done;
    TDB_TIMER_END("trail/gram_freqs");

    /* 5. build a huffman codebook and stats struct for encoding grams */
    TDB_TIMER_START
    if ((ret = huff_create_codemap(&gram_freqs, &codemap)))
        goto done;
    if (!(fstats = huff_field_stats(field_cardinalities,
                                    num_fields,
                                    max_timedelta))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    TDB_TIMER_END("trail/huff_create_codemap");

    /* 6. encode and write trails to disk */
    TDB_TIMER_START
    TDB_PATH(path, "%s/trails.data", root);
    TDB_PATH(toc_path, "%s/trails.toc", root);
    if ((ret = encode_trails(items,
                             grouped_r,
                             num_events,
                             num_trails,
                             num_fields,
                             &codemap,
                             &gram_freqs,
                             fstats,
                             path,
                             toc_path)))
        goto done;
    TDB_TIMER_END("trail/encode_trails");

    /* 7. write huffman codebook to disk */
    TDB_TIMER_START
    /* use the checking macro like every other path above, instead of
       calling tdb_path() directly and ignoring its result */
    TDB_PATH(path, "%s/trails.codebook", root);
    if ((ret = store_codebook(&codemap, path)))
        goto done;
    TDB_TIMER_END("trail/store_codebook");

done:
    TDB_CLOSE_FINAL(grouped_w);
    TDB_CLOSE_FINAL(grouped_r);
    j128m_free(&gram_freqs);
    j128m_free(&codemap);
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsign-compare"
    JLFA(tmp, unigram_freqs);
#pragma GCC diagnostic pop

    if (grouped_created)
        unlink(grouped_path);

    free(field_cardinalities);
    free(read_buf);
    free(fstats);

    return ret;

    /* NOTE(review): nothing in this function jumps here by name.
       Presumably this label is the target of the project's Judy error
       handler used by JLFA above -- confirm before removing it. */
out_of_memory:
    return TDB_ERR_NOMEM;
}
Beispiel #8
0
/*
 * Encode all grouped events and write the trail data file and its table
 * of contents (TOC).
 *
 * items      - item array passed through to edge_encode_items()
 * grouped    - stream of struct tdb_grouped_event records; it is rewound
 *              here and read sequentially
 * num_events - number of records to read from `grouped`
 * num_trails - number of trails; the TOC gets num_trails + 1 entries
 * num_fields - number of fields; sizes the per-event work buffers
 * codemap, gram_freqs, fstats - inputs to gram selection and Huffman
 *              encoding (see choose_grams_one_event / huff_encode_grams)
 * path       - output path of the encoded trails
 * toc_path   - output path of the TOC
 *
 * Consecutive records with the same trail_id are encoded into one bit
 * buffer, which is then appended to `path`; toc[trail_id] records the
 * byte offset where that trail starts.
 *
 * Returns 0 on success, TDB_ERR_NOMEM on allocation failure, or the
 * error produced by a helper or left in `ret` by the TDB_* macros
 * (defined outside this block; they appear to jump to `done`).
 */
static tdb_error encode_trails(const tdb_item *items,
                               FILE *grouped,
                               uint64_t num_events,
                               uint64_t num_trails,
                               uint64_t num_fields,
                               const struct judy_128_map *codemap,
                               const struct judy_128_map *gram_freqs,
                               const struct field_stats *fstats,
                               const char *path,
                               const char *toc_path)
{
    __uint128_t *grams = NULL;
    tdb_item *prev_items = NULL;
    uint64_t *encoded = NULL;
    uint64_t encoded_size = 0;
    /* capacity of `buf`, in bits */
    uint64_t buf_size = INITIAL_ENCODING_BUF_BITS;
    /* 1-based index of the event currently held in `ev` */
    uint64_t i = 1;
    char *buf = NULL;
    FILE *out = NULL;
    /* number of bytes written to the data file so far */
    uint64_t file_offs = 0;
    uint64_t *toc = NULL;
    struct gram_bufs gbufs;
    struct tdb_grouped_event ev;
    int ret = 0;
    char *write_buf = NULL;

    if ((ret = init_gram_bufs(&gbufs, num_fields)))
        goto done;

    if (!(write_buf = malloc(WRITE_BUFFER_SIZE))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }

    TDB_OPEN(out, path, "w");
    /* write_buf must stay allocated until `out` has been closed */
    setvbuf(out, write_buf, _IOFBF, WRITE_BUFFER_SIZE);

    /* zeroed bit buffer, with 8 spare bytes beyond buf_size bits */
    if (!(buf = calloc(1, buf_size / 8 + 8))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    if (!(prev_items = malloc(num_fields * sizeof(tdb_item)))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    /* one 16-byte (__uint128_t) gram slot per field */
    if (!(grams = malloc(num_fields * 16))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    /* NOTE(review): toc is not zero-initialized; only entries for trail
       ids that occur in `grouped`, plus the last one, are assigned
       below. Presumably every trail has at least one event -- confirm. */
    if (!(toc = malloc((num_trails + 1) * 8))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }

    rewind(grouped);
    /* prime `ev` with the first event, if there is one */
    if (num_events)
        TDB_READ(grouped, &ev, sizeof(struct tdb_grouped_event));

    while (i <= num_events){
        /* encode trail for one UUID (multiple events) */

        /* reserve 3 bits in the head of the trail for a length residual:
           Length of a trail is measured in bytes but the last byte may
           be short. The residual indicates how many bits in the end we
           should ignore. */
        uint64_t offs = 3;
        uint64_t trail_id = ev.trail_id;
        uint64_t n, m, trail_size;

        toc[trail_id] = file_offs;
        /* each trail starts with all previous items cleared */
        memset(prev_items, 0, num_fields * sizeof(tdb_item));

        while (ev.trail_id == trail_id){

            /* 1) produce an edge-encoded set of items for this event */
            if ((ret = edge_encode_items(items,
                                         &encoded,
                                         &n,
                                         &encoded_size,
                                         prev_items,
                                         &ev)))
                goto done;

            /* 2) cover the encoded set with a set of unigrams and bigrams */
            if ((ret = choose_grams_one_event(encoded,
                                              n,
                                              gram_freqs,
                                              &gbufs,
                                              grams,
                                              &m,
                                              &ev)))
                goto done;

            /* grow the bit buffer if this event might not fit: allocate
               a zeroed buffer of twice the needed size and carry over
               the bytes used so far */
            uint64_t bits_needed = offs + huff_encoded_max_bits(m) + 64;
            if (bits_needed > buf_size){
                char *new_buf;
                buf_size = bits_needed * 2;
                if (!(new_buf = calloc(1, buf_size / 8 + 8))){
                    ret = TDB_ERR_NOMEM;
                    goto done;
                }
                memcpy(new_buf, buf, offs / 8 + 1);
                free(buf);
                buf = new_buf;
            }

            /* 3) huffman-encode grams */
            huff_encode_grams(codemap,
                              grams,
                              m,
                              buf,
                              &offs,
                              fstats);

            /* fetch the next event, or stop after the last one */
            if (i++ < num_events){
                TDB_READ(grouped, &ev, sizeof(struct tdb_grouped_event));
            }else
                break;
        }

        /* write the length residual */
        if (offs & 7){
            /* last byte is partial: round the size up and record the
               number of unused bits at bit offset 0 */
            trail_size = offs / 8 + 1;
            write_bits(buf, 0, 8 - (uint32_t)(offs & 7LLU));
        }else{
            /* whole bytes: the reserved bits keep their zero value */
            trail_size = offs / 8;
        }

        /* append trail to the end of file */
        TDB_WRITE(out, buf, trail_size);

        file_offs += trail_size;
        /* clear the used bytes so the next trail starts from zeros */
        memset(buf, 0, trail_size);

    }
    /* keep the redundant last offset in the TOC, so we can determine
       trail length with toc[i + 1] - toc[i]. */
    toc[num_trails] = file_offs;

    /* write an extra 8 null bytes: huffman may require up to 7 when reading */
    uint64_t zero = 0;
    TDB_WRITE(out, &zero, 8);
    file_offs += 8;
    /* NOTE(review): `out` is still non-NULL here; if TDB_CLOSE jumps to
       `done` on failure, the stream would be closed a second time by
       TDB_CLOSE_FINAL -- confirm against the macro definitions. */
    TDB_CLOSE(out);

    TDB_OPEN(out, toc_path, "w");
    /* TOC entries take 4 bytes if every offset fits in 32 bits, else 8;
       only the first offs_size bytes of each 64-bit entry are written */
    size_t offs_size = file_offs < UINT32_MAX ? 4 : 8;
    for (i = 0; i < num_trails + 1; i++)
        TDB_WRITE(out, &toc[i], offs_size);

done:
    TDB_CLOSE_FINAL(out);

    free(write_buf);
    free_gram_bufs(&gbufs);
    free(grams);
    free(encoded);
    free(prev_items);
    free(buf);
    free(toc);

    return ret;
}