Ejemplo n.º 1
0
/*
 * Write `lexicon` to the file at `path`.
 *
 * File layout, where W is 4 bytes, or 8 bytes when the 4-byte layout would
 * exceed UINT32_MAX bytes in total:
 *
 *   [ number of values N ]  W bytes
 *   [ value offsets ...  ]  N * W bytes
 *   [ last value offset  ]  W bytes
 *   [ values ...         ]  X bytes
 *
 * Returns 0 on success, TDB_ERR_LEXICON_TOO_LARGE when the file would be
 * larger than TDB_MAX_LEXICON_SIZE, the error recorded by the fold callback
 * in state.ret, or the code left in `ret` by a failed I/O step.
 *
 * NOTE(review): the TDB_* I/O macros are defined outside this block; they
 * are expected to set `ret` and jump to `done` on failure.
 */
static tdb_error lexicon_store(const struct judy_str_map *lexicon,
                               const char *path)
{
    struct jm_fold_state state;
    uint64_t count = jsm_num_keys(lexicon);
    /* header = count field + one offset per value + the trailing offset */
    uint64_t header_bytes = (count + 2) * 4;
    uint64_t size = header_bytes + jsm_values_size(lexicon);
    int ret = 0;

    state.out = NULL;
    state.ret = 0;
    state.width = 4;

    /* the compact layout cannot address the whole file: widen the fields */
    if (size > UINT32_MAX){
        header_bytes = (count + 2) * 8;
        size = header_bytes + jsm_values_size(lexicon);
        state.width = 8;
    }

    /* the first value is stored right after the header */
    state.offset = header_bytes;

    if (size > TDB_MAX_LEXICON_SIZE)
        return TDB_ERR_LEXICON_TOO_LARGE;

    TDB_OPEN(state.out, path, "w");
    TDB_TRUNCATE(state.out, (off_t)size);
    TDB_WRITE(state.out, &count, state.width);

    /* the callback writes each value and its offset, advancing state.offset */
    jsm_fold(lexicon, lexicon_store_fun, &state);
    ret = state.ret;

    if (ret == 0){
        /* the last slot receives the offset just past the final value */
        TDB_SEEK(state.out, (count + 1) * state.width);
        TDB_WRITE(state.out, &state.offset, state.width);
    }

done:
    TDB_CLOSE_FINAL(state.out);
    return ret;
}
Ejemplo n.º 2
0
/*
 * Fold callback used by lexicon_store(): writes one lexicon value.
 *
 * The current write offset is stored in the offset slot of `id`, then the
 * `len` bytes of `value` are written at that offset. The running offset in
 * the fold state is advanced by `len` and the outcome is recorded in the
 * state's `ret` field. Once a previous call has recorded an error, the
 * callback does nothing.
 *
 * Always returns `state`.
 */
static void *lexicon_store_fun(uint64_t id,
                               const char *value,
                               uint64_t len,
                               void *state)
{
    struct jm_fold_state *fold = state;
    int ret = 0;

    /* an earlier value already failed: leave the state untouched */
    if (fold->ret != 0)
        return state;

    /* ids start at 1, so slot `id` lands right after the count field
       without any +1 adjustment */
    TDB_SEEK(fold->out, id * fold->width);
    TDB_WRITE(fold->out, &fold->offset, fold->width);

    /* the value itself goes to the position just recorded */
    TDB_SEEK(fold->out, fold->offset);
    TDB_WRITE(fold->out, value, len);

done:
    fold->offset += len;
    fold->ret = ret;
    return state;
}
Ejemplo n.º 3
0
/*
 * Build a Huffman codebook from `codemap` and write it to the file at
 * `path`.
 *
 * The codebook returned by huff_create_codebook() is owned by this function
 * and released before returning.
 *
 * Returns 0 on success, or the code left in `ret` by a failed I/O step.
 *
 * NOTE(review): the TDB_* I/O macros are defined outside this block; they
 * are expected to set `ret` and jump to `done` on failure.
 */
static tdb_error store_codebook(const struct judy_128_map *codemap,
                                const char *path)
{
    FILE *out = NULL;
    /* start from a defined value in case the helper fails before setting it */
    uint32_t size = 0;
    struct huff_codebook *book = huff_create_codebook(codemap, &size);
    int ret = 0;

    /* NOTE(review): huff_create_codebook() presumably returns NULL when it
       cannot allocate; that case is not reported separately here - confirm
       against its definition and add an explicit check if so */
    TDB_OPEN(out, path, "w");
    TDB_WRITE(out, book, size);

done:
    /* The stream never references `book` after the write, so release it
       before the final close: should TDB_CLOSE_FINAL leave the function
       early on a close error, the codebook must not be leaked. */
    free(book);
    TDB_CLOSE_FINAL(out);
    return ret;
}
Ejemplo n.º 4
0
/*
 * Fold callback: collect, sort and write out the events of one trail.
 *
 * `value` holds the 1-based index (into s->events) of the last event of the
 * trail; earlier events are reached through prev_event_idx back-links, where
 * 0 marks the end of the chain. The events are copied to the reusable
 * buffer s->buf (grown in steps of GROUPBUF_INCREMENT), sorted with
 * `compare`, their timestamps replaced by delta-encoded items, and the
 * result appended to s->grouped_w. s->trail_id is incremented after a
 * successful write.
 *
 * The outcome is stored in s->ret: 0 on success, TDB_ERR_NOMEM,
 * TDB_ERR_TRAIL_TOO_LONG, TDB_ERR_TIMESTAMP_TOO_LARGE, or the code left in
 * `ret` by a failed write. Once s->ret is non-zero, further calls are
 * no-ops.
 *
 * Always returns `state`.
 */
static void *groupby_uuid_handle_one_trail(
    __uint128_t uuid __attribute__((unused)),
    Word_t *value,
    void *state)
{
    struct jm_fold_state *s = (struct jm_fold_state*)state;
    /* find the last event belonging to this trail */
    const struct tdb_cons_event *ev = &s->events[*value - 1];
    uint64_t j = 0;
    uint64_t num_events = 0;
    int ret = 0;

    if (s->ret)
        return s;

    /* loop through all events belonging to this trail,
       following back-links */
    while (1){
        if (j >= s->buf_size){
            /* Grow through a temporary: on failure realloc() leaves the old
               block allocated, so s->buf must keep pointing at it (otherwise
               it would be leaked), and buf_size must keep describing it. */
            void *grown = realloc(s->buf,
                                  (s->buf_size + GROUPBUF_INCREMENT) *
                                  sizeof(struct tdb_grouped_event));
            if (!grown){
                ret = TDB_ERR_NOMEM;
                goto done;
            }
            s->buf = grown;
            s->buf_size += GROUPBUF_INCREMENT;
        }
        s->buf[j].trail_id = s->trail_id;
        s->buf[j].item_zero = ev->item_zero;
        s->buf[j].num_items = ev->num_items;
        s->buf[j].timestamp = ev->timestamp;

        /* TODO write a test for an extra long (>2^32) trail */
        if (++j == TDB_MAX_TRAIL_LENGTH){
            ret = TDB_ERR_TRAIL_TOO_LONG;
            goto done;
        }

        /* indices are 1-based; 0 means there is no earlier event */
        if (ev->prev_event_idx)
            ev = &s->events[ev->prev_event_idx - 1];
        else
            break;
    }
    num_events = j;

    /* sort events of this trail by time */
    /* TODO make this stable sort */
    /* TODO this could really benefit from Timsort since raw data
       is often partially sorted */
    qsort(s->buf, num_events, sizeof(struct tdb_grouped_event), compare);

    /* delta-encode timestamps, starting from the global minimum */
    uint64_t prev_timestamp = s->min_timestamp;
    for (j = 0; j < num_events; j++){
        uint64_t timestamp = s->buf[j].timestamp;
        /* unsigned subtraction: a timestamp below prev_timestamp wraps to a
           huge delta and is rejected by the range check below */
        uint64_t delta = timestamp - prev_timestamp;
        if (delta < TDB_MAX_TIMEDELTA){
            if (timestamp > s->max_timestamp)
                s->max_timestamp = timestamp;
            if (delta > s->max_timedelta)
                s->max_timedelta = delta;
            prev_timestamp = timestamp;
            /* convert the delta value to a proper item */
            s->buf[j].timestamp = tdb_make_item(0, delta);
        }else{
            ret = TDB_ERR_TIMESTAMP_TOO_LARGE;
            goto done;
        }
    }

    TDB_WRITE(s->grouped_w,
              s->buf,
              num_events * sizeof(struct tdb_grouped_event));
    ++s->trail_id;

done:
    s->ret = ret;
    return s;
}
Ejemplo n.º 5
0
/*
 * Encode the grouped events read from `grouped` into the trails file at
 * `path`, and write the table of contents (TOC) to `toc_path`.
 *
 * For every event: its items are edge-encoded against the previous event of
 * the same trail, the encoded set is covered with grams chosen using
 * `gram_freqs`, and the grams are Huffman-encoded with `codemap` into a bit
 * buffer. Events are expected to arrive grouped by trail_id; each trail is
 * written as a run of whole bytes, prefixed with a 3-bit length residual.
 *
 * The TOC holds num_trails + 1 byte offsets into the trails file, so the
 * length of trail i is toc[i + 1] - toc[i]. Offsets are written using 4
 * bytes each when the final file size is below UINT32_MAX and 8 bytes
 * otherwise (the leading bytes of each 64-bit entry are written as-is).
 *
 * Returns 0 on success, TDB_ERR_NOMEM on allocation failure, an error from
 * one of the encoding helpers, or the code left in `ret` by a failed I/O
 * step.
 *
 * NOTE(review): the TDB_* I/O macros are defined outside this block; they
 * are expected to set `ret` and jump to `done` on failure.
 */
static tdb_error encode_trails(const tdb_item *items,
                               FILE *grouped,
                               uint64_t num_events,
                               uint64_t num_trails,
                               uint64_t num_fields,
                               const struct judy_128_map *codemap,
                               const struct judy_128_map *gram_freqs,
                               const struct field_stats *fstats,
                               const char *path,
                               const char *toc_path)
{
    __uint128_t *grams = NULL;
    tdb_item *prev_items = NULL;
    uint64_t *encoded = NULL;
    uint64_t encoded_size = 0;
    uint64_t buf_size = INITIAL_ENCODING_BUF_BITS;
    uint64_t i = 1;
    char *buf = NULL;
    FILE *out = NULL;
    uint64_t file_offs = 0;
    uint64_t *toc = NULL;
    struct gram_bufs gbufs;
    struct tdb_grouped_event ev;
    int ret = 0;
    char *write_buf = NULL;

    if ((ret = init_gram_bufs(&gbufs, num_fields)))
        goto done;

    if (!(write_buf = malloc(WRITE_BUFFER_SIZE))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }

    TDB_OPEN(out, path, "w");
    /* write_buf backs the stream, so it must stay alive until it is closed */
    setvbuf(out, write_buf, _IOFBF, WRITE_BUFFER_SIZE);

    /* buf_size is counted in bits; 8 spare bytes are kept at the end */
    if (!(buf = calloc(1, buf_size / 8 + 8))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    if (!(prev_items = malloc(num_fields * sizeof(tdb_item)))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    if (!(grams = malloc(num_fields * 16))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    if (!(toc = malloc((num_trails + 1) * 8))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }

    rewind(grouped);
    if (num_events)
        TDB_READ(grouped, &ev, sizeof(struct tdb_grouped_event));

    /* `i` counts the events read so far; `ev` always holds the next
       event that has not been encoded yet */
    while (i <= num_events){
        /* encode trail for one UUID (multiple events) */

        /* reserve 3 bits in the head of the trail for a length residual:
           Length of a trail is measured in bytes but the last byte may
           be short. The residual indicates how many bits in the end we
           should ignore. */
        uint64_t offs = 3;
        uint64_t trail_id = ev.trail_id;
        uint64_t n, m, trail_size;

        toc[trail_id] = file_offs;
        /* edge encoding is relative to the previous event of the same
           trail, so every trail starts from an all-zero baseline */
        memset(prev_items, 0, num_fields * sizeof(tdb_item));

        while (ev.trail_id == trail_id){

            /* 1) produce an edge-encoded set of items for this event */
            if ((ret = edge_encode_items(items,
                                         &encoded,
                                         &n,
                                         &encoded_size,
                                         prev_items,
                                         &ev)))
                goto done;

            /* 2) cover the encoded set with a set of unigrams and bigrams */
            if ((ret = choose_grams_one_event(encoded,
                                              n,
                                              gram_freqs,
                                              &gbufs,
                                              grams,
                                              &m,
                                              &ev)))
                goto done;

            /* make sure the bit buffer can take the worst-case encoding of
               this event; grow it into a fresh zeroed buffer if not */
            uint64_t bits_needed = offs + huff_encoded_max_bits(m) + 64;
            if (bits_needed > buf_size){
                char *new_buf;
                buf_size = bits_needed * 2;
                if (!(new_buf = calloc(1, buf_size / 8 + 8))){
                    ret = TDB_ERR_NOMEM;
                    goto done;
                }
                /* carry over the bytes written so far, including the
                   partially filled last one */
                memcpy(new_buf, buf, offs / 8 + 1);
                free(buf);
                buf = new_buf;
            }

            /* 3) huffman-encode grams */
            huff_encode_grams(codemap,
                              grams,
                              m,
                              buf,
                              &offs,
                              fstats);

            if (i++ < num_events){
                TDB_READ(grouped, &ev, sizeof(struct tdb_grouped_event));
            }else
                break;
        }

        /* write the length residual */
        if (offs & 7){
            trail_size = offs / 8 + 1;
            write_bits(buf, 0, 8 - (uint32_t)(offs & 7LLU));
        }else{
            trail_size = offs / 8;
        }

        /* append trail to the end of file */
        TDB_WRITE(out, buf, trail_size);

        file_offs += trail_size;
        /* the next trail needs a zeroed buffer again */
        memset(buf, 0, trail_size);

    }
    /* keep the redundant last offset in the TOC, so we can determine
       trail length with toc[i + 1] - toc[i]. */
    toc[num_trails] = file_offs;

    /* write an extra 8 null bytes: huffman may require up to 7 when reading */
    uint64_t zero = 0;
    TDB_WRITE(out, &zero, 8);
    file_offs += 8;

    /* fclose() disassociates the stream whether or not it succeeds, so
       forget the handle before closing it: if the close fails and control
       reaches the cleanup code, `out` must not be closed a second time */
    FILE *trails_out = out;
    out = NULL;
    TDB_CLOSE(trails_out);

    TDB_OPEN(out, toc_path, "w");
    size_t offs_size = file_offs < UINT32_MAX ? 4 : 8;
    for (i = 0; i < num_trails + 1; i++)
        TDB_WRITE(out, &toc[i], offs_size);

done:
    /* Release everything that does not depend on the stream first, so that
       nothing of it is leaked should TDB_CLOSE_FINAL leave the function
       early on a close error. */
    free_gram_bufs(&gbufs);
    free(grams);
    free(encoded);
    free(prev_items);
    free(buf);
    free(toc);

    TDB_CLOSE_FINAL(out);

    /* write_buf may still back the stream until it has been closed */
    free(write_buf);

    return ret;
}