Example #1
0
/*
 * Persist the field list and the per-field lexicons of `cons`.
 *
 * The name of every output field is appended, one per line, to
 * "<root>/fields", and the lexicon of that field is stored through
 * lexicon_store() in "<root>/lexicon.<field name>". A final empty line
 * terminates the "fields" file.
 *
 * Returns 0 on success, or the first non-zero code reported by
 * lexicon_store(). The TDB_* I/O macros are defined elsewhere; they
 * presumably report failures through `ret` and the `done` label, which is
 * why both must keep these exact names (confirm against the macro
 * definitions).
 */
static tdb_error store_lexicons(tdb_cons *cons)
{
    char target[TDB_MAX_PATH_SIZE];
    FILE *fields_file = NULL;
    tdb_field field = 0;
    int ret = 0;

    TDB_PATH(target, "%s/fields", cons->root);
    TDB_OPEN(fields_file, target, "w");

    while (field < cons->num_ofields){
        /* the same buffer is reused for each lexicon file name */
        TDB_PATH(target, "%s/lexicon.%s", cons->root, cons->ofield_names[field]);

        ret = lexicon_store(&cons->lexicons[field], target);
        if (ret)
            goto done;

        /* a field is listed only after its lexicon was stored */
        TDB_FPRINTF(fields_file, "%s\n", cons->ofield_names[field]);
        field++;
    }

    TDB_FPRINTF(fields_file, "\n");

done:
    TDB_CLOSE_FINAL(fields_file);
    return ret;
}
Example #2
0
/*
 * Write TDB_VERSION_LATEST, formatted with "%llu", to "<root>/version".
 *
 * Returns the value left in `ret`: 0 unless one of the TDB_* I/O macros
 * (defined elsewhere, presumably updating `ret` and jumping to `done` on
 * failure) reported an error.
 */
static tdb_error store_version(tdb_cons *cons)
{
    char version_path[TDB_MAX_PATH_SIZE];
    FILE *version_file = NULL;
    int ret = 0;

    TDB_PATH(version_path, "%s/version", cons->root);
    TDB_OPEN(version_file, version_path, "w");
    TDB_FPRINTF(version_file, "%llu", TDB_VERSION_LATEST);

done:
    TDB_CLOSE_FINAL(version_file);
    return ret;
}
Example #3
0
/*
 * Write the keys of `cons->trails` to "<root>/uuids".
 *
 * The file is first sized to 16 bytes per trail, then j128m_fold() invokes
 * store_uuids_fun for the map with a fold state that carries the output
 * stream and an error code.
 *
 * Returns TDB_ERR_TOO_MANY_TRAILS when the map holds more than
 * TDB_MAX_NUM_TRAILS keys, otherwise the error code left in the fold state
 * (0 on success) or whatever the TDB_* I/O macros stored in `ret`.
 */
static tdb_error store_uuids(tdb_cons *cons)
{
    struct jm_fold_state fold = {.ret = 0};
    char uuids_path[TDB_MAX_PATH_SIZE];
    const uint64_t trail_count = j128m_num_keys(&cons->trails);
    int ret = 0;

    /*
     * The limit exists because of the truncate below: with
     * TDB_MAX_NUM_TRAILS < 2^59, (2^59 - 1) * 16 still fits in
     * LONG_MAX (off_t).
     */
    if (trail_count > TDB_MAX_NUM_TRAILS)
        return TDB_ERR_TOO_MANY_TRAILS;

    TDB_PATH(uuids_path, "%s/uuids", cons->root);
    TDB_OPEN(fold.out, uuids_path, "w");
    TDB_TRUNCATE(fold.out, ((off_t)(trail_count * 16)));

    j128m_fold(&cons->trails, store_uuids_fun, &fold);

    /* surface any error recorded while folding */
    ret = fold.ret;

done:
    TDB_CLOSE_FINAL(fold.out);
    return ret;
}

/*
 * Decide whether `field` must be rejected as a field name.
 *
 * Returns 1 (invalid) when `field`
 *   - is NULL,
 *   - is exactly "time",
 *   - is the empty string,
 *   - contains a character that does not occur in TDB_FIELDNAME_CHARS, or
 *   - has TDB_MAX_FIELDNAME_LENGTH or more characters.
 * Returns 0 when the name passes all checks.
 */
int is_fieldname_invalid(const char* field)
{
    uint64_t i;

    /* strcmp() and the scan below must never see a NULL pointer */
    if (!field)
        return 1;

    if (!strcmp(field, "time"))
        return 1;

    /*
     * strchr() is the ISO C equivalent of the legacy BSD index(); field[i]
     * is never '\0' inside the loop, so the terminator of
     * TDB_FIELDNAME_CHARS cannot produce a false match.
     */
    for (i = 0; i < TDB_MAX_FIELDNAME_LENGTH && field[i]; i++)
        if (!strchr(TDB_FIELDNAME_CHARS, field[i]))
            return 1;

    /* i == 0: empty name; i == limit: the scan stopped at the length cap */
    if (i == 0 || i == TDB_MAX_FIELDNAME_LENGTH)
        return 1;

    return 0;
}
Example #4
0
/*
 * Prepare the constructor handle `cons` for writing a new database at
 * `root` with the `num_ofields` output fields named in `ofield_names`.
 *
 * The function copies the field names and the root path into `cons`,
 * initializes the trail map and the event/item arenas, tries to create the
 * output directory, opens a temporary items file "<root>/tmp.items.XXXXXX"
 * and initializes one lexicon per field.
 *
 * Returns 0 on success, otherwise one of:
 *   TDB_ERR_HANDLE_IS_NULL         `cons` is NULL
 *   TDB_ERR_HANDLE_ALREADY_OPENED  `cons->events.item_size` is already set
 *   TDB_ERR_TOO_MANY_FIELDS        `num_ofields` exceeds TDB_MAX_NUM_FIELDS
 *   TDB_ERR_INVALID_FIELDNAME      a name fails is_fieldname_invalid()
 *   TDB_ERR_NOMEM                  an allocation failed
 *   TDB_ERR_IO_OPEN                the temporary items file could not be opened
 * or the non-zero code returned by find_duplicate_fieldnames().
 *
 * On failure, anything already stored in `cons` is left in place; the
 * caller is presumably expected to release it by closing the handle
 * (confirm against the close routine).
 */
TDB_EXPORT tdb_error tdb_cons_open(tdb_cons *cons,
                                   const char *root,
                                   const char **ofield_names,
                                   uint64_t num_ofields)
{
    tdb_field i;
    int fd;
    int ret = 0;

    /*
    by handling the "cons == NULL" case here gracefully, we allow the return
    value of tdb_cons_init() to be used unchecked like here:

    int err;
    tdb_cons *cons = tdb_cons_init();
    if ((err = tdb_cons_open(cons, path, fields, num_fields)))
        printf("Opening cons failed: %s", tdb_error(err));
    */
    if (!cons)
        return TDB_ERR_HANDLE_IS_NULL;

    /* a non-zero item size marks a handle that was opened before */
    if (cons->events.item_size)
        return TDB_ERR_HANDLE_ALREADY_OPENED;

    if (num_ofields > TDB_MAX_NUM_FIELDS)
        return TDB_ERR_TOO_MANY_FIELDS;

    if ((ret = find_duplicate_fieldnames(ofield_names, num_ofields)))
        goto done;

    /*
    calloc() may return NULL for a zero-sized request, which would be
    misreported as out-of-memory when no fields are given. Always request
    at least one slot so that NULL can only mean a real allocation failure.
    */
    if (!(cons->ofield_names = calloc(num_ofields ? num_ofields : 1,
                                      sizeof(char*))))
        return TDB_ERR_NOMEM;

    for (i = 0; i < num_ofields; i++){
        if (is_fieldname_invalid(ofield_names[i])){
            ret = TDB_ERR_INVALID_FIELDNAME;
            goto done;
        }
        if (!(cons->ofield_names[i] = strdup(ofield_names[i]))){
            ret = TDB_ERR_NOMEM;
            goto done;
        }
    }

    j128m_init(&cons->trails);

    if (!(cons->root = strdup(root))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }

    cons->min_timestamp = UINT64_MAX;
    cons->num_ofields = num_ofields;
    cons->events.arena_increment = EVENTS_ARENA_INCREMENT;
    cons->events.item_size = sizeof(struct tdb_cons_event);
    cons->items.item_size = sizeof(tdb_item);

    /* Opportunistically try to create the output directory.
       We don't care if it fails, e.g. because it already exists */
    mkdir(root, 0755);
    TDB_PATH(cons->tempfile, "%s/tmp.items.XXXXXX", root);
    if ((fd = mkstemp(cons->tempfile)) == -1){
        ret = TDB_ERR_IO_OPEN;
        goto done;
    }

    /* NOTE(review): if fdopen() fails, `fd` is not closed on this path */
    if (!(cons->items.fd = fdopen(fd, "w"))){
        ret = TDB_ERR_IO_OPEN;
        goto done;
    }

    /* no lexicon array is allocated when there are no fields */
    if (cons->num_ofields > 0)
        if (!(cons->lexicons = calloc(cons->num_ofields,
                                      sizeof(struct judy_str_map)))){
            ret = TDB_ERR_NOMEM;
            goto done;
        }

    for (i = 0; i < cons->num_ofields; i++)
        if (jsm_init(&cons->lexicons[i])){
            ret = TDB_ERR_NOMEM;
            goto done;
        }

done:
    return ret;
}
Example #5
0
/*
 * Encode the events collected in `cons` and write the results under
 * `cons->root`.
 *
 * Steps, in order:
 *   1. group events by trail into a temporary file
 *      "<root>/tmp.grouped.XXXXXX" (groupby_uuid)
 *   2. write "<root>/info" (store_info)
 *   3. collect unigram frequencies (collect_unigrams)
 *   4. build uni/bi-gram frequencies (make_grams)
 *   5. build the huffman codemap and field stats
 *   6. write "<root>/trails.data" and "<root>/trails.toc" (encode_trails)
 *   7. write "<root>/trails.codebook" (store_codebook)
 *
 * Side effects on `cons`: `cons->events.data` is freed and set to NULL and
 * `cons->trails` is freed after step 1. The temporary grouped file is
 * removed before returning, on success and on failure.
 *
 * Returns 0 on success, TDB_ERR_NOMEM / TDB_ERR_IO_OPEN for local
 * allocation and open failures, or the first non-zero code reported by a
 * helper. The TDB_* I/O macros are defined elsewhere and presumably report
 * failures through `ret` and the `done` label.
 */
tdb_error tdb_encode(tdb_cons *cons, const tdb_item *items)
{
    char path[TDB_MAX_PATH_SIZE];
    char grouped_path[TDB_MAX_PATH_SIZE];
    char toc_path[TDB_MAX_PATH_SIZE];
    char *root = cons->root;
    char *read_buf = NULL;
    struct field_stats *fstats = NULL;
    uint64_t num_trails = 0;
    uint64_t num_events = cons->events.next;
    uint64_t num_fields = cons->num_ofields + 1;
    uint64_t max_timestamp = 0;
    uint64_t max_timedelta = 0;
    uint64_t *field_cardinalities = NULL;
    uint64_t i;
    Pvoid_t unigram_freqs = NULL;
    struct judy_128_map gram_freqs;
    struct judy_128_map codemap;
    Word_t tmp;
    FILE *grouped_w = NULL;
    FILE *grouped_r = NULL;
    /* set once mkstemp() has created the grouped file, so that cleanup
       never unlinks an unset or half-built path */
    int grouped_created = 0;
    int fd, ret = 0;
    TDB_TIMER_DEF

    j128m_init(&gram_freqs);
    j128m_init(&codemap);

    /* num_fields is num_ofields + 1, so the request is never zero-sized
       (a zero-sized calloc may return NULL and look like out-of-memory) */
    if (!(field_cardinalities = calloc(num_fields,
                                       sizeof(*field_cardinalities)))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }

    for (i = 0; i < cons->num_ofields; i++)
        field_cardinalities[i] = jsm_num_keys(&cons->lexicons[i]);

    /* 1. group events by trail, sort events of each trail by time,
          and delta-encode timestamps */
    TDB_TIMER_START

    TDB_PATH(grouped_path, "%s/tmp.grouped.XXXXXX", root);
    if ((fd = mkstemp(grouped_path)) == -1){
        ret = TDB_ERR_IO_OPEN;
        goto done;
    }
    grouped_created = 1;

    if (!(grouped_w = fdopen(fd, "w"))){
        /* fdopen() failed, so no stream owns the descriptor: close it here */
        close(fd);
        ret = TDB_ERR_IO_OPEN;
        goto done;
    }

    if (cons->events.data)
        if ((ret = groupby_uuid(grouped_w,
                                (struct tdb_cons_event*)cons->events.data,
                                cons,
                                &num_trails,
                                &max_timestamp,
                                &max_timedelta)))
            goto done;

    /*
    not the cleanest separation of ownership here, but these objects
    can be huge so keeping them around unnecessarily is expensive
    */
    free(cons->events.data);
    cons->events.data = NULL;
    j128m_free(&cons->trails);

    TDB_CLOSE(grouped_w);
    grouped_w = NULL;

    TDB_OPEN(grouped_r, grouped_path, "r");
    if (!(read_buf = malloc(READ_BUFFER_SIZE))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }

    setvbuf(grouped_r, read_buf, _IOFBF, READ_BUFFER_SIZE);
    TDB_TIMER_END("trail/groupby_uuid");

    /* 2. store metadata */
    TDB_TIMER_START
    TDB_PATH(path, "%s/info", root);
    if ((ret = store_info(path,
                          num_trails,
                          num_events,
                          cons->min_timestamp,
                          max_timestamp,
                          max_timedelta)))
        goto done;
    TDB_TIMER_END("trail/info");

    /* 3. collect value (unigram) freqs, including delta-encoded timestamps */
    TDB_TIMER_START
    unigram_freqs = collect_unigrams(grouped_r, num_events, items, num_fields);
    /* an empty result is only an error when there were events to count */
    if (num_events > 0 && !unigram_freqs){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    TDB_TIMER_END("trail/collect_unigrams");

    /* 4. construct uni/bi-grams */
    /* NOTE(review): the result of tdb_cons_get_opt() is not checked, so
       dont_build_bigrams is used as-is even if the lookup failed */
    tdb_opt_value dont_build_bigrams;
    tdb_cons_get_opt(cons, TDB_OPT_CONS_NO_BIGRAMS, &dont_build_bigrams);

    TDB_TIMER_START
    if ((ret = make_grams(grouped_r,
                          num_events,
                          items,
                          num_fields,
                          unigram_freqs,
                          &gram_freqs,
                          dont_build_bigrams.value)))
        goto done;
    TDB_TIMER_END("trail/gram_freqs");

    /* 5. build a huffman codebook and stats struct for encoding grams */
    TDB_TIMER_START
    if ((ret = huff_create_codemap(&gram_freqs, &codemap)))
        goto done;
    if (!(fstats = huff_field_stats(field_cardinalities,
                                    num_fields,
                                    max_timedelta))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    TDB_TIMER_END("trail/huff_create_codemap");

    /* 6. encode and write trails to disk */
    TDB_TIMER_START
    TDB_PATH(path, "%s/trails.data", root);
    TDB_PATH(toc_path, "%s/trails.toc", root);
    if ((ret = encode_trails(items,
                             grouped_r,
                             num_events,
                             num_trails,
                             num_fields,
                             &codemap,
                             &gram_freqs,
                             fstats,
                             path,
                             toc_path)))
        goto done;
    TDB_TIMER_END("trail/encode_trails");

    /* 7. write huffman codebook to disk */
    TDB_TIMER_START
    /* use the checked macro, like every other path built in this function */
    TDB_PATH(path, "%s/trails.codebook", root);
    if ((ret = store_codebook(&codemap, path)))
        goto done;
    TDB_TIMER_END("trail/store_codebook");

done:
    TDB_CLOSE_FINAL(grouped_w);
    TDB_CLOSE_FINAL(grouped_r);
    j128m_free(&gram_freqs);
    j128m_free(&codemap);
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsign-compare"
    JLFA(tmp, unigram_freqs);
#pragma GCC diagnostic pop

    /* remove the temporary grouped file, but only if it was created */
    if (grouped_created)
        unlink(grouped_path);

    free(field_cardinalities);
    free(read_buf);
    free(fstats);

    return ret;
}