/* Open a constructor handle for a new TrailDB rooted at `root`.
 *
 * `ofield_names` lists the `num_ofields` ordered field names (the implicit
 * timestamp field is not included). Field names are validated and deep-copied
 * into the handle; the lexicon maps and the temporary items file are set up
 * here as well.
 *
 * Returns 0 on success or a tdb_error code. On failure, partially
 * allocated state is left in `cons` — presumably tdb_cons_close() is
 * responsible for releasing it (TODO confirm against the close path).
 */
TDB_EXPORT tdb_error tdb_cons_open(tdb_cons *cons,
                                   const char *root,
                                   const char **ofield_names,
                                   uint64_t num_ofields)
{
    tdb_field i;
    int fd;
    int ret = 0;

    /* by handling the "cons == NULL" case here gracefully, we allow the
       return value of tdb_cons_init() to be used unchecked like here:

       int err;
       tdb_cons *cons = tdb_cons_init();
       if ((err = tdb_cons_open(cons, path, fields, num_fields)))
           printf("Opening cons failed: %s", tdb_error(err));
    */
    if (!cons)
        return TDB_ERR_HANDLE_IS_NULL;

    /* a nonzero item_size is the marker that open already ran on this handle */
    if (cons->events.item_size)
        return TDB_ERR_HANDLE_ALREADY_OPENED;

    if (num_ofields > TDB_MAX_NUM_FIELDS)
        return TDB_ERR_TOO_MANY_FIELDS;

    if ((ret = find_duplicate_fieldnames(ofield_names, num_ofields)))
        goto done;

    /* FIX: guard num_ofields == 0. calloc(0, ...) may legally return NULL,
       which the old code misreported as out-of-memory. The lexicons
       allocation below already uses this guard; now both agree. */
    if (num_ofields){
        if (!(cons->ofield_names = calloc(num_ofields, sizeof(char*))))
            return TDB_ERR_NOMEM;
    }

    for (i = 0; i < num_ofields; i++){
        if (is_fieldname_invalid(ofield_names[i])){
            ret = TDB_ERR_INVALID_FIELDNAME;
            goto done;
        }
        /* deep-copy: caller keeps ownership of ofield_names */
        if (!(cons->ofield_names[i] = strdup(ofield_names[i]))){
            ret = TDB_ERR_NOMEM;
            goto done;
        }
    }

    j128m_init(&cons->trails);

    if (!(cons->root = strdup(root))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }

    /* sentinel: first event added will lower this */
    cons->min_timestamp = UINT64_MAX;
    cons->num_ofields = num_ofields;
    cons->events.arena_increment = EVENTS_ARENA_INCREMENT;
    cons->events.item_size = sizeof(struct tdb_cons_event);
    cons->items.item_size = sizeof(tdb_item);

    /* Opportunistically try to create the output directory. We don't
       care if it fails, e.g. because it already exists */
    mkdir(root, 0755);

    TDB_PATH(cons->tempfile, "%s/tmp.items.XXXXXX", root);
    if ((fd = mkstemp(cons->tempfile)) == -1){
        ret = TDB_ERR_IO_OPEN;
        goto done;
    }
    if (!(cons->items.fd = fdopen(fd, "w"))){
        /* FIX: the old code leaked the descriptor from mkstemp here */
        close(fd);
        ret = TDB_ERR_IO_OPEN;
        goto done;
    }

    /* one lexicon (string -> id map) per configured field */
    if (cons->num_ofields > 0)
        if (!(cons->lexicons = calloc(cons->num_ofields,
                                      sizeof(struct judy_str_map)))){
            ret = TDB_ERR_NOMEM;
            goto done;
        }

    for (i = 0; i < cons->num_ofields; i++)
        if (jsm_init(&cons->lexicons[i])){
            ret = TDB_ERR_NOMEM;
            goto done;
        }
done:
    return ret;
}
tdb_error tdb_encode(tdb_cons *cons, const tdb_item *items) { char path[TDB_MAX_PATH_SIZE]; char grouped_path[TDB_MAX_PATH_SIZE]; char toc_path[TDB_MAX_PATH_SIZE]; char *root = cons->root; char *read_buf = NULL; struct field_stats *fstats = NULL; uint64_t num_trails = 0; uint64_t num_events = cons->events.next; uint64_t num_fields = cons->num_ofields + 1; uint64_t max_timestamp = 0; uint64_t max_timedelta = 0; uint64_t *field_cardinalities = NULL; uint64_t i; Pvoid_t unigram_freqs = NULL; struct judy_128_map gram_freqs; struct judy_128_map codemap; Word_t tmp; FILE *grouped_w = NULL; FILE *grouped_r = NULL; int fd, ret = 0; TDB_TIMER_DEF j128m_init(&gram_freqs); j128m_init(&codemap); if (!(field_cardinalities = calloc(cons->num_ofields, 8))){ ret = TDB_ERR_NOMEM; goto done; } for (i = 0; i < cons->num_ofields; i++) field_cardinalities[i] = jsm_num_keys(&cons->lexicons[i]); /* 1. group events by trail, sort events of each trail by time, and delta-encode timestamps */ TDB_TIMER_START TDB_PATH(grouped_path, "%s/tmp.grouped.XXXXXX", root); if ((fd = mkstemp(grouped_path)) == -1){ ret = TDB_ERR_IO_OPEN; goto done; } if (!(grouped_w = fdopen(fd, "w"))){ ret = TDB_ERR_IO_OPEN; goto done; } if (cons->events.data) if ((ret = groupby_uuid(grouped_w, (struct tdb_cons_event*)cons->events.data, cons, &num_trails, &max_timestamp, &max_timedelta))) goto done; /* not the most clean separation of ownership here, but these objects can be huge so keeping them around unecessarily is expensive */ free(cons->events.data); cons->events.data = NULL; j128m_free(&cons->trails); TDB_CLOSE(grouped_w); grouped_w = NULL; TDB_OPEN(grouped_r, grouped_path, "r"); if (!(read_buf = malloc(READ_BUFFER_SIZE))){ ret = TDB_ERR_NOMEM; goto done; } setvbuf(grouped_r, read_buf, _IOFBF, READ_BUFFER_SIZE); TDB_TIMER_END("trail/groupby_uuid"); /* 2. 
store metatadata */ TDB_TIMER_START TDB_PATH(path, "%s/info", root); if ((ret = store_info(path, num_trails, num_events, cons->min_timestamp, max_timestamp, max_timedelta))) goto done; TDB_TIMER_END("trail/info"); /* 3. collect value (unigram) freqs, including delta-encoded timestamps */ TDB_TIMER_START unigram_freqs = collect_unigrams(grouped_r, num_events, items, num_fields); if (num_events > 0 && !unigram_freqs){ ret = TDB_ERR_NOMEM; goto done; } TDB_TIMER_END("trail/collect_unigrams"); /* 4. construct uni/bi-grams */ tdb_opt_value dont_build_bigrams; tdb_cons_get_opt(cons, TDB_OPT_CONS_NO_BIGRAMS, &dont_build_bigrams); TDB_TIMER_START if ((ret = make_grams(grouped_r, num_events, items, num_fields, unigram_freqs, &gram_freqs, dont_build_bigrams.value))) goto done; TDB_TIMER_END("trail/gram_freqs"); /* 5. build a huffman codebook and stats struct for encoding grams */ TDB_TIMER_START if ((ret = huff_create_codemap(&gram_freqs, &codemap))) goto done; if (!(fstats = huff_field_stats(field_cardinalities, num_fields, max_timedelta))){ ret = TDB_ERR_NOMEM; goto done; } TDB_TIMER_END("trail/huff_create_codemap"); /* 6. encode and write trails to disk */ TDB_TIMER_START TDB_PATH(path, "%s/trails.data", root); TDB_PATH(toc_path, "%s/trails.toc", root); if ((ret = encode_trails(items, grouped_r, num_events, num_trails, num_fields, &codemap, &gram_freqs, fstats, path, toc_path))) goto done; TDB_TIMER_END("trail/encode_trails"); /* 7. 
write huffman codebook to disk */ TDB_TIMER_START tdb_path(path, "%s/trails.codebook", root); if ((ret = store_codebook(&codemap, path))) goto done; TDB_TIMER_END("trail/store_codebook"); done: TDB_CLOSE_FINAL(grouped_w); TDB_CLOSE_FINAL(grouped_r); j128m_free(&gram_freqs); j128m_free(&codemap); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wsign-compare" JLFA(tmp, unigram_freqs); #pragma GCC diagnostic pop unlink(grouped_path); free(field_cardinalities); free(read_buf); free(fstats); return ret; out_of_memory: return TDB_ERR_NOMEM; }