/*
 * Finalize a constructor.
 *
 * Flushes and closes the item store, and, when a temporary item file exists
 * (cons->tempfile is non-empty), writes lexicons, UUIDs and the version and
 * then encodes the trails. The temporary item file is unlinked on every
 * path. When built with HAVE_ARCHIVE_H and the output format is
 * TDB_OPT_CONS_OUTPUT_FORMAT_PACKAGE, a successful run ends with
 * cons_package().
 *
 * Returns 0 on success. Otherwise returns TDB_ERR_IO_CLOSE, TDB_ERR_IO_READ
 * or the first non-zero code returned by one of the helpers called here.
 */
TDB_EXPORT tdb_error tdb_cons_finalize(tdb_cons *cons)
{
    struct tdb_file item_map;
    const uint64_t event_count = cons->events.next;
    int ret = 0;

    memset(&item_map, 0, sizeof(item_map));

    /* finalize event items */
    ret = arena_flush(&cons->items);
    if (ret)
        goto done;

    if (cons->items.fd) {
        const int close_rc = fclose(cons->items.fd);

        /* the handle is dropped whether or not the close succeeded */
        cons->items.fd = NULL;
        if (close_rc) {
            ret = TDB_ERR_IO_CLOSE;
            goto done;
        }
    }

    if (cons->tempfile[0]) {
        /* the items are only mapped when there are events and fields */
        if (event_count && cons->num_ofields &&
            file_mmap(cons->tempfile, NULL, &item_map, NULL)) {
            ret = TDB_ERR_IO_READ;
            goto done;
        }

        TDB_TIMER_DEF

        TDB_TIMER_START
        ret = store_lexicons(cons);
        if (ret)
            goto done;
        TDB_TIMER_END("encoder/store_lexicons")

        TDB_TIMER_START
        ret = store_uuids(cons);
        if (ret)
            goto done;
        TDB_TIMER_END("encoder/store_uuids")

        TDB_TIMER_START
        ret = store_version(cons);
        if (ret)
            goto done;
        TDB_TIMER_END("encoder/store_version")

        TDB_TIMER_START
        ret = tdb_encode(cons, (const tdb_item*)item_map.data);
        if (ret)
            goto done;
        TDB_TIMER_END("encoder/encode")
    }

done:
    if (item_map.ptr)
        munmap(item_map.ptr, item_map.mmap_size);

    if (cons->tempfile[0])
        unlink(cons->tempfile);

#ifdef HAVE_ARCHIVE_H
    /* packaging is attempted only after everything above succeeded */
    if (!ret && cons->output_format == TDB_OPT_CONS_OUTPUT_FORMAT_PACKAGE)
        ret = cons_package(cons);
#endif

    return ret;
}
tdb_error tdb_encode(tdb_cons *cons, const tdb_item *items) { char path[TDB_MAX_PATH_SIZE]; char grouped_path[TDB_MAX_PATH_SIZE]; char toc_path[TDB_MAX_PATH_SIZE]; char *root = cons->root; char *read_buf = NULL; struct field_stats *fstats = NULL; uint64_t num_trails = 0; uint64_t num_events = cons->events.next; uint64_t num_fields = cons->num_ofields + 1; uint64_t max_timestamp = 0; uint64_t max_timedelta = 0; uint64_t *field_cardinalities = NULL; uint64_t i; Pvoid_t unigram_freqs = NULL; struct judy_128_map gram_freqs; struct judy_128_map codemap; Word_t tmp; FILE *grouped_w = NULL; FILE *grouped_r = NULL; int fd, ret = 0; TDB_TIMER_DEF j128m_init(&gram_freqs); j128m_init(&codemap); if (!(field_cardinalities = calloc(cons->num_ofields, 8))){ ret = TDB_ERR_NOMEM; goto done; } for (i = 0; i < cons->num_ofields; i++) field_cardinalities[i] = jsm_num_keys(&cons->lexicons[i]); /* 1. group events by trail, sort events of each trail by time, and delta-encode timestamps */ TDB_TIMER_START TDB_PATH(grouped_path, "%s/tmp.grouped.XXXXXX", root); if ((fd = mkstemp(grouped_path)) == -1){ ret = TDB_ERR_IO_OPEN; goto done; } if (!(grouped_w = fdopen(fd, "w"))){ ret = TDB_ERR_IO_OPEN; goto done; } if (cons->events.data) if ((ret = groupby_uuid(grouped_w, (struct tdb_cons_event*)cons->events.data, cons, &num_trails, &max_timestamp, &max_timedelta))) goto done; /* not the most clean separation of ownership here, but these objects can be huge so keeping them around unecessarily is expensive */ free(cons->events.data); cons->events.data = NULL; j128m_free(&cons->trails); TDB_CLOSE(grouped_w); grouped_w = NULL; TDB_OPEN(grouped_r, grouped_path, "r"); if (!(read_buf = malloc(READ_BUFFER_SIZE))){ ret = TDB_ERR_NOMEM; goto done; } setvbuf(grouped_r, read_buf, _IOFBF, READ_BUFFER_SIZE); TDB_TIMER_END("trail/groupby_uuid"); /* 2. 
store metatadata */ TDB_TIMER_START TDB_PATH(path, "%s/info", root); if ((ret = store_info(path, num_trails, num_events, cons->min_timestamp, max_timestamp, max_timedelta))) goto done; TDB_TIMER_END("trail/info"); /* 3. collect value (unigram) freqs, including delta-encoded timestamps */ TDB_TIMER_START unigram_freqs = collect_unigrams(grouped_r, num_events, items, num_fields); if (num_events > 0 && !unigram_freqs){ ret = TDB_ERR_NOMEM; goto done; } TDB_TIMER_END("trail/collect_unigrams"); /* 4. construct uni/bi-grams */ tdb_opt_value dont_build_bigrams; tdb_cons_get_opt(cons, TDB_OPT_CONS_NO_BIGRAMS, &dont_build_bigrams); TDB_TIMER_START if ((ret = make_grams(grouped_r, num_events, items, num_fields, unigram_freqs, &gram_freqs, dont_build_bigrams.value))) goto done; TDB_TIMER_END("trail/gram_freqs"); /* 5. build a huffman codebook and stats struct for encoding grams */ TDB_TIMER_START if ((ret = huff_create_codemap(&gram_freqs, &codemap))) goto done; if (!(fstats = huff_field_stats(field_cardinalities, num_fields, max_timedelta))){ ret = TDB_ERR_NOMEM; goto done; } TDB_TIMER_END("trail/huff_create_codemap"); /* 6. encode and write trails to disk */ TDB_TIMER_START TDB_PATH(path, "%s/trails.data", root); TDB_PATH(toc_path, "%s/trails.toc", root); if ((ret = encode_trails(items, grouped_r, num_events, num_trails, num_fields, &codemap, &gram_freqs, fstats, path, toc_path))) goto done; TDB_TIMER_END("trail/encode_trails"); /* 7. 
write huffman codebook to disk */ TDB_TIMER_START tdb_path(path, "%s/trails.codebook", root); if ((ret = store_codebook(&codemap, path))) goto done; TDB_TIMER_END("trail/store_codebook"); done: TDB_CLOSE_FINAL(grouped_w); TDB_CLOSE_FINAL(grouped_r); j128m_free(&gram_freqs); j128m_free(&codemap); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wsign-compare" JLFA(tmp, unigram_freqs); #pragma GCC diagnostic pop unlink(grouped_path); free(field_cardinalities); free(read_buf); free(fstats); return ret; out_of_memory: return TDB_ERR_NOMEM; }