static tdb_error lexicon_store(const struct judy_str_map *lexicon, const char *path) { /* Lexicon format: [ number of values N ] 4 or 8 bytes [ value offsets ... ] N * (4 or 8 bytes) [ last value offset ] 4 or 8 bytes [ values ... ] X bytes */ struct jm_fold_state state; uint64_t count = jsm_num_keys(lexicon); uint64_t size = (count + 2) * 4 + jsm_values_size(lexicon); int ret = 0; state.offset = (count + 2) * 4; state.width = 4; if (size > UINT32_MAX){ size = (count + 2) * 8 + jsm_values_size(lexicon); state.offset = (count + 2) * 8; state.width = 8; } if (size > TDB_MAX_LEXICON_SIZE) return TDB_ERR_LEXICON_TOO_LARGE; state.out = NULL; state.ret = 0; TDB_OPEN(state.out, path, "w"); TDB_TRUNCATE(state.out, (off_t)size); TDB_WRITE(state.out, &count, state.width); jsm_fold(lexicon, lexicon_store_fun, &state); if ((ret = state.ret)) goto done; TDB_SEEK(state.out, (count + 1) * state.width); TDB_WRITE(state.out, &state.offset, state.width); done: TDB_CLOSE_FINAL(state.out); return ret; }
/*
 * Fold callback used when storing a lexicon: writes one value.
 *
 * `state` is a struct jm_fold_state. The current write offset is stored in
 * header slot `id`, then the `len` bytes of `value` are written at that
 * offset. Once an earlier call has recorded an error in the state, later
 * calls do nothing.
 *
 * Always returns `state`.
 */
static void *lexicon_store_fun(uint64_t id,
                               const char *value,
                               uint64_t len,
                               void *state)
{
    struct jm_fold_state *fold = state;
    int ret = 0;

    /* a previous value failed: skip the rest of the fold */
    if (fold->ret != 0)
        return state;

    /*
    NOTE: ids are expected to start at 1, so slot `id` can be addressed
    directly; with 0-based ids we would need to add 1 here.
    */
    TDB_SEEK(fold->out, id * fold->width);
    TDB_WRITE(fold->out, &fold->offset, fold->width);

    /* the value itself goes to the data area */
    TDB_SEEK(fold->out, fold->offset);
    TDB_WRITE(fold->out, value, len);

done:
    /* the offset is advanced and the status recorded on every path */
    fold->offset += len;
    fold->ret = ret;
    return state;
}
static tdb_error store_codebook(const struct judy_128_map *codemap, const char *path) { FILE *out = NULL; uint32_t size; struct huff_codebook *book = huff_create_codebook(codemap, &size); int ret = 0; TDB_OPEN(out, path, "w"); TDB_WRITE(out, book, size); done: TDB_CLOSE_FINAL(out); free(book); return ret; }
/*
 * Fold callback: collect, sort and write out the events of one trail.
 *
 * `value` holds the 1-based index (into s->events) of the trail's last
 * event; earlier events are reached through `prev_event_idx` back-links,
 * where 0 terminates the chain. The events are copied into the reusable
 * buffer s->buf, sorted, their timestamps are replaced by delta-encoded
 * items, and the result is written to s->grouped_w. s->trail_id is
 * incremented after a successful write.
 *
 * `state` is a struct jm_fold_state. Errors are reported through s->ret:
 * TDB_ERR_NOMEM, TDB_ERR_TRAIL_TOO_LONG, TDB_ERR_TIMESTAMP_TOO_LARGE, or
 * whatever TDB_WRITE leaves in `ret`. Once s->ret is set, subsequent calls
 * return immediately.
 *
 * Always returns `state`.
 */
static void *groupby_uuid_handle_one_trail(
    __uint128_t uuid __attribute__((unused)),
    Word_t *value,
    void *state)
{
    struct jm_fold_state *s = (struct jm_fold_state*)state;
    /* find the last event belonging to this trail */
    const struct tdb_cons_event *ev = &s->events[*value - 1];
    uint64_t j = 0;
    uint64_t num_events = 0;
    int ret = 0;

    if (s->ret)
        return s;

    /* loop through all events belonging to this trail,
       following back-links */
    while (1){
        if (j >= s->buf_size){
            /*
            Grow through a temporary: if realloc fails the old buffer is
            still allocated, so s->buf and s->buf_size must keep
            describing it for the owner of the state to release.
            */
            void *grown = realloc(s->buf,
                                  (s->buf_size + GROUPBUF_INCREMENT) *
                                  sizeof(struct tdb_grouped_event));
            if (!grown){
                ret = TDB_ERR_NOMEM;
                goto done;
            }
            s->buf = grown;
            s->buf_size += GROUPBUF_INCREMENT;
        }

        s->buf[j].trail_id = s->trail_id;
        s->buf[j].item_zero = ev->item_zero;
        s->buf[j].num_items = ev->num_items;
        s->buf[j].timestamp = ev->timestamp;

        /* TODO write a test for an extra long (>2^32) trail */
        if (++j == TDB_MAX_TRAIL_LENGTH){
            ret = TDB_ERR_TRAIL_TOO_LONG;
            goto done;
        }

        /* back-links are 1-based; 0 marks the first event of the trail */
        if (ev->prev_event_idx)
            ev = &s->events[ev->prev_event_idx - 1];
        else
            break;
    }
    num_events = j;

    /* sort events of this trail by time */
    /* TODO make this stable sort */
    /* TODO this could really benefit from Timsort since raw data
       is often partially sorted */
    qsort(s->buf,
          num_events,
          sizeof(struct tdb_grouped_event),
          compare);

    /* delta-encode timestamps */
    uint64_t prev_timestamp = s->min_timestamp;
    for (j = 0; j < num_events; j++){
        uint64_t timestamp = s->buf[j].timestamp;
        uint64_t delta = timestamp - prev_timestamp;

        if (delta < TDB_MAX_TIMEDELTA){
            /* track global statistics while we are at it */
            if (timestamp > s->max_timestamp)
                s->max_timestamp = timestamp;
            if (delta > s->max_timedelta)
                s->max_timedelta = delta;
            prev_timestamp = timestamp;
            /* convert the delta value to a proper item */
            s->buf[j].timestamp = tdb_make_item(0, delta);
        }else{
            ret = TDB_ERR_TIMESTAMP_TOO_LARGE;
            goto done;
        }
    }

    TDB_WRITE(s->grouped_w,
              s->buf,
              num_events * sizeof(struct tdb_grouped_event));

    ++s->trail_id;

done:
    s->ret = ret;
    return s;
}
/*
 * Encode all grouped events into the trails file and write its table of
 * contents.
 *
 * Events are read sequentially from `grouped` (rewound first), one
 * struct tdb_grouped_event at a time. Consecutive events sharing a
 * trail_id form one trail. Each event is edge-encoded, covered with grams
 * and Huffman-encoded into a bit buffer; the finished trail is appended to
 * the file at `path`. The byte offset of each trail is recorded in a TOC,
 * which is written to `toc_path` using 4-byte entries when the final file
 * offset is below UINT32_MAX and 8-byte entries otherwise.
 *
 * Returns 0 on success, TDB_ERR_NOMEM on allocation failure, an error
 * from the encoding helpers, or whatever the TDB_* I/O macros leave in
 * `ret`. All buffers allocated here are freed before returning.
 */
static tdb_error encode_trails(const tdb_item *items,
                               FILE *grouped,
                               uint64_t num_events,
                               uint64_t num_trails,
                               uint64_t num_fields,
                               const struct judy_128_map *codemap,
                               const struct judy_128_map *gram_freqs,
                               const struct field_stats *fstats,
                               const char *path,
                               const char *toc_path)
{
    __uint128_t *grams = NULL;
    tdb_item *prev_items = NULL;
    uint64_t *encoded = NULL;
    uint64_t encoded_size = 0;
    /* capacity of `buf`, in bits */
    uint64_t buf_size = INITIAL_ENCODING_BUF_BITS;
    /* 1-based count of events consumed so far; `ev` is event number i */
    uint64_t i = 1;
    char *buf = NULL;
    FILE *out = NULL;
    /* byte offset in the trails file where the next trail starts */
    uint64_t file_offs = 0;
    uint64_t *toc = NULL;
    struct gram_bufs gbufs;
    struct tdb_grouped_event ev;
    int ret = 0;
    char *write_buf = NULL;

    if ((ret = init_gram_bufs(&gbufs, num_fields)))
        goto done;

    if (!(write_buf = malloc(WRITE_BUFFER_SIZE))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }

    TDB_OPEN(out, path, "w");
    /* write_buf backs the stream, so it is freed only after the close */
    setvbuf(out, write_buf, _IOFBF, WRITE_BUFFER_SIZE);

    /* zero-initialized bit buffer with 8 bytes of slack */
    if (!(buf = calloc(1, buf_size / 8 + 8))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    if (!(prev_items = malloc(num_fields * sizeof(tdb_item)))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    /* one 16-byte gram slot per field */
    if (!(grams = malloc(num_fields * 16))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }
    /* one 8-byte offset per trail plus the trailing end offset */
    if (!(toc = malloc((num_trails + 1) * 8))){
        ret = TDB_ERR_NOMEM;
        goto done;
    }

    rewind(grouped);
    /* prime `ev` with the first event, if there is one */
    if (num_events)
        TDB_READ(grouped, &ev, sizeof(struct tdb_grouped_event));

    while (i <= num_events){
        /* encode trail for one UUID (multiple events) */

        /*
        reserve 3 bits in the head of the trail for a length residual:
        Length of a trail is measured in bytes but the last byte may
        be short. The residual indicates how many bits in the end we
        should ignore.
        */
        uint64_t offs = 3;
        uint64_t trail_id = ev.trail_id;
        uint64_t n, m, trail_size;

        /* NOTE(review): trail_id comes from the file and is not range
           checked here -- presumably always < num_trails; confirm */
        toc[trail_id] = file_offs;
        /* edge encoding is relative to the previous event of the same
           trail, so start each trail from an all-zero state */
        memset(prev_items, 0, num_fields * sizeof(tdb_item));

        while (ev.trail_id == trail_id){

            /* 1) produce an edge-encoded set of items for this event */
            if ((ret = edge_encode_items(items,
                                         &encoded,
                                         &n,
                                         &encoded_size,
                                         prev_items,
                                         &ev)))
                goto done;

            /* 2) cover the encoded set with a set of unigrams and bigrams */
            if ((ret = choose_grams_one_event(encoded,
                                              n,
                                              gram_freqs,
                                              &gbufs,
                                              grams,
                                              &m,
                                              &ev)))
                goto done;

            /* make sure the buffer can hold this event before encoding */
            uint64_t bits_needed = offs + huff_encoded_max_bits(m) + 64;
            if (bits_needed > buf_size){
                char *new_buf;
                buf_size = bits_needed * 2;
                /* a fresh zeroed buffer is used instead of realloc,
                   so the unwritten tail is all zero bits */
                if (!(new_buf = calloc(1, buf_size / 8 + 8))){
                    ret = TDB_ERR_NOMEM;
                    goto done;
                }
                /* carry over every byte touched so far, including the
                   partially filled last one */
                memcpy(new_buf, buf, offs / 8 + 1);
                free(buf);
                buf = new_buf;
            }

            /* 3) huffman-encode grams */
            huff_encode_grams(codemap,
                              grams,
                              m,
                              buf,
                              &offs,
                              fstats);

            /* fetch the next event unless this was the last one */
            if (i++ < num_events){
                TDB_READ(grouped, &ev, sizeof(struct tdb_grouped_event));
            }else
                break;
        }

        /* write the length residual */
        if (offs & 7){
            /* last byte is partial: round up and record the number of
               unused bits in the reserved head of the trail */
            trail_size = offs / 8 + 1;
            write_bits(buf, 0, 8 - (uint32_t)(offs & 7LLU));
        }else{
            trail_size = offs / 8;
        }

        /* append trail to the end of file */
        TDB_WRITE(out, buf, trail_size);

        file_offs += trail_size;
        /* clear the used part so the next trail starts from zero bits */
        memset(buf, 0, trail_size);
    }
    /*
    keep the redundant last offset in the TOC, so we can determine
    trail length with toc[i + 1] - toc[i].
    */
    toc[num_trails] = file_offs;

    /* write an extra 8 null bytes: huffman may require up to 7 when reading */
    uint64_t zero = 0;
    TDB_WRITE(out, &zero, 8);
    file_offs += 8;
    /* NOTE(review): if TDB_CLOSE jumps to `done` on failure, `out` still
       refers to the stream just closed and TDB_CLOSE_FINAL may close it a
       second time -- confirm against the macro definitions */
    TDB_CLOSE(out);

    /* second output: the table of contents */
    TDB_OPEN(out, toc_path, "w");
    /* narrow entries suffice when every offset fits in 32 bits */
    size_t offs_size = file_offs < UINT32_MAX ? 4 : 8;
    for (i = 0; i < num_trails + 1; i++)
        TDB_WRITE(out, &toc[i], offs_size);

done:
    TDB_CLOSE_FINAL(out);

    free(write_buf);
    free_gram_bufs(&gbufs);
    free(grams);
    free(encoded);
    free(prev_items);
    free(buf);
    free(toc);

    return ret;
}