/* Take an event from the old db, translate its items to new vals and
 * append it to the new cons.
 *
 * uuid_ptr holds the index of the previously appended event for this
 * uuid (events are chained into a per-uuid linked list via
 * prev_event_idx); it is updated to reference the event appended here.
 * lexicon_maps[field - 1][old_val - 1] is the value's index in the new
 * lexicon; val 0 is the special NULL value and maps to itself. */
static void append_event(tdb_cons *cons,
                         const tdb_event *event,
                         Word_t *uuid_ptr,
                         tdb_val **lexicon_maps)
{
    struct tdb_cons_event *dst =
        (struct tdb_cons_event*)arena_add_item(&cons->events);
    uint64_t j;

    dst->item_zero = cons->items.next;
    dst->num_items = 0;
    dst->timestamp = event->timestamp;
    dst->prev_event_idx = *uuid_ptr;
    /* the event just allocated becomes the head of this uuid's chain */
    *uuid_ptr = cons->events.next;

    for (j = 0; j < event->num_items; j++){
        const tdb_field field = tdb_item_field(event->items[j]);
        const tdb_val old_val = tdb_item_val(event->items[j]);
        /* translate the old lexicon index to the new one (0 stays 0) */
        const tdb_val new_val =
            old_val ? lexicon_maps[field - 1][old_val - 1] : 0;
        tdb_item new_item = tdb_make_item(field, new_val);

        memcpy(arena_add_item(&cons->items), &new_item, sizeof(tdb_item));
        dst->num_items++;
    }
}
/* This function adds events from db to cons one by one, using the
 * public API. We need to use this with filtered dbs or otherwise when
 * we need to re-create lexicons.
 *
 * Returns 0 on success, TDB_ERR_NOMEM or any error propagated from
 * tdb_get_trail / tdb_cons_add on failure. */
static tdb_error tdb_cons_append_subset_lexicon(tdb_cons *cons, const tdb *db)
{
    const uint64_t num_fields = tdb_num_fields(db);
    const char **values = NULL;
    uint64_t *lengths = NULL;
    uint64_t trail_id;
    int err = 0;

    tdb_cursor *cursor = tdb_cursor_new(db);
    if (!cursor)
        return TDB_ERR_NOMEM;

    values = malloc(num_fields * sizeof(char*));
    lengths = malloc(num_fields * sizeof(uint64_t));
    if (!values || !lengths){
        err = TDB_ERR_NOMEM;
        goto done;
    }

    for (trail_id = 0; trail_id < tdb_num_trails(db); trail_id++){
        const uint8_t *uuid = tdb_get_uuid(db, trail_id);
        const tdb_event *event;
        uint64_t k;

        if ((err = tdb_get_trail(cursor, trail_id)))
            goto done;

        while ((event = tdb_cursor_next(cursor))){
            /* with TDB_OPT_ONLY_DIFF_ITEMS event->items may be sparse,
               hence we need to reset lengths to zero */
            memset(lengths, 0, num_fields * sizeof(uint64_t));

            for (k = 0; k < event->num_items; k++){
                const tdb_field field = tdb_item_field(event->items[k]);
                const tdb_val val = tdb_item_val(event->items[k]);
                values[field - 1] =
                    tdb_get_value(db, field, val, &lengths[field - 1]);
            }

            if ((err = tdb_cons_add(cons,
                                    uuid,
                                    event->timestamp,
                                    values,
                                    lengths)))
                goto done;
        }
    }

done:
    free(values);
    free(lengths);
    tdb_cursor_free(cursor);
    return err;
}
/* Decode the next batch of events from the cursor's current trail into
 * the cursor's events buffer.
 *
 * Returns 1 if at least one event was decoded, 0 if the trail (or the
 * buffer) is exhausted. On return, cursor->next_event points at the
 * start of the buffer and cursor->num_events_left holds the count.
 *
 * NOTE(review): relies on s->previous_items carrying over field values
 * between events (edge encoding) — presumably reset elsewhere per trail;
 * verify against the trail-setup code. */
TDB_EXPORT int _tdb_cursor_next_batch(tdb_cursor *cursor)
{
    struct tdb_decode_state *s = cursor->state;
    const struct huff_codebook *codebook =
        (const struct huff_codebook*)s->db->codebook.data;
    const struct field_stats *fstats = s->db->field_stats;
    uint64_t *dst = (uint64_t*)s->events_buffer;
    uint64_t i = 0;           /* write index into dst (in 64-bit words) */
    uint64_t num_events = 0;
    tdb_field field;
    tdb_item item;
    const int edge_encoded = s->edge_encoded;

    /* decode the trail - exit early if destination buffer runs out of space */
    while (s->offset < s->size && num_events < s->events_buffer_len){
        /* Every event starts with a timestamp. Timestamp may be the
           first member of a bigram */
        __uint128_t gram = huff_decode_value(codebook,
                                             s->data,
                                             &s->offset,
                                             fstats);
        /* remember where this event starts so it can be discarded if
           the filter rejects it */
        uint64_t orig_i = i;
        /* timestamps are delta-encoded; the delta is the val part of
           the first item of the gram */
        uint64_t delta = tdb_item_val(HUFF_BIGRAM_TO_ITEM(gram));
        uint64_t *num_items;

        /*
        events buffer format:

           [ [ timestamp | num_items | items ... ] tdb_event 1
             [ timestamp | num_items | items ... ] tdb_event 2
             ...
             [ timestamp | num_items | items ... ] tdb_event N ]

        note that events may have a varying number of items, due to
        edge encoding
        */

        s->tstamp += delta;
        dst[i++] = s->tstamp;
        /* num_items is patched in below, once the event is finalized */
        num_items = &dst[i++];

        item = HUFF_BIGRAM_OTHER_ITEM(gram);

        /* handle a possible latter part of the first bigram */
        if (item){
            field = tdb_item_field(item);
            s->previous_items[field] = item;
            if (edge_encoded)
                dst[i++] = item;
        }

        /* decode one event: timestamp is followed by at most
           num_ofields field values */
        while (s->offset < s->size){
            uint64_t prev_offs = s->offset;

            gram = huff_decode_value(codebook,
                                     s->data,
                                     &s->offset,
                                     fstats);
            item = HUFF_BIGRAM_TO_ITEM(gram);
            field = tdb_item_field(item);

            if (field){
                /* value may be either a unigram or a bigram */
                do{
                    s->previous_items[field] = item;
                    if (edge_encoded)
                        dst[i++] = item;
                    /* the second half of a bigram (zero if unigram) */
                    gram = item = HUFF_BIGRAM_OTHER_ITEM(gram);
                }while ((field = tdb_item_field(item)));
            }else{
                /* we hit the next timestamp, take a step back and break */
                s->offset = prev_offs;
                break;
            }
        }

        /* evaluate the filter against the fully-merged item state */
        if (!s->filter || event_satisfies_filter(s->previous_items,
                                                 s->filter,
                                                 s->filter_len)){
            /* no filter or filter matches, finalize the event */
            if (!edge_encoded){
                /* dump all the fields of this event in the result,
                   if edge encoding is not requested */
                for (field = 1; field < s->db->num_fields; field++)
                    dst[i++] = s->previous_items[field];
            }
            ++num_events;
            /* item count = words written after timestamp + num_items */
            *num_items = (i - (orig_i + 2));
        }else{
            /* filter doesn't match - ignore this event by rewinding
               the write index to the event's start */
            i = orig_i;
        }
    }

    cursor->next_event = s->events_buffer;
    cursor->num_events_left = num_events;

    return num_events > 0 ? 1: 0;
}