/
plane_trie.hpp
454 lines (392 loc) · 12.9 KB
/
plane_trie.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
#pragma once
#include <stdint.h>
#include <cstdio>
#include <map>
#include <queue>
#include <vector>
#include <list>
#include <string>
#include <ostream>
#include <algorithm>
#include <iostream>
#include <boost/static_assert.hpp>
#include "queue.hpp"
#include "types.hpp"
#include "temp_node.hpp"
#include "utils.hpp"
// Fast-loading trie:
// built from a sorted string list,
// then used for fast read-only access.
/*
struct plane_node_t {
value_t value;
uint32_t child_count;
};
*/
/*
struct temp_node_t {
temp_node_t() : value(0) {}
value_t value;
std::vector<char_t> key_list;
std::vector<temp_node_t*> ptr_list;
std::vector<size_t> offset_list;
};
*/
// Handle to a trie laid out inside one memory region (filled in by load_trie /
// load_trie_by_root_offset).
struct trie_t {
// Root node; the loaders set it to addr + root offset.
plane_node_t *root_node;
// load_trie stores the region size here; load_trie_by_root_offset stores the
// root offset instead (marked TODO there).
size_t segment_size;
// Base address of the region; _get_node resolves node offsets relative to it.
void* addr;
};
// The accessors below locate a node's key bytes and child offsets at
// node + sizeof(plane_node_t), so the header size is pinned at compile time.
BOOST_STATIC_ASSERT(sizeof(plane_node_t) == 8);
/*
size_t calc_node_size(const temp_node_t *node) {
return sizeof(plane_node_t) + node->offset_list.size() * sizeof(void*) + round_up_8(node->key_list.size());
}
*/
// load trie structure from mapped region
// Fill in a trie handle from a mapped region.
//
// The last sizeof(void*) bytes of the region are read as a size_t holding the
// byte offset of the root node, measured from addr.
//   addr - start of the region
//   size - region size in bytes; must be at least sizeof(void*)
//   trie - receives addr, size and the resolved root node pointer
//
// Declared inline because this is a header: a non-inline definition would be
// emitted by every translation unit that includes it and clash at link time.
inline void load_trie(void* addr, size_t size, trie_t& trie) {
    char *base = static_cast<char*>(addr);
    const size_t root_offset =
        *reinterpret_cast<const size_t*>(base + size - sizeof(void*));
    trie.addr = addr;
    trie.segment_size = size;
    trie.root_node = reinterpret_cast<plane_node_t*>(base + root_offset);
}
// Fill in a trie handle when the root offset is already known.
//   addr        - start of the region
//   root_offset - byte offset of the root node from addr
//   trie        - receives addr and the resolved root node pointer
//
// Declared inline because this is a header: a non-inline definition would be
// emitted by every translation unit that includes it and clash at link time.
inline void load_trie_by_root_offset(void* addr, size_t root_offset, trie_t& trie) {
    trie.addr = addr;
    // TODO (from the original code): the real segment size is not known here,
    // so the root offset is stored in its place.
    trie.segment_size = root_offset;
    trie.root_node = reinterpret_cast<plane_node_t*>(static_cast<char*>(addr) + root_offset);
}
// Returns the first child key of a node; the keys sit directly behind the
// fixed-size node header.
inline const char_t* _get_key_begin(const plane_node_t *node) {
    const char *raw = reinterpret_cast<const char*>(node);
    return reinterpret_cast<const char_t*>(raw + sizeof(plane_node_t));
}
// Returns the first child offset of a node. The offsets follow the key bytes,
// whose length (child_count) is passed through round_up_8 first.
inline const size_t* _get_offset_begin(const plane_node_t *node) {
    const char *raw = reinterpret_cast<const char*>(node);
    return reinterpret_cast<const size_t*>(
        raw + sizeof(plane_node_t) + round_up_8(node->child_count));
}
// Turns a byte offset (relative to trie.addr) into a node pointer.
inline const plane_node_t* _get_node(const trie_t& trie, size_t offset) {
    const char *base = static_cast<const char*>(trie.addr);
    return reinterpret_cast<const plane_node_t*>(base + offset);
}
// Strict ordering of key characters by their value converted to uint8_t, so
// the result does not depend on the signedness of char_t.
struct less_t {
    inline bool operator ()(const char_t &lhs, const char_t& rhs) const {
        const uint8_t left = static_cast<uint8_t>(lhs);
        const uint8_t right = static_cast<uint8_t>(rhs);
        return left < right;
    }
};
bool _find_child_node(const trie_t &trie, const plane_node_t *curr_node, const char_t &key, const plane_node_t **next_node) {
char_t *key_begin = (char_t*)((char*)(curr_node) + sizeof(plane_node_t));
char_t *key_end = key_begin + curr_node->child_count;
static less_t _less;
char_t *key_ptr = std::lower_bound(key_begin, key_end, key, _less);
if (key_end == key_ptr or *key_ptr != key) {
return false;
}
size_t *offset_begin = (size_t*)((char*)(curr_node) + sizeof(plane_node_t) + round_up_8(curr_node->child_count));
size_t new_offset = *(offset_begin + (size_t)(key_ptr - key_begin));
*next_node = _get_node(trie, new_offset); //(const plane_node_t*)((char*)trie.addr + new_offset);
return true;
}
// Exact lookup of a zero-terminated word.
//   trie - trie to search
//   word - zero-terminated key
// Returns the value stored at the node reached by following word from the
// root, or 0 if some character has no matching child. A missing word is
// therefore indistinguishable from a node whose stored value is 0.
//
// Declared inline because this is a header: a non-inline definition would be
// emitted by every translation unit that includes it and clash at link time.
inline value_t get_node(const trie_t &trie, const char_t* word) {
    // Child offsets are stored as size_t and used as address deltas.
    BOOST_STATIC_ASSERT(sizeof(size_t) == sizeof(void*));
    const plane_node_t *curr_node = trie.root_node;
    for (const char_t *ch = word; *ch != 0; ++ch) {
        const plane_node_t *next_node;
        if (!_find_child_node(trie, curr_node, *ch, &next_node)) return 0;
        curr_node = next_node;
    }
    return curr_node->value;
}
// Work item of fuzzy_search_impl. It is initialised positionally there, so the
// member order matters.
struct task_t {
// Index of the next query character to consume.
size_t word_pos;
// Number of trie edges followed so far (bumped whenever node moves to a child).
size_t dict_pos;
// Remaining edit budget; starts at max_dist.
size_t ttl;
// Trie node this task has reached.
const plane_node_t *node;
};
// Fuzzy lookup of a single query in the trie.
//
// Starting from the root with a budget of max_dist edits, tasks are taken from
// the head of a queue. While a task still has budget, edited variants of it are
// queued (each costs one unit of ttl); then the task itself either follows the
// next query character for free or is dropped.
//
//   trie           - trie to search
//   word           - query characters
//   exit_condition - exit_condition(word, pos) is true once pos is past the end of the query
//   max_dist       - edit budget given to the initial task
//   output         - called as output(value, max_dist - ttl) for every task that has
//                    consumed the whole query and sits on a node with a non-zero value
//
// NOTE(review): nothing here deduplicates results, so output can presumably be
// called more than once for the same value - confirm that callers cope.
template <typename exit_condition_t, typename output_t>
void fuzzy_search_impl(
const trie_t &trie,
const char_t* word,
const exit_condition_t &exit_condition,
size_t max_dist,
output_t &output
){
//task: word_pos, dict_pos, ttl
// NOTE(review): the meaning of the second queue_t template argument is defined in
// queue.hpp, which is not visible here.
queue_t<task_t, (sizeof(task_t) * 1024 - sizeof(void*)) / sizeof(task_t)> q;
//std::queue<task_t> q;
q.push((task_t){0, 0, max_dist, trie.root_node});
while(!q.empty()) {
// task aliases the queue head and is still read after the pushes below; this
// assumes queue_t::push keeps references to the front valid - TODO confirm.
const task_t &task = q.front();
if (task.ttl) {
// delete
// foreach curr_node keys
// Step into every child without consuming a query character.
const size_t *offset_begin = _get_offset_begin(task.node);
for (size_t i = 0; i < task.node->child_count; ++i) {
q.push(task);
q.back().ttl -= 1;
q.back().dict_pos += 1;
q.back().node = _get_node(trie, offset_begin[i]);
}
// insert
// Consume one query character without moving in the trie.
// (task.ttl is already known to be non-zero inside this branch.)
if (task.ttl and !exit_condition(word, task.word_pos)) {
q.push(task);
q.back().ttl -= 1;
q.back().word_pos += 1;
}
// replace
// foreach curr_node keys
// Consume one query character and step into every child whose key differs from it.
if (!exit_condition(word, task.word_pos)) {
for (size_t i = 0; i < task.node->child_count; ++i) {
const char_t &key = ((char_t*)((char*)task.node + sizeof(plane_node_t)))[i];
if (key == word[task.word_pos]) {
continue; // ignore replace if it's the same as paste
}
q.push(task);
q.back().ttl -= 1;
q.back().dict_pos += 1;
q.back().word_pos += 1;
q.back().node = _get_node(trie, offset_begin[i]);
}
}
// transpose
// check two positions
// (not implemented)
}
// Set when the head task is advanced in place instead of being popped.
bool no_pop = false;
if (!exit_condition(word, task.word_pos)) {
// paste
// Follow the child matching the next query character; this costs no ttl.
const plane_node_t *next_node;
if (_find_child_node(trie, task.node, word[task.word_pos], &next_node)) {
no_pop = true;
// Same element as `task`, reached through a non-const reference.
task_t &new_task = q.front();
// q.push(task);
// task_t &new_task = q.back();
new_task.dict_pos += 1;
new_task.word_pos += 1;
new_task.node = next_node;
}
} else {
// report found word
// The whole query is consumed: report the node if it stores a non-zero value.
if (task.node->value) {
output(task.node->value, max_dist - task.ttl);
// std::cout << "FOUND: " << task.node->value << std::endl;
}
}
if (not no_pop)
q.pop();
}
}
// Work item of the two-trie fuzzy_search. It is initialised positionally there,
// so the member order matters.
struct _fuzzy_search_task_t {
// Current node in the dictionary trie.
const plane_node_t *dict_node;
// Current node in the word (query) trie.
const plane_node_t *word_node;
// Remaining edit budget; starts at max_dist.
size_t ttl;
};
// Comparator for the std::priority_queue in the two-trie fuzzy_search.
// Tasks compare lexicographically by (word_node address, dict_node address),
// larger address meaning "greater"; on a full tie the task with the smaller
// ttl counts as greater. With std::priority_queue this makes top() the task
// with the lowest word_node address, then the lowest dict_node address, then
// the largest ttl.
struct _fuzzy_search_greater_task_t {
    bool operator() (const _fuzzy_search_task_t& lhs, const _fuzzy_search_task_t& rhs) const {
        // descendants have lower addresses than their parents
        // siblings go in key order
        if (lhs.word_node != rhs.word_node) {
            return lhs.word_node > rhs.word_node;
        }
        if (lhs.dict_node != rhs.dict_node) {
            return lhs.dict_node > rhs.dict_node;
        }
        return lhs.ttl < rhs.ttl;
    }
};
// True for a node that stores no value (value == 0) and has no children.
inline bool _empty_node(const plane_node_t* node) {
    const bool has_value = !(node->value == 0);
    if (has_value) {
        return false;
    }
    return node->child_count == 0;
}
// Get the fuzzy intersection of two tries.
// lower_limit - the lowest word in the task trie
// upper_limit - a word after the largest word in the task trie
//
// Output format: pairs word_id -> dict_id
// Walks dict_trie and word_trie in lock-step, allowing up to max_dist edits,
// and calls output.insert(std::make_pair(word value, dict value)) for every
// visited pair of nodes that both store a non-zero value.
// NOTE: lower_limit and upper_limit are not referenced anywhere in the body.
template <typename output_t>
void fuzzy_search(
const trie_t &dict_trie,
const trie_t &word_trie,
const char_t *lower_limit,
const char_t *upper_limit,
size_t max_dist,
output_t &output) {
less_t _less;
typedef _fuzzy_search_task_t task_t;
typedef _fuzzy_search_greater_task_t greater_task_t;
// Only referenced by the commented-out duplicate-skipping loop further down.
struct equal_task_t {
bool operator()(const task_t &lhs, const task_t &rhs) {
return
lhs.word_node == rhs.word_node and
lhs.dict_node == rhs.dict_node;// and
// lhs.ttl <= rhs.ttl;
}
};
//queue_t<task_t, (sizeof(task_t) * 1024 - sizeof(void*)) / sizeof(task_t)> q;
// Pending tasks, ordered by greater_task_t.
std::priority_queue<task_t, typename std::vector<task_t>, greater_task_t> q;
// Start at both roots with the full edit budget.
q.push((task_t){dict_trie.root_node, word_trie.root_node, max_dist});
// size_t step_no = 0;
// size_t skip_no = 0;
while (! q.empty() ) {
//++step_no;
// Copy the task out before popping; it is used for the rest of the iteration.
const task_t task = q.top();
q.pop();
// while (! q.empty() and equal_task_t()(q.top(), task)) {
// ++skip_no;
// q.pop();
// }
// if (step_no % 10000 == 0) {
// std::cout << "step_no:" << step_no << "\tq_size:" << q.size() << "\tskip_no:" << skip_no <<"\t"<<task.word_node << "\t"<<task.dict_node << "\t" << task.ttl<< std::endl;
// }
const size_t *dict_offset_begin = _get_offset_begin(task.dict_node);
const size_t *word_offset_begin = _get_offset_begin(task.word_node);
// The three edit moves below each cost one unit of ttl and skip empty children.
if (task.ttl) {
//delete
// Advance only in the dictionary trie.
for (size_t i = 0; i < task.dict_node->child_count; ++i) {
const plane_node_t *dict_child = _get_node(dict_trie, dict_offset_begin[i]);
if (_empty_node(dict_child)) continue;
task_t new_task = task;
new_task.ttl -= 1;
new_task.dict_node = dict_child;
q.push(new_task);
}
// insert
// Advance only in the word trie.
for (size_t i = 0; i < task.word_node->child_count; ++i) {
const plane_node_t *word_child = _get_node(word_trie, word_offset_begin[i]);
if (_empty_node(word_child)) continue;
task_t new_task = task;
new_task.ttl -= 1;
new_task.word_node = word_child;
q.push(new_task);
}
// replace
// foreach curr_node keys
// Advance in both tries along every pair of children whose keys differ.
for (size_t dict_child_id = 0; dict_child_id < task.dict_node->child_count; ++dict_child_id) {
const plane_node_t *dict_child = _get_node(dict_trie, dict_offset_begin[dict_child_id]);
if (_empty_node(dict_child)) continue;
for (size_t word_child_id = 0; word_child_id < task.word_node->child_count; ++word_child_id) {
const plane_node_t *word_child = _get_node(word_trie, word_offset_begin[word_child_id]);
if (_empty_node(word_child)) continue;
const char_t &dict_key = ((char_t*)((char*)task.dict_node + sizeof(plane_node_t)))[dict_child_id];
const char_t &word_key = ((char_t*)((char*)task.word_node + sizeof(plane_node_t)))[word_child_id];
if (dict_key == word_key) {
continue; // ignore replace if it's the same as paste
}
task_t new_task = task;
new_task.ttl -= 1;
new_task.dict_node = dict_child;
new_task.word_node = word_child;
q.push(new_task);
}
}
}
// paste
// Merge-style walk over both key arrays (assumes each is sorted by less_t):
// equal keys are followed in both tries without spending ttl.
for (size_t i = 0, j = 0; i < task.dict_node->child_count && j < task.word_node->child_count; ) {
const char_t &dict_key = ((char_t*)((char*)task.dict_node + sizeof(plane_node_t)))[i];
const char_t &word_key = ((char_t*)((char*)task.word_node + sizeof(plane_node_t)))[j];
if (_less(dict_key, word_key)) {
++i;
continue;
} else if (_less(word_key, dict_key)) {
++j;
continue;
} else {
// Keys match; skip the pair if either child is empty.
const plane_node_t *dict_child = _get_node(dict_trie, dict_offset_begin[i]);
if (_empty_node(dict_child)) {
++i;
++j;
continue;
}
const plane_node_t *word_child = _get_node(word_trie, word_offset_begin[j]);
if (_empty_node(word_child)) {
++i;
++j;
continue;
}
//std::cout << "EQ " << i << " " << j << " : " << word_key << " " << dict_key << std::endl;
task_t new_task = task;
new_task.dict_node = dict_child;
new_task.word_node = word_child;
q.push(new_task);
++i;
++j;
}
}
// Both nodes carry a value: record the (word value, dict value) pair.
if (task.dict_node->value && task.word_node->value) {
output.insert(std::make_pair(task.word_node->value, task.dict_node->value));
}
}
}
//void save_trie(std::ostream &os, const trie_t& trie);
template <typename exit_condition_t>
inline bool trie_find_impl(
const trie_t &trie,
const char_t* key,
const exit_condition_t &exit_condition,
uint32_t &value
) {
const plane_node_t *current_node = trie.root_node;
for (size_t i = 0; !exit_condition(key, i); ++i) {
const plane_node_t *next_node;
if (_find_child_node(trie, current_node, key[i], &next_node)) {
current_node = next_node;
} else {
return false;
}
}
value = current_node->value;
return true;
}
struct zero_str_exit_condition_t {
inline bool operator()(const char_t*key, size_t i) const {
return key[i] == '\0';
}
};
// Exact lookup of a zero-terminated key.
//   trie  - trie to search
//   key   - zero-terminated key
//   value - receives the value of the node reached; untouched on failure
// Returns true if every key character could be followed in the trie.
//
// Declared inline because this is a header: a non-inline definition would be
// emitted by every translation unit that includes it and clash at link time.
inline bool trie_find(const trie_t &trie, const char_t* key, uint32_t &value) {
    zero_str_exit_condition_t exit_condition;
    return trie_find_impl(trie, key, exit_condition, value);
}
// End-of-key test for keys with an explicit length: true once position i has
// reached the length given at construction. The key pointer is ignored.
struct str_len_exit_condition_t {
    str_len_exit_condition_t(size_t len): len_(len) {}
    inline bool operator()(const char_t* /*key*/, size_t i) const {
        const bool inside_key = i < len_;
        return !inside_key;
    }
    size_t len_;
};
// Exact lookup of a key given as pointer + length.
//   trie    - trie to search
//   key     - key characters (need not be zero-terminated)
//   key_len - number of characters in key
//   value   - receives the value of the node reached; untouched on failure
// Returns true if every key character could be followed in the trie.
//
// Declared inline because this is a header: a non-inline definition would be
// emitted by every translation unit that includes it and clash at link time.
inline bool trie_find(const trie_t &trie, const char_t* key, size_t key_len, uint32_t &value) {
    str_len_exit_condition_t exit_condition(key_len);
    return trie_find_impl(trie, key, exit_condition, value);
}
template <typename output_t>
void fuzzy_search(
const trie_t &trie,
const char_t* word,
size_t max_dist,
output_t &output
) {
zero_str_exit_condition_t exit_condition;
fuzzy_search_impl<zero_str_exit_condition_t, output_t>(trie, word, exit_condition, max_dist, output);
}
template <typename output_t>
void fuzzy_search(
const trie_t &trie,
const char_t* key,
size_t key_len,
size_t max_dist,
output_t &output
) {
str_len_exit_condition_t exit_condition(key_len);
fuzzy_search_impl<str_len_exit_condition_t, output_t>(trie, key, exit_condition, max_dist, output);
}
// Result sink that inserts every reported value into a caller-owned container
// and discards the distance. Only a reference to the container is kept, so the
// container must outlive the sink.
template <typename container_t>
struct inserter_out_t {
    inserter_out_t(container_t &target) : storage_(target) {}

    // Called once per reported value; the second argument (distance) is ignored.
    void operator() (const uint32_t &found, size_t) {
        storage_.insert(found);
    }

    container_t &storage_;
};
// Result sink that records value -> distance in its own map. std::map::insert
// does not overwrite, so the distance reported first for a value is the one
// that is kept.
struct map_out_t {
    typedef std::map<uint32_t, size_t> storage_t;

    // Called once per reported value.
    void operator() (const uint32_t &value, size_t distance) {
        const storage_t::value_type entry(value, distance);
        storage_.insert(entry);
    }

    storage_t storage_;
};