/*
 * Builds a sample trie from a fixed-stride subset of `strings`: the entries
 * at indices 0, 8192, 16384, ... that are below `n`.
 *
 * Every sampled string is read one CharT at a time with get_char() until
 * is_end() reports the terminating character. At each position the current
 * node is extended to cover the character, and a fresh child TrieNode is
 * installed in its bucket slot whenever is_trie() is false for it.
 *
 * At most 5000 child nodes are created when sizeof(CharT) == 1, and at most
 * 2000 otherwise; sampling stops as soon as that budget is exhausted.
 *
 * Note: the debug line prints n/8192 rounded down, while the loop also
 * visits index 0 when n is not a multiple of 8192.
 *
 * Returns the root node. It is allocated with `new` and is not freed here.
 */
static TrieNode<CharT, BucketT>*
pseudo_sample(unsigned char** strings, size_t n)
{
    const size_t stride = 8192;
    debug()<<__func__<<"(): sampling "<<n/stride<<" strings ...\n";

    // Remaining number of child nodes this sample is allowed to create.
    size_t nodes_left;
    if (sizeof(CharT) == 1) {
        nodes_left = 5000;
    } else {
        nodes_left = 2000;
    }

    TrieNode<CharT, BucketT>* root = new TrieNode<CharT, BucketT>;

    for (size_t idx = 0; idx < n; idx += stride) {
        unsigned char* str = strings[idx];
        TrieNode<CharT, BucketT>* cur = root;

        // Walk the string one CharT per step, descending a level each time.
        for (size_t depth = 0; ; depth += sizeof(CharT)) {
            CharT c = get_char<CharT>(str, depth);
            if (is_end(c)) {
                break;
            }

            cur->extend(c+1);
            if (!cur->is_trie(c)) {
                // make_trie() is applied to the slot itself, not to a copy.
                cur->_buckets[c] = new TrieNode<CharT, BucketT>;
                make_trie(cur->_buckets[c]);
                if (--nodes_left == 0) {
                    return root;
                }
            }

            cur = cur->get_node(c);
            assert(cur);
        }
    }

    return root;
}
/*
 * Builds a sample trie from n/8192 entries of `strings`, each picked at
 * index size_t(drand48()*n). The same entry may be picked more than once.
 *
 * Every picked string is read one CharT at a time with get_char() until
 * is_end() reports the terminating character. At each position the current
 * node is extended to cover the character, and a fresh child TrieNode is
 * installed in its bucket slot whenever is_trie() is false for it.
 *
 * At most 5000 child nodes are created when sizeof(CharT) == 1, and at most
 * 2000 otherwise; sampling stops as soon as that budget is exhausted.
 *
 * Returns the root node. It is allocated with `new` and is not freed here.
 */
static TrieNode<CharT, BucketT>*
random_sample(unsigned char** strings, size_t n)
{
    const size_t sample_size = n/8192;
    debug()<<__PRETTY_FUNCTION__<<" sampling "<<sample_size<<" strings\n";

    // Remaining number of child nodes this sample is allowed to create.
    size_t nodes_left;
    if (sizeof(CharT) == 1) {
        nodes_left = 5000;
    } else {
        nodes_left = 2000;
    }

    TrieNode<CharT, BucketT>* root = new TrieNode<CharT, BucketT>;

    for (size_t round = 0; round < sample_size; ++round) {
        // One drand48() draw per sampled string.
        const size_t pick = size_t(drand48()*n);
        unsigned char* str = strings[pick];
        TrieNode<CharT, BucketT>* cur = root;

        // Walk the string one CharT per step, descending a level each time.
        for (size_t depth = 0; ; depth += sizeof(CharT)) {
            CharT c = get_char<CharT>(str, depth);
            if (is_end(c)) {
                break;
            }

            cur->extend(c+1);
            if (!cur->is_trie(c)) {
                // make_trie() is applied to the slot itself, not to a copy.
                cur->_buckets[c] = new TrieNode<CharT, BucketT>;
                make_trie(cur->_buckets[c]);
                if (--nodes_left == 0) {
                    return root;
                }
            }

            cur = cur->get_node(c);
            assert(cur);
        }
    }

    return root;
}