/**
 * Hash every string in @p strs using a per-character lookup table.
 *
 * The hash of a string is its length plus, for every position listed in
 * @p pos that lies inside the string, the @p assoc entry selected by the
 * character found at that position.
 *
 * @param strs  strings to hash
 * @param pos   character positions that contribute to the hash
 * @param assoc lookup table indexed by character code
 * @return one hash per input string, in the same order as @p strs
 */
ivec_t do_hash(const words_t &strs, const ivec_t &pos, const ivec_t &assoc)
{
    ivec_t ivec;
    ivec.reserve(strs.size());

    const int table_size = static_cast<int>(assoc.size());
    for(const auto &s : strs) {
        const int len = static_cast<int>(s.size());
        int t = len;
        for(const auto p : pos) {
            // Positions outside the string contribute nothing.
            if(p < 0 || p >= len)
                continue;
            // Go through unsigned char: plain char may be signed, which
            // would turn bytes >= 0x80 into negative table indices.
            const int idx = static_cast<unsigned char>(s[p]);
            // Characters the table has no entry for contribute nothing
            // instead of reading past the end of the table.
            if(idx < table_size)
                t += assoc[idx];
        }
        ivec.push_back(t);
    }
    return ivec;
}
/**
 * Search for a character lookup table that minimises hash collisions.
 *
 * Starting from an all-zero table of 127 entries, the search makes up to
 * four passes over every distinct character that occurs in @p strs. For each
 * character it tries the values 0..99 and keeps the first one that yields the
 * lowest count_dups() result for do_hash(strs, pos, table). The search stops
 * early once a pass no longer lowers that count.
 *
 * @param strs strings that should hash to distinct values
 * @param pos  character positions fed to do_hash()
 * @return the 127-entry table found by the search
 */
ivec_t find_assoc(const words_t &strs, const ivec_t &pos)
{
    const int N = 127;

    ivec_t assoc;
    assoc.reserve(N);
    for(int i=0; i<N; ++i)
        assoc.push_back(0);

    // Collect each distinct character once, in order of first appearance.
    std::vector<char> useful_chars;
    for(const auto &w:strs) {
        for(auto c:w) {
            // The table only has entries for codes 0..N-1. Anything else
            // (including negative values when char is signed) would be
            // written outside the table below, so it is left out of the
            // search.
            const int code = c;
            if(code < 0 || code >= N)
                continue;
            if(!has(useful_chars, c))
                useful_chars.push_back(c);
        }
    }

    int current_dups   = strs.size();
    int assoc_best     = -1;
    int assoc_best_val = INT_MAX;
    for(int k=0; k<4; ++k) {
        for(int i:useful_chars) {
            // Restart the per-character search; j == 0 is always accepted.
            assoc_best_val = INT_MAX;
            for(int j=0; j<100; ++j) {
                assoc[i] = j;
                auto hashed = do_hash(strs, pos, assoc);
                int d = count_dups(hashed);
                // Strict comparison keeps the smallest j among ties.
                if(d < assoc_best_val) {
                    assoc_best_val = d;
                    assoc_best     = j;
                }
            }
            assoc[i] = assoc_best;
        }
        // assoc_best_val now holds the count reached after the last
        // character of this pass; stop when the pass brought no improvement.
        if(assoc_best_val >= current_dups)
            break;
        current_dups = assoc_best_val;
    }
    return assoc;
}
/**
 * Greedily pick character positions whose hash separates all strings.
 *
 * Each round tries every position not chosen yet (up to the length of the
 * longest string), hashes @p strs with the chosen positions plus that
 * candidate, and remembers the candidate with the lowest count_dups()
 * result. The best candidate is kept when it lowers the count; otherwise
 * the search ends.
 *
 * @param strs strings that should hash to distinct values
 * @return the chosen positions, or an empty list when the final hash still
 *         has a non-zero count_dups() result
 */
ivec_t find_pos(words_t &strs)
{
    // The longest string bounds the positions worth trying.
    int longest = 0;
    for(const auto &word : strs)
        longest = rtosc_max(longest, word.length());

    ivec_t chosen;
    int best_idx    = -1;
    int fewest_dups = INT_MAX;
    int dups_so_far = strs.size();

    for(;;) {
        for(int idx = 0; idx < longest; ++idx) {
            if(has(chosen, idx))
                continue;

            ivec_t trial = chosen;
            trial.push_back(idx);

            auto trial_hashes = do_hash(strs, trial);
            int trial_dups = count_dups(trial_hashes);
            // fewest_dups carries over between rounds, so only a candidate
            // that beats every earlier result is recorded.
            if(trial_dups < fewest_dups) {
                fewest_dups = trial_dups;
                best_idx    = idx;
            }
        }

        // No candidate improved on the previous round: stop searching.
        if(fewest_dups >= dups_so_far)
            break;

        dups_so_far = fewest_dups;
        chosen.push_back(best_idx);
    }

    // Report failure with an empty list if duplicates remain.
    auto final_hashes = do_hash(strs, chosen);
    if(count_dups(final_hashes) != 0)
        chosen.clear();
    return chosen;
}
bool marky::Backend_SQLite::get_next(const State& state, selector_t selector, scorer_t scorer, const words_t& search_words, word_t& next) { #ifdef READ_DEBUG_ENABLED DEBUG("get_next(%s)", str(search_words).c_str()); #endif if (!bind_words(stmt_get_nexts, 1, search_words)) { sqlite3_clear_bindings(stmt_get_nexts); sqlite3_reset(stmt_get_nexts); return false; } bool ok = true; snippets_ptr_t snippets(new snippet_ptr_set_t); for (;;) { int step = sqlite3_step(stmt_get_nexts); bool done = false; switch (step) { case SQLITE_DONE: done = true; break; case SQLITE_ROW: { words_t words; unpack((const char*)sqlite3_column_text(stmt_get_nexts, 0), words); snippet_t snippet(new Snippet(words, sqlite3_column_int64(stmt_get_nexts, 1), sqlite3_column_int64(stmt_get_nexts, 2), sqlite3_column_int64(stmt_get_nexts, 3))); snippets->insert(snippet); break; } default: ok = false; ERROR("Error when parsing response to '%s': %d/%s", QUERY_GET_NEXTS, step, sqlite3_errmsg(db)); break; } if (!ok || done) { break; } } sqlite3_clear_bindings(stmt_get_nexts); sqlite3_reset(stmt_get_nexts); if (snippets->empty()) { if (search_words.size() >= 2) { words_t search_words_shortened(++search_words.begin(), search_words.end()); #ifdef READ_DEBUG_ENABLED DEBUG(" get_next -> %s", str(search_words_shortened).c_str()); #endif /* recurse with shorter search */ return get_next(state, selector, scorer, search_words_shortened, next); } else { #ifdef READ_DEBUG_ENABLED DEBUG(" next_snippet -> NOTFOUND"); #endif next = IBackend::LINE_END; } } else { const words_t& next_snippet = selector(*snippets, scorer, state)->words; #ifdef READ_DEBUG_ENABLED for (snippet_ptr_set_t::const_iterator siter = snippets->begin(); siter != snippets->end(); ++siter) { DEBUG(" nexts%s = snippet(%s, %lu)", str(search_words).c_str(), str((*siter)->words).c_str(), (*siter)->score(scorer, state)); } DEBUG(" next_snippet -> %s", str(next_snippet).c_str()); #endif next = next_snippet.back(); } return ok; }