int main() {
    int runs;
    scanf("%d", &runs);
    for (int run = 1; run <= runs; ++run) {
        scanf("%s", str);
        int ls = strlen(str);
        // Build num = str + separator (1) + reverse(str) + terminator (0).
        for (int i = 0; i < ls; ++i) num[i] = str[i];
        num[ls] = 1;
        for (int i = 0; i < ls; ++i) num[ls + 1 + i] = str[ls - i - 1];
        num[ls + ls + 1] = 0;
        sa.init(num, ls + ls + 1, 128);
        sa.init_rmq();
        memset(vis, 0, sizeof(vis));
        int ans = 0, cnt = 0;
        for (int i = 1; i <= ls + ls; ++i) {
            cnt = min(cnt, sa.height[i]);
            if (!vis[sa[i]] && vis[ls + ls + 1 - sa[i] + 1]) {
                // LCP between this suffix and its mirrored counterpart.
                int t = sa.lcp(sa[i], ls + ls + 1 - sa[i] + 1);
                if (t <= cnt) continue;
                // Count only the newly matched length beyond what is
                // already accounted for, then raise the running floor.
                ans += t - cnt;
                cnt = t;
            } else {
                vis[sa[i]] = 1;
            }
        }
        printf("%d\n", ans);
    }
    return 0;
}
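The `ans += t - cnt; cnt = t;` step above is the standard suffix-array distinct-counting idiom. Isolated from the palindrome bookkeeping, a minimal sketch of the same idea for plain distinct-substring counting (assumes a precomputed suffix array and height array; names are illustrative, not from the snippet above):

#include <string>
#include <vector>

// Count distinct substrings of s, given its suffix array sa and the
// height array (height[i] = LCP of the suffixes at ranks i-1 and i,
// with height[0] = 0). Suffix sa[i] contributes one new substring for
// each character beyond the prefix it shares with the previous suffix.
long long countDistinctSubstrings(const std::string& s,
                                  const std::vector<int>& sa,
                                  const std::vector<int>& height) {
    long long total = 0;
    const int n = (int)s.size();
    for (int i = 0; i < n; ++i)
        total += (long long)(n - sa[i]) - height[i];
    return total;
}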
int main() {
    SuffixArray in;
    // Read lines until EOF or a blank line.
    while (fgets(in.str, sizeof(in.str), stdin)) {
        in.str[strcspn(in.str, "\n")] = '\0'; // strip trailing newline
        if (in.str[0] == '\0') break;
        // Remove spaces in place.
        int n = 0;
        for (int i = 0; in.str[i]; i++)
            if (in.str[i] != ' ') in.str[n++] = in.str[i];
        in.str[n] = '\0';
        in.build();
        in.build_h();
        if (n == 0) puts("0");
        // For each length i, the longest run of adjacent suffixes whose
        // LCP is >= i yields a substring of length i occurring run+1 times.
        for (int i = 1; i <= in.n; i++) {
            int cnt = 0, ret = 0;
            for (int j = 0; j < in.n; j++) {
                if (in.h[j] >= i)
                    cnt++;
                else
                    ret = max(ret, cnt), cnt = 0;
            }
            ret = max(ret, cnt);
            if (ret <= 0) break;
            printf("%d\n", ret + 1);
        }
        puts("");
    }
    return 0;
}
// Validate that the sampled suffix array values are correct.
void SampledSuffixArray::validate(const std::string filename, const BWT* pBWT)
{
    ReadTable* pRT = new ReadTable(filename);
    SuffixArray* pSA = new SuffixArray(pRT, 1);

    std::cout << "Validating sampled suffix array entries\n";

    for (size_t i = 0; i < pSA->getSize(); ++i) {
        SAElem calc = calcSA(i, pBWT);
        SAElem real = pSA->get(i);
        if (calc.getID() != real.getID() || calc.getPos() != real.getPos()) {
            std::cout << "Error: SA elements do not match for " << i << "\n";
            std::cout << "Calc: " << calc << "\n";
            std::cout << "Real: " << real << "\n";
            exit(1);
        }
    }

    std::cout << "All calculated SA values are correct\n";

    delete pRT;
    delete pSA;
}
int main() {
    scanf("%s", buf);
    SuffixArray sa;
    sa.create(buf);
    sa.output();
    return 0;
}
int main() {
    int n, m, l, r;
    set<pair<int, int> > st;
    scanf("%d%d", &n, &m);
    scanf("%s", buf);
    sa.init(n, buf);
    rmq.init(n, sa.height);
    l = 0;
    r = 1;
    for (int i = 0; i < m; ++i) {
        // Each operation moves one endpoint of the window [l, r).
        scanf("%s", op);
        int& t = op[0] == 'L' ? l : r;
        t += op[1] == '+' ? 1 : -1;
        // Binary search for the smallest rank whose LCP with suffix l
        // covers the whole window; (rank, length) canonically identifies
        // the substring, so the set counts distinct windows.
        int k = sa.rank[l];
        int lo = 0, hi = k;
        while (lo < hi) {
            int mi = (lo + hi) / 2;
            if (rmq.value(mi + 1, k + 1) >= r - l)
                hi = mi;
            else
                lo = mi + 1;
        }
        st.insert(make_pair(hi, r - l));
    }
    printf("%d\n", (int)st.size());
    return 0;
}
int main() {
    int i, first = 0;
    while (scanf("%d", &n) == 1 && n) {
        if (first) puts("");
        first = 1;
        int m = 0;
        half = n / 2; // answer must occur in more than half of the strings
        int mxlen = 0;
        // Concatenate all words, separated by '$', recording which word
        // each position came from.
        for (i = 0; i < n; i++) {
            scanf("%s", SA.str + m);
            int cnt = 0;
            while (SA.str[m])
                Sfrom[m] = i, m++, cnt++;
            if (cnt > mxlen) mxlen = cnt;
            Wlen[i] = cnt;
            SA.str[m++] = '$';
            SA.str[m] = 0;
        }
        SA.str[m - 1] = '\0'; // drop the trailing separator
        if (n == 1) {
            puts(SA.str);
            continue;
        }
        SA.build();
        SA.build_h();
        // Binary search on the answer length.
        int l = 1, r = mxlen;
        int res = 0;
        while (l <= r) {
            m = (l + r) / 2;
            if (check(m, 0)) {
                l = m + 1;
                if (m > res) res = m;
            } else {
                r = m - 1;
            }
        }
        if (res == 0)
            puts("?");
        else
            check(res, 1); // second pass prints the answers of length res
    }
    return 0;
}
void test1(wordstring& ws, intstring& ids) {
    struct timeval start;
    struct timeval end;
    SuffixArray sa;

    // Time the suffix-array construction, reported in seconds.
    gettimeofday(&start, NULL);
    sa.DA(ids);
    gettimeofday(&end, NULL);
    double dur = (end.tv_sec - start.tv_sec) * 1000000.0
               + (end.tv_usec - start.tv_usec);
    cout << dur / 1000000 << endl;
    cerr << endl;

    vector<RepeatSubString> repeat;
    CaculateRepeatSubString(sa, repeat, 5);
    for (size_t i = 0; i < repeat.size(); ++i) {
        cout << repeat[i] << repeat[i].ToString(ws) << endl;
    }
}
void buildIndexForTable(std::string prefix, const ReadTable* pRT, bool isReverse)
{
    // Create the suffix array from the read table.
    SuffixArray* pSA = new SuffixArray(pRT, opt::numThreads);

    if (opt::validate) {
        std::cout << "Validating suffix array\n";
        pSA->validate(pRT);
    }

    std::string bwt_filename = prefix + (!isReverse ? BWT_EXT : RBWT_EXT);
    pSA->writeBWT(bwt_filename, pRT);

    std::string sufidx_filename = prefix + (!isReverse ? SAI_EXT : RSAI_EXT);
    pSA->writeIndex(sufidx_filename);

    delete pSA;
    pSA = NULL;
}
void print_ranks(std::string const& s, SuffixArray& sa) {
    int n = sa.dp.empty() ? 0 : sa.dp[0].size();
    // One row per text position: the character, then its rank in each
    // dp column.
    for (int r = 0; r < n; ++r) {
        printf("%3c", s[r]);
        for (size_t c = 0; c < sa.dp.size(); ++c) {
            printf("%4d", sa.get_dp(c)[r]);
        }
        printf("\n");
    }
}
static Int find_max_length(const SuffixArray& sa, const String& s)
{
    // Sliding window: if s.substr(i, len + 1) occurs in the indexed text,
    // grow the window; otherwise slide it right. Every iteration advances
    // either len or i, so there are at most 2n find() calls.
    Int len = 0;
    int n = s.size();
    for (int i = 0; i + len < n; ) {
        if (sa.find(s.substr(i, len + 1))) {
            len++;
        } else {
            i++;
        }
    }
    return len;
}
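A hedged usage sketch for the helper above, reusing the init_suffix_array wrapper that appears later in this listing (the String/Int typedefs and the SuffixArray::find(substring) -> bool signature are assumptions, not confirmed by the snippet):

// Hypothetical driver: length of the longest substring of `query` that
// also occurs in `text`.
Int longest_shared_substring(const String& text, const String& query)
{
    SuffixArray sa;
    init_suffix_array(text, sa); // helper defined later in this listing
    return find_max_length(sa, query);
}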
void print_suffix_array(std::string const& s, SuffixArray& sa, vi_t* plcp = NULL) {
    vi_t pos;
    sa.sorted_indexes(pos);
    for (size_t i = 0; i < pos.size(); ++i) {
        // Limit each line to 60 characters.
        if (plcp) {
            printf("%3d: [%2d]: %s\n", pos[i], (*plcp)[pos[i]],
                   s.substr(pos[i], 60).c_str());
        } else {
            printf("%3d: %s\n", pos[i], s.substr(pos[i], 60).c_str());
        }
    }
}
// Kasai's algorithm: build the LCP array from a suffix array in O(n).
// vs[n-1] is left as 0 and acts as a unique sentinel terminator, so the
// caller's suffix array must include the empty/sentinel suffix.
template <typename T>
std::vector<int> longest_common_prefix(const T& s, const SuffixArray& sa) {
    const int n = sa.size();
    std::vector<int> vs(n), isa(n), lcp(n - 1);
    for (int i = 0; i + 1 < n; ++i) { vs[i] = s[i]; }
    // Inverse permutation: isa[p] is the rank of the suffix starting at p.
    for (int i = 0; i < n; ++i) { isa[sa[i]] = i; }
    int h = 0;
    for (int i = 0; i < n; ++i) {
        const int j = isa[i];
        if (j > 0) {
            const int k = j - 1;
            while (vs[sa[j] + h] == vs[sa[k] + h]) { ++h; }
            lcp[k] = h;
            // The LCP can drop by at most one when moving from suffix i
            // to suffix i + 1, which is what makes the scan linear.
            if (h > 0) { --h; }
        }
    }
    return lcp;
}
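A minimal usage sketch for the routine above, assuming a SuffixArray type constructible from the text and exposing size() and operator[] (the constructor is an assumption; only size() and operator[] are implied by the code):

#include <iostream>
#include <string>
#include <vector>

int main() {
    // The suffix array must include the empty/sentinel suffix, so
    // sa.size() == s.size() + 1 as the routine expects.
    std::string s = "banana";
    SuffixArray sa(s); // assumed constructor
    std::vector<int> lcp = longest_common_prefix(s, sa);
    for (std::size_t i = 0; i < lcp.size(); ++i)
        std::cout << lcp[i] << " ";
    std::cout << "\n"; // for "banana": 0 1 3 0 0 2
    return 0;
}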
/** Add the overlaps of vseq to the graph. */
static void addOverlapsSA(Graph& g, const SuffixArray& sa,
        ContigNode v, const string& vseq)
{
    assert(!vseq.empty());
    set<ContigNode> seen;
    typedef SuffixArray::const_iterator It;
    // Query progressively shorter prefixes of vseq, longest first.
    for (string q(vseq, 0, vseq.size() - 1);
            q.size() >= opt::minOverlap; chop(q)) {
        pair<It, It> range = sa.equal_range(q);
        for (It it = range.first; it != range.second; ++it) {
            ContigNode u(it->second);
            if (seen.insert(u).second) {
                // Add the longest overlap between two vertices.
                unsigned overlap = it->first.size();
                add_edge(u, v, -overlap, static_cast<DG&>(g));
            }
        }
    }
}
void RunTest(SuffixArray& index, const context_t* context,
             const unordered_map<vector<wid_t>, size_t, phrase_hash>& ngrams,
             vector<speed_perf_t>& speedData) {
    size_t queryCount = 0;
    for (auto entry = ngrams.begin(); entry != ngrams.end(); ++entry) {
        Collector* collector = index.NewCollector(context, true);
        // Extend the collector one word at a time, timing each step.
        for (size_t i = 0; i < entry->first.size(); ++i) {
            double begin = GetTime();
            vector<sample_t> samples;
            collector->Extend(entry->first[i], 1000, samples);
            speedData[i].seconds += GetElapsedTime(begin);
            speedData[i].requests++;

            queryCount++;
            if (queryCount % 10000 == 0)
                cout << "." << flush;
        }
        delete collector;
    }
}
void ILCPConstruct(const SuffixArray& sa,
                   std::vector<SuffixArray::Index>* ilcp) {
    typedef SuffixArray::Index Index;
    std::vector<Index>& text_lcp = *ilcp;
    text_lcp.resize(sa.size());
    Index start = 0;
    int num_docs = 0;
    const char* text = sa.text();
    // Build a per-document LCP: documents are separated by bytes <= 1.
    for (Index i = 0; i <= (Index)sa.size(); ++i) {
        if (i == (Index)sa.size() || (unsigned char)text[i] <= 1) {
            const char* doc = text + start;
            Index doc_len = i - start;
            SuffixArray doc_sa(doc, doc_len);
            for (Index j = 0; j < doc_len; ++j) {
                Index p = doc_sa.sa(j);
                Index lcp = doc_sa.lcp(j);
                text_lcp[start + p] = lcp;
            }
            num_docs++;
            start = i;
        }
    }
    // Permute in place so that ilcp[i] = text_lcp[sa.sa(i)], walking each
    // cycle of the permutation exactly once.
    std::vector<bool> visited(sa.size());
    for (Index i = 0; i < (Index)sa.size(); ++i) {
        if (!visited[i]) {
            int j = i;
            while (true) {
                visited[j] = 1;
                Index to = sa.sa(j);
                if (visited[to]) break;
                std::swap(text_lcp[j], text_lcp[to]);
                j = to;
            }
        }
    }
}
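The second loop above is a generic in-place permutation walk. A standalone minimal sketch of the same technique, with hypothetical names (not part of the original):

#include <utility>
#include <vector>

// Apply values[i] <- values[perm[i]] in place. Each cycle of perm is
// walked once; the visited bitmap replaces a full scratch copy.
void applyPermutationInPlace(std::vector<int>& values,
                             const std::vector<int>& perm) {
    std::vector<bool> visited(values.size(), false);
    for (std::size_t i = 0; i < values.size(); ++i) {
        std::size_t j = i;
        while (!visited[j]) {
            visited[j] = true;
            std::size_t to = perm[j];
            if (visited[to]) break;
            std::swap(values[j], values[to]);
            j = to;
        }
    }
}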
std::string parseDupHits(const StringVector& hitsFilenames, const std::string& out_prefix)
{
    // Load the suffix array index and the reverse suffix array index.
    // Note these are not the full suffix arrays.
    SuffixArray* pFwdSAI = new SuffixArray(opt::prefix + SAI_EXT);
    SuffixArray* pRevSAI = new SuffixArray(opt::prefix + RSAI_EXT);

    // Load the read table to look up the lengths of the reads and their ids.
    // When rmduping a set of reads, the ReadInfoTable can actually be larger
    // than the BWT if the names of the reads are very long. Previously, when
    // two reads were duplicates, the read with the lexicographically lower
    // read name was chosen to be kept. To save memory here, we break ties
    // using the index in the ReadInfoTable instead. This allows us to avoid
    // loading the read names.
    ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pFwdSAI->getNumStrings(), RIO_NUMERICID);

    std::string outFile = out_prefix + ".fa";
    std::string dupFile = out_prefix + ".dups.fa";
    std::ostream* pWriter = createWriter(outFile);
    std::ostream* pDupWriter = createWriter(dupFile);

    size_t substringRemoved = 0;
    size_t identicalRemoved = 0;
    size_t kept = 0;
    size_t buffer_size = SequenceProcessFramework::BUFFER_SIZE;

    // The reads must be output in their original ordering. The hits are in
    // blocks of buffer_size items. We read buffer_size items from the first
    // hits file, then buffer_size from the second and so on until all the
    // hits have been processed.
    size_t num_files = hitsFilenames.size();
    std::vector<std::istream*> reader_vec(num_files, 0);

    for (size_t i = 0; i < num_files; ++i) {
        std::cout << "Opening " << hitsFilenames[i] << "\n";
        reader_vec[i] = createReader(hitsFilenames[i]);
    }

    bool done = false;
    size_t currReaderIdx = 0;
    size_t numRead = 0;
    size_t numReadersDone = 0;
    std::string line;

    while (!done) {
        // Parse a line from the current file
        bool valid = getline(*reader_vec[currReaderIdx], line);
        ++numRead;

        // Deal with switching the active reader and the end of files
        if (!valid || numRead == buffer_size) {
            // Switch the reader
            currReaderIdx = (currReaderIdx + 1) % num_files;
            numRead = 0;

            // Break once all the readers are invalid
            if (!valid) {
                ++numReadersDone;
                if (numReadersDone == num_files) {
                    done = true;
                    break;
                }
            }
        }

        // Parse the data
        if (valid) {
            std::string id;
            std::string sequence;
            std::string hitsStr;
            size_t readIdx;
            size_t numCopies;
            bool isSubstring;

            std::stringstream parser(line);
            parser >> id;
            parser >> sequence;
            getline(parser, hitsStr);

            OverlapVector ov;
            OverlapCommon::parseHitsString(hitsStr, pRIT, pRIT, pFwdSAI, pRevSAI,
                                           true, readIdx, numCopies, ov, isSubstring);

            bool isContained = false;
            if (isSubstring) {
                ++substringRemoved;
                isContained = true;
            } else {
                for (OverlapVector::iterator iter = ov.begin(); iter != ov.end(); ++iter) {
                    if (iter->isContainment() && iter->getContainedIdx() == 0) {
                        // This read is contained by some other read
                        ++identicalRemoved;
                        isContained = true;
                        break;
                    }
                }
            }

            SeqItem item = {id, sequence};
            std::stringstream meta;
            meta << id << " NumDuplicates=" << numCopies;

            if (isContained) {
                // The read's index in the sequence database is needed when
                // removing it from the FM-index. In the output fasta, we set
                // the read's ID to be the index and record its old id in the
                // fasta header.
                std::stringstream newID;
                newID << item.id << ",seqrank=" << readIdx;
                item.id = newID.str();

                // Write some metadata with the fasta record
                item.write(*pDupWriter, meta.str());
            } else {
                ++kept;
                // Write the read
                item.write(*pWriter, meta.str());
            }
        }
    }

    for (size_t i = 0; i < num_files; ++i) {
        delete reader_vec[i];
        unlink(hitsFilenames[i].c_str());
    }

    printf("[%s] Removed %zu substring reads\n", PROGRAM_IDENT, substringRemoved);
    printf("[%s] Removed %zu identical reads\n", PROGRAM_IDENT, identicalRemoved);
    printf("[%s] Kept %zu reads\n", PROGRAM_IDENT, kept);

    // Delete allocated data
    delete pFwdSAI;
    delete pRevSAI;
    delete pRIT;
    delete pWriter;
    delete pDupWriter;
    return dupFile;
}
// The algorithm is as follows. We create M BWTs for subsets of the input
// reads. These are created independently and written to disk. They are
// then merged either sequentially or pairwise to create the final BWT.
void buildBWTDisk(const std::string& in_filename, const std::string& out_prefix,
                  const std::string& bwt_extension, const std::string& sai_extension,
                  bool doReverse, int numThreads, int numReadsPerBatch, int storageLevel)
{
    size_t MAX_READS_PER_GROUP = numReadsPerBatch;

    SeqReader* pReader = new SeqReader(in_filename);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    ReadTable* pCurrRT = new ReadTable;
    bool done = false;
    while (!done) {
        done = !pReader->get(record);

        if (!done) {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if (doReverse)
                item.seq.reverse();
            pCurrRT->addRead(item);
            ++numReadTotal;
        }

        if (pCurrRT->getCount() >= MAX_READS_PER_GROUP || (done && pCurrRT->getCount() > 0)) {
            // Compute the SA and BWT for this group
            SuffixArray* pSA = new SuffixArray(pCurrRT, numThreads);

            // Write the BWT to disk
            std::string bwt_temp_filename = makeTempName(out_prefix, groupID, bwt_extension);
            pSA->writeBWT(bwt_temp_filename, pCurrRT);

            std::string sai_temp_filename = makeTempName(out_prefix, groupID, sai_extension);
            pSA->writeIndex(sai_temp_filename);

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = in_filename;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Cleanup
            delete pSA;

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            pCurrRT->clear();
        }
    }
    delete pCurrRT;
    delete pReader;

    // Phase 2: Pairwise merge the BWTs
    int round = 1;
    MergeVector nextMergeRound;
    while (mergeVector.size() > 1) {
        std::cout << "Starting round " << round << "\n";
        pReader = new SeqReader(in_filename);
        for (size_t i = 0; i < mergeVector.size(); i += 2) {
            if (i + 1 != mergeVector.size()) {
                std::string bwt_merged_name = makeTempName(out_prefix, groupID, bwt_extension);
                std::string sai_merged_name = makeTempName(out_prefix, groupID, sai_extension);

                MergeItem item1 = mergeVector[i];
                MergeItem item2 = mergeVector[i + 1];

                // Perform the actual merge
                int64_t curr_idx = merge(pReader, item1, item2, bwt_merged_name,
                                         sai_merged_name, doReverse, numThreads, storageLevel);

                // pReader now points to the end of item1's block of reads.
                // Skip item2's reads
                assert(curr_idx == item2.start_index);
                while (curr_idx <= item2.end_index) {
                    bool eof = !pReader->get(record);
                    assert(!eof);
                    (void)eof;
                    ++curr_idx;
                }

                // Create the merged mergeItem to use in the next round
                MergeItem merged;
                merged.start_index = item1.start_index;
                merged.end_index = item2.end_index;
                merged.bwt_filename = bwt_merged_name;
                merged.sai_filename = sai_merged_name;
                nextMergeRound.push_back(merged);

                // Done with the temp files, remove them
                unlink(item1.bwt_filename.c_str());
                unlink(item2.bwt_filename.c_str());
                unlink(item1.sai_filename.c_str());
                unlink(item2.sai_filename.c_str());

                ++groupID;
            } else {
                // Singleton, pass through to the next round
                nextMergeRound.push_back(mergeVector[i]);
            }
        }
        delete pReader;
        mergeVector.clear();
        mergeVector.swap(nextMergeRound);
        ++round;
    }
    assert(mergeVector.size() == 1);

    // Done, rename the files to their final name
    std::stringstream bwt_ss;
    bwt_ss << out_prefix << bwt_extension << (USE_GZ ? ".gz" : "");
    std::string bwt_final_filename = bwt_ss.str();
    rename(mergeVector.front().bwt_filename.c_str(), bwt_final_filename.c_str());

    std::stringstream sai_ss;
    sai_ss << out_prefix << sai_extension << (USE_GZ ? ".gz" : "");
    std::string sai_final_filename = sai_ss.str();
    rename(mergeVector.front().sai_filename.c_str(), sai_final_filename.c_str());
}
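Phase 2 above is a generic pairwise reduction once the BWT machinery is stripped away. A minimal sketch of just that control flow, with hypothetical generic names (not part of the original):

#include <cstddef>
#include <vector>

// Reduce a non-empty list of items to one by repeatedly merging adjacent
// pairs; an odd item passes through to the next round unchanged.
template <typename T, typename MergeFn>
T pairwiseReduce(std::vector<T> items, MergeFn mergeFn) {
    while (items.size() > 1) {
        std::vector<T> next;
        for (std::size_t i = 0; i < items.size(); i += 2) {
            if (i + 1 != items.size())
                next.push_back(mergeFn(items[i], items[i + 1]));
            else
                next.push_back(items[i]); // singleton passes through
        }
        items.swap(next);
    }
    return items.front();
}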
static void init_suffix_array(const String& s, SuffixArray& sa)
{
    sa.init(s);
    sa.build();
    sa.buildHeight();
}
int main(int argc, char* argv[])
{
    // handle parameters
    string query;
    string fileNameSuffix;
    string fileNameSource;
    int loadFlag = false;
    int saveFlag = false;
    int createFlag = false;
    int queryFlag = false;
    int stdioFlag = false; // receive requests from STDIN, respond to STDOUT
    string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n";
    while (1) {
        static struct option long_options[] = {
            {"load", required_argument, 0, 'l'},
            {"save", required_argument, 0, 's'},
            {"create", required_argument, 0, 'c'},
            {"query", required_argument, 0, 'q'},
            {"stdio", no_argument, 0, 'i'},
            {0, 0, 0, 0}
        };
        int option_index = 0;
        int c = getopt_long(argc, argv, "l:s:c:q:i", long_options, &option_index);
        if (c == -1) break;
        switch (c) {
        case 'l': fileNameSuffix = string(optarg); loadFlag = true; break;
        case 's': fileNameSuffix = string(optarg); saveFlag = true; break;
        case 'c': fileNameSource = string(optarg); createFlag = true; break;
        case 'q': query = string(optarg); queryFlag = true; break;
        case 'i': stdioFlag = true; break;
        default: cerr << info; exit(1);
        }
    }
    if (stdioFlag) {
        queryFlag = true;
    }

    // check if parameter settings are legal
    if (saveFlag && !createFlag) {
        cerr << "error: cannot save without creating\n" << info;
        exit(1);
    }
    if (saveFlag && loadFlag) {
        cerr << "error: cannot load and save at the same time\n" << info;
        exit(1);
    }
    if (!loadFlag && !createFlag) {
        cerr << "error: neither load nor create - i have no info!\n" << info;
        exit(1);
    }

    // do your thing
    if (createFlag) {
        cerr << "will create\n";
        cerr << "corpus is in " << fileNameSource << endl;
        suffixArray.Create(fileNameSource);
        if (saveFlag) {
            suffixArray.Save(fileNameSuffix);
            cerr << "will save in " << fileNameSuffix << endl;
        }
    }
    if (loadFlag) {
        cerr << "will load from " << fileNameSuffix << endl;
        suffixArray.Load(fileNameSuffix);
    }
    if (stdioFlag) {
        while (true) {
            string query;
            if (getline(cin, query, '\n').eof()) {
                return 0;
            }
            cout << lookup(query) << endl;
        }
    } else if (queryFlag) {
        cout << lookup(query) << endl;
    }
    return 0;
}
size_t lookup(string query)
{
    cerr << "query is " << query << endl;
    vector<string> queryString = tokenize(query.c_str());
    return suffixArray.Count(queryString);
}
int main(int argc, char* argv[])
{
    // handle parameters
    string query;
    string fileNameSuffix;
    string fileNameSource;
    string fileNameTarget = "";
    string fileNameAlignment = "";
    int loadFlag = false;
    int saveFlag = false;
    int createFlag = false;
    int queryFlag = false;
    int htmlFlag = false;
    string info = "usage: suffix-query\n\t[--load file]\n\t[--save file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n";
    while (1) {
        static struct option long_options[] = {
            {"load", required_argument, 0, 'l'},
            {"save", required_argument, 0, 's'},
            {"create", required_argument, 0, 'c'},
            {"query", required_argument, 0, 'q'},
            {"target", required_argument, 0, 't'},
            {"alignment", required_argument, 0, 'a'},
            {"html", no_argument, &htmlFlag, 1}, // getopt stores 1 when --html is given
            {0, 0, 0, 0}
        };
        int option_index = 0;
        int c = getopt_long(argc, argv, "l:s:c:q:t:a:h", long_options, &option_index);
        if (c == -1) break;
        switch (c) {
        case 0: break; // long option that only set a flag (--html)
        case 'l': fileNameSuffix = string(optarg); loadFlag = true; break;
        case 't': fileNameTarget = string(optarg); break;
        case 'a': fileNameAlignment = string(optarg); break;
        case 's': fileNameSuffix = string(optarg); saveFlag = true; break;
        case 'c': fileNameSource = string(optarg); createFlag = true; break;
        case 'q': query = string(optarg); queryFlag = true; break;
        default: cerr << info; exit(1);
        }
    }

    // check if parameter settings are legal
    if (saveFlag && !createFlag) {
        cerr << "error: cannot save without creating\n" << info;
        exit(1);
    }
    if (saveFlag && loadFlag) {
        cerr << "error: cannot load and save at the same time\n" << info;
        exit(1);
    }
    if (!loadFlag && !createFlag) {
        cerr << "error: neither load nor create - i have no info!\n" << info;
        exit(1);
    }
    if (createFlag && (fileNameTarget == "" || fileNameAlignment == "")) {
        cerr << "error: i have no target corpus or alignment\n" << info;
        exit(1);
    }

    // do your thing
    SuffixArray suffixArray;
    TargetCorpus targetCorpus;
    Alignment alignment;
    if (createFlag) {
        cerr << "will create\n";
        cerr << "source corpus is in " << fileNameSource << endl;
        suffixArray.Create(fileNameSource);
        cerr << "target corpus is in " << fileNameTarget << endl;
        targetCorpus.Create(fileNameTarget);
        cerr << "alignment is in " << fileNameAlignment << endl;
        alignment.Create(fileNameAlignment);
        if (saveFlag) {
            suffixArray.Save(fileNameSuffix);
            targetCorpus.Save(fileNameSuffix);
            alignment.Save(fileNameSuffix);
            cerr << "will save in " << fileNameSuffix << endl;
        }
    }
    if (loadFlag) {
        cerr << "will load from " << fileNameSuffix << endl;
        suffixArray.Load(fileNameSuffix);
        targetCorpus.Load(fileNameSuffix);
        alignment.Load(fileNameSuffix);
    }
    if (queryFlag) {
        cerr << "query is " << query << endl;
        vector<string> queryString = alignment.Tokenize(query.c_str());
        PhrasePairCollection ppCollection(&suffixArray, &targetCorpus, &alignment);
        ppCollection.GetCollection(queryString);
        ppCollection.PrintHTML();
    }
    return 0;
}
void print_pairwise_lcp(std::string const& s, SuffixArray& sa) {
    vi_t pos, lcp;
    sa.lcp_pairwise(pos, lcp);
    print_suffix_array(s, sa, &lcp);
}
int main(int argc, char* argv[])
{
    // handle parameters
    string query;
    string fileNameSuffix;
    string fileNameSource;
    string fileNameTarget = "";
    string fileNameAlignment = "";
    int loadFlag = false;
    int saveFlag = false;
    int createFlag = false;
    int queryFlag = false;
    int htmlFlag = false;   // output as HTML
    int prettyFlag = false; // output readable on screen
    int stdioFlag = false;  // receive requests from STDIN, respond to STDOUT
    int max_translation = 20;
    int max_example = 50;
    string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n\t[--translations count]\n\t[--examples count]\n\t[--html]\n\t[--stdio]\n";
    while (1) {
        static struct option long_options[] = {
            {"load", required_argument, 0, 'l'},
            {"save", required_argument, 0, 's'},
            {"create", required_argument, 0, 'c'},
            {"query", required_argument, 0, 'q'},
            {"target", required_argument, 0, 't'},
            {"alignment", required_argument, 0, 'a'},
            {"html", no_argument, 0, 'h'},
            {"pretty", no_argument, 0, 'p'},
            {"stdio", no_argument, 0, 'i'},
            {"translations", required_argument, 0, 'o'},
            {"examples", required_argument, 0, 'e'},
            {0, 0, 0, 0}
        };
        int option_index = 0;
        int c = getopt_long(argc, argv, "l:s:c:q:Q:t:a:hpio:e:", long_options, &option_index);
        if (c == -1) break;
        switch (c) {
        case 'l': fileNameSuffix = string(optarg); loadFlag = true; break;
        case 't': fileNameTarget = string(optarg); break;
        case 'a': fileNameAlignment = string(optarg); break;
        case 's': fileNameSuffix = string(optarg); saveFlag = true; break;
        case 'c': fileNameSource = string(optarg); createFlag = true; break;
        case 'Q': query = base64_decode(string(optarg)); queryFlag = true; break;
        case 'q': query = string(optarg); queryFlag = true; break;
        case 'o': max_translation = atoi(optarg); break;
        case 'e': max_example = atoi(optarg); break;
        case 'p': prettyFlag = true; break;
        case 'h': htmlFlag = true; break;
        case 'i': stdioFlag = true; break;
        default: cerr << info; exit(1);
        }
    }
    if (stdioFlag) {
        queryFlag = true;
    }

    // check if parameter settings are legal
    if (saveFlag && !createFlag) {
        cerr << "error: cannot save without creating\n" << info;
        exit(1);
    }
    if (saveFlag && loadFlag) {
        cerr << "error: cannot load and save at the same time\n" << info;
        exit(1);
    }
    if (!loadFlag && !createFlag) {
        cerr << "error: neither load nor create - i have no info!\n" << info;
        exit(1);
    }
    if (createFlag && (fileNameTarget == "" || fileNameAlignment == "")) {
        cerr << "error: i have no target corpus or alignment\n" << info;
        exit(1);
    }

    // do your thing
    SuffixArray suffixArray;
    TargetCorpus targetCorpus;
    Alignment alignment;
    if (createFlag) {
        cerr << "will create\n";
        cerr << "source corpus is in " << fileNameSource << endl;
        suffixArray.Create(fileNameSource);
        cerr << "target corpus is in " << fileNameTarget << endl;
        targetCorpus.Create(fileNameTarget);
        cerr << "alignment is in " << fileNameAlignment << endl;
        alignment.Create(fileNameAlignment);
        if (saveFlag) {
            suffixArray.Save(fileNameSuffix);
            targetCorpus.Save(fileNameSuffix);
            alignment.Save(fileNameSuffix);
            cerr << "will save in " << fileNameSuffix << endl;
        }
    }
    if (loadFlag) {
        cerr << "will load from " << fileNameSuffix << endl;
        suffixArray.Load(fileNameSuffix);
        targetCorpus.Load(fileNameSuffix);
        alignment.Load(fileNameSuffix);
    }
    if (stdioFlag) {
        cout << "-|||- BICONCOR START -|||-" << endl << flush;
        while (true) {
            string query;
            if (getline(cin, query, '\n').eof()) {
                return 0;
            }
            vector<string> queryString = alignment.Tokenize(query.c_str());
            PhrasePairCollection ppCollection(&suffixArray, &targetCorpus, &alignment,
                                              max_translation, max_example);
            int total = ppCollection.GetCollection(queryString);
            cout << "TOTAL: " << total << endl;
            if (htmlFlag) {
                ppCollection.PrintHTML();
            } else {
                ppCollection.Print(prettyFlag);
            }
            cout << "-|||- BICONCOR END -|||-" << endl << flush;
        }
    } else if (queryFlag) {
        cerr << "query is " << query << endl;
        vector<string> queryString = alignment.Tokenize(query.c_str());
        PhrasePairCollection ppCollection(&suffixArray, &targetCorpus, &alignment,
                                          max_translation, max_example);
        ppCollection.GetCollection(queryString);
        if (htmlFlag) {
            ppCollection.PrintHTML();
        } else {
            ppCollection.Print(prettyFlag);
        }
    }
    return 0;
}
int main(int argc, char* argv[])
{
    if (argc < 2) {
        PrintUsage();
        exit(1);
    }
    int argi = 1;
    string saFile = argv[argi++];
    vector<string> inFiles;
    int doBLT = 1;
    int bltPrefixLength = 8;
    int parsingOptions = 0;
    SAType saBuildType = larsson;
    int read4BitCompressed = 0;
    int diffCoverSize = 0;

    while (argi < argc) {
        if (strlen(argv[argi]) > 0 and argv[argi][0] == '-') {
            parsingOptions = 1;
        }
        if (!parsingOptions) {
            inFiles.push_back(argv[argi]);
        } else {
            if (strcmp(argv[argi], "-blt") == 0) {
                doBLT = 1;
                if (argi < argc - 1) {
                    bltPrefixLength = atoi(argv[++argi]);
                    if (bltPrefixLength == 0) {
                        cout << argv[argi] << " is not a valid lookup table length." << endl;
                        exit(1);
                    }
                } else {
                    cout << "Please specify a lookup table length." << endl;
                    exit(1);
                }
            } else if (strcmp(argv[argi], "-mamy") == 0) {
                saBuildType = manmy;
            } else if (strcmp(argv[argi], "-larsson") == 0) {
                saBuildType = larsson;
            } else if (strcmp(argv[argi], "-mcilroy") == 0) {
                saBuildType = mcilroy;
            } else if (strcmp(argv[argi], "-slow") == 0) {
                saBuildType = slow;
            } else if (strcmp(argv[argi], "-kark") == 0) {
                saBuildType = kark;
            } else if (strcmp(argv[argi], "-mafe") == 0) {
                saBuildType = mafe;
            } else if (strcmp(argv[argi], "-welter") == 0) {
                saBuildType = welter;
            } else if (strcmp(argv[argi], "-welterweight") == 0) {
                if (argi < argc - 1) {
                    diffCoverSize = atoi(argv[++argi]);
                } else {
                    cout << "Please specify a difference cover size. Valid values are "
                         << "7, 32, 64, 111, and 2281. Larger values use less memory "
                         << "but may be slower." << endl;
                    exit(1);
                }
                if (!(diffCoverSize == 7 or diffCoverSize == 32 or diffCoverSize == 64
                      or diffCoverSize == 111 or diffCoverSize == 2281)) {
                    cout << "The difference cover size must be one of 7, 32, 64, 111, or 2281." << endl;
                    cout << "Larger values use less space but are slower." << endl;
                    exit(1);
                }
            } else if (strcmp(argv[argi], "-4bit") == 0) {
                read4BitCompressed = 1;
            } else {
                PrintUsage();
                cout << "ERROR, bad option: " << argv[argi] << endl;
                exit(1);
            }
        }
        ++argi;
    }

    if (inFiles.size() == 0) {
        //
        // Special use case: the input file is a fasta file. Write to that file + .sa
        //
        inFiles.push_back(saFile);
        saFile = saFile + ".sa";
    }

    VectorIndex inFileIndex;
    FASTASequence seq;
    CompressedSequence<FASTASequence> compSeq;

    if (read4BitCompressed == 0) {
        for (inFileIndex = 0; inFileIndex < inFiles.size(); ++inFileIndex) {
            FASTAReader reader;
            reader.Init(inFiles[inFileIndex]);
            reader.SetSpacePadding(111);
            if (saBuildType == kark) {
                //
                // The Karkkainen sa building method requires a little extra
                // space at the end of the dna sequence so that counting may
                // be done mod 3 without adding extra logic for boundaries.
                //
            }
            if (inFileIndex == 0) {
                reader.ReadAllSequencesIntoOne(seq);
                reader.Close();
            } else {
                while (reader.ConcatenateNext(seq)) {
                    cout << "added " << seq.title << endl;
                }
            }
        }
        seq.ToThreeBit();
        //seq.ToUpper();
    } else {
        assert(inFiles.size() == 1);
        cout << "reading compressed sequence." << endl;
        compSeq.Read(inFiles[0]);
        seq.seq = compSeq.seq;
        seq.length = compSeq.length;
        compSeq.RemoveCompressionCounts();
        cout << "done." << endl;
    }

    //
    // For now, do not allow creation of suffix arrays on sequences > 4G.
    //
    if (seq.length >= UINT_MAX) {
        cout << "ERROR, references greater than " << UINT_MAX
             << " bases are not supported." << endl;
        cout << "Consider breaking the reference into multiple files, running alignment" << endl;
        cout << "against each file, and merging the results." << endl;
        exit(1);
    }

    vector<int> alphabet;
    SuffixArray<Nucleotide, vector<int> > sa;
    // sa.InitTwoBitDNAAlphabet(alphabet);
    // sa.InitAsciiCharDNAAlphabet(alphabet);
    sa.InitThreeBitDNAAlphabet(alphabet);

    if (saBuildType == manmy) {
        sa.MMBuildSuffixArray(seq.seq, seq.length, alphabet);
    } else if (saBuildType == mcilroy) {
        sa.index = new SAIndex[seq.length + 1];
        DNALength i;
        for (i = 0; i < seq.length; i++) { sa.index[i] = seq.seq[i] + 1; }
        sa.index[seq.length] = 0;
        ssort(sa.index, NULL);
        for (i = 1; i < seq.length + 1; i++) { sa.index[i - 1] = sa.index[i]; }
        sa.length = seq.length;
    } else if (saBuildType == larsson) {
        sa.LarssonBuildSuffixArray(seq.seq, seq.length, alphabet);
    } else if (saBuildType == kark) {
        sa.index = new SAIndex[seq.length];
        seq.ToThreeBit();
        DNALength p;
        for (p = 0; p < seq.length; p++) { seq.seq[p]++; }
        KarkkainenBuildSuffixArray<Nucleotide>(seq.seq, sa.index, seq.length, 5);
        sa.length = seq.length;
    } else if (saBuildType == mafe) {
        // sa.MaFeBuildSuffixArray(seq.seq, seq.length);
    } else if (saBuildType == welter) {
        if (diffCoverSize == 0) {
            sa.LightweightBuildSuffixArray(seq.seq, seq.length);
        } else {
            sa.LightweightBuildSuffixArray(seq.seq, seq.length, diffCoverSize);
        }
    }

    if (doBLT) {
        sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength);
    }
    sa.Write(saFile);
    return 0;
}
void cluster()
{
    BWT* pBWT = new BWT(opt::prefix + BWT_EXT);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT);
    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT, opt::errorRate,
                                                         opt::seedLength, opt::seedStride, true);
    pOverlapper->setExactModeOverlap(opt::errorRate < 0.001f);
    pOverlapper->setExactModeIrreducible(opt::errorRate < 0.001f);

    BitVector markedReads(pBWT->getNumStrings());

    std::string preclustersFile = opt::outFile + ".preclusters";
    std::ostream* pPreWriter = createWriter(preclustersFile);
    ClusterPostProcess postProcessor(pPreWriter, opt::minSize, &markedReads);

    // Set the cluster parameters
    ClusterParameters parameters;
    parameters.pOverlapper = pOverlapper;
    parameters.minOverlap = opt::minOverlap;
    parameters.maxClusterSize = opt::maxSize;
    parameters.maxIterations = opt::maxIterations;
    parameters.pMarkedReads = &markedReads;

    // Read the limit kmer sequences, if provided
    std::set<std::string>* pLimitKmers = NULL;
    if (!opt::limitFile.empty()) {
        // Read in the limit sequences
        pLimitKmers = new std::set<std::string>;
        readLimitKmers(pLimitKmers);
        parameters.pLimitKmers = pLimitKmers;
        parameters.limitK = opt::limitKmer;
    } else {
        parameters.pLimitKmers = NULL;
        parameters.limitK = 0;
    }

    // Make pre-clusters from the reads
    if (opt::numThreads <= 1) {
        printf("[%s] starting serial-mode read clustering\n", PROGRAM_IDENT);
        ClusterProcess processor(parameters);

        // If the extend file is empty, build new clusters
        if (opt::extendFile.empty()) {
            PROCESS_CLUSTER_SERIAL(opt::readsFile, &processor, &postProcessor);
        } else {
            // Process a set of preexisting clusters
            ClusterReader clusterReader(opt::extendFile);
            PROCESS_EXTEND_SERIAL(clusterReader, &processor, &postProcessor);
        }
    } else {
        printf("[%s] starting parallel-mode read clustering computation with %d threads\n",
               PROGRAM_IDENT, opt::numThreads);
        std::vector<ClusterProcess*> processorVector;
        for (int i = 0; i < opt::numThreads; ++i) {
            ClusterProcess* pProcessor = new ClusterProcess(parameters);
            processorVector.push_back(pProcessor);
        }

        if (opt::extendFile.empty()) {
            PROCESS_CLUSTER_PARALLEL(opt::readsFile, processorVector, &postProcessor);
        } else {
            ClusterReader clusterReader(opt::extendFile);
            PROCESS_EXTEND_PARALLEL(clusterReader, processorVector, &postProcessor);
        }

        for (size_t i = 0; i < processorVector.size(); ++i) {
            delete processorVector[i];
            processorVector[i] = NULL;
        }
    }

    delete pPreWriter;
    delete pBWT;
    delete pRBWT;
    delete pOverlapper;

    // Deallocate limit kmers
    if (pLimitKmers != NULL)
        delete pLimitKmers;

    // Open the preclusters file and convert them to read names
    SuffixArray* pFwdSAI = new SuffixArray(opt::prefix + SAI_EXT);
    ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pFwdSAI->getNumStrings());

    size_t seedIdx = 0;
    std::istream* pPreReader = createReader(preclustersFile);
    std::ostream* pClusterWriter = createWriter(opt::outFile);
    std::string line;
    while (getline(*pPreReader, line)) {
        std::stringstream parser(line);
        std::string clusterName;
        std::string readSequence;
        size_t clusterSize;
        int64_t lowIdx;
        int64_t highIdx;
        parser >> clusterName >> clusterSize >> readSequence >> lowIdx >> highIdx;

        if (lowIdx > highIdx) {
            // This is an extra read that is not present in the FM-index.
            // Output a record with a fake read ID.
            *pClusterWriter << clusterName << "\t" << clusterSize << "\tseed-"
                            << seedIdx++ << "\t" << readSequence << "\n";
        } else {
            for (int64_t i = lowIdx; i <= highIdx; ++i) {
                const ReadInfo& targetInfo = pRIT->getReadInfo(pFwdSAI->get(i).getID());
                std::string readName = targetInfo.id;
                *pClusterWriter << clusterName << "\t" << clusterSize << "\t"
                                << readName << "\t" << readSequence << "\n";
            }
        }
    }
    unlink(preclustersFile.c_str());

    delete pFwdSAI;
    delete pRIT;
    delete pPreReader;
    delete pClusterWriter;
}
// Compute the initial BWTs for the input file, split into blocks of
// records, using the SAIS algorithm.
MergeVector computeInitialSAIS(const BWTDiskParameters& parameters)
{
    SeqReader* pReader = new SeqReader(parameters.inFile);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    ReadTable* pCurrRT = new ReadTable;
    bool done = false;
    while (!done) {
        done = !pReader->get(record);

        if (!done) {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if (parameters.bBuildReverse)
                item.seq.reverse();
            pCurrRT->addRead(item);
            ++numReadTotal;
        }

        if (pCurrRT->getCount() >= parameters.numReadsPerBatch || (done && pCurrRT->getCount() > 0)) {
            // Compute the SA and BWT for this group
            SuffixArray* pSA = new SuffixArray(pCurrRT, 1);

            // Write the BWT to disk
            std::string bwt_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.bwtExtension);
            pSA->writeBWT(bwt_temp_filename, pCurrRT);

            std::string sai_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.saiExtension);
            pSA->writeIndex(sai_temp_filename);

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = parameters.inFile;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Cleanup
            delete pSA;

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            pCurrRT->clear();
        }
    }
    delete pCurrRT;
    delete pReader;
    return mergeVector;
}