static std::pair< std::string,std::vector<bool> > circularBwt(std::string const s, uint64_t const beg, uint64_t const len, int64_t const term, uint64_t * zrank = 0) { assert ( beg + len <= s.size() ); std::string const s2 = s+s; // std::cerr << "[V] Block sorting..."; uint8_t const * utext = reinterpret_cast<uint8_t const *>(s2.c_str()) + beg; typedef ::libmaus::suffixsort::DivSufSort<32,uint8_t *,uint8_t const *,int32_t *,int32_t const *> sort_type; typedef sort_type::saidx_t saidx_t; ::libmaus::autoarray::AutoArray<saidx_t> SA(s2.size()-beg); sort_type::divsufsort ( utext, SA.begin() , SA.size() ); // std::cerr << "done." << std::endl; std::vector<bool> gt(len+1,false); std::string bwt(len,' '); uint64_t j = 0; bool gtf = false; for ( uint64_t i = 0; i < SA.size(); ++i ) if ( SA[i] < static_cast<saidx_t>(len) ) { gt [ SA[i] ] = gtf; #if defined(DEBUG) assert ( gtf == (s2.substr(beg+SA[i]) > s2.substr(beg)) ); #endif if ( SA[i] == 0 ) { if ( zrank ) *zrank = j; bwt[j++] = term; // 0; gtf = true; } else { bwt[j++] = utext [ SA[i]-1 ]; } } #if defined(DEBUG) std::cerr << "-----\n\n"; for ( uint64_t i = 0; i < SA.size(); ++i ) if ( SA[i] < len ) std::cerr << "[" << std::setw(2) << std::setfill('0') << SA[i] << std::setw(0) << "] = " << s2.substr(beg+SA[i]) << std::endl; #endif std::string const gtbackleft = s2.substr(beg+len); std::string const gtbackright = s2.substr(beg); #if defined(DEBUG) std::cerr << "gt[len] = " << gtbackleft << " > " << gtbackright << " = " << (gtbackleft > gtbackright) << std::endl; #endif gt[len] = gtbackleft > gtbackright; assert ( j == len ); return std::make_pair(bwt,gt); }
/* Forward transform of "^BANANA|": only the transformed text is checked,
 * the index reported by bwt() is not asserted here. */
TEST bwt_wikipedia_example1() {
    size_t index;
    /* A real array instead of a pointer to a string literal: binding a
     * literal to a mutable char* is ill-formed in C++11 and later and unsafe
     * if the callee ever writes through it. */
    char input[] = "^BANANA|";
    /* One byte per input character plus the terminator added below. */
    char output[sizeof(input)];
    const size_t len = sizeof(input) - 1;

    bwt(input, output, &index, len);
    /* Terminate so the result can be compared as a C string. */
    output[len] = '\0';

    ASSERT_STR_EQ("BNN^AA|A", output);
    PASS();
}
/* Forward transform of "DRDOBBS": checks both the index reported by bwt()
 * and the transformed text. */
TEST bwt_drdobbs() {
    size_t index;
    /* A real array instead of a pointer to a string literal: binding a
     * literal to a mutable char* is ill-formed in C++11 and later and unsafe
     * if the callee ever writes through it. */
    char input[] = "DRDOBBS";
    /* One byte per input character plus the terminator added below. */
    char output[sizeof(input)];
    const size_t len = sizeof(input) - 1;

    bwt(input, output, &index, len);
    /* Terminate so the result can be compared as a C string. */
    output[len] = '\0';

    ASSERT_EQ(3, index);
    ASSERT_STR_EQ("OBRSDDB", output);
    PASS();
}
/* Forward transform of "abracadabra": checks both the index reported by
 * bwt() and the transformed text. */
TEST bwt_abracadabra() {
    size_t index;
    /* A real array instead of a pointer to a string literal: binding a
     * literal to a mutable char* is ill-formed in C++11 and later and unsafe
     * if the callee ever writes through it. */
    char input[] = "abracadabra";
    /* One byte per input character plus the terminator added below. */
    char output[sizeof(input)];
    const size_t len = sizeof(input) - 1;

    bwt(input, output, &index, len);
    /* Terminate so the result can be compared as a C string. */
    output[len] = '\0';

    ASSERT_EQ(2, index);
    ASSERT_STR_EQ("rdarcaaaabb", output);
    PASS();
}
/* Forward transform of "bacabba": checks both the index reported by bwt()
 * and the transformed text. */
TEST bwt_bacabba() {
    size_t index;
    /* A real array instead of a pointer to a string literal: binding a
     * literal to a mutable char* is ill-formed in C++11 and later and unsafe
     * if the callee ever writes through it. */
    char input[] = "bacabba";
    /* One byte per input character plus the terminator added below. */
    char output[sizeof(input)];
    const size_t len = sizeof(input) - 1;

    bwt(input, output, &index, len);
    /* Terminate so the result can be compared as a C string. */
    output[len] = '\0';

    ASSERT_EQ(4, index);
    ASSERT_STR_EQ("bcbbaaa", output);
    PASS();
}
/* Round trip: bwt() followed by ibwt() with the reported index must
 * reproduce the original text. */
TEST ibwt_wikipedia_example2() {
    size_t index;
    /* A real array instead of a pointer to a string literal: binding a
     * literal to a mutable char* is ill-formed in C++11 and later and unsafe
     * if the callee ever writes through it. */
    char input[] = "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES";
    /* Both buffers hold one byte per input character plus a terminator slot. */
    char encoded[sizeof(input)];
    char decoded[sizeof(input)];
    const size_t len = sizeof(input) - 1;

    bwt(input, encoded, &index, len);
    ibwt(encoded, decoded, index, len);
    /* Terminate so the result can be compared as a C string. */
    decoded[len] = '\0';

    ASSERT_STR_EQ(input, decoded);
    PASS();
}
/* Forward transform of the long "SIX.MIXED..." sentence.
 * Currently disabled: the test is skipped before any work is done, so the
 * statements after SKIP() never run. */
TEST bwt_wikipedia_example2() {
    /* Not passing, possibly wrong answer */
    SKIP();

    size_t index;
    /* A real array instead of a pointer to a string literal: binding a
     * literal to a mutable char* is ill-formed in C++11 and later and unsafe
     * if the callee ever writes through it. */
    char input[] = "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES";
    /* One byte per input character plus the terminator added below. */
    char output[sizeof(input)];
    const size_t len = sizeof(input) - 1;

    bwt(input, output, &index, len);
    /* Terminate so the result can be compared as a C string. */
    output[len] = '\0';

    /* NOTE(review): this expected string is the one suspected to be wrong
     * above; verify it against the implementation's ordering convention
     * before re-enabling the test. */
    ASSERT_STR_EQ("TEXYDST.E.IXXIIXXSSMPPS.B..E.S.UESFXDIIOIIIT", output);
    PASS();
}
void CompTool::search(int argc, char** argv){ if(argc < 3) {cout << "File not enough" << endl; exit(1);} const string seq1_file = argv[1]; const string seq2_file = argv[2]; const string seq1_name = basename(seq1_file); const string seq2_name = basename(seq2_file); // Options int kmer_size = 15; int slide_letters = 1; int bwt_interval = 1; int max_num_matches = 1000000000; bool search_forward = true; bool search_reverse = true; if(argc > 3){ for(int i = 3; i < argc; i++){ if (argv[i][1] == 'k') kmer_size = atoi(argv[++i]); else if(argv[i][1] == 'l') slide_letters = atoi(argv[++i]); else if(argv[i][1] == 'i') bwt_interval = atoi(argv[++i]); else if(argv[i][1] == 'm') max_num_matches = atoi(argv[++i]); else if(argv[i][1] == 'f') search_reverse = false; else if(argv[i][1] == 'r') search_forward = false; } } seq1_size_ = get_seq_length(seq1_file) + 1; seq2_size_ = get_seq_length(seq2_file) + 1; seq1_ = read_fasta_and_create_int8_t_array(seq1_file, seq1_size_); seq2_ = read_fasta_and_create_int8_t_array(seq2_file, seq2_size_); int* SA = create_SA(seq1_file, seq1_size_); BWT bwt(seq1_, SA, seq1_size_, num_char_, bwt_interval); if(search_forward) search_forward_matches(seq1_name, seq2_name, SA, bwt, kmer_size, slide_letters, max_num_matches); if(search_reverse) search_reverse_matches(seq1_name, seq2_name, SA, bwt, kmer_size, slide_letters, max_num_matches); delete SA; }