예제 #1
0
			static std::pair< std::string,std::vector<bool> > circularBwt(std::string const s, uint64_t const beg, uint64_t const len, int64_t const term, uint64_t * zrank = 0)
			{
				assert ( beg + len <= s.size() );
				std::string const s2 = s+s;

				// std::cerr << "[V] Block sorting...";
				uint8_t const * utext = reinterpret_cast<uint8_t const *>(s2.c_str()) + beg;
				typedef ::libmaus::suffixsort::DivSufSort<32,uint8_t *,uint8_t const *,int32_t *,int32_t const *> sort_type;
				typedef sort_type::saidx_t saidx_t;
				::libmaus::autoarray::AutoArray<saidx_t> SA(s2.size()-beg);
				sort_type::divsufsort ( utext, SA.begin() , SA.size() );
				// std::cerr << "done." << std::endl;
				
				std::vector<bool> gt(len+1,false);

				std::string bwt(len,' ');
				uint64_t j = 0;
				bool gtf = false;
				for ( uint64_t i = 0; i < SA.size(); ++i )
					if ( SA[i] < static_cast<saidx_t>(len) )
					{
						gt [ SA[i] ] = gtf;
						
						#if defined(DEBUG)
						assert ( gtf == (s2.substr(beg+SA[i]) > s2.substr(beg)) );
						#endif
						
						if ( SA[i] == 0 )
						{
							if ( zrank )
								*zrank = j;
							bwt[j++] = term; // 0;
							gtf = true;
						}
						else
						{
							bwt[j++] = utext [ SA[i]-1 ];
						}
					}
					
				#if defined(DEBUG)
				std::cerr << "-----\n\n";
				for ( uint64_t i = 0; i < SA.size(); ++i )
					if ( SA[i] < len )
						std::cerr << "[" << std::setw(2) << std::setfill('0') << SA[i] << std::setw(0) << "] = " << s2.substr(beg+SA[i]) << std::endl;
				#endif
				
				std::string const gtbackleft = s2.substr(beg+len);
				std::string const gtbackright = s2.substr(beg);
				
				#if defined(DEBUG)
				std::cerr << "gt[len] = " << gtbackleft << " > " << gtbackright << " = " << (gtbackleft > gtbackright) << std::endl;
				#endif
				
				gt[len] = gtbackleft > gtbackright;

				assert ( j == len );

				return std::make_pair(bwt,gt);
			}
/* Transforms "^BANANA|" with bwt() and checks the transformed text only;
 * the index written by bwt() is not inspected in this test. */
TEST bwt_wikipedia_example1()
{
	enum { INPUT_LEN = 8 };
	char *input = "^BANANA|";
	char output[INPUT_LEN + 1]; /* one extra byte for the terminator added below */
	size_t index;

	bwt(input, output, &index, INPUT_LEN);
	output[INPUT_LEN] = '\0';

	ASSERT_STR_EQ("BNN^AA|A", output);
	PASS();
}
/* Transforms "DRDOBBS" with bwt() and checks both the index it reports
 * and the transformed text. */
TEST bwt_drdobbs()
{
	enum { INPUT_LEN = 7 };
	char *input = "DRDOBBS";
	char output[INPUT_LEN + 1]; /* one extra byte for the terminator added below */
	size_t index;

	bwt(input, output, &index, INPUT_LEN);
	output[INPUT_LEN] = '\0';

	ASSERT_EQ(3, index);
	ASSERT_STR_EQ("OBRSDDB", output);
	PASS();
}
/* Transforms "abracadabra" with bwt() and checks both the index it reports
 * and the transformed text. */
TEST bwt_abracadabra()
{
	enum { INPUT_LEN = 11 };
	char *input = "abracadabra";
	char output[INPUT_LEN + 1]; /* one extra byte for the terminator added below */
	size_t index;

	bwt(input, output, &index, INPUT_LEN);
	output[INPUT_LEN] = '\0';

	ASSERT_EQ(2, index);
	ASSERT_STR_EQ("rdarcaaaabb", output);
	PASS();
}
/* Transforms "bacabba" with bwt() and checks both the index it reports
 * and the transformed text. */
TEST bwt_bacabba()
{
	enum { INPUT_LEN = 7 };
	char *input = "bacabba";
	char output[INPUT_LEN + 1]; /* one extra byte for the terminator added below */
	size_t index;

	bwt(input, output, &index, INPUT_LEN);
	output[INPUT_LEN] = '\0';

	ASSERT_EQ(4, index);
	ASSERT_STR_EQ("bcbbaaa", output);
	PASS();
}
/* Round trip: encodes the sentence with bwt(), decodes the result with
 * ibwt() using the index bwt() reported, and expects the original text back. */
TEST ibwt_wikipedia_example2()
{
	enum { INPUT_LEN = 44 };
	char *input = "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES";
	char encoded[INPUT_LEN + 1];
	char decoded[INPUT_LEN + 1]; /* one extra byte for the terminator added below */
	size_t index;

	bwt(input, encoded, &index, INPUT_LEN);
	ibwt(encoded, decoded, index, INPUT_LEN);
	decoded[INPUT_LEN] = '\0';

	ASSERT_STR_EQ(input, decoded);
	PASS();
}
/* Transforms the 44-character sentence with bwt() and checks the transformed
 * text.
 *
 * This test used to be disabled with SKIP() because its expected string was
 * wrong ("...IXXIIXX..." / "...UESFX..."). The expectation below is the last
 * column of the 44 cyclic rotations sorted in byte order, the same convention
 * the other bwt tests in this file rely on ('.' sorts before all letters). */
TEST bwt_wikipedia_example2()
{
	size_t index;
	char *input = "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES";
	char output[45];
	bwt(input, output, &index, 44);
	output[44] = '\0';

	ASSERT_STR_EQ("TEXYDST.E.IXIXIXXSSMPPS.B..E.S.EUSFXDIIOIIIT", output);
	PASS();
}
예제 #8
0
파일: comptool.cpp 프로젝트: stomk/dop
/**
 * Runs a k-mer match search between two sequence files.
 *
 * argv[1] and argv[2] are the two sequence files. Any further arguments are
 * options, recognised by their SECOND character only (the first character is
 * not checked):
 *   k <n>  k-mer size                    (default 15)
 *   l <n>  slide letters                 (default 1)
 *   i <n>  BWT interval                  (default 1)
 *   m <n>  maximum number of matches     (default 1000000000)
 *   f      forward only (disables the reverse search)
 *   r      reverse only (disables the forward search)
 * Unrecognised arguments are ignored. Values are parsed with atoi().
 *
 * Prints a message and exits with status 1 when fewer than two files are
 * given or when an option that needs a value is the last argument.
 */
void CompTool::search(int argc, char** argv){
    if(argc < 3) {cout << "File not enough" << endl; exit(1);}

    const string seq1_file = argv[1];
    const string seq2_file = argv[2];
    const string seq1_name = basename(seq1_file);
    const string seq2_name = basename(seq2_file);

    // Options
    int  kmer_size       = 15;
    int  slide_letters   = 1;
    int  bwt_interval    = 1;
    int  max_num_matches = 1000000000;
    bool search_forward  = true;
    bool search_reverse  = true;

    for(int i = 3; i < argc; i++){
        const char* arg = argv[i];

        // An empty argument has no second character; reading arg[1] would
        // step past the terminator.
        if(arg[0] == '\0') continue;

        const char flag = arg[1];
        const bool takes_value = (flag == 'k' || flag == 'l' || flag == 'i' || flag == 'm');

        if(takes_value){
            // Without this check a trailing option would hand argv[argc]
            // (a null pointer) to atoi().
            if(i + 1 >= argc) {cout << "Option " << arg << " requires a value" << endl; exit(1);}

            const int value = atoi(argv[++i]);
            if     (flag == 'k') kmer_size       = value;
            else if(flag == 'l') slide_letters   = value;
            else if(flag == 'i') bwt_interval    = value;
            else                 max_num_matches = value;
        }
        else if(flag == 'f') search_reverse = false;
        else if(flag == 'r') search_forward = false;
    }

    // Sizes are the reported sequence lengths plus one.
    seq1_size_ = get_seq_length(seq1_file) + 1;
    seq2_size_ = get_seq_length(seq2_file) + 1;
    seq1_ = read_fasta_and_create_int8_t_array(seq1_file, seq1_size_);
    seq2_ = read_fasta_and_create_int8_t_array(seq2_file, seq2_size_);

    int* SA = create_SA(seq1_file, seq1_size_);
    BWT bwt(seq1_, SA, seq1_size_, num_char_, bwt_interval);

    if(search_forward) search_forward_matches(seq1_name, seq2_name, SA, bwt, kmer_size, slide_letters, max_num_matches);
    if(search_reverse) search_reverse_matches(seq1_name, seq2_name, SA, bwt, kmer_size, slide_letters, max_num_matches);

    // NOTE(review): if create_SA allocates the array with new[], this must be
    // delete[] SA — confirm against create_SA before changing.
    delete SA;
}