Example #1
0
void padLeft(std::string &str, const size_t num, const char paddingChar)
{
	if (num > str.size())
		str.insert(0, num - str.size(), paddingChar);
}
Example #2
0
 void write(const std::string& data) final {
     if (!data.empty()) {
         const int nwrite = ::gzwrite(m_gzfile, data.data(), static_cast_with_assert<unsigned int>(data.size()));
         if (nwrite == 0) {
             detail::throw_gzip_error(m_gzfile, "write failed");
         }
     }
 }
SequenceOverlapPairVector KmerOverlaps::retrieveMatches(const std::string& query, size_t k, 
                                                        int min_overlap, double min_identity,
                                                        int bandwidth, const BWTIndexSet& indices)
{
    PROFILE_FUNC("OverlapHaplotypeBuilder::retrieveMatches")
    assert(indices.pBWT != NULL);
    assert(indices.pSSA != NULL);

    static size_t n_calls = 0;
    static size_t n_candidates = 0;
    static size_t n_output = 0;
    static double t_time = 0;
    Timer timer("test", true);

    n_calls++;

    int64_t max_interval_size = 200;
    SequenceOverlapPairVector overlap_vector;

    // Use the FM-index to look up intervals for each kmer of the read. Each index
    // in the interval is stored individually in the KmerMatchMap. We then
    // backtrack to map these kmer indices to read IDs. As reads can share
    // multiple kmers, we use the map to avoid redundant lookups.
    // There is likely a faster algorithm which performs direct decompression
    // of the read sequences without having to expand the intervals to individual
    // indices. The current algorithm suffices for now.
    KmerMatchMap prematchMap;
    size_t num_kmers = query.size() - k + 1;
    for(size_t i = 0; i < num_kmers; ++i)
    {
        std::string kmer = query.substr(i, k);
        BWTInterval interval = BWTAlgorithms::findInterval(indices, kmer);
        if(interval.isValid() && interval.size() < max_interval_size) 
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), false };
                prematchMap.insert(std::make_pair(match, false));
            }
        }

        kmer = reverseComplement(kmer);
        interval = BWTAlgorithms::findInterval(indices, kmer);
        if(interval.isValid() && interval.size() < max_interval_size) 
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), true };
                prematchMap.insert(std::make_pair(match, false));
            }
        }
    }

    // Backtrack through the kmer indices to turn them into read indices.
    // This mirrors the calcSA function in SampledSuffixArray except we mark each entry
    // as visited once it is processed.
    KmerMatchSet matches;
    for(KmerMatchMap::iterator iter = prematchMap.begin(); iter != prematchMap.end(); ++iter)
    {
        // This index has been visited
        if(iter->second)
            continue;

        // Mark this as visited
        iter->second = true;

        // Backtrack the index until we hit the starting symbol
        KmerMatch out_match = iter->first;
        while(1) 
        {
            char b = indices.pBWT->getChar(out_match.index);
            out_match.index = indices.pBWT->getPC(b) + indices.pBWT->getOcc(b, out_match.index - 1);

            // Check if the hash indicates we have visited this index. If so, stop the backtrack
            KmerMatchMap::iterator find_iter = prematchMap.find(out_match);
            if(find_iter != prematchMap.end())
            {
                // We have processed this index already
                if(find_iter->second)
                    break;
                else
                    find_iter->second = true;
            }

            if(b == '$')
            {
                // We've found the lexicographic index for this read. Turn it into a proper ID
                out_match.index = indices.pSSA->lookupLexoRank(out_match.index);
                matches.insert(out_match);
                break;
            }
        }
    }

    // Refine the matches by computing proper overlaps between the sequences
    // Use the overlaps that meet the thresholds to build a multiple alignment
    for(KmerMatchSet::iterator iter = matches.begin(); iter != matches.end(); ++iter)
    {
		
        std::string match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index);
        if(iter->is_reverse)
            match_sequence = reverseComplement(match_sequence);
        
        // Ignore identical matches
        if(match_sequence == query)
            continue;

        // Compute the overlap. If the kmer match occurs a single time in each sequence we use
        // the banded extension overlap strategy. Otherwise we use the slow O(M*N) overlapper.
        SequenceOverlap overlap;
        std::string match_kmer = query.substr(iter->position, k);
        size_t pos_0 = query.find(match_kmer);
        size_t pos_1 = match_sequence.find(match_kmer);
        assert(pos_0 != std::string::npos && pos_1 != std::string::npos);

        // Check for secondary occurrences
        if(query.find(match_kmer, pos_0 + 1) != std::string::npos || 
           match_sequence.find(match_kmer, pos_1 + 1) != std::string::npos) {
            // One of the reads has a second occurrence of the kmer. Use
            // the slow overlapper.
            overlap = Overlapper::computeOverlap(query, match_sequence);
        } else {
            overlap = Overlapper::extendMatch(query, match_sequence, pos_0, pos_1, bandwidth);
        }
		
		
        n_candidates += 1;
        bool bPassedOverlap = overlap.getOverlapLength() >= min_overlap;
        bool bPassedIdentity = overlap.getPercentIdentity() / 100 >= min_identity;

        if(bPassedOverlap && bPassedIdentity)
        {
            SequenceOverlapPair op;
            op.sequence[0] = query;
            op.sequence[1] = match_sequence;
            op.overlap = overlap;
            op.is_reversed = iter->is_reverse;
            overlap_vector.push_back(op);
            n_output += 1;
        }
    }

    t_time += timer.getElapsedCPUTime();

    if(Verbosity::Instance().getPrintLevel() > 6 && n_calls % 100 == 0)
        printf("[kmer overlaps] n: %zu candidates: %zu valid: %zu (%.2lf) time: %.2lfs\n", 
            n_calls, n_candidates, n_output, (double)n_output / n_candidates, t_time);
    return overlap_vector;
}
Example #4
0
void stdupper(std::string &inoutstring)
{
	for (size_t i = 0; i < inoutstring.size(); ++i)
		inoutstring[i] = toupper(inoutstring[i]);
}
SequenceOverlapPairVector KmerOverlaps::PacBioRetrieveMatches(const std::string& query, size_t k, 
                                                        int min_overlap, double min_identity,
                                                        int bandwidth, const BWTIndexSet& indices,
														KmerDistribution& kd, int round)
{
    PROFILE_FUNC("OverlapHaplotypeBuilder::PacBioRetrieveMatches")
    assert(indices.pBWT != NULL);
    assert(indices.pSSA != NULL);
	
	//size_t numStringCount[query.size()+1] = 0;
	
	int64_t intervalSum = 0;
    static size_t n_calls = 0;
    static size_t n_candidates = 0;
    static size_t n_output = 0;
    static double t_time = 0;
	size_t count = 0;
	size_t numKmer = 0;
	size_t numRepeatKmer = 0;
	size_t totalKmer = 0;
	size_t numNoSeedRead = 0;
	size_t repeatCutoff = kd.getRepeatKmerCutoff();
	size_t errorCutoff = kd.getMedian() - kd.getSdv();
    Timer timer("test", true);
	
    n_calls++;
	//std::cout<<"PacBioRetrieveMatches\n";
	std::cout<<"\tk :\t"<<k<<"\n";
    SequenceOverlapPairVector overlap_vector;
	std::vector<long> identityVector(100);
	for(int j = 0;j < identityVector.size(); j++)
		identityVector[j] = 0;
		
    // Use the FM-index to look up intervals for each kmer of the read. Each index
    // in the interval is stored individually in the KmerMatchMap. We then
    // backtrack to map these kmer indices to read IDs. As reads can share
    // multiple kmers, we use the map to avoid redundant lookups.
    // There is likely a faster algorithm which performs direct decompression
    // of the read sequences without having to expand the intervals to individual
    // indices. The current algorithm suffices for now.
    KmerMatchMap prematchMap;
    size_t num_kmers = query.size() - k + 1;
	clock_t search_seeds_s = clock(), search_seeds_e;
    
	
	for(size_t i = 0; i < num_kmers; i++)
    {
        std::string kmer = query.substr(i, k);
        BWTInterval interval = BWTAlgorithms::findInterval(indices, kmer);
		if(interval.upper - interval.lower < errorCutoff)
			numNoSeedRead++;
		if((interval.upper - interval.lower) > 20 && (interval.upper - interval.lower) < repeatCutoff)
		{
			numKmer++;
			totalKmer++;
		}
		
		//To avoid the repeat region
		/*if((interval.upper - interval.lower) > repeatCutoff)
		{	
			numRepeatKmer++;
			totalKmer++;
			continue;
		}
		else
			interval.upper = ((interval.upper - interval.lower)>20)?interval.lower + 20 : interval.upper;*/
		
        if(interval.isValid() && interval.size()) 
        {
			//std::cout<<"\tinterval size : "<<interval.upper - interval.lower<<std::endl;
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), false };
                prematchMap.insert(std::make_pair(match, false));
            }
			intervalSum += interval.upper - interval.lower;
			count++;
        }

        kmer = reverseComplement(kmer);
        interval = BWTAlgorithms::findInterval(indices, kmer);
		interval.upper = ((interval.upper - interval.lower)>20)?interval.lower + 20 : interval.upper;
        if(interval.isValid() && interval.size())
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), true };
                prematchMap.insert(std::make_pair(match, false));
            }
			intervalSum += interval.upper - interval.lower;
			count++;
        }
    }
	if(numNoSeedRead == num_kmers)
		std::cout<<"\tnoSeedRead : 1"<<std::endl;
	std::cout<<"\tnumber of kmer : "<<numKmer<<std::endl;
	std::cout<<"\tnumber of RepeatKmer : "<<numRepeatKmer<<std::endl;
	std::cout<<"\tnumber of totalkmer : "<<totalKmer<<std::endl;
	
	
    // Backtrack through the kmer indices to turn them into read indices.
    // This mirrors the calcSA function in SampledSuffixArray except we mark each entry
    // as visited once it is processed.
	//std::cout<<"\tintervalSum : "<<intervalSum<<std::endl;
	//std::cout<<"\tintervalCount : "<<count<<std::endl;
	std::cout<<"\tprematchMap :\t"<<prematchMap.size()<<std::endl;
    KmerMatchSet matches;
    for(KmerMatchMap::iterator iter = prematchMap.begin(); iter != prematchMap.end(); ++iter)
    {
		//std::cout<<"iter->first.position : "<<iter->first.position<<std::endl;
        // This index has been visited
        if(iter->second)
            continue;

        // Mark this as visited
        iter->second = true;

        // Backtrack the index until we hit the starting symbol
        KmerMatch out_match = iter->first;
        while(1) 
        {
            char b = indices.pBWT->getChar(out_match.index);
            out_match.index = indices.pBWT->getPC(b) + indices.pBWT->getOcc(b, out_match.index - 1);

            // Check if the hash indicates we have visited this index. If so, stop the backtrack
            KmerMatchMap::iterator find_iter = prematchMap.find(out_match);
            if(find_iter != prematchMap.end())
            {
                // We have processed this index already
                if(find_iter->second)
                    break;
                else
                    find_iter->second = true;
            }

            if(b == '$')
            {
                // We've found the lexicographic index for this read. Turn it into a proper ID
                out_match.index = indices.pSSA->lookupLexoRank(out_match.index);
				//std::cout<<"out_match.position"<<out_match.position<<std::endl;
                matches.insert(out_match);
                break;
            }
        }
    }
	search_seeds_e = clock();
	std::cout<<"\tmatchset :\t"<<matches.size()<<"\n";
    // Refine the matches by computing proper overlaps between the sequences
    // Use the overlaps that meet the thresholds to build a multiple alignment
	clock_t  extrac_s, extrac_e;
	clock_t  overlapE_s, overlapE_e;
	clock_t  overlapC_s, overlapC_e;
	double extrac_sum = 0.0;
	double overlapE_sum = 0.0, overlapC_sum = 0.0;
	int compute_count = 0,extend_count = 0;
	size_t acNumber = 0;
    for(KmerMatchSet::iterator iter = matches.begin(); iter != matches.end(); ++iter)
    {
		extrac_s = clock();
        std::string match_sequence;// = BWTAlgorithms::extractString(indices.pBWT, iter->index);
		
		if(indices.pReadTable != NULL)
            match_sequence = indices.pReadTable->getRead(iter->index).seq.toString();
		/*else
			match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index);*/
		
		extrac_e = clock();
		extrac_sum += (double)extrac_e - extrac_s;
        
		if(iter->is_reverse)
            match_sequence = reverseComplement(match_sequence);
        
        // Ignore identical matches
        if(match_sequence == query)
            continue;

        // Compute the overlap. If the kmer match occurs a single time in each sequence we use
        // the banded extension overlap strategy. Otherwise we use the slow O(M*N) overlapper.
        SequenceOverlap overlap;
        std::string match_kmer = query.substr(iter->position, k);
        size_t pos_0 = iter->position;//query.find(match_kmer);
        size_t pos_1 = match_sequence.find(match_kmer);
        assert(pos_0 != std::string::npos && pos_1 != std::string::npos);

		//Timer* sTimer = new Timer("seeds overlap");
        // Check for secondary occurrences
        /*if(query.find(match_kmer, pos_0 + 1) != std::string::npos || 
           match_sequence.find(match_kmer, pos_1 + 1) != std::string::npos) {
            // One of the reads has a second occurrence of the kmer. Use
            // the slow overlapper.
			overlapC_s = clock();
			compute_count++;
            overlap = Overlapper::computeOverlap(query, match_sequence);
			overlapC_e = clock();
			overlapC_sum += (double)overlapC_e - overlapC_s;
        } 
	
		else {*/
			overlapE_s = clock();
			extend_count++;
            overlap = Overlapper::PacBioExtendMatch(query, match_sequence, pos_0, pos_1, bandwidth);
			overlapE_e = clock();
			overlapE_sum += (double)overlapE_e - overlapE_s;
		//}
	
		//delete sTimer;
        n_candidates += 1;
        bool bPassedOverlap = overlap.getOverlapLength() >= min_overlap;
        bool bPassedIdentity = overlap.getPercentIdentity()  >= min_identity;
		identityVector[(int)overlap.getPercentIdentity()] += 1;
		//overlap.printTotal_columns();
		//overlap.printEdit_distance();
		//std::cout<<"min_overlap == "<<overlap.getOverlapLength()<<"\n";
		//std::cout<<"overlap.getOverlapLength() / 100 == "<<overlap.getOverlapLength() / 100<<"\n";
		//std::cout<<"min_identity == "<<min_identity<<"\n";
		//std::cout<<"bPassedOverlap == "<<bPassedOverlap<<"\n";
		//std::cout<<"bPassedIdentity == "<<bPassedIdentity<<"\n";
		//std::cout<<match_sequence<<"\n";
        if(bPassedOverlap && bPassedIdentity)
        {
            SequenceOverlapPair op;
            op.sequence[0] = query;
            op.sequence[1] = match_sequence;
            op.overlap = overlap;
            op.is_reversed = iter->is_reverse;
            overlap_vector.push_back(op);
            n_output += 1;
			acNumber += 1;
			//numStringCount
        } 
    }
	
	std::cout<<"\tacceptable number of seeds == "<<acNumber<<"\n";
	std::cout<<"\tsearch seeds time : "<<(double)(search_seeds_e - search_seeds_s)/CLOCKS_PER_SEC<<std::endl;
	std::cout<<"\textract time : "<<extrac_sum/CLOCKS_PER_SEC<<std::endl;
	//std::cout<<"\tcompute_count : "<<compute_count<<std::endl;
	//std::cout<<"\tbanded_count : "<<extend_count<<std::endl;
	//std::cout<<"\tcompute overlap time : "<<overlapC_sum/CLOCKS_PER_SEC<<std::endl;
	//std::cout<<"\tbanded overlap time : "<<overlapE_sum/CLOCKS_PER_SEC<<std::endl;
	/*------------------output-identity------------------------------------
	double mean = 0.0, temp_mean = 0.0,temp = 0.0;
	for(int i = 0; i < 100; i++)
	{	//count*identity
		mean+=identityVector[i]*i;
		temp+=identityVector[i];
	}
	mean=mean/temp;
	
	for(int i = 0; i < 100; i++)
		//count*identity^2
		temp_mean+=identityVector[i]*pow(i,2);
	std::cout<<"-----------outputIdentity------------"<<std::endl;
	std::cout<<"\tround "<<round;
	std::cout<<"\tmean identity :\t"<<mean<<std::endl;
	std::cout<<"\tSD identity :\t"<<sqrt(temp_mean/temp - pow(mean,2))<<std::endl;
	std::cout<<"-------------------------------------\n"<<std::endl;
	/*---------------------------------------------------------------------*/
	
    t_time += timer.getElapsedCPUTime();

    if(Verbosity::Instance().getPrintLevel() > 6 && n_calls % 100 == 0)
        printf("[kmer overlaps] n: %zu candidates: %zu valid: %zu (%.2lf) time: %.2lfs\n", 
            n_calls, n_candidates, n_output, (double)n_output / n_candidates, t_time);
    return overlap_vector;
}
/**
 * \brief Factory method to create metavalues with the right comments
 */
Metavalue	FITSKeywords::meta(const std::string& name,
			const FITSdate& value, const std::string& comment) {
	FITSKeyword	k = keyword(name);
	return Metavalue(name, k.index, value.showVeryLong(),
			(comment.size() == 0) ? k.comment : comment);
}
/**
 * \brief Factory method to create metavalues with the right comments
 */
Metavalue	FITSKeywords::meta(const std::string& name, double value,
			const std::string& comment) {
	FITSKeyword	k = keyword(name);
	return Metavalue(name, k.index, stringprintf("%f", value),
		(comment.size() == 0) ? k.comment : comment);
}
Example #8
0
		void put(const char* filename,const std::string& s)
		{
			put(filename,s.c_str(),s.size());
		}
void writeToSocket(std::string command, int Socket) {
	std::ostringstream ss;
	ss << std::setw(4) << std::setfill('0') << command.size();
	std::string buf = ss.str() + command;
	send(Socket, buf.c_str(), buf.size(), 0);
}