Beispiel #1
0
/// Compute a position frequency matrix from the alignment positions
/// between gap_start and gap_end, returning a [0,1]-valued
/// match_score and a maximum number of deletions in dels.
///
/// For every column i in [gap_start, gap_end) the occurrences of A, C, G
/// and U/T (counted together) are tallied over all sequences in AR, which
/// is declared outside this function. The characters '#', '-' and '.' are
/// skipped silently; any other character is reported on std::cerr and
/// otherwise ignored.
///
/// @param gap_start   first alignment column of the region (inclusive)
/// @param gap_end     end of the region (exclusive)
/// @param match_score out: the smallest first component returned by
///                    PWM::get_frac_score over all sequences, starting
///                    from 1 and rounded down to two decimals
/// @param dels        out: value of get_deletions(gap_start, gap_end)
/// @return the PWM built from the per-column A/C/G/T counts
// ==============================================================
PWM get_PWM(int gap_start, int gap_end, float& match_score, int& dels)
// ==============================================================
{
	float_vector A,C,G,T;
	dels = get_deletions(gap_start,gap_end);

	// Pass 1: per-column nucleotide counts.
	for (int col=gap_start; col<gap_end; col++)
	{
		int freq_A = 0;
		int freq_C = 0;
		int freq_G = 0;
		int freq_U = 0;

		alignment_reader::iterator a_it = AR.begin();
		alignment_reader::iterator a_end = AR.end();
		for (; a_it!=a_end; ++a_it)
		{
			// NOTE(review): the column index is not checked against the
			// sequence length; this assumes every sequence in AR spans gap_end.
			const char ch = a_it->second[col];
			switch (ch)
			{
				case 'A': freq_A++; break;
				case 'C': freq_C++; break;
				case 'G': freq_G++; break;
				case 'U': freq_U++; break;
				case 'T': freq_U++; break;	// T is counted together with U
				case '#': break;
				case '-' : break;
				case '.' : break;
				default : std::cerr<<"UNKNOWN CHAR: "<<ch<<"\n";
			}
		}
		A.push_back(static_cast<float>(freq_A));
		C.push_back(static_cast<float>(freq_C));
		G.push_back(static_cast<float>(freq_G));
		T.push_back(static_cast<float>(freq_U));
	}
	PWM result(A,C,G,T);

	// Pass 2: score the gap-free region of every sequence against the
	// matrix and keep the worst (smallest) score.
	typedef std::pair<float,float> float_pair;
	match_score=1.;
	alignment_reader::iterator a_it = AR.begin();
	alignment_reader::iterator a_end = AR.end();
	for (; a_it!=a_end; ++a_it)
	{
		std::string region = a_it->second.substr(gap_start,gap_end-gap_start);
		remove_gaps(region);
		// NOTE(review): the arguments are loop-invariant, but the call is
		// kept per sequence because it is not visible here whether
		// set_indel_costs also resets scoring state; hoisting it would also
		// change the returned PWM when AR is empty.
		result.set_indel_costs(1.,1.,dels);
		const float_pair m_score = result.get_frac_score(region.c_str());
		if (m_score.first<match_score)
			match_score=m_score.first;
	}
	// Round down to two decimal places.
	match_score = floor(100.*match_score)/100.;
	return result;
}
Beispiel #2
0
/*
 * Read the alignment named by argv[1], remove its gaps and write the
 * sequences to the file named by argv[2].
 *
 * Output is configured with output_clustal disabled and output_tfa
 * enabled. The process exits with status 1 on a usage error, an over-long
 * file name, an input without sequences, or when the output cannot be
 * opened; otherwise it returns 0.
 */
int main(int argc,char **argv)
{
	int i;
	char infile[FILENAMELEN+1];
	char outfile[FILENAMELEN+1];
	ALN mult_aln;
	OPT opt;

	if(argc!=3) {
		fprintf(stderr,"Usage: %s input_aln output_aln\n",argv[0]);
		exit(1);
	}

	/* The name buffers hold FILENAMELEN characters plus the terminator;
	   reject longer arguments instead of letting strcpy overflow them. */
	if(strlen(argv[1])>(size_t)FILENAMELEN || strlen(argv[2])>(size_t)FILENAMELEN) {
		fprintf(stderr,"ERROR: file names must not exceed %d characters\n",(int)FILENAMELEN);
		exit(1);
	}
	strcpy(infile,argv[1]);
	strcpy(outfile,argv[2]);

	init_options(&opt);

	(*opt.alnout_opt).output_clustal=FALSE;
	(*opt.alnout_opt).output_tfa=TRUE;

/* read in the sequences */
	seq_input(infile,opt.explicit_type,FALSE,&mult_aln);
	if(mult_aln.nseqs<=0) {
		fprintf(stderr,"ERROR: No sequences in %s\n",infile);
		exit(1);
	}

/* remove the gaps */
	remove_gaps(&mult_aln);

/* write out the sequences */
	/* NOTE(review): the capacity of tfa_outname is not visible here; this
	   assumes it can hold at least FILENAMELEN+1 characters - confirm. */
	strcpy((*opt.alnout_opt).tfa_outname, outfile);

	/* keep the sequences in their input order on output */
	for (i=0;i<mult_aln.nseqs;i++) mult_aln.seqs[i].output_index = i;

	if(!open_alignment_output(infile,opt.alnout_opt)) exit(1);
	create_alignment_output(mult_aln,*opt.alnout_opt);

	return 0;
}
// Extract the differences between a pair of strings.
//
// The haplotype is aligned against the reference with aln_stdaln, and
// every maximal run of differing alignment columns becomes one Variant
// (ref_name "noctg"). ref_position is an index into the unpadded
// reference; ref_seq and alt_seq are the two sides of the run with the
// gap padding removed.
//
// A run containing a gap is treated as an indel and is extended to include
// the matching base directly before it. When such a run starts at the very
// first column there is no preceding base, so the matching base directly
// after the run is included instead (the convention the VCF specification
// uses for events at position 1) and the variant is placed at reference
// position 0.
std::vector<Variant> extract_variants(const std::string& reference, 
                                      const std::string& haplotype)
{
    AlnParam par = aln_param_nt2nt;

    // The sizes are unsigned, so subtract the smaller from the larger
    // rather than taking abs() of a difference that may have wrapped around.
    const size_t len_diff = reference.size() > haplotype.size()
                                ? reference.size() - haplotype.size()
                                : haplotype.size() - reference.size();
    par.band_width = std::max(20, static_cast<int>(len_diff) * 2);
    AlnAln* aln = aln_stdaln(reference.c_str(), haplotype.c_str(), &par, 1, 1);
    
    // Make aligned strings where gaps are padded with '-'. These are copies,
    // so the alignment itself can be released immediately.
    std::string pad_ref(aln->out1);
    std::string pad_hap(aln->out2);
    aln_free_AlnAln(aln);

    assert(pad_ref.size() == pad_hap.size());
    const size_t aln_len = pad_ref.size();

    // parse variants from the alignment
    std::vector<Variant> variants;

    // generate a map from padded bases to positions in the original reference
    // sequence; gap columns map to npos
    std::vector<size_t> ref_positions(aln_len, 0);
    size_t pos = 0;
    for(size_t i = 0; i < aln_len; ++i) {
        const bool is_base = pad_ref[i] != '-';
        ref_positions[i] = is_base ? pos : std::string::npos;
        pos += is_base;
    }

    // diff_start iterates over the places where these sequences are different
    size_t diff_start = 0;
    while(true) {
        
        // find the start point of the next difference between the strings
        while(diff_start < aln_len && pad_ref[diff_start] == pad_hap[diff_start]) {
            diff_start++;
        }
 
        // check for end of alignment
        if(diff_start == aln_len)
            break;

        // find the end point of the difference
        bool is_indel = false;
        size_t diff_end = diff_start;
        while(diff_end < aln_len && pad_ref[diff_end] != pad_hap[diff_end]) {
            is_indel = is_indel || pad_ref[diff_end] == '-' || pad_hap[diff_end] == '-';
            diff_end++;
        }

        // An indel is reported together with an adjacent matching base:
        // the previous one when it exists, otherwise the following one.
        size_t var_start = diff_start;
        size_t var_end = diff_end;
        if(is_indel) {
            if(var_start > 0) {
                var_start -= 1;
            } else if(var_end < aln_len) {
                var_end += 1;
            }
        }
    
        Variant v;
        v.ref_name = "noctg";

        if(var_start == 0) {
            // Column 0 is either the first reference base or a leading
            // insertion placed before it; both belong at reference position 0.
            v.ref_position = 0;
        } else {
            assert(ref_positions[var_start] != std::string::npos);
            v.ref_position = ref_positions[var_start];
        }
        v.ref_seq = remove_gaps(pad_ref.substr(var_start, var_end - var_start).c_str());
        v.alt_seq = remove_gaps(pad_hap.substr(var_start, var_end - var_start).c_str());
        
        variants.push_back(v);

        // resume scanning directly after the differing run
        diff_start = diff_end;
    }

    return variants;
}