/// compute a position frequency matrix form the alignment positions /// between gap_start and gap_end, returning a [0,1]-valued /// match_score and a maximum number of deletions in dels. // ============================================================== PWM get_PWM(int gap_start, int gap_end, float& match_score, int& dels) // ============================================================== { int freq_A, freq_C, freq_G, freq_U, freq_gap; float_vector A,C,G,T; int gap_len = gap_end-gap_start; int max_gaps = 0; int min_gaps = gap_len; dels = get_deletions(gap_start,gap_end); int i; for (i=gap_start; i<gap_end; i++) { freq_A = 0; freq_C = 0; freq_G = 0; freq_U = 0; freq_gap = 0; alignment_reader::iterator a_it = AR.begin(); alignment_reader::iterator a_end = AR.end(); int dels=0; for (; a_it!=a_end; ++a_it) { char ch=a_it->second[i]; switch (a_it->second[i]) { case 'A': freq_A++; break; case 'C': freq_C++; break; case 'G': freq_G++; break; case 'U': freq_U++; break; case 'T': freq_U++; break; case '#': break; case '-' : break; case '.' : break; default : std::cerr<<"UNKNOWN CHAR: "<<a_it->second[i]<<"\n"; } } A.push_back((float)freq_A); C.push_back((float)freq_C); G.push_back((float)freq_G); T.push_back((float)freq_U); //result.push_back(consensus_char(freq_A,freq_C,freq_G,freq_U)); } PWM result(A,C,G,T); match_score=1.; alignment_reader::iterator a_it = AR.begin(); alignment_reader::iterator a_end = AR.end(); for (; a_it!=a_end; ++a_it) { typedef std::pair<float,float> float_pair; std::string s_cstr = a_it->second.substr(gap_start,gap_end-gap_start); remove_gaps(s_cstr); result.set_indel_costs(1.,1.,dels); float_pair m_score = result.get_frac_score(s_cstr.c_str()); if (m_score.first<match_score) match_score=m_score.first; } match_score = floor(100.*match_score)/100.; return result; }
int main(int argc,char **argv) { int i,nseqs; char infile[FILENAMELEN+1]; char outfile[FILENAMELEN+1]; ALN mult_aln; OPT opt; if(argc!=3) { fprintf(stderr,"Usage: %s input_aln output_aln\n",argv[0]); exit(1); } strcpy(infile,argv[1]); strcpy(outfile,argv[2]); init_options(&opt); (*opt.alnout_opt).output_clustal=FALSE; (*opt.alnout_opt).output_tfa=TRUE; /* read in the sequences */ seq_input(infile,opt.explicit_type,FALSE,&mult_aln); if(mult_aln.nseqs<=0) { fprintf(stderr,"ERROR: No sequences in %s\n",infile); exit(1); } nseqs=mult_aln.nseqs; /* remove the gaps */ remove_gaps(&mult_aln); /* write out the sequences */ strcpy((*opt.alnout_opt).tfa_outname, outfile); for (i=0;i<mult_aln.nseqs;i++) mult_aln.seqs[i].output_index = i; if(!open_alignment_output(infile,opt.alnout_opt)) exit(1); create_alignment_output(mult_aln,*opt.alnout_opt); }
// extract differences between the pair of strings std::vector<Variant> extract_variants(const std::string& reference, const std::string& haplotype) { AlnParam par = aln_param_nt2nt; par.band_width = std::max(20, abs(reference.size() - haplotype.size()) * 2); AlnAln* aln = aln_stdaln(reference.c_str(), haplotype.c_str(), &par, 1, 1); // Make aligned strings where gaps are padded with '-' std::string pad_ref(aln->out1); std::string pad_hap(aln->out2); assert(pad_ref.size() == pad_hap.size()); //std::cout << "PR: " << pad_ref << "\n"; //std::cout << "PH: " << pad_hap << "\n"; // parse variants from the alignment std::vector<Variant> variants; // generate a map from padded bases to positions in the original reference sequence std::vector<size_t> ref_positions(pad_ref.size(), 0); size_t pos = 0; for(size_t i = 0; i < pad_ref.size(); ++i) { ref_positions[i] = pad_ref[i] != '-' ? pos : std::string::npos; pos += pad_ref[i] != '-'; } // diff_start iterates over the places where these sequences are different size_t diff_start = 0; while(1) { // find the start point of the next difference between the strings while(diff_start < pad_ref.size() && pad_ref[diff_start] == pad_hap[diff_start]) { diff_start++; } // check for end of alignment if(diff_start == pad_ref.size()) break; // find the end point of the difference bool is_indel = false; size_t diff_end = diff_start; while(diff_end < pad_ref.size() && pad_ref[diff_end] != pad_hap[diff_end]) { is_indel = is_indel || pad_ref[diff_end] == '-' || pad_hap[diff_end] == '-'; diff_end++; } // If the difference is an indel, we include the previous matching reference base diff_start -= is_indel; Variant v; v.ref_name = "noctg"; assert(ref_positions[diff_start] != std::string::npos); v.ref_position = ref_positions[diff_start]; v.ref_seq = remove_gaps(pad_ref.substr(diff_start, diff_end - diff_start).c_str()); v.alt_seq = remove_gaps(pad_hap.substr(diff_start, diff_end - diff_start).c_str()); variants.push_back(v); diff_start = diff_end; } aln_free_AlnAln(aln); return variants; }