Ejemplo n.º 1
0
/// \brief Count the state-to-state transitions of the pairwise alignment of two rows.
///
/// The columns of \a A are visited in order.  Each column is classified by
/// which of the two rows has a gap there; columns where both rows are gapped
/// are skipped.  Every (previous state, current state) pair is tallied, starting
/// from states::S and finishing with a transition into states::E.
///
/// \param A The alignment.
/// \param node1 Index of the first row.
/// \param node2 Index of the second row.
/// \return A 5x5 matrix of counts indexed by (from-state, to-state).
ublas::matrix<int> get_path_counts(const alignment& A,int node1, int node2) 
{
  using namespace A2;

  ublas::matrix<int> counts(5,5);
  counts.clear();

  int previous = states::S;

  for(int column=0; column<A.length(); column++)
  {
    const bool gap1 = A.gap(column,node1);
    const bool gap2 = A.gap(column,node2);

    // Neither row has anything here: the column does not affect the path.
    if (gap1 and gap2) continue;

    int current = states::M;
    if (gap1)
      current = states::G1;
    else if (gap2)
      current = states::G2;

    counts(previous,current)++;
    previous = current;
  }

  // Close the path.
  counts(previous,states::E)++;

  return counts;
}
Ejemplo n.º 2
0
/// \brief Append one empty, named sequence to the alignment for each non-leaf node of the tree.
///
/// \param A The alignment (taken by value; the extended copy is returned).
/// \param T The tree; nodes with index >= T.n_leaves() supply the new sequence names.
/// \return The alignment with the extra sequences added.
///
/// Throws myexception if the number of sequences differs from the number of
/// leaves, or if any node with index >= T.n_leaves() has an empty label.
alignment add_internal(alignment A,const SequenceTree& T) 
{
  // A and T must describe the same set of leaves.
  if (A.n_sequences() != T.n_leaves())
    throw myexception()<<"Number of sequence in alignment doesn't match number of leaves in tree"
		       <<"- can't add internal sequences";

  // Build one empty sequence per remaining node, named after the node's label.
  vector<sequence> internal;
  int node = T.n_leaves();
  while (node < T.n_nodes())
  {
    sequence s;

    if (T.label(node) == "")
      throw myexception()<<"Adding internal sequences: Tree has missing internal node name!";

    s.name = T.label(node);
    internal.push_back(s);

    node++;
  }

  A.add_sequences(internal);

  return A;
}
Ejemplo n.º 3
0
/// \brief Reorder the sequences of an alignment so that their names appear in the given order.
///
/// \param A The alignment.
/// \param names The desired order of sequence names.
/// \return \a A itself if its names are already in that order, otherwise the
///         result of reorder_sequences(A, mapping) for the computed mapping.
///
/// If compute_mapping() throws a bad_mapping<string>, its message is replaced
/// with a description of the mismatch (too few / too many sequences, and the
/// name that is missing or extra) and the same exception object is rethrown.
alignment reorder_sequences(const alignment& A, const vector<string>& names)
{
  // Check the names and stuff.
  vector<string> n2 = sequence_names(A);

  // Nothing to do if the order already matches.
  if (names == n2) return A;

  alignment A2;
  try {
    vector<int> new_order = compute_mapping(names,n2);

    A2 = reorder_sequences(A,new_order);
  }
  catch(bad_mapping<string>& e)
  {
    // Discard the generic message and describe the problem in alignment terms.
    e.clear();
    if (e.size2 < e.size1)
      e<<"Alignment has too few sequences! (Got "<<A.n_sequences()<<", expected "<<names.size()<<")\n";

    if (e.size1 < e.size2)
      e<<"Alignment has too many sequences! (Got "<<A.n_sequences()<<", expected "<<names.size()<<")\n";

    if (e.from == 0)
      e<<"Alignment is missing sequence \""<<e.missing<<"\".";
    else
      e<<"Alignment has extra sequence \""<<e.missing<<"\".";

    // Rethrow the exception we caught (already modified through the reference)
    // rather than throwing a copy, which could slice a derived exception type.
    throw;
  }

  return A2;
}
Ejemplo n.º 4
0
/// \brief Find the majority character of each column, and flag columns that are nearly unanimous.
///
/// For every column the majority entry is argmax(column_count(A,c)), except
/// that it is alphabet::gap whenever the count stored at index a.size() exceeds 1.
/// A non-gap-majority column is flagged "safe" (1) when at most
/// \a allowed_differences sequences differ from the most frequent entry.
///
/// \param A The alignment.
/// \param allowed_differences Maximum number of sequences that may disagree in a safe column.
/// \return The pair (majority character per column, safe flag per column).
std::pair<vector<int>,vector<int> > find_major_character(const alignment& A,int allowed_differences)
{
  const alphabet& a = A.get_alphabet();

  vector<int> majority(A.length(), alphabet::unknown);
  vector<int> safe(A.length(), 0);

  for(int c=0;c<majority.size();c++) 
  {
    vector<int> count = column_count(A,c);
    int max_letter = argmax(count);

    // NOTE! Major character is gap if there is more than 1 gap!
    if (count[a.size()] > 1)
    {
      majority[c] = alphabet::gap;
      continue;
    }

    majority[c] = max_letter;

    const bool nearly_unanimous = (A.n_sequences() - count[max_letter] <= allowed_differences);
    if (nearly_unanimous)
      safe[c] = 1;
  }

  return std::pair<vector<int>,vector<int> >(majority,safe);
}
Ejemplo n.º 5
0
/// Compute the probability of the target and alignment \a al given the source.
///
/// \a distortionType is a bit mask: bit 1 multiplies in the result of
/// prob_of_target_and_alignment_given_source_1(); bit 2 multiplies in one
/// distortion term (from probFirst or probSecond) for every aligned target
/// position j in 1..m.  When \a verb is set, each distortion term is
/// traced to cerr.  A result that tests false is replaced by a tiny
/// non-zero constant.
LogProb transpair_model4::prob_of_target_and_alignment_given_source(const alignment&al, short distortionType,bool verb)const
{
  static const LogProb almostZero = 1E-299 ; 
  LogProb total = 1.0 ;

  if( distortionType&1 )
    total *= prob_of_target_and_alignment_given_source_1(al,verb);

  if( distortionType&2 )
    {
      for(WordIndex j=1;j<=m;j++)
        {
          // Positions for which al(j) tests false contribute no term.
          if( !al(j) )
            continue;

          if( al.get_head(al(j))==j)
            {
              // j is the head of its cept: condition on the previous cept.
              int prevCept=al.prev_cept(al(j));
              float term=probFirst[prevCept](j,al.get_center(prevCept));
              massert(term<=1.0);
              total*=term;
              if( verb) cerr << "IBM-4: d=1 of " << j << ": " << term  << " -> " << total << endl;
            }
          else
            {
              // j is not the head: condition on the previous position in the same cept.
              float term=probSecond(j,al.prev_in_cept(j));
              massert(term<=1.0);
              total*=term;
              if( verb) cerr << "IBM-4: d>1 of " << j << ": " << term  << " -> " << total << endl;
            }
        }
    }

  if( total )
    return total;
  return almostZero;
}
Ejemplo n.º 6
0
/// \brief Load a tree and an alignment based on command line parameters.
///
/// \param args The command line parameters.
/// \param alignments The alignments.
/// \param T The leaf-labelled tree.
/// \param internal_sequences Should each resulting alignment have sequences for internal nodes on the tree?
/// 
void load_A_and_T(const variables_map& args,alignment& A,RootedSequenceTree& T,bool internal_sequences)
{
  A = load_A(args,internal_sequences);

  T = load_T(args);

  //------------- Link Alignment and Tree -----------------//
  link(A,T,internal_sequences);

  //---------------- Randomize alignment? -----------------//
  if (args.count("randomize-alignment"))
    A = randomize(A,T.n_leaves());
  else if (args.count("unalign-all"))
    A = unalign_all(A,T.n_leaves()); 
 
  //------------------ Analyze 'internal'------------------//
  if ((args.count("internal") and args["internal"].as<string>() == "+")
      or args.count("randomize-alignment"))
    for(int column=0;column< A.length();column++) {
      for(int i=T.n_leaves();i<A.n_sequences();i++) 
	A.set_value(column,i, alphabet::not_gap );
    }

  //---- Check that internal sequence satisfy constraints ----//
  check_alignment(A,T,internal_sequences);
}
Ejemplo n.º 7
0
/// \brief Build an alignment from a matrix of per-row letter positions.
///
/// For each cell (c,i), a non-negative M(c,i) is treated as an index into
/// the ungapped letters of row i of \a A1 and is replaced by that letter;
/// a negative M(c,i) is stored unchanged.
///
/// \param M The matrix; M.size1() becomes the length of the result.
/// \param A1 The alignment supplying the letters (and everything else copied into the result).
/// \return A copy of \a A1 resized to M.size1() columns and refilled from \a M.
alignment get_alignment(const ublas::matrix<int>& M, alignment& A1) 
{
  // Collect the non-gap entries of every row of A1, in column order.
  vector<vector<int> > letters;
  for(int row=0; row<A1.n_sequences(); row++)
  {
    vector<int> row_letters;
    for(int column=0; column<A1.length(); column++)
    {
      if (A1.gap(column,row)) continue;
      row_letters.push_back(A1(column,row));
    }
    letters.push_back(row_letters);
  }

  alignment A2 = A1;
  A2.changelength(M.size1());

  // Translate positions back into letters; negative entries pass through.
  for(int row=0; row<A2.n_sequences(); row++)
  {
    for(int column=0; column<A2.length(); column++)
    {
      const int entry = M(column,row);
      const int value = (entry >= 0) ? letters[row][entry] : entry;
      A2.set_value(column,row, value);
    }
  }

  return A2;
}
Ejemplo n.º 8
0
/// \brief Create an alignment with randomized homology
///
/// The first \a n rows are rebuilt by taking their letters (via
/// alignment_row_letters) and inserting gaps at random positions until each
/// row reaches the new length; empty columns are then removed.  Rows \a n and
/// above are not rewritten here.
///
/// \param A An alignment containing the sequences to re-align
/// \param n The number of leading rows to randomize; -1 means all A.n_sequences() rows.
/// \return The randomized alignment.
alignment randomize(const alignment& A,int n) {
    if (n == -1)
        n = A.n_sequences();

    // Longest sequence among the rows being randomized
    int longest = -1;
    for(int row=0; row<n; row++)
        if (A.seqlength(row) > longest)
            longest = A.seqlength(row);

    // Choose the length of the new alignment: 10% plus two columns of slack
    int newlength = int( longest + 2 + 0.1*longest);

    alignment A2 = A;
    A2.changelength(newlength);

    const int gap_value = alphabet::gap;

    for(int row=0; row<n; row++)
    {
        // Start from the bare letters of this row ...
        vector<int> padded = alignment_row_letters(A,row);

        // ... and insert gaps at random positions until the row is full
        while(padded.size() < newlength) {
            int where = myrandom(padded.size()+1);
            padded.insert(padded.begin()+where,gap_value);
        }

        // Write the padded row into the new alignment
        for(int column=0; column<A2.length(); column++)
            A2(column,row) = padded[column];
    }

    remove_empty_columns(A2);
    return A2;
}
Ejemplo n.º 9
0
/// \brief Get the display_colour_spec that the specified colourer produces for the specified PDBs, names and alignment
///
/// Throws invalid_argument_exception if the alignment has no entries or zero length,
/// or if its number of entries differs from the number of PDBs or the number of names.
///
/// If has_score_colour_handler() is true for the colourer, the colourer's spec is passed
/// (with the colourer's score colour handler and the alignment) through
/// adjust_display_colour_spec_copy() and that result is returned; otherwise the
/// colourer's spec is returned as is.
///
/// \relates display_colourer
display_colour_spec cath::get_colour_spec(const display_colourer &arg_colourer, ///< The colourer whose get_colour_spec() is called
                                          const pdb_list         &arg_pdbs,     ///< The PDBs, passed into the alignment_context (size must equal the alignment's number of entries)
                                          const str_vec          &arg_names,    ///< The names, passed into the alignment_context (size must equal the alignment's number of entries)
                                          const alignment        &arg_alignment ///< The alignment to be coloured (must have at least one entry and non-zero length)
                                          ) {
	const alignment::size_type num_entries   = arg_alignment.num_entries();
	const alignment::size_type aln_length    = arg_alignment.length();

	// Reject an empty alignment and any mismatch between the alignment and the PDBs / names
	if ( aln_length <= 0 || num_entries <= 0 ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the alignment is empty"));
	}
	if ( num_entries != arg_pdbs.size()  ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of PDBs"));
	}
	if ( num_entries != arg_names.size() ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of names"));
	}
	// Ask the colourer for its spec for an alignment_context built from the three inputs
	auto &&result_spec = arg_colourer.get_colour_spec( alignment_context(
		arg_pdbs,
		arg_names,
		arg_alignment
	) );

	// Only adjust the spec if the colourer has a score colour handler
	return has_score_colour_handler( arg_colourer )
		? adjust_display_colour_spec_copy(
			std::forward< decltype( result_spec ) >( result_spec ),
			get_score_colour_handler( arg_colourer ),
			arg_alignment
		)
		: result_spec;
}
Ejemplo n.º 10
0
/// \brief Load an alignment based on command line parameters and generate a random tree.
///
/// \param args The command line parameters.
/// \param alignments The alignments.
/// \param T The leaf-labelled tree.
/// \param internal_sequences Should each resulting alignment have sequences for internal nodes on the tree?
/// 
void load_A_and_random_T(const variables_map& args,alignment& A,SequenceTree& T,bool internal_sequences)
{
  // NO internal sequences, yet!
  A = load_A(args,internal_sequences);

  //------------- Load random tree ------------------------//
  SequenceTree TC = star_tree(sequence_names(A));
  if (args.count("t-constraint"))
    TC = load_constraint_tree(args["t-constraint"].as<string>(),sequence_names(A));

  T = TC;
  RandomTree(T,1.0);

  //------------- Link Alignment and Tree -----------------//
  link(A,T,internal_sequences);

  //---------------- Randomize alignment? -----------------//
  if (args.count("randomize-alignment"))
    A = randomize(A,T.n_leaves());
  
  //------------------ Analyze 'internal'------------------//
  if ((args.count("internal") and args["internal"].as<string>() == "+")
      or args.count("randomize-alignment"))
    for(int column=0;column< A.length();column++) {
      for(int i=T.n_leaves();i<A.n_sequences();i++) 
	A(column,i) = alphabet::not_gap;
    }

  //---- Check that internal sequence satisfy constraints ----//
  check_alignment(A,T,internal_sequences);
}
Ejemplo n.º 11
0
/// Do columns \a c1 and \a c2 of \a A share at least one row that is non-gap in both?
bool intersect(int c1, int c2, const alignment& A) {
  for(int row=0; row<A.n_sequences(); row++)
  {
    // A gap in either column rules this row out.
    if (A.gap(c1,row) or A.gap(c2,row))
      continue;

    return true;
  }

  return false;
}
Ejemplo n.º 12
0
/// Count the rows of \a A for which A.character(column,row) is true.
int n_characters(const alignment& A, int column) 
{
  int total = 0;

  for(int row=0; row<A.n_sequences(); row++)
  {
    if (not A.character(column,row)) continue;
    total++;
  }

  return total;
}
Ejemplo n.º 13
0
/// Collect, in column order, the entries of row \a i of \a A for which A.character(c,i) is true.
vector<int> alignment_row_letters(const alignment& A, int i)
{
  vector<int> letters;

  for(int column=0; column<A.length(); column++)
  {
    if (not A.character(column,i)) continue;
    letters.push_back(A(column,i));
  }

  return letters;
}
Ejemplo n.º 14
0
/// \brief Get the score to use for the specified position of the specified entry of the alignment
///
/// Returns 1.0 unless show_scores_if_present is set, the alignment is scored and
/// has_score() reports a score for the position; in that case the result of
/// get_score() (with ! scores_to_equivs and normalise_scores) is returned.
float_score_type score_colour_handler::get_score_of_postion(const alignment &arg_alignment, ///< The alignment whose residue scores are queried
                                                            const size_t    &arg_entry,     ///< The entry passed to has_score() / get_score()
                                                            const size_t    &arg_index      ///< The index passed to has_score() / get_score()
                                                            ) const {
	// Scores are not in use at all
	if ( ! show_scores_if_present || ! arg_alignment.is_scored() ) {
		return 1.0;
	}

	// Scores are in use but there is none for this position
	if ( ! has_score( arg_alignment.get_alignment_residue_scores(), arg_entry, arg_index ) ) {
		return 1.0;
	}

	return get_score( arg_alignment.get_alignment_residue_scores(), arg_entry, arg_index, ! scores_to_equivs, normalise_scores );
}
Ejemplo n.º 15
0
/// Throw a myexception naming the first sequence whose name repeats that of an earlier sequence.
void check_names_unique(const alignment& A)
{
  for(int later=0; later<A.n_sequences(); later++)
  {
    // Compare against every sequence that comes before this one.
    for(int earlier=0; earlier<later; earlier++)
    {
      if (not (A.seq(later).name == A.seq(earlier).name))
        continue;

      throw myexception()<<"Sequence name '"<<A.seq(later).name<<"' occurs multiple times in the alignment!";
    }
  }
}
Ejemplo n.º 16
0
/// Return true if no two sequences of \a A have equal names.
bool names_are_unique(const alignment& A)
{
  for(int later=0; later<A.n_sequences(); later++)
  {
    // Look for an earlier sequence with the same name.
    int earlier = 0;
    while (earlier < later)
    {
      if (A.seq(later).name == A.seq(earlier).name)
        return false;
      earlier++;
    }
  }

  return true;
}
Ejemplo n.º 17
0
/// Append four component scores of the alignment \a al to \a d.
///
/// In order, the appended values are:
///  - total1: the term built from p1 and al.fert(0),
///  - total2: the product of get_fertility(i, al.fert(i)) for i = 1..l,
///  - total3: the product of get_t(al(j), j) for j = 1..m,
///  - total4: the product of the probFirst / probSecond terms for every
///            j = 1..m for which al(j) tests true.
void transpair_model4::computeScores(const alignment&al,vector<double>&d)const
{
  LogProb total1 = 1.0,total2=1.0,total3=1.0,total4=1.0 ;

  // Component 1: term depending on p1 and the fertility of position 0
  total1 *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0)));
  for (WordIndex i = 1 ; i <= al.fert(0) ; i++)
    {
      total1 *= double(m - al.fert(0) - i + 1) / (double(DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):i)) ;
    }

  // Component 2: fertilities
  for (WordIndex i = 1 ; i <= l ; i++)
    {
      total2 *= get_fertility(i, al.fert(i));// * (LogProb) factorial(al.fert(i));
    }

  // Component 3: get_t() terms
  for (WordIndex j = 1 ; j <= m ; j++)
    {
      total3*= get_t(al(j), j) ;
    }

  // Component 4: distortion terms
  for(WordIndex j=1;j<=m;j++)
    {
      if( !al(j) )
        continue;

      if( al.get_head(al(j))==j)
        {
          // j is the head of its cept
          int prevCept=al.prev_cept(al(j));
          float term=probFirst[prevCept](j,al.get_center(prevCept));
          total4*=term;
        }
      else
        {
          // j is a non-head member of its cept
          float term=probSecond(j,al.prev_in_cept(j));
          total4*=term;
        }
    }

  d.push_back(total1);//9
  d.push_back(total2);//10
  d.push_back(total3);//11
  d.push_back(total4);//12
}
Ejemplo n.º 18
0
/// Count the number of cells of \a A whose value equals the letter index \a l.
int letter_count(const alignment& A,int l) 
{
  int total = 0;

  for(int column=0; column<A.length(); column++)
  {
    for(int row=0; row<A.n_sequences(); row++)
    {
      if (A(column,row) == l)
        total++;
    }
  }

  return total;
}
Ejemplo n.º 19
0
/// Count the columns of \a A in which both row \a s1 and row \a s2 have a character.
unsigned n_homologous(const alignment& A,int s1,int s2) 
{
  unsigned shared = 0;

  for(int column=0; column<A.length(); column++)
  {
    if (not A.character(column,s1)) continue;
    if (not A.character(column,s2)) continue;

    shared++;
  }

  return shared;
}
Ejemplo n.º 20
0
/// \brief Fraction of the columns where neither row has a gap in which the two rows hold equal values.
///
/// \param A The alignment.
/// \param s1 Index of the first row.
/// \param s2 Index of the second row.
/// \return matches / (columns where neither row is a gap).  Note that if no
///         such column exists this divides zero by zero.
double getSimilarity(const alignment& A,int s1,int s2) {
  int n_shared = 0;
  int n_equal  = 0;

  for(int column=0; column<A.length(); column++)
  {
    const bool both_present = not A.gap(column,s1) and not A.gap(column,s2);
    if (not both_present) continue;

    n_shared++;

    if (A(column,s1) == A(column,s2))
      n_equal++;
  }

  return double(n_equal)/n_shared;
}
Ejemplo n.º 21
0
// FIXME - should perhaps also check names?
// use this function in alignment-gild, alignment-compare, alignment-diff, etc.

/// Throw a myexception unless \a A has exactly L.size() sequences and
/// A.seqlength(i) equals L[i] for every i.
void check_same_sequence_lengths(const vector<int>& L, const alignment& A)
{
  if (A.n_sequences() != L.size())
    throw myexception()<<"Expected alignment has "<<L.size()<<", but this one has "<<A.n_sequences();

  for(int i=0;i<L.size();i++)
  {
    int observed = A.seqlength(i);

    if (L[i] == observed) continue;

    throw myexception()<<"Sequence "<<i+1<<": length "<<observed<<" differs from expected length "<<L[i];
  }
}
Ejemplo n.º 22
0
/// \brief Mark the columns of \a A whose gap counts are reported as variable.
///
/// \return A bitset with one bit per column; bit c is set when
///         variable_counts() is true for the counts that count_gaps()
///         produces for column c.
dynamic_bitset<> gap_variable_sites(const alignment& A)
{
  dynamic_bitset<> variable(A.length());

  // Two-entry scratch buffer, refilled by count_gaps() for each column
  valarray<int> counts(0, 2);

  int column = 0;
  while (column < A.length())
  {
    count_gaps(A,column,counts);

    if (variable_counts(counts))
      variable[column] = true;

    column++;
  }

  return variable;
}
Ejemplo n.º 23
0
/// \brief Check that internal node states are consistent
///
/// For every column, the set of nodes that are non-gap in that column is
/// passed to all_characters_connected() together with \a ignore.  On the first
/// column that fails, a message and the whole alignment are written to cerr
/// and a myexception is thrown.
void check_internal_nodes_connected(const alignment& A,const Tree& T,const vector<int>& ignore) {
  for(int column=0; column<A.length(); column++)
  {
    // Bits start cleared; set the ones for nodes that are present in this column.
    dynamic_bitset<> present(T.n_nodes());
    for(int node=0; node<T.n_nodes(); node++)
    {
      if (not A.gap(column,node))
        present[node] = true;
    }

    if (all_characters_connected(T,present,ignore))
      continue;

    cerr<<"Internal node states are inconsistent in column "<<column<<endl;
    cerr<<A<<endl;
    throw myexception()<<"Internal node states are inconsistent in column "<<column;
  }
}
Ejemplo n.º 24
0
/// \brief Check that internal nodes don't have letters (or anything weirder!)
///
/// Every cell in rows \a n_leaves .. A.n_sequences()-1 must be either
/// alphabet::gap or alphabet::not_gap.
///
/// \param A The alignment.
/// \param n_leaves Index of the first row to check.
///
/// Throws myexception describing the first offending cell (its value, column
/// and sequence name).
void check_internal_sequences_composition(const alignment& A,int n_leaves) {

  for(int column=0;column<A.length();column++)
    for(int i=n_leaves;i<A.n_sequences();i++) 
    {
      const int value = A(column,i);

      if (value == alphabet::gap or value == alphabet::not_gap)
        continue;

      // The value is separated from the surrounding text by spaces so that
      // it cannot run into "in column" in the message.
      throw myexception()<<"Found an illegal index "<<value
			 <<" in column "<<column<<" of internal sequence '"
			 <<A.seq(i).name<<"': only - and * are allowed";
    }
}
Ejemplo n.º 25
0
/// \brief Compare the presence patterns of two columns, row by row in the order given by \a nodes.
///
/// The first listed row that is non-gap in exactly one of the two columns
/// decides the result: true if it is present in \a c2 (and absent in \a c1),
/// false if it is present in \a c1.  Returns false if no row distinguishes
/// the columns.
bool after(int c1, int c2, const alignment& A,const vector<int>& nodes) {
  assert(nodes.size() == A.n_sequences());

  for(int i=0;i<nodes.size();i++)
  {
    const bool in_c1 = not A.gap(c1,nodes[i]);
    const bool in_c2 = not A.gap(c2,nodes[i]);

    // Present in exactly one column: that column decides.
    if (in_c1 != in_c2)
      return in_c2;
  }

  return false;
}
Ejemplo n.º 26
0
/// \brief Count how often each letter occurs in column \a c of \a A.
///
/// \param A The alignment.
/// \param c The column.
/// \param counts Output; must have one entry per letter of the alphabet.  It is
///        zeroed and then counts[l] is incremented for every cell whose value
///        l satisfies a.is_letter(l).
void count_letters(const alignment& A, int c, valarray<int>& counts)
{
  const alphabet& a = A.get_alphabet();
  assert(counts.size() == a.size());

  counts = 0;

  for(int row=0; row<A.n_sequences(); row++)
  {
    const int letter = A(c,row);

    // Skip anything that is not a letter of the alphabet.
    if (not a.is_letter(letter)) continue;

    counts[letter]++;
  }
}
Ejemplo n.º 27
0
/// \brief Estimate the empirical frequencies of different letters from the alignment
///
/// The letter counts of \a A are converted to frequencies by the alphabet's
/// get_frequencies_from_counts(), which is also given the number of sequences
/// in chop_internal(A).
///
/// \param args The command line parameters (not used by this function).
/// \param A The alignment.
/// \return One frequency per letter of the alphabet.
///
valarray<double> empirical_frequencies(const variables_map& args,const alignment& A) 
{
  const alphabet& a = A.get_alphabet();

  // How often each letter occurs in A
  valarray<double> observed = letter_counts(A);

  // Sized for the alphabet, then filled from the counts
  valarray<double> result(a.size());
  result = A.get_alphabet().get_frequencies_from_counts(observed,chop_internal(A).n_sequences());

  return result;
}
/// \brief Collect the alignment splits that are implied by the positions of the specified alignment
///
/// For each index of the alignment, a split is made from the entries present at that
/// index; each split for which is_valid_split() is true has its get_least_version()
/// inserted into the returned list.
alignment_split_list cath::align::detail::get_preexisting_alignment_splits(const alignment &arg_alignment ///< The alignment to examine
                                                                           ) {
	const size_t num_entries = arg_alignment.num_entries();
	const size_t aln_length  = arg_alignment.length();

	alignment_split_list splits;
	for (size_t index = 0; index < aln_length; ++index) {
		const size_vec        present = entries_present_at_index( arg_alignment, index );
		const alignment_split split   = make_alignment_split( present, num_entries );

		// Skip splits that aren't valid
		if ( ! is_valid_split( split ) ) {
			continue;
		}
		splits.insert( get_least_version( split ) );
	}
	return splits;
}
Ejemplo n.º 29
0
/// \brief Tag the positions lying in stretches between rows \a s1 and \a s2 whose match density exceeds \a I.
///
/// Only columns in which at least one of the two rows has a character are
/// considered; they are numbered 1..L in column order.  A "match" is a
/// considered column where A(i,s1) == A(i,s2).  For each match i, spans that
/// run from match i to match i+j are tried for j = 20 down to 4; the first
/// span whose (number of matches)/(number of considered columns) is strictly
/// greater than \a I has all of its positions tagged.
///
/// \param A The alignment.
/// \param s1 Index of the first row.
/// \param s2 Index of the second row.
/// \param I The density threshold.
/// \return sum(tagged) -- presumably the number of tagged positions, since
///         every entry of tagged is 0 or 1 (confirm against sum()).
unsigned n_with_identity(const alignment& A,int s1,int s2,double I)
{
  // Get matches
  // F[k] = number of matches among the first k considered columns
  vector<int> F(A.length()+1);

  unsigned L=0;   // number of considered columns seen so far
  unsigned T = 0; // number of matches seen so far
  F[0]=0;
  for(int i=0;i<A.length();i++) 
  {
    // Skip columns where neither row has a character
    if (not A.character(i,s1) and not A.character(i,s2)) continue;

    L++;
    
    if (A(i,s1) == A(i,s2))
      T++;

    F[L] = T;
  }
  // Drop the unused tail: only entries 0..L were filled in
  F.resize(L+1);

  // Get positions
  // FI[t] = the (1-based) considered-column position at which the t-th match occurs
  vector<int> FI(T+1);
  FI[0]=0;
  for(int i=0;i<L;i++)
    if (F[i+1] > F[i])
      FI[F[i+1]] = i+1;

  // tag positions that lie inside a span with sufficient match density
  vector<int> tagged(L,0);

  const unsigned w = 4; // smallest offset j tried between the first and last match of a span
  for(int i=1;i<=T;i++) {
    // Try the widest span (j=20) first, then narrower ones down to j=w
    for(int j=20;j>=w;j--) {
      int i2 = i+j;
      if (i2 > T) continue; // the span would extend past the last match
      assert(FI[i]  > 0 and FI[i]  <=L);
      assert(FI[i2] > 0 and FI[i2] <=L);
      assert(FI[i2] > FI[i]);

      // (matches i..i2) / (considered columns from match i to match i2)
      if (double(i2-i+1)/(FI[i2]-FI[i]+1) > I) {
	for(int k=FI[i];k<=FI[i2];k++)
	  tagged[k-1]=1;
	break;
      }
    }
  }

  return sum(tagged);
}
Ejemplo n.º 30
0
/// \brief Check whether two alignments agree on all the sequences that are not ignored.
///
/// Returns false if any non-ignored sequence has a different length in the
/// two alignments, or if A_match() fails for any column of \a A1 and any pair
/// of non-ignored sequences; returns true otherwise.
///
/// \param A1 The first alignment.
/// \param A2 The second alignment; must have as many sequences as \a A1.
/// \param ignore Bit i set means sequence i is skipped; must have at least A1.n_sequences() bits.
bool A_constant(alignment A1, alignment A2, const dynamic_bitset<>& ignore) {
  assert(A1.n_sequences() == A2.n_sequences());

  // equality holds if we have internal node sequences -- otherwise ignore is larger
  assert(A1.n_sequences() <= ignore.size());

  // convert to feature-number notation
  ublas::matrix<int> M1 = M(A1);
  ublas::matrix<int> M2 = M(A2);

  // lookup and cache the column each feature is in
  vector< vector< int> > column_indices = column_lookup(A2);

  //----- Check that the sequence lengths match ------//
  for(int i=0;i<M1.size2();i++)
  {
    if (not ignore[i] and A1.seqlength(i) != A2.seqlength(i))
      return false;
  }

  //----- Check that each homology in A1 is in A2 -----//
  for(int column=0; column<A1.length(); column++)
  {
    for(int first=0; first<A1.n_sequences(); first++)
    {
      if (ignore[first]) continue;

      for(int second=first+1; second<A1.n_sequences(); second++)
      {
        if (ignore[second]) continue;

        const bool matched = A_match(M1,column,first,second,M2,column_indices);
        if (not matched)
          return false;
      }
    }
  }

  return true;
}