/// \brief Construct the permutation represented by changing from the original amino acid ordering to the new amino acid ordering /// /// \todo Tidied up this code size_vec substitution_matrix::order_permutation(const amino_acid_vec &arg_orig_aa_ordering, ///< The original amino acid ordering const amino_acid_vec &arg_new_aa_ordering ///< The new amino acid ordering ) { // Grab the number of amino acids and check it's consistent const size_t num_amino_acids = arg_orig_aa_ordering.size(); if ( num_amino_acids != arg_new_aa_ordering.size() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Mismatch in the number of amino acids")); } // Construct a permutation and reserve the memory size_vec permutation; permutation.reserve( num_amino_acids ); // For each of the amino acids in the original ordering... for (const amino_acid &orig_amino_acid : arg_orig_aa_ordering) { // Find the index of the amino acid in the new ordering const size_t index_in_new = numeric_cast<size_t>( distance( common::cbegin( arg_new_aa_ordering ), find( arg_new_aa_ordering, orig_amino_acid ) ) ); // ...check the index is valid... if ( index_in_new >= num_amino_acids ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Amino acid not found in new amino acids")); } // ...and then add the index to the back of the permutation permutation.push_back( index_in_new ); } // Return the resulting permutation return permutation; }
/// \brief Align a sequence against a corresponding pdb /// (broadly handling residues missing in the sequence but not extra residues) /// /// \returns A vector of opt_aln_posns corresponding to the letters of the sequence. Each is: /// * none if the entry is a '-' character /// * the index of the corresponding residue in arg_pdb otherwise opt_aln_posn_vec cath::align::align_sequence_to_amino_acids(const string &arg_sequence_string, ///< The raw sequence string (no headers; no whitespace) to be aligned const amino_acid_vec &arg_amino_acids, ///< The PDB against which the sequence is to be aligned const string &arg_name, ///< The name of the entry to use in warnings / errors ostream &/*arg_stderr*/ ///< The ostream to which warnings should be output ) { const size_t sequence_length = arg_sequence_string.length(); const size_t num_pdb_residues = arg_amino_acids.size(); // Prepare the variables to be populated when looping through the sequence str_vec skipped_residues; opt_aln_posn_vec new_posns; new_posns.reserve( sequence_length ); size_t pdb_ctr = 0; // Loop along the sequence for (size_t seq_ctr = 0; seq_ctr < sequence_length; ++seq_ctr) { const char &sequence_char = arg_sequence_string[ seq_ctr ]; // If this is a '-' character then add none to the back of new_posns if ( sequence_char == '-' ) { new_posns.push_back( none ); } // Otherwise, it's an amino-acid letter else { // Continue searching pdb_ctr through the PDB until it matches this letter while ( sequence_char != arg_amino_acids[ pdb_ctr ].get_letter() ) { skipped_residues.push_back( lexical_cast<string>( pdb_ctr ) ); // Increment pdb_ctr ++pdb_ctr; // If pdb_ctr has overrun the end, then throw an exception if ( pdb_ctr >= num_pdb_residues ) { BOOST_THROW_EXCEPTION(runtime_error_exception( "When aligning a sequence to a PDB for " + arg_name + ", could not find match in PDB for residue '" + sequence_char + "' at position " + lexical_cast<string>( seq_ctr ) )); } } // Add the found PDB position to the back of new_positions new_posns.push_back( pdb_ctr ); // Increment the pdb_ctr to the next residue ++pdb_ctr; } } const size_t num_posns_skipped = skipped_residues.size(); const size_t num_posns_found = num_pdb_residues - num_posns_skipped; if ( num_posns_skipped > num_pdb_residues ) { BOOST_THROW_EXCEPTION(runtime_error_exception("The number of residues skipped exceeds the total number of residues")); } // If the number of residues found is an unacceptably low fraction of residues in the PDB, // then throw an exception const double fraction_pdb_residues_found = numeric_cast<double>( num_posns_found ) / numeric_cast<double>( num_pdb_residues ); if ( fraction_pdb_residues_found < MIN_FRAC_OF_PDB_RESIDUES_IN_SEQ ) { BOOST_THROW_EXCEPTION(runtime_error_exception( "When aligning a sequence to a PDB for " + arg_name + ", only found matches for " + lexical_cast<string>( num_posns_found ) + " of the " + lexical_cast<string>( pdb_ctr ) + " residues in the PDB" )); } // If not all residues were found, then output a warning about if ( num_posns_found < num_pdb_residues ) { BOOST_LOG_TRIVIAL( warning ) << "When aligning a sequence to a PDB for \"" << arg_name << "\", " << lexical_cast<string>( num_pdb_residues - num_posns_found ) << " of the PDB's " << lexical_cast<string>( num_pdb_residues ) << " residues were missing in the sequence and had to be inserted (residue indices, using offset of 0 : " << join( skipped_residues, ", " ) << ")"; } // Return the result of this work return new_posns; }