/// \brief Build the permutation that maps the original amino acid ordering onto the new amino acid ordering
///
/// Entry i of the result is the index within arg_new_aa_ordering at which the
/// i-th amino acid of arg_orig_aa_ordering was found.
///
/// \throws invalid_argument_exception if the two orderings differ in size or if the index
///         located for an original amino acid is not a valid index into the new ordering
///
/// \todo Tidy up this code
size_vec substitution_matrix::order_permutation(const amino_acid_vec &arg_orig_aa_ordering, ///< The original amino acid ordering
                                                const amino_acid_vec &arg_new_aa_ordering   ///< The new amino acid ordering
                                                ) {
	// Both orderings must list the same number of amino acids
	const size_t num_orig_aas = arg_orig_aa_ordering.size();
	if ( arg_new_aa_ordering.size() != num_orig_aas ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Mismatch in the number of amino acids"));
	}

	// The start of the new ordering, from which every located position is measured
	const auto new_begin_itr = common::cbegin( arg_new_aa_ordering );

	// Allocate the result up front: there will be one entry per original amino acid
	size_vec result;
	result.reserve( num_orig_aas );

	for (const amino_acid &the_aa : arg_orig_aa_ordering) {
		// Locate this amino acid in the new ordering and convert the hit to an offset
		const auto   found_itr = find( arg_new_aa_ordering, the_aa );
		const size_t new_index = numeric_cast<size_t>( distance( new_begin_itr, found_itr ) );

		// An offset at or beyond the number of amino acids means nothing was located
		if ( new_index >= num_orig_aas ) {
			BOOST_THROW_EXCEPTION(invalid_argument_exception("Amino acid not found in new amino acids"));
		}

		result.push_back( new_index );
	}

	return result;
}
Example #2
0
/// \brief Align a sequence against a corresponding pdb
///        (broadly handling residues missing in the sequence but not extra residues)
///
/// The sequence is walked left to right with a single forward-only cursor into arg_amino_acids.
/// For each non-'-' letter, the cursor advances (recording every PDB residue it steps over as
/// skipped) until it reaches a residue whose letter matches.
///
/// Note: PDB residues that lie beyond the last matched residue are never stepped over and so
/// are not counted as skipped.
///
/// \returns A vector of opt_aln_posns corresponding to the letters of the sequence. Each is:
///           * none if the entry is a '-' character
///           * the index of the corresponding residue in arg_pdb otherwise
///
/// \throws runtime_error_exception if a sequence letter cannot be matched against any of the
///         remaining PDB residues, or if the fraction of PDB residues that were not skipped
///         is below MIN_FRAC_OF_PDB_RESIDUES_IN_SEQ
opt_aln_posn_vec cath::align::align_sequence_to_amino_acids(const string         &arg_sequence_string, ///< The raw sequence string (no headers; no whitespace) to be aligned
                                                            const amino_acid_vec &arg_amino_acids,     ///< The PDB against which the sequence is to be aligned
                                                            const string         &arg_name,            ///< The name of the entry to use in warnings / errors
                                                            ostream              &/*arg_stderr*/       ///< The ostream to which warnings should be output
                                                            ) {
	const size_t sequence_length  = arg_sequence_string.length();
	const size_t num_pdb_residues = arg_amino_acids.size();

	// Prepare the variables to be populated when looping through the sequence
	str_vec skipped_residues;
	opt_aln_posn_vec new_posns;
	new_posns.reserve( sequence_length );
	size_t pdb_ctr = 0;

	// Loop along the sequence
	for (size_t seq_ctr = 0; seq_ctr < sequence_length; ++seq_ctr) {
		const char sequence_char = arg_sequence_string[ seq_ctr ];

		// If this is a '-' character then add none to the back of new_posns
		if ( sequence_char == '-' ) {
			new_posns.push_back( none );
			continue;
		}

		// Otherwise, it's an amino-acid letter: continue searching pdb_ctr through the PDB
		// until it matches this letter, recording each residue that gets stepped over.
		//
		// The bounds test must come before the element access because pdb_ctr may already
		// equal num_pdb_residues here (empty PDB, or the previous letter matched the last residue)
		while ( pdb_ctr < num_pdb_residues && sequence_char != arg_amino_acids[ pdb_ctr ].get_letter() ) {
			skipped_residues.push_back( lexical_cast<string>( pdb_ctr ) );
			++pdb_ctr;
		}

		// If pdb_ctr has overrun the end, then throw an exception
		if ( pdb_ctr >= num_pdb_residues ) {
			BOOST_THROW_EXCEPTION(runtime_error_exception(
				"When aligning a sequence to a PDB for "
				+ arg_name
				+ ", could not find match in PDB for residue  '"
				+ sequence_char
				+ "' at position "
				+ lexical_cast<string>( seq_ctr )
			));
		}

		// Add the found PDB position to the back of new_posns
		new_posns.push_back( pdb_ctr );

		// Increment the pdb_ctr to the next residue
		++pdb_ctr;
	}

	// Sanity check the number skipped before using it in an unsigned subtraction
	const size_t num_posns_skipped = skipped_residues.size();
	if ( num_posns_skipped > num_pdb_residues ) {
		BOOST_THROW_EXCEPTION(runtime_error_exception("The number of residues skipped exceeds the total number of residues"));
	}
	const size_t num_posns_found = num_pdb_residues - num_posns_skipped;

	// If the number of residues found is an unacceptably low fraction of residues in the PDB,
	// then throw an exception
	const double fraction_pdb_residues_found = numeric_cast<double>( num_posns_found ) / numeric_cast<double>( num_pdb_residues );
	if ( fraction_pdb_residues_found < MIN_FRAC_OF_PDB_RESIDUES_IN_SEQ ) {
		BOOST_THROW_EXCEPTION(runtime_error_exception(
			"When aligning a sequence to a PDB for "
			+ arg_name
			+ ", only found matches for "
			+ lexical_cast<string>( num_posns_found )
			+ " of the "
			+ lexical_cast<string>( num_pdb_residues ) // The total in the PDB, not the number consumed so far
			+ " residues in the PDB"
		));
	}

	// If not all residues were found, then output a warning listing the skipped residues
	if ( num_posns_found < num_pdb_residues ) {
		BOOST_LOG_TRIVIAL( warning ) << "When aligning a sequence to a PDB for \""
		                             << arg_name
		                             << "\", "
		                             << lexical_cast<string>( num_posns_skipped )
		                             << " of the PDB's "
		                             << lexical_cast<string>( num_pdb_residues )
		                             << " residues were missing in the sequence and had to be inserted (residue indices, using offset of 0 : "
		                             << join( skipped_residues, ", " )
		                             << ")";
	}

	// Return the result of this work
	return new_posns;
}