示例#1
0
/// \brief TODOCUMENT
///
/// \relates alignment
///
/// The lists of residues are used for finding the indices of residues.
///
/// Should this parse based on:
///  - exact column positions (hence breaking if an extra space is added) or
///  - whitespace splitting (hence breaking if a column is missing, eg with an insert as a space rather than a 0)
alignment cath::align::read_alignment_from_cath_ssap_legacy_format(istream                &arg_istream,     ///< TODOCUMENT
                                                                   const residue_name_vec &arg_res_names_a, ///< TODOCUMENT
                                                                   const residue_name_vec &arg_res_names_b, ///< TODOCUMENT
                                                                   ostream                &arg_stderr       ///< TODOCUMENT
                                                                   ) {
	arg_istream.exceptions( ios::badbit );

	alignment new_alignment( alignment::NUM_ENTRIES_IN_PAIR_ALIGNMENT );

	string line_string;
	size_t pos_a(0);
	size_t pos_b(0);
	opt_score_vec scores;
	while ( getline(arg_istream, line_string ) ) {
		const int             res_num_a = lexical_cast<int>(    trim_copy( line_string.substr(  0, 4 ))); // Column 1: Protein 1 PDB residue number (excluding insert character)
//		const char          sec_struc_a =                                  line_string.at(      5    )  ; // Column 2: Protein 1 Secondary structure character
		const char             insert_a =                                  line_string.at(      7    )  ; // Column 3: Protein 1 PDB residue insert character
		const char         amino_acid_a =                                  line_string.at(      9    )  ; // Column 4: Protein 1 Residue Code (One letter code)
		const size_t              score = lexical_cast<size_t>( trim_copy( line_string.substr( 12, 3 ))); // Column 5: SSAP residue score (0-100)
		const char         amino_acid_b =                                  line_string.at(     17    )  ; // Column 6: Protein 2 Residue Code (One letter code)
		const char             insert_b =                                  line_string.at(     19    )  ; // Column 7: Protein 2 PDB residue insert character
//		const char          sec_struc_b =                                  line_string.at(     21    )  ; // Column 8: Protein 2 Secondary structure character
		const int             res_num_b = lexical_cast<int>(    trim_copy( line_string.substr( 23, 4 ))); // Column 9: Protein 2 PDB residue number (excluding insert character)

		// For each side, move the PDB position forward if necessary
		const residue_name res_name_a = make_residue_name_with_non_insert_char( res_num_a, insert_a, '0');
		const residue_name res_name_b = make_residue_name_with_non_insert_char( res_num_b, insert_b, '0' );
		const opt_size find_a_result = search_for_residue_in_residue_names( pos_a, arg_res_names_a, amino_acid_a, res_name_a, arg_stderr );
		const opt_size find_b_result = search_for_residue_in_residue_names( pos_b, arg_res_names_b, amino_acid_b, res_name_b, arg_stderr );
		pos_a = find_a_result ? *find_a_result : pos_a;
		pos_b = find_b_result ? *find_b_result : pos_b;

		if ( find_a_result && find_b_result ) {
			append_position_both_offset_1( new_alignment, ( *find_a_result ) + 1, ( *find_b_result ) + 1 );
			scores.push_back( numeric_cast<double>( score ) );
		}
		else {
			scores.push_back( none );
			if ( find_a_result ) {
				append_position_a_offset_1( new_alignment, ( *find_a_result ) + 1 );
			}
			else if ( find_b_result ) {
				append_position_b_offset_1( new_alignment, ( *find_b_result ) + 1 );
			}
			else {
				BOOST_THROW_EXCEPTION(runtime_error_exception("Alignment file contains entry with neither residue populated"));
			}
		}
	}

	set_pair_alignment_duplicate_scores( new_alignment, scores );
	return new_alignment;
}
示例#2
0
/// \brief Parse a FASTA format input into a vector of pairs of strings (one for id, one for sequence)
str_str_pair_vec cath::align::read_ids_and_sequences_from_fasta(istream &arg_istream ///< The istream from which to read the FASTA input for parsing
                                                                ) {
	arg_istream.exceptions( ios::badbit );
	str_str_pair_vec sequence_of_id;

	// Loop over the lines in the input
	string line_string;
	while ( getline( arg_istream, line_string ) ) {
		// If there are any non-printing characters, throw an exception
		if ( ! all( line_string, is_print() ) ) {
			BOOST_THROW_EXCEPTION(runtime_error_exception("Line in FASTA input contains non-printing characters"));
		}
		// If this line doesn't start with a '>' and it's the first line (nothing yet in sequence_of_id) then throw an exception
		if ( sequence_of_id.empty() && ! starts_with( line_string, ">" ) ) {
			BOOST_THROW_EXCEPTION(runtime_error_exception("Line in FASTA input expected to be header doesn't begin with '>'"));
		}

		// If this line starts with a >, then treat as a header
		if ( starts_with( line_string, ">" ) ) {
			// Remove the first character
			line_string = line_string.substr( 1 );

			// If the rest of the line is empty then throw an exception
			if ( line_string.empty() ) {
				BOOST_THROW_EXCEPTION(runtime_error_exception("Header line in FASTA doesn't have any characters after initial '>'"));
			}

			// Add a new entry at the back of sequence_of_id with this id and an empty string
			sequence_of_id.push_back( make_pair( line_string, string("") ) );
		}
		// Otherwise this is a line of sequence data
		else {
			// Remove all spaces from the string
			find_format_all( line_string, token_finder( is_space() ), empty_formatter( line_string ) );

			// If any of the (remaining) characters aren't alpha characters or '-'s then throw an exception
			if ( ! all( line_string, is_alpha() || is_any_of( "-" ) ) ) {
				BOOST_THROW_EXCEPTION(runtime_error_exception("Sequence line in FASTA input contains non-space characters that are neither letters nor '-'"));
			}

			// Convert the string to upper case and append it to the back of the sequence string for the most recent entry
			to_upper( line_string );
			assert( ! sequence_of_id.empty() );
			sequence_of_id.back().second += line_string;
		}
	}

	// Return the results of this parsing
	return sequence_of_id;
}
示例#3
0
/// \brief Parse a FASTA format input into an alignment
///
/// At present, each sequences must contain all of the residues of the corresponding PDB
/// (because the index in the PDB is required in the alignment).
///
/// !!Case insensitive!!
///
/// \todo !URGENT! Test what this does when given structures with incomplete residues near the start.
///       It looks like it gives indices in the PDB, rather than in the protein (which only
///       contains backbone-complete residues). This is a serious issue!
///
/// \todo Generalise this so that it's possible to read alignments against a pdb_list or a protein_list
///
/// The code will attempt to handle missing residues with a warning if there are a small number.
/// It will fail if the percentage is too low.
///
/// \relates alignment
alignment cath::align::read_alignment_from_fasta(istream                  &arg_istream,          ///< The istream from which to read the FASTA input for parsing
                                                 const amino_acid_vec_vec &arg_amino_acid_lists, ///< TODOCUMENT
                                                 const str_vec            &arg_names,            ///< A vector of names, each of which should be found within the corresponding sequence's ID
                                                 ostream                  &arg_stderr            ///< An ostream to which any warnings should be output (currently unused)
                                                 ) {
		if ( arg_amino_acid_lists.empty() ) {
			BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot load a FASTA alignment with 0 PDB entries"));
		}
		const size_t num_entries = arg_amino_acid_lists.size();
		if ( arg_names.size() != num_entries ) {
			BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot load a FASTA alignment with a different number of names and PDB entries"));
		}

		arg_istream.exceptions( ios::badbit );

		try {
			const str_str_pair_vec sequence_of_id = read_ids_and_sequences_from_fasta( arg_istream );
			const size_t num_sequences = sequence_of_id.size();
			if ( num_entries != num_sequences ) {
				BOOST_THROW_EXCEPTION(runtime_error_exception(
					"Number of sequences parsed from FASTA ("
					+ lexical_cast<string>( num_sequences )
					+ ") doesn't match the number of PDBs/names ("
					+ lexical_cast<string>( num_entries   )
					+ ")"
				));
			}

			const size_t sequence_length = sequence_of_id.front().second.length();

			opt_aln_posn_vec_vec positions;
			positions.reserve( num_entries );
			for (size_t entry_ctr = 0; entry_ctr < num_entries; ++entry_ctr) {
				const amino_acid_vec &amino_acids     = arg_amino_acid_lists      [ entry_ctr ];
				const string         &name            = arg_names     [ entry_ctr ];
				const str_str_pair   &id_and_sequence = sequence_of_id[ entry_ctr ];
				const string         &id              = id_and_sequence.first;
				const string         &sequence        = id_and_sequence.second;

				if ( sequence.length() != sequence_length ) {
					BOOST_THROW_EXCEPTION(runtime_error_exception(
						"When attempting to parse entry number "
						+ lexical_cast<string>( entry_ctr + 1     )
						+ " of FASTA alignment, the length of the sequence ("
						+ lexical_cast<string>( sequence.length() )
						+ ") does not match the length of the first sequence ("
						+ lexical_cast<string>( sequence_length   )
						+ ")"
					));
				}

				if ( ! icontains( id, name ) ) {
					BOOST_THROW_EXCEPTION(runtime_error_exception(
						"When attempting to parse entry number "
						+ lexical_cast<string>( entry_ctr + 1 )
						+ " of FASTA alignment, name \""
						+ name
						+ "\" could not be found in a case-insensitive search within FASTA header ID \""
						+ id
						+ "\""
					));
				}

				positions.push_back( align_sequence_to_amino_acids( sequence, amino_acids, name, arg_stderr ) );
			}

			return alignment( positions );
		}
		// Catch any I/O exceptions
		catch (const std::exception &ex) {
			const string error_message(string("Cannot read FASTA legacy alignment file [") + ex.what() + "] ");
			perror(error_message.c_str());
			BOOST_THROW_EXCEPTION(runtime_error_exception(error_message));
		};
	}
示例#4
0
/// \brief TODOCUMENT
///
/// \relates alignment
///
/// CORA file format
///
///  The header consists of the following
///  - One format line '#FM CORA_FORMAT 1.1'
///  - Any number of comment lines '#CC'
///  - Total number of proteins in the alignment
///  - All CATH domain names in the alignment
///  - Total number of alignment positions
///
/// For example:
///
///     #FM CORA_FORMAT 1.1
///     #CC
///     #CC Any number of comment lines (200 characters max per line)
///     #CC
///     #CC
///     3
///     6insE0 1igl00 1bqt00
///     73
///
/// The body consists of the following:
///
///          START       PROT 1     PROT 2     PROT N         END
///     <------------><---------><---------><---------><---------------->
///     ddddxddddxddddxddddcxcxxcxddddcxcxxcxddddcxcxxcxxxcxddddxddddxxdd
///
///        1    0    1    0  0  0    1  A  0    0  0  0   0    0    0   0
///        2    0    1    0  0  0    2  Y  0    0  0  0   0    0    0   0
///        3    0    2    1B F  0    3  R  0    0  0  0   0    0    0   0
///        4    0    3    2B V  H    4  P  0    1  G  0   0    1    0   2
///        5    1    3    3B N  H    5  S  0    2  P  0   0    1    0   6
///        6    0    3    4B Q  H    6  E  0    3  E  0   0    1    0   2
///
/// START (14 characters) :
///   - Column 1: Alignment Position (dddd)
///   - Column 2: No. of position selected for structural template (dddd)
///   - Column 3: No. of proteins aligned at this position (dddd)
///
/// PROT 1,2,3... (11 characters per protein)
///   - Column 4 (7,10 etc): Residue number in PDB file (ddddc) 4 digit number
///   -    + 1 character insert code
///   -    Importantly the insert code is always in the same position not within
///   -    the 4 characters reserved for the pdb number (see below)
///   - Column 5 (8,11 etc): Amino Acid Code (c)
///   - Column 6 (9,12 etc): Secondary Structure Assignment (c)
///
/// END (18 characters)
///   - Last Column-3: Consensus Secondary Structure Assignment (c)
///   - Last Column-2: No. of alpha residues at this position (dddd)
///   - Last Column-1: No. of beta  residues at this position (dddd)
///   - Last Column: Structural Conservation Score (dd)
alignment cath::align::read_alignment_from_cath_cora_legacy_format(istream        &arg_istream, ///< TODOCUMENT
                                                                   const pdb_list &arg_pdbs,    ///< TODOCUMENT
                                                                   ostream        &arg_stderr   ///< TODOCUMENT
                                                                   ) {
	const size_t CHARS_IN_MAIN_DATA_LINE_START = 14;
	const size_t CHARS_IN_MAIN_DATA_LINE_PROT  = 11;
	const size_t CHARS_IN_MAIN_DATA_LINE_END   = 18;

	if (arg_pdbs.empty()) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot load a CORA legacy alignment with 0 PDB entries"));
	}

	arg_istream.exceptions(ios::badbit);
	try {
		residue_name_vec_vec residue_names_of_first_chains;
		for (const pdb &arg_pdb : arg_pdbs) {
			residue_names_of_first_chains.push_back( arg_pdb.get_residue_names_of_first_chain__backbone_unchecked() );
		}

		// Check the first line is the file format line
		string line_string;
		getline(arg_istream, line_string);
		if (!starts_with(line_string, "#FM CORA_FORMAT ")) {
			BOOST_THROW_EXCEPTION(runtime_error_exception("No CORA header file format line"));
		}

		// Skip any comment lines
		while (getline(arg_istream, line_string) && starts_with(line_string, "#CC")) {
		}

		// Grab the number of proteins and ensure the alignment matches
		const size_t num_proteins = lexical_cast<size_t>(line_string);
		if (num_proteins != arg_pdbs.size()) {
			BOOST_THROW_EXCEPTION(invalid_argument_exception("Number of PDBs in CORA file is " + lexical_cast<string>(num_proteins) + ", which does not match " + lexical_cast<string>(arg_pdbs.size())));
		}
		const size_t num_chars_in_main_data_line = CHARS_IN_MAIN_DATA_LINE_START + (num_proteins * CHARS_IN_MAIN_DATA_LINE_PROT) + CHARS_IN_MAIN_DATA_LINE_END;

		// Grab the protein names
		getline( arg_istream, line_string );
		trim( line_string );
		const str_vec names = split_build<str_vec>( line_string, is_space() );
		if ( names.size() != num_proteins ) {
			BOOST_THROW_EXCEPTION(runtime_error_exception("Splitting on space does not give " + lexical_cast<string>(num_proteins) + " entries in CORA alignment names line: \"" + line_string + "\""));
		}

		// Grab the total number of alignment positions
		getline(arg_istream, line_string);
		const size_t num_positions = lexical_cast<size_t>(line_string);

		// Prepare the data structures to populate
		aln_posn_vec posns( num_proteins, 0 );
		opt_score_vec scores;
		scores.reserve( num_positions );
		opt_aln_posn_vec_vec data( num_proteins );
		for (opt_aln_posn_vec &data_col : data) {
			data_col.reserve( num_positions );
		}
		// Loop over the main data section
		while (getline(arg_istream, line_string)) {
			// Check the line is of the correct length
			if (line_string.length() != num_chars_in_main_data_line) {
				BOOST_THROW_EXCEPTION(runtime_error_exception("Number of characters in main data line does not equal " + lexical_cast<string>(num_chars_in_main_data_line)));
			}

			// Grab the global details from start of this line
			const size_t      alignment_posn = lexical_cast<size_t>( trim_copy( line_string.substr(  0, 4 ))); // Column 1: Alignment Position (dddd)
//			const size_t num_entries_in_temp = lexical_cast<size_t>( trim_copy( line_string.substr(  5, 4 ))); // Column 2: No. of position selected for structural template (dddd)
			const size_t num_entries_in_posn = lexical_cast<size_t>( trim_copy( line_string.substr( 10, 4 ))); // Column 2: No. of position selected for structural template (dddd)

			if (alignment_posn != data.front().size() + 1) {
				BOOST_THROW_EXCEPTION(runtime_error_exception("Alignment position counter " + lexical_cast<string>(alignment_posn) + " does not match " + lexical_cast<string>(data.front().size() + 1)));
			}

			// Loop over the indices of the proteins
			size_t num_present_posns(0);
			for (size_t prot_ctr = 0; prot_ctr < num_proteins; ++prot_ctr) {
				// Prepare string and other data for this protein
				const size_t        prot_string_offset = CHARS_IN_MAIN_DATA_LINE_START + prot_ctr * CHARS_IN_MAIN_DATA_LINE_PROT;
				const string        prot_string        = line_string.substr( prot_string_offset, CHARS_IN_MAIN_DATA_LINE_PROT );
				opt_aln_posn_vec   &data_col           = data [ prot_ctr ];
				aln_posn_type       &posn              = posns[ prot_ctr ];

				// Grab the the details for this protein
				const int              residue_num = lexical_cast<int>(          trim_copy( prot_string.substr(  1, 4 ))); // Column 4 (7,10 etc): Residue number in PDB file (ddddc) 4 digit number
				const char             insert_code =                                        prot_string.at(      5    )  ; //    + 1 character insert code
				const char              amino_acid =                                        prot_string.at(      7    )  ; // Column 5 (8,11 etc): Amino Acid Code (c)
//				const char               sec_struc =                                        prot_string.at(     10    )  ; // Column 6 (9,12 etc): Secondary Structure Assignment (c)

				// Find the residue in the list of this PDB's residue names
				const residue_name_vec &residues_names = residue_names_of_first_chains[ prot_ctr ];
				const residue_name      res_name       = make_residue_name_with_non_insert_char( residue_num, insert_code, ' ' );
				const opt_aln_posn      find_result    = search_for_residue_in_residue_names(
					posn,
					residues_names,
					amino_acid,
					res_name,
					arg_stderr
				);
				data_col.push_back( find_result ? opt_aln_posn( ( *find_result ) + 1 ) : opt_aln_posn( none ) );
				if ( find_result ) {
					posn = *find_result;
					++num_present_posns;
				}
			}
			if (num_present_posns != num_entries_in_posn) {
				BOOST_THROW_EXCEPTION(runtime_error_exception(
						"Number of positions for alignment_posn " + lexical_cast<string>(alignment_posn)
						+ " was " + lexical_cast<string>(num_present_posns)
						+ " not " + lexical_cast<string>(num_entries_in_posn)
				));
			}

			// Prepare the string for the global details at the end of this line
			const size_t end_string_offset = CHARS_IN_MAIN_DATA_LINE_START + num_proteins * CHARS_IN_MAIN_DATA_LINE_PROT;
			const string end_string = line_string.substr( end_string_offset, CHARS_IN_MAIN_DATA_LINE_END );

			// Grab the global details from start of this line
//			const size_t      cons_sec_struc =                                         end_string.at(      3    )  ; // Last Column-3: Consensus Secondary Structure Assignment (c)
//			const size_t       num_alpha_res = lexical_cast<size_t>( trim_copy(  end_string.substr(  5, 4 ))); // Last Column-2: No. of alpha residues at this position (dddd)
//			const size_t        num_beta_res = lexical_cast<size_t>( trim_copy(  end_string.substr( 10, 4 ))); // Last Column-1: No. of beta residues at this position (dddd)
			const size_t          cons_score = lexical_cast<size_t>( trim_copy(  end_string.substr( 16, 2 ))); // Last Column: Structural Conservation Score (dd)

			scores.push_back( numeric_cast<double>( cons_score ) );
//			// If there are multiple entries in this position then store the score
//			if (num_entries_in_posn > 1) {
////				cerr << "Adding score for " << alignment_posn-1 << endl;
//				scores.push_back(cons_score);
//			}
		}

		if ( num_positions != data.front().size() ) {
			BOOST_THROW_EXCEPTION(runtime_error_exception(
				"CORA legacy alignment number of positions was "
				+ lexical_cast<string>( data.front().size() )
				+ " not "
				+ lexical_cast<string>( num_positions )
			) );
		}

		alignment new_alignment = alignment_offset_1_factory( data );

		// Create a scores matrix and then empty any cells that are absent from the alignment
		opt_score_vec_vec all_scores( new_alignment.num_entries(), scores );
		for (size_t entry = 0; entry < new_alignment.num_entries(); ++entry) {
			for (size_t index = 0; index < new_alignment.length(); ++index) {
				if ( ! has_position_of_entry_of_index( new_alignment, entry, index ) ) {
					all_scores[ entry ][ index ] = none;
				}
			}
		}
		set_scores( new_alignment, all_scores);
		return new_alignment;
	}
	// Catch any I/O exceptions
	catch (const std::exception &ex) {
		const string error_message(string("Cannot read CORA legacy alignment file [") + ex.what() + "] ");
		perror(error_message.c_str());
		BOOST_THROW_EXCEPTION(runtime_error_exception(error_message));
	};
}