/// \brief TODOCUMENT /// /// \relates alignment /// /// The lists of residues are used for finding the indices of residues. /// /// Should this parse based on: /// - exact column positions (hence breaking if an extra space is added) or /// - whitespace splitting (hence breaking if a column is missing, eg with an insert as a space rather than a 0) alignment cath::align::read_alignment_from_cath_ssap_legacy_format(istream &arg_istream, ///< TODOCUMENT const residue_name_vec &arg_res_names_a, ///< TODOCUMENT const residue_name_vec &arg_res_names_b, ///< TODOCUMENT ostream &arg_stderr ///< TODOCUMENT ) { arg_istream.exceptions( ios::badbit ); alignment new_alignment( alignment::NUM_ENTRIES_IN_PAIR_ALIGNMENT ); string line_string; size_t pos_a(0); size_t pos_b(0); opt_score_vec scores; while ( getline(arg_istream, line_string ) ) { const int res_num_a = lexical_cast<int>( trim_copy( line_string.substr( 0, 4 ))); // Column 1: Protein 1 PDB residue number (excluding insert character) // const char sec_struc_a = line_string.at( 5 ) ; // Column 2: Protein 1 Secondary structure character const char insert_a = line_string.at( 7 ) ; // Column 3: Protein 1 PDB residue insert character const char amino_acid_a = line_string.at( 9 ) ; // Column 4: Protein 1 Residue Code (One letter code) const size_t score = lexical_cast<size_t>( trim_copy( line_string.substr( 12, 3 ))); // Column 5: SSAP residue score (0-100) const char amino_acid_b = line_string.at( 17 ) ; // Column 6: Protein 2 Residue Code (One letter code) const char insert_b = line_string.at( 19 ) ; // Column 7: Protein 2 PDB residue insert character // const char sec_struc_b = line_string.at( 21 ) ; // Column 8: Protein 2 Secondary structure character const int res_num_b = lexical_cast<int>( trim_copy( line_string.substr( 23, 4 ))); // Column 9: Protein 2 PDB residue number (excluding insert character) // For each side, move the PDB position forward if necessary const residue_name res_name_a = make_residue_name_with_non_insert_char( res_num_a, insert_a, '0'); const residue_name res_name_b = make_residue_name_with_non_insert_char( res_num_b, insert_b, '0' ); const opt_size find_a_result = search_for_residue_in_residue_names( pos_a, arg_res_names_a, amino_acid_a, res_name_a, arg_stderr ); const opt_size find_b_result = search_for_residue_in_residue_names( pos_b, arg_res_names_b, amino_acid_b, res_name_b, arg_stderr ); pos_a = find_a_result ? *find_a_result : pos_a; pos_b = find_b_result ? *find_b_result : pos_b; if ( find_a_result && find_b_result ) { append_position_both_offset_1( new_alignment, ( *find_a_result ) + 1, ( *find_b_result ) + 1 ); scores.push_back( numeric_cast<double>( score ) ); } else { scores.push_back( none ); if ( find_a_result ) { append_position_a_offset_1( new_alignment, ( *find_a_result ) + 1 ); } else if ( find_b_result ) { append_position_b_offset_1( new_alignment, ( *find_b_result ) + 1 ); } else { BOOST_THROW_EXCEPTION(runtime_error_exception("Alignment file contains entry with neither residue populated")); } } } set_pair_alignment_duplicate_scores( new_alignment, scores ); return new_alignment; }
/// \brief Parse a FASTA format input into a vector of pairs of strings (one for id, one for sequence) str_str_pair_vec cath::align::read_ids_and_sequences_from_fasta(istream &arg_istream ///< The istream from which to read the FASTA input for parsing ) { arg_istream.exceptions( ios::badbit ); str_str_pair_vec sequence_of_id; // Loop over the lines in the input string line_string; while ( getline( arg_istream, line_string ) ) { // If there are any non-printing characters, throw an exception if ( ! all( line_string, is_print() ) ) { BOOST_THROW_EXCEPTION(runtime_error_exception("Line in FASTA input contains non-printing characters")); } // If this line doesn't start with a '>' and it's the first line (nothing yet in sequence_of_id) then throw an exception if ( sequence_of_id.empty() && ! starts_with( line_string, ">" ) ) { BOOST_THROW_EXCEPTION(runtime_error_exception("Line in FASTA input expected to be header doesn't begin with '>'")); } // If this line starts with a >, then treat as a header if ( starts_with( line_string, ">" ) ) { // Remove the first character line_string = line_string.substr( 1 ); // If the rest of the line is empty then throw an exception if ( line_string.empty() ) { BOOST_THROW_EXCEPTION(runtime_error_exception("Header line in FASTA doesn't have any characters after initial '>'")); } // Add a new entry at the back of sequence_of_id with this id and an empty string sequence_of_id.push_back( make_pair( line_string, string("") ) ); } // Otherwise this is a line of sequence data else { // Remove all spaces from the string find_format_all( line_string, token_finder( is_space() ), empty_formatter( line_string ) ); // If any of the (remaining) characters aren't alpha characters or '-'s then throw an exception if ( ! all( line_string, is_alpha() || is_any_of( "-" ) ) ) { BOOST_THROW_EXCEPTION(runtime_error_exception("Sequence line in FASTA input contains non-space characters that are neither letters nor '-'")); } // Convert the string to upper case and append it to the back of the sequence string for the most recent entry to_upper( line_string ); assert( ! sequence_of_id.empty() ); sequence_of_id.back().second += line_string; } } // Return the results of this parsing return sequence_of_id; }
/// \brief Parse a FASTA format input into an alignment /// /// At present, each sequences must contain all of the residues of the corresponding PDB /// (because the index in the PDB is required in the alignment). /// /// !!Case insensitive!! /// /// \todo !URGENT! Test what this does when given structures with incomplete residues near the start. /// It looks like it gives indices in the PDB, rather than in the protein (which only /// contains backbone-complete residues). This is a serious issue! /// /// \todo Generalise this so that it's possible to read alignments against a pdb_list or a protein_list /// /// The code will attempt to handle missing residues with a warning if there are a small number. /// It will fail if the percentage is too low. /// /// \relates alignment alignment cath::align::read_alignment_from_fasta(istream &arg_istream, ///< The istream from which to read the FASTA input for parsing const amino_acid_vec_vec &arg_amino_acid_lists, ///< TODOCUMENT const str_vec &arg_names, ///< A vector of names, each of which should be found within the corresponding sequence's ID ostream &arg_stderr ///< An ostream to which any warnings should be output (currently unused) ) { if ( arg_amino_acid_lists.empty() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot load a FASTA alignment with 0 PDB entries")); } const size_t num_entries = arg_amino_acid_lists.size(); if ( arg_names.size() != num_entries ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot load a FASTA alignment with a different number of names and PDB entries")); } arg_istream.exceptions( ios::badbit ); try { const str_str_pair_vec sequence_of_id = read_ids_and_sequences_from_fasta( arg_istream ); const size_t num_sequences = sequence_of_id.size(); if ( num_entries != num_sequences ) { BOOST_THROW_EXCEPTION(runtime_error_exception( "Number of sequences parsed from FASTA (" + lexical_cast<string>( num_sequences ) + ") doesn't match the number of PDBs/names (" + lexical_cast<string>( num_entries ) + ")" )); } const size_t sequence_length = sequence_of_id.front().second.length(); opt_aln_posn_vec_vec positions; positions.reserve( num_entries ); for (size_t entry_ctr = 0; entry_ctr < num_entries; ++entry_ctr) { const amino_acid_vec &amino_acids = arg_amino_acid_lists [ entry_ctr ]; const string &name = arg_names [ entry_ctr ]; const str_str_pair &id_and_sequence = sequence_of_id[ entry_ctr ]; const string &id = id_and_sequence.first; const string &sequence = id_and_sequence.second; if ( sequence.length() != sequence_length ) { BOOST_THROW_EXCEPTION(runtime_error_exception( "When attempting to parse entry number " + lexical_cast<string>( entry_ctr + 1 ) + " of FASTA alignment, the length of the sequence (" + lexical_cast<string>( sequence.length() ) + ") does not match the length of the first sequence (" + lexical_cast<string>( sequence_length ) + ")" )); } if ( ! icontains( id, name ) ) { BOOST_THROW_EXCEPTION(runtime_error_exception( "When attempting to parse entry number " + lexical_cast<string>( entry_ctr + 1 ) + " of FASTA alignment, name \"" + name + "\" could not be found in a case-insensitive search within FASTA header ID \"" + id + "\"" )); } positions.push_back( align_sequence_to_amino_acids( sequence, amino_acids, name, arg_stderr ) ); } return alignment( positions ); } // Catch any I/O exceptions catch (const std::exception &ex) { const string error_message(string("Cannot read FASTA legacy alignment file [") + ex.what() + "] "); perror(error_message.c_str()); BOOST_THROW_EXCEPTION(runtime_error_exception(error_message)); }; }
/// \brief TODOCUMENT /// /// \relates alignment /// /// CORA file format /// /// The header consists of the following /// - One format line '#FM CORA_FORMAT 1.1' /// - Any number of comment lines '#CC' /// - Total number of proteins in the alignment /// - All CATH domain names in the alignment /// - Total number of alignment positions /// /// For example: /// /// #FM CORA_FORMAT 1.1 /// #CC /// #CC Any number of comment lines (200 characters max per line) /// #CC /// #CC /// 3 /// 6insE0 1igl00 1bqt00 /// 73 /// /// The body consists of the following: /// /// START PROT 1 PROT 2 PROT N END /// <------------><---------><---------><---------><----------------> /// ddddxddddxddddxddddcxcxxcxddddcxcxxcxddddcxcxxcxxxcxddddxddddxxdd /// /// 1 0 1 0 0 0 1 A 0 0 0 0 0 0 0 0 /// 2 0 1 0 0 0 2 Y 0 0 0 0 0 0 0 0 /// 3 0 2 1B F 0 3 R 0 0 0 0 0 0 0 0 /// 4 0 3 2B V H 4 P 0 1 G 0 0 1 0 2 /// 5 1 3 3B N H 5 S 0 2 P 0 0 1 0 6 /// 6 0 3 4B Q H 6 E 0 3 E 0 0 1 0 2 /// /// START (14 characters) : /// - Column 1: Alignment Position (dddd) /// - Column 2: No. of position selected for structural template (dddd) /// - Column 3: No. of proteins aligned at this position (dddd) /// /// PROT 1,2,3... (11 characters per protein) /// - Column 4 (7,10 etc): Residue number in PDB file (ddddc) 4 digit number /// - + 1 character insert code /// - Importantly the insert code is always in the same position not within /// - the 4 characters reserved for the pdb number (see below) /// - Column 5 (8,11 etc): Amino Acid Code (c) /// - Column 6 (9,12 etc): Secondary Structure Assignment (c) /// /// END (18 characters) /// - Last Column-3: Consensus Secondary Structure Assignment (c) /// - Last Column-2: No. of alpha residues at this position (dddd) /// - Last Column-1: No. of beta residues at this position (dddd) /// - Last Column: Structural Conservation Score (dd) alignment cath::align::read_alignment_from_cath_cora_legacy_format(istream &arg_istream, ///< TODOCUMENT const pdb_list &arg_pdbs, ///< TODOCUMENT ostream &arg_stderr ///< TODOCUMENT ) { const size_t CHARS_IN_MAIN_DATA_LINE_START = 14; const size_t CHARS_IN_MAIN_DATA_LINE_PROT = 11; const size_t CHARS_IN_MAIN_DATA_LINE_END = 18; if (arg_pdbs.empty()) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot load a CORA legacy alignment with 0 PDB entries")); } arg_istream.exceptions(ios::badbit); try { residue_name_vec_vec residue_names_of_first_chains; for (const pdb &arg_pdb : arg_pdbs) { residue_names_of_first_chains.push_back( arg_pdb.get_residue_names_of_first_chain__backbone_unchecked() ); } // Check the first line is the file format line string line_string; getline(arg_istream, line_string); if (!starts_with(line_string, "#FM CORA_FORMAT ")) { BOOST_THROW_EXCEPTION(runtime_error_exception("No CORA header file format line")); } // Skip any comment lines while (getline(arg_istream, line_string) && starts_with(line_string, "#CC")) { } // Grab the number of proteins and ensure the alignment matches const size_t num_proteins = lexical_cast<size_t>(line_string); if (num_proteins != arg_pdbs.size()) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Number of PDBs in CORA file is " + lexical_cast<string>(num_proteins) + ", which does not match " + lexical_cast<string>(arg_pdbs.size()))); } const size_t num_chars_in_main_data_line = CHARS_IN_MAIN_DATA_LINE_START + (num_proteins * CHARS_IN_MAIN_DATA_LINE_PROT) + CHARS_IN_MAIN_DATA_LINE_END; // Grab the protein names getline( arg_istream, line_string ); trim( line_string ); const str_vec names = split_build<str_vec>( line_string, is_space() ); if ( names.size() != num_proteins ) { BOOST_THROW_EXCEPTION(runtime_error_exception("Splitting on space does not give " + lexical_cast<string>(num_proteins) + " entries in CORA alignment names line: \"" + line_string + "\"")); } // Grab the total number of alignment positions getline(arg_istream, line_string); const size_t num_positions = lexical_cast<size_t>(line_string); // Prepare the data structures to populate aln_posn_vec posns( num_proteins, 0 ); opt_score_vec scores; scores.reserve( num_positions ); opt_aln_posn_vec_vec data( num_proteins ); for (opt_aln_posn_vec &data_col : data) { data_col.reserve( num_positions ); } // Loop over the main data section while (getline(arg_istream, line_string)) { // Check the line is of the correct length if (line_string.length() != num_chars_in_main_data_line) { BOOST_THROW_EXCEPTION(runtime_error_exception("Number of characters in main data line does not equal " + lexical_cast<string>(num_chars_in_main_data_line))); } // Grab the global details from start of this line const size_t alignment_posn = lexical_cast<size_t>( trim_copy( line_string.substr( 0, 4 ))); // Column 1: Alignment Position (dddd) // const size_t num_entries_in_temp = lexical_cast<size_t>( trim_copy( line_string.substr( 5, 4 ))); // Column 2: No. of position selected for structural template (dddd) const size_t num_entries_in_posn = lexical_cast<size_t>( trim_copy( line_string.substr( 10, 4 ))); // Column 2: No. of position selected for structural template (dddd) if (alignment_posn != data.front().size() + 1) { BOOST_THROW_EXCEPTION(runtime_error_exception("Alignment position counter " + lexical_cast<string>(alignment_posn) + " does not match " + lexical_cast<string>(data.front().size() + 1))); } // Loop over the indices of the proteins size_t num_present_posns(0); for (size_t prot_ctr = 0; prot_ctr < num_proteins; ++prot_ctr) { // Prepare string and other data for this protein const size_t prot_string_offset = CHARS_IN_MAIN_DATA_LINE_START + prot_ctr * CHARS_IN_MAIN_DATA_LINE_PROT; const string prot_string = line_string.substr( prot_string_offset, CHARS_IN_MAIN_DATA_LINE_PROT ); opt_aln_posn_vec &data_col = data [ prot_ctr ]; aln_posn_type &posn = posns[ prot_ctr ]; // Grab the the details for this protein const int residue_num = lexical_cast<int>( trim_copy( prot_string.substr( 1, 4 ))); // Column 4 (7,10 etc): Residue number in PDB file (ddddc) 4 digit number const char insert_code = prot_string.at( 5 ) ; // + 1 character insert code const char amino_acid = prot_string.at( 7 ) ; // Column 5 (8,11 etc): Amino Acid Code (c) // const char sec_struc = prot_string.at( 10 ) ; // Column 6 (9,12 etc): Secondary Structure Assignment (c) // Find the residue in the list of this PDB's residue names const residue_name_vec &residues_names = residue_names_of_first_chains[ prot_ctr ]; const residue_name res_name = make_residue_name_with_non_insert_char( residue_num, insert_code, ' ' ); const opt_aln_posn find_result = search_for_residue_in_residue_names( posn, residues_names, amino_acid, res_name, arg_stderr ); data_col.push_back( find_result ? opt_aln_posn( ( *find_result ) + 1 ) : opt_aln_posn( none ) ); if ( find_result ) { posn = *find_result; ++num_present_posns; } } if (num_present_posns != num_entries_in_posn) { BOOST_THROW_EXCEPTION(runtime_error_exception( "Number of positions for alignment_posn " + lexical_cast<string>(alignment_posn) + " was " + lexical_cast<string>(num_present_posns) + " not " + lexical_cast<string>(num_entries_in_posn) )); } // Prepare the string for the global details at the end of this line const size_t end_string_offset = CHARS_IN_MAIN_DATA_LINE_START + num_proteins * CHARS_IN_MAIN_DATA_LINE_PROT; const string end_string = line_string.substr( end_string_offset, CHARS_IN_MAIN_DATA_LINE_END ); // Grab the global details from start of this line // const size_t cons_sec_struc = end_string.at( 3 ) ; // Last Column-3: Consensus Secondary Structure Assignment (c) // const size_t num_alpha_res = lexical_cast<size_t>( trim_copy( end_string.substr( 5, 4 ))); // Last Column-2: No. of alpha residues at this position (dddd) // const size_t num_beta_res = lexical_cast<size_t>( trim_copy( end_string.substr( 10, 4 ))); // Last Column-1: No. of beta residues at this position (dddd) const size_t cons_score = lexical_cast<size_t>( trim_copy( end_string.substr( 16, 2 ))); // Last Column: Structural Conservation Score (dd) scores.push_back( numeric_cast<double>( cons_score ) ); // // If there are multiple entries in this position then store the score // if (num_entries_in_posn > 1) { //// cerr << "Adding score for " << alignment_posn-1 << endl; // scores.push_back(cons_score); // } } if ( num_positions != data.front().size() ) { BOOST_THROW_EXCEPTION(runtime_error_exception( "CORA legacy alignment number of positions was " + lexical_cast<string>( data.front().size() ) + " not " + lexical_cast<string>( num_positions ) ) ); } alignment new_alignment = alignment_offset_1_factory( data ); // Create a scores matrix and then empty any cells that are absent from the alignment opt_score_vec_vec all_scores( new_alignment.num_entries(), scores ); for (size_t entry = 0; entry < new_alignment.num_entries(); ++entry) { for (size_t index = 0; index < new_alignment.length(); ++index) { if ( ! has_position_of_entry_of_index( new_alignment, entry, index ) ) { all_scores[ entry ][ index ] = none; } } } set_scores( new_alignment, all_scores); return new_alignment; } // Catch any I/O exceptions catch (const std::exception &ex) { const string error_message(string("Cannot read CORA legacy alignment file [") + ex.what() + "] "); perror(error_message.c_str()); BOOST_THROW_EXCEPTION(runtime_error_exception(error_message)); }; }