/// \brief TODOCUMENT pair<alignment, superpose_orderer> ssap_scores_file_alignment_acquirer::do_get_alignment_and_orderer(const pdb_list &arg_pdbs ///< TODOCUMENT ) const { // Parse the SSAP scores file const path ssap_scores_file = get_ssap_scores_file(); const pair<str_vec, size_size_pair_doub_map> ssap_scores_data = ssap_scores_file::parse_ssap_scores_file( ssap_scores_file ); const str_vec &names = ssap_scores_data.first; const size_size_pair_doub_map &scores = ssap_scores_data.second; if ( names.size() != arg_pdbs.size() ) { BOOST_THROW_EXCEPTION(runtime_error_exception( "The number of PDBs is " + ::std::to_string( arg_pdbs.size() ) + ", which doesn't match the " + ::std::to_string( names.size() ) + " structures required for combining with the SSAP scores file \"" + ssap_scores_file.string() + "\"" )); } // Make a superpose_orderer from the scores const superpose_orderer my_orderer = make_superpose_orderer( scores ); // Use the superpose_orderer code to get a spanning tree, ordered in descending order of score const size_size_pair_vec spanning_tree = get_spanning_tree_ordered_by_desc_score( scores ); // // TODOCUMENT ostringstream stderr; const size_size_alignment_tuple_vec spanning_alignments = get_spanning_alignments( ssap_scores_file.parent_path(), names, arg_pdbs, spanning_tree, stderr ); // TODOCUMENT const bool single_pdb = ( arg_pdbs.size() == 1 ); const alignment new_alignment = single_pdb ? make_single_alignment( front( arg_pdbs ).get_num_residues() ) : build_alignment_from_parts( spanning_alignments, build_protein_list_of_pdb_list( arg_pdbs ) ); if ( names.size() == 0 ) { // Return the results return make_pair( new_alignment, my_orderer ); } // BOOST_LOG_TRIVIAL( warning )<< "About to attempt to build protein list using data that's been read from ssap_scores_file (with " << arg_pdbs.size() << " pdbs and " << names.size() << " names)"; const protein_list proteins_of_pdbs = build_protein_list_of_pdb_list_and_names( arg_pdbs, names ); const alignment scored_new_alignment = score_alignment_copy( residue_scorer(), new_alignment, proteins_of_pdbs ); // cerr << "Did generate alignment : \n"; // cerr << horiz_align_outputter( scored_new_alignment ) << endl; // write_alignment_as_fasta_alignment( cerr, scored_new_alignment, build_protein_list_of_pdb_list( arg_pdbs ) ); // cerr << endl; // Return the results return make_pair( scored_new_alignment, my_orderer ); }
/// \brief TODOCUMENT /// /// \relates display_colourer display_colour_spec cath::get_colour_spec(const display_colourer &arg_colourer, ///< TODOCUMENT const pdb_list &arg_pdbs, ///< TODOCUMENT const str_vec &arg_names, ///< TODOCUMENT const alignment &arg_alignment ///< TODOCUMENT ) { const alignment::size_type num_entries = arg_alignment.num_entries(); const alignment::size_type aln_length = arg_alignment.length(); if ( aln_length <= 0 || num_entries <= 0 ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the alignment is empty")); } if ( num_entries != arg_pdbs.size() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of PDBs")); } if ( num_entries != arg_names.size() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of names")); } auto &&result_spec = arg_colourer.get_colour_spec( alignment_context( arg_pdbs, arg_names, arg_alignment ) ); return has_score_colour_handler( arg_colourer ) ? adjust_display_colour_spec_copy( std::forward< decltype( result_spec ) >( result_spec ), get_score_colour_handler( arg_colourer ), arg_alignment ) : result_spec; }
/// \brief TODOCUMENT pair<alignment, size_size_pair_vec> alignment_acquirer::get_alignment_and_spanning_tree(const pdb_list &arg_pdbs ) const { // Call the concrete class's implementation of do_get_alignment_and_orderer() and grab the resulting alignment and superpose_orderer const pair<alignment, superpose_orderer> alignment_and_orderer = do_get_alignment_and_orderer(arg_pdbs); const alignment &new_alignment = alignment_and_orderer.first; const superpose_orderer &my_orderer = alignment_and_orderer.second; // Check that both are of the correct size const size_t num_pdbs = arg_pdbs.size(); if ( new_alignment.num_entries() != num_pdbs ) { BOOST_THROW_EXCEPTION(runtime_error_exception( "Number of entries in alignment (" + lexical_cast<string>( new_alignment.num_entries() ) + ") does not match expected number (" + lexical_cast<string>( num_pdbs ) + ")" )); } if ( my_orderer.get_num_items() != num_pdbs ) { BOOST_THROW_EXCEPTION(runtime_error_exception( "Number of entries in superpose_orderer (" + lexical_cast<string>( my_orderer.get_num_items() ) + ") does not match expected number (" + lexical_cast<string>( num_pdbs ) + ")" )); } // Try to construct a spanning tree // Catch any failures and report them sensibly size_size_pair_vec spanning_tree; try { // spanning_tree = get_spanning_tree( my_orderer ); spanning_tree = get_spanning_tree_ordered_by_desc_score( my_orderer ); } /// \todo Make the condition of this error message come from the method by which the orderer was populated catch (const invalid_argument_exception &err) { logger::log_and_exit( logger::return_code::INSUFFICIENT_RESIDUE_NAME_OVERLAPS, "Cannot construct a tree connecting all PDBs with pairs overlapping by at least " + lexical_cast<string>(alignment_acquirer::MIN_NUM_COMMON_RESIDUES_TO_SUPERPOSE_PAIR) + " residues" ); } // If the size of the spanning tree isn't one less than the number of PDBs then throw a wobbly // /// \todo Also check the size of the alignment matches? if ( spanning_tree.size() + 1 != num_pdbs ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("The spanning tree does not correctly cover the number of PDBs")); } // Return the resulting alignment and spanning tree return make_pair(new_alignment, spanning_tree); }
/// \brief TODOCUMENT pair<alignment, superpose_orderer> fasta_aln_file_alignment_acquirer::do_get_alignment_and_orderer(const pdb_list &arg_pdbs ///< TODOCUMENT ) const { // Construct an alignment from the FASTA alignment file const protein_list proteins_of_pdbs = build_protein_list_of_pdb_list( arg_pdbs ); const alignment new_alignment = read_alignment_from_fasta_file( get_fasta_alignment_file(), proteins_of_pdbs, cerr ); // Construct a superpose_orderer and set arbitrary scores to ensure that the spanning tree will connect the entries const size_t num_pdbs = arg_pdbs.size(); superpose_orderer my_orderer( num_pdbs ); for (size_t link_ctr = 1; link_ctr < num_pdbs; ++link_ctr) { my_orderer.set_score(link_ctr, 0, 0.0); } const alignment scored_new_alignment = score_alignment_copy( residue_scorer(), new_alignment, proteins_of_pdbs ); // Return the results return make_pair( scored_new_alignment, my_orderer ); }
/// \brief TODOCUMENT /// /// \relates display_colourer display_colour_spec cath::get_colour_spec(const display_colourer &arg_colourer, ///< TODOCUMENT const pdb_list &arg_pdbs, ///< TODOCUMENT const str_vec &arg_names, ///< TODOCUMENT const alignment &arg_alignment ///< TODOCUMENT ) { const alignment::size_type num_entries = arg_alignment.num_entries(); const alignment::size_type aln_length = arg_alignment.length(); if ( aln_length <= 0 || num_entries <= 0 ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the alignment is empty")); } if ( num_entries != arg_pdbs.size() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of PDBs")); } if ( num_entries != arg_names.size() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of names")); } return arg_colourer.get_colour_spec( alignment_context( arg_pdbs, arg_names, arg_alignment ) ); }
/// \brief Parse a FASTA format input into an alignment /// /// This version of read_alignment_from_fasta() doesn't take names to find within the parsed IDS /// so it is less safe than the other version. /// /// \relates alignment alignment cath::align::read_alignment_from_fasta(istream &arg_istream, ///< The istream from which to read the FASTA input for parsing const pdb_list &arg_pdbs, ///< The PDBs that TODOCUMENT ostream &arg_stderr ///< An ostream to which any warnings should be output (currently unused) ) { return read_alignment_from_fasta( arg_istream, get_amino_acid_lists( arg_pdbs ), str_vec( arg_pdbs.size() ), arg_stderr ); }
/// \brief Parse a FASTA format input into an alignment /// /// This version of read_alignment_from_fasta() doesn't take names to find within the parsed IDS /// so it is less safe than the other version. /// /// \relates alignment alignment cath::align::read_alignment_from_fasta_file(const path &arg_fasta_file, ///< The file from which to read the FASTA input for parsing const pdb_list &arg_pdbs, ///< The PDBs that TODOCUMENT ostream &arg_stderr ///< An ostream to which any warnings should be output (currently unused) ) { return read_alignment_from_fasta_file( arg_fasta_file, arg_pdbs, str_vec( arg_pdbs.size() ), arg_stderr ); }
/// \brief TODOCUMENT /// /// \relates alignment /// /// CORA file format /// /// The header consists of the following /// - One format line '#FM CORA_FORMAT 1.1' /// - Any number of comment lines '#CC' /// - Total number of proteins in the alignment /// - All CATH domain names in the alignment /// - Total number of alignment positions /// /// For example: /// /// #FM CORA_FORMAT 1.1 /// #CC /// #CC Any number of comment lines (200 characters max per line) /// #CC /// #CC /// 3 /// 6insE0 1igl00 1bqt00 /// 73 /// /// The body consists of the following: /// /// START PROT 1 PROT 2 PROT N END /// <------------><---------><---------><---------><----------------> /// ddddxddddxddddxddddcxcxxcxddddcxcxxcxddddcxcxxcxxxcxddddxddddxxdd /// /// 1 0 1 0 0 0 1 A 0 0 0 0 0 0 0 0 /// 2 0 1 0 0 0 2 Y 0 0 0 0 0 0 0 0 /// 3 0 2 1B F 0 3 R 0 0 0 0 0 0 0 0 /// 4 0 3 2B V H 4 P 0 1 G 0 0 1 0 2 /// 5 1 3 3B N H 5 S 0 2 P 0 0 1 0 6 /// 6 0 3 4B Q H 6 E 0 3 E 0 0 1 0 2 /// /// START (14 characters) : /// - Column 1: Alignment Position (dddd) /// - Column 2: No. of position selected for structural template (dddd) /// - Column 3: No. of proteins aligned at this position (dddd) /// /// PROT 1,2,3... (11 characters per protein) /// - Column 4 (7,10 etc): Residue number in PDB file (ddddc) 4 digit number /// - + 1 character insert code /// - Importantly the insert code is always in the same position not within /// - the 4 characters reserved for the pdb number (see below) /// - Column 5 (8,11 etc): Amino Acid Code (c) /// - Column 6 (9,12 etc): Secondary Structure Assignment (c) /// /// END (18 characters) /// - Last Column-3: Consensus Secondary Structure Assignment (c) /// - Last Column-2: No. of alpha residues at this position (dddd) /// - Last Column-1: No. of beta residues at this position (dddd) /// - Last Column: Structural Conservation Score (dd) alignment cath::align::read_alignment_from_cath_cora_legacy_format(istream &arg_istream, ///< TODOCUMENT const pdb_list &arg_pdbs, ///< TODOCUMENT ostream &arg_stderr ///< TODOCUMENT ) { const size_t CHARS_IN_MAIN_DATA_LINE_START = 14; const size_t CHARS_IN_MAIN_DATA_LINE_PROT = 11; const size_t CHARS_IN_MAIN_DATA_LINE_END = 18; if (arg_pdbs.empty()) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot load a CORA legacy alignment with 0 PDB entries")); } arg_istream.exceptions(ios::badbit); try { residue_name_vec_vec residue_names_of_first_chains; for (const pdb &arg_pdb : arg_pdbs) { residue_names_of_first_chains.push_back( arg_pdb.get_residue_names_of_first_chain__backbone_unchecked() ); } // Check the first line is the file format line string line_string; getline(arg_istream, line_string); if (!starts_with(line_string, "#FM CORA_FORMAT ")) { BOOST_THROW_EXCEPTION(runtime_error_exception("No CORA header file format line")); } // Skip any comment lines while (getline(arg_istream, line_string) && starts_with(line_string, "#CC")) { } // Grab the number of proteins and ensure the alignment matches const size_t num_proteins = lexical_cast<size_t>(line_string); if (num_proteins != arg_pdbs.size()) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Number of PDBs in CORA file is " + lexical_cast<string>(num_proteins) + ", which does not match " + lexical_cast<string>(arg_pdbs.size()))); } const size_t num_chars_in_main_data_line = CHARS_IN_MAIN_DATA_LINE_START + (num_proteins * CHARS_IN_MAIN_DATA_LINE_PROT) + CHARS_IN_MAIN_DATA_LINE_END; // Grab the protein names getline( arg_istream, line_string ); trim( line_string ); const str_vec names = split_build<str_vec>( line_string, is_space() ); if ( names.size() != num_proteins ) { BOOST_THROW_EXCEPTION(runtime_error_exception("Splitting on space does not give " + lexical_cast<string>(num_proteins) + " entries in CORA alignment names line: \"" + line_string + "\"")); } // Grab the total number of alignment positions getline(arg_istream, line_string); const size_t num_positions = lexical_cast<size_t>(line_string); // Prepare the data structures to populate aln_posn_vec posns( num_proteins, 0 ); opt_score_vec scores; scores.reserve( num_positions ); opt_aln_posn_vec_vec data( num_proteins ); for (opt_aln_posn_vec &data_col : data) { data_col.reserve( num_positions ); } // Loop over the main data section while (getline(arg_istream, line_string)) { // Check the line is of the correct length if (line_string.length() != num_chars_in_main_data_line) { BOOST_THROW_EXCEPTION(runtime_error_exception("Number of characters in main data line does not equal " + lexical_cast<string>(num_chars_in_main_data_line))); } // Grab the global details from start of this line const size_t alignment_posn = lexical_cast<size_t>( trim_copy( line_string.substr( 0, 4 ))); // Column 1: Alignment Position (dddd) // const size_t num_entries_in_temp = lexical_cast<size_t>( trim_copy( line_string.substr( 5, 4 ))); // Column 2: No. of position selected for structural template (dddd) const size_t num_entries_in_posn = lexical_cast<size_t>( trim_copy( line_string.substr( 10, 4 ))); // Column 2: No. of position selected for structural template (dddd) if (alignment_posn != data.front().size() + 1) { BOOST_THROW_EXCEPTION(runtime_error_exception("Alignment position counter " + lexical_cast<string>(alignment_posn) + " does not match " + lexical_cast<string>(data.front().size() + 1))); } // Loop over the indices of the proteins size_t num_present_posns(0); for (size_t prot_ctr = 0; prot_ctr < num_proteins; ++prot_ctr) { // Prepare string and other data for this protein const size_t prot_string_offset = CHARS_IN_MAIN_DATA_LINE_START + prot_ctr * CHARS_IN_MAIN_DATA_LINE_PROT; const string prot_string = line_string.substr( prot_string_offset, CHARS_IN_MAIN_DATA_LINE_PROT ); opt_aln_posn_vec &data_col = data [ prot_ctr ]; aln_posn_type &posn = posns[ prot_ctr ]; // Grab the the details for this protein const int residue_num = lexical_cast<int>( trim_copy( prot_string.substr( 1, 4 ))); // Column 4 (7,10 etc): Residue number in PDB file (ddddc) 4 digit number const char insert_code = prot_string.at( 5 ) ; // + 1 character insert code const char amino_acid = prot_string.at( 7 ) ; // Column 5 (8,11 etc): Amino Acid Code (c) // const char sec_struc = prot_string.at( 10 ) ; // Column 6 (9,12 etc): Secondary Structure Assignment (c) // Find the residue in the list of this PDB's residue names const residue_name_vec &residues_names = residue_names_of_first_chains[ prot_ctr ]; const residue_name res_name = make_residue_name_with_non_insert_char( residue_num, insert_code, ' ' ); const opt_aln_posn find_result = search_for_residue_in_residue_names( posn, residues_names, amino_acid, res_name, arg_stderr ); data_col.push_back( find_result ? opt_aln_posn( ( *find_result ) + 1 ) : opt_aln_posn( none ) ); if ( find_result ) { posn = *find_result; ++num_present_posns; } } if (num_present_posns != num_entries_in_posn) { BOOST_THROW_EXCEPTION(runtime_error_exception( "Number of positions for alignment_posn " + lexical_cast<string>(alignment_posn) + " was " + lexical_cast<string>(num_present_posns) + " not " + lexical_cast<string>(num_entries_in_posn) )); } // Prepare the string for the global details at the end of this line const size_t end_string_offset = CHARS_IN_MAIN_DATA_LINE_START + num_proteins * CHARS_IN_MAIN_DATA_LINE_PROT; const string end_string = line_string.substr( end_string_offset, CHARS_IN_MAIN_DATA_LINE_END ); // Grab the global details from start of this line // const size_t cons_sec_struc = end_string.at( 3 ) ; // Last Column-3: Consensus Secondary Structure Assignment (c) // const size_t num_alpha_res = lexical_cast<size_t>( trim_copy( end_string.substr( 5, 4 ))); // Last Column-2: No. of alpha residues at this position (dddd) // const size_t num_beta_res = lexical_cast<size_t>( trim_copy( end_string.substr( 10, 4 ))); // Last Column-1: No. of beta residues at this position (dddd) const size_t cons_score = lexical_cast<size_t>( trim_copy( end_string.substr( 16, 2 ))); // Last Column: Structural Conservation Score (dd) scores.push_back( numeric_cast<double>( cons_score ) ); // // If there are multiple entries in this position then store the score // if (num_entries_in_posn > 1) { //// cerr << "Adding score for " << alignment_posn-1 << endl; // scores.push_back(cons_score); // } } if ( num_positions != data.front().size() ) { BOOST_THROW_EXCEPTION(runtime_error_exception( "CORA legacy alignment number of positions was " + lexical_cast<string>( data.front().size() ) + " not " + lexical_cast<string>( num_positions ) ) ); } alignment new_alignment = alignment_offset_1_factory( data ); // Create a scores matrix and then empty any cells that are absent from the alignment opt_score_vec_vec all_scores( new_alignment.num_entries(), scores ); for (size_t entry = 0; entry < new_alignment.num_entries(); ++entry) { for (size_t index = 0; index < new_alignment.length(); ++index) { if ( ! has_position_of_entry_of_index( new_alignment, entry, index ) ) { all_scores[ entry ][ index ] = none; } } } set_scores( new_alignment, all_scores); return new_alignment; } // Catch any I/O exceptions catch (const std::exception &ex) { const string error_message(string("Cannot read CORA legacy alignment file [") + ex.what() + "] "); perror(error_message.c_str()); BOOST_THROW_EXCEPTION(runtime_error_exception(error_message)); }; }