/// \brief Build an alignment of the specified PDBs, and a superpose_orderer, from the SSAP scores file
///
/// The names and pairwise scores are parsed from the SSAP scores file. The scores are used
/// to make the superpose_orderer and a spanning tree (ordered by descending score); the
/// alignments for the spanning tree's pairs are then combined into a single alignment.
///
/// \returns A pair of the alignment and the superpose_orderer. The alignment is scored with
///          residue_scorer unless the scores file yielded no names, in which case it is
///          returned unscored.
///
/// \throws runtime_error_exception if the number of PDBs doesn't match the number of names
///                                 parsed from the SSAP scores file
pair<alignment, superpose_orderer> ssap_scores_file_alignment_acquirer::do_get_alignment_and_orderer(const pdb_list &arg_pdbs ///< The PDBs to be aligned (must be as numerous as the names in the SSAP scores file)
                                                                                                     ) const {
	// Parse the names and the pairwise scores from the SSAP scores file
	//
	// (The path is deliberately not called ssap_scores_file so that it doesn't hide
	//  the ssap_scores_file class whose static parser is called on the next line)
	const path                     scores_path      = get_ssap_scores_file();
	const pair<str_vec, size_size_pair_doub_map> ssap_scores_data = ssap_scores_file::parse_ssap_scores_file( scores_path );
	const str_vec                 &names            = ssap_scores_data.first;
	const size_size_pair_doub_map &scores           = ssap_scores_data.second;

	// The names from the scores file and the PDBs must be equally numerous
	if ( names.size() != arg_pdbs.size() ) {
		BOOST_THROW_EXCEPTION(runtime_error_exception(
			"The number of PDBs is "
			+ ::std::to_string( arg_pdbs.size()         )
			+ ", which doesn't match the "
			+ ::std::to_string( names.size() )
			+ " structures required for combining with the SSAP scores file \""
			+ scores_path.string()
			+ "\""
		));
	}

	// Make a superpose_orderer from the scores
	const superpose_orderer my_orderer = make_superpose_orderer( scores );

	// Use the superpose_orderer code to get a spanning tree, ordered in descending order of score
	const size_size_pair_vec spanning_tree = get_spanning_tree_ordered_by_desc_score( scores );

	// Get the alignments for the pairs in the spanning tree, passing the directory of the scores file
	//
	// Anything that get_spanning_alignments() writes to this stream is currently discarded.
	// The stream must not be called `stderr`: that name is a macro defined by <cstdio>,
	// so using it as an identifier is non-portable and fails to compile on some platforms.
	ostringstream spanning_alignments_warnings;
	const size_size_alignment_tuple_vec spanning_alignments = get_spanning_alignments(
		scores_path.parent_path(),
		names,
		arg_pdbs,
		spanning_tree,
		spanning_alignments_warnings
	);

	// A single PDB has no pairs to combine, so it gets a single-entry alignment of its residues;
	// otherwise the alignment is built from the spanning tree's alignments
	const bool      single_pdb    = ( arg_pdbs.size() == 1 );
	const alignment new_alignment = single_pdb ? make_single_alignment( front( arg_pdbs ).get_num_residues() )
	                                           : build_alignment_from_parts( spanning_alignments, build_protein_list_of_pdb_list( arg_pdbs ) );

	// With no names (and hence, after the check above, no PDBs), return the alignment unscored
	if ( names.empty() ) {
		return make_pair( new_alignment, my_orderer );
	}

	// Score the alignment against the proteins built from the PDBs and their names
	const protein_list proteins_of_pdbs     = build_protein_list_of_pdb_list_and_names( arg_pdbs, names );
	const alignment    scored_new_alignment = score_alignment_copy( residue_scorer(), new_alignment, proteins_of_pdbs );

	// Return the results
	return make_pair( scored_new_alignment, my_orderer );
}
/// \brief Get the display_colour_spec with which the specified display_colourer colours the
///        specified PDBs, names and alignment
///
/// If the display_colourer has a score colour handler, the colour spec is passed through
/// adjust_display_colour_spec_copy() with that handler and the alignment before being returned.
///
/// \throws invalid_argument_exception if the alignment has no entries or no positions, or if
///                                    its number of entries differs from the number of PDBs
///                                    or from the number of names
///
/// \relates display_colourer
display_colour_spec cath::get_colour_spec(const display_colourer &arg_colourer, ///< The display_colourer from which to get the colour spec
                                          const pdb_list         &arg_pdbs,     ///< The PDBs to be coloured (one per alignment entry)
                                          const str_vec          &arg_names,    ///< The names of the PDBs (one per alignment entry)
                                          const alignment        &arg_alignment ///< The (non-empty) alignment of the PDBs
                                          ) {
	const alignment::size_type entry_count    = arg_alignment.num_entries();
	const alignment::size_type position_count = arg_alignment.length();

	// Sanity check the inputs before building the alignment_context
	const bool alignment_is_empty = ( position_count <= 0 || entry_count <= 0 );
	if ( alignment_is_empty ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the alignment is empty"));
	}
	if ( arg_pdbs.size() != entry_count ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of PDBs"));
	}
	if ( arg_names.size() != entry_count ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of names"));
	}

	// Get the colourer's own colour spec for the context
	auto &&base_spec = arg_colourer.get_colour_spec(
		alignment_context( arg_pdbs, arg_names, arg_alignment )
	);

	// Without a score colour handler, the colourer's own spec is the answer
	if ( ! has_score_colour_handler( arg_colourer ) ) {
		return base_spec;
	}

	// Otherwise, return a copy adjusted by the score colour handler
	return adjust_display_colour_spec_copy(
		std::forward< decltype( base_spec ) >( base_spec ),
		get_score_colour_handler( arg_colourer ),
		arg_alignment
	);
}
/// \brief Get the alignment for the specified PDBs along with a spanning tree connecting them
///
/// The alignment and superpose_orderer come from the concrete class's do_get_alignment_and_orderer();
/// the spanning tree is then built from the superpose_orderer, ordered by descending score.
///
/// \throws runtime_error_exception    if the alignment or the superpose_orderer doesn't have
///                                    one entry per PDB
/// \throws invalid_argument_exception if the spanning tree doesn't have one fewer link than
///                                    there are PDBs
pair<alignment, size_size_pair_vec> alignment_acquirer::get_alignment_and_spanning_tree(const pdb_list &arg_pdbs ///< The PDBs for which the alignment and spanning tree are required
                                                                                        ) const {
	const size_t expected_count = arg_pdbs.size();

	// Ask the concrete class for the alignment and the superpose_orderer
	const pair<alignment, superpose_orderer> acquired = do_get_alignment_and_orderer( arg_pdbs );
	const alignment         &acquired_aln     = acquired.first;
	const superpose_orderer &acquired_orderer = acquired.second;

	// Check that the alignment has one entry per PDB
	const auto aln_entry_count = acquired_aln.num_entries();
	if ( aln_entry_count != expected_count ) {
		const string aln_mismatch_message =
			  "Number of entries in alignment ("
			+ lexical_cast<string>( aln_entry_count )
			+ ") does not match expected number ("
			+ lexical_cast<string>( expected_count )
			+ ")";
		BOOST_THROW_EXCEPTION(runtime_error_exception( aln_mismatch_message ));
	}

	// Check that the superpose_orderer has one item per PDB
	const auto orderer_item_count = acquired_orderer.get_num_items();
	if ( orderer_item_count != expected_count ) {
		const string orderer_mismatch_message =
			  "Number of entries in superpose_orderer ("
			+ lexical_cast<string>( orderer_item_count )
			+ ") does not match expected number ("
			+ lexical_cast<string>( expected_count )
			+ ")";
		BOOST_THROW_EXCEPTION(runtime_error_exception( orderer_mismatch_message ));
	}

	// Attempt to build the spanning tree, reporting a failure to connect all the PDBs via the logger
	size_size_pair_vec tree_links;
	try {
		tree_links = get_spanning_tree_ordered_by_desc_score( acquired_orderer );
	}
	/// \todo Make the condition of this error message come from the method by which the orderer was populated
	catch ( const invalid_argument_exception & ) {
		const string no_tree_message =
			  "Cannot construct a tree connecting all PDBs with pairs overlapping by at least "
			+ lexical_cast<string>(alignment_acquirer::MIN_NUM_COMMON_RESIDUES_TO_SUPERPOSE_PAIR)
			+ " residues";
		logger::log_and_exit(
			logger::return_code::INSUFFICIENT_RESIDUE_NAME_OVERLAPS,
			no_tree_message
		);
	}

	// A spanning tree over N PDBs must have exactly N-1 links
	//
	/// \todo Also check the size of the alignment matches?
	const bool tree_covers_pdbs = ( tree_links.size() + 1 == expected_count );
	if ( ! tree_covers_pdbs ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("The spanning tree does not correctly cover the number of PDBs"));
	}

	// Hand back the alignment together with the spanning tree
	return make_pair( acquired_aln, tree_links );
}
/// \brief Read the alignment of the specified PDBs from the FASTA alignment file and make a
///        superpose_orderer that links every other entry to the first
///
/// \returns A pair of the alignment (scored with residue_scorer) and the superpose_orderer
pair<alignment, superpose_orderer> fasta_aln_file_alignment_acquirer::do_get_alignment_and_orderer(const pdb_list &arg_pdbs ///< The PDBs that are aligned in the FASTA alignment file
                                                                                                   ) const {
	const size_t pdb_count = arg_pdbs.size();

	// Read the alignment from the FASTA file, using proteins built from the PDBs
	const protein_list proteins     = build_protein_list_of_pdb_list( arg_pdbs );
	const alignment    unscored_aln = read_alignment_from_fasta_file(
		get_fasta_alignment_file(),
		proteins,
		cerr
	);

	// Give each entry after the first an arbitrary score against the first entry
	// so that a spanning tree built from the orderer will connect all the entries
	const size_t hub_index            = 0;
	const double arbitrary_link_score = 0.0;
	superpose_orderer star_orderer( pdb_count );
	for (size_t spoke_index = 1; spoke_index < pdb_count; ++spoke_index) {
		star_orderer.set_score( spoke_index, hub_index, arbitrary_link_score );
	}

	// Score the alignment's residues and return it with the orderer
	const alignment scored_aln = score_alignment_copy( residue_scorer(), unscored_aln, proteins );
	return make_pair( scored_aln, star_orderer );
}
Example #5
0
/// \brief Get the display_colour_spec with which the specified display_colourer colours the
///        specified PDBs, names and alignment
///
/// \throws invalid_argument_exception if the alignment has no entries or no positions, or if
///                                    its number of entries differs from the number of PDBs
///                                    or from the number of names
///
/// \relates display_colourer
display_colour_spec cath::get_colour_spec(const display_colourer &arg_colourer, ///< The display_colourer from which to get the colour spec
                                          const pdb_list         &arg_pdbs,     ///< The PDBs to be coloured (one per alignment entry)
                                          const str_vec          &arg_names,    ///< The names of the PDBs (one per alignment entry)
                                          const alignment        &arg_alignment ///< The (non-empty) alignment of the PDBs
                                          ) {
	const alignment::size_type entry_count    = arg_alignment.num_entries();
	const alignment::size_type position_count = arg_alignment.length();

	// Sanity check the inputs before building the alignment_context
	const bool alignment_is_empty = ( position_count <= 0 || entry_count <= 0 );
	if ( alignment_is_empty ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the alignment is empty"));
	}
	if ( arg_pdbs.size() != entry_count ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of PDBs"));
	}
	if ( arg_names.size() != entry_count ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of names"));
	}

	// Let the colourer colour the context made from the PDBs, names and alignment
	return arg_colourer.get_colour_spec(
		alignment_context( arg_pdbs, arg_names, arg_alignment )
	);
}
Example #6
0
/// \brief Parse a FASTA format input into an alignment of the specified PDBs
///
/// This overload takes no names to look for within the parsed IDs (it forwards one
/// empty name per PDB), so it is less safe than the overload that does take names.
///
/// \relates alignment
alignment cath::align::read_alignment_from_fasta(istream        &arg_istream, ///< The istream from which to read the FASTA input for parsing
                                                 const pdb_list &arg_pdbs,    ///< The PDBs whose amino acid lists (from get_amino_acid_lists()) are passed to the parser
                                                 ostream        &arg_stderr   ///< An ostream to which any warnings should be output (currently unused)
                                                 ) {
	// One default-constructed (ie empty) name for each of the PDBs
	const str_vec blank_names( arg_pdbs.size() );

	const auto amino_acid_lists = get_amino_acid_lists( arg_pdbs );

	return read_alignment_from_fasta(
		arg_istream,
		amino_acid_lists,
		blank_names,
		arg_stderr
	);
}
Example #7
0
/// \brief Parse a FASTA format file into an alignment of the specified PDBs
///
/// This overload takes no names to look for within the parsed IDs (it forwards one
/// empty name per PDB), so it is less safe than the overload that does take names.
///
/// \relates alignment
alignment cath::align::read_alignment_from_fasta_file(const path     &arg_fasta_file, ///< The file from which to read the FASTA input for parsing
                                                      const pdb_list &arg_pdbs,       ///< The PDBs that are passed on to the names-taking overload
                                                      ostream        &arg_stderr      ///< An ostream to which any warnings should be output (currently unused)
                                                      ) {
	// One default-constructed (ie empty) name for each of the PDBs
	const str_vec blank_names( arg_pdbs.size() );

	return read_alignment_from_fasta_file(
		arg_fasta_file,
		arg_pdbs,
		blank_names,
		arg_stderr
	);
}
Example #8
0
/// \brief Parse a CORA legacy format alignment from an istream, locating each residue in the specified PDBs
///
/// Each residue in the file is looked up in the residue names of the first chain of the
/// corresponding PDB. The per-position conservation score from the end of each line is
/// stored as the score for every entry that is present at that position.
///
/// \throws invalid_argument_exception if arg_pdbs is empty
/// \throws runtime_error_exception    if any std::exception is caught whilst reading/parsing
///                                    (the caught exception's what() is included in the message)
///
/// \relates alignment
///
/// CORA file format
///
///  The header consists of the following
///  - One format line '#FM CORA_FORMAT 1.1'
///  - Any number of comment lines '#CC'
///  - Total number of proteins in the alignment
///  - All CATH domain names in the alignment
///  - Total number of alignment positions
///
/// For example:
///
///     #FM CORA_FORMAT 1.1
///     #CC
///     #CC Any number of comment lines (200 characters max per line)
///     #CC
///     #CC
///     3
///     6insE0 1igl00 1bqt00
///     73
///
/// The body consists of the following:
///
///          START       PROT 1     PROT 2     PROT N         END
///     <------------><---------><---------><---------><---------------->
///     ddddxddddxddddxddddcxcxxcxddddcxcxxcxddddcxcxxcxxxcxddddxddddxxdd
///
///        1    0    1    0  0  0    1  A  0    0  0  0   0    0    0   0
///        2    0    1    0  0  0    2  Y  0    0  0  0   0    0    0   0
///        3    0    2    1B F  0    3  R  0    0  0  0   0    0    0   0
///        4    0    3    2B V  H    4  P  0    1  G  0   0    1    0   2
///        5    1    3    3B N  H    5  S  0    2  P  0   0    1    0   6
///        6    0    3    4B Q  H    6  E  0    3  E  0   0    1    0   2
///
/// START (14 characters) :
///   - Column 1: Alignment Position (dddd)
///   - Column 2: No. of position selected for structural template (dddd)
///   - Column 3: No. of proteins aligned at this position (dddd)
///
/// PROT 1,2,3... (11 characters per protein)
///   - Column 4 (7,10 etc): Residue number in PDB file (ddddc) 4 digit number
///   -    + 1 character insert code
///   -    Importantly the insert code is always in the same position not within
///   -    the 4 characters reserved for the pdb number (see below)
///   - Column 5 (8,11 etc): Amino Acid Code (c)
///   - Column 6 (9,12 etc): Secondary Structure Assignment (c)
///
/// END (18 characters)
///   - Last Column-3: Consensus Secondary Structure Assignment (c)
///   - Last Column-2: No. of alpha residues at this position (dddd)
///   - Last Column-1: No. of beta  residues at this position (dddd)
///   - Last Column: Structural Conservation Score (dd)
alignment cath::align::read_alignment_from_cath_cora_legacy_format(istream        &arg_istream, ///< The istream from which to read the CORA legacy format input (its exception mask is set to badbit by this function and not restored)
                                                                   const pdb_list &arg_pdbs,    ///< The PDBs in whose first chains' residue names the file's residues are looked up (must be non-empty and as numerous as the proteins in the file)
                                                                   ostream        &arg_stderr   ///< An ostream that is passed to search_for_residue_in_residue_names() (presumably for warnings - TODO confirm)
                                                                   ) {
	// The fixed widths (in characters) of the three sections of each main data line
	const size_t CHARS_IN_MAIN_DATA_LINE_START = 14;
	const size_t CHARS_IN_MAIN_DATA_LINE_PROT  = 11;
	const size_t CHARS_IN_MAIN_DATA_LINE_END   = 18;

	if (arg_pdbs.empty()) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot load a CORA legacy alignment with 0 PDB entries"));
	}

	// Make the stream throw if its badbit gets set (the handler at the end of this function catches that)
	arg_istream.exceptions(ios::badbit);
	try {
		// Grab the residue names of the first chain of each PDB, against which the file's residues are searched
		residue_name_vec_vec residue_names_of_first_chains;
		for (const pdb &arg_pdb : arg_pdbs) {
			residue_names_of_first_chains.push_back( arg_pdb.get_residue_names_of_first_chain__backbone_unchecked() );
		}

		// Check the first line is the file format line
		string line_string;
		getline(arg_istream, line_string);
		if (!starts_with(line_string, "#FM CORA_FORMAT ")) {
			BOOST_THROW_EXCEPTION(runtime_error_exception("No CORA header file format line"));
		}

		// Skip any comment lines
		// (on exit, line_string holds the first non-comment line, which should be the number of proteins)
		while (getline(arg_istream, line_string) && starts_with(line_string, "#CC")) {
		}

		// Grab the number of proteins and ensure the alignment matches
		const size_t num_proteins = lexical_cast<size_t>(line_string);
		if (num_proteins != arg_pdbs.size()) {
			BOOST_THROW_EXCEPTION(invalid_argument_exception("Number of PDBs in CORA file is " + lexical_cast<string>(num_proteins) + ", which does not match " + lexical_cast<string>(arg_pdbs.size())));
		}
		// The exact length that every main data line must have: START + one PROT section per protein + END
		const size_t num_chars_in_main_data_line = CHARS_IN_MAIN_DATA_LINE_START + (num_proteins * CHARS_IN_MAIN_DATA_LINE_PROT) + CHARS_IN_MAIN_DATA_LINE_END;

		// Grab the protein names
		// (they're only used here to check that there's one per protein)
		getline( arg_istream, line_string );
		trim( line_string );
		const str_vec names = split_build<str_vec>( line_string, is_space() );
		if ( names.size() != num_proteins ) {
			BOOST_THROW_EXCEPTION(runtime_error_exception("Splitting on space does not give " + lexical_cast<string>(num_proteins) + " entries in CORA alignment names line: \"" + line_string + "\""));
		}

		// Grab the total number of alignment positions
		getline(arg_istream, line_string);
		const size_t num_positions = lexical_cast<size_t>(line_string);

		// Prepare the data structures to populate:
		//  * posns  : for each protein, the most recently found position (passed to each subsequent search)
		//  * scores : the conservation score of each alignment position
		//  * data   : for each protein, its (optional) position at each alignment position
		aln_posn_vec posns( num_proteins, 0 );
		opt_score_vec scores;
		scores.reserve( num_positions );
		opt_aln_posn_vec_vec data( num_proteins );
		for (opt_aln_posn_vec &data_col : data) {
			data_col.reserve( num_positions );
		}
		// Loop over the main data section
		while (getline(arg_istream, line_string)) {
			// Check the line is of the correct length
			if (line_string.length() != num_chars_in_main_data_line) {
				BOOST_THROW_EXCEPTION(runtime_error_exception("Number of characters in main data line does not equal " + lexical_cast<string>(num_chars_in_main_data_line)));
			}

			// Grab the global details from start of this line
			const size_t      alignment_posn = lexical_cast<size_t>( trim_copy( line_string.substr(  0, 4 ))); // Column 1: Alignment Position (dddd)
//			const size_t num_entries_in_temp = lexical_cast<size_t>( trim_copy( line_string.substr(  5, 4 ))); // Column 2: No. of position selected for structural template (dddd)
			const size_t num_entries_in_posn = lexical_cast<size_t>( trim_copy( line_string.substr( 10, 4 ))); // Column 3: No. of proteins aligned at this position (dddd)

			// The alignment positions in the file must count up from 1 with no gaps
			if (alignment_posn != data.front().size() + 1) {
				BOOST_THROW_EXCEPTION(runtime_error_exception("Alignment position counter " + lexical_cast<string>(alignment_posn) + " does not match " + lexical_cast<string>(data.front().size() + 1)));
			}

			// Loop over the indices of the proteins
			size_t num_present_posns(0);
			for (size_t prot_ctr = 0; prot_ctr < num_proteins; ++prot_ctr) {
				// Prepare string and other data for this protein
				const size_t        prot_string_offset = CHARS_IN_MAIN_DATA_LINE_START + prot_ctr * CHARS_IN_MAIN_DATA_LINE_PROT;
				const string        prot_string        = line_string.substr( prot_string_offset, CHARS_IN_MAIN_DATA_LINE_PROT );
				opt_aln_posn_vec   &data_col           = data [ prot_ctr ];
				aln_posn_type       &posn              = posns[ prot_ctr ];

				// Grab the the details for this protein
				const int              residue_num = lexical_cast<int>(          trim_copy( prot_string.substr(  1, 4 ))); // Column 4 (7,10 etc): Residue number in PDB file (ddddc) 4 digit number
				const char             insert_code =                                        prot_string.at(      5    )  ; //    + 1 character insert code
				const char              amino_acid =                                        prot_string.at(      7    )  ; // Column 5 (8,11 etc): Amino Acid Code (c)
//				const char               sec_struc =                                        prot_string.at(     10    )  ; // Column 6 (9,12 etc): Secondary Structure Assignment (c)

				// Find the residue in the list of this PDB's residue names
				// (a space is passed as the character that represents the absence of an insert code)
				const residue_name_vec &residues_names = residue_names_of_first_chains[ prot_ctr ];
				const residue_name      res_name       = make_residue_name_with_non_insert_char( residue_num, insert_code, ' ' );
				const opt_aln_posn      find_result    = search_for_residue_in_residue_names(
					posn,
					residues_names,
					amino_acid,
					res_name,
					arg_stderr
				);
				// Store one more than any found position (the alignment is built below with alignment_offset_1_factory())
				// or none if the residue wasn't found; if found, also update this protein's latest position and
				// count this protein as present at this alignment position
				data_col.push_back( find_result ? opt_aln_posn( ( *find_result ) + 1 ) : opt_aln_posn( none ) );
				if ( find_result ) {
					posn = *find_result;
					++num_present_posns;
				}
			}
			// Check the number of residues found matches the number the file claims are aligned at this position
			if (num_present_posns != num_entries_in_posn) {
				BOOST_THROW_EXCEPTION(runtime_error_exception(
						"Number of positions for alignment_posn " + lexical_cast<string>(alignment_posn)
						+ " was " + lexical_cast<string>(num_present_posns)
						+ " not " + lexical_cast<string>(num_entries_in_posn)
				));
			}

			// Prepare the string for the global details at the end of this line
			const size_t end_string_offset = CHARS_IN_MAIN_DATA_LINE_START + num_proteins * CHARS_IN_MAIN_DATA_LINE_PROT;
			const string end_string = line_string.substr( end_string_offset, CHARS_IN_MAIN_DATA_LINE_END );

			// Grab the global details from the end of this line
//			const size_t      cons_sec_struc =                                         end_string.at(      3    )  ; // Last Column-3: Consensus Secondary Structure Assignment (c)
//			const size_t       num_alpha_res = lexical_cast<size_t>( trim_copy(  end_string.substr(  5, 4 ))); // Last Column-2: No. of alpha residues at this position (dddd)
//			const size_t        num_beta_res = lexical_cast<size_t>( trim_copy(  end_string.substr( 10, 4 ))); // Last Column-1: No. of beta residues at this position (dddd)
			const size_t          cons_score = lexical_cast<size_t>( trim_copy(  end_string.substr( 16, 2 ))); // Last Column: Structural Conservation Score (dd)

			// Store the conservation score for every position (regardless of how many entries are present)
			scores.push_back( numeric_cast<double>( cons_score ) );
//			// If there are multiple entries in this position then store the score
//			if (num_entries_in_posn > 1) {
////				cerr << "Adding score for " << alignment_posn-1 << endl;
//				scores.push_back(cons_score);
//			}
		}

		// Check the number of positions read matches the number declared in the header
		if ( num_positions != data.front().size() ) {
			BOOST_THROW_EXCEPTION(runtime_error_exception(
				"CORA legacy alignment number of positions was "
				+ lexical_cast<string>( data.front().size() )
				+ " not "
				+ lexical_cast<string>( num_positions )
			) );
		}

		alignment new_alignment = alignment_offset_1_factory( data );

		// Create a scores matrix and then empty any cells that are absent from the alignment
		// (each entry starts with a copy of the per-position scores)
		opt_score_vec_vec all_scores( new_alignment.num_entries(), scores );
		for (size_t entry = 0; entry < new_alignment.num_entries(); ++entry) {
			for (size_t index = 0; index < new_alignment.length(); ++index) {
				if ( ! has_position_of_entry_of_index( new_alignment, entry, index ) ) {
					all_scores[ entry ][ index ] = none;
				}
			}
		}
		set_scores( new_alignment, all_scores);
		return new_alignment;
	}
	// Catch any std::exception from the above (not only I/O failures but also the parsing/validation
	// errors thrown above if they derive from std::exception - TODO confirm), report the message
	// via perror() (which also appends the description of the current errno) and then rethrow it
	// as a runtime_error_exception
	catch (const std::exception &ex) {
		const string error_message(string("Cannot read CORA legacy alignment file [") + ex.what() + "] ");
		perror(error_message.c_str());
		BOOST_THROW_EXCEPTION(runtime_error_exception(error_message));
	};
}