/// Greedily wrap `text` into `lines` so that every emitted line, as measured by
/// `font->get_bounds()`, is no wider than `total_width`.
///
/// `lines` is cleared first. Candidate break offsets come from
/// calculate_text_breaks() (presumably ascending character offsets into the
/// text it is given -- confirm against that helper).
///
/// If a single unbreakable segment is wider than `total_width`, the function
/// stops early and `lines` holds only the lines completed so far.
///
/// \param font        Font used to measure candidate lines
/// \param text        Text to wrap (taken by value; consumed as lines are emitted)
/// \param total_width Maximum permitted line width
/// \param lines       Output: the wrapped lines
void break_text_lines(font_ptr font, xstring text, int total_width, str_vec& lines) {
    lines.clear();
    int_vec breaks = calculate_text_breaks(text);
    unsigned break_pos = 0;
    xstring cand = "";
    while (text.length() > 0 && break_pos < breaks.size()) {
        xstring sub = text.substr(0, breaks[break_pos]);
        Rect bounds = font->get_bounds(sub);
        if (bounds.get_width() > total_width) {
            // break_pos == 0 means not even the first segment of the current
            // line fits. `cand` may still hold the previous line at this point,
            // so it must not be trusted on its own: without this guard the
            // stale line would be pushed again and breaks[break_pos - 1] would
            // index with a wrapped-around unsigned value.
            if (break_pos == 0 || cand.empty())
                return; // Not enough space for single word
            lines.push_back(cand);
            // Continue with whatever follows the last break that still fitted
            text = text.substr(breaks[break_pos - 1]);
            text.trim();
            breaks = calculate_text_breaks(text);
            break_pos = 0;
        } else {
            // This prefix fits: remember it and try to extend to the next break
            cand = sub;
            break_pos++;
        }
    }
    // Whatever remains fitted in full (or there were no more breaks to try)
    if (text.length() > 0)
        lines.push_back(text.trim());
}
/// Concatenate the elements of `split` into `str`, placing the character `c`
/// between consecutive elements. `str` is overwritten; an empty `split`
/// leaves it empty.
static void joinStr(std::string &str, char c, str_vec &split) {
    str = "";
    for (unsigned idx = 0; idx < split.size(); ++idx) {
        // The separator goes before every element except the first
        if (idx != 0) {
            str += c;
        }
        str += split[idx];
    }
}
/// Split `str` on every occurrence of the character `c`, writing the pieces to
/// `split` (which is cleared first).
///
/// Empty pieces between adjacent separators are kept, but a trailing empty
/// piece (input ending in `c`, or empty input) is not emitted.
static void splitStr(std::string &str, char c, str_vec &split) {
    split.clear();
    size_t piece_begin = 0;
    for (;;) {
        const size_t sep_at = str.find(c, piece_begin);
        if (sep_at == std::string::npos) {
            break;
        }
        split.push_back(str.substr(piece_begin, sep_at - piece_begin));
        piece_begin = sep_at + 1;
    }
    // Anything left after the final separator is the last piece
    if (piece_begin < str.length()) {
        split.push_back(str.substr(piece_begin, str.length() - piece_begin));
    }
}
/// \brief TODOCUMENT string score_name_helper::build_short_name(const string &arg_id_name, ///< TODOCUMENT const str_vec &arg_suffixes ///< TODOCUMENT ) { // Sanity check that the id_name isn't empty if ( arg_id_name.empty() ) { BOOST_THROW_EXCEPTION(out_of_range_exception("A score's id_name must not be empty")); } // Sanity check that the id_name contains no space characters if ( ! all( arg_id_name, ! is_space() ) ) { BOOST_THROW_EXCEPTION(out_of_range_exception( "A score's id_name mustn't contain any space characters (name was \"" + arg_id_name + "\"" )); } // Sanity check that the suffixes contain no space characters for (const string &suffix : arg_suffixes) { if ( ! all( suffix, ! is_space() ) ) { BOOST_THROW_EXCEPTION(out_of_range_exception( "A score's short name suffix mustn't contain any space characters (name was \"" + suffix + "\"" )); } } // Return the id_name plus any suffixes separated by full stops const auto suffix_string = arg_suffixes.empty() ? string() : ( "." + join( arg_suffixes, "." ) ); return arg_id_name + suffix_string; }
/// \brief Build an alignment from strings of 'x's for positions and '-'s for gaps (and ignored spaces for formatting) alignment cath::test::alignment_gap_test_suite_fixture::make_gap_alignment_of_strings(const str_vec &arg_strings ///< The strings from which to make the alignment entries ) { aln_posn_opt_vec_vec entries; entries.reserve( arg_strings.size() ); for (const string &the_string : arg_strings) { const string stripped_string = erase_all_copy( the_string, " " ); aln_posn_opt_vec positions; positions.reserve( stripped_string.length() ); size_t counter = 0; for (const char &character : stripped_string) { if ( character == 'x' ) { positions.push_back( counter ); ++counter; } else if ( character == '-' ) { positions.push_back( none ); } else { BOOST_THROW_EXCEPTION(invalid_argument_exception( "Unable to recognise character " + string{ character } + " in gap alignment string" )); } } entries.push_back( positions ); } return alignment( entries ); }
/// \brief TODOCUMENT /// /// \relates display_colourer display_colour_spec cath::get_colour_spec(const display_colourer &arg_colourer, ///< TODOCUMENT const pdb_list &arg_pdbs, ///< TODOCUMENT const str_vec &arg_names, ///< TODOCUMENT const alignment &arg_alignment ///< TODOCUMENT ) { const alignment::size_type num_entries = arg_alignment.num_entries(); const alignment::size_type aln_length = arg_alignment.length(); if ( aln_length <= 0 || num_entries <= 0 ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the alignment is empty")); } if ( num_entries != arg_pdbs.size() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of PDBs")); } if ( num_entries != arg_names.size() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of names")); } auto &&result_spec = arg_colourer.get_colour_spec( alignment_context( arg_pdbs, arg_names, arg_alignment ) ); return has_score_colour_handler( arg_colourer ) ? adjust_display_colour_spec_copy( std::forward< decltype( result_spec ) >( result_spec ), get_score_colour_handler( arg_colourer ), arg_alignment ) : result_spec; }
/// \brief TODOCUMENT /// /// \relates viewer str_vec cath::clean_names_for_viewer(const str_vec &arg_names ) { str_vec new_names; new_names.reserve(arg_names.size()); for (const string &name : arg_names) { new_names.push_back(clean_name_for_viewer(name)); } return new_names; }
/// \brief Parse an hbond from the specified string from within a DSSP line dsspfile_hbond_opt dssp_dupl_fixture::parse_dsspfile_bond(const string &arg_hbond_string ///< The string containing the DSSP h-bond data (eg "-2,-2.6") ) { const str_vec parts = split_build<str_vec>( arg_hbond_string, is_any_of( "," ), token_compress_on ); if ( parts.size() != 2 ) { BOOST_THROW_EXCEPTION(runtime_error_exception("Did not find two parts in DSSP file h-bond")); } const int offset = stoi( trim_copy( parts.front() ) ); const double energy = stod( trim_copy( parts.back () ) ); if ( offset == 0 ) { if ( energy != 0.0 ) { BOOST_THROW_EXCEPTION(runtime_error_exception("Whilst try to parse H-bond from DSSP file data, non-zero energy for zero offset")); } return none; } else { return dsspfile_hbond{ offset, energy }; } }
/**
 * Expand the wildcards in a pattern.
 *
 * Every unescaped '*' in wild is replaced by an element of replacements
 * chosen with rand_int(0, replacements.size() - 1). A backslash is removed
 * and makes the character after it literal (so "\*" yields "*" and "\\"
 * yields "\"). A backslash that is the last character is dropped.
 *
 * @param wild The pattern to expand
 * @param replacements Candidate strings to substitute for each '*'
 * @return The expanded string, or wild unchanged (escapes included) when
 * replacements is empty
 */
str wild_replace(const str wild, const str_vec& replacements)
{
	if(replacements.empty())
		return wild;

	str result;
	bool escaped = false; // true when the previous character was an unescaped backslash

	for(const char ch: wild)
	{
		if(escaped)
		{
			// Whatever follows a backslash is copied through verbatim
			escaped = false;
			result += ch;
			continue;
		}

		switch(ch)
		{
			case '\\':
				escaped = true;
				break;
			case '*':
				result += replacements[rand_int(0, replacements.size() - 1)];
				break;
			default:
				result += ch;
				break;
		}
	}

	return result;
}
/// \brief Write instructions for the specified viewer to the specified ostream /// to represent the specified alignment_free_display_colourer in the context of /// the specified cleaned structure names /// /// \relates display_colourer void cath::colour_viewer(const alignment_free_display_colourer &arg_colourer, ///< The alignment_free_display_colourer to write to the ostream ostream &arg_os, ///< The ostream to which the instructions should be written const viewer &arg_viewer, ///< The viewer defining the instructions to be written const str_vec &arg_cleaned_names_for_viewer ///< The names of the structures (cleaned for the viewer) ) { colour_viewer_with_spec( arg_colourer.get_colour_spec_from_num_entries( arg_cleaned_names_for_viewer.size() ), arg_viewer, arg_cleaned_names_for_viewer, arg_os ); }
/// \brief TODOCUMENT /// /// \relates display_colourer display_colour_spec cath::get_colour_spec(const display_colourer &arg_colourer, ///< TODOCUMENT const pdb_list &arg_pdbs, ///< TODOCUMENT const str_vec &arg_names, ///< TODOCUMENT const alignment &arg_alignment ///< TODOCUMENT ) { const alignment::size_type num_entries = arg_alignment.num_entries(); const alignment::size_type aln_length = arg_alignment.length(); if ( aln_length <= 0 || num_entries <= 0 ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the alignment is empty")); } if ( num_entries != arg_pdbs.size() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of PDBs")); } if ( num_entries != arg_names.size() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Unable to colour the alignment_context because the number of entries doesn't match the number of names")); } return arg_colourer.get_colour_spec( alignment_context( arg_pdbs, arg_names, arg_alignment ) ); }
/// \brief Parse a FASTA format input into an alignment /// /// At present, each sequences must contain all of the residues of the corresponding PDB /// (because the index in the PDB is required in the alignment). /// /// !!Case insensitive!! /// /// \todo !URGENT! Test what this does when given structures with incomplete residues near the start. /// It looks like it gives indices in the PDB, rather than in the protein (which only /// contains backbone-complete residues). This is a serious issue! /// /// \todo Generalise this so that it's possible to read alignments against a pdb_list or a protein_list /// /// The code will attempt to handle missing residues with a warning if there are a small number. /// It will fail if the percentage is too low. /// /// \relates alignment alignment cath::align::read_alignment_from_fasta(istream &arg_istream, ///< The istream from which to read the FASTA input for parsing const amino_acid_vec_vec &arg_amino_acid_lists, ///< TODOCUMENT const str_vec &arg_names, ///< A vector of names, each of which should be found within the corresponding sequence's ID ostream &arg_stderr ///< An ostream to which any warnings should be output (currently unused) ) { if ( arg_amino_acid_lists.empty() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot load a FASTA alignment with 0 PDB entries")); } const size_t num_entries = arg_amino_acid_lists.size(); if ( arg_names.size() != num_entries ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot load a FASTA alignment with a different number of names and PDB entries")); } arg_istream.exceptions( ios::badbit ); try { const str_str_pair_vec sequence_of_id = read_ids_and_sequences_from_fasta( arg_istream ); const size_t num_sequences = sequence_of_id.size(); if ( num_entries != num_sequences ) { BOOST_THROW_EXCEPTION(runtime_error_exception( "Number of sequences parsed from FASTA (" + lexical_cast<string>( num_sequences ) + ") doesn't match the number of PDBs/names (" + 
lexical_cast<string>( num_entries ) + ")" )); } const size_t sequence_length = sequence_of_id.front().second.length(); opt_aln_posn_vec_vec positions; positions.reserve( num_entries ); for (size_t entry_ctr = 0; entry_ctr < num_entries; ++entry_ctr) { const amino_acid_vec &amino_acids = arg_amino_acid_lists [ entry_ctr ]; const string &name = arg_names [ entry_ctr ]; const str_str_pair &id_and_sequence = sequence_of_id[ entry_ctr ]; const string &id = id_and_sequence.first; const string &sequence = id_and_sequence.second; if ( sequence.length() != sequence_length ) { BOOST_THROW_EXCEPTION(runtime_error_exception( "When attempting to parse entry number " + lexical_cast<string>( entry_ctr + 1 ) + " of FASTA alignment, the length of the sequence (" + lexical_cast<string>( sequence.length() ) + ") does not match the length of the first sequence (" + lexical_cast<string>( sequence_length ) + ")" )); } if ( ! icontains( id, name ) ) { BOOST_THROW_EXCEPTION(runtime_error_exception( "When attempting to parse entry number " + lexical_cast<string>( entry_ctr + 1 ) + " of FASTA alignment, name \"" + name + "\" could not be found in a case-insensitive search within FASTA header ID \"" + id + "\"" )); } positions.push_back( align_sequence_to_amino_acids( sequence, amino_acids, name, arg_stderr ) ); } return alignment( positions ); } // Catch any I/O exceptions catch (const std::exception &ex) { const string error_message(string("Cannot read FASTA legacy alignment file [") + ex.what() + "] "); perror(error_message.c_str()); BOOST_THROW_EXCEPTION(runtime_error_exception(error_message)); }; }
/// \brief Read an alignment (and its per-position conservation scores) from CORA legacy format
///
/// \relates alignment
///
/// Each residue in the file is looked up in the residue names of the first chain of the
/// corresponding PDB to find its index. The conservation score of each line is stored
/// against every entry that is present at that position.
///
/// Anything derived from std::exception that is thrown while reading or parsing is caught,
/// its message is passed to perror() and it is rethrown as a runtime_error_exception.
///
/// \throws invalid_argument_exception if arg_pdbs is empty
/// \throws runtime_error_exception    on any failure whilst reading/parsing (see above)
///
/// CORA file format
/// ----------------
///
/// The header consists of the following
///  - One format line '#FM CORA_FORMAT 1.1'
///  - Any number of comment lines '#CC'
///  - Total number of proteins in the alignment
///  - All CATH domain names in the alignment
///  - Total number of alignment positions
///
/// For example:
///
///     #FM CORA_FORMAT 1.1
///     #CC
///     #CC Any number of comment lines (200 characters max per line)
///     #CC
///     #CC
///     3
///     6insE0 1igl00 1bqt00
///     73
///
/// The body consists of one fixed-width line per alignment position:
///
///     START         PROT 1     PROT 2     PROT N     END
///     <------------><---------><---------><---------><---------------->
///     ddddxddddxddddxddddcxcxxcxddddcxcxxcxddddcxcxxcxxxcxddddxddddxxdd
///
/// Example rows (NOTE: the spacing shown here is illustrative only, not column-accurate):
///
///     1 0 1 0 0 0 1 A 0 0 0 0 0 0 0 0
///     2 0 1 0 0 0 2 Y 0 0 0 0 0 0 0 0
///     3 0 2 1B F 0 3 R 0 0 0 0 0 0 0 0
///     4 0 3 2B V H 4 P 0 1 G 0 0 1 0 2
///     5 1 3 3B N H 5 S 0 2 P 0 0 1 0 6
///     6 0 3 4B Q H 6 E 0 3 E 0 0 1 0 2
///
/// START (14 characters) :
///  - Column 1: Alignment Position (dddd)
///  - Column 2: No. of position selected for structural template (dddd)
///  - Column 3: No. of proteins aligned at this position (dddd)
///
/// PROT 1,2,3... (11 characters per protein)
///  - Column 4 (7,10 etc): Residue number in PDB file (ddddc) 4 digit number
///    + 1 character insert code
///    Importantly the insert code is always in the same position, not within
///    the 4 characters reserved for the pdb number
///  - Column 5 (8,11 etc): Amino Acid Code (c)
///  - Column 6 (9,12 etc): Secondary Structure Assignment (c)
///
/// END (18 characters)
///  - Last Column-3: Consensus Secondary Structure Assignment (c)
///  - Last Column-2: No. of alpha residues at this position (dddd)
///  - Last Column-1: No. of beta residues at this position (dddd)
///  - Last Column: Structural Conservation Score (dd)
alignment cath::align::read_alignment_from_cath_cora_legacy_format(istream        &arg_istream, ///< The istream from which to read the CORA legacy data
                                                                   const pdb_list &arg_pdbs,    ///< The PDBs of the aligned proteins, in the same order as in the file (must be non-empty)
                                                                   ostream        &arg_stderr   ///< An ostream that is passed on to search_for_residue_in_residue_names()
                                                                   ) {
	// The fixed widths of the three kinds of section in each main data line
	const size_t CHARS_IN_MAIN_DATA_LINE_START = 14;
	const size_t CHARS_IN_MAIN_DATA_LINE_PROT  = 11;
	const size_t CHARS_IN_MAIN_DATA_LINE_END   = 18;

	if (arg_pdbs.empty()) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot load a CORA legacy alignment with 0 PDB entries"));
	}

	arg_istream.exceptions(ios::badbit);

	try {
		// Gather each PDB's first-chain residue names, against which the file's residues are looked up
		residue_name_vec_vec residue_names_of_first_chains;
		for (const pdb &arg_pdb : arg_pdbs) {
			residue_names_of_first_chains.push_back( arg_pdb.get_residue_names_of_first_chain__backbone_unchecked() );
		}

		// Check the first line is the file format line
		string line_string;
		getline(arg_istream, line_string);
		if (!starts_with(line_string, "#FM CORA_FORMAT ")) {
			BOOST_THROW_EXCEPTION(runtime_error_exception("No CORA header file format line"));
		}

		// Skip any comment lines
		// (on exit, line_string holds the first non-comment line, which is parsed next)
		while (getline(arg_istream, line_string) && starts_with(line_string, "#CC")) {
		}

		// Grab the number of proteins and ensure the alignment matches
		const size_t num_proteins = lexical_cast<size_t>(line_string);
		if (num_proteins != arg_pdbs.size()) {
			BOOST_THROW_EXCEPTION(invalid_argument_exception("Number of PDBs in CORA file is " + lexical_cast<string>(num_proteins) + ", which does not match " + lexical_cast<string>(arg_pdbs.size())));
		}
		const size_t num_chars_in_main_data_line = CHARS_IN_MAIN_DATA_LINE_START + (num_proteins * CHARS_IN_MAIN_DATA_LINE_PROT) + CHARS_IN_MAIN_DATA_LINE_END;

		// Grab the protein names
		// (only the number of names is checked; the names themselves aren't used further in this function)
		getline( arg_istream, line_string );
		trim( line_string );
		const str_vec names = split_build<str_vec>( line_string, is_space() );
		if ( names.size() != num_proteins ) {
			BOOST_THROW_EXCEPTION(runtime_error_exception("Splitting on space does not give " + lexical_cast<string>(num_proteins) + " entries in CORA alignment names line: \"" + line_string + "\""));
		}

		// Grab the total number of alignment positions
		getline(arg_istream, line_string);
		const size_t num_positions = lexical_cast<size_t>(line_string);

		// Prepare the data structures to populate
		//  - posns  : per protein, the index of the most recently found residue
		//             (passed to the residue search - presumably as the place to resume from; confirm against that helper)
		//  - scores : one conservation score per alignment position
		//  - data   : per protein, one optional (1-offset) residue index per alignment position
		aln_posn_vec posns( num_proteins, 0 );
		opt_score_vec scores;
		scores.reserve( num_positions );
		opt_aln_posn_vec_vec data( num_proteins );
		for (opt_aln_posn_vec &data_col : data) {
			data_col.reserve( num_positions );
		}

		// Loop over the main data section
		while (getline(arg_istream, line_string)) {
			// Check the line is of the correct length
			if (line_string.length() != num_chars_in_main_data_line) {
				BOOST_THROW_EXCEPTION(runtime_error_exception("Number of characters in main data line does not equal " + lexical_cast<string>(num_chars_in_main_data_line)));
			}

			// Grab the global details from start of this line
			const size_t alignment_posn = lexical_cast<size_t>( trim_copy( line_string.substr( 0, 4 ))); // Column 1: Alignment Position (dddd)
//			const size_t num_entries_in_temp = lexical_cast<size_t>( trim_copy( line_string.substr( 5, 4 ))); // Column 2: No. of position selected for structural template (dddd)
			const size_t num_entries_in_posn = lexical_cast<size_t>( trim_copy( line_string.substr( 10, 4 ))); // Column 3: No. of proteins aligned at this position (dddd)

			// The position numbers in the file must count up from 1 without gaps
			if (alignment_posn != data.front().size() + 1) {
				BOOST_THROW_EXCEPTION(runtime_error_exception("Alignment position counter " + lexical_cast<string>(alignment_posn) + " does not match " + lexical_cast<string>(data.front().size() + 1)));
			}

			// Loop over the indices of the proteins
			size_t num_present_posns(0);
			for (size_t prot_ctr = 0; prot_ctr < num_proteins; ++prot_ctr) {
				// Prepare string and other data for this protein
				const size_t prot_string_offset = CHARS_IN_MAIN_DATA_LINE_START + prot_ctr * CHARS_IN_MAIN_DATA_LINE_PROT;
				const string prot_string = line_string.substr( prot_string_offset, CHARS_IN_MAIN_DATA_LINE_PROT );
				opt_aln_posn_vec &data_col = data [ prot_ctr ];
				aln_posn_type &posn = posns[ prot_ctr ];

				// Grab the details for this protein
				const int residue_num = lexical_cast<int>( trim_copy( prot_string.substr( 1, 4 ))); // Column 4 (7,10 etc): Residue number in PDB file (ddddc) 4 digit number
				const char insert_code = prot_string.at( 5 ) ; // + 1 character insert code
				const char amino_acid = prot_string.at( 7 ) ; // Column 5 (8,11 etc): Amino Acid Code (c)
//				const char sec_struc = prot_string.at( 10 ) ; // Column 6 (9,12 etc): Secondary Structure Assignment (c)

				// Find the residue in the list of this PDB's residue names
				const residue_name_vec &residues_names = residue_names_of_first_chains[ prot_ctr ];
				const residue_name res_name = make_residue_name_with_non_insert_char( residue_num, insert_code, ' ' );
				const opt_aln_posn find_result = search_for_residue_in_residue_names( posn, residues_names, amino_acid, res_name, arg_stderr );

				// Store the found index plus one (the alignment is later built by alignment_offset_1_factory()), or none if not found
				data_col.push_back( find_result ? opt_aln_posn( ( *find_result ) + 1 ) : opt_aln_posn( none ) );
				if ( find_result ) {
					// Remember where this protein has got to and count it as present at this position
					posn = *find_result;
					++num_present_posns;
				}
			}

			// The number of residues found must agree with the count stated at the start of the line
			if (num_present_posns != num_entries_in_posn) {
				BOOST_THROW_EXCEPTION(runtime_error_exception( "Number of positions for alignment_posn " + lexical_cast<string>(alignment_posn) + " was " + lexical_cast<string>(num_present_posns) + " not " + lexical_cast<string>(num_entries_in_posn) ));
			}

			// Prepare the string for the global details at the end of this line
			const size_t end_string_offset = CHARS_IN_MAIN_DATA_LINE_START + num_proteins * CHARS_IN_MAIN_DATA_LINE_PROT;
			const string end_string = line_string.substr( end_string_offset, CHARS_IN_MAIN_DATA_LINE_END );

			// Grab the global details from the end of this line
//			const size_t cons_sec_struc = end_string.at( 3 ) ; // Last Column-3: Consensus Secondary Structure Assignment (c)
//			const size_t num_alpha_res = lexical_cast<size_t>( trim_copy( end_string.substr( 5, 4 ))); // Last Column-2: No. of alpha residues at this position (dddd)
//			const size_t num_beta_res = lexical_cast<size_t>( trim_copy( end_string.substr( 10, 4 ))); // Last Column-1: No. of beta residues at this position (dddd)
			const size_t cons_score = lexical_cast<size_t>( trim_copy( end_string.substr( 16, 2 ))); // Last Column: Structural Conservation Score (dd)
			scores.push_back( numeric_cast<double>( cons_score ) );

//			// If there are multiple entries in this position then store the score
//			if (num_entries_in_posn > 1) {
////				cerr << "Adding score for " << alignment_posn-1 << endl;
//				scores.push_back(cons_score);
//			}
		}

		// The number of data lines read must agree with the count stated in the header
		if ( num_positions != data.front().size() ) {
			BOOST_THROW_EXCEPTION(runtime_error_exception( "CORA legacy alignment number of positions was " + lexical_cast<string>( data.front().size() ) + " not " + lexical_cast<string>( num_positions ) ) );
		}

		alignment new_alignment = alignment_offset_1_factory( data );

		// Create a scores matrix and then empty any cells that are absent from the alignment
		// (every entry starts with the full list of per-position scores)
		opt_score_vec_vec all_scores( new_alignment.num_entries(), scores );
		for (size_t entry = 0; entry < new_alignment.num_entries(); ++entry) {
			for (size_t index = 0; index < new_alignment.length(); ++index) {
				if ( ! has_position_of_entry_of_index( new_alignment, entry, index ) ) {
					all_scores[ entry ][ index ] = none;
				}
			}
		}
		set_scores( new_alignment, all_scores);
		return new_alignment;
	}
	// Catch any I/O exceptions
	// (NOTE(review): this catches anything derived from std::exception, which presumably includes
	//  the parse/validation exceptions thrown above - confirm against the exception hierarchy)
	catch (const std::exception &ex) {
		const string error_message(string("Cannot read CORA legacy alignment file [") + ex.what() + "] ");
		perror(error_message.c_str());
		BOOST_THROW_EXCEPTION(runtime_error_exception(error_message));
	};
}
/**
 * IPv4 IPv6 agnostic OOB (out Of Band) comms
 *
 * Resolves host:port as a datagram endpoint, connects to the first address
 * that accepts a connection, sends "\xFF\xFF\xFF\xFF" followed by cmd, and
 * collects the replies. Receiving continues for as long as each reply fills
 * the 2048 byte buffer exactly.
 *
 * The timeout clock starts after name resolution and before connecting.
 * The socket is closed on every return path.
 *
 * @param cmd Command to send (the four 0xFF bytes are prepended here)
 * @param packets Returned packets (cleared before receiving)
 * @param host Host to connect to
 * @param port Port to connect on
 * @param wait Timeout duration in milliseconds
 * @return false if failed to connect/send or receive else true
 */
bool aocom(const str& cmd, str_vec& packets, const str& host, int port
	, siz wait = TIMEOUT)
{
	addrinfo hints;
	memset(&hints, 0, sizeof hints);
	hints.ai_family = AF_UNSPEC; // AF_INET or AF_INET6
	hints.ai_socktype = SOCK_DGRAM;

	addrinfo* res = nullptr;
	// Keep the call and the comparison separate: writing
	// `int status = getaddrinfo(...) != 0` stores the comparison result,
	// so gai_strerror() would be handed 1 rather than the real error code.
	const int status = getaddrinfo(host.c_str(), std::to_string(port).c_str(), &hints, &res);
	if(status != 0)
	{
		log(gai_strerror(status));
		return false;
	}

	const st_time_point timeout = st_clk::now() + std::chrono::milliseconds(wait);

	// try to connect to each
	int cs = -1;
	for(const addrinfo* p = res; p; p = p->ai_next)
	{
		const int fd = socket(p->ai_family, p->ai_socktype, p->ai_protocol);
		if(fd == -1)
			continue;
		if(!connect(fd, p->ai_addr, p->ai_addrlen))
		{
			cs = fd;
			break;
		}
		::close(fd);
	}

	freeaddrinfo(res);

	if(cs == -1)
	{
		log("aocom: failed to connect: " << host << ":" << port);
		return false;
	}

	// cs good; make sure it is closed however this function is left
	struct socket_closer
	{
		const int fd;
		explicit socket_closer(int fd): fd(fd) {}
		~socket_closer() { ::close(fd); }
		socket_closer(const socket_closer&) = delete;
		socket_closer& operator=(const socket_closer&) = delete;
	}
	closer(cs);

	const str msg = "\xFF\xFF\xFF\xFF" + cmd;

	int n = 0;
	if((n = send(cs, msg.c_str(), msg.size(), 0)) < 0 || n < static_cast<int>(msg.size()))
	{
		log("cs send: " << strerror(errno));
		return false;
	}

	packets.clear();

	char buf[2048];
	const int buf_size = static_cast<int>(sizeof(buf));

	// A reply that fills the buffer exactly is taken to mean more may follow
	n = buf_size;
	while(n == buf_size)
	{
		// Poll without blocking until data arrives or the deadline passes
		while((n = recv(cs, buf, sizeof(buf), MSG_DONTWAIT)) == -1
			&& (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR))
		{
			if(st_clk::now() > timeout)
			{
				log("socket timed out connecting to: " << host << ":" << port);
				return false;
			}
			std::this_thread::sleep_for(std::chrono::milliseconds(10));
		}

		if(n < 0)
		{
			// A hard receive error is a failure, as the contract above states
			log("cs recv: " << strerror(errno));
			return false;
		}

		if(n > 0)
			packets.push_back(str(buf, n));
	}

	return true;
}