Example #1
0
    /// Fill the padding cells of the layer by repeating edge values outwards.
    ///
    /// For every row, the value at column `last_col` is copied into the
    /// `padding_offset` cells after it and the value at column `padding_offset`
    /// is copied into the `padding_offset` cells before it. The same is then
    /// done column-wise for rows `last_row` and `padding_offset`.
    /// NOTE(review): presumably the non-padding data occupies indices
    /// [padding_offset, padding_offset + num - 1] in each dimension - confirm.
    void fill_padding_repeat()
    {
        using boost::irange;

        auto& cells = this->layer_array;

        const auto pad = ValueLayer::padding_offset;
        const auto bottom_row = this->num_rows() + pad - 1;
        const auto right_col = this->num_cols() + pad - 1;

        // Horizontal pass: extend every row (including padding rows) to the
        // right and to the left.
        for (auto r : irange(0lu, util::rows(cells))) {
            auto const& right_edge = cells[r][right_col];
            auto const& left_edge = cells[r][pad];
            for (auto step : irange(1, pad + 1)) {
                cells[r][right_col + step] = right_edge;
                cells[r][pad - step] = left_edge;
            }
        }

        // Vertical pass: extend every column (including the padding columns
        // written above) downwards and upwards.
        for (auto c : irange(0lu, util::cols(cells))) {
            auto const& bottom_edge = cells[bottom_row][c];
            auto const& top_edge = cells[pad][c];
            for (auto step : irange(1, pad + 1)) {
                cells[bottom_row + step][c] = bottom_edge;
                cells[pad - step][c] = top_edge;
            }
        }
    }
/// \brief TODOCUMENT
///
/// \todo Could this be generalised (perhaps with a non-member function
///       to make it convenient for the particulars of aligned_pair_score_value_list) ?
void score_classn_value_results_set::add_aligned_pair_score_value_list(const aligned_pair_score_value_list &arg_aligned_pair_score_value_list, ///< TODOCUMENT
                                                                       const bool                          &arg_condition_is_positive,         ///< TODOCUMENT
                                                                       const string                        &arg_instance_label                 ///< TODOCUMENT
                                                                       ) {
	const size_t num_score_values = arg_aligned_pair_score_value_list.size();
	if ( empty() ) {
		for (const size_t &score_value_ctr : irange( 0_z, num_score_values ) ) {
			const auto   &score            = arg_aligned_pair_score_value_list.get_aligned_pair_score_of_index( score_value_ctr );
			const string &name             = score.human_friendly_short_name();
			const bool   &higher_is_better = score.higher_is_better();
			score_classn_value_lists.push_back( make_score_classn_value_list( {}, higher_is_better, name ) );
		}
		sort_score_classn_value_lists();
	}
	else {
		const auto    new_names      = get_all_names( arg_aligned_pair_score_value_list );
		const str_set new_names_set( common::cbegin( new_names ), common::cend( new_names ) );
		const size_t  num_series     = size();
		const auto    existing_names = transform_build<str_vec>(
			irange( 0_z, num_series ),
			[&] (const size_t &x) { return get_name_of_index( *this, x ); }
		);

		if ( ! equal( existing_names, new_names_set ) ) {
//			cerr << "existing_names : " << join( existing_names, ", " ) << endl;
//			cerr << "new_names_set  : " << join( new_names_set,  ", " ) << endl;
			BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot add aligned_pair_score_value_list because the series names don't match the existing ones"));
		}
	}

	for (const size_t &score_value_ctr : irange( 0_z, num_score_values ) ) {
		const auto   &value            = arg_aligned_pair_score_value_list.get_value_of_index             ( score_value_ctr );
		const auto   &score            = arg_aligned_pair_score_value_list.get_aligned_pair_score_of_index( score_value_ctr );
		const string &name             = score.human_friendly_short_name();
		const bool   &higher_is_better = score.higher_is_better();

		score_classn_value_list &the_list = get_score_classn_value_list_of_name( name );
		if ( get_higher_is_better( the_list ) != higher_is_better ) {
			BOOST_THROW_EXCEPTION(invalid_argument_exception( "Cannot add aligned_pair_score_value_list to score_classn_value_results_set because they conflict regarding the higher_is_better value for score " + name ));
		}

		the_list.add_score_classn_value(
			score_classn_value(
				value,
				arg_condition_is_positive,
				arg_instance_label
			)
		);
	}
}
/// \brief Select the indices of the best-scoring pairs that together account for best_score_percentage % of the total score
///
/// Each pair is scored by the smaller of its two values. Indices are visited in descending order of
/// that score (ties kept in their original order) and are selected until the running sum first exceeds
/// the cutoff; the index at which the sum exceeds the cutoff is not itself selected.
///
/// \returns The selected indices, sorted
size_vec common_residue_select_best_score_percent_policy::do_select_common_residues_with_scores(const doub_doub_pair_vec &arg_scores ///< The pairs of scores, one pair per index
                                                                                                ) const {
	// Reduce each pair of scores to the smaller of its two values
	const auto min_scores = transform_build<doub_vec>(
		arg_scores,
		[] (const doub_doub_pair &score_pair) { return min( score_pair.first, score_pair.second ); }
	);

	// Stable-sort the indices so that the indices of the highest scores come first
	const auto score_sorted_indices = stable_sort_build<size_vec>(
		irange( 0_z, min_scores.size() ),
		[&] (const size_t &lhs, const size_t &rhs) {
			return ( min_scores.at( lhs ) > min_scores.at( rhs ) );
		}
	);

	// The cutoff is best_score_percentage % of the sum of all the scores
	const double percentile_total_score = accumulate( min_scores, 0.0 ) * best_score_percentage / 100.0;

	// Walk along the sorted indices, summing their scores, and stop at the
	// first index whose score takes the running sum above the cutoff
	double running_sum = 0.0;
	auto   cutoff_itr  = common::cbegin( score_sorted_indices );
	while ( cutoff_itr != score_sorted_indices.end() ) {
		running_sum += min_scores[ *cutoff_itr ];
		if ( running_sum > percentile_total_score ) {
			break;
		}
		++cutoff_itr;
	}

	// Return a sorted copy of the indices that precede the stopping point
	return sort_build<size_vec>(
		common::cbegin( score_sorted_indices ),
		cutoff_itr
	);
}
Example #4
0
/// Build a 2D vector of MATRIX_TYPE with the dimensions of the first input
/// image, filling each cell with `convert_element(input, row, col)`.
///
/// `input` must contain at least one matrix: its first element supplies the
/// row and column counts.
decltype(auto) convert_helper(const std::vector<cv::Mat>& input,
                              CONVERT_FUN convert_element)
{
    const auto& first_image = input[0];
    const auto num_rows = first_image.rows;
    const auto num_cols = first_image.cols;

    auto converted = util::make_vector_2d<MATRIX_TYPE>(num_rows, num_cols);
    for (auto r = 0; r < num_rows; ++r) {
        for (auto c = 0; c < num_cols; ++c) {
            converted[r][c] = convert_element(input, r, c);
        }
    }
    return converted;
}
Example #5
0
/// \brief Generate the names to use in the viewer for each of the specified number of colours
///
/// \returns One name per colour index in [0, arg_num_colours), each built by generate_colour_name()
str_vec cath::generate_colour_names(const size_t &arg_num_colours ///< The total number of colours
                                    ) {
	str_vec colour_names;
	colour_names.reserve( arg_num_colours );
	for (const size_t &colour_idx : irange( 0_z, arg_num_colours ) ) {
		colour_names.push_back( generate_colour_name( colour_idx, arg_num_colours ) );
	}
	return colour_names;
}
/// \brief TODOCUMENT
///
/// \relates score_classn_value_results_set
void cath::score::write_to_svm_light_data_files(const score_classn_value_results_set &arg_results_set,      ///< TODOCUMENT
                                                const path                           &arg_output_file_stem, ///< TODOCUMENT
                                                const size_t                         &arg_num_repeats,      ///< TODOCUMENT
                                                mt19937                              &arg_rng,              ///< TODOCUMENT
                                                const double                         &arg_fraction_train    ///< TODOCUMENT
                                                ) {
	const auto num_instances   = get_num_instances      ( arg_results_set );
	const auto names           = get_names              ( arg_results_set );
	const auto instance_labels = get_instance_labels    ( arg_results_set );
	const auto scalings        = get_value_list_scalings( arg_results_set );

	// Get a data structure in which all lists are sorted in the same way
	const auto results = transform_build<score_classn_value_vec_vec>(
		names,
		[&] (const string &x) {
			const auto &results_list = arg_results_set.get_score_classn_value_list_of_name( x );
			return get_score_classn_values_of_instance_labels( results_list, instance_labels );
		}
	);

	for (const size_t &repeat_ctr : irange( 1_z, arg_num_repeats + 1 ) ) {
		const auto repeat_ctr_str = lexical_cast<string>( repeat_ctr );

		const path train_file  = replace_extension_copy( arg_output_file_stem, "." + repeat_ctr_str + ".train" );
		const path test_file   = replace_extension_copy( arg_output_file_stem, "." + repeat_ctr_str + ".test"  );

		const size_vec_size_vec_pair split_indices = random_split( arg_rng, num_instances, arg_fraction_train );

		write_to_svm_light_data_files_impl( results, scalings, train_file, split_indices.first  );
		write_to_svm_light_data_files_impl( results, scalings, test_file,  split_indices.second );
	}
}
/// \brief Accumulate an alignment_scan_comparison over all (from, to) pairs of alignment indices
///
/// A pair contributes only if both indices have both positions present, both the a-positions and the
/// b-positions differ by more than NUM_EXCLUDED_ON_SIDES and the resulting score is greater than zero.
///
/// Things for investigation:
///  * raw/final score from alignment
///  * expected raw/final score
///  * top categorised reasons by raw score
///  * repeat for varying scan_stride (can just be called by nmnf fn)
alignment_scan_comparison check_scan_on_final_alignment::do_check(const alignment     &arg_alignment,  ///< The alignment whose index pairs are examined
                                                                  const protein       &arg_protein_a,  ///< The protein providing the residues for the a-positions
                                                                  const protein       &arg_protein_b,  ///< The protein providing the residues for the b-positions
                                                                  const quad_criteria &arg_criteria,   ///< The criteria passed to quad_and_rep_criteria_result_of()
                                                                  const scan_stride   &arg_scan_stride ///< The scan_stride passed to quad_and_rep_criteria_result_of()
                                                                  ) const {
	const auto aln_range = irange( 0_z, arg_alignment.length() );
	cerr << "SHOULD THE RANGE BE 7.0 RATHER THAN SQRT(40.0)????\n";

	// Handle one (from, to) pair of alignment indices, returning the (possibly updated) running comparison
	const auto add_index_pair = [&] (alignment_scan_comparison running, const size_size_tpl &index_pair) {
		const size_t from_idx  = get<0>( index_pair );
		const size_t to_idx    = get<1>( index_pair );
		const bool   from_full = has_both_positions_of_index( arg_alignment, from_idx );
		const bool   to_full   = has_both_positions_of_index( arg_alignment, to_idx   );
		if ( ! ( from_full && to_full ) ) {
			return running;
		}

		const auto a_from  = get_a_position_of_index( arg_alignment, from_idx );
		const auto b_from  = get_b_position_of_index( arg_alignment, from_idx );
		const auto a_to    = get_a_position_of_index( arg_alignment, to_idx   );
		const auto b_to    = get_b_position_of_index( arg_alignment, to_idx   );
		const bool a_far   = difference( a_from, a_to ) > NUM_EXCLUDED_ON_SIDES;
		const bool b_far   = difference( b_from, b_to ) > NUM_EXCLUDED_ON_SIDES;
		if ( ! ( a_far && b_far ) ) {
			return running;
		}

		const auto view_a       = view_vector_of_residue_pair(
			arg_protein_a.get_residue_ref_of_index( a_from ),
			arg_protein_a.get_residue_ref_of_index( a_to   )
		);
		const auto view_b       = view_vector_of_residue_pair(
			arg_protein_b.get_residue_ref_of_index( b_from ),
			arg_protein_b.get_residue_ref_of_index( b_to   )
		);
		const auto the_distance = distance_between_points( view_a, view_b );

		// The cutoff is sqrt( 40.0 ) but the scaling divisor is 7.0 (see the message printed above);
		// the alternative would be a cutoff of 7.0
		const auto score = ( the_distance >= sqrt( 40.0 ) ) ? 0.0 : ( 1.0 - the_distance / 7.0 );
		if ( ! ( score > 0.0 ) ) {
			return running;
		}

		running += make_pair(
			quad_and_rep_criteria_result_of(
				arg_protein_a,
				arg_protein_b,
				arg_criteria,
				arg_scan_stride,
				numeric_cast<index_type>( a_from ),
				numeric_cast<index_type>( a_to   ),
				numeric_cast<index_type>( b_from ),
				numeric_cast<index_type>( b_to   )
			),
			score
		);
		return running;
	};

	return accumulate(
		cross( aln_range, aln_range ),
		alignment_scan_comparison{},
		add_index_pair
	);
}
/// \brief Generate a string describing the difference between the specified dssp_dupl_ress parsed from a DSSP file
///        and the specified bifur_hbond_list or return none if there isn't any difference
///
/// Entries of arg_dssp_dupl_res_vec with a null pdb_residue_name are skipped; the i-th remaining
/// entry is compared with the i-th entry of arg_bifur_hbond_list. Only as many entries as are
/// present in both (non-null DSSP entries and bifur_hbond_list entries) are compared.
///
/// \returns The description of the first per-entry difference (prefixed by any count-mismatch message),
///          else the count-mismatch message if the counts differ, else none
///
/// \relates dssp_dupl_res
str_opt cath::sec::difference_string(const dssp_dupl_res_vec &arg_dssp_dupl_res_vec, ///< A vector of dssp_dupl_res parsed from a DSSP file
                                     const bifur_hbond_list  &arg_bifur_hbond_list   ///< The dssp_dupl_res_list to be compared to the dssp_dupl_res vector
                                     ) {
	const auto num_dssp_dupl_res = arg_dssp_dupl_res_vec.size();
	const auto num_bifur_hbonds  = arg_bifur_hbond_list.size();

	// The indices of the DSSP entries that have a non-null residue name
	const auto dssp_non_null_indices = copy_build<size_vec>(
		irange( 0_z, num_dssp_dupl_res )
			| filtered(
				[&] (const size_t &x) {
					return ! arg_dssp_dupl_res_vec[ x ].pdb_residue_name.is_null();
				}
			)
	);

	const size_t num_non_null_residues = dssp_non_null_indices.size();

	// A message describing any mismatch in the number of entries (or none if the numbers match)
	const str_opt num_prob = make_optional(
		( num_non_null_residues != num_bifur_hbonds ),
		"Number of (non-null) DSSP hbond residues ("
			+ ::std::to_string( num_non_null_residues )
			+ "), doesn't match the number of calculated hbond residues ("
			+ ::std::to_string( num_bifur_hbonds )
			+ "). "
	);

	// For each DSSP index, its index amongst the non-null entries (0 for null entries)
	const auto normal_index_of_dssp_index = [&] () {
		size_vec result( num_dssp_dupl_res, 0 );
		for (const size_t &x : irange( 0_z, num_non_null_residues ) ) {
			result[ dssp_non_null_indices[ x ] ] = x;
		}
		return result;
	}();

	// Compare entry-by-entry. The loop indexes dssp_non_null_indices so it must be bounded by
	// the number of non-null entries (not the total number of DSSP entries), otherwise it would
	// read past the end of dssp_non_null_indices whenever null entries are present
	for (const size_t &index : irange( 0_z, min( num_non_null_residues, num_bifur_hbonds ) ) ) {
		const auto diff_str = difference_string( arg_dssp_dupl_res_vec[ dssp_non_null_indices[ index ] ], arg_bifur_hbond_list[ index ], normal_index_of_dssp_index );
		if ( diff_str ) {
			return ( num_prob ? *num_prob : string{} ) + *diff_str;
		}
	}

	return num_prob
		? make_optional( *num_prob + " No differences spotted until the end of the shortest" )
		: none;
}
/// \brief Get the name of each of the entries in the specified score_classn_value_results_set
///
/// \returns The result of get_name_of_index() for each index in [0, size()), in index order
///
/// \relates score_classn_value_results_set
str_vec cath::score::get_names(const score_classn_value_results_set &arg_results_set ///< The results set to query
                               ) {
	str_vec names;
	for (const size_t &series_idx : irange( 0_z, arg_results_set.size() ) ) {
		names.push_back( get_name_of_index( arg_results_set, series_idx ) );
	}
	return names;
}
Example #10
0
// Build the list of file names for one row of images to be stitched.
//
// The pattern is fed the directory and each index in [start, start + rowSize]
// (inclusive at both ends, so rowSize + 1 names are produced). The names are
// returned in descending index order if reverse is true.
std::vector<std::string> generateStitchSet(const std::string& directory, boost::format pattern, bool reverse, int start, int rowSize) {
	const int lastIndex = start + rowSize;

	std::cout << format("Generating stitch set for range %1% to %2%\n") % start % lastIndex;
	std::cout << format("Reversing? %1%\n") % reverse;

	std::vector<std::string> names;
	for (auto index : irange(start, lastIndex + 1)) {
		names.push_back(boost::str(pattern % directory % index));
	}

	if (reverse) {
		std::reverse(names.begin(), names.end());
	}
	return names;
}
Example #11
0
/// \brief Private static method that implements the process of building the views from proteins
///
/// \returns A vector with one entry per residue, each of which contains the view_vector_of_residue_pair()
///          from that residue to every residue of the protein (in index order)
coord_vec_vec view_cache::build_views(const protein &arg_protein ///< The protein which the view_cache should be built to represent
                                      ) {
	const size_t num_residues = arg_protein.get_length();
	coord_vec_vec all_views( num_residues );

	// For each from-residue, reserve space for and then populate the views to every to-residue
	for (const size_t &from_idx : irange( 0_z, num_residues ) ) {
		coord_vec &views_from = all_views[ from_idx ];
		views_from.reserve( num_residues );

		for (const size_t &to_idx : irange( 0_z, num_residues ) ) {
			views_from.push_back(
				view_vector_of_residue_pair(
					arg_protein.get_residue_ref_of_index( from_idx ),
					arg_protein.get_residue_ref_of_index( to_idx   )
				)
			);
		}
	}
	return all_views;
}
/// \brief Build a colour spec that colours each entry using colour_of_mod_index() on this colourer's colours
///
/// \returns A broad_display_colour_spec with colour_pdb() called for each entry index in [0, arg_num_entries)
broad_display_colour_spec display_colourer_consecutive::do_get_colour_spec_from_num_entries(const size_t &arg_num_entries ///< The number of structures to be coloured
                                                                                            ) const {
	broad_display_colour_spec colour_spec;
	for (size_t entry_idx = 0; entry_idx < arg_num_entries; ++entry_idx) {
		const auto entry_colour = colour_of_mod_index( colours, entry_idx );
		colour_spec.colour_pdb( entry_idx, entry_colour );
	}
	return colour_spec;
}
/// \brief Return a copy of the specified entries with only the first entry kept for each (name_1, name_2) pair
///
/// Each later entry with an already-seen pair of names is dropped (after a message is written to cerr)
/// provided that it isn't better than the first entry for that pair.
///
/// This is implemented as a single explicit pass because the bookkeeping of previously-seen names
/// must be updated exactly once per entry, which a predicate inside a lazy range adaptor doesn't guarantee.
///
/// \throws invalid_argument_exception if a later duplicate has a lower full-sequence evalue or
///                                    a higher full-sequence score than the first entry with the same names
hmmer_scores_entry_vec hmmer_scores_file::remove_duplicates(const hmmer_scores_entry_vec &arg_hmmer_scores_entries ///< The entries from which duplicates should be removed
                                                            ) {
	hmmer_scores_entry_vec results;

	// The index of the first entry seen for each pair of names
	str_str_pair_size_map index_of_previously_seen;

	for (const size_t &x : irange( 0_z, arg_hmmer_scores_entries.size() ) ) {
		const auto &entry     = arg_hmmer_scores_entries[ x ];
		const auto &id1       = entry.get_name_1();
		const auto &id2       = entry.get_name_2();
		const auto  name_pair = make_pair( id1, id2 );

		// The first entry for a pair of names is always kept
		if ( ! contains( index_of_previously_seen, name_pair ) ) {
			index_of_previously_seen.emplace( name_pair, x );
			results.push_back( entry );
			continue;
		}

		cerr << "Ooo - crazy things happening with HMMER parsing" << "\n";

		// A duplicate is only dropped if it's no better than the first entry with the same names
		const auto &prev_entry = arg_hmmer_scores_entries[ index_of_previously_seen.at( name_pair ) ];
		if ( entry.get_full_sequence_evalue() <  prev_entry.get_full_sequence_evalue() ) {
			BOOST_THROW_EXCEPTION(invalid_argument_exception(
				"When parsing HMMER results, found hit between " + id1 + " and " + id2 + " with evalue better than for previous result"
			));
		}
		if ( entry.get_full_sequence_score()  >  prev_entry.get_full_sequence_score()  ) {
			BOOST_THROW_EXCEPTION(invalid_argument_exception(
				"When parsing HMMER results, found hit between " + id1 + " and " + id2 + " with simple score better than for previous result "
				+ " " + std::to_string( entry.get_full_sequence_score() )
				+ " " + std::to_string( prev_entry.get_full_sequence_score() )
			));
		}
	}

	return results;
}
/// \brief Get the pairs of alignment breaks that check_pair() reports as GOOD
///
/// Each break is paired with itself and with each later break in turn. The scan of later
/// breaks for a given first break stops as soon as check_pair() reports NEVER_AGAIN.
size_size_pair_vec cath::align::get_alignment_break_pairs(const alignment &arg_alignment ///< The alignment whose breaks should be paired
                                                          ) {
	const auto breaks     = get_alignment_breaks( arg_alignment );
	const auto num_breaks = breaks.size();

	size_size_pair_vec good_pairs;
	good_pairs.reserve( num_breaks );

	for (const size_t &first_idx : irange( 0_z, num_breaks ) ) {
		const size_t &first_break = breaks[ first_idx ];

		for (const size_t &second_idx : irange( first_idx, num_breaks ) ) {
			const size_t &second_break = breaks[ second_idx ];
			const auto    verdict      = check_pair( arg_alignment, first_break, second_break );

			if ( verdict.first == break_pair_validity::GOOD ) {
				good_pairs.emplace_back( first_break, second_break );
			}

			// No later second break is worth trying with this first break
			if ( verdict.second == break_pair_future::NEVER_AGAIN ) {
				break;
			}
		}
	}
	return good_pairs;
}
Example #15
0
/// Add one entry to `spec` for each element of `permutation`, mapping the
/// element's index to the element's value.
///
/// Both index and value are converted with number_to_truth_table_cube using
/// n bits, where n is the smallest number with 2^n >= permutation.size().
/// An empty permutation adds no entries.
void add_entries_from_permutation( binary_truth_table& spec, const std::vector<unsigned>& permutation )
{
  using boost::combine;
  using boost::irange;
  using boost::get;

  using size_type = std::vector<unsigned>::size_type;
  const size_type num_entries = permutation.size();

  // n is the bit width of ( num_entries - 1 ), which equals ceil( log2( num_entries ) )
  // for num_entries >= 1. Integer arithmetic is used so that the result is exact
  // (a floating-point log ratio can round up at powers of two) and so that an
  // empty permutation is well defined.
  unsigned n = 0u;
  for ( size_type remaining = ( num_entries > 0u ) ? ( num_entries - 1u ) : 0u; remaining > 0u; remaining >>= 1 )
  {
    ++n;
  }

  for ( const auto& i : combine( irange( 0u, static_cast<unsigned>( num_entries ) ),
                                 permutation ) )
  {
    spec.add_entry( number_to_truth_table_cube( get<0>( i ), n ),
                    number_to_truth_table_cube( get<1>( i ), n ) );
  }
}
Example #16
0
/// Run the iteration for each level in [1, height): erase bad links, then
/// repeat iteration_step() until the link activity of the level drops below
/// conf.activity_eps or the iteration counter equals conf.max_iterations.
///
/// Progress is written to std::cout, including whether each level stopped
/// because it converged or because the iteration limit was hit.
void Pyramid::iterate()
{
    using std::cout;
    using std::endl;
    using std::flush;
    using boost::irange;

    // NOTE(review): this counter is shared by all levels and is not reset
    // between them, so the `==` limit check below can only trigger once the
    // count has passed conf.max_iterations on an earlier level - confirm
    // whether a per-level counter was intended.
    auto iteration_cnt = 0;

    auto levels = irange(1lu, height);

    for (const auto lv : levels) {
        cout << endl
             << "=== Level    : " << lv << " === " << shape(lv)
             << " ===" << flush;

        assert(lv > 0);

        cout << endl << "Erasing bad links ..." << flush;
        erase_bad_links(lv);

        // Remember why the loop stopped: comparing the counter after the loop
        // is unreliable because the counter is incremented before the break.
        bool hit_iteration_limit = false;
        while (true) {
            cout << endl << "=== Iteration: " << iteration_cnt << " ===";

            auto spec_dist_fun = [this](auto node, auto desc) {
                return wishart_distance(node, desc);
            };

            iteration_step(lv, spec_dist_fun);

            const bool converged = link.activity(lv) < conf.activity_eps;
            const bool at_limit = (conf.max_iterations == iteration_cnt);
            iteration_cnt++;

            if (converged) {
                break;
            }
            if (at_limit) {
                hit_iteration_limit = true;
                break;
            }
        }
        if (hit_iteration_limit) {
            cout << endl << "Maximum Iterations reached." << endl;
        }
        else {
            cout << endl << "Converged." << endl;
        }
    }
}
Example #17
0
// Stitch five rows of five images each: for every row, generate the row's
// file names, load the images into memory and pass them to stitchSet().
// Rows with an even zero-based index are generated in reverse order.
int main(int argc, char** argv) {
	bool useGpu = true;
	float matchConfidence = 0.3f;

	std::string directory = "/Users/billylee/source/gladstone/data/cultured-neuron/ch1";
	boost::format filePattern("%1%/PID20151028_JL_T0_0_D1_%2%_488_642_Empty_684_1_0.0_AndorZyla110XELWD.tif");

	// read all images into memory, then do an estimateRigidTransform on all pairs
	// of images, and then do a cv::gpu::warpAffine

/*	
	std::vector<std::vector<cv::Mat>> imageGrid;
	for (auto i : irange(1, 6)) {
		for (auto j : irange(1, 6)) {
			std::string file = filePattern % directory % (i * j);
			imageGrid[i].push_back(cv::imread(file));
		}
	}

	std::vector<cv::MatchesInfo> pairwise_matches;
	cv::BestOf2NearestMatcher matcher(useGpu, matchConfidence);

	cv::Mat matchMask(features.size(), features.size(), cv::CV_8U, )

	cv::Mat stitched;
	cv::Stitcher stitcher = cv::Stitcher::createDefault(true);
	cv::Stitcher::Status status = stitcher.stitch(images, stitched);

	if (status != cv::Stitcher::OK) {
		std::cout << "Can't stitch!"
	}
*/
	for (auto row : irange (0, 5)) {
		// Rows 0, 2 and 4 are reversed
		const bool reverse = (row % 2) == 0;
		const int firstIndex = 5 * row + 1;
		auto files = generateStitchSet(directory, filePattern, reverse, firstIndex, 4);

		// Load the row's images into memory, in the order of the file names
		std::array<cv::Mat, 5> images;
		int slot = 0;
		for (const auto& file : files) {
			images[slot] = cv::imread(file);
			++slot;
		}
		stitchSet(images);
	}
}
/// \brief Make a named_true_false_pos_neg_list_list containing a named_true_false_pos_neg_list
///        for each of the entries in the specified score_classn_value_results_set
///
/// \throws invalid_argument_exception if the list retrieved for a name reports a different name
///
/// \relates score_classn_value_results_set
named_true_false_pos_neg_list_list cath::score::make_named_true_false_pos_neg_list_list(const score_classn_value_results_set &arg_score_classn_value_results_set ///< The results set from which to build
                                                                                        ) {
	// Make the named_true_false_pos_neg_list for the entry of the specified index
	const auto make_list_of_index = [&] (const size_t &series_idx) {
		const auto &series_name = get_name_of_index( arg_score_classn_value_results_set, series_idx );
		const auto &series      = arg_score_classn_value_results_set.get_score_classn_value_list_of_name( series_name );

		// \todo Put this check in get_score_classn_value_list_of_name()
		if ( series_name != series.get_name() ) {
			BOOST_THROW_EXCEPTION(invalid_argument_exception("Mismatching names in score_classn_value_results_set"));
		}
		return make_named_true_false_pos_neg_list( series );
	};

	return {
		transform_build<named_true_false_pos_neg_list_vec>(
			irange( 0_z, arg_score_classn_value_results_set.size() ),
			make_list_of_index
		)
	};
}
Example #19
0
/// Build the 4x4 matrix whose coefficients are the square roots of a fixed,
/// symmetric table of values (9 at the corners, 5 on the edges, 1 in the
/// centre).
/// NOTE(review): presumably the table holds (scaled) squared spatial
/// distances - confirm.
Mat4f Pyramid::make_spatial_distance_matrix() const
{
    using std::sqrt;

    auto distances = Mat4f(4, 4);

    /* clang-format off */

    distances << 9, 5, 5, 9,
                 5, 1, 1, 5,
                 5, 1, 1, 5,
                 9, 5, 5, 9;

    /* clang-format on */

    // Replace every coefficient by its square root.
    const auto num_coeffs = distances.size();
    for (auto idx = 0l; idx < num_coeffs; ++idx) {
        distances(idx) = sqrt(distances(idx));
    }
    return distances;
}
Example #20
0
/// Apply `smooth` to copies of the pyramid's values and links, level by level
/// from `top_level` down to level 0, then store the bottom of the smoothed
/// value copy as the pyramid's result.
///
/// The values returned by `smooth` are summed and printed as the number of
/// segments.
void smooth_helper(Pyramid& p, smooth_level_fun smooth, const size_t top_level)
{
    using std::cout;
    using std::endl;

    auto values = ImagePyramid{p.get_value2()};
    auto links = LinkPyramid{p.get_links()};
    auto segment_cnt = 0;

    // Visit the levels top_level, top_level - 1, ..., 0.
    for (auto lv = top_level + 1; lv-- > 0;) {
        cout << "Smoothing Level " << lv << " ... ";
        segment_cnt += smooth(values, links, lv);
        cout << endl;
    }

    p.set_result(values.bottom());
    cout << "Done." << endl;
    cout << "Number of Segments: " << segment_cnt << endl;
}
/// \brief Get the indices at which the specified alignment breaks
///
/// An index (after zero) is a break if the set of entries present at the previous index
/// is disjoint with the set of entries present at that index.
///
/// \returns The break indices in ascending order (empty for alignments of length 0 or 1)
size_vec cath::align::get_alignment_breaks(const alignment &arg_alignment ///< The alignment to examine
                                           ) {
	// An alignment of fewer than two positions has no index after zero to examine.
	// Returning here also avoids building irange( 1_z, 0_z ) for an empty alignment,
	// which would violate irange's requirement that first <= last
	const size_t aln_length = arg_alignment.length();
	if ( aln_length < 2 ) {
		return {};
	}

	// For each alignment index *after* zero, check whether there's any overlap
	// between the entries that are present at the previous index and the entries that are
	// present at this index. If not, add this index to the list.
	//
	// Note: Would prefer to use Boost Range's adjacent_filtered adaptor here but
	// it doesn't do what's required here: it always lets the first pair through,
	// even if they fail the predicate.
	return copy_build<size_vec>(
		irange( 1_z, aln_length )
			| filtered (
				[&] (const size_t &curr_idx) {
					// Return whether the set of present entries for the previous index
					// is disjoint with the set of present entries for this index
					return sets_are_disjoint(
						entries_present_at_index( arg_alignment, curr_idx - 1 ),
						entries_present_at_index( arg_alignment, curr_idx     )
					);
				}
			)
	);
}
/// \brief Build a ssaps_and_prcs_of_query from the specified ssap_scores_entries and prc_scores_entries
///
/// Each SSAP entry is combined with the PRC entry that has the same (name_1, name_2), if there is one;
/// SSAP entries without a matching PRC entry are left out. A warning is logged if any
/// SSAP or PRC entries are left unmatched.
///
/// \throws invalid_argument_exception if the SSAP entries don't all have the same name_1,
///                                    if the PRC entries don't all have the same name_1 or
///                                    if the name_1 of the SSAP entries differs from that of the PRC entries
///
/// \relates ssaps_and_prcs_of_query
ssaps_and_prcs_of_query cath::homcheck::make_ssaps_and_prcs_of_query(const ssap_scores_entry_vec &arg_ssaps, ///< The SSAPs from which the ssaps_and_prcs_of_query should be built
                                                                     const prc_scores_entry_vec  &arg_prcs   ///< The PRCs from which the ssaps_and_prcs_of_query should be built
                                                                     ) {
	// Input check 1 of 3: every adjacent pair of SSAP results must share the same name_1
	const auto ssap_neighbours_agree = [] (const pair<const ssap_scores_entry &, const ssap_scores_entry &> &x) {
		return x.first.get_name_1() == x.second.get_name_1();
	};
	if ( ! all_of( arg_ssaps | adjacented, ssap_neighbours_agree ) ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception(
			"Cannot construct ssaps_and_prcs_of_query from data because the SSAP results have inconsistent query IDs (first is \""
			+ arg_ssaps.front().get_name_1()
			+ "\")"
		));
	}

	// Input check 2 of 3: every adjacent pair of PRC results must share the same name_1
	const auto prc_neighbours_agree = [] (const pair<const prc_scores_entry &, const prc_scores_entry &> &x) {
		return x.first.get_name_1() == x.second.get_name_1();
	};
	if ( ! all_of( arg_prcs | adjacented, prc_neighbours_agree ) ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot construct ssaps_and_prcs_of_query from data because the PRC results have inconsistent query IDs (first is \""
			+ arg_prcs.front().get_name_1()
			+ "\")"
		));
	}

	// Input check 3 of 3: if there are both SSAP and PRC results, their name_1s must agree
	if ( ! ( arg_ssaps.empty() || arg_prcs.empty() ) ) {
		const string &ssap_query = arg_ssaps.front().get_name_1();
		const string &prc_query  = arg_prcs.front().get_name_1();
		if ( ssap_query != prc_query ) {
			BOOST_THROW_EXCEPTION(invalid_argument_exception(
				"Cannot construct ssaps_and_prcs_of_query from data because the SSAP results query ID \""
				+ ssap_query
				+ "\" does not match the PRC results query ID \""
				+ prc_query
				+ "\""
			));
		}
	}

	// Index the PRC results by their (name_1, name_2) pairs
	const auto prc_index_of_ids = transform_build<unordered_map<str_str_pair, size_t, pair_hash> >(
		irange( 0_z, arg_prcs.size() ),
		[&] (const size_t &prc_idx) {
			const auto &prc_entry = arg_prcs[ prc_idx ];
			return make_pair( make_pair( prc_entry.get_name_1(), prc_entry.get_name_2() ), prc_idx );
		}
	);

	// Pair up each SSAP result with the PRC result of the same IDs (skipping those without one)
	vector<ssap_and_prc> ssap_and_prc_entries;
	for (const ssap_scores_entry &ssap_entry : arg_ssaps) {
		const string &query_id = ssap_entry.get_name_1();
		const string &match_id = ssap_entry.get_name_2();

		const auto found_itr = prc_index_of_ids.find( make_pair( query_id, match_id ) );
		if ( found_itr == common::cend( prc_index_of_ids ) ) {
			continue;
		}
		ssap_and_prc_entries.emplace_back( ssap_entry, arg_prcs[ found_itr->second ] );
	}

	// Build the parts of a warning about any results that were left unmatched
	const auto num_ssaps = arg_ssaps.size();
	const auto num_prcs  = arg_prcs.size();
	const auto num_comb  = ssap_and_prc_entries.size();

	string query_id_str;
	if ( ! ssap_and_prc_entries.empty() ) {
		query_id_str = " (query: " + get_query_id( ssap_and_prc_entries ) + ")";
	}
	string unmatched_ssaps_str;
	if ( num_ssaps > num_comb ) {
		unmatched_ssaps_str = std::to_string( num_ssaps - num_comb ) + " unmatched SSAP results from " + std::to_string( num_ssaps );
	}
	string unmatched_prcs_str;
	if ( num_prcs > num_comb ) {
		unmatched_prcs_str = std::to_string( num_prcs - num_comb ) + " unmatched PRC results from " + std::to_string( num_prcs );
	}

	const bool   any_unmatched   = ! ( unmatched_ssaps_str.empty() && unmatched_prcs_str.empty() );
	const bool   both_unmatched  = ! ( unmatched_ssaps_str.empty() || unmatched_prcs_str.empty() );
	const string conjunction_str = both_unmatched ? " and " : "";
	if ( any_unmatched ) {
		BOOST_LOG_TRIVIAL( warning ) << "After parsing " << num_comb << " ssaps_and_prcs_of_query" << query_id_str << ", was left with " << unmatched_ssaps_str << conjunction_str << unmatched_prcs_str;
	}

	return { ssap_and_prc_entries };
}
/// \brief TODOCUMENT
void cath::score::detail::write_to_svm_light_data_files_impl(const score_classn_value_vec_vec &arg_results,             ///< TODOCUMENT
                                                             const value_list_scaling_vec     &arg_value_list_scalings, ///< TODOCUMENT
                                                             const path                       &arg_output_file,         ///< TODOCUMENT
                                                             const size_vec                   &arg_indices              ///< TODOCUMENT
                                                             ) {
	// Sanity check the inputs
	if ( ! is_sorted( arg_indices ) ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot write SVM data for indices that aren't sorted"));
	}
	if ( contains_adjacent_match( arg_results, [] (const score_classn_value_vec &x, const score_classn_value_vec &y) { return x.size() != y.size(); } ) ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("Not all results to be written to an SVM data file are of equal size"));
	}
	if ( contains_if( arg_results, [&] (const score_classn_value_vec &x) { return x.size() <= arg_indices.back(); } ) ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("The results to be written to an SVM data file are not big enough for the specified indices"));
	}
	if ( arg_value_list_scalings.size() != arg_results.size() ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("The number of scalings doesn't match the number of score_classn_value_vecs when attempting to write SVM data "));
	}
//	if ( ! results.empty() ) {
//		const auto first_results_size = results.front().size();
//		if ( ! all_of( results, [] (const score_classn_value_vec &x) { return x.size() == first_results_size; } ) ) {
//			BOOST_THROW_EXCEPTION(invalid_argument_exception("Not all results to be written to an SVM data file are of equal size"));
//		}
//	}
//	if ( ! arg_indices.empty() && ! results.empty() ) {
//		if ( arg_indices.back() >= arg_results.front().size() ) {
//			BOOST_THROW_EXCEPTION(invalid_argument_exception("The results to be written to an SVM data file are not big enough for the specified indices"));
//		}
//	}

	// Open an ostream for the file
	ofstream out_stream;
	open_ofstream( out_stream, arg_output_file );

	// Loop over the instances of the requested indices
	for (const size_t &index : arg_indices) {

		// Check for any mismatching entries wrt their instance label/is_positive value
		const bool mismatching_instances = contains_adjacent_match(
			arg_results,
			[&] (const score_classn_value_vec &x, const score_classn_value_vec &y) {
				return (    ( x[ index ].get_instance_is_positive() != y[ index ].get_instance_is_positive() )
						 || ( x[ index ].get_instance_label()       != y[ index ].get_instance_label()       ) );
			}
		);
		if ( mismatching_instances ) {
			BOOST_THROW_EXCEPTION(invalid_argument_exception("When attempting to write SVM data, detected mismatching entries wrt their instance label/is_positive value"));
		}

		// If there are arg_results, then output a line of data
		if ( ! arg_results.empty() ) {
			const auto &first_value = arg_results.front()[ index ];

			// First, output '+1 ' if this instance is positive and '-1 ' otherwise
			out_stream << ( first_value.get_instance_is_positive() ? string( "+1 " ) : string( "-1 " ) );

			// Next output the data in the format : '1:value_1 2:value_2 [...] n:value_n'
			out_stream << join(
				transform_build<str_vec>(
					irange( 0_z, arg_results.size() ),
					[&] (const size_t &x) {
						const auto &scaling      = arg_value_list_scalings[ x ];
						const auto  scaled_score = scale_value_copy( scaling, arg_results[ x ][ index ].get_score_value() );
						return lexical_cast<string>( x + 1 ) + ":" + lexical_cast<string>( scaled_score );
					}
				),
				" "
			);

			// Finally, append a comment containing the label of the instance
			out_stream << " # " << first_value.get_instance_label() << "\n";
		}
	}

	// Flush and close the output stream
	out_stream << flush;
	out_stream.close();
}
/// \brief Pair up the residue IDs parsed from a PDB file with those parsed from the corresponding DSSP/WOLF file
///
/// \pre Every valid (non-null) DSSP/WOLF residue should have an equivalent PDB residue and the
///      matching residues should appear in the same order in both lists.
///
/// A DSSP file may contain a NULL residue to mark a chain break or a residue it cannot represent properly.
/// A WOLF file may simply omit some residues.
///
/// The implementation makes a single pass over the DSSP/WOLF residues whilst keeping an index into
/// the PDB residues. Whenever the current PDB residue doesn't match, and there is a permitted reason
/// for that, the PDB index is moved forwards until a match is found.
///
/// An invalid_argument_exception is thrown if:
///  * the PDB residue IDs contain a null (default-constructed) residue_id or a duplicate,
///  * the DSSP/WOLF residue IDs contain two equal consecutive entries,
///  * a non-null DSSP/WOLF residue cannot be matched to a PDB residue or
///  * PDB residues are left over at the end and arg_permit_head_tail_break_without_null_residue is false.
///
/// \returns A list of pairs of equivalent indices (offset 0), each being (PDB index, DSSP/WOLF index)
size_size_pair_vec cath::file::tally_residue_ids(const residue_id_vec &arg_pdb_residue_ids,                             ///< The residue_ids parsed from the PDB file
                                                 const residue_id_vec &arg_dssp_or_wolf_residue_ids,                    ///< The residue_ids parsed from the DSSP/WOLF file (with a null residue represented with an empty string)
                                                 const bool           &arg_permit_breaks_without_null_residues,         ///< Whether PDB residues may be skipped anywhere without a null residue (true for WOLF files and false for DSSP files, at least >= v2.0)
                                                 const bool           &arg_permit_head_tail_break_without_null_residue, ///< Whether PDB residues may be skipped at the start/end without a null residue (true even for DSSP v2.0.4: file for chain A of 1bvs stops with neither residue 203 or null residue; verbose message: "ignoring incomplete residue ARG  (203)")
                                                 const size_set       &arg_skippable_pdb_indices                        ///< The indices of PDB residues that may always be skipped over when searching for a match to a DSSP/WOLF residue
                                                 ) {
	BOOST_LOG_TRIVIAL( trace ) << "Tallying PDB residue names: "
	                           << join( arg_pdb_residue_ids          | lexical_casted<string>(), "," )
	                           << " with DSSP/WOLF residue names: "
	                           << join( arg_dssp_or_wolf_residue_ids | lexical_casted<string>(), "," );

	// Validate the inputs before doing any work
	//
	/// \todo Add a check that arg_dssp_or_wolf_residue_ids has no duplicates other than null residues

	// The PDB residue IDs must not include a null residue_id...
	if ( contains( arg_pdb_residue_ids, residue_id{} ) ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("PDB residues should not contain empty residues names"));
	}
	// ...nor any repeated entries
	if ( ! is_uniq_for_unordered( arg_pdb_residue_ids ) ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("PDB residues should not contain duplicate entries " + join( arg_pdb_residue_ids | lexical_casted<string>(), "," ) ));
	}

	// The DSSP/WOLF residue IDs must not have two equal neighbours (two adjacent null residues included)
	if ( contains_adjacent_match( arg_dssp_or_wolf_residue_ids ) ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("DSSP residues should not contain duplicate consecutive entries (not even null residues)"));
	}

	const auto num_pdb_residues          = arg_pdb_residue_ids.size();
	const auto num_dssp_or_wolf_residues = arg_dssp_or_wolf_residue_ids.size();

	// The result can't hold more pairs than the shorter of the two lists
	size_size_pair_vec alignment;
	alignment.reserve( min( num_pdb_residues, num_dssp_or_wolf_residues ) );

	// Step through the DSSP/WOLF residues, with pdb_index tracking the current position in the PDB residues
	size_t pdb_index = 0;
	for (size_t dssp_index = 0; dssp_index < num_dssp_or_wolf_residues; ++dssp_index) {
		const residue_id &dssp_res_id      = arg_dssp_or_wolf_residue_ids[ dssp_index ];
		const bool        dssp_res_is_null = is_null( dssp_res_id );

		// Once the PDB residues are exhausted, only null DSSP/WOLF residues are acceptable
		if ( pdb_index >= num_pdb_residues ) {
			if ( ! dssp_res_is_null ) {
				BOOST_THROW_EXCEPTION(invalid_argument_exception("DSSP/WOLF residue " + to_string( dssp_res_id ) + " overshoots the end of the PDB residues"));
			}
			continue;
		}

		// The reasons for skipping a PDB residue that don't depend on which PDB residue it is.
		// The head-break part is deliberately fixed here, before any scanning moves pdb_index on
		// from 0, so that it continues to apply throughout the scan below
		const bool in_permitted_head_break   = ( arg_permit_head_tail_break_without_null_residue && pdb_index == 0 );
		const bool skipping_permitted_anyway = (
			dssp_res_is_null
			||
			arg_permit_breaks_without_null_residues
			||
			in_permitted_head_break
		);

		// Whether the PDB residue at the specified index both fails to match the specified target
		// and may legitimately be stepped over in the search for that target
		const auto may_skip_pdb_res_fn = [&] (const size_t     &arg_pdb_index,
		                                      const residue_id &arg_wanted_res_id
		                                      ) {
			const bool differs_from_wanted = ( arg_pdb_residue_ids[ arg_pdb_index ] != arg_wanted_res_id );
			const bool skip_is_excusable   = ( skipping_permitted_anyway || contains( arg_skippable_pdb_indices, arg_pdb_index ) );
			return ( differs_from_wanted && skip_is_excusable );
		};

		if ( may_skip_pdb_res_fn( pdb_index, dssp_res_id ) ) {
			const bool is_last_dssp_res = ( dssp_index + 1 >= num_dssp_or_wolf_residues );

			if ( dssp_res_is_null && is_last_dssp_res ) {
				// A trailing null residue: jump to the end of the PDB residues
				pdb_index = num_pdb_residues;
			}
			else {
				// The residue to look for in the PDB is:
				//  * the following DSSP/WOLF residue if this one is null or
				//  * this (mismatching) DSSP/WOLF residue otherwise
				const residue_id &wanted_res_id = arg_dssp_or_wolf_residue_ids[ dssp_res_is_null ? dssp_index + 1 : dssp_index ];

				// Move forwards through the PDB residues until reaching one that cannot be skipped
				for (; pdb_index < num_pdb_residues; ++pdb_index) {
					if ( ! may_skip_pdb_res_fn( pdb_index, wanted_res_id ) ) {
						break;
					}
				}

				// Running off the end of the PDB residues means there was nothing to match
				if ( pdb_index >= num_pdb_residues ) {
					BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot find a match for DSSP/WOLF residue " + to_string( wanted_res_id ) ));
				}
			}

			// A null DSSP/WOLF residue never contributes a pair, so go on to the next one
			if ( dssp_res_is_null ) {
				continue;
			}
		}

		// By now the two current residues have to agree; anything else is an error
		const residue_id &pdb_res_id = arg_pdb_residue_ids[ pdb_index ];
		if ( pdb_res_id != dssp_res_id ) {
			BOOST_THROW_EXCEPTION(invalid_argument_exception(
				"Cannot match PDB residue "
				+ to_string( pdb_res_id )
				+ " with DSSP/WOLF residue "
				+ to_string( dssp_res_id )
				+ " - it may be worth double-checking the DSSP/WOLF file is generated from"
				  " this version of the PDB file and, if you're using DSSP files, it may be"
				  " worth ensuring you're using an up-to-date DSSP binary"
			));
		}

		// Record the matched pair and move on to the next PDB residue
		alignment.emplace_back( pdb_index, dssp_index );
		++pdb_index;
	}

	// Unmatched PDB residues at the end are only acceptable if head/tail breaks are permitted
	const bool has_unmatched_pdb_tail = ( pdb_index < num_pdb_residues );
	if ( has_unmatched_pdb_tail && ! arg_permit_head_tail_break_without_null_residue ) {
		BOOST_THROW_EXCEPTION(invalid_argument_exception("PDB contains residues at the end that are not present at the end of the DSSP/WOLF"));
	}

	// Everything tallied, so hand back the pairs
	return alignment;
}
Example #25
0
/// \brief Build a protein by merging a pdb with the dssp_file that represents the same structure
///
/// The residue IDs of the two sources are paired up with tally_residue_ids(). Each PDB residue
/// that has a DSSP partner is merged with it via combine_residues_from_dssp_and_pdb(); each PDB
/// residue without a partner is kept as it is if the policy's residue-skipping is DONT_SKIP and
/// is dropped otherwise.
///
/// \relates dssp_file
///
/// \todo The earlier note here suggested taking an ostream_ref_opt rather than assuming cerr and
///       then, once all errors are fixed, giving it a default of boost::none
///
/// \returns A protein with the name arg_name, containing the resulting residues in PDB order
protein cath::file::protein_from_dssp_and_pdb(const dssp_file        &arg_dssp_file,        ///< The dssp_file object for a given structure
                                              const pdb              &arg_pdb_file,         ///< The pdb object for the same structure
                                              const dssp_skip_policy &arg_dssp_skip_policy, ///< Whether to exclude residues that are in the PDB but not the DSSP
                                              const string           &arg_name,             ///< The name to set as the title of the protein
                                              const ostream_ref_opt  &arg_ostream           ///< An optional reference to an ostream to which any logging should be sent
                                              ) {
	const bool is_skip_break_angles = ( arg_dssp_skip_policy == dssp_skip_policy::SKIP__BREAK_ANGLES );

	// Make a first-pass protein from the pdb alone. For this step only, SKIP__BREAK_ANGLES is
	// swapped for DONT_SKIP__BREAK_ANGLES; every other policy is passed through untouched
	const auto pdb_protein           = build_protein_of_pdb(
		arg_pdb_file,
		arg_ostream,
		is_skip_break_angles ? dssp_skip_policy::DONT_SKIP__BREAK_ANGLES : arg_dssp_skip_policy
	);
	const auto skippable_pdb_indices = get_protein_res_indices_that_dssp_might_skip( arg_pdb_file, arg_ostream );

	// Pair up the PDB and DSSP residue IDs. The two flags are, in order, tally_residue_ids()'s
	// arg_permit_breaks_without_null_residues and arg_permit_head_tail_break_without_null_residue
	const auto pdb_residue_ids       = get_residue_ids( pdb_protein );
	const auto dssp_residue_ids      = get_residue_ids( arg_dssp_file, false );
	const auto alignment             = tally_residue_ids(
		pdb_residue_ids,
		dssp_residue_ids,
		false,
		true,
		skippable_pdb_indices
	);

	// Reserve space for the output using the DSSP count under SKIP__BREAK_ANGLES and the PDB count otherwise
	const auto num_dssp_residues = arg_dssp_file.get_num_residues();
	const auto num_pdb_residues  = pdb_protein.get_length();
	residue_vec new_residues;
	new_residues.reserve( is_skip_break_angles ? num_dssp_residues : num_pdb_residues );

	// Walk the PDB residues in order; next_pair_index is the position of the next unused alignment pair
	size_t next_pair_index = 0;
	for (size_t pdb_index = 0; pdb_index < num_pdb_residues; ++pdb_index) {
		const residue &pdb_res = pdb_protein.get_residue_ref_of_index( pdb_index );

		// This PDB residue has a DSSP partner iff the next unused pair refers to it
		const bool has_dssp_partner = (
			next_pair_index < alignment.size()
			&&
			alignment[ next_pair_index ].first == pdb_index
		);

		// With no partner, keep the plain PDB residue only if the policy says not to skip residues
		if ( ! has_dssp_partner ) {
			if ( res_skipping_of_dssp_skip_policy( arg_dssp_skip_policy ) == dssp_skip_res_skipping::DONT_SKIP ) {
				new_residues.push_back( pdb_res );
			}
			continue;
		}

		// Otherwise merge the partner DSSP residue with this PDB residue and use up the pair
		const residue &dssp_res = arg_dssp_file.get_residue_of_index( alignment[ next_pair_index ].second );
		new_residues.push_back(
			combine_residues_from_dssp_and_pdb(
				dssp_res,
				pdb_res,
				angle_skipping_of_dssp_skip_policy( arg_dssp_skip_policy )
			)
		);
		++next_pair_index;
	}

	// Make the final protein from the name and the assembled residues
	return { arg_name, new_residues };
}