void MapAlignmentAlgorithmIdentification::computeTransformations_( vector<SeqToList>& rt_data, vector<TransformationDescription>& transforms, bool sorted) { Int size = rt_data.size(); // not Size because we compare to Ints later transforms.clear(); // filter RT data (remove peptides that elute in several fractions): // TODO // compute RT medians: LOG_DEBUG << "Computing RT medians..." << endl; vector<SeqToValue> medians_per_run(size); for (Int i = 0; i < size; ++i) { computeMedians_(rt_data[i], medians_per_run[i], sorted); } SeqToList medians_per_seq; for (vector<SeqToValue>::iterator run_it = medians_per_run.begin(); run_it != medians_per_run.end(); ++run_it) { for (SeqToValue::iterator med_it = run_it->begin(); med_it != run_it->end(); ++med_it) { medians_per_seq[med_it->first].push_back(med_it->second); } } // get reference retention time scale: either directly from reference file, // or compute consensus time scale bool reference_given = !reference_.empty(); // reference file given if (reference_given) { // remove peptides that don't occur in enough runs: LOG_DEBUG << "Removing peptides that occur in too few runs..." << endl; SeqToValue temp; for (SeqToValue::iterator ref_it = reference_.begin(); ref_it != reference_.end(); ++ref_it) { SeqToList::iterator med_it = medians_per_seq.find(ref_it->first); if ((med_it != medians_per_seq.end()) && (med_it->second.size() + 1 >= min_run_occur_)) { temp.insert(temp.end(), *ref_it); // new items should go at the end } } LOG_DEBUG << "Removed " << reference_.size() - temp.size() << " of " << reference_.size() << " peptides." << endl; temp.swap(reference_); } else // compute overall RT median per sequence (median of medians per run) { LOG_DEBUG << "Computing overall RT medians per sequence..." << endl; // remove peptides that don't occur in enough runs (at least two): LOG_DEBUG << "Removing peptides that occur in too few runs..." << endl; SeqToList temp; for (SeqToList::iterator med_it = medians_per_seq.begin(); med_it != medians_per_seq.end(); ++med_it) { if (med_it->second.size() >= min_run_occur_) { temp.insert(temp.end(), *med_it); } } LOG_DEBUG << "Removed " << medians_per_seq.size() - temp.size() << " of " << medians_per_seq.size() << " peptides." << endl; temp.swap(medians_per_seq); computeMedians_(medians_per_seq, reference_); } if (reference_.empty()) { throw Exception::MissingInformation(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No reference RT information left after filtering"); } double max_rt_shift = param_.getValue("max_rt_shift"); if (max_rt_shift <= 1) { // compute max. allowed shift from overall retention time range: double rt_min = numeric_limits<double>::infinity(), rt_max = -rt_min; for (SeqToValue::iterator it = reference_.begin(); it != reference_.end(); ++it) { rt_min = min(rt_min, it->second); rt_max = max(rt_max, it->second); } double rt_range = rt_max - rt_min; max_rt_shift *= rt_range; // in the degenerate case of only one reference point, "max_rt_shift" // should be zero (because "rt_range" is zero) - this is covered below } if (max_rt_shift == 0) { max_rt_shift = numeric_limits<double>::max(); } LOG_DEBUG << "Max. allowed RT shift (in seconds): " << max_rt_shift << endl; // generate RT transformations: LOG_DEBUG << "Generating RT transformations..." << endl; LOG_INFO << "\nAlignment based on:" << endl; // diagnostic output Size offset = 0; // offset in case of internal reference for (Int i = 0; i < size + 1; ++i) { if (i == reference_index_) { // if one of the input maps was used as reference, it has been skipped // so far - now we have to consider it again: TransformationDescription trafo; trafo.fitModel("identity"); transforms.push_back(trafo); LOG_INFO << "- " << reference_.size() << " data points for sample " << i + 1 << " (reference)\n"; offset = 1; } if (i >= size) break; // to be useful for the alignment, a peptide sequence has to occur in the // current run ("medians_per_run[i]"), but also in at least one other run // ("medians_overall"): TransformationDescription::DataPoints data; Size n_outliers = 0; for (SeqToValue::iterator med_it = medians_per_run[i].begin(); med_it != medians_per_run[i].end(); ++med_it) { SeqToValue::const_iterator pos = reference_.find(med_it->first); if (pos != reference_.end()) { if (abs(med_it->second - pos->second) <= max_rt_shift) { // found, and satisfies "max_rt_shift" condition! TransformationDescription::DataPoint point(med_it->second, pos->second, pos->first); data.push_back(point); } else { n_outliers++; } } } transforms.push_back(TransformationDescription(data)); LOG_INFO << "- " << data.size() << " data points for sample " << i + offset + 1; if (n_outliers) LOG_INFO << " (" << n_outliers << " outliers removed)"; LOG_INFO << "\n"; } LOG_INFO << endl; // delete temporary reference if (!reference_given) reference_.clear(); }
void MapAlignmentAlgorithmIdentification::computeTransformations_( vector<SeqToList> & rt_data, vector<TransformationDescription> & transforms, bool sorted) { Size size = rt_data.size(); transforms.clear(); // filter RT data (remove peptides that elute in several fractions): // TODO // compute RT medians: LOG_DEBUG << "Computing RT medians..." << endl; vector<SeqToValue> medians_per_run(size); for (Size i = 0; i < size; ++i) { computeMedians_(rt_data[i], medians_per_run[i], sorted); } SeqToList medians_per_seq; for (vector<SeqToValue>::iterator run_it = medians_per_run.begin(); run_it != medians_per_run.end(); ++run_it) { for (SeqToValue::iterator med_it = run_it->begin(); med_it != run_it->end(); ++med_it) { medians_per_seq[med_it->first] << med_it->second; } } // get reference retention time scale: either directly from reference file, // or compute consensus time scale bool reference_given = !reference_.empty(); // reference file given if (reference_given) { // remove peptides that don't occur in enough runs: LOG_DEBUG << "Removing peptides that occur in too few runs..." << endl; SeqToValue temp; SeqToValue::iterator pos = temp.begin(); // to prevent segfault below for (SeqToValue::iterator ref_it = reference_.begin(); ref_it != reference_.end(); ++ref_it) { SeqToList::iterator med_it = medians_per_seq.find(ref_it->first); if ((med_it != medians_per_seq.end()) && (med_it->second.size() + 1 >= min_run_occur_)) { temp.insert(pos, *ref_it); pos = --temp.end(); // would cause segfault if "temp" was empty } } temp.swap(reference_); } else // compute overall RT median per sequence (median of medians per run) { LOG_DEBUG << "Computing overall RT medians per sequence..." << endl; // remove peptides that don't occur in enough runs (at least two): LOG_DEBUG << "Removing peptides that occur in too few runs..." << endl; SeqToList temp; SeqToList::iterator pos = temp.begin(); // to prevent segfault below for (SeqToList::iterator med_it = medians_per_seq.begin(); med_it != medians_per_seq.end(); ++med_it) { if (med_it->second.size() >= min_run_occur_) { temp.insert(pos, *med_it); pos = --temp.end(); // would cause segfault if "temp" was empty } } temp.swap(medians_per_seq); computeMedians_(medians_per_seq, reference_); } DoubleReal max_rt_shift = param_.getValue("max_rt_shift"); if (max_rt_shift == 0) { max_rt_shift = numeric_limits<DoubleReal>::max(); } else if (max_rt_shift <= 1) // compute max. allowed shift from overall retention time range: { DoubleReal rt_range, rt_min = reference_.begin()->second, rt_max = rt_min; for (SeqToValue::iterator it = ++reference_.begin(); it != reference_.end(); ++it) { rt_min = min(rt_min, it->second); rt_max = max(rt_max, it->second); } rt_range = rt_max - rt_min; max_rt_shift *= rt_range; } LOG_DEBUG << "Max. allowed RT shift (in seconds): " << max_rt_shift << endl; // generate RT transformations: LOG_DEBUG << "Generating RT transformations..." << endl; LOG_INFO << "\nAlignment based on:" << endl; // diagnostic output for (Size i = 0, offset = 0; i < size + 1; ++i) { if (i == reference_index_ - 1) { // if one of the input maps was used as reference, it has been skipped // so far - now we have to consider it again: TransformationDescription trafo; trafo.fitModel("identity"); transforms.push_back(trafo); LOG_INFO << "- 0 data points for sample " << i + 1 << " (reference)\n"; offset = 1; } if (i >= size) break; // to be useful for the alignment, a peptide sequence has to occur in the // current run ("medians_per_run[i]"), but also in at least one other run // ("medians_overall"): TransformationDescription::DataPoints data; for (SeqToValue::iterator med_it = medians_per_run[i].begin(); med_it != medians_per_run[i].end(); ++med_it) { SeqToValue::const_iterator pos = reference_.find(med_it->first); if ((pos != reference_.end()) && (fabs(med_it->second - pos->second) <= max_rt_shift)) { // found, and satisfies "max_rt_shift" condition! data.push_back(make_pair(med_it->second, pos->second)); } } transforms.push_back(TransformationDescription(data)); LOG_INFO << "- " << data.size() << " data points for sample " << i + offset + 1 << "\n"; } LOG_INFO << endl; // delete temporary reference if (!reference_given) reference_.clear(); }