/**
  @brief Moves the retention times of @p map behind the data processed so far.

  The first file is left untouched; it only initialises the running offset
  (its maximum RT plus the configured gap). Every later file is mapped
  linearly so that its RT range starts at the current offset, after which the
  offset is advanced by the width of the range plus the gap.

  @param map        Map whose retention times are adjusted in place
  @param trafo_out  If non-empty, the applied transformation is stored here
  @param first_file True for the first map of the series
*/
void adjustRetentionTimes_(MapType& map, const String& trafo_out, bool first_file)
{
  map.updateRanges();

  TransformationDescription trafo;
  if (!first_file)
  {
    // subsequent file -> shift its RT range to start at the running offset
    const double rt_begin = map.getMin()[0];
    const double rt_end = map.getMax()[0];

    TransformationDescription::DataPoints anchors(2);
    anchors[0] = make_pair(rt_begin, rt_offset_);
    rt_offset_ += rt_end - rt_begin; // offset now marks the end of this map
    anchors[1] = make_pair(rt_end, rt_offset_);

    trafo.setDataPoints(anchors);
    trafo.fitModel("linear");
    MapAlignmentTransformer::transformRetentionTimes(map, trafo, true);

    // leave a gap before the next map
    rt_offset_ += rt_gap_;
  }
  else
  {
    // first file -> nothing to transform, just remember where it ends
    rt_offset_ = map.getMax()[0] + rt_gap_;
    trafo.fitModel("identity");
  }

  if (!trafo_out.empty())
  {
    TransformationXMLFile().store(trafo_out, trafo);
  }
}
void MapAlignmentAlgorithmPoseClustering::align(const ConsensusMap & map, TransformationDescription & trafo) { // TODO: move this to updateMembers_? (if consensusMap prevails) // TODO: why does superimposer work on consensus map??? const ConsensusMap & map_model = reference_; ConsensusMap map_scene = map; // run superimposer to find the global transformation TransformationDescription si_trafo; superimposer_.run(map_model, map_scene, si_trafo); // apply transformation to consensus features and contained feature // handles for (Size j = 0; j < map_scene.size(); ++j) { //Calculate new RT double rt = map_scene[j].getRT(); rt = si_trafo.apply(rt); //Set RT of consensus feature centroid map_scene[j].setRT(rt); //Set RT of consensus feature handles map_scene[j].begin()->asMutable().setRT(rt); } //run pairfinder to find pairs ConsensusMap result; //TODO: add another 2map interface to pairfinder? std::vector<ConsensusMap> input(2); input[0] = map_model; input[1] = map_scene; pairfinder_.run(input, result); // calculate the local transformation si_trafo.invert(); // to undo the transformation applied above TransformationDescription::DataPoints data; for (ConsensusMap::Iterator it = result.begin(); it != result.end(); ++it) { if (it->size() == 2) // two matching features { ConsensusFeature::iterator feat_it = it->begin(); double y = feat_it->getRT(); double x = si_trafo.apply((++feat_it)->getRT()); // one feature should be from the reference map: if (feat_it->getMapIndex() != 0) { data.push_back(make_pair(x, y)); } else { data.push_back(make_pair(y, x)); } } } trafo = TransformationDescription(data); trafo.fitModel("linear"); }
void MapAlignmentAlgorithmIdentification::computeTransformations_( vector<SeqToList>& rt_data, vector<TransformationDescription>& transforms, bool sorted) { Int size = rt_data.size(); // not Size because we compare to Ints later transforms.clear(); // filter RT data (remove peptides that elute in several fractions): // TODO // compute RT medians: LOG_DEBUG << "Computing RT medians..." << endl; vector<SeqToValue> medians_per_run(size); for (Int i = 0; i < size; ++i) { computeMedians_(rt_data[i], medians_per_run[i], sorted); } SeqToList medians_per_seq; for (vector<SeqToValue>::iterator run_it = medians_per_run.begin(); run_it != medians_per_run.end(); ++run_it) { for (SeqToValue::iterator med_it = run_it->begin(); med_it != run_it->end(); ++med_it) { medians_per_seq[med_it->first].push_back(med_it->second); } } // get reference retention time scale: either directly from reference file, // or compute consensus time scale bool reference_given = !reference_.empty(); // reference file given if (reference_given) { // remove peptides that don't occur in enough runs: LOG_DEBUG << "Removing peptides that occur in too few runs..." << endl; SeqToValue temp; for (SeqToValue::iterator ref_it = reference_.begin(); ref_it != reference_.end(); ++ref_it) { SeqToList::iterator med_it = medians_per_seq.find(ref_it->first); if ((med_it != medians_per_seq.end()) && (med_it->second.size() + 1 >= min_run_occur_)) { temp.insert(temp.end(), *ref_it); // new items should go at the end } } LOG_DEBUG << "Removed " << reference_.size() - temp.size() << " of " << reference_.size() << " peptides." << endl; temp.swap(reference_); } else // compute overall RT median per sequence (median of medians per run) { LOG_DEBUG << "Computing overall RT medians per sequence..." << endl; // remove peptides that don't occur in enough runs (at least two): LOG_DEBUG << "Removing peptides that occur in too few runs..." 
<< endl; SeqToList temp; for (SeqToList::iterator med_it = medians_per_seq.begin(); med_it != medians_per_seq.end(); ++med_it) { if (med_it->second.size() >= min_run_occur_) { temp.insert(temp.end(), *med_it); } } LOG_DEBUG << "Removed " << medians_per_seq.size() - temp.size() << " of " << medians_per_seq.size() << " peptides." << endl; temp.swap(medians_per_seq); computeMedians_(medians_per_seq, reference_); } if (reference_.empty()) { throw Exception::MissingInformation(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No reference RT information left after filtering"); } double max_rt_shift = param_.getValue("max_rt_shift"); if (max_rt_shift <= 1) { // compute max. allowed shift from overall retention time range: double rt_min = numeric_limits<double>::infinity(), rt_max = -rt_min; for (SeqToValue::iterator it = reference_.begin(); it != reference_.end(); ++it) { rt_min = min(rt_min, it->second); rt_max = max(rt_max, it->second); } double rt_range = rt_max - rt_min; max_rt_shift *= rt_range; // in the degenerate case of only one reference point, "max_rt_shift" // should be zero (because "rt_range" is zero) - this is covered below } if (max_rt_shift == 0) { max_rt_shift = numeric_limits<double>::max(); } LOG_DEBUG << "Max. allowed RT shift (in seconds): " << max_rt_shift << endl; // generate RT transformations: LOG_DEBUG << "Generating RT transformations..." 
<< endl; LOG_INFO << "\nAlignment based on:" << endl; // diagnostic output Size offset = 0; // offset in case of internal reference for (Int i = 0; i < size + 1; ++i) { if (i == reference_index_) { // if one of the input maps was used as reference, it has been skipped // so far - now we have to consider it again: TransformationDescription trafo; trafo.fitModel("identity"); transforms.push_back(trafo); LOG_INFO << "- " << reference_.size() << " data points for sample " << i + 1 << " (reference)\n"; offset = 1; } if (i >= size) break; // to be useful for the alignment, a peptide sequence has to occur in the // current run ("medians_per_run[i]"), but also in at least one other run // ("medians_overall"): TransformationDescription::DataPoints data; Size n_outliers = 0; for (SeqToValue::iterator med_it = medians_per_run[i].begin(); med_it != medians_per_run[i].end(); ++med_it) { SeqToValue::const_iterator pos = reference_.find(med_it->first); if (pos != reference_.end()) { if (abs(med_it->second - pos->second) <= max_rt_shift) { // found, and satisfies "max_rt_shift" condition! TransformationDescription::DataPoint point(med_it->second, pos->second, pos->first); data.push_back(point); } else { n_outliers++; } } } transforms.push_back(TransformationDescription(data)); LOG_INFO << "- " << data.size() << " data points for sample " << i + offset + 1; if (n_outliers) LOG_INFO << " (" << n_outliers << " outliers removed)"; LOG_INFO << "\n"; } LOG_INFO << endl; // delete temporary reference if (!reference_given) reference_.clear(); }
/**
  @brief Tool entry point: loads a transformation, optionally refits and/or
         inverts it, stores it and/or applies it to an input data file.

  @return ILLEGAL_PARAMETERS if neither 'out' nor 'trafo_out' is given or if
          'in' and 'out' are not used together; EXECUTION_OK otherwise
*/
ExitCodes main_(int, const char**)
{
  //-------------------------------------------------------------
  // parameter handling
  //-------------------------------------------------------------
  String in = getStringOption_("in");
  String out = getStringOption_("out");
  String trafo_in = getStringOption_("trafo_in");
  String trafo_out = getStringOption_("trafo_out");

  // parameters of the selected model live in the sub-section named after it
  Param model_params = getParam_().copy("model:", true);
  String model_type = model_params.getValue("type");
  model_params = model_params.copy(model_type + ":", true);

  ProgressLogger progresslogger;
  progresslogger.setLogType(log_type_);

  //-------------------------------------------------------------
  // check for valid input
  //-------------------------------------------------------------
  const bool have_in = !in.empty();
  const bool have_out = !out.empty();
  if (!have_out && trafo_out.empty())
  {
    writeLog_("Error: Either a data or a transformation output file has to be provided (parameters 'out'/'trafo_out')");
    return ILLEGAL_PARAMETERS;
  }
  if (have_in != have_out)
  {
    writeLog_("Error: Data input and output parameters ('in'/'out') must be used together");
    return ILLEGAL_PARAMETERS;
  }

  //-------------------------------------------------------------
  // apply transformation
  //-------------------------------------------------------------
  TransformationXMLFile trafoxml;
  TransformationDescription trafo;
  trafoxml.load(trafo_in, trafo);

  if (model_type != "none")
  {
    trafo.fitModel(model_type, model_params);
  }
  if (getFlag_("invert"))
  {
    trafo.invert();
  }
  if (!trafo_out.empty())
  {
    trafoxml.store(trafo_out, trafo);
  }

  // no data file to transform -> done
  if (!have_in)
  {
    return EXECUTION_OK;
  }

  // load input, transform and store, depending on the file type
  FileTypes::Type in_type = FileHandler::getType(in);
  switch (in_type)
  {
    case FileTypes::MZML:
    {
      MzMLFile file;
      MSExperiment<> map;
      applyTransformation_(in, out, trafo, file, map);
      break;
    }

    case FileTypes::FEATUREXML:
    {
      FeatureXMLFile file;
      FeatureMap map;
      applyTransformation_(in, out, trafo, file, map);
      break;
    }

    case FileTypes::CONSENSUSXML:
    {
      ConsensusXMLFile file;
      ConsensusMap map;
      applyTransformation_(in, out, trafo, file, map);
      break;
    }

    case FileTypes::IDXML:
    {
      IdXMLFile file;
      vector<ProteinIdentification> proteins;
      vector<PeptideIdentification> peptides;
      file.load(in, proteins, peptides);
      bool store_original_rt = getFlag_("store_original_rt");
      MapAlignmentTransformer::transformRetentionTimes(peptides, trafo,
                                                       store_original_rt);
      // no "data processing" section in idXML
      file.store(out, proteins, peptides);
      break;
    }

    default: // other file types are not handled here
      break;
  }

  return EXECUTION_OK;
}
/**
  @brief Tool entry point: aligns all input maps to a reference map using the
         pose clustering algorithm.

  The reference is either an external file, the input file selected by the
  1-based 'reference:index' option, or (index 0) the input with the largest
  size. Each input is then aligned (in parallel if OpenMP is enabled); the
  reference itself receives an identity transformation. Transformed maps
  and/or transformations are written if the corresponding outputs are set.

  @return The result of checkParameters_() if it fails, else EXECUTION_OK
*/
ExitCodes main_(int, const char**) override
{
  ExitCodes ret = TOPPMapAlignerBase::checkParameters_();
  if (ret != EXECUTION_OK) return ret;

  MapAlignmentAlgorithmPoseClustering algorithm;
  Param algo_params = getParam_().copy("algorithm:", true);
  algorithm.setParameters(algo_params);
  algorithm.setLogType(log_type_);

  StringList in_files = getStringList_("in");
  StringList out_files = getStringList_("out");
  StringList out_trafos = getStringList_("trafo_out");

  Size reference_index = getIntOption_("reference:index");
  String reference_file = getStringOption_("reference:file");

  FileTypes::Type in_type = FileHandler::getType(in_files[0]);

  //-------------------------------------------------------------
  // determine the reference
  //-------------------------------------------------------------
  String ref_path;
  if (!reference_file.empty())
  {
    ref_path = reference_file;
    reference_index = in_files.size(); // points to invalid index
  }
  else if (reference_index > 0) // normal reference (index was checked before)
  {
    // ref. index is 1-based in parameters, but should be 0-based here
    --reference_index;
    ref_path = in_files[reference_index];
  }
  else // no reference given
  {
    LOG_INFO << "Picking a reference (by size) ..." << std::flush;
    // use map with highest number of features as reference:
    Size best_count(0);
    FeatureXMLFile size_reader;
    for (Size idx = 0; idx < in_files.size(); ++idx)
    {
      Size count = 0;
      if (in_type == FileTypes::FEATUREXML)
      {
        count = size_reader.loadSize(in_files[idx]);
      }
      else if (in_type == FileTypes::MZML) // this is expensive!
      {
        PeakMap exp;
        MzMLFile().load(in_files[idx], exp);
        exp.updateRanges(1);
        count = exp.getSize();
      }

      if (count > best_count)
      {
        best_count = count;
        reference_index = idx;
      }
    }
    LOG_INFO << " done" << std::endl;
    ref_path = in_files[reference_index];
  }

  const bool write_maps = !out_files.empty();

  // template for the per-thread featureXML readers used below
  FeatureXMLFile f_fxml;
  if (!write_maps) // no need to store featureXML, thus we can load only minimum required information
  {
    f_fxml.getOptions().setLoadConvexHull(false);
    f_fxml.getOptions().setLoadSubordinates(false);
  }

  //-------------------------------------------------------------
  // load the reference
  //-------------------------------------------------------------
  if (in_type == FileTypes::FEATUREXML)
  {
    // for the reference, we never need CH or subordinates
    FeatureXMLFile ref_reader;
    ref_reader.getOptions().setLoadConvexHull(false);
    ref_reader.getOptions().setLoadSubordinates(false);
    FeatureMap map_ref;
    ref_reader.load(ref_path, map_ref);
    algorithm.setReference(map_ref);
  }
  else if (in_type == FileTypes::MZML)
  {
    PeakMap map_ref;
    MzMLFile().load(ref_path, map_ref);
    algorithm.setReference(map_ref);
  }

  ProgressLogger plog;
  plog.setLogType(log_type_);

  plog.startProgress(0, in_files.size(), "Aligning input maps");
  Size progress(0); // only modified inside the critical section below
  const int n_files = static_cast<int>(in_files.size());
  // TODO: it should all work on featureXML files, since we might need them for output anyway. Converting to consensusXML is just wasting memory!
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic, 1)
#endif
  for (int i = 0; i < n_files; ++i)
  {
    const bool is_reference = (i == static_cast<int>(reference_index));
    TransformationDescription trafo;

    if (in_type == FileTypes::FEATUREXML)
    {
      // workaround for loading: use temporary FeatureXMLFile since it is not thread-safe
      // (do not use OMP-firstprivate, since FeatureXMLFile has no copy c'tor)
      FeatureXMLFile f_fxml_tmp;
      f_fxml_tmp.getOptions() = f_fxml.getOptions();

      FeatureMap map;
      f_fxml_tmp.load(in_files[i], map);

      if (is_reference)
      {
        trafo.fitModel("identity");
      }
      else
      {
        algorithm.align(map, trafo);
      }

      if (write_maps)
      {
        MapAlignmentTransformer::transformRetentionTimes(map, trafo);
        // annotate output with data processing info
        addDataProcessing_(map, getProcessingInfo_(DataProcessing::ALIGNMENT));
        f_fxml_tmp.store(out_files[i], map);
      }
    }
    else if (in_type == FileTypes::MZML)
    {
      PeakMap map;
      MzMLFile().load(in_files[i], map);

      if (is_reference)
      {
        trafo.fitModel("identity");
      }
      else
      {
        algorithm.align(map, trafo);
      }

      if (write_maps)
      {
        MapAlignmentTransformer::transformRetentionTimes(map, trafo);
        // annotate output with data processing info
        addDataProcessing_(map, getProcessingInfo_(DataProcessing::ALIGNMENT));
        MzMLFile().store(out_files[i], map);
      }
    }

    if (!out_trafos.empty())
    {
      TransformationXMLFile().store(out_trafos[i], trafo);
    }

#ifdef _OPENMP
#pragma omp critical (MAPose_Progress)
#endif
    {
      plog.setProgress(++progress); // thread safe progress counter
    }
  }

  plog.endProgress();
  return EXECUTION_OK;
}
ExitCodes main_(int, const char **) { StringList file_list = getStringList_("in"); String tr_file_str = getStringOption_("tr"); String out = getStringOption_("out"); bool is_swath = getFlag_("is_swath"); bool ppm = getFlag_("ppm"); bool extract_MS1 = getFlag_("extract_MS1"); double min_upper_edge_dist = getDoubleOption_("min_upper_edge_dist"); double mz_extraction_window = getDoubleOption_("mz_window"); double rt_extraction_window = getDoubleOption_("rt_window"); String extraction_function = getStringOption_("extraction_function"); // If we have a transformation file, trafo will transform the RT in the // scoring according to the model. If we dont have one, it will apply the // null transformation. String trafo_in = getStringOption_("rt_norm"); TransformationDescription trafo; if (trafo_in.size() > 0) { TransformationXMLFile trafoxml; String model_type = getStringOption_("model:type"); Param model_params = getParam_().copy("model:", true); trafoxml.load(trafo_in, trafo); trafo.fitModel(model_type, model_params); } TransformationDescription trafo_inverse = trafo; trafo_inverse.invert(); const char * tr_file = tr_file_str.c_str(); MapType out_exp; std::vector< OpenMS::MSChromatogram > chromatograms; TraMLFile traml; OpenMS::TargetedExperiment targeted_exp; std::cout << "Loading TraML file" << std::endl; traml.load(tr_file, targeted_exp); std::cout << "Loaded TraML file" << std::endl; // Do parallelization over the different input files // Only in OpenMP 3.0 are unsigned loop variables allowed #ifdef _OPENMP #pragma omp parallel for #endif for (SignedSize i = 0; i < boost::numeric_cast<SignedSize>(file_list.size()); ++i) { boost::shared_ptr<PeakMap > exp(new PeakMap); MzMLFile f; // Logging and output to the console // IF_MASTERTHREAD f.setLogType(log_type_); // Find the transitions to extract and extract them MapType tmp_out; OpenMS::TargetedExperiment transition_exp_used; f.load(file_list[i], *exp); if (exp->empty() ) { continue; } // if empty, go on 
OpenSwath::SpectrumAccessPtr expptr = SimpleOpenMSSpectraFactory::getSpectrumAccessOpenMSPtr(exp); bool do_continue = true; if (is_swath) { do_continue = OpenSwathHelper::checkSwathMapAndSelectTransitions(*exp, targeted_exp, transition_exp_used, min_upper_edge_dist); } else { transition_exp_used = targeted_exp; } #ifdef _OPENMP #pragma omp critical (OpenSwathChromatogramExtractor_metadata) #endif // after loading the first file, copy the meta data from that experiment // this may happen *after* chromatograms were already added to the // output, thus we do NOT fill the experiment here but rather store all // the chromatograms in the "chromatograms" array and store them in // out_exp afterwards. if (i == 0) { out_exp = *exp; out_exp.clear(false); } std::cout << "Extracting " << transition_exp_used.getTransitions().size() << " transitions" << std::endl; std::vector< OpenSwath::ChromatogramPtr > chromatogram_ptrs; std::vector< ChromatogramExtractor::ExtractionCoordinates > coordinates; // continue if the map is not empty if (do_continue) { // Prepare the coordinates (with or without rt extraction) and then extract the chromatograms ChromatogramExtractor extractor; if (rt_extraction_window < 0) { extractor.prepare_coordinates(chromatogram_ptrs, coordinates, transition_exp_used, rt_extraction_window, extract_MS1); } else { // Use an rt extraction window of 0.0 which will just write the retention time in start / end positions extractor.prepare_coordinates(chromatogram_ptrs, coordinates, transition_exp_used, 0.0, extract_MS1); for (std::vector< ChromatogramExtractor::ExtractionCoordinates >::iterator it = coordinates.begin(); it != coordinates.end(); ++it) { it->rt_start = trafo_inverse.apply(it->rt_start) - rt_extraction_window / 2.0; it->rt_end = trafo_inverse.apply(it->rt_end) + rt_extraction_window / 2.0; } } extractor.extractChromatograms(expptr, chromatogram_ptrs, coordinates, mz_extraction_window, ppm, extraction_function); #ifdef _OPENMP #pragma omp critical 
(OpenSwathChromatogramExtractor_insertMS1) #endif { // Remove potential meta value indicating cached data SpectrumSettings exp_settings = (*exp)[0]; for (Size j = 0; j < exp_settings.getDataProcessing().size(); j++) { if (exp_settings.getDataProcessing()[j]->metaValueExists("cached_data")) { exp_settings.getDataProcessing()[j]->removeMetaValue("cached_data"); } } extractor.return_chromatogram(chromatogram_ptrs, coordinates, transition_exp_used, exp_settings, chromatograms, extract_MS1); } } // end of do_continue } // end of loop over all files / end of OpenMP // TODO check that no chromatogram IDs occur multiple times ! // store the output out_exp.setChromatograms(chromatograms); MzMLFile mzf; mzf.setLogType(log_type_); addDataProcessing_(out_exp, getProcessingInfo_(DataProcessing::SMOOTHING)); mzf.store(out, out_exp); return EXECUTION_OK; }
void MapAlignmentAlgorithmIdentification::computeTransformations_( vector<SeqToList> & rt_data, vector<TransformationDescription> & transforms, bool sorted) { Size size = rt_data.size(); transforms.clear(); // filter RT data (remove peptides that elute in several fractions): // TODO // compute RT medians: LOG_DEBUG << "Computing RT medians..." << endl; vector<SeqToValue> medians_per_run(size); for (Size i = 0; i < size; ++i) { computeMedians_(rt_data[i], medians_per_run[i], sorted); } SeqToList medians_per_seq; for (vector<SeqToValue>::iterator run_it = medians_per_run.begin(); run_it != medians_per_run.end(); ++run_it) { for (SeqToValue::iterator med_it = run_it->begin(); med_it != run_it->end(); ++med_it) { medians_per_seq[med_it->first] << med_it->second; } } // get reference retention time scale: either directly from reference file, // or compute consensus time scale bool reference_given = !reference_.empty(); // reference file given if (reference_given) { // remove peptides that don't occur in enough runs: LOG_DEBUG << "Removing peptides that occur in too few runs..." << endl; SeqToValue temp; SeqToValue::iterator pos = temp.begin(); // to prevent segfault below for (SeqToValue::iterator ref_it = reference_.begin(); ref_it != reference_.end(); ++ref_it) { SeqToList::iterator med_it = medians_per_seq.find(ref_it->first); if ((med_it != medians_per_seq.end()) && (med_it->second.size() + 1 >= min_run_occur_)) { temp.insert(pos, *ref_it); pos = --temp.end(); // would cause segfault if "temp" was empty } } temp.swap(reference_); } else // compute overall RT median per sequence (median of medians per run) { LOG_DEBUG << "Computing overall RT medians per sequence..." << endl; // remove peptides that don't occur in enough runs (at least two): LOG_DEBUG << "Removing peptides that occur in too few runs..." 
<< endl; SeqToList temp; SeqToList::iterator pos = temp.begin(); // to prevent segfault below for (SeqToList::iterator med_it = medians_per_seq.begin(); med_it != medians_per_seq.end(); ++med_it) { if (med_it->second.size() >= min_run_occur_) { temp.insert(pos, *med_it); pos = --temp.end(); // would cause segfault if "temp" was empty } } temp.swap(medians_per_seq); computeMedians_(medians_per_seq, reference_); } DoubleReal max_rt_shift = param_.getValue("max_rt_shift"); if (max_rt_shift == 0) { max_rt_shift = numeric_limits<DoubleReal>::max(); } else if (max_rt_shift <= 1) // compute max. allowed shift from overall retention time range: { DoubleReal rt_range, rt_min = reference_.begin()->second, rt_max = rt_min; for (SeqToValue::iterator it = ++reference_.begin(); it != reference_.end(); ++it) { rt_min = min(rt_min, it->second); rt_max = max(rt_max, it->second); } rt_range = rt_max - rt_min; max_rt_shift *= rt_range; } LOG_DEBUG << "Max. allowed RT shift (in seconds): " << max_rt_shift << endl; // generate RT transformations: LOG_DEBUG << "Generating RT transformations..." 
<< endl; LOG_INFO << "\nAlignment based on:" << endl; // diagnostic output for (Size i = 0, offset = 0; i < size + 1; ++i) { if (i == reference_index_ - 1) { // if one of the input maps was used as reference, it has been skipped // so far - now we have to consider it again: TransformationDescription trafo; trafo.fitModel("identity"); transforms.push_back(trafo); LOG_INFO << "- 0 data points for sample " << i + 1 << " (reference)\n"; offset = 1; } if (i >= size) break; // to be useful for the alignment, a peptide sequence has to occur in the // current run ("medians_per_run[i]"), but also in at least one other run // ("medians_overall"): TransformationDescription::DataPoints data; for (SeqToValue::iterator med_it = medians_per_run[i].begin(); med_it != medians_per_run[i].end(); ++med_it) { SeqToValue::const_iterator pos = reference_.find(med_it->first); if ((pos != reference_.end()) && (fabs(med_it->second - pos->second) <= max_rt_shift)) { // found, and satisfies "max_rt_shift" condition! data.push_back(make_pair(med_it->second, pos->second)); } } transforms.push_back(TransformationDescription(data)); LOG_INFO << "- " << data.size() << " data points for sample " << i + offset + 1 << "\n"; } LOG_INFO << endl; // delete temporary reference if (!reference_given) reference_.clear(); }
void PoseClusteringShiftSuperimposer::run(const ConsensusMap & map_model, const ConsensusMap & map_scene, TransformationDescription & transformation) { typedef ConstRefVector<ConsensusMap> PeakPointerArray_; typedef Math::LinearInterpolation<double, double> LinearInterpolationType_; LinearInterpolationType_ shift_hash_; // OLD STUFF // LinearInterpolationType_ scaling_hash_1; // LinearInterpolationType_ scaling_hash_2; // LinearInterpolationType_ shift_hash_; // LinearInterpolationType_ rt_high_hash_; /// Maximum deviation in mz of two partner points const double mz_pair_max_distance = param_.getValue("mz_pair_max_distance"); /// Size of each shift bucket const double shift_bucket_size = param_.getValue("shift_bucket_size"); const UInt struc_elem_length_datapoints = 21; // MAGIC ALERT: number of data points in structuring element for tophat filter, which removes baseline from histogram const double scaling_histogram_crossing_slope = 3.0; // MAGIC ALERT: used when distinguishing noise level and enriched histogram bins const double scaling_cutoff_stdev_multiplier = 1.5; // MAGIC ALERT: multiplier for stdev in cutoff for outliers const UInt loops_mean_stdev_cutoff = 3; // MAGIC ALERT: number of loops in stdev cutoff for outliers startProgress(0, 100, "shift pose clustering"); UInt actual_progress = 0; setProgress(++actual_progress); // Optionally, we will write dumps of the hash table buckets. bool do_dump_buckets = false; String dump_buckets_basename; if (param_.getValue("dump_buckets") != "") { do_dump_buckets = true; dump_buckets_basename = param_.getValue("dump_buckets"); } setProgress(++actual_progress); // Even more optionally, we will write dumps of the hashed pairs. 
bool do_dump_pairs = false; String dump_pairs_basename; if (param_.getValue("dump_pairs") != "") { do_dump_pairs = true; dump_pairs_basename = param_.getValue("dump_pairs"); } setProgress(++actual_progress); //************************************************************************** // Select the most abundant data points only. After that, disallow modifications // (we tend to have annoying issues with const_iterator versus iterator). PeakPointerArray_ model_map_ini(map_model.begin(), map_model.end()); const PeakPointerArray_ & model_map(model_map_ini); PeakPointerArray_ scene_map_ini(map_scene.begin(), map_scene.end()); const PeakPointerArray_ & scene_map(scene_map_ini); { // truncate the data as necessary // casting to SignedSize is done on PURPOSE here! (num_used_points will be maximal if -1 is used) const Size num_used_points = (SignedSize) param_.getValue("num_used_points"); if (model_map_ini.size() > num_used_points) { model_map_ini.sortByIntensity(true); model_map_ini.resize(num_used_points); } model_map_ini.sortByComparator(Peak2D::MZLess()); setProgress(++actual_progress); if (scene_map_ini.size() > num_used_points) { scene_map_ini.sortByIntensity(true); scene_map_ini.resize(num_used_points); } scene_map_ini.sortByComparator(Peak2D::MZLess()); setProgress(++actual_progress); // Note: model_map_ini and scene_map_ini will not be used further below } setProgress((actual_progress = 10)); //************************************************************************** // Preprocessing // get RT ranges (NOTE: we trust that min and max have been updated in the // ConsensusMap::convert() method !) 
const double model_low = map_model.getMin()[ConsensusFeature::RT]; const double scene_low = map_scene.getMin()[ConsensusFeature::RT]; const double model_high = map_model.getMax()[ConsensusFeature::RT]; const double scene_high = map_scene.getMax()[ConsensusFeature::RT]; // OLD STUFF // const double rt_low = (maps[0].getMin()[ConsensusFeature::RT] + maps[1].getMin()[ConsensusFeature::RT]) / 2.; // const double rt_high = (maps[0].getMax()[ConsensusFeature::RT] + maps[1].getMax()[ConsensusFeature::RT]) / 2.; // Initialize the hash tables: shift_hash_ // OLD STUFF: was: rt_scaling_hash_, rt_low_hash_, and rt_high_hash_ { // (over)estimate the required number of buckets for shifting double max_shift = param_.getValue("max_shift"); // actually the largest possible shift can be much smaller, depending on the data do { if (max_shift < 0) max_shift = -max_shift; // ...ml@@@mh........ , ........ml@@@mh... // ........sl@@@sh... , ...sl@@@sh........ double diff; diff = model_high - scene_low; if (diff < 0) diff = -diff; if (max_shift > diff) max_shift = diff; diff = model_low - scene_high; if (diff < 0) diff = -diff; if (max_shift > diff) max_shift = diff; } while (0); const Int shift_buckets_num_half = 4 + (Int) ceil((max_shift) / shift_bucket_size); const Int shift_buckets_num = 1 + 2 * shift_buckets_num_half; shift_hash_.getData().clear(); shift_hash_.getData().resize(shift_buckets_num); shift_hash_.setMapping(shift_bucket_size, shift_buckets_num_half, 0); } setProgress(++actual_progress); //************************************************************************** // compute the ratio of the total intensities of both maps, for normalization double total_intensity_ratio; do { double total_int_model_map = 0; for (Size i = 0; i < model_map.size(); ++i) { total_int_model_map += model_map[i].getIntensity(); } setProgress(++actual_progress); double total_int_scene_map = 0; for (Size i = 0; i < scene_map.size(); ++i) { total_int_scene_map += scene_map[i].getIntensity(); } 
setProgress(++actual_progress); // ... and finally ... total_intensity_ratio = total_int_model_map / total_int_scene_map; } while (0); // (the extra syntax helps with code folding in eclipse!) setProgress((actual_progress = 20)); /// The serial number is incremented for each invocation of this, to avoid overwriting of hash table dumps. static Int dump_buckets_serial = 0; ++dump_buckets_serial; //************************************************************************** // Hashing // Compute the transformations between each point pair in the model map // and each point pair in the scene map and hash the shift // transformation. // To speed up the calculation of the final transformation, we confine the number of // considered point pairs. We match a point p in the model map only onto those points p' // in the scene map that lie in a certain mz interval. Size const model_map_size = model_map.size(); // i /* OLD STUFF: also: j */ Size const scene_map_size = scene_map.size(); // k /* OLD STUFF: also: l */ const double winlength_factor_baseline = 0.1; // MAGIC ALERT: Each window is given unit weight. If there are too many pairs for a window, the individual contributions will be very small, but running time will be high, so we provide a cutoff for this. Typically this will exclude compounds which elute over the whole retention time range from consideration. /////////////////////////////////////////////////////////////////// // Hashing: Estimate the shift do // begin of hashing (the extra syntax helps with code folding in eclipse!) 
{ String dump_pairs_filename; std::ofstream dump_pairs_file; if (do_dump_pairs) { dump_pairs_filename = dump_pairs_basename + String(dump_buckets_serial); dump_pairs_file.open(dump_pairs_filename.c_str()); dump_pairs_file << "#" << ' ' << "i" << ' ' << "k" << std::endl; } setProgress(++actual_progress); // first point in model map for (Size i = 0, i_low = 0, i_high = 0, k_low = 0, k_high = 0; i < model_map_size - 1; ++i) { setProgress(actual_progress + float(i) / model_map_size * 10.f); // Adjust window around i in model map while (i_low < model_map_size && model_map[i_low].getMZ() < model_map[i].getMZ() - mz_pair_max_distance) ++i_low; while (i_high < model_map_size && model_map[i_high].getMZ() <= model_map[i].getMZ() + mz_pair_max_distance) ++i_high; double i_winlength_factor = 1. / (i_high - i_low); i_winlength_factor -= winlength_factor_baseline; if (i_winlength_factor <= 0) continue; // Adjust window around k in scene map while (k_low < scene_map_size && scene_map[k_low].getMZ() < model_map[i].getMZ() - mz_pair_max_distance) ++k_low; while (k_high < scene_map_size && scene_map[k_high].getMZ() <= model_map[i].getMZ() + mz_pair_max_distance) ++k_high; // first point in scene map for (Size k = k_low; k < k_high; ++k) { double k_winlength_factor = 1. / (k_high - k_low); k_winlength_factor -= winlength_factor_baseline; if (k_winlength_factor <= 0) continue; // compute similarity of intensities i k double similarity_ik; { const double int_i = model_map[i].getIntensity(); const double int_k = scene_map[k].getIntensity() * total_intensity_ratio; similarity_ik = (int_i < int_k) ? 
int_i / int_k : int_k / int_i; // weight is inverse proportional to number of elements with similar mz similarity_ik *= i_winlength_factor; similarity_ik *= k_winlength_factor; // VV_(int_i<<' '<<int_k<<' '<<int_similarity_ik); } // compute the transformation (i) -> (k) double shift = model_map[i].getRT() - scene_map[k].getRT(); // hash the images of scaling, rt_low and rt_high into their respective hash tables shift_hash_.addValue(shift, similarity_ik); if (do_dump_pairs) { dump_pairs_file << i << ' ' << model_map[i].getRT() << ' ' << model_map[i].getMZ() << ' ' << k << ' ' << scene_map[k].getRT() << ' ' << scene_map[k].getMZ() << ' ' << similarity_ik << ' ' << std::endl; } } // k } // i } while (0); // end of hashing (the extra syntax helps with code folding in eclipse!) setProgress((actual_progress = 30)); /////////////////////////////////////////////////////////////////// // work on shift_hash_ // double shift_low; // double shift_centroid; // double shift_high; // OLD STUFF // double shift_low; double shift_centroid; // double shift_high; do { UInt filtering_stage = 0; // optionally, dump before filtering String dump_buckets_filename; std::ofstream dump_buckets_file; if (do_dump_buckets) { dump_buckets_filename = dump_buckets_basename + "_" + String(dump_buckets_serial); dump_buckets_file.open(dump_buckets_filename.c_str()); VV_(dump_buckets_filename); dump_buckets_file << "# shift hash table buckets dump ( scale, height ) : " << dump_buckets_filename << std::endl; dump_buckets_file << "# unfiltered hash data\n"; for (Size index = 0; index < shift_hash_.getData().size(); ++index) { const double image = shift_hash_.index2key(index); const double height = shift_hash_.getData()[index]; dump_buckets_file << filtering_stage << '\t' << index << '\t' << image << '\t' << height << '\n'; } dump_buckets_file << '\n'; } ++filtering_stage; setProgress(++actual_progress); // apply tophat filter to histogram MorphologicalFilter morph_filter; Param morph_filter_param; 
morph_filter_param.setValue("struc_elem_unit", "DataPoints"); morph_filter_param.setValue("struc_elem_length", double(struc_elem_length_datapoints)); morph_filter_param.setValue("method", "tophat"); morph_filter.setParameters(morph_filter_param); LinearInterpolationType_::container_type buffer(shift_hash_.getData().size()); morph_filter.filterRange(shift_hash_.getData().begin(), shift_hash_.getData().end(), buffer.begin()); shift_hash_.getData().swap(buffer); // optionally, dump after filtering if (do_dump_buckets) { dump_buckets_file << "# tophat filtered hash data\n"; for (Size index = 0; index < shift_hash_.getData().size(); ++index) { const double image = shift_hash_.index2key(index); const double height = shift_hash_.getData()[index]; dump_buckets_file << filtering_stage << '\t' << index << '\t' << image << '\t' << height << '\n'; } dump_buckets_file << '\n'; } setProgress(++actual_progress); ++filtering_stage; // compute freq_cutoff using a fancy criterion to distinguish between the noise level of the histogram and enriched histogram bins double freq_cutoff_low; do { { std::copy(shift_hash_.getData().begin(), shift_hash_.getData().end(), buffer.begin()); std::sort(buffer.begin(), buffer.end(), std::greater<double>()); double freq_intercept = shift_hash_.getData().front(); double freq_slope = (shift_hash_.getData().back() - shift_hash_.getData().front()) / double(buffer.size()) / scaling_histogram_crossing_slope; if (!freq_slope || !buffer.size()) { // in fact these conditions are actually impossible, but let's be really sure ;-) freq_cutoff_low = 0; } else { Size index = 1; // not 0 (!) 
while (buffer[index] >= freq_intercept + freq_slope * double(index)) { ++index; } freq_cutoff_low = buffer[--index]; // note that we have index >= 1 } } } while (0); setProgress(++actual_progress); // apply freq_cutoff, setting smaller values to zero for (Size index = 0; index < shift_hash_.getData().size(); ++index) { if (shift_hash_.getData()[index] < freq_cutoff_low) { shift_hash_.getData()[index] = 0; } } setProgress(++actual_progress); // optionally, dump after noise filtering using freq_cutoff if (do_dump_buckets) { dump_buckets_file << "# after freq_cutoff, which is: " << freq_cutoff_low << '\n'; for (Size index = 0; index < shift_hash_.getData().size(); ++index) { const double image = shift_hash_.index2key(index); const double height = shift_hash_.getData()[index]; dump_buckets_file << filtering_stage << '\t' << index << '\t' << image << '\t' << height << '\n'; } dump_buckets_file << '\n'; } setProgress(++actual_progress); // iterative cut-off based on mean and stdev - relies upon scaling_cutoff_stdev_multiplier which is a bit hard to set right. 
{ Math::BasicStatistics<double> statistics; std::vector<double>::const_iterator data_begin = shift_hash_.getData().begin(); const Size data_size = shift_hash_.getData().size(); Size data_range_begin = 0; Size data_range_end = data_size; for (UInt loop = 0; loop < loops_mean_stdev_cutoff; ++loop) // MAGIC ALERT: number of loops { statistics.update(data_begin + data_range_begin, data_begin + data_range_end); double mean = statistics.mean() + data_range_begin; double stdev = sqrt(statistics.variance()); data_range_begin = floor(std::max<double>(mean - scaling_cutoff_stdev_multiplier * stdev, 0)); data_range_end = ceil(std::min<double>(mean + scaling_cutoff_stdev_multiplier * stdev + 1, data_size)); const double outside_mean = shift_hash_.index2key(mean); const double outside_stdev = stdev * shift_hash_.getScale(); // shift_low = (outside_mean - outside_stdev); shift_centroid = (outside_mean); // shift_high = (outside_mean + outside_stdev); if (do_dump_buckets) { dump_buckets_file << "# loop: " << loop << " mean: " << outside_mean << " stdev: " << outside_stdev << " (mean-stdev): " << outside_mean - outside_stdev << " (mean+stdev): " << outside_mean + outside_stdev << " data_range_begin: " << data_range_begin << " data_range_end: " << data_range_end << std::endl; } } setProgress(++actual_progress); } if (do_dump_buckets) { dump_buckets_file << "# EOF" << std::endl; dump_buckets_file.close(); } setProgress(80); } while (0); //************************************************************************************ // Estimate transform // Compute the shifts at the low and high ends by looking at (around) the fullest bins. double intercept; #if 1 // yes of course, use centroids for images of rt_low and rt_high intercept = shift_centroid; #else // ooh, use maximum bins instead (Note: this is a fossil which would disregard most of the above computations! The code is left here for developers/debugging only.) 
const Size rt_low_max_index = std::distance(shift_hash_.getData().begin(), std::max_element(shift_hash_.getData().begin(), shift_hash_.getData().end())); intercept = shift_hash_.index2key(rt_low_max_index); #endif VV_(intercept); setProgress(++actual_progress); // set trafo { Param params; params.setValue("slope", 1.0); params.setValue("intercept", intercept); TransformationDescription trafo; trafo.fitModel("linear", params); transformation = trafo; } setProgress(++actual_progress); endProgress(); return; } // run()