void SeedListGenerator::generateSeedLists(const ConsensusMap& consensus, Map<UInt64, SeedList>& seed_lists)
{
  seed_lists.clear();
  // iterate over all consensus features...
  for (ConsensusMap::ConstIterator cons_it = consensus.begin(); cons_it != consensus.end(); ++cons_it)
  {
    DPosition<2> point(cons_it->getRT(), cons_it->getMZ());
    // for each sub-map in the consensus map, add a seed at the position of
    // this consensus feature:
    for (ConsensusMap::FileDescriptions::const_iterator file_it = consensus.getFileDescriptions().begin(); file_it != consensus.getFileDescriptions().end(); ++file_it)
    {
      seed_lists[file_it->first].push_back(point);
    }
    // for each feature contained in the consensus feature, remove the seed of
    // the corresponding map:
    for (ConsensusFeature::HandleSetType::const_iterator feat_it = cons_it->getFeatures().begin(); feat_it != cons_it->getFeatures().end(); ++feat_it)
    {
      seed_lists[feat_it->getMapIndex()].pop_back();
    }
    // this leaves seeds for maps where no feature was found near the
    // consensus position
  }
}
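A minimal usage sketch (not part of the original source; the file name is a placeholder and SeedListGenerator is assumed to be default-constructible):

ConsensusMap consensus;
ConsensusXMLFile().load("linked.consensusXML", consensus);
Map<UInt64, SeedList> seed_lists;
SeedListGenerator().generateSeedLists(consensus, seed_lists);
// seed_lists[i] now holds the RT/MZ positions of all consensus features
// that have no supporting feature in sub-map i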
ExitCodes main_(int, const char **)
{
  String in = getStringOption_("in"), out = getStringOption_("out");
  FileTypes::Type in_type = FileHandler::getType(in);

  if (in_type == FileTypes::FEATUREXML)
  {
    FeatureMap<> features;
    FeatureXMLFile().load(in, features);
    for (FeatureMap<>::Iterator feat_it = features.begin(); feat_it != features.end(); ++feat_it)
    {
      resolveConflict_(feat_it->getPeptideIdentifications());
    }
    addDataProcessing_(features, getProcessingInfo_(DataProcessing::FILTERING));
    FeatureXMLFile().store(out, features);
  }
  else // consensusXML
  {
    ConsensusMap consensus;
    ConsensusXMLFile().load(in, consensus);
    for (ConsensusMap::Iterator cons_it = consensus.begin(); cons_it != consensus.end(); ++cons_it)
    {
      resolveConflict_(cons_it->getPeptideIdentifications());
    }
    addDataProcessing_(consensus, getProcessingInfo_(DataProcessing::FILTERING));
    ConsensusXMLFile().store(out, consensus);
  }
  return EXECUTION_OK;
}
void ConsensusMapNormalizerAlgorithmThreshold::normalizeMaps(ConsensusMap& map, const vector<double>& ratios)
{
  ConsensusMap::Iterator cf_it;
  ProgressLogger progresslogger;
  progresslogger.setLogType(ProgressLogger::CMD);
  progresslogger.startProgress(0, map.size(), "normalizing maps");
  for (cf_it = map.begin(); cf_it != map.end(); ++cf_it)
  {
    progresslogger.setProgress(cf_it - map.begin());
    ConsensusFeature::HandleSetType::const_iterator f_it;
    for (f_it = cf_it->getFeatures().begin(); f_it != cf_it->getFeatures().end(); ++f_it)
    {
      f_it->asMutable().setIntensity(f_it->getIntensity() * ratios[f_it->getMapIndex()]);
    }
  }
  progresslogger.endProgress();
}
void MapAlignmentAlgorithmPoseClustering::align(const ConsensusMap & map, TransformationDescription & trafo)
{
  // TODO: move this to updateMembers_? (if consensusMap prevails)
  // TODO: why does superimposer work on consensus map???
  const ConsensusMap & map_model = reference_;
  ConsensusMap map_scene = map;

  // run superimposer to find the global transformation
  TransformationDescription si_trafo;
  superimposer_.run(map_model, map_scene, si_trafo);

  // apply transformation to consensus features and contained feature handles
  for (Size j = 0; j < map_scene.size(); ++j)
  {
    // calculate new RT
    double rt = map_scene[j].getRT();
    rt = si_trafo.apply(rt);
    // set RT of consensus feature centroid
    map_scene[j].setRT(rt);
    // set RT of consensus feature handles
    map_scene[j].begin()->asMutable().setRT(rt);
  }

  // run pairfinder to find pairs
  ConsensusMap result;
  // TODO: add another 2map interface to pairfinder?
  std::vector<ConsensusMap> input(2);
  input[0] = map_model;
  input[1] = map_scene;
  pairfinder_.run(input, result);

  // calculate the local transformation
  si_trafo.invert(); // to undo the transformation applied above
  TransformationDescription::DataPoints data;
  for (ConsensusMap::Iterator it = result.begin(); it != result.end(); ++it)
  {
    if (it->size() == 2) // two matching features
    {
      ConsensusFeature::iterator feat_it = it->begin();
      double y = feat_it->getRT();
      double x = si_trafo.apply((++feat_it)->getRT());
      // one feature should be from the reference map:
      if (feat_it->getMapIndex() != 0)
      {
        data.push_back(make_pair(x, y));
      }
      else
      {
        data.push_back(make_pair(y, x));
      }
    }
  }
  trafo = TransformationDescription(data);
  trafo.fitModel("linear");
}
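A minimal usage sketch (illustrative; it assumes a setReference method populates the reference_ member used above, and the file names are placeholders):

MapAlignmentAlgorithmPoseClustering aligner;
ConsensusMap reference, scene;
ConsensusXMLFile().load("run1.consensusXML", reference);
ConsensusXMLFile().load("run2.consensusXML", scene);
aligner.setReference(reference); // assumed setter for the reference_ member
TransformationDescription trafo;
aligner.align(scene, trafo); // trafo now holds a linear RT model mapping scene onto reference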
void FeatureGroupingAlgorithm::transferSubelements(const vector<ConsensusMap>& maps, ConsensusMap& out) const
{
  // accumulate file descriptions from the input maps:
  // cout << "Updating file descriptions..." << endl;
  out.getFileDescriptions().clear();
  // mapping: (map index, original id) -> new id
  map<pair<Size, UInt64>, Size> mapid_table;
  for (Size i = 0; i < maps.size(); ++i)
  {
    const ConsensusMap& consensus = maps[i];
    for (ConsensusMap::FileDescriptions::const_iterator desc_it = consensus.getFileDescriptions().begin(); desc_it != consensus.getFileDescriptions().end(); ++desc_it)
    {
      Size counter = mapid_table.size();
      mapid_table[make_pair(i, desc_it->first)] = counter;
      out.getFileDescriptions()[counter] = desc_it->second;
    }
  }

  // look-up table: input map -> unique ID -> consensus feature
  // cout << "Creating look-up table..." << endl;
  vector<map<UInt64, ConsensusMap::ConstIterator> > feat_lookup(maps.size());
  for (Size i = 0; i < maps.size(); ++i)
  {
    const ConsensusMap& consensus = maps[i];
    for (ConsensusMap::ConstIterator feat_it = consensus.begin(); feat_it != consensus.end(); ++feat_it)
    {
      // do NOT use "id_lookup[i][feat_it->getUniqueId()] = feat_it;" here as
      // you will get "attempt to copy-construct an iterator from a singular
      // iterator" in STL debug mode:
      feat_lookup[i].insert(make_pair(feat_it->getUniqueId(), feat_it));
    }
  }

  // adjust the consensus features:
  // cout << "Adjusting consensus features..." << endl;
  for (ConsensusMap::iterator cons_it = out.begin(); cons_it != out.end(); ++cons_it)
  {
    // copy only the base feature; this removes the sub-features
    ConsensusFeature adjusted = ConsensusFeature(static_cast<BaseFeature>(*cons_it));
    for (ConsensusFeature::HandleSetType::const_iterator sub_it = cons_it->getFeatures().begin(); sub_it != cons_it->getFeatures().end(); ++sub_it)
    {
      UInt64 id = sub_it->getUniqueId();
      Size map_index = sub_it->getMapIndex();
      ConsensusMap::ConstIterator origin = feat_lookup[map_index][id];
      for (ConsensusFeature::HandleSetType::const_iterator handle_it = origin->getFeatures().begin(); handle_it != origin->getFeatures().end(); ++handle_it)
      {
        FeatureHandle handle = *handle_it;
        Size new_id = mapid_table[make_pair(map_index, handle.getMapIndex())];
        handle.setMapIndex(new_id);
        adjusted.insert(handle);
      }
    }
    *cons_it = adjusted;
  }
}
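A minimal usage sketch (illustrative): 'algo' stands for any concrete FeatureGroupingAlgorithm, and 'grouped' is assumed to be the consensus map it produced from the same input maps:

vector<ConsensusMap> input_maps(2);
ConsensusXMLFile().load("linked1.consensusXML", input_maps[0]);
ConsensusXMLFile().load("linked2.consensusXML", input_maps[1]);
// ... run the grouping algorithm 'algo' on input_maps, producing 'grouped' ...
algo.transferSubelements(input_maps, grouped); // replace grouped handles by the original sub-features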
void MapAlignmentTransformer::transformSingleConsensusMap(ConsensusMap & cmap, const TransformationDescription & trafo)
{
  for (ConsensusMap::Iterator cmit = cmap.begin(); cmit != cmap.end(); ++cmit)
  {
    applyToConsensusFeature_(*cmit, trafo);
  }

  // adapt RT values of unassigned peptides:
  if (!cmap.getUnassignedPeptideIdentifications().empty())
  {
    transformSinglePeptideIdentification(cmap.getUnassignedPeptideIdentifications(), trafo);
  }
}
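A minimal usage sketch (illustrative; it assumes the RT transformation was stored as trafoXML, and the file names are placeholders):

ConsensusMap cmap;
ConsensusXMLFile().load("in.consensusXML", cmap);
TransformationDescription trafo;
TransformationXMLFile().load("rt.trafoXML", trafo);
MapAlignmentTransformer transformer;
transformer.transformSingleConsensusMap(cmap, trafo);
ConsensusXMLFile().store("out.consensusXML", cmap);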
void ConsensusMapNormalizerAlgorithmQuantile::setNormalizedIntensityValues(const vector<vector<double> >& feature_ints, ConsensusMap& map)
{
  // assumes the input map and feature_ints are in the same order as in the beginning,
  // although feature_ints has normalized values now (but the same ranks as before)
  Size number_of_maps = map.getColumnHeaders().size();
  ConsensusMap::ConstIterator cf_it;
  vector<Size> progress_indices(number_of_maps);
  for (cf_it = map.begin(); cf_it != map.end(); ++cf_it)
  {
    ConsensusFeature::HandleSetType::const_iterator f_it;
    for (f_it = cf_it->getFeatures().begin(); f_it != cf_it->getFeatures().end(); ++f_it)
    {
      Size map_idx = f_it->getMapIndex();
      double intensity = feature_ints[map_idx][progress_indices[map_idx]++];
      f_it->asMutable().setIntensity(intensity);
    }
  }
}
void QuantitativeExperimentalDesign::mergeConsensusMaps_(ConsensusMap & out, const String & experiment, StringList & file_paths)
{
  ConsensusMap map;

  LOG_INFO << "Merge consensus maps: " << endl;
  UInt counter = 1;
  for (StringList::Iterator file_it = file_paths.begin(); file_it != file_paths.end(); ++file_it, ++counter)
  {
    // load should clear the map
    ConsensusXMLFile().load(*file_it, map);
    for (ConsensusMap::iterator it = map.begin(); it != map.end(); ++it)
    {
      it->setMetaValue("experiment", DataValue(experiment));
    }
    out += map;
  }
  LOG_INFO << endl;
}
void ConsensusMapNormalizerAlgorithmQuantile::extractIntensityVectors(const ConsensusMap& map, vector<vector<double> >& out_intensities)
{
  // reserve space for out_intensities (unequal vector lengths, 0-features omitted)
  Size number_of_maps = map.getColumnHeaders().size();
  out_intensities.clear();
  out_intensities.resize(number_of_maps);
  for (UInt i = 0; i < number_of_maps; i++)
  {
    ConsensusMap::ColumnHeaders::const_iterator it = map.getColumnHeaders().find(i);
    if (it == map.getColumnHeaders().end())
    {
      throw Exception::ElementNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, String(i));
    }
    out_intensities[i].reserve(it->second.size);
  }
  // fill out_intensities
  ConsensusMap::ConstIterator cf_it;
  for (cf_it = map.begin(); cf_it != map.end(); ++cf_it)
  {
    ConsensusFeature::HandleSetType::const_iterator f_it;
    for (f_it = cf_it->getFeatures().begin(); f_it != cf_it->getFeatures().end(); ++f_it)
    {
      out_intensities[f_it->getMapIndex()].push_back(f_it->getIntensity());
    }
  }
}
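A minimal round-trip sketch (illustrative; 'map' is assumed to be a loaded ConsensusMap) combining extractIntensityVectors with setNormalizedIntensityValues shown further up; the quantile-normalization step itself is elided:

vector<vector<double> > ints;
ConsensusMapNormalizerAlgorithmQuantile::extractIntensityVectors(map, ints);
// ... quantile-normalize 'ints' in place, preserving per-map order/ranks ...
ConsensusMapNormalizerAlgorithmQuantile::setNormalizedIntensityValues(ints, map);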
vector<double> ConsensusMapNormalizerAlgorithmThreshold::computeCorrelation(const ConsensusMap& map, const double& ratio_threshold, const String& acc_filter, const String& desc_filter)
{
  Size number_of_features = map.size();
  Size number_of_maps = map.getFileDescriptions().size();
  vector<vector<double> > feature_int(number_of_maps);

  // get map with most features, resize feature_int
  UInt map_with_most_features_idx = 0;
  ConsensusMap::FileDescriptions::const_iterator map_with_most_features = map.getFileDescriptions().find(0);
  for (UInt i = 0; i < number_of_maps; i++)
  {
    feature_int[i].resize(number_of_features);
    ConsensusMap::FileDescriptions::const_iterator it = map.getFileDescriptions().find(i);
    if (it->second.size > map_with_most_features->second.size)
    {
      map_with_most_features = it;
      map_with_most_features_idx = i;
    }
  }

  // fill feature_int with intensities
  Size pass_counter = 0;
  ConsensusMap::ConstIterator cf_it;
  UInt idx = 0;
  for (cf_it = map.begin(); cf_it != map.end(); ++cf_it, ++idx)
  {
    if (!ConsensusMapNormalizerAlgorithmMedian::passesFilters_(cf_it, map, acc_filter, desc_filter))
    {
      continue;
    }
    ++pass_counter;
    ConsensusFeature::HandleSetType::const_iterator f_it;
    for (f_it = cf_it->getFeatures().begin(); f_it != cf_it->getFeatures().end(); ++f_it)
    {
      feature_int[f_it->getMapIndex()][idx] = f_it->getIntensity();
    }
  }

  LOG_INFO << endl << "Using " << pass_counter << "/" << map.size() << " consensus features for computing normalization coefficients" << endl << endl;

  // determine ratio
  vector<double> ratio_vector(number_of_maps);
  for (UInt j = 0; j < number_of_maps; j++)
  {
    vector<double> ratios;
    for (UInt k = 0; k < number_of_features; ++k)
    {
      if (feature_int[map_with_most_features_idx][k] != 0.0 && feature_int[j][k] != 0.0)
      {
        double ratio = feature_int[map_with_most_features_idx][k] / feature_int[j][k];
        if (ratio > ratio_threshold && ratio < 1 / ratio_threshold)
        {
          ratios.push_back(ratio);
        }
      }
    }
    if (ratios.empty())
    {
      LOG_WARN << endl << "Not enough features passing filters. Cannot compute normalization coefficients for all maps. Result will be unnormalized." << endl << endl;
      return vector<double>(number_of_maps, 1.0);
    }
    ratio_vector[j] = Math::mean(ratios.begin(), ratios.end());
  }
  return ratio_vector;
}
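A minimal sketch chaining this with normalizeMaps from further up (illustrative; 'map' is assumed to be a loaded ConsensusMap, and the ratio threshold and empty filter strings are placeholder values):

vector<double> ratios = ConsensusMapNormalizerAlgorithmThreshold::computeCorrelation(map, 0.67, "", "");
ConsensusMapNormalizerAlgorithmThreshold::normalizeMaps(map, ratios);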
IsobaricChannelExtractor ice(q_method);

// disable activation filtering
Param p = ice.getParameters();
p.setValue("select_activation", "");
p.setValue("min_precursor_intensity", 5300000.0);
ice.setParameters(p);

// extract channels
ConsensusMap cm_out;
ice.extractChannels(exp, cm_out);

// compare results
TEST_EQUAL(cm_out.size(), 4)
ABORT_IF(cm_out.size() != 4)
for (ConsensusMap::Iterator cf = cm_out.begin(); cf != cm_out.end(); ++cf)
{
  DoubleReal prec_intensity = cf->getMetaValue("precursor_intensity");
  TEST_EQUAL(prec_intensity > 5300000.0, true)
}
}

{
// load test data
MSExperiment<Peak1D> exp;
MzMLFile mzmlfile;
mzmlfile.load(OPENMS_GET_TEST_DATA_PATH("IsobaricChannelExtractor_6.mzML"), exp);

// add some more information to the quant method
Param pItraq = q_method->getParameters();
pItraq.setValue("channel_114_description", "ref");
pItraq.setValue("channel_115_description", "something");
ExitCodes outputTo(ostream& os)
{
  //-------------------------------------------------------------
  // Parameter handling
  //-------------------------------------------------------------

  // File names
  String in = getStringOption_("in");

  // File type
  FileHandler fh;
  FileTypes::Type in_type = FileTypes::nameToType(getStringOption_("in_type"));
  if (in_type == FileTypes::UNKNOWN)
  {
    in_type = fh.getType(in);
    writeDebug_(String("Input file type: ") + FileTypes::typeToName(in_type), 2);
  }
  if (in_type == FileTypes::UNKNOWN)
  {
    writeLog_("Error: Could not determine input file type!");
    return PARSE_ERROR;
  }

  MSExperiment<Peak1D> exp;
  FeatureMap feat;
  ConsensusMap cons;

  if (in_type == FileTypes::FEATUREXML) //features
  {
    FeatureXMLFile().load(in, feat);
    feat.updateRanges();
  }
  else if (in_type == FileTypes::CONSENSUSXML) //consensus features
  {
    ConsensusXMLFile().load(in, cons);
    cons.updateRanges();
  }

  //-------------------------------------------------------------
  // meta information
  //-------------------------------------------------------------
  if (getFlag_("m"))
  {
    os << endl
       << "-- General information --" << endl
       << endl
       << "file name: " << in << endl
       << "file type: " << FileTypes::typeToName(in_type) << endl;

    // basic info
    os << endl << "-- Meta information --" << endl << endl;

    if (in_type == FileTypes::FEATUREXML) //features
    {
      os << "Document id : " << feat.getIdentifier() << endl << endl;
    }
    else if (in_type == FileTypes::CONSENSUSXML) //consensus features
    {
      os << "Document id : " << cons.getIdentifier() << endl << endl;
    }
  }

  //-------------------------------------------------------------
  // data processing
  //-------------------------------------------------------------
  if (getFlag_("p"))
  {
    // basic info
    os << endl << "-- Data processing information --" << endl << endl;

    // get data processing info
    vector<DataProcessing> dp;
    if (in_type == FileTypes::FEATUREXML) //features
    {
      dp = feat.getDataProcessing();
    }
    else if (in_type == FileTypes::CONSENSUSXML) //consensus features
    {
      dp = cons.getDataProcessing();
    }
    int i = 0;
    for (vector<DataProcessing>::iterator it = dp.begin(); it != dp.end(); ++it)
    {
      os << "Data processing " << i << endl;
      os << "\tcompletion_time: " << (*it).getCompletionTime().getDate() << 'T' << (*it).getCompletionTime().getTime() << endl;
      os << "\tsoftware name: " << (*it).getSoftware().getName() << " version " << (*it).getSoftware().getVersion() << endl;
      for (set<DataProcessing::ProcessingAction>::const_iterator paIt = (*it).getProcessingActions().begin(); paIt != (*it).getProcessingActions().end(); ++paIt)
      {
        os << "\t\tprocessing action: " << DataProcessing::NamesOfProcessingAction[*paIt] << endl;
      }
      ++i; // count per data processing entry
    }
  }

  //-------------------------------------------------------------
  // statistics
  //-------------------------------------------------------------
  if (getFlag_("s"))
  {
    //-------------------------------------------------------------
    // Content statistics
    //-------------------------------------------------------------
    Map<String, int> meta_names;
    if (in_type == FileTypes::FEATUREXML) //features
    {
      os << "Number of features: " << feat.size() << endl
         << endl
         << "Ranges:" << endl
         << " retention time: " << String::number(feat.getMin()[Peak2D::RT], 2) << " : " << String::number(feat.getMax()[Peak2D::RT], 2) << endl
         << " mass-to-charge: " << String::number(feat.getMin()[Peak2D::MZ], 2) << " : " << String::number(feat.getMax()[Peak2D::MZ], 2) << endl
         << " intensity: " << String::number(feat.getMinInt(), 2) << " : " << String::number(feat.getMaxInt(), 2) << endl
         << endl;

      // Charge distribution
      Map<UInt, UInt> charges;
      for (Size i = 0; i < feat.size(); ++i)
      {
        charges[feat[i].getCharge()]++;
      }
      os << "Charge distribution" << endl;
      for (Map<UInt, UInt>::const_iterator it = charges.begin(); it != charges.end(); ++it)
      {
        os << "charge " << it->first << ": " << it->second << endl;
      }
    }
    else if (in_type == FileTypes::CONSENSUSXML) //consensus features
    {
      map<Size, UInt> num_consfeat_of_size;
      for (ConsensusMap::const_iterator cmit = cons.begin(); cmit != cons.end(); ++cmit)
      {
        ++num_consfeat_of_size[cmit->size()];
      }
      os << endl << "Number of consensus features:" << endl;
      for (map<Size, UInt>::reverse_iterator i = num_consfeat_of_size.rbegin(); i != num_consfeat_of_size.rend(); ++i)
      {
        os << " of size " << setw(2) << i->first << ": " << setw(6) << i->second << endl;
      }
      os << " total: " << setw(6) << cons.size() << endl << endl;

      os << "Ranges:" << endl
         << " retention time: " << String::number(cons.getMin()[Peak2D::RT], 2) << " : " << String::number(cons.getMax()[Peak2D::RT], 2) << endl
         << " mass-to-charge: " << String::number(cons.getMin()[Peak2D::MZ], 2) << " : " << String::number(cons.getMax()[Peak2D::MZ], 2) << endl
         << " intensity: " << String::number(cons.getMinInt(), 2) << " : " << String::number(cons.getMaxInt(), 2) << endl;

      // file descriptions
      const ConsensusMap::FileDescriptions& descs = cons.getFileDescriptions();
      if (!descs.empty())
      {
        os << endl << "File descriptions:" << endl;
        for (ConsensusMap::FileDescriptions::const_iterator it = descs.begin(); it != descs.end(); ++it)
        {
          os << " - " << it->second.filename << endl
             << "   identifier: " << it->first << endl
             << "   label : " << it->second.label << endl
             << "   size : " << it->second.size << endl;
        }
      }
    }
    os << endl << "-- Summary Statistics --" << endl << endl;
  }

  if (in_type == FileTypes::FEATUREXML) //features
  {
    feat.sortByRT();

    vector<double> slice_stats;
    Size n = getIntOption_("n");

    Size begin = 0;
    Size end = 0;
    os << "#slice\tRT_begin\tRT_end\tnumber_of_features\ttic\t"
       << "int_mean\tint_stddev\tint_min\tint_max\tint_median\tint_lowerq\tint_upperq\t"
       << "mz_mean\tmz_stddev\tmz_min\tmz_max\tmz_median\tmz_lowerq\tmz_upperq\t"
       << "width_mean\twidth_stddev\twidth_min\twidth_max\twidth_median\twidth_lowerq\twidth_upperq\t"
       << "qual_mean\tqual_stddev\tqual_min\tqual_max\tqual_median\tqual_lowerq\tqual_upperq\t"
       << "rt_qual_mean\trt_qual_stddev\trt_qual_min\trt_qual_max\trt_qual_median\trt_qual_lowerq\trt_qual_upperq\t"
       << "mz_qual_mean\tmz_qual_stddev\tmz_qual_min\tmz_qual_max\tmz_qual_median\tmz_qual_lowerq\tmz_qual_upperq"
       << endl;

    double rt_begin = 0.0;
    for (Size slice = 0; slice < n; ++slice)
    {
      // Determine slice boundaries.
      double rt_end = feat.back().getRT() / (double)n * (slice + 1);
      for (end = begin; end < feat.size() && feat[end].getRT() < rt_end; ++end) {}

      // Compute statistics on all features in this slice.
      slice_stats = sliceStatistics(feat, begin, end);

      // Write the beginning and end of the slices to the output as well as the slice index.
      os << slice << "\t" << rt_begin << "\t" << rt_end << "\t" << end - begin << "\t";

      // Write the statistics as a line of a csv file
      copy(slice_stats.begin(), slice_stats.end(), ostream_iterator<double>(os, "\t"));
      os << endl;

      begin = end;
      rt_begin = rt_end;
    }
  }
  else if (in_type == FileTypes::CONSENSUSXML) //consensus features
  {
    Size size = cons.size();

    vector<double> intensities;
    intensities.reserve(size);
    vector<double> qualities;
    qualities.reserve(size);
    vector<double> widths;
    widths.reserve(size);

    vector<double> rt_delta_by_elems;
    vector<double> rt_aad_by_elems;
    vector<double> rt_aad_by_cfs;
    rt_aad_by_cfs.reserve(size);

    vector<double> mz_delta_by_elems;
    vector<double> mz_aad_by_elems;
    vector<double> mz_aad_by_cfs;
    mz_aad_by_cfs.reserve(size);

    vector<double> it_delta_by_elems;
    vector<double> it_aad_by_elems;
    vector<double> it_aad_by_cfs;
    it_aad_by_cfs.reserve(size);

    for (ConsensusMap::const_iterator cm_iter = cons.begin(); cm_iter != cons.end(); ++cm_iter)
    {
      double rt_aad = 0;
      double mz_aad = 0;
      double it_aad = 0;
      intensities.push_back(cm_iter->getIntensity());
      qualities.push_back(cm_iter->getQuality());
      widths.push_back(cm_iter->getWidth());
      for (ConsensusFeature::HandleSetType::const_iterator hs_iter = cm_iter->begin(); hs_iter != cm_iter->end(); ++hs_iter)
      {
        double rt_diff = hs_iter->getRT() - cm_iter->getRT();
        rt_delta_by_elems.push_back(rt_diff);
        if (rt_diff < 0)
        {
          rt_diff = -rt_diff;
        }
        rt_aad_by_elems.push_back(rt_diff);
        rt_aad += rt_diff;

        double mz_diff = hs_iter->getMZ() - cm_iter->getMZ();
        mz_delta_by_elems.push_back(mz_diff);
        if (mz_diff < 0)
        {
          mz_diff = -mz_diff;
        }
        mz_aad_by_elems.push_back(mz_diff);
        mz_aad += mz_diff;

        double it_ratio = hs_iter->getIntensity() / (cm_iter->getIntensity() ? cm_iter->getIntensity() : 1.);
        it_delta_by_elems.push_back(it_ratio);
        if (it_ratio < 1.)
        {
          it_ratio = 1. / it_ratio;
        }
        it_aad_by_elems.push_back(it_ratio);
        it_aad += it_ratio;
      }
      if (!cm_iter->empty())
      {
        rt_aad /= cm_iter->size();
        mz_aad /= cm_iter->size();
        it_aad /= cm_iter->size();
      } // otherwise rt_aad etc. are 0 anyway
      rt_aad_by_cfs.push_back(rt_aad);
      mz_aad_by_cfs.push_back(mz_aad);
      it_aad_by_cfs.push_back(it_aad);
    }

    OpenMS::SomeStatistics some_statistics;

    os.precision(writtenDigits(ConsensusFeature::IntensityType()));
    os << "Intensities of consensus features:" << endl << some_statistics(intensities) << endl;

    os.precision(writtenDigits(ConsensusFeature::QualityType()));
    os << "Qualities of consensus features:" << endl << some_statistics(qualities) << endl;

    os.precision(writtenDigits(ConsensusFeature::CoordinateType()));
    os << "Retention time differences ( element-center, weight 1 per element):" << endl << some_statistics(rt_delta_by_elems) << endl;
    os << "Absolute retention time differences ( |element-center|, weight 1 per element):" << endl << some_statistics(rt_aad_by_elems) << endl;
    os << "Average absolute differences of retention time within consensus features ( |element-center|, weight 1 per consensus feature):" << endl << some_statistics(rt_aad_by_cfs) << endl;

    os.precision(writtenDigits(ConsensusFeature::CoordinateType()));
    os << "Mass-to-charge differences ( element-center, weight 1 per element):" << endl << some_statistics(mz_delta_by_elems) << endl;
    os << "Absolute differences of mass-to-charge ( |element-center|, weight 1 per element):" << endl << some_statistics(mz_aad_by_elems) << endl;
    os << "Average absolute differences of mass-to-charge within consensus features ( |element-center|, weight 1 per consensus feature):" << endl << some_statistics(mz_aad_by_cfs) << endl;

    os.precision(writtenDigits(ConsensusFeature::IntensityType()));
    os << "Intensity ratios ( element/center, weight 1 per element):" << endl << some_statistics(it_delta_by_elems) << endl;
    os << "Relative intensity error ( max{(element/center),(center/element)}, weight 1 per element):" << endl << some_statistics(it_aad_by_elems) << endl;
    os << "Average relative intensity error within consensus features ( max{(element/center),(center/element)}, weight 1 per consensus feature):" << endl << some_statistics(it_aad_by_cfs) << endl;
  }

  return EXECUTION_OK;
}
ExitCodes main_(int, const char**)
{
  //-------------------------------------------------------------
  // parameter handling
  //-------------------------------------------------------------

  // file list
  StringList file_list = getStringList_("in");

  // file type
  FileHandler file_handler;
  FileTypes::Type force_type;
  if (getStringOption_("in_type").size() > 0)
  {
    force_type = FileTypes::nameToType(getStringOption_("in_type"));
  }
  else
  {
    force_type = file_handler.getType(file_list[0]);
  }

  // output file names and types
  String out_file = getStringOption_("out");

  bool annotate_file_origin = getFlag_("annotate_file_origin");
  rt_gap_ = getDoubleOption_("rt_concat:gap");
  vector<String> trafo_out = getStringList_("rt_concat:trafo_out");
  if (trafo_out.empty())
  {
    // resize now so we don't have to worry about indexing out of bounds:
    trafo_out.resize(file_list.size());
  }
  else if (trafo_out.size() != file_list.size())
  {
    writeLog_("Error: Number of transformation output files must equal the number of input files (parameters 'rt_concat:trafo_out'/'in')!");
    return ILLEGAL_PARAMETERS;
  }

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------

  if (force_type == FileTypes::FEATUREXML)
  {
    FeatureMap out;
    FeatureXMLFile fh;
    for (Size i = 0; i < file_list.size(); ++i)
    {
      FeatureMap map;
      fh.load(file_list[i], map);

      if (annotate_file_origin)
      {
        for (FeatureMap::iterator it = map.begin(); it != map.end(); ++it)
        {
          it->setMetaValue("file_origin", DataValue(file_list[i]));
        }
      }

      if (rt_gap_ > 0.0) // concatenate in RT
      {
        adjustRetentionTimes_(map, trafo_out[i], i == 0);
      }

      out += map;
    }

    //-------------------------------------------------------------
    // writing output
    //-------------------------------------------------------------

    // annotate output with data processing info
    addDataProcessing_(out, getProcessingInfo_(DataProcessing::FORMAT_CONVERSION));
    fh.store(out_file, out);
  }
  else if (force_type == FileTypes::CONSENSUSXML)
  {
    ConsensusMap out;
    ConsensusXMLFile fh;
    fh.load(file_list[0], out);
    // skip first file
    for (Size i = 1; i < file_list.size(); ++i)
    {
      ConsensusMap map;
      fh.load(file_list[i], map);

      if (annotate_file_origin)
      {
        for (ConsensusMap::iterator it = map.begin(); it != map.end(); ++it)
        {
          it->setMetaValue("file_origin", DataValue(file_list[i]));
        }
      }

      if (rt_gap_ > 0.0) // concatenate in RT
      {
        adjustRetentionTimes_(map, trafo_out[i], i == 0);
      }

      out += map;
    }

    //-------------------------------------------------------------
    // writing output
    //-------------------------------------------------------------

    // annotate output with data processing info
    addDataProcessing_(out, getProcessingInfo_(DataProcessing::FORMAT_CONVERSION));
    fh.store(out_file, out);
  }
  else if (force_type == FileTypes::TRAML)
  {
    TargetedExperiment out;
    TraMLFile fh;
    for (Size i = 0; i < file_list.size(); ++i)
    {
      TargetedExperiment map;
      fh.load(file_list[i], map);
      out += map;
    }

    //-------------------------------------------------------------
    // writing output
    //-------------------------------------------------------------

    // annotate output with data processing info
    Software software;
    software.setName("FileMerger");
    software.setVersion(VersionInfo::getVersion());
    out.addSoftware(software);
    fh.store(out_file, out);
  }
  else // raw data input (e.g. mzML)
  {
    // RT
    bool rt_auto_number = getFlag_("raw:rt_auto");
    bool rt_filename = getFlag_("raw:rt_filename");
    bool rt_custom = false;
    DoubleList custom_rts = getDoubleList_("raw:rt_custom");
    if (!custom_rts.empty())
    {
      rt_custom = true;
      if (custom_rts.size() != file_list.size())
      {
        writeLog_("Custom retention time list (parameter 'raw:rt_custom') must have as many elements as there are input files (parameter 'in')!");
        return ILLEGAL_PARAMETERS;
      }
    }

    // MS level
    Int ms_level = getIntOption_("raw:ms_level");

    MSExperiment<> out;
    UInt rt_auto = 0;
    UInt native_id = 0;
    for (Size i = 0; i < file_list.size(); ++i)
    {
      String filename = file_list[i];

      // load file
      force_type = file_handler.getType(file_list[i]);
      MSExperiment<> in;
      file_handler.loadExperiment(filename, in, force_type, log_type_);

      if (in.empty() && in.getChromatograms().empty())
      {
        writeLog_(String("Warning: Empty file '") + filename + "'!");
        continue;
      }
      out.reserve(out.size() + in.size());

      // warn if custom RT and more than one scan in input file
      if (rt_custom && in.size() > 1)
      {
        writeLog_(String("Warning: More than one scan in file '") + filename + "'! All scans will have the same retention time!");
      }

      // handle special raw data options:
      for (MSExperiment<>::iterator spec_it = in.begin(); spec_it != in.end(); ++spec_it)
      {
        float rt_final = spec_it->getRT();
        if (rt_auto_number)
        {
          rt_final = ++rt_auto;
        }
        else if (rt_custom)
        {
          rt_final = custom_rts[i];
        }
        else if (rt_filename)
        {
          static const boost::regex re("rt(\\d+(\\.\\d+)?)");
          boost::smatch match;
          bool found = boost::regex_search(filename, match, re);
          if (found)
          {
            rt_final = String(match[1]).toFloat();
          }
          else
          {
            writeLog_("Warning: could not extract retention time from filename '" + filename + "'");
          }
        }

        // none of the RT methods were successful
        if (rt_final < 0)
        {
          writeLog_(String("Warning: No valid retention time for output scan '") + rt_auto + "' from file '" + filename + "'");
        }

        spec_it->setRT(rt_final);
        spec_it->setNativeID("spectrum=" + String(native_id));
        if (ms_level > 0)
        {
          spec_it->setMSLevel(ms_level);
        }
        ++native_id;
      }

      // if we have only one spectrum, we can annotate it directly; for more
      // spectra, we just name the source file, leaving the spectra
      // unannotated (to avoid a long and redundant list of sourceFiles)
      if (in.size() == 1 && !in.getSourceFiles().empty())
      {
        in[0].setSourceFile(in.getSourceFiles()[0]);
        in.getSourceFiles().clear(); // delete source file annotation from the experiment (it's in the spectrum anyway)
      }

      if (rt_gap_ > 0.0) // concatenate in RT
      {
        adjustRetentionTimes_(in, trafo_out[i], i == 0);
      }

      // add spectra to output
      for (MSExperiment<>::const_iterator spec_it = in.begin(); spec_it != in.end(); ++spec_it)
      {
        out.addSpectrum(*spec_it);
      }
      // also add the chromatograms
      for (vector<MSChromatogram<ChromatogramPeak> >::const_iterator chrom_it = in.getChromatograms().begin(); chrom_it != in.getChromatograms().end(); ++chrom_it)
      {
        out.addChromatogram(*chrom_it);
      }

      // copy experimental settings from first file
      if (i == 0)
      {
        out.ExperimentalSettings::operator=(in);
      }
      else // otherwise append
      {
        out.getSourceFiles().insert(out.getSourceFiles().end(), in.getSourceFiles().begin(), in.getSourceFiles().end()); // could be empty if spectrum was annotated above, but that's ok then
      }
    }

    //-------------------------------------------------------------
    // writing output
    //-------------------------------------------------------------

    // annotate output with data processing info
    addDataProcessing_(out, getProcessingInfo_(DataProcessing::FORMAT_CONVERSION));
    MzMLFile f;
    f.setLogType(log_type_);
    f.store(out_file, out);
  }

  return EXECUTION_OK;
}
void IBSpectraFile::store(const String& filename, const ConsensusMap& cm)
{
  // typedefs for shorter code
  typedef std::vector<ProteinHit>::iterator ProtHitIt;

  // general settings .. do we need to expose these?
  // ----------------------------------------------------------------------
  /// Allow also non-unique peptides to be exported
  bool allow_non_unique = true;
  /// Intensities below this value will be set to 0.0 to avoid numerical problems when quantifying
  double intensity_threshold = 0.00001;
  // ----------------------------------------------------------------------

  // guess experiment type
  boost::shared_ptr<IsobaricQuantitationMethod> quantMethod = guessExperimentType_(cm);

  // we need the protein identifications to reference the protein names
  ProteinIdentification protIdent;
  bool has_proteinIdentifications = false;
  if (cm.getProteinIdentifications().size() > 0)
  {
    protIdent = cm.getProteinIdentifications()[0];
    has_proteinIdentifications = true;
  }

  // start the file by adding the tsv header
  TextFile textFile;
  textFile.addLine(ListUtils::concatenate(constructHeader_(*quantMethod), "\t"));

  for (ConsensusMap::ConstIterator cm_iter = cm.begin(); cm_iter != cm.end(); ++cm_iter)
  {
    const ConsensusFeature& cFeature = *cm_iter;
    std::vector<IdCSV> entries;

    // 1st: extract the identification information from the consensus feature
    if (cFeature.getPeptideIdentifications().size() == 0 || !has_proteinIdentifications)
    {
      // we store unidentified hits anyway, because the iTRAQ quant is still helpful for normalization
      entries.push_back(IdCSV());
    }
    else
    {
      // protein name:
      const PeptideHit& peptide_hit = cFeature.getPeptideIdentifications()[0].getHits()[0];
      std::set<String> protein_accessions = peptide_hit.extractProteinAccessions();
      if (protein_accessions.size() != 1)
      {
        if (!allow_non_unique) continue; // we only want unique peptides
      }

      for (std::set<String>::const_iterator prot_ac = protein_accessions.begin(); prot_ac != protein_accessions.end(); ++prot_ac)
      {
        IdCSV entry;
        entry.charge = cFeature.getPeptideIdentifications()[0].getHits()[0].getCharge();
        entry.peptide = cFeature.getPeptideIdentifications()[0].getHits()[0].getSequence().toUnmodifiedString();
        entry.theo_mass = cFeature.getPeptideIdentifications()[0].getHits()[0].getSequence().getMonoWeight(Residue::Full, cFeature.getPeptideIdentifications()[0].getHits()[0].getCharge());

        // write modif
        entry.modif = getModifString_(cFeature.getPeptideIdentifications()[0].getHits()[0].getSequence());

        ProtHitIt proteinHit = protIdent.findHit(*prot_ac);
        if (proteinHit == protIdent.getHits().end())
        {
          std::cerr << "Protein referenced in peptide not found...\n";
          continue; // protein not found
        }

        entry.accession = proteinHit->getAccession();
        entries.push_back(entry);
      }
    }

    // 2nd: add the quantitative information of the channels
    // .. skip features with 0 intensity
    if (cFeature.getIntensity() == 0)
    {
      continue;
    }

    for (std::vector<IdCSV>::iterator entry = entries.begin(); entry != entries.end(); ++entry)
    {
      // set parent intensity
      entry->parent_intens = cFeature.getIntensity();
      entry->retention_time = cFeature.getRT();
      entry->spectrum = cFeature.getUniqueId();
      entry->exp_mass = cFeature.getMZ();

      // create output line
      StringList currentLine;

      // add entry to currentLine
      entry->toStringList(currentLine);

      // extract channel intensities and positions
      std::map<Int, double> intensityMap;
      ConsensusFeature::HandleSetType features = cFeature.getFeatures();

      for (ConsensusFeature::HandleSetType::const_iterator fIt = features.begin(); fIt != features.end(); ++fIt)
      {
        intensityMap[Int(fIt->getMZ())] = (fIt->getIntensity() > intensity_threshold ? fIt->getIntensity() : 0.0);
      }
      for (IsobaricQuantitationMethod::IsobaricChannelList::const_iterator it = quantMethod->getChannelInformation().begin(); it != quantMethod->getChannelInformation().end(); ++it)
      {
        currentLine.push_back(String(it->center));
      }
      for (IsobaricQuantitationMethod::IsobaricChannelList::const_iterator it = quantMethod->getChannelInformation().begin(); it != quantMethod->getChannelInformation().end(); ++it)
      {
        currentLine.push_back(String(intensityMap[int(it->center)]));
      }

      textFile.addLine(ListUtils::concatenate(currentLine, "\t"));
    }
  }

  // write to file
  textFile.store(filename);
}
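A minimal usage sketch (illustrative; file names are placeholders): export an isobarically quantified consensus map in ibspectra format.

ConsensusMap cm;
ConsensusXMLFile().load("itraq_quant.consensusXML", cm);
IBSpectraFile().store("quant.ibspectra.csv", cm);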
ExitCodes main_(int, const char **)
{
  String in = getStringOption_("in"), out = getStringOption_("out"), id_out = getStringOption_("id_out");

  if (out.empty() && id_out.empty())
  {
    throw Exception::RequiredParameterNotGiven(__FILE__, __LINE__, __PRETTY_FUNCTION__, "out/id_out");
  }

  vector<ProteinIdentification> proteins;
  vector<PeptideIdentification> peptides;

  FileTypes::Type in_type = FileHandler::getType(in);
  if (in_type == FileTypes::MZML)
  {
    MSExperiment<> experiment;
    MzMLFile().load(in, experiment);
    // what about unassigned peptide IDs?
    for (MSExperiment<>::Iterator exp_it = experiment.begin(); exp_it != experiment.end(); ++exp_it)
    {
      peptides.insert(peptides.end(), exp_it->getPeptideIdentifications().begin(), exp_it->getPeptideIdentifications().end());
      exp_it->getPeptideIdentifications().clear();
    }
    experiment.getProteinIdentifications().swap(proteins);
    if (!out.empty())
    {
      addDataProcessing_(experiment, getProcessingInfo_(DataProcessing::FILTERING));
      MzMLFile().store(out, experiment);
    }
  }
  else if (in_type == FileTypes::FEATUREXML)
  {
    FeatureMap features;
    FeatureXMLFile().load(in, features);
    features.getUnassignedPeptideIdentifications().swap(peptides);
    for (FeatureMap::Iterator feat_it = features.begin(); feat_it != features.end(); ++feat_it)
    {
      peptides.insert(peptides.end(), feat_it->getPeptideIdentifications().begin(), feat_it->getPeptideIdentifications().end());
      feat_it->getPeptideIdentifications().clear();
    }
    features.getProteinIdentifications().swap(proteins);
    if (!out.empty())
    {
      addDataProcessing_(features, getProcessingInfo_(DataProcessing::FILTERING));
      FeatureXMLFile().store(out, features);
    }
  }
  else // consensusXML
  {
    ConsensusMap consensus;
    ConsensusXMLFile().load(in, consensus);
    consensus.getUnassignedPeptideIdentifications().swap(peptides);
    for (ConsensusMap::Iterator cons_it = consensus.begin(); cons_it != consensus.end(); ++cons_it)
    {
      peptides.insert(peptides.end(), cons_it->getPeptideIdentifications().begin(), cons_it->getPeptideIdentifications().end());
      cons_it->getPeptideIdentifications().clear();
    }
    consensus.getProteinIdentifications().swap(proteins);
    if (!out.empty())
    {
      addDataProcessing_(consensus, getProcessingInfo_(DataProcessing::FILTERING));
      ConsensusXMLFile().store(out, consensus);
    }
  }

  if (!id_out.empty())
  {
    // IDMapper can match a peptide ID to several overlapping features,
    // resulting in duplicates; this shouldn't be the case for peak data
    if (in_type != FileTypes::MZML) removeDuplicates_(peptides);
    IdXMLFile().store(id_out, proteins, peptides);
  }

  return EXECUTION_OK;
}
ExitCodes main_(int, const char**)
{
  vector<ProteinIdentification> prot_ids;
  vector<PeptideIdentification> pep_ids;
  ProteinHit temp_protein_hit;

  //-------------------------------------------------------------
  // parsing parameters
  //-------------------------------------------------------------
  String inputfile_id = getStringOption_("id");
  String inputfile_feature = getStringOption_("feature");
  String inputfile_consensus = getStringOption_("consensus");
  String inputfile_raw = getStringOption_("in");
  String outputfile_name = getStringOption_("out");

  //~ bool Ms1(getFlag_("MS1"));
  //~ bool Ms2(getFlag_("MS2"));
  bool remove_duplicate_features(getFlag_("remove_duplicate_features"));

  //-------------------------------------------------------------
  // fetch vocabularies
  //-------------------------------------------------------------
  ControlledVocabulary cv;
  cv.loadFromOBO("PSI-MS", File::find("/CV/psi-ms.obo"));
  cv.loadFromOBO("QC", File::find("/CV/qc-cv.obo"));

  QcMLFile qcmlfile;

  //-------------------------------------------------------------
  // MS acquisition
  //-------------------------------------------------------------
  String base_name = QFileInfo(QString::fromStdString(inputfile_raw)).baseName();

  cout << "Reading mzML file..." << endl;
  MzMLFile mz_data_file;
  MSExperiment<Peak1D> exp;
  MzMLFile().load(inputfile_raw, exp);

  //---prep input
  exp.sortSpectra();
  UInt min_mz = std::numeric_limits<UInt>::max();
  UInt max_mz = 0;
  std::map<Size, UInt> mslevelcounts;

  qcmlfile.registerRun(base_name, base_name); //TODO use UIDs

  //---base MS acquisition qp
  String msaq_ref = base_name + "_msaq";
  QcMLFile::QualityParameter qp;
  qp.id = msaq_ref; ///< Identifier
  qp.cvRef = "QC"; ///< cv reference
  qp.cvAcc = "QC:0000004";
  try
  {
    //~ const ControlledVocabulary::CVTerm& test = cv.getTermByName("MS aquisition result details");
    //~ cout << test.name << test.id << endl;
    const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
    //~ const ControlledVocabulary::CVTerm& term = cv.getTerm("0000004");
    qp.name = term.name; ///< Name
  }
  catch (...)
  {
    qp.name = "mzML file"; ///< Name
  }
  qcmlfile.addRunQualityParameter(base_name, qp);

  //---file origin qp
  qp = QcMLFile::QualityParameter();
  qp.name = "mzML file"; ///< Name
  qp.id = base_name + "_run_name"; ///< Identifier
  qp.cvRef = "MS"; ///< cv reference
  qp.cvAcc = "MS:1000577";
  qp.value = base_name;
  qcmlfile.addRunQualityParameter(base_name, qp);

  qp = QcMLFile::QualityParameter();
  qp.name = "instrument model"; ///< Name
  qp.id = base_name + "_instrument_name"; ///< Identifier
  qp.cvRef = "MS"; ///< cv reference
  qp.cvAcc = "MS:1000031";
  qp.value = exp.getInstrument().getName();
  qcmlfile.addRunQualityParameter(base_name, qp);

  qp = QcMLFile::QualityParameter();
  qp.name = "completion time"; ///< Name
  qp.id = base_name + "_date"; ///< Identifier
  qp.cvRef = "MS"; ///< cv reference
  qp.cvAcc = "MS:1000747";
  qp.value = exp.getDateTime().getDate();
  qcmlfile.addRunQualityParameter(base_name, qp);

  //---precursors at
  QcMLFile::Attachment at;
  at.cvRef = "QC"; ///< cv reference
  at.cvAcc = "QC:0000044";
  at.qualityRef = msaq_ref;
  at.id = base_name + "_precursors"; ///< Identifier
  try
  {
    const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
    at.name = term.name; ///< Name
  }
  catch (...)
{ at.name = "precursors"; ///< Name } at.colTypes.push_back("MS:1000894_[sec]"); //RT at.colTypes.push_back("MS:1000040"); //MZ for (Size i = 0; i < exp.size(); ++i) { mslevelcounts[exp[i].getMSLevel()]++; if (exp[i].getMSLevel() == 2) { if (exp[i].getPrecursors().front().getMZ() < min_mz) { min_mz = exp[i].getPrecursors().front().getMZ(); } if (exp[i].getPrecursors().front().getMZ() > max_mz) { max_mz = exp[i].getPrecursors().front().getMZ(); } std::vector<String> row; row.push_back(exp[i].getRT()); row.push_back(exp[i].getPrecursors().front().getMZ()); at.tableRows.push_back(row); } } qcmlfile.addRunAttachment(base_name, at); //---aquisition results qp qp = QcMLFile::QualityParameter(); qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000006"; ///< cv accession for "aquisition results" qp.id = base_name + "_ms1aquisition"; ///< Identifier qp.value = String(mslevelcounts[1]); try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) { qp.name = "number of ms1 spectra"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); qp = QcMLFile::QualityParameter(); qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000007"; ///< cv accession for "aquisition results" qp.id = base_name + "_ms2aquisition"; ///< Identifier qp.value = String(mslevelcounts[2]); try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) { qp.name = "number of ms2 spectra"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); qp = QcMLFile::QualityParameter(); qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000008"; ///< cv accession for "aquisition results" qp.id = base_name + "_Chromaquisition"; ///< Identifier qp.value = String(exp.getChromatograms().size()); try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) { qp.name = "number of chromatograms"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); at = QcMLFile::Attachment(); at.cvRef = "QC"; ///< cv reference at.cvAcc = "QC:0000009"; at.qualityRef = msaq_ref; at.id = base_name + "_mzrange"; ///< Identifier try { const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc); at.name = term.name; ///< Name } catch (...) { at.name = "MS MZ aquisition ranges"; ///< Name } at.colTypes.push_back("QC:0000010"); //MZ at.colTypes.push_back("QC:0000011"); //MZ std::vector<String> rowmz; rowmz.push_back(String(min_mz)); rowmz.push_back(String(max_mz)); at.tableRows.push_back(rowmz); qcmlfile.addRunAttachment(base_name, at); at = QcMLFile::Attachment(); at.cvRef = "QC"; ///< cv reference at.cvAcc = "QC:0000012"; at.qualityRef = msaq_ref; at.id = base_name + "_rtrange"; ///< Identifier try { const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc); at.name = term.name; ///< Name } catch (...) { at.name = "MS RT aquisition ranges"; ///< Name } at.colTypes.push_back("QC:0000013"); //MZ at.colTypes.push_back("QC:0000014"); //MZ std::vector<String> rowrt; rowrt.push_back(String(exp.begin()->getRT())); rowrt.push_back(String(exp.getSpectra().back().getRT())); at.tableRows.push_back(rowrt); qcmlfile.addRunAttachment(base_name, at); //---ion current stability ( & tic ) qp at = QcMLFile::Attachment(); at.cvRef = "QC"; ///< cv reference at.cvAcc = "QC:0000022"; at.qualityRef = msaq_ref; at.id = base_name + "_tics"; ///< Identifier try { const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc); at.name = term.name; ///< Name } catch (...) 
{ at.name = "MS TICs"; ///< Name } at.colTypes.push_back("MS:1000894_[sec]"); at.colTypes.push_back("MS:1000285"); UInt max = 0; Size below_10k = 0; for (Size i = 0; i < exp.size(); ++i) { if (exp[i].getMSLevel() == 1) { UInt sum = 0; for (Size j = 0; j < exp[i].size(); ++j) { sum += exp[i][j].getIntensity(); } if (sum > max) { max = sum; } if (sum < 10000) { ++below_10k; } std::vector<String> row; row.push_back(exp[i].getRT()); row.push_back(sum); at.tableRows.push_back(row); } } qcmlfile.addRunAttachment(base_name, at); qp = QcMLFile::QualityParameter(); qp.id = base_name + "_ticslump"; ///< Identifier qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000023"; qp.value = String((100 / exp.size()) * below_10k); try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) { qp.name = "percentage of tic slumps"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); //------------------------------------------------------------- // MS id //------------------------------------------------------------ if (inputfile_id != "") { IdXMLFile().load(inputfile_id, prot_ids, pep_ids); cerr << "idXML read ended. Found " << pep_ids.size() << " peptide identifications." << endl; ProteinIdentification::SearchParameters params = prot_ids[0].getSearchParameters(); vector<String> var_mods = params.variable_modifications; //~ boost::regex re("(?<=[KR])(?=[^P])"); String msid_ref = base_name + "_msid"; QcMLFile::QualityParameter qp; qp.id = msid_ref; ///< Identifier qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000025"; try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) { qp.name = "MS identification result details"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); at = QcMLFile::Attachment(); at.cvRef = "QC"; ///< cv reference at.cvAcc = "QC:0000026"; at.qualityRef = msid_ref; at.id = base_name + "_idsetting"; ///< Identifier try { const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc); at.name = term.name; ///< Name } catch (...) 
{ at.name = "MS id settings"; ///< Name } at.colTypes.push_back("MS:1001013"); //MS:1001013 db name MS:1001016 version MS:1001020 taxonomy at.colTypes.push_back("MS:1001016"); at.colTypes.push_back("MS:1001020"); std::vector<String> row; row.push_back(String(prot_ids.front().getSearchParameters().db)); row.push_back(String(prot_ids.front().getSearchParameters().db_version)); row.push_back(String(prot_ids.front().getSearchParameters().taxonomy)); at.tableRows.push_back(row); qcmlfile.addRunAttachment(base_name, at); UInt spectrum_count = 0; Size peptide_hit_count = 0; UInt runs_count = 0; Size protein_hit_count = 0; set<String> peptides; set<String> proteins; Size missedcleavages = 0; for (Size i = 0; i < pep_ids.size(); ++i) { if (!pep_ids[i].empty()) { ++spectrum_count; peptide_hit_count += pep_ids[i].getHits().size(); const vector<PeptideHit>& temp_hits = pep_ids[i].getHits(); for (Size j = 0; j < temp_hits.size(); ++j) { peptides.insert(temp_hits[j].getSequence().toString()); } } } for (set<String>::iterator it = peptides.begin(); it != peptides.end(); ++it) { for (String::const_iterator st = it->begin(); st != it->end() - 1; ++st) { if (*st == 'K' || *st == 'R') { ++missedcleavages; } } } for (Size i = 0; i < prot_ids.size(); ++i) { ++runs_count; protein_hit_count += prot_ids[i].getHits().size(); const vector<ProteinHit>& temp_hits = prot_ids[i].getHits(); for (Size j = 0; j < temp_hits.size(); ++j) { proteins.insert(temp_hits[j].getAccession()); } } qp = QcMLFile::QualityParameter(); qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000037"; ///< cv accession qp.id = base_name + "_misscleave"; ///< Identifier qp.value = missedcleavages; try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) { qp.name = "total number of missed cleavages"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); qp = QcMLFile::QualityParameter(); qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000032"; ///< cv accession qp.id = base_name + "_totprot"; ///< Identifier qp.value = protein_hit_count; try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) { qp.name = "total number of identified proteins"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); qp = QcMLFile::QualityParameter(); qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000033"; ///< cv accession qp.id = base_name + "_totuniqprot"; ///< Identifier qp.value = String(proteins.size()); try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) { qp.name = "total number of uniquely identified proteins"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); qp = QcMLFile::QualityParameter(); qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000029"; ///< cv accession qp.id = base_name + "_psms"; ///< Identifier qp.value = String(spectrum_count); try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) { qp.name = "total number of PSM"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); qp = QcMLFile::QualityParameter(); qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000030"; ///< cv accession qp.id = base_name + "_totpeps"; ///< Identifier qp.value = String(peptide_hit_count); try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) 
{ qp.name = "total number of identified peptides"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); qp = QcMLFile::QualityParameter(); qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000031"; ///< cv accession qp.id = base_name + "_totuniqpeps"; ///< Identifier qp.value = String(peptides.size()); try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) { qp.name = "total number of uniquely identified peptides"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); at = QcMLFile::Attachment(); at.cvRef = "QC"; ///< cv reference at.cvAcc = "QC:0000038"; at.qualityRef = msid_ref; at.id = base_name + "_massacc"; ///< Identifier try { const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc); at.name = term.name; ///< Name } catch (...) { at.name = "delta ppm tables"; } //~ delta ppm QC:0000039 RT MZ uniqueness ProteinID MS:1000885 target/decoy Score PeptideSequence MS:1000889 Annots string Similarity Charge UO:0000219 TheoreticalWeight UO:0000221 Oxidation_(M) at.colTypes.push_back("RT"); at.colTypes.push_back("MZ"); at.colTypes.push_back("Score"); at.colTypes.push_back("PeptideSequence"); at.colTypes.push_back("Charge"); at.colTypes.push_back("TheoreticalWeight"); at.colTypes.push_back("delta_ppm"); for (UInt w = 0; w < var_mods.size(); ++w) { at.colTypes.push_back(String(var_mods[w]).substitute(' ', '_')); } std::vector<double> deltas; //~ prot_ids[0].getSearchParameters(); for (vector<PeptideIdentification>::iterator it = pep_ids.begin(); it != pep_ids.end(); ++it) { if (it->getHits().size() > 0) { std::vector<String> row; row.push_back(it->getRT()); row.push_back(it->getMZ()); PeptideHit tmp = it->getHits().front(); //TODO depends on score & sort vector<UInt> pep_mods; for (UInt w = 0; w < var_mods.size(); ++w) { pep_mods.push_back(0); } for (AASequence::ConstIterator z = tmp.getSequence().begin(); z != tmp.getSequence().end(); ++z) { Residue res = *z; String temp; if (res.getModification().size() > 0 && res.getModification() != "Carbamidomethyl") { temp = res.getModification() + " (" + res.getOneLetterCode() + ")"; //cout<<res.getModification()<<endl; for (UInt w = 0; w < var_mods.size(); ++w) { if (temp == var_mods[w]) { //cout<<temp; pep_mods[w] += 1; } } } } row.push_back(tmp.getScore()); row.push_back(tmp.getSequence().toString().removeWhitespaces()); row.push_back(tmp.getCharge()); row.push_back(String((tmp.getSequence().getMonoWeight() + tmp.getCharge() * Constants::PROTON_MASS_U) / tmp.getCharge())); double dppm = /* std::abs */ (getMassDifference(((tmp.getSequence().getMonoWeight() + tmp.getCharge() * Constants::PROTON_MASS_U) / tmp.getCharge()), it->getMZ(), true)); row.push_back(String(dppm)); deltas.push_back(dppm); for (UInt w = 0; w < var_mods.size(); ++w) { row.push_back(pep_mods[w]); } at.tableRows.push_back(row); } } qcmlfile.addRunAttachment(base_name, at); qp = QcMLFile::QualityParameter(); qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000040"; ///< cv accession qp.id = base_name + "_mean_delta"; ///< Identifier qp.value = String(OpenMS::Math::mean(deltas.begin(), deltas.end())); try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) 
{ qp.name = "mean delta ppm"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); qp = QcMLFile::QualityParameter(); qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000041"; ///< cv accession qp.id = base_name + "_median_delta"; ///< Identifier qp.value = String(OpenMS::Math::median(deltas.begin(), deltas.end(), false)); try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) { qp.name = "median delta ppm"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); qp = QcMLFile::QualityParameter(); qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000035"; ///< cv accession qp.id = base_name + "_ratio_id"; ///< Identifier qp.value = String(double(pep_ids.size()) / double(mslevelcounts[2])); try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) { qp.name = "id ratio"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); } //------------------------------------------------------------- // MS quantitation //------------------------------------------------------------ FeatureMap map; String msqu_ref = base_name + "_msqu"; if (inputfile_feature != "") { FeatureXMLFile f; f.load(inputfile_feature, map); cout << "Read featureXML file..." << endl; //~ UInt fiter = 0; map.sortByRT(); map.updateRanges(); qp = QcMLFile::QualityParameter(); qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000045"; ///< cv accession qp.id = msqu_ref; ///< Identifier try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) { qp.name = "MS quantification result details"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); qp = QcMLFile::QualityParameter(); qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:0000046"; ///< cv accession qp.id = base_name + "_feature_count"; ///< Identifier qp.value = String(map.size()); try { const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc); qp.name = term.name; ///< Name } catch (...) { qp.name = "number of features"; ///< Name } qcmlfile.addRunQualityParameter(base_name, qp); } if (inputfile_feature != "" && !remove_duplicate_features) { QcMLFile::Attachment at; at = QcMLFile::Attachment(); at.cvRef = "QC"; ///< cv reference at.cvAcc = "QC:0000047"; at.qualityRef = msqu_ref; at.id = base_name + "_features"; ///< Identifier try { const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc); at.name = term.name; ///< Name } catch (...) 
{ at.name = "features"; ///< Name } at.colTypes.push_back("MZ"); at.colTypes.push_back("RT"); at.colTypes.push_back("Intensity"); at.colTypes.push_back("Charge"); at.colTypes.push_back("Quality"); at.colTypes.push_back("FWHM"); at.colTypes.push_back("IDs"); UInt fiter = 0; map.sortByRT(); //ofstream out(outputfile_name.c_str()); while (fiter < map.size()) { std::vector<String> row; row.push_back(map[fiter].getMZ()); row.push_back(map[fiter].getRT()); row.push_back(map[fiter].getIntensity()); row.push_back(map[fiter].getCharge()); row.push_back(map[fiter].getOverallQuality()); row.push_back(map[fiter].getWidth()); row.push_back(map[fiter].getPeptideIdentifications().size()); fiter++; at.tableRows.push_back(row); } qcmlfile.addRunAttachment(base_name, at); } else if (inputfile_feature != "" && remove_duplicate_features) { QcMLFile::Attachment at; at = QcMLFile::Attachment(); at.cvRef = "QC"; ///< cv reference at.cvAcc = "QC:0000047"; at.qualityRef = msqu_ref; at.id = base_name + "_features"; ///< Identifier try { const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc); at.name = term.name; ///< Name } catch (...) { at.name = "features"; ///< Name } at.colTypes.push_back("MZ"); at.colTypes.push_back("RT"); at.colTypes.push_back("Intensity"); at.colTypes.push_back("Charge"); FeatureMap map, map_out; FeatureXMLFile f; f.load(inputfile_feature, map); UInt fiter = 0; map.sortByRT(); while (fiter < map.size()) { FeatureMap map_tmp; for (UInt k = fiter; k <= map.size(); ++k) { if (abs(map[fiter].getRT() - map[k].getRT()) < 0.1) { //~ cout << fiter << endl; map_tmp.push_back(map[k]); } else { fiter = k; break; } } map_tmp.sortByMZ(); UInt retif = 1; map_out.push_back(map_tmp[0]); while (retif < map_tmp.size()) { if (abs(map_tmp[retif].getMZ() - map_tmp[retif - 1].getMZ()) > 0.01) { cout << "equal RT, but mass different" << endl; map_out.push_back(map_tmp[retif]); } retif++; } } qcmlfile.addRunAttachment(base_name, at); } if (inputfile_consensus != "") { cout << "Reading consensusXML file..." 
<< endl; ConsensusXMLFile f; ConsensusMap map; f.load(inputfile_consensus, map); //~ String CONSENSUS_NAME = "_consensus.tsv"; //~ String combined_out = outputfile_name + CONSENSUS_NAME; //~ ofstream out(combined_out.c_str()); at = QcMLFile::Attachment(); qp.name = "consensuspoints"; ///< Name //~ qp.id = base_name + "_consensuses"; ///< Identifier qp.cvRef = "QC"; ///< cv reference qp.cvAcc = "QC:xxxxxxxx"; ///< cv accession "featuremapper results" at.colTypes.push_back("Native_spectrum_ID"); at.colTypes.push_back("DECON_RT_(sec)"); at.colTypes.push_back("DECON_MZ_(Th)"); at.colTypes.push_back("DECON_Intensity"); at.colTypes.push_back("Feature_RT_(sec)"); at.colTypes.push_back("Feature_MZ_(Th)"); at.colTypes.push_back("Feature_Intensity"); at.colTypes.push_back("Feature_Charge"); for (ConsensusMap::const_iterator cmit = map.begin(); cmit != map.end(); ++cmit) { const ConsensusFeature& CF = *cmit; for (ConsensusFeature::const_iterator cfit = CF.begin(); cfit != CF.end(); ++cfit) { std::vector<String> row; FeatureHandle FH = *cfit; row.push_back(CF.getMetaValue("spectrum_native_id")); row.push_back(CF.getRT()); row.push_back(CF.getMZ()); row.push_back(CF.getIntensity()); row.push_back(FH.getRT()); row.push_back(FH.getMZ()); row.push_back(FH.getCharge()); at.tableRows.push_back(row); } } qcmlfile.addRunAttachment(base_name, at); } //------------------------------------------------------------- // finalize //------------------------------------------------------------ qcmlfile.store(outputfile_name); return EXECUTION_OK; }
void LabeledPairFinder::run(const vector<ConsensusMap>& input_maps, ConsensusMap& result_map) { if (input_maps.size() != 1) throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "exactly one input map required"); if (result_map.getFileDescriptions().size() != 2) throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "two file descriptions required"); if (result_map.getFileDescriptions().begin()->second.filename != result_map.getFileDescriptions().rbegin()->second.filename) throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the two file descriptions have to contain the same file name"); checkIds_(input_maps); //look up the light and heavy index Size light_index = numeric_limits<Size>::max(); Size heavy_index = numeric_limits<Size>::max(); for (ConsensusMap::FileDescriptions::const_iterator it = result_map.getFileDescriptions().begin(); it != result_map.getFileDescriptions().end(); ++it) { if (it->second.label == "heavy") { heavy_index = it->first; } else if (it->second.label == "light") { light_index = it->first; } } if (light_index == numeric_limits<Size>::max() || heavy_index == numeric_limits<Size>::max()) { throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the input maps have to be labeled 'light' and 'heavy'"); } result_map.clear(false); // sort consensus features by RT (and MZ) to speed up searching afterwards typedef ConstRefVector<ConsensusMap> RefMap; RefMap model_ref(input_maps[0].begin(), input_maps[0].end()); model_ref.sortByPosition(); //calculate matches ConsensusMap matches; //settings double rt_pair_dist = param_.getValue("rt_pair_dist"); double rt_dev_low = param_.getValue("rt_dev_low"); double rt_dev_high = param_.getValue("rt_dev_high"); double mz_dev = param_.getValue("mz_dev"); DoubleList mz_pair_dists = param_.getValue("mz_pair_dists"); bool mrm = param_.getValue("mrm").toBool(); //estimate RT parameters if (param_.getValue("rt_estimate") == "true") { //find all possible RT distances of features with the same charge and a good m/z distance vector<double> dists; dists.reserve(model_ref.size()); for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end(); ++it) { for (RefMap::const_iterator it2 = model_ref.begin(); it2 != model_ref.end(); ++it2) { for (DoubleList::const_iterator dist_it = mz_pair_dists.begin(); dist_it != mz_pair_dists.end(); ++dist_it) { double mz_pair_dist = *dist_it; if (it2->getCharge() == it->getCharge() && it2->getMZ() >= it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev && it2->getMZ() <= it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev) { dists.push_back(it2->getRT() - it->getRT()); } } } } if (dists.empty()) { cout << "Warning: Could not find pairs for RT distance estimation. The manual settings are used!" << endl; } else { if (dists.size() < 50) { cout << "Warning: Found only " << dists.size() << " pairs. The estimated shift and std deviation are probably not reliable!" 
<< endl; } //--------------------------- estimate initial parameters of fit --------------------------- GaussFitter::GaussFitResult result(-1, -1, -1); //first estimate of the optimal shift: median of the distances sort(dists.begin(), dists.end()); Size median_index = dists.size() / 2; result.x0 = dists[median_index]; //create histogram of distances //consider only the maximum of pairs, centered around the optimal shift Size max_pairs = model_ref.size() / 2; Size start_index = (Size) max((SignedSize)0, (SignedSize)(median_index - max_pairs / 2)); Size end_index = (Size) min((SignedSize)(dists.size() - 1), (SignedSize)(median_index + max_pairs / 2)); double start_value = dists[start_index]; double end_value = dists[end_index]; double bin_step = fabs(end_value - start_value) / 99.999; //ensure that we have 100 bins Math::Histogram<> hist(start_value, end_value, bin_step); //std::cout << "HIST from " << start_value << " to " << end_value << " (bin size " << bin_step << ")" << endl; for (Size i = start_index; i <= end_index; ++i) { hist.inc(dists[i]); } //cout << hist << endl; dists.clear(); //determine median of bins (uniform background distribution) vector<Size> bins(hist.begin(), hist.end()); sort(bins.begin(), bins.end()); Size bin_median = bins[bins.size() / 2]; bins.clear(); //estimate scale A: maximum of the histogram Size max_value = hist.maxValue(); result.A = max_value - bin_median; //overwrite estimate of x0 with the position of the highest bin for (Size i = 0; i < hist.size(); ++i) { if (hist[i] == max_value) { result.x0 = hist.centerOfBin(i); break; } } //estimate sigma: first position where the count drops to or below the median count of the histogram double pos = result.x0; while (pos > start_value && hist.binValue(pos) > bin_median) { pos -= bin_step; } double sigma_low = result.x0 - pos; pos = result.x0; while (pos < end_value && hist.binValue(pos) > bin_median) { pos += bin_step; } double sigma_high = pos - result.x0; result.sigma = (sigma_high + sigma_low) / 6.0; //cout << "estimated optimal RT distance (before fit): " << result.x0 << endl; //cout << "estimated allowed deviation (before fit): " << result.sigma*3.0 << endl; //--------------------------- do gauss fit --------------------------- vector<DPosition<2> > points(hist.size()); for (Size i = 0; i < hist.size(); ++i) { points[i][0] = hist.centerOfBin(i); points[i][1] = max(0u, hist[i]); } GaussFitter fitter; fitter.setInitialParameters(result); result = fitter.fit(points); cout << "estimated optimal RT distance: " << result.x0 << endl; cout << "estimated allowed deviation: " << fabs(result.sigma) * 3.0 << endl; rt_pair_dist = result.x0; rt_dev_low = fabs(result.sigma) * 3.0; rt_dev_high = fabs(result.sigma) * 3.0; } } // check each feature for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end(); ++it) { for (DoubleList::const_iterator dist_it = mz_pair_dists.begin(); dist_it != mz_pair_dists.end(); ++dist_it) { double mz_pair_dist = *dist_it; RefMap::const_iterator it2 = lower_bound(model_ref.begin(), model_ref.end(), it->getRT() + rt_pair_dist - rt_dev_low, ConsensusFeature::RTLess()); while (it2 != model_ref.end() && it2->getRT() <= it->getRT() + rt_pair_dist + rt_dev_high) { // if in mrm mode, we need to compare precursor mass difference and fragment mass difference, charge remains the same double prec_mz_diff(0); if (mrm) { prec_mz_diff = fabs((double)it2->getMetaValue("MZ") - (double)it->getMetaValue("MZ")); if (it->getCharge() != 0) { prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist / it->getCharge()); } else {
prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist); } } bool mrm_correct_dist(false); double frag_mz_diff = fabs(it->getMZ() - it2->getMZ()); //cerr << it->getRT() << " charge1=" << it->getCharge() << ", charge2=" << it2->getCharge() << ", prec_diff=" << prec_mz_diff << ", frag_diff=" << frag_mz_diff << endl; if (mrm && it2->getCharge() == it->getCharge() && prec_mz_diff < mz_dev && (frag_mz_diff < mz_dev || fabs(frag_mz_diff - mz_pair_dist) < mz_dev)) { mrm_correct_dist = true; //cerr << "mrm_correct_dist" << endl; } if ((mrm && mrm_correct_dist) || (!mrm && it2->getCharge() == it->getCharge() && it2->getMZ() >= it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev && it2->getMZ() <= it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev )) { //cerr << "dist correct" << endl; double score = sqrt( PValue_(it2->getMZ() - it->getMZ(), mz_pair_dist / it->getCharge(), mz_dev, mz_dev) * PValue_(it2->getRT() - it->getRT(), rt_pair_dist, rt_dev_low, rt_dev_high) ); // Note: we used to copy the id from the light feature here, but that strategy does not generalize to more than two labels. // We might want to report consensus features where the light one is missing but more than one heavier variant was found. // Also, the old strategy is inconsistent with what was done in the unlabeled case. Thus now we assign a new unique id here. matches.push_back(ConsensusFeature()); matches.back().setUniqueId(); matches.back().insert(light_index, *it); matches.back().clearMetaInfo(); matches.back().insert(heavy_index, *it2); matches.back().setQuality(score); matches.back().setCharge(it->getCharge()); matches.back().computeMonoisotopicConsensus(); } ++it2; } } } //compute best pairs // - sort matches by quality // - take highest-quality matches first (greedy) and mark them as used set<Size> used_features; matches.sortByQuality(true); for (ConsensusMap::const_iterator match = matches.begin(); match != matches.end(); ++match) { //check if features are not used yet if (used_features.find(match->begin()->getUniqueId()) == used_features.end() && used_features.find(match->rbegin()->getUniqueId()) == used_features.end() ) { //if unused, add it to the final set of elements result_map.push_back(*match); used_features.insert(match->begin()->getUniqueId()); used_features.insert(match->rbegin()->getUniqueId()); } } //Add protein identifications to result map for (Size i = 0; i < input_maps.size(); ++i) { result_map.getProteinIdentifications().insert(result_map.getProteinIdentifications().end(), input_maps[i].getProteinIdentifications().begin(), input_maps[i].getProteinIdentifications().end()); } //Add unassigned peptide identifications to result map for (Size i = 0; i < input_maps.size(); ++i) { result_map.getUnassignedPeptideIdentifications().insert(result_map.getUnassignedPeptideIdentifications().end(), input_maps[i].getUnassignedPeptideIdentifications().begin(), input_maps[i].getUnassignedPeptideIdentifications().end()); } // Very useful for checking the results, and the ids have no real meaning anyway result_map.sortByMZ(); }
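// ---------------------------------------------------------------------------
// Illustrative sketch (added commentary, not OpenMS API): the pairing above is
// a greedy maximum-quality matching -- sort all candidate pairs by descending
// quality, then accept a pair only if neither partner has been used yet. A
// stand-alone version over hypothetical integer feature ids:

#include <algorithm>
#include <cstddef>
#include <set>
#include <vector>

struct Candidate
{
  std::size_t light_id, heavy_id; // unique feature ids
  double quality;
};

static bool qualityGreater(const Candidate& a, const Candidate& b)
{
  return a.quality > b.quality;
}

std::vector<Candidate> greedyPairing(std::vector<Candidate> candidates)
{
  std::sort(candidates.begin(), candidates.end(), qualityGreater);
  std::set<std::size_t> used;
  std::vector<Candidate> accepted;
  for (std::vector<Candidate>::const_iterator it = candidates.begin(); it != candidates.end(); ++it)
  {
    if (used.count(it->light_id) || used.count(it->heavy_id)) continue; // a partner is already taken
    accepted.push_back(*it);
    used.insert(it->light_id);
    used.insert(it->heavy_id);
  }
  return accepted;
}
// ---------------------------------------------------------------------------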
ExitCodes main_(int, const char**) { //------------------------------------------------------------- // parameter handling //------------------------------------------------------------- //file list StringList file_list = getStringList_("in"); //file type FileHandler fh; FileTypes::Type force_type; if (getStringOption_("in_type").size() > 0) { force_type = FileTypes::nameToType(getStringOption_("in_type")); } else { force_type = fh.getType(file_list[0]); } //output file names and types String out_file = getStringOption_("out"); //------------------------------------------------------------- // calculations //------------------------------------------------------------- bool annotate_file_origin = getFlag_("annotate_file_origin"); if (force_type == FileTypes::FEATUREXML) { FeatureMap<> out; for (Size i = 0; i < file_list.size(); ++i) { FeatureMap<> map; FeatureXMLFile fh; fh.load(file_list[i], map); if (annotate_file_origin) { for (FeatureMap<>::iterator it = map.begin(); it != map.end(); ++it) { it->setMetaValue("file_origin", DataValue(file_list[i])); } } out += map; } //------------------------------------------------------------- // writing output //------------------------------------------------------------- //annotate output with data processing info addDataProcessing_(out, getProcessingInfo_(DataProcessing::FORMAT_CONVERSION)); FeatureXMLFile f; f.store(out_file, out); } else if (force_type == FileTypes::CONSENSUSXML) { ConsensusMap out; ConsensusXMLFile fh; fh.load(file_list[0], out); //skip first file for (Size i = 1; i < file_list.size(); ++i) { ConsensusMap map; ConsensusXMLFile fh; fh.load(file_list[i], map); if (annotate_file_origin) { for (ConsensusMap::iterator it = map.begin(); it != map.end(); ++it) { it->setMetaValue("file_origin", DataValue(file_list[i])); } } out += map; } //------------------------------------------------------------- // writing output //------------------------------------------------------------- //annotate output with data processing info addDataProcessing_(out, getProcessingInfo_(DataProcessing::FORMAT_CONVERSION)); ConsensusXMLFile f; f.store(out_file, out); } else if (force_type == FileTypes::TRAML) { TargetedExperiment out; for (Size i = 0; i < file_list.size(); ++i) { TargetedExperiment map; TraMLFile fh; fh.load(file_list[i], map); out += map; } //------------------------------------------------------------- // writing output //------------------------------------------------------------- //annotate output with data processing info Software software; software.setName("FileMerger"); software.setVersion(VersionInfo::getVersion()); out.addSoftware(software); TraMLFile f; f.store(out_file, out); } else { // we might want to combine different types, thus we only // query in_type (which applies to all files) // and not the suffix or content of a single file force_type = FileTypes::nameToType(getStringOption_("in_type")); //rt bool rt_auto_number = getFlag_("raw:rt_auto"); bool rt_filename = getFlag_("raw:rt_filename"); bool rt_custom = false; DoubleList custom_rts = getDoubleList_("raw:rt_custom"); if (custom_rts.size() != 0) { rt_custom = true; if (custom_rts.size() != file_list.size()) { writeLog_("Custom retention time list must have as many elements as there are input files!"); printUsage_(); return ILLEGAL_PARAMETERS; } } //ms level bool user_ms_level = getFlag_("raw:user_ms_level"); MSExperiment<> out; out.reserve(file_list.size()); UInt rt_auto = 0; UInt native_id = 0; std::vector<MSChromatogram<ChromatogramPeak> > all_chromatograms; for 
(Size i = 0; i < file_list.size(); ++i) { String filename = file_list[i]; //load file MSExperiment<> in; fh.loadExperiment(filename, in, force_type, log_type_); if (in.empty() && in.getChromatograms().empty()) { writeLog_(String("Warning: Empty file '") + filename + "'!"); continue; } out.reserve(out.size() + in.size()); //warn if custom RT and more than one scan in input file if (rt_custom && in.size() > 1) { writeLog_(String("Warning: More than one scan in file '") + filename + "'! All scans will have the same retention time!"); } for (MSExperiment<>::const_iterator it2 = in.begin(); it2 != in.end(); ++it2) { //handle rt Real rt_final = it2->getRT(); if (rt_auto_number) { rt_final = ++rt_auto; } else if (rt_custom) { rt_final = custom_rts[i]; } else if (rt_filename) { if (!filename.hasSubstring("rt")) { writeLog_(String("Warning: cannot guess retention time from filename as it does not contain 'rt'")); } for (Size pos = 0; pos < filename.size(); ++pos) // 'pos', to avoid shadowing the file index 'i' { if (filename[pos] == 'r' && ++pos != filename.size() && filename[pos] == 't' && ++pos != filename.size() && isdigit(filename[pos])) { String rt; while (pos != filename.size() && (filename[pos] == '.' || isdigit(filename[pos]))) { rt += filename[pos++]; } if (rt.size() > 0) { // remove the trailing dot from e.g. rt3892.98.dta if (rt[rt.size() - 1] == '.') { // remove last character rt.erase(rt.end() - 1); } } try { float tmp = rt.toFloat(); rt_final = tmp; } catch (Exception::ConversionError&) { writeLog_(String("Warning: cannot convert the retention time '" + rt + "' found in the filename to a number.")); } } } } // none of the rt methods were successful if (rt_final == -1) { writeLog_(String("Warning: No valid retention time for output scan '") + rt_auto + "' from file '" + filename + "'"); } out.addSpectrum(*it2); out.getSpectra().back().setRT(rt_final); out.getSpectra().back().setNativeID(native_id); if (user_ms_level) { out.getSpectra().back().setMSLevel((int)getIntOption_("raw:ms_level")); } ++native_id; } // if we had only one spectrum, we can annotate it directly; for more spectra, we just name the source file and leave the spectra unannotated (to avoid a long and redundant list of sourceFiles) if (in.size() == 1) { out.getSpectra().back().setSourceFile(in.getSourceFiles()[0]); in.getSourceFiles().clear(); // remove the source file annotation from the experiment (it's in the spectrum anyway) } // copy experimental settings from first file if (i == 0) { out.ExperimentalSettings::operator=(in); } else // otherwise append { out.getSourceFiles().insert(out.getSourceFiles().end(), in.getSourceFiles().begin(), in.getSourceFiles().end()); // could be empty if the spectrum was annotated above, but that's ok then } // also add the chromatograms for (std::vector<MSChromatogram<ChromatogramPeak> >::const_iterator it2 = in.getChromatograms().begin(); it2 != in.getChromatograms().end(); ++it2) { all_chromatograms.push_back(*it2); } } // set the chromatograms out.setChromatograms(all_chromatograms); //------------------------------------------------------------- // writing output //------------------------------------------------------------- //annotate output with data processing info addDataProcessing_(out, getProcessingInfo_(DataProcessing::FORMAT_CONVERSION)); MzMLFile f; f.setLogType(log_type_); f.store(out_file, out); } return EXECUTION_OK; }
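// ---------------------------------------------------------------------------
// Illustrative sketch (added commentary, not the tool's code): the
// "raw:rt_filename" branch above scans the filename for "rt" followed by
// digits, e.g. "sample_rt3892.98.dta" -> 3892.98. The same parse using only
// the standard library; the function name is hypothetical.

#include <cctype>
#include <cstdlib>
#include <string>

bool parseRTFromFilename(const std::string& filename, double& rt_out)
{
  for (std::size_t i = 0; i + 2 < filename.size(); ++i)
  {
    if (filename[i] == 'r' && filename[i + 1] == 't' && std::isdigit((unsigned char)filename[i + 2]))
    {
      std::string num;
      std::size_t j = i + 2;
      while (j < filename.size() && (filename[j] == '.' || std::isdigit((unsigned char)filename[j]))) num += filename[j++];
      while (!num.empty() && num[num.size() - 1] == '.') num.erase(num.size() - 1); // drop the trailing dot of e.g. "3892.98."
      if (!num.empty()) { rt_out = std::atof(num.c_str()); return true; } // num starts with a digit, so atof parses it
      return false;
    }
  }
  return false;
}
// ---------------------------------------------------------------------------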
void PoseClusteringShiftSuperimposer::run(const ConsensusMap & map_model, const ConsensusMap & map_scene, TransformationDescription & transformation) { typedef ConstRefVector<ConsensusMap> PeakPointerArray_; typedef Math::LinearInterpolation<double, double> LinearInterpolationType_; LinearInterpolationType_ shift_hash_; // OLD STUFF // LinearInterpolationType_ scaling_hash_1; // LinearInterpolationType_ scaling_hash_2; // LinearInterpolationType_ shift_hash_; // LinearInterpolationType_ rt_high_hash_; /// Maximum deviation in mz of two partner points const double mz_pair_max_distance = param_.getValue("mz_pair_max_distance"); /// Size of each shift bucket const double shift_bucket_size = param_.getValue("shift_bucket_size"); const UInt struc_elem_length_datapoints = 21; // MAGIC ALERT: number of data points in structuring element for tophat filter, which removes baseline from histogram const double scaling_histogram_crossing_slope = 3.0; // MAGIC ALERT: used when distinguishing noise level and enriched histogram bins const double scaling_cutoff_stdev_multiplier = 1.5; // MAGIC ALERT: multiplier for stdev in cutoff for outliers const UInt loops_mean_stdev_cutoff = 3; // MAGIC ALERT: number of loops in stdev cutoff for outliers startProgress(0, 100, "shift pose clustering"); UInt actual_progress = 0; setProgress(++actual_progress); // Optionally, we will write dumps of the hash table buckets. bool do_dump_buckets = false; String dump_buckets_basename; if (param_.getValue("dump_buckets") != "") { do_dump_buckets = true; dump_buckets_basename = param_.getValue("dump_buckets"); } setProgress(++actual_progress); // Even more optionally, we will write dumps of the hashed pairs. bool do_dump_pairs = false; String dump_pairs_basename; if (param_.getValue("dump_pairs") != "") { do_dump_pairs = true; dump_pairs_basename = param_.getValue("dump_pairs"); } setProgress(++actual_progress); //************************************************************************** // Select the most abundant data points only. After that, disallow modifications // (we tend to have annoying issues with const_iterator versus iterator). PeakPointerArray_ model_map_ini(map_model.begin(), map_model.end()); const PeakPointerArray_ & model_map(model_map_ini); PeakPointerArray_ scene_map_ini(map_scene.begin(), map_scene.end()); const PeakPointerArray_ & scene_map(scene_map_ini); { // truncate the data as necessary // casting to SignedSize is done on PURPOSE here! (num_used_points will be maximal if -1 is used) const Size num_used_points = (SignedSize) param_.getValue("num_used_points"); if (model_map_ini.size() > num_used_points) { model_map_ini.sortByIntensity(true); model_map_ini.resize(num_used_points); } model_map_ini.sortByComparator(Peak2D::MZLess()); setProgress(++actual_progress); if (scene_map_ini.size() > num_used_points) { scene_map_ini.sortByIntensity(true); scene_map_ini.resize(num_used_points); } scene_map_ini.sortByComparator(Peak2D::MZLess()); setProgress(++actual_progress); // Note: model_map_ini and scene_map_ini will not be used further below } setProgress((actual_progress = 10)); //************************************************************************** // Preprocessing // get RT ranges (NOTE: we trust that min and max have been updated in the // ConsensusMap::convert() method !) 
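// Note added for clarity (commentary only, not original code): the block below
// clamps the user-supplied "max_shift" to what the data can support. If the
// model map spans [model_low, model_high] in RT and the scene map spans
// [scene_low, scene_high], no shift larger than
//   min(|model_high - scene_low|, |model_low - scene_high|)
// can make the two RT ranges overlap, so the shift histogram needs only about
//   2 * ceil(max_shift / shift_bucket_size) + 9
// buckets.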
const double model_low = map_model.getMin()[ConsensusFeature::RT]; const double scene_low = map_scene.getMin()[ConsensusFeature::RT]; const double model_high = map_model.getMax()[ConsensusFeature::RT]; const double scene_high = map_scene.getMax()[ConsensusFeature::RT]; // OLD STUFF // const double rt_low = (maps[0].getMin()[ConsensusFeature::RT] + maps[1].getMin()[ConsensusFeature::RT]) / 2.; // const double rt_high = (maps[0].getMax()[ConsensusFeature::RT] + maps[1].getMax()[ConsensusFeature::RT]) / 2.; // Initialize the hash tables: shift_hash_ // OLD STUFF: was: rt_scaling_hash_, rt_low_hash_, and rt_high_hash_ { // (over)estimate the required number of buckets for shifting double max_shift = param_.getValue("max_shift"); // actually the largest possible shift can be much smaller, depending on the data do { if (max_shift < 0) max_shift = -max_shift; // ...ml@@@mh........ , ........ml@@@mh... // ........sl@@@sh... , ...sl@@@sh........ double diff; diff = model_high - scene_low; if (diff < 0) diff = -diff; if (max_shift > diff) max_shift = diff; diff = model_low - scene_high; if (diff < 0) diff = -diff; if (max_shift > diff) max_shift = diff; } while (0); const Int shift_buckets_num_half = 4 + (Int) ceil((max_shift) / shift_bucket_size); const Int shift_buckets_num = 1 + 2 * shift_buckets_num_half; shift_hash_.getData().clear(); shift_hash_.getData().resize(shift_buckets_num); shift_hash_.setMapping(shift_bucket_size, shift_buckets_num_half, 0); } setProgress(++actual_progress); //************************************************************************** // compute the ratio of the total intensities of both maps, for normalization double total_intensity_ratio; do { double total_int_model_map = 0; for (Size i = 0; i < model_map.size(); ++i) { total_int_model_map += model_map[i].getIntensity(); } setProgress(++actual_progress); double total_int_scene_map = 0; for (Size i = 0; i < scene_map.size(); ++i) { total_int_scene_map += scene_map[i].getIntensity(); } setProgress(++actual_progress); // ... and finally ... total_intensity_ratio = total_int_model_map / total_int_scene_map; } while (0); // (the extra syntax helps with code folding in eclipse!) setProgress((actual_progress = 20)); /// The serial number is incremented for each invocation of this, to avoid overwriting of hash table dumps. static Int dump_buckets_serial = 0; ++dump_buckets_serial; //************************************************************************** // Hashing // Compute the transformations between each point pair in the model map // and each point pair in the scene map and hash the shift // transformation. // To speed up the calculation of the final transformation, we confine the number of // considered point pairs. We match a point p in the model map only onto those points p' // in the scene map that lie in a certain mz interval. Size const model_map_size = model_map.size(); // i /* OLD STUFF: also: j */ Size const scene_map_size = scene_map.size(); // k /* OLD STUFF: also: l */ const double winlength_factor_baseline = 0.1; // MAGIC ALERT: Each window is given unit weight. If there are too many pairs for a window, the individual contributions will be very small, but running time will be high, so we provide a cutoff for this. Typically this will exclude compounds which elute over the whole retention time range from consideration. /////////////////////////////////////////////////////////////////// // Hashing: Estimate the shift do // begin of hashing (the extra syntax helps with code folding in eclipse!) 
{ String dump_pairs_filename; std::ofstream dump_pairs_file; if (do_dump_pairs) { dump_pairs_filename = dump_pairs_basename + String(dump_buckets_serial); dump_pairs_file.open(dump_pairs_filename.c_str()); dump_pairs_file << "#" << ' ' << "i" << ' ' << "k" << std::endl; } setProgress(++actual_progress); // first point in model map for (Size i = 0, i_low = 0, i_high = 0, k_low = 0, k_high = 0; i + 1 < model_map_size; ++i) // 'i + 1 < size' avoids unsigned underflow for empty maps { setProgress(actual_progress + float(i) / model_map_size * 10.f); // Adjust window around i in model map while (i_low < model_map_size && model_map[i_low].getMZ() < model_map[i].getMZ() - mz_pair_max_distance) ++i_low; while (i_high < model_map_size && model_map[i_high].getMZ() <= model_map[i].getMZ() + mz_pair_max_distance) ++i_high; double i_winlength_factor = 1. / (i_high - i_low); i_winlength_factor -= winlength_factor_baseline; if (i_winlength_factor <= 0) continue; // Adjust window around k in scene map while (k_low < scene_map_size && scene_map[k_low].getMZ() < model_map[i].getMZ() - mz_pair_max_distance) ++k_low; while (k_high < scene_map_size && scene_map[k_high].getMZ() <= model_map[i].getMZ() + mz_pair_max_distance) ++k_high; // first point in scene map for (Size k = k_low; k < k_high; ++k) { double k_winlength_factor = 1. / (k_high - k_low); k_winlength_factor -= winlength_factor_baseline; if (k_winlength_factor <= 0) continue; // compute similarity of the intensities of i and k double similarity_ik; { const double int_i = model_map[i].getIntensity(); const double int_k = scene_map[k].getIntensity() * total_intensity_ratio; similarity_ik = (int_i < int_k) ? int_i / int_k : int_k / int_i; // weight is inversely proportional to the number of elements with similar mz similarity_ik *= i_winlength_factor; similarity_ik *= k_winlength_factor; // VV_(int_i<<' '<<int_k<<' '<<int_similarity_ik); } // compute the transformation (i) -> (k) double shift = model_map[i].getRT() - scene_map[k].getRT(); // hash the shift, weighted by the intensity similarity shift_hash_.addValue(shift, similarity_ik); if (do_dump_pairs) { dump_pairs_file << i << ' ' << model_map[i].getRT() << ' ' << model_map[i].getMZ() << ' ' << k << ' ' << scene_map[k].getRT() << ' ' << scene_map[k].getMZ() << ' ' << similarity_ik << ' ' << std::endl; } } // k } // i } while (0); // end of hashing (the extra syntax helps with code folding in eclipse!)
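// Note added for clarity (commentary only, not original code): the hashing
// loop above is a one-dimensional Hough-style vote. Every point pair (i, k)
// with similar m/z casts a vote for the shift
//   shift = model_rt(i) - scene_rt(k),
// weighted by the similarity of the (normalized) intensities and damped by
// how crowded the two m/z windows are. A minimal sketch of the same
// accumulation, assuming a plain std::map histogram instead of
// Math::LinearInterpolation:
//
//   std::map<int, double> votes; // bucket index -> accumulated weight
//   int bucket = (int) floor(shift / shift_bucket_size + 0.5);
//   votes[bucket] += similarity_ik;
//
// The most-voted region of buckets then yields the consensus RT shift.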
setProgress((actual_progress = 30)); /////////////////////////////////////////////////////////////////// // work on shift_hash_ // double shift_low; // double shift_centroid; // double shift_high; // OLD STUFF // double shift_low; double shift_centroid; // double shift_high; do { UInt filtering_stage = 0; // optionally, dump before filtering String dump_buckets_filename; std::ofstream dump_buckets_file; if (do_dump_buckets) { dump_buckets_filename = dump_buckets_basename + "_" + String(dump_buckets_serial); dump_buckets_file.open(dump_buckets_filename.c_str()); VV_(dump_buckets_filename); dump_buckets_file << "# shift hash table buckets dump ( scale, height ) : " << dump_buckets_filename << std::endl; dump_buckets_file << "# unfiltered hash data\n"; for (Size index = 0; index < shift_hash_.getData().size(); ++index) { const double image = shift_hash_.index2key(index); const double height = shift_hash_.getData()[index]; dump_buckets_file << filtering_stage << '\t' << index << '\t' << image << '\t' << height << '\n'; } dump_buckets_file << '\n'; } ++filtering_stage; setProgress(++actual_progress); // apply tophat filter to histogram MorphologicalFilter morph_filter; Param morph_filter_param; morph_filter_param.setValue("struc_elem_unit", "DataPoints"); morph_filter_param.setValue("struc_elem_length", double(struc_elem_length_datapoints)); morph_filter_param.setValue("method", "tophat"); morph_filter.setParameters(morph_filter_param); LinearInterpolationType_::container_type buffer(shift_hash_.getData().size()); morph_filter.filterRange(shift_hash_.getData().begin(), shift_hash_.getData().end(), buffer.begin()); shift_hash_.getData().swap(buffer); // optionally, dump after filtering if (do_dump_buckets) { dump_buckets_file << "# tophat filtered hash data\n"; for (Size index = 0; index < shift_hash_.getData().size(); ++index) { const double image = shift_hash_.index2key(index); const double height = shift_hash_.getData()[index]; dump_buckets_file << filtering_stage << '\t' << index << '\t' << image << '\t' << height << '\n'; } dump_buckets_file << '\n'; } setProgress(++actual_progress); ++filtering_stage; // compute freq_cutoff using a fancy criterion to distinguish between the noise level of the histogram and enriched histogram bins double freq_cutoff_low; do { { std::copy(shift_hash_.getData().begin(), shift_hash_.getData().end(), buffer.begin()); std::sort(buffer.begin(), buffer.end(), std::greater<double>()); double freq_intercept = shift_hash_.getData().front(); double freq_slope = (shift_hash_.getData().back() - shift_hash_.getData().front()) / double(buffer.size()) / scaling_histogram_crossing_slope; if (!freq_slope || !buffer.size()) { // in fact these conditions are actually impossible, but let's be really sure ;-) freq_cutoff_low = 0; } else { Size index = 1; // not 0 (!) 
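// Note added for clarity (commentary only, not original code): at this point
// "buffer" holds the bin heights sorted in descending order. The loop below
// walks along this sorted curve until it drops below the straight line
//   f(index) = freq_intercept + freq_slope * index,
// i.e. until enriched bins give way to the flat noise background; the last
// height at or above that line becomes freq_cutoff_low.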
while (buffer[index] >= freq_intercept + freq_slope * double(index)) { ++index; } freq_cutoff_low = buffer[--index]; // note that we have index >= 1 } } } while (0); setProgress(++actual_progress); // apply freq_cutoff, setting smaller values to zero for (Size index = 0; index < shift_hash_.getData().size(); ++index) { if (shift_hash_.getData()[index] < freq_cutoff_low) { shift_hash_.getData()[index] = 0; } } setProgress(++actual_progress); // optionally, dump after noise filtering using freq_cutoff if (do_dump_buckets) { dump_buckets_file << "# after freq_cutoff, which is: " << freq_cutoff_low << '\n'; for (Size index = 0; index < shift_hash_.getData().size(); ++index) { const double image = shift_hash_.index2key(index); const double height = shift_hash_.getData()[index]; dump_buckets_file << filtering_stage << '\t' << index << '\t' << image << '\t' << height << '\n'; } dump_buckets_file << '\n'; } setProgress(++actual_progress); // iterative cut-off based on mean and stdev - relies upon scaling_cutoff_stdev_multiplier which is a bit hard to set right. { Math::BasicStatistics<double> statistics; std::vector<double>::const_iterator data_begin = shift_hash_.getData().begin(); const Size data_size = shift_hash_.getData().size(); Size data_range_begin = 0; Size data_range_end = data_size; for (UInt loop = 0; loop < loops_mean_stdev_cutoff; ++loop) // MAGIC ALERT: number of loops { statistics.update(data_begin + data_range_begin, data_begin + data_range_end); double mean = statistics.mean() + data_range_begin; double stdev = sqrt(statistics.variance()); data_range_begin = floor(std::max<double>(mean - scaling_cutoff_stdev_multiplier * stdev, 0)); data_range_end = ceil(std::min<double>(mean + scaling_cutoff_stdev_multiplier * stdev + 1, data_size)); const double outside_mean = shift_hash_.index2key(mean); const double outside_stdev = stdev * shift_hash_.getScale(); // shift_low = (outside_mean - outside_stdev); shift_centroid = (outside_mean); // shift_high = (outside_mean + outside_stdev); if (do_dump_buckets) { dump_buckets_file << "# loop: " << loop << " mean: " << outside_mean << " stdev: " << outside_stdev << " (mean-stdev): " << outside_mean - outside_stdev << " (mean+stdev): " << outside_mean + outside_stdev << " data_range_begin: " << data_range_begin << " data_range_end: " << data_range_end << std::endl; } } setProgress(++actual_progress); } if (do_dump_buckets) { dump_buckets_file << "# EOF" << std::endl; dump_buckets_file.close(); } setProgress(80); } while (0); //************************************************************************************ // Estimate transform // Compute the shifts at the low and high ends by looking at (around) the fullest bins. double intercept; #if 1 // yes of course, use centroids for images of rt_low and rt_high intercept = shift_centroid; #else // ooh, use maximum bins instead (Note: this is a fossil which would disregard most of the above computations! The code is left here for developers/debugging only.) const Size rt_low_max_index = std::distance(shift_hash_.getData().begin(), std::max_element(shift_hash_.getData().begin(), shift_hash_.getData().end())); intercept = shift_hash_.index2key(rt_low_max_index); #endif VV_(intercept); setProgress(++actual_progress); // set trafo { Param params; params.setValue("slope", 1.0); params.setValue("intercept", intercept); TransformationDescription trafo; trafo.fitModel("linear", params); transformation = trafo; } setProgress(++actual_progress); endProgress(); return; } // run()
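// ---------------------------------------------------------------------------
// Illustrative sketch (added commentary, not OpenMS API): after the filtering
// stages, the reported shift is the centroid of the surviving histogram mass
// (shift_centroid above), and the fitted transform is simply rt -> rt + shift,
// i.e. a linear model with slope 1 and the shift as intercept. A stand-alone
// weighted-centroid helper over plain bin heights; all names are hypothetical.

#include <cstddef>
#include <vector>

double weightedCentroid(const std::vector<double>& heights, // filtered bin heights
                        double key_of_first_bin, double bin_size)
{
  double weight_sum = 0.0, weighted_keys = 0.0;
  for (std::size_t i = 0; i < heights.size(); ++i)
  {
    const double key = key_of_first_bin + bin_size * double(i); // analogue of index2key()
    weight_sum += heights[i];
    weighted_keys += heights[i] * key;
  }
  return (weight_sum > 0.0) ? (weighted_keys / weight_sum) : 0.0; // becomes the "intercept"
}
// ---------------------------------------------------------------------------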