static void checkSequence(string seq) {
  Normalizer n;
  bool ok = n.check(seq);
  if (!ok) {
    cerr << "Sequence can contain only numbers 0-9 and lowercase a-z" << endl;
    exit(1);
  }
}
int main() {
  Model model;
  model.AddVariable("X0");
  model.AddParam("gfi");
  Normalizer normalizer;
  normalizer.PutRmCoor("X0", 0.156, 0.5465, 0.41541, 0.214);
  return 0;
}
std::string sequenceToPattern(std::string seq) {
  checkSequence(seq);
  Normalizer n;
  std::string sub = "";
  // Only the trailing 10 symbols are turned into a pattern; any longer prefix is kept verbatim.
  if (seq.length() > 10) {
    sub = seq.substr(0, seq.length() - 10);
    seq = seq.substr(seq.length() - 10);
  }
  std::string pat = n.denorm(submain(n.norm(seq)));
  pat = sub + pat;
  return pat;
}
void testNormalize() {
  BOOST_LOG(rdInfoLog) << "-----------------------\n test normalize" << std::endl;

  Normalizer normalizer;

  // Test sulfoxide normalization.
  std::string smi1 = "CS(C)=O";
  std::shared_ptr<ROMol> m1(SmilesToMol(smi1));
  ROMOL_SPTR normalized(normalizer.normalize(*m1));
  TEST_ASSERT(MolToSmiles(*normalized) == "C[S+](C)[O-]");

  // Normalize sulfone.
  std::string smi2 = "C[S+2]([O-])([O-])C";
  std::shared_ptr<ROMol> m2(SmilesToMol(smi2));
  ROMOL_SPTR normalized2(normalizer.normalize(*m2));
  TEST_ASSERT(MolToSmiles(*normalized2) == "CS(C)(=O)=O");

  BOOST_LOG(rdInfoLog) << "Finished" << std::endl;
}
/*
 * Operates on normalized and localized sequences and patterns;
 * only the alphabet may be modified.
 */
list<string> extendSequence(const list<string>& seq, string pat) {
  list<string> results;
  std::string result;
  boost::regex_constants::syntax_option_type flags = boost::regex_constants::perl;
  boost::regex re;
  boost::smatch what;
  pat = "^" + pat;
  re.assign(pat, flags);
  for (list<string>::const_iterator it = seq.begin(); it != seq.end(); it++) {
    string activeSeq = *it;
    Normalizer n;
    n.norm(activeSeq);
    string input = n.getInput() + "z";
    for (string::size_type i = 0; i < input.length(); i++) {
      // Try each candidate symbol appended to the active sequence against the anchored pattern.
      string ref = activeSeq + input[i] + "~~~~~~~~~~~~~~~~~~";
      DBG cout << "pattern " << pat << " against " << ref << endl;
      try {
        if (regex_search(ref, what, re)) {
          string m = what[0];
          if (m.length() > activeSeq.length()) {
            string bit;
            bit += input[i];
            string ext = activeSeq + n.norm(bit);
            DBG cout << " got something " << ext << endl;
            results.push_back(string(ext));
          }
        }
      } catch (boost::regex_error& regErr) {
        cerr << "regular expression failed: " << regErr.what() << endl;
      }
    }
  }
  return results;
}
ExitCodes main_(int, const char **) {
  //-------------------------------------------------------------
  // parameter handling
  //-------------------------------------------------------------

  // input/output files
  String in(getStringOption_("in"));
  String out(getStringOption_("out"));

  //-------------------------------------------------------------
  // loading input
  //-------------------------------------------------------------
  MSExperiment<> exp;
  MzMLFile f;
  f.setLogType(log_type_);
  f.load(in, exp);

  //-------------------------------------------------------------
  // filter
  //-------------------------------------------------------------
  Param filter_param = getParam_().copy("algorithm:", true);
  writeDebug_("Used filter parameters", filter_param, 3);

  Normalizer filter;
  filter.setParameters(filter_param);
  filter.filterPeakMap(exp);

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------

  // annotate output with data processing info
  addDataProcessing_(exp, getProcessingInfo_(DataProcessing::FILTERING));

  f.store(out, exp);

  return EXECUTION_OK;
}
int main() {
  cout << ">> TEST DATA SET (k=3)" << endl;
  rowvec inX = randu<rowvec>(4);
  mat dataSet = randu<mat>(4, 4);
  ucolvec labels;
  labels << 1 << endr << 3 << endr << 2 << endr << 3 << endr;
  inX.print("inX");
  dataSet.print("dataSet:");
  labels.print("labels:");
  cout << "answer: " << knn(inX, dataSet, labels, 3) << endl << endl;

  cout << ">> RAND DATA SET (k=10)" << endl;
  srand(time(NULL));
  inX = randu<rowvec>(100);
  dataSet = randu<mat>(1000000, 100);
  labels = arma::conv_to<ucolvec>::from(4 * randu<colvec>(1000000));
  cout << "0s: " << count(labels.begin(), labels.end(), 0) << endl;
  cout << "1s: " << count(labels.begin(), labels.end(), 1) << endl;
  cout << "2s: " << count(labels.begin(), labels.end(), 2) << endl;
  cout << "3s: " << count(labels.begin(), labels.end(), 3) << endl;
  cout << "answer: " << knn(inX, dataSet, labels, 10) << endl << endl;

  cout << ">> FILE-BASED DATA SET (k=10)" << endl;
  mat data;
  data.load("datingTestSet2.txt");
  dataSet = data.submat(0, 0, data.n_rows - 1, data.n_cols - 2);
  Normalizer<double> normalizer;
  dataSet = normalizer.normalize(dataSet);
  labels = arma::conv_to<ucolvec>::from(data.submat(0, data.n_cols - 1, data.n_rows - 1, data.n_cols - 1));

  rowvec inX1, inX2, inX3;
  inX1 << 58732 << 2.454285 << 0.222380 << endr;
  inX2 << 6121 << 8.339588 << 1.443357 << endr;
  inX3 << 36800 << 12.45 << 0.64 << endr;
  cout << "example from the 1st class:" << inX1
       << "answer: " << knn(normalizer.normalize(inX1), dataSet, labels, 10) << endl << endl;
  cout << "example from the 2nd class:" << inX2
       << "answer: " << knn(normalizer.normalize(inX2), dataSet, labels, 10) << endl << endl;
  cout << "example from the 3rd class:" << inX3
       << "answer: " << knn(normalizer.normalize(inX3), dataSet, labels, 10) << endl << endl;

  return 0;
}
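/*
 * One plausible implementation of the Normalizer<T> used in the kNN example
 * above (an assumption: the real class is not shown in this excerpt).  It
 * min-max scales the columns of the training matrix and reuses the stored
 * column minima and ranges for query rows, which is why the example scales
 * inX1..inX3 with the same normalizer instance that scaled dataSet.  The
 * class name MinMaxNormalizer is illustrative only.
 */
#include <armadillo>

template <typename T>
class MinMaxNormalizer {
public:
  // Fit on the training matrix and return its scaled copy.
  arma::Mat<T> normalize(const arma::Mat<T>& X) {
    mins_ = arma::min(X, 0);
    ranges_ = arma::max(X, 0) - mins_;
    ranges_.replace(T(0), T(1));   // guard against constant columns
    arma::Mat<T> Y = X;
    Y.each_row() -= mins_;
    Y.each_row() /= ranges_;       // element-wise division in Armadillo
    return Y;
  }
  // Scale a single query row with the statistics stored during the first call.
  arma::Row<T> normalize(const arma::Row<T>& x) const {
    return (x - mins_) / ranges_;
  }
private:
  arma::Row<T> mins_, ranges_;
};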
vector<int> extendSequence(const vector<int>& seq, int ct) {
  vector<int> result;
  Normalizer n;
  int at = 0;
  // Map each distinct integer unit to a single lowercase character and back.
  map<int,char> lower;
  map<char,int> revLower;
  for (vector<int>::const_iterator it = seq.begin(); it != seq.end(); it++) {
    int v = *it;
    if (lower.find(v) == lower.end()) {
      char ch = n.toChar(at);
      if (ch == -1) {
        printf("Too many units in symbolic sequence\n");
      }
      lower[v] = ch;
      revLower[ch] = v;
      at++;
    }
  }
  if (at > 10) {
    // alternate mode: just look for the best prior match in the history
    int hist = seq.size() - 1;
    for (int i = 0; i < ct; i++) {
      int bestHits = -1;
      int bestAt = 0;
      for (int j = 0; j < (int)seq.size(); j++) {
        int hits = 0;
        for (int k = 0; k <= hist; k++) {
          if (j - k - 1 < 0) {
            break;
          }
          if (seq[j - k - 1] != seq[hist - k]) {
            break;
          }
          hits++;
        }
        if (hits > bestHits) {
          bestAt = j;
          bestHits = hits;
        }
      }
      hist = bestAt;
      result.push_back(seq[hist]);
    }
    cerr << "Did alternative search" << endl;
    cerr << "Got: ";
    for (size_t i = 0; i < result.size(); i++) {
      cerr << result[i] << " ";
    }
    cerr << endl;
    return result;
  }
  // Regular mode: work on the character encoding of the sequence.
  string strSeq;
  for (vector<int>::const_iterator it = seq.begin(); it != seq.end(); it++) {
    strSeq += lower[*it];
  }
  string pat = sequenceToPattern(strSeq);
  string ext = extendSequence(strSeq, pat, ct);
  // Translate the extension back to integers; unseen characters get fresh negative codes.
  map<char,int> novel;
  int novelAt = 1;
  for (size_t i = 0; i < ext.length(); i++) {
    char ch = ext[i];
    int v = 0;
    if (revLower.find(ch) == revLower.end()) {
      if (novel.find(ch) == novel.end()) {
        novel[ch] = novelAt;
        novelAt++;
      }
      v = -novel[ch];
    } else {
      v = revLower[ch];
    }
    result.push_back(v);
  }
  return result;
}
std::string extendSequence(std::string seq, std::string pattern, int len) {
  srand(std::time(0));
  checkSequence(seq);
  Normalizer n;
  size_t pruneLen = 100;
  seq = n.norm(seq);
  pattern = n.norm(pattern);
  string refPattern = pattern;
  pattern = localizePattern(pattern);
  DBG cout << "extend " << seq << " with " << pattern << endl;
  // Allow uppercase letters as extra (novel) symbols during extension.
  n.add("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
  list<string> lst;
  lst.push_back(seq);
  for (int k = 0; k < len; k++) {
    lst = extendSequence(lst, pattern);
    // Keep only candidates whose own pattern still matches the reference pattern.
    list<string> lst2;
    for (list<string>::const_iterator it = lst.begin(); it != lst.end(); it++) {
      string nextSeq = *it;
      string nextPat = refPattern;
      if (nextSeq.length() <= 10) {
        nextPat = sequenceToPattern(nextSeq);
      }
      if (nextPat == refPattern) {
        lst2.push_back(nextSeq);
      } else {
        DBG cout << nextSeq << ": " << "mismatch " << nextPat << " versus " << refPattern << endl;
      }
    }
    if (lst2.size() > 0) {
      lst = lst2;
    }
    if (lst.size() > pruneLen) {
      DBG cout << "NEED TO PRUNE" << endl;
      vector<string> v(lst.begin(), lst.end());
      random_shuffle(v.begin(), v.end());
      lst.clear();
      lst = list<string>(v.begin(), v.begin() + pruneLen);
      //lst.erase((++(++(lst.begin()))),lst.end());
    }
    DBG {
      cout << "possibilities: " << endl;
      for (list<string>::const_iterator it = lst.begin(); it != lst.end(); it++) {
        cout << " -- " << n.denorm(*it) << endl;
      }
    }
  }
  for (list<string>::const_iterator it = lst.begin(); it != lst.end(); it++) {
    DBG cout << " final possibility " << n.denorm(*it) << endl;
  }
  // Pick one surviving candidate at random and return only the newly added part.
  vector<string> v(lst.begin(), lst.end());
  random_shuffle(v.begin(), v.end());
  if (v.size() > 0) {
    string result = n.denorm(v[0]);
    result = result.substr(seq.length());
    return result;
  }
  return "";
}
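/*
 * Usage sketch (illustrative only, not part of the original sources): wires up
 * checkSequence(), sequenceToPattern() and extendSequence() from above.  The
 * function name demoExtend, the input "121312131213" and the extension length
 * of 6 are made-up example values; the snippet assumes the same includes and
 * using-declarations (<iostream>, <string>, using namespace std) as the
 * surrounding file.
 */
int demoExtend() {
  std::string seq = "121312131213";               // hypothetical observed sequence (digits/lowercase only)
  checkSequence(seq);                             // exits if the alphabet is invalid
  std::string pattern = sequenceToPattern(seq);   // derive a repeating pattern from the tail of the sequence
  std::string continuation = extendSequence(seq, pattern, 6);
  std::cout << "pattern:      " << pattern << std::endl;
  std::cout << "continuation: " << continuation << std::endl;
  return 0;
}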
void CompNovoIdentificationCID::getIdentification(PeptideIdentification & id, const PeakSpectrum & CID_spec) {
  //if (CID_spec.getPrecursors().begin()->getMZ() > 1000.0)
  //{
  //  cerr << "Weight of precursor has been estimated to exceed 2000.0 Da which is the current limit" << endl;
  //  return;
  //}

  PeakSpectrum new_CID_spec(CID_spec);
  windowMower_(new_CID_spec, 0.3, 1);

  Param zhang_param;
  zhang_param = zhang_.getParameters();
  zhang_param.setValue("tolerance", fragment_mass_tolerance_);
  zhang_param.setValue("use_gaussian_factor", "true");
  zhang_param.setValue("use_linear_factor", "false");
  zhang_.setParameters(zhang_param);

  Normalizer normalizer;
  Param n_param(normalizer.getParameters());
  n_param.setValue("method", "to_one");
  normalizer.setParameters(n_param);
  normalizer.filterSpectrum(new_CID_spec);

  Size charge(2);
  double precursor_weight(0); // [M+H]+
  if (!CID_spec.getPrecursors().empty()) {
    // believe charge of spectrum?
    if (CID_spec.getPrecursors().begin()->getCharge() != 0) {
      charge = CID_spec.getPrecursors().begin()->getCharge();
    } else {
      // TODO estimate charge state
    }
    precursor_weight = CID_spec.getPrecursors().begin()->getMZ() * charge - ((charge - 1) * Constants::PROTON_MASS_U);
  }

  //cerr << "charge=" << charge << ", [M+H]=" << precursor_weight << endl;

  // now delete all peaks that are right of the estimated precursor weight
  Size peak_counter(0);
  for (PeakSpectrum::ConstIterator it = new_CID_spec.begin(); it != new_CID_spec.end(); ++it, ++peak_counter) {
    if (it->getPosition()[0] > precursor_weight) {
      break;
    }
  }
  if (peak_counter < new_CID_spec.size()) {
    new_CID_spec.resize(peak_counter);
  }

  static double oxonium_mass = EmpiricalFormula("H2O+").getMonoWeight();

  Peak1D p;
  p.setIntensity(1);
  p.setPosition(oxonium_mass);
  new_CID_spec.push_back(p);

  p.setPosition(precursor_weight);
  new_CID_spec.push_back(p);

  // add complement to spectrum
  /*
  for (PeakSpectrum::ConstIterator it1 = CID_spec.begin(); it1 != CID_spec.end(); ++it1) {
    // get m/z of complement
    double mz_comp = precursor_weight - it1->getPosition()[0] + Constants::PROTON_MASS_U;

    // search if peaks are available that have similar m/z values
    Size count(0);
    bool found(false);
    for (PeakSpectrum::ConstIterator it2 = CID_spec.begin(); it2 != CID_spec.end(); ++it2, ++count) {
      if (fabs(mz_comp - it2->getPosition()[0]) < fragment_mass_tolerance) {
        // add peak intensity to corresponding peak in new_CID_spec
        new_CID_spec[count].setIntensity(new_CID_spec[count].getIntensity());
      }
    }
    if (!found) {
      // infer this peak
      Peak1D p;
      p.setIntensity(it1->getIntensity());
      p.setPosition(mz_comp);
      new_CID_spec.push_back(p);
    }
  }*/

  CompNovoIonScoringCID ion_scoring;
  Param ion_scoring_param(ion_scoring.getParameters());
  ion_scoring_param.setValue("fragment_mass_tolerance", fragment_mass_tolerance_);
  ion_scoring_param.setValue("precursor_mass_tolerance", precursor_mass_tolerance_);
  ion_scoring_param.setValue("decomp_weights_precision", decomp_weights_precision_);
  ion_scoring_param.setValue("double_charged_iso_threshold", (double)param_.getValue("double_charged_iso_threshold"));
  ion_scoring_param.setValue("max_isotope_to_score", param_.getValue("max_isotope_to_score"));
  ion_scoring_param.setValue("max_isotope", max_isotope_);
  ion_scoring.setParameters(ion_scoring_param);

  Map<double, IonScore> ion_scores;
  ion_scoring.scoreSpectrum(ion_scores, new_CID_spec, precursor_weight, charge);

  new_CID_spec.sortByPosition();

  /*
  cerr << "Size of ion_scores " << ion_scores.size() << endl;
  for (Map<double, IonScore>::const_iterator it = ion_scores.begin(); it != ion_scores.end(); ++it) {
    cerr << it->first << " " << it->second.score << endl;
  }*/

#ifdef WRITE_SCORED_SPEC
  PeakSpectrum filtered_spec(new_CID_spec);
  filtered_spec.clear();
  for (Map<double, CompNovoIonScoringCID::IonScore>::const_iterator it = ion_scores.begin(); it != ion_scores.end(); ++it) {
    Peak1D p;
    p.setIntensity(it->second.score);
    p.setPosition(it->first);
    filtered_spec.push_back(p);
  }
  DTAFile().store("spec_scored.dta", filtered_spec);
#endif

  set<String> sequences;
  getDecompositionsDAC_(sequences, 0, new_CID_spec.size() - 1, precursor_weight, new_CID_spec, ion_scores);

#ifdef SPIKE_IN
  sequences.insert("AFCVDGEGR"); sequences.insert("APEFAAPWPDFVPR"); sequences.insert("AVKQFEESQGR");
  sequences.insert("CCTESLVNR"); sequences.insert("DAFLGSFLYEYSR"); sequences.insert("DAIPENLPPLTADFAEDK");
  sequences.insert("DDNKVEDIWSFLSK"); sequences.insert("DDPHACYSTVFDK"); sequences.insert("DEYELLCLDGSR");
  sequences.insert("DGAESYKELSVLLPNR"); sequences.insert("DGASCWCVDADGR"); sequences.insert("DLFIPTCLETGEFAR");
  sequences.insert("DTHKSEIAHR"); sequences.insert("DVCKNYQEAK"); sequences.insert("EACFAVEGPK");
  sequences.insert("ECCHGDLLECADDR"); sequences.insert("EFLGDKFYTVISSLK"); sequences.insert("EFTPVLQADFQK");
  sequences.insert("ELFLDSGIFQPMLQGR"); sequences.insert("ETYGDMADCCEK"); sequences.insert("EVGCPSSSVQEMVSCLR");
  sequences.insert("EYEATLEECCAK"); sequences.insert("FADLIQSGTFQLHLDSK"); sequences.insert("FFSASCVPGATIEQK");
  sequences.insert("FLANVSTVLTSK"); sequences.insert("FLSGSDYAIR"); sequences.insert("FTASCPPSIK");
  sequences.insert("GAIEWEGIESGSVEQAVAK"); sequences.insert("GDVAFIQHSTVEENTGGK"); sequences.insert("GEPPSCAEDQSCPSER");
  sequences.insert("GEYVPTSLTAR"); sequences.insert("GQEFTITGQKR"); sequences.insert("GTFAALSELHCDK");
  sequences.insert("HLVDEPQNLIK"); sequences.insert("HQDCLVTTLQTQPGAVR"); sequences.insert("HTTVNENAPDQK");
  sequences.insert("ILDCGSPDTEVR"); sequences.insert("KCPSPCQLQAER"); sequences.insert("KGTEFTVNDLQGK");
  sequences.insert("KQTALVELLK"); sequences.insert("KVPQVSTPTLVEVSR"); sequences.insert("LALQFTTNAKR");
  sequences.insert("LCVLHEKTPVSEK"); sequences.insert("LFTFHADICTLPDTEK"); sequences.insert("LGEYGFQNALIVR");
  sequences.insert("LHVDPENFK"); sequences.insert("LKECCDKPLLEK"); sequences.insert("LKHLVDEPQNLIK");
  sequences.insert("LKPDPNTLCDEFK"); sequences.insert("LLGNVLVVVLAR"); sequences.insert("LLVVYPWTQR");
  sequences.insert("LRVDPVNFK"); sequences.insert("LTDEELAFPPLSPSR"); sequences.insert("LVNELTEFAK");
  sequences.insert("MFLSFPTTK"); sequences.insert("MPCTEDYLSLILNR"); sequences.insert("NAPYSGYSGAFHCLK");
  sequences.insert("NECFLSHKDDSPDLPK"); sequences.insert("NEPNKVPACPGSCEEVK"); sequences.insert("NLQMDDFELLCTDGR");
  sequences.insert("QAGVQAEPSPK"); sequences.insert("RAPEFAAPWPDFVPR"); sequences.insert("RHPEYAVSVLLR");
  sequences.insert("RPCFSALTPDETYVPK"); sequences.insert("RSLLLAPEEGPVSQR"); sequences.insert("SAFPPEPLLCSVQR");
  sequences.insert("SAGWNIPIGTLLHR"); sequences.insert("SCWCVDEAGQK"); sequences.insert("SGNPNYPHEFSR");
  sequences.insert("SHCIAEVEK"); sequences.insert("SISSGFFECER"); sequences.insert("SKYLASASTMDHAR");
  sequences.insert("SLHTLFGDELCK"); sequences.insert("SLLLAPEEGPVSQR"); sequences.insert("SPPQCSPDGAFRPVQCK");
  sequences.insert("SREGDPLAVYLK"); sequences.insert("SRQIPQCPTSCER"); sequences.insert("TAGTPVSIPVCDDSSVK");
  sequences.insert("TCVADESHAGCEK"); sequences.insert("TQFGCLEGFGR"); sequences.insert("TVMENFVAFVDK");
  sequences.insert("TYFPHFDLSHGSAQVK"); sequences.insert("TYMLAFDVNDEK"); sequences.insert("VDEVGGEALGR");
  sequences.insert("VDLLIGSSQDDGLINR"); sequences.insert("VEDIWSFLSK"); sequences.insert("VGGHAAEYGAEALER");
  sequences.insert("VGTRCCTKPESER"); sequences.insert("VKVDEVGGEALGR"); sequences.insert("VKVDLLIGSSQDDGLINR");
  sequences.insert("VLDSFSNGMK"); sequences.insert("VLSAADKGNVK"); sequences.insert("VPQVSTPTLVEVSR");
  sequences.insert("VTKCCTESLVNR"); sequences.insert("VVAASDASQDALGCVK"); sequences.insert("VVAGVANALAHR");
  sequences.insert("YICDNQDTISSK"); sequences.insert("YLASASTMDHAR"); sequences.insert("YNGVFQECCQAEDK");
#endif

  SpectrumAlignmentScore spectra_zhang;
  spectra_zhang.setParameters(zhang_param);

  vector<PeptideHit> hits;
  Size missed_cleavages = param_.getValue("missed_cleavages");
  for (set<String>::const_iterator it = sequences.begin(); it != sequences.end(); ++it) {
    Size num_missed = countMissedCleavagesTryptic_(*it);
    if (missed_cleavages < num_missed) {
      //cerr << "Too many missed cleavages: " << *it << ", found " << num_missed << ", allowed " << missed_cleavages << endl;
      continue;
    }

    PeakSpectrum CID_sim_spec;
    getCIDSpectrum_(CID_sim_spec, *it, charge);

    //normalizer.filterSpectrum(CID_sim_spec);

    double cid_score = zhang_(CID_sim_spec, CID_spec);

    PeptideHit hit;
    hit.setScore(cid_score);
    hit.setSequence(getModifiedAASequence_(*it));
    hit.setCharge((Int)charge); //TODO unify charge interface: int or size?
    hits.push_back(hit);
    //cerr << getModifiedAASequence_(*it) << " " << cid_score << " " << endl;
  }

  // rescore the top hits
  id.setHits(hits);
  id.assignRanks();

  hits = id.getHits();

  SpectrumAlignmentScore alignment_score;
  Param align_param(alignment_score.getParameters());
  align_param.setValue("tolerance", fragment_mass_tolerance_);
  align_param.setValue("use_linear_factor", "true");
  alignment_score.setParameters(align_param);

  for (vector<PeptideHit>::iterator it = hits.begin(); it != hits.end(); ++it) {
    //cerr << "Pre: " << it->getRank() << " " << it->getSequence() << " " << it->getScore() << " " << endl;
  }

  Size number_of_prescoring_hits = param_.getValue("number_of_prescoring_hits");
  if (hits.size() > number_of_prescoring_hits) {
    hits.resize(number_of_prescoring_hits);
  }

  for (vector<PeptideHit>::iterator it = hits.begin(); it != hits.end(); ++it) {
    PeakSpectrum CID_sim_spec;
    getCIDSpectrum_(CID_sim_spec, getModifiedStringFromAASequence_(it->getSequence()), charge);

    normalizer.filterSpectrum(CID_sim_spec);

    //DTAFile().store("sim_specs/" + it->getSequence().toUnmodifiedString() + "_sim_CID.dta", CID_sim_spec);

    //double cid_score = spectra_zhang(CID_sim_spec, CID_spec);
    double cid_score = alignment_score(CID_sim_spec, CID_spec);

    //cerr << "Final: " << it->getSequence() << " " << cid_score << endl;
    it->setScore(cid_score);
  }

  id.setHits(hits);
  id.assignRanks();

  hits = id.getHits();
  for (vector<PeptideHit>::iterator it = hits.begin(); it != hits.end(); ++it) {
    //cerr << "Fin: " << it->getRank() << " " << it->getSequence() << " " << it->getScore() << " " << endl;
  }

  Size number_of_hits = param_.getValue("number_of_hits");
  if (id.getHits().size() > number_of_hits) {
    hits.resize(number_of_hits);
  }

  id.setHits(hits);
  id.assignRanks();

  return;
}
int main(int /*argc*/, char ** /*argv*/) {
  Normalizer *normalizer = new Normalizer();
  DOMDocument *doc = normalizer->createDocument();

  bool *tmpTrue = new bool(true);
  bool *tmpFalse = new bool(false);

  DOMElement* docFirstElement = doc->createElementNS(X("http://www.test.com"), X("docEle"));
  doc->appendChild(docFirstElement);
  DOMElement* docFirstElementChild = doc->createElementNS(X("http://www.test2.com"), X("docEleChild"));
  docFirstElement->appendChild(docFirstElementChild);

  // create default ns
  doc->normalizeDocument();
  normalizer->serializeNode(doc);
  XERCES_STD_QUALIFIER cout << "\n\n";

  // add in binding
  docFirstElement->setPrefix(X("po"));
  doc->normalizeDocument();
  normalizer->serializeNode(doc);
  XERCES_STD_QUALIFIER cout << "\n\n";

  // use default
  DOMElement* docFirstElementChildChild = doc->createElementNS(X("http://www.test2.com"), X("docEleChildChild"));
  docFirstElementChild->appendChild(docFirstElementChildChild);
  doc->normalizeDocument();
  normalizer->serializeNode(doc);
  XERCES_STD_QUALIFIER cout << "\n\n";

  // this block is needed to destroy the XMLBuffer
  {
    // use a binding
    XMLBuffer buf;
    buf.set(XMLUni::fgXMLNSString);
    buf.append(chColon);
    buf.append(X("po2"));

    docFirstElementChild->removeAttributeNS(XMLUni::fgXMLNSURIName, XMLUni::fgXMLNSString);
    docFirstElement->removeAttributeNS(XMLUni::fgXMLNSURIName, XMLUni::fgXMLNSString);
    docFirstElement->setAttributeNS(XMLUni::fgXMLNSURIName, buf.getRawBuffer(), X("http://www.test2.com"));
    docFirstElementChild->setPrefix(X("po2"));
    doc->normalizeDocument();
    normalizer->serializeNode(doc);
    XERCES_STD_QUALIFIER cout << "\n\n";
  }

  // some siblings to ensure the scope stacks are working
  docFirstElementChildChild = doc->createElementNS(X("http://www.test3.com"), X("docEleChildChild2"));
  docFirstElementChild->appendChild(docFirstElementChildChild);
  docFirstElementChildChild = doc->createElementNS(X("http://www.test4.com"), X("po4:docEleChildChild3"));
  docFirstElementChild->appendChild(docFirstElementChildChild);
  docFirstElementChildChild = doc->createElementNS(X("http://www.test4.com"), X("po4:docEleChildChild4"));
  docFirstElementChild->appendChild(docFirstElementChildChild);
  doc->normalizeDocument();
  normalizer->serializeNode(doc);
  XERCES_STD_QUALIFIER cout << "\n\n";

  // conflicting prefix
  docFirstElementChildChild->setAttributeNS(XMLUni::fgXMLNSURIName, X("po4"), X("conflict"));
  doc->normalizeDocument();
  normalizer->serializeNode(doc);
  XERCES_STD_QUALIFIER cout << "\n\n";

  // conflicting default
  docFirstElementChildChild = doc->createElementNS(X("http://www.test4.com"), X("docEleChildChild5"));
  docFirstElementChild->appendChild(docFirstElementChildChild);
  docFirstElementChildChild->setAttributeNS(XMLUni::fgXMLNSURIName, XMLUni::fgXMLNSString, X("conflict"));
  doc->normalizeDocument();
  normalizer->serializeNode(doc);
  XERCES_STD_QUALIFIER cout << "\n\n";

  // set the xmlns to ""
  DOMElement *noNamespaceEle = doc->createElementNS(X(""), X("noNamespace"));
  docFirstElementChildChild->appendChild(noNamespaceEle);
  doc->normalizeDocument();
  normalizer->serializeNode(doc);
  XERCES_STD_QUALIFIER cout << "\n\n";

  // now let's do a bit of attribute testing on the doc ele
  docFirstElement->setAttributeNS(X("http://testattr.com"), X("attr1"), X("value"));
  docFirstElement->setAttributeNS(X("http://testattr.com"), X("attr2"), X("value"));
  docFirstElement->setAttributeNS(X("http://testattr2.com"), X("attr3"), X("value"));
  docFirstElement->setAttributeNS(X("http://www.test.com"), X("attr4"), X("value"));
  docFirstElement->setAttributeNS(X("http://testattr2.com"), X("po:attr5"), X("value"));
  docFirstElement->setAttributeNS(X("http://testattr2.com"), X("poFake:attr6"), X("value"));
  docFirstElement->setAttributeNS(X("http://testattr3.com"), X("po3:attr7"), X("value"));
  doc->normalizeDocument();
  normalizer->serializeNode(doc);
  XERCES_STD_QUALIFIER cout << "\n\n";

  // and now on one of its children
  docFirstElementChildChild->setAttributeNS(X("http://testattr.com"), X("attr1"), X("value"));
  docFirstElementChildChild->setAttributeNS(X("http://testattr.com"), X("attr2"), X("value"));
  docFirstElementChildChild->setAttributeNS(X("http://testattr2.com"), X("attr3"), X("value"));
  docFirstElementChildChild->setAttributeNS(X("http://www.test.com"), X("attr4"), X("value"));
  docFirstElementChildChild->setAttributeNS(X("http://testattr2.com"), X("po:attr5"), X("value"));
  docFirstElementChildChild->setAttributeNS(X("http://testattr2.com"), X("poFake:attr6"), X("value"));
  docFirstElementChildChild->setAttributeNS(X("http://testattr3.com"), X("po3:attr7"), X("value"));
  docFirstElementChildChild->setAttributeNS(X("http://testattr4.com"), X("po4:attr8"), X("value"));

  // test for a clash with our NSx attrs
  docFirstElementChildChild->setAttributeNS(X("http://testclash.com"), X("NS1:attr9"), X("value"));
  docFirstElementChildChild->setAttributeNS(XMLUni::fgXMLNSURIName, X("xmlns:NS1"), X("http://testclash.com"));

  // clash with standard prefix
  docFirstElementChildChild->setAttributeNS(X("http://testattr5.com"), X("po:attr10"), X("value"));
  doc->normalizeDocument();
  normalizer->serializeNode(doc);
  XERCES_STD_QUALIFIER cout << "\n\n";

  // 2 prefixes with the same uri
  docFirstElementChildChild = doc->createElementNS(X("http://www.uri1.com"), X("docEleChildChild6"));
  docFirstElementChild->appendChild(docFirstElementChildChild);
  docFirstElementChildChild->setAttributeNS(XMLUni::fgXMLNSURIName, X("xmlns:uri1"), X("http://www.uri1.com"));
  docFirstElementChildChild->setAttributeNS(XMLUni::fgXMLNSURIName, X("xmlns:uri1b"), X("http://www.uri1.com"));
  docFirstElementChildChild->setAttributeNS(X("http://www.uri1.com"), X("uri1:attr1"), X("value"));
  docFirstElementChildChild->setAttributeNS(X("http://www.uri1.com"), X("uri1b:attr2"), X("value"));
  doc->normalizeDocument();
  normalizer->serializeNode(doc);
  XERCES_STD_QUALIFIER cout << "\n\n";

  // check that we use the nearest binding, and test more inheritance
  DOMElement *docFirstElementChildChildChild = doc->createElementNS(X("http://www.uri1.com"), X("docEleChildChildChild"));
  docFirstElementChildChild->appendChild(docFirstElementChildChildChild);
  docFirstElementChildChild->setAttributeNS(XMLUni::fgXMLNSURIName, X("xmlns:nearerThanPo"), X("http://www.test.com"));
  docFirstElementChildChildChild->setAttributeNS(X("http://testattr.com"), X("attr2"), X("value"));
  docFirstElementChildChildChild->setAttributeNS(X("http://www.test.com"), X("attr1"), X("value"));
  doc->normalizeDocument();
  normalizer->serializeNode(doc);
  XERCES_STD_QUALIFIER cout << "\n\n";

  // NS1.1 stuff
  // test creating a default prefix when NS1 has been set to ""
  noNamespaceEle->setAttributeNS(XMLUni::fgXMLNSURIName, X("xmlns:NS1"), X(""));
  DOMElement *noNamespaceChild = doc->createElementNS(X("http://testclash.com"), X("testing1.1Stuff"));
  noNamespaceEle->appendChild(noNamespaceChild);
  doc->normalizeDocument();
  normalizer->serializeNode(doc);

  noNamespaceChild = doc->createElementNS(X("http://testclash.com"), X("NS1:testing1.1Stuff"));
  noNamespaceEle->appendChild(noNamespaceChild);
  noNamespaceChild->setAttributeNS(X("http://www.someRandomUri.com"), X("attr"), X("value"));
  doc->normalizeDocument();
  normalizer->serializeNode(doc);

  // check error conditions
  XERCES_STD_QUALIFIER cout << "error conditions" << XERCES_STD_QUALIFIER endl;
  DOMConfiguration *conf = doc->getDOMConfig();
  conf->setParameter(XMLUni::fgDOMErrorHandler, normalizer);
  conf->setParameter(XMLUni::fgDOMNamespaces, true);

  DOMElement *level1Node = doc->createElement(X("level1Node"));
  docFirstElement->appendChild(level1Node);
  doc->normalizeDocument();
  docFirstElement->removeChild(level1Node);

  docFirstElement->setAttribute(X("level1Attr"), X("level1"));
  doc->normalizeDocument();
  docFirstElement->removeAttribute(X("level1Attr"));

  // can't check this as Xerces does not let us do it
  // noNamespaceChild->setAttributeNS(X("http://www.someRandomUri.com"), X("xmlns"), X("value"));
  // doc->normalizeDocument();

  // let's do a sanity test on a comment
  DOMComment *comment = doc->createComment(X("some comment"));
  docFirstElement->appendChild(comment);
  doc->normalizeDocument();
  normalizer->serializeNode(doc);

  conf->setParameter(XMLUni::fgDOMComments, false);
  docFirstElement->appendChild(comment);
  doc->normalizeDocument();
  normalizer->serializeNode(doc);

  // and on a CDATA
  DOMCDATASection *cData = doc->createCDATASection(X("some cdata"));
  docFirstElement->appendChild(cData);
  doc->normalizeDocument();
  normalizer->serializeNode(doc);

  conf->setParameter(XMLUni::fgDOMCDATASections, false);
  docFirstElement->appendChild(cData);
  doc->normalizeDocument();
  normalizer->serializeNode(doc);

  delete normalizer;
  delete tmpTrue;
  delete tmpFalse;

  return 0;
}
ExitCodes main_(int, const char **) {
  //-------------------------------------------------------------
  // parsing parameters
  //-------------------------------------------------------------
  StringList id_in(getStringList_("id_in"));
  StringList in_raw(getStringList_("in"));
  Size number_of_bins((UInt)getIntOption_("number_of_bins"));
  bool precursor_error_ppm(getFlag_("precursor_error_ppm"));
  bool fragment_error_ppm(getFlag_("fragment_error_ppm"));
  bool generate_gnuplot_scripts(DataValue(getStringOption_("generate_gnuplot_scripts")).toBool());
  if (in_raw.size() != id_in.size()) {
    writeLog_("Number of spectrum files and identification files differs...");
    return ILLEGAL_PARAMETERS;
  }

  //-------------------------------------------------------------
  // reading input
  //-------------------------------------------------------------
  vector<vector<PeptideIdentification> > pep_ids;
  vector<vector<ProteinIdentification> > prot_ids;
  pep_ids.resize(id_in.size());
  prot_ids.resize(id_in.size());

  IdXMLFile idxmlfile;
  for (Size i = 0; i != id_in.size(); ++i) {
    String doc_id;
    idxmlfile.load(id_in[i], prot_ids[i], pep_ids[i], doc_id);
  }

  // read mzML files
  vector<RichPeakMap> maps_raw;
  maps_raw.resize(in_raw.size());

  MzMLFile mzml_file;
  for (Size i = 0; i != in_raw.size(); ++i) {
    mzml_file.load(in_raw[i], maps_raw[i]);
  }

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------

  // mapping ids
  IDMapper mapper;
  for (Size i = 0; i != maps_raw.size(); ++i) {
    mapper.annotate(maps_raw[i], pep_ids[i], prot_ids[i]);
  }

  // normalize the spectra
  Normalizer normalizer;
  for (vector<RichPeakMap>::iterator it1 = maps_raw.begin(); it1 != maps_raw.end(); ++it1) {
    for (RichPeakMap::Iterator it2 = it1->begin(); it2 != it1->end(); ++it2) {
      normalizer.filterSpectrum(*it2);
    }
  }

  // generate precursor statistics
  vector<MassDifference> precursor_diffs;
  if (getStringOption_("precursor_out") != "") {
    for (Size i = 0; i != maps_raw.size(); ++i) {
      for (Size j = 0; j != maps_raw[i].size(); ++j) {
        if (maps_raw[i][j].getPeptideIdentifications().empty()) {
          continue;
        }
        for (vector<PeptideIdentification>::const_iterator it = maps_raw[i][j].getPeptideIdentifications().begin(); it != maps_raw[i][j].getPeptideIdentifications().end(); ++it) {
          if (it->getHits().size() > 0) {
            PeptideHit hit = *it->getHits().begin();
            MassDifference md;
            Int charge = hit.getCharge();
            if (charge == 0) {
              charge = 1;
            }
            md.exp_mz = it->getMZ();
            md.theo_mz = (hit.getSequence().getMonoWeight() + (double)charge * Constants::PROTON_MASS_U) / (double)charge;
            md.charge = charge;
            precursor_diffs.push_back(md);
          }
        }
      }
    }
  }

  // generate fragment ion statistics
  vector<MassDifference> fragment_diffs;
  TheoreticalSpectrumGenerator tsg;
  SpectrumAlignment sa;
  double fragment_mass_tolerance(getDoubleOption_("fragment_mass_tolerance"));
  Param sa_param(sa.getParameters());
  sa_param.setValue("tolerance", fragment_mass_tolerance);
  sa.setParameters(sa_param);

  if (getStringOption_("fragment_out") != "") {
    for (Size i = 0; i != maps_raw.size(); ++i) {
      for (Size j = 0; j != maps_raw[i].size(); ++j) {
        if (maps_raw[i][j].getPeptideIdentifications().empty()) {
          continue;
        }
        for (vector<PeptideIdentification>::const_iterator it = maps_raw[i][j].getPeptideIdentifications().begin(); it != maps_raw[i][j].getPeptideIdentifications().end(); ++it) {
          if (it->getHits().size() > 0) {
            PeptideHit hit = *it->getHits().begin();
            RichPeakSpectrum theo_spec;
            tsg.addPeaks(theo_spec, hit.getSequence(), Residue::YIon);
            tsg.addPeaks(theo_spec, hit.getSequence(), Residue::BIon);

            vector<pair<Size, Size> > pairs;
            sa.getSpectrumAlignment(pairs, theo_spec, maps_raw[i][j]);
            //cerr << hit.getSequence() << " " << hit.getSequence().getSuffix(1).getFormula() << " " << hit.getSequence().getSuffix(1).getFormula().getMonoWeight() << endl;
            for (vector<pair<Size, Size> >::const_iterator pit = pairs.begin(); pit != pairs.end(); ++pit) {
              MassDifference md;
              md.exp_mz = maps_raw[i][j][pit->second].getMZ();
              md.theo_mz = theo_spec[pit->first].getMZ();
              //cerr.precision(15);
              //cerr << md.exp_mz << " " << md.theo_mz << " " << md.exp_mz - md.theo_mz << endl;
              md.intensity = maps_raw[i][j][pit->second].getIntensity();
              md.charge = hit.getCharge();
              fragment_diffs.push_back(md);
            }
          }
        }
      }
    }
  }

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------
  String precursor_out_file(getStringOption_("precursor_out"));
  if (precursor_out_file != "") {
    vector<double> errors;
    ofstream precursor_out(precursor_out_file.c_str());
    // start the running maximum at -max() so purely negative error distributions are handled correctly
    double min_diff(numeric_limits<double>::max()), max_diff(-numeric_limits<double>::max());
    for (Size i = 0; i != precursor_diffs.size(); ++i) {
      double diff = getMassDifference(precursor_diffs[i].theo_mz, precursor_diffs[i].exp_mz, precursor_error_ppm);
      precursor_out << diff << "\n";
      errors.push_back(diff);
      if (diff > max_diff) {
        max_diff = diff;
      }
      if (diff < min_diff) {
        min_diff = diff;
      }
    }
    precursor_out.close();

    // fill histogram with the collected values
    double bin_size = (max_diff - min_diff) / (double)number_of_bins;
    Histogram<double, double> hist(min_diff, max_diff, bin_size);
    for (Size i = 0; i != errors.size(); ++i) {
      hist.inc(errors[i], 1.0);
    }

    writeDebug_("min_diff=" + String(min_diff) + ", max_diff=" + String(max_diff) + ", number_of_bins=" + String(number_of_bins), 1);

    // transform the histogram into a vector<DPosition<2> > for the fitting
    vector<DPosition<2> > values;
    for (Size i = 0; i != hist.size(); ++i) {
      DPosition<2> p;
      p.setX((double)i / (double)number_of_bins * (max_diff - min_diff) + min_diff);
      p.setY(hist[i]);
      values.push_back(p);
    }

    double mean = Math::mean(errors.begin(), errors.end());
    double abs_dev = Math::absdev(errors.begin(), errors.end(), mean);
    double sdv = Math::sd(errors.begin(), errors.end(), mean);
    sort(errors.begin(), errors.end());
    double median = errors[(Size)(errors.size() / 2.0)];

    writeDebug_("Precursor mean error: " + String(mean), 1);
    writeDebug_("Precursor abs. dev.: " + String(abs_dev), 1);
    writeDebug_("Precursor std. dev.: " + String(sdv), 1);
    writeDebug_("Precursor median error: " + String(median), 1);

    // calculate histogram for gauss fitting
    GaussFitter gf;
    GaussFitter::GaussFitResult init_param(hist.maxValue(), median, sdv / 500.0);
    gf.setInitialParameters(init_param);

    try {
      gf.fit(values);
      // write gnuplot scripts
      if (generate_gnuplot_scripts) {
        ofstream out(String(precursor_out_file + "_gnuplot.dat").c_str());
        for (vector<DPosition<2> >::const_iterator it = values.begin(); it != values.end(); ++it) {
          out << it->getX() << " " << it->getY() << endl;
        }
        out.close();

        ofstream gpl_out(String(precursor_out_file + "_gnuplot.gpl").c_str());
        gpl_out << "set terminal png" << endl;
        gpl_out << "set output \"" << precursor_out_file << "_gnuplot.png\"" << endl;
        if (precursor_error_ppm) {
          gpl_out << "set xlabel \"error in ppm\"" << endl;
        } else {
          gpl_out << "set xlabel \"error in Da\"" << endl;
        }
        gpl_out << "set ylabel \"frequency\"" << endl;
        gpl_out << "plot '" << precursor_out_file << "_gnuplot.dat' title 'Precursor mass error distribution' w boxes, f(x) w lp title 'Gaussian fit of the error distribution'" << endl;
        gpl_out.close();
      }
    } catch (Exception::UnableToFit) {
      writeLog_("Unable to fit a Gaussian distribution to the precursor mass errors");
    }
  }

  String fragment_out_file(getStringOption_("fragment_out"));
  if (fragment_out_file != "") {
    vector<double> errors;
    ofstream fragment_out(fragment_out_file.c_str());
    // start the running maximum at -max() here as well
    double min_diff(numeric_limits<double>::max()), max_diff(-numeric_limits<double>::max());
    for (Size i = 0; i != fragment_diffs.size(); ++i) {
      double diff = getMassDifference(fragment_diffs[i].theo_mz, fragment_diffs[i].exp_mz, fragment_error_ppm);
      fragment_out << diff << endl;
      errors.push_back(diff);
      if (diff > max_diff) {
        max_diff = diff;
      }
      if (diff < min_diff) {
        min_diff = diff;
      }
    }
    fragment_out.close();

    // fill histogram with the collected values
    // here we use the intensities to scale the error
    // low intensity peaks are likely to be random matches
    double bin_size = (max_diff - min_diff) / (double)number_of_bins;
    Histogram<double, double> hist(min_diff, max_diff, bin_size);
    for (Size i = 0; i != fragment_diffs.size(); ++i) {
      double diff = getMassDifference(fragment_diffs[i].theo_mz, fragment_diffs[i].exp_mz, fragment_error_ppm);
      hist.inc(diff, fragment_diffs[i].intensity);
    }

    writeDebug_("min_diff=" + String(min_diff) + ", max_diff=" + String(max_diff) + ", number_of_bins=" + String(number_of_bins), 1);

    // transform the histogram into a vector<DPosition<2> > for the fitting
    vector<DPosition<2> > values;
    for (Size i = 0; i != hist.size(); ++i) {
      DPosition<2> p;
      p.setX((double)i / (double)number_of_bins * (max_diff - min_diff) + min_diff);
      p.setY(hist[i]);
      values.push_back(p);
    }

    double mean = Math::mean(errors.begin(), errors.end());
    double abs_dev = Math::absdev(errors.begin(), errors.end(), mean);
    double sdv = Math::sd(errors.begin(), errors.end(), mean);
    sort(errors.begin(), errors.end());
    double median = errors[(Size)(errors.size() / 2.0)];

    writeDebug_("Fragment mean error: " + String(mean), 1);
    writeDebug_("Fragment abs. dev.: " + String(abs_dev), 1);
    writeDebug_("Fragment std. dev.: " + String(sdv), 1);
    writeDebug_("Fragment median error: " + String(median), 1);

    // calculate histogram for gauss fitting
    GaussFitter gf;
    GaussFitter::GaussFitResult init_param(hist.maxValue(), median, sdv / 100.0);
    gf.setInitialParameters(init_param);

    try {
      gf.fit(values);
      // write gnuplot script
      if (generate_gnuplot_scripts) {
        ofstream out(String(fragment_out_file + "_gnuplot.dat").c_str());
        for (vector<DPosition<2> >::const_iterator it = values.begin(); it != values.end(); ++it) {
          out << it->getX() << " " << it->getY() << endl;
        }
        out.close();

        ofstream gpl_out(String(fragment_out_file + "_gnuplot.gpl").c_str());
        gpl_out << "set terminal png" << endl;
        gpl_out << "set output \"" << fragment_out_file << "_gnuplot.png\"" << endl;
        if (fragment_error_ppm) {
          gpl_out << "set xlabel \"error in ppm\"" << endl;
        } else {
          gpl_out << "set xlabel \"error in Da\"" << endl;
        }
        gpl_out << "set ylabel \"frequency\"" << endl;
        gpl_out << "plot '" << fragment_out_file << "_gnuplot.dat' title 'Fragment mass error distribution' w boxes, f(x) w lp title 'Gaussian fit of the error distribution'" << endl;
        gpl_out.close();
      }
    } catch (Exception::UnableToFit) {
      writeLog_("Unable to fit a Gaussian distribution to the fragment mass errors");
    }
  }

  return EXECUTION_OK;
}
Normalizer* e_ptr = 0;
Normalizer* e_nullPointer = 0;

START_SECTION((Normalizer()))
e_ptr = new Normalizer;
TEST_NOT_EQUAL(e_ptr, e_nullPointer)
END_SECTION

START_SECTION((~Normalizer()))
delete e_ptr;
END_SECTION

e_ptr = new Normalizer();

START_SECTION((Normalizer(const Normalizer& source)))
Normalizer copy(*e_ptr);
TEST_EQUAL(copy.getParameters(), e_ptr->getParameters())
TEST_EQUAL(copy.getName(), e_ptr->getName())
END_SECTION

START_SECTION((Normalizer& operator = (const Normalizer& source)))
Normalizer copy;
copy = *e_ptr;
TEST_EQUAL(copy.getParameters(), e_ptr->getParameters())
TEST_EQUAL(copy.getName(), e_ptr->getName())
END_SECTION

START_SECTION((template<typename SpectrumType> void filterSpectrum(SpectrumType& spectrum)))
DTAFile dta_file;
PeakSpectrum spec;
dta_file.load(OPENMS_GET_TEST_DATA_PATH("Transformers_tests.dta"), spec);
TEST_EQUAL(copy.getName(), ptr->getName());
TEST_EQUAL(copy.getParameters(), ptr->getParameters());
END_SECTION

START_SECTION(ZhangSimilarityScore& operator = (const ZhangSimilarityScore& source))
ZhangSimilarityScore copy;
copy = *ptr;
TEST_EQUAL(copy.getName(), ptr->getName());
TEST_EQUAL(copy.getParameters(), ptr->getParameters());
END_SECTION

START_SECTION(double operator () (const PeakSpectrum& spec) const)
PeakSpectrum s1;
DTAFile().load(OPENMS_GET_TEST_DATA_PATH("PILISSequenceDB_DFPIANGER_1.dta"), s1);
Normalizer normalizer;
Param p(normalizer.getParameters());
p.setValue("method", "to_one");
normalizer.setParameters(p);
normalizer.filterSpectrum(s1);
double score = (*ptr)(s1);
TEST_REAL_SIMILAR(score, 1.82682);
END_SECTION

START_SECTION(double operator () (const PeakSpectrum& spec1, const PeakSpectrum& spec2) const)
PeakSpectrum s1, s2;
DTAFile().load(OPENMS_GET_TEST_DATA_PATH("PILISSequenceDB_DFPIANGER_1.dta"), s1);
DTAFile().load(OPENMS_GET_TEST_DATA_PATH("PILISSequenceDB_DFPIANGER_1.dta"), s2);
Normalizer normalizer;