Esempio n. 1
0
    /// Runs the PILIS identification workflow.
    ///
    /// Loads the spectra from the mzML file given by option "in", reads the
    /// PILIS model from "model_file", fetches peptide candidates from the
    /// suffix-array peptide database ("peptide_db_file") for every precursor
    /// mass (charge states 1-3), identifies every MS2 spectrum against its
    /// candidates, optionally applies the PILIS e-value scoring, truncates the
    /// hit lists to "max_candidates" and stores the result as idXML in "out".
    ///
    /// Spectra without precursor information are skipped (with a log message
    /// for MS2 spectra) because no precursor mass can be derived from them.
    ///
    /// @return EXECUTION_OK on success.
    ExitCodes main_(int, const char **)
    {
        //-------------------------------------------------------------
        // parameter handling
        //-------------------------------------------------------------

        //input/output files
        String in(getStringOption_("in"));
        String out(getStringOption_("out"));

        //-------------------------------------------------------------
        // loading input
        //-------------------------------------------------------------

        RichPeakMap exp;
        MzMLFile f;
        f.setLogType(log_type_);
        f.load(in, exp);

        writeDebug_("Data set contains " + String(exp.size()) + " spectra", 1);

        //-------------------------------------------------------------
        // calculations
        //-------------------------------------------------------------

        writeDebug_("Reading model file", 2);

        // create the model and set the given options; the model lives on the
        // stack so it is released even if a later step throws
        PILISModel model;
        model.readFromFile(getStringOption_("model_file"));
        Param model_param(model.getParameters());
        model_param.setValue("upper_mz", getDoubleOption_("model:upper_mz"));
        model_param.setValue("lower_mz", getDoubleOption_("model:lower_mz"));
        model_param.setValue("charge_directed_threshold", getDoubleOption_("model:charge_directed_threshold"));
        model_param.setValue("charge_remote_threshold", getDoubleOption_("model:charge_remote_threshold"));
        //model_param.setValue("min_main_ion_intensity", getDoubleOption_("model:min_main_ion_intensity"));
        //model_param.setValue("min_loss_ion_intensity", getDoubleOption_("model:min_loss_ion_intensity"));
        model_param.setValue("min_y_ion_intensity", getDoubleOption_("model:min_y_ion_intensity"));
        model_param.setValue("min_b_ion_intensity", getDoubleOption_("model:min_b_ion_intensity"));
        model_param.setValue("min_a_ion_intensity", getDoubleOption_("model:min_a_ion_intensity"));
        model_param.setValue("min_y_loss_intensity", getDoubleOption_("model:min_y_loss_intensity"));
        model_param.setValue("min_b_loss_intensity", getDoubleOption_("model:min_b_loss_intensity"));
        model_param.setValue("charge_loss_factor", getDoubleOption_("model:charge_loss_factor"));
        model_param.setValue("visible_model_depth", getIntOption_("model:visible_model_depth"));
        model_param.setValue("model_depth", getIntOption_("model:model_depth"));
        model_param.setValue("fixed_modifications", getStringOption_("fixed_modifications"));
        model.setParameters(model_param);

        //exp.resize(50); // TODO

        typedef pair<pair<String, String>, String> Candidate;
        typedef vector<Candidate> CandidateList;

        const UInt max_charge(3), min_charge(1);         // TODO

        // collect one precursor weight per spectrum and charge state; spectra
        // without a precursor (e.g. survey scans) cannot contribute a weight
        vector<double> pre_weights;
        for (RichPeakMap::Iterator it = exp.begin(); it != exp.end(); ++it)
        {
            if (it->getPrecursors().empty())
            {
                continue;
            }
            const double pre_mz = it->getPrecursors()[0].getMZ();
            for (UInt z = min_charge; z <= max_charge; ++z)
            {
                pre_weights.push_back((pre_mz * static_cast<double>(z)) - static_cast<double>(z));
            }
        }

        sort(pre_weights.begin(), pre_weights.end());

        writeDebug_("Reading sequence db", 2);

        vector<CandidateList> candidates;
        {
            // the peptide finder is only needed for the candidate lookup; the
            // enclosing scope releases it right afterwards
            SuffixArrayPeptideFinder sapf(getStringOption_("peptide_db_file"), "trypticCompressed");
            sapf.setTolerance(getDoubleOption_("precursor_mass_tolerance"));
            sapf.setNumberOfModifications(0);
            sapf.setUseTags(false);

            cerr << "Getting candidates from SA...";
            sapf.getCandidates(candidates, pre_weights);
            cerr << "done" << endl;
        }

        // index the candidate lists by the precursor weight they were fetched for
        map<double, CandidateList> sorted_candidates;
        for (Size i = 0; i < candidates.size() && i < pre_weights.size(); ++i)
        {
            sorted_candidates[pre_weights[i]] = candidates[i];
        }
        candidates.clear();

        // create ProteinIdentification and set the options
        PILISIdentification PILIS_id;

        PILIS_id.setModel(&model);

        Param id_param(PILIS_id.getParameters());
        id_param.setValue("precursor_mass_tolerance", getDoubleOption_("precursor_mass_tolerance"));
        id_param.setValue("max_candidates", getIntOption_("max_pre_candidates"));
        // disable evalue scoring, this is done separately to allow for a single id per spectrum
        id_param.setValue("use_evalue_scoring", 0);
        id_param.setValue("fixed_modifications", getStringOption_("fixed_modifications"));
        PILIS_id.setParameters(id_param);

        vector<PeptideIdentification> ids;

        // perform the ProteinIdentification of the given spectra
        UInt no(0);
        for (RichPeakMap::Iterator it = exp.begin(); it != exp.end(); ++it, ++no)
        {
            if (it->getMSLevel() == 0)
            {
                writeLog_("Warning: MSLevel is 0, assuming MSLevel 2");
                it->setMSLevel(2);
            }

            if (it->getMSLevel() != 2)
            {
                continue;
            }

            if (it->getPrecursors().empty())
            {
                writeLog_("Warning: MS2 spectrum " + String(no) + " has no precursor, skipping");
                continue;
            }

            writeDebug_(String(no) + "/" + String(exp.size()), 1);

            const double pre_mz = it->getPrecursors()[0].getMZ();

            // candidate sequence -> charge state it was found for
            map<String, UInt> cand;

            for (UInt z = min_charge; z <= max_charge; ++z)
            {
                // must be computed exactly as in the weight collection above,
                // because the value is used as an exact map key
                const double pre_weight = (pre_mz * static_cast<double>(z)) - static_cast<double>(z);

                // find() instead of operator[] so that lookups never insert empty entries
                const map<double, CandidateList>::const_iterator found = sorted_candidates.find(pre_weight);
                if (found == sorted_candidates.end())
                {
                    continue;
                }

                for (CandidateList::const_iterator cit = found->second.begin(); cit != found->second.end(); ++cit)
                {
                    const String & seq = cit->first.second;
                    if (seq.size() > 39)
                    {
                        continue;
                    }

                    // count internal K/R positions not followed by P; the last
                    // residue is never counted
                    UInt num_cleavages_sites(0);
                    for (Size k = 0; k + 1 < seq.size(); ++k)
                    {
                        if ((seq[k] == 'K' || seq[k] == 'R') && seq[k + 1] != 'P')
                        {
                            ++num_cleavages_sites;
                        }
                    }

                    // allow at most one such internal site
                    if (num_cleavages_sites > 1)
                    {
                        continue;
                    }

                    cand[seq] = z;
                }
            }

            cerr << "#cand=" << cand.size() << endl;

            PeptideIdentification id;
            PILIS_id.getIdentification(cand, id, *it);

            id.setMetaValue("RT", it->getRT());
            id.setMetaValue("MZ", pre_mz);

            ids.push_back(id);

            if (!id.getHits().empty())
            {
                cerr << pre_mz << " " << AASequence(id.getHits().begin()->getSequence()).getAverageWeight() << endl;
                writeDebug_(id.getHits().begin()->getSequence().toString() + " (z=" + id.getHits().begin()->getCharge() + "), score=" + String(id.getHits().begin()->getScore()), 10);
            }
        }

        // perform the PILIS scoring to the spectra
        if (!getFlag_("scoring:do_not_use_evalue_scoring"))
        {
            PILISScoring scoring;
            Param scoring_param(scoring.getParameters());
            scoring_param.setValue("use_local_scoring", (int)getFlag_("scoring:use_local_scoring"));
            scoring_param.setValue("survival_function_bin_size", getIntOption_("scoring:survival_function_bin_size"));
            scoring_param.setValue("global_linear_fitting_threshold", getDoubleOption_("scoring:global_linear_fitting_threshold"));
            scoring_param.setValue("local_linear_fitting_threshold", getDoubleOption_("scoring:local_linear_fitting_threshold"));
            scoring.setParameters(scoring_param);

            scoring.getScores(ids);
        }

        // keep at most "max_candidates" hits per identification
        UInt max_candidates = getIntOption_("max_candidates");
        for (Size i = 0; i != ids.size(); ++i)
        {
            if (ids[i].getHits().size() > max_candidates)
            {
                vector<PeptideHit> hits = ids[i].getHits();
                hits.resize(max_candidates);
                ids[i].setHits(hits);
            }
        }

        //-------------------------------------------------------------
        // writing output
        //-------------------------------------------------------------

        // NOTE(review): relies on DateTime::now() being a static factory that
        // returns the current time; calling it on an instance and ignoring the
        // result (as done before) left the stored date unset.
        DateTime now = DateTime::now();

        String date_string;
        //now.get(date_string); // @todo Fix it (Andreas)
        String identifier("PILIS_" + date_string);

        // ids holds one entry per MS2 spectrum with a precursor, in spectrum
        // order; walk both sequences in lock-step using the same filter
        Size id_index = 0;
        for (RichPeakMap::ConstIterator it = exp.begin(); it != exp.end() && id_index < ids.size(); ++it)
        {
            if (it->getMSLevel() != 2 || it->getPrecursors().empty())
            {
                continue;
            }

            PeptideIdentification & current_id = ids[id_index++];
            current_id.setMetaValue("RT", it->getRT());
            current_id.setMetaValue("MZ", it->getPrecursors()[0].getMZ());

            current_id.setIdentifier(identifier);
            current_id.setHigherScoreBetter(false);
        }

        // search parameters
        ProteinIdentification::SearchParameters search_parameters;
        search_parameters.db = getStringOption_("peptide_db_file");
        search_parameters.db_version = "";
        search_parameters.taxonomy = "";
        //search_parameters.charges = getStringOption_("charges");
        search_parameters.mass_type = ProteinIdentification::MONOISOTOPIC;
        vector<String> fixed_mods;
        getStringOption_("fixed_modifications").split(',', fixed_mods);
        search_parameters.fixed_modifications = fixed_mods;
        search_parameters.enzyme = ProteinIdentification::TRYPSIN;
        search_parameters.missed_cleavages = 1;
        search_parameters.peak_mass_tolerance = getDoubleOption_("peak_mass_tolerance");
        search_parameters.precursor_tolerance = getDoubleOption_("precursor_mass_tolerance");

        ProteinIdentification protein_identification;
        protein_identification.setDateTime(now);
        protein_identification.setSearchEngine("PILIS");
        protein_identification.setSearchEngineVersion("beta");
        protein_identification.setSearchParameters(search_parameters);
        protein_identification.setIdentifier(identifier);

        vector<ProteinIdentification> protein_identifications;
        protein_identifications.push_back(protein_identification);
        IdXMLFile().store(out, protein_identifications, ids);

        return EXECUTION_OK;
    }
Esempio n. 2
0
// Shared state for the PILISModel test sections below.
// Instance under test; allocated and released by the constructor/destructor
// sections and re-created afterwards for the remaining sections.
PILISModel* ptr = 0;
// Null value the freshly constructed pointer is compared against.
PILISModel* nullPointer = 0;
// Sample peptide; not referenced by the sections shown here.
const AASequence peptide = AASequence::fromString("DFPIANGER");
// Default construction must yield a non-null object.
START_SECTION(PILISModel())
  ptr = new PILISModel();
  TEST_NOT_EQUAL(ptr, nullPointer)
END_SECTION

// Destruction of the instance created by the constructor section.
START_SECTION(~PILISModel())
  delete ptr;
END_SECTION

// Fresh instance for the following sections (the previous one was deleted above).
ptr = new PILISModel();

// A copy-constructed model must carry the same parameters as its source.
START_SECTION(PILISModel(const PILISModel& model))
  PILISModel copy(*ptr);
  TEST_EQUAL(copy.getParameters(), ptr->getParameters())
END_SECTION

// An assigned model must carry the same parameters as its source.
START_SECTION(PILISModel& operator = (const PILISModel& mode))
  PILISModel copy;
  copy = *ptr;
  TEST_EQUAL(copy.getParameters(), ptr->getParameters())
END_SECTION

// Deliberately left without its own checks; see the trailing comment.
START_SECTION(void writeGraphMLFile(const String& filename))
  NOT_TESTABLE // will be tested in the next section, to avoid time consuming instantiation
END_SECTION

START_SECTION(void writeToFile(const String& filename))
  String filename;
  /// Trains a PILIS model from annotated spectra.
  ///
  /// Reads the spectra files listed in "in" (MSP or mzML, decided by the type
  /// of the first file), optionally maps the identifications from the idXML
  /// files in "id_in" onto them, filters the annotated spectra (modification
  /// compatibility, sequence length, optional score threshold, charge range),
  /// trains the model on the unique peptides and, if "trained_model_file" is
  /// set, writes the trained model to that file.
  ///
  /// @return INCOMPATIBLE_INPUT_DATA if no input spectra are given or if the
  ///         number of "id_in" files differs from the number of "in" files;
  ///         EXECUTION_OK otherwise.
  ExitCodes main_(int, const char **)
  {
    //-------------------------------------------------------------
    // parameter handling
    //-------------------------------------------------------------

    //input/output files
    StringList in(getStringList_("in"));
    StringList id_in(getStringList_("id_in"));
    String trained_model_file(getStringOption_("trained_model_file"));
    String model_file(getStringOption_("model_file"));
    bool score_filtering(getFlag_("score_filtering"));
    double score_threshold(getDoubleOption_("score_threshold"));
    Int min_charge(getIntOption_("min_charge"));
    Int max_charge(getIntOption_("max_charge"));

    if (in.empty())
    {
      writeLog_("For 'training' mode spectra and identifications are needed.");
      return INCOMPATIBLE_INPUT_DATA;
    }

    // identification files are mapped onto spectra files by position, so the
    // counts have to match; check this before any file is loaded
    if (!id_in.empty() && id_in.size() != in.size())
    {
      writeLog_("If in parameter contains mzML files and id_in contains idXML files, the number should be equal to allow mapping of the identification to the spectra");
      return INCOMPATIBLE_INPUT_DATA;
    }

    //bool duplicates_by_tic(getFlag_("duplicates_by_tic"));
    //bool base_model_from_file(getFlag_("base_model_from_file"));

    // create model, either read from a model file, or initialize with default parameters
    PILISModel model;
    if (model_file != "")
    {
      writeDebug_("Reading model from file '" + model_file + "'", 1);
      model.readFromFile(model_file);
    }
    else
    {
      writeDebug_("Initializing model", 1);
      model.setParameters(getParam_().copy("PILIS_parameters:", true));
      model.init();
    }

    Param pilis_param(model.getParameters());
    ModificationDefinitionsSet mod_set(pilis_param.getValue("fixed_modifications"), pilis_param.getValue("variable_modifications"));

    // read spectra files
    vector<RichPeakMap> exp;
    vector<vector<ProteinIdentification> > prot_ids;
    vector<vector<PeptideIdentification> > pep_ids;

    // the type of the first file decides how all files are read
    FileTypes::Type in_file_type = FileHandler().getType(in[0]);
    writeDebug_("File type of parameter 'in' estimated as '" + FileTypes::typeToName(in_file_type) + "'", 1);
    // TODO check all types
    if (in_file_type == FileTypes::MSP)
    {
      writeDebug_("Reading MSP file", 1);
      MSPFile f;
      exp.resize(in.size());
      pep_ids.resize(in.size());
      for (Size i = 0; i != in.size(); ++i)
      {
        f.load(in[i], pep_ids[i], exp[i]);
        // attach the j-th identification to the j-th spectrum; bounded by both
        // containers so a size mismatch cannot read past the end
        for (Size j = 0; j < exp[i].size() && j < pep_ids[i].size(); ++j)
        {
          exp[i][j].getPeptideIdentifications().push_back(pep_ids[i][j]);
        }
      }
    }
    else if (in_file_type == FileTypes::MZML)
    {
      MzMLFile f;
      f.setLogType(log_type_);

      exp.resize(in.size());
      for (Size i = 0; i != in.size(); ++i)
      {
        f.load(in[i], exp[i]);
      }
    }
    else
    {
      writeLog_("Warning: unsupported file type of parameter 'in', no spectra were loaded");
    }

    if (!id_in.empty())
    {
      prot_ids.resize(id_in.size());
      pep_ids.resize(id_in.size());
      IdXMLFile f;
      for (Size i = 0; i != id_in.size(); ++i)
      {
        f.load(id_in[i], prot_ids[i], pep_ids[i]);
      }

      // map the ids to the spectra (exp holds at most in.size() == id_in.size() maps)
      IDMapper id_mapper;
      for (Size i = 0; i != exp.size(); ++i)
      {
        id_mapper.annotate(exp[i], pep_ids[i], prot_ids[i]);
      }
    }

    // get the peptides and spectra
    vector<PILISCrossValidation::Peptide> peptides;

    // loop-invariant length limit for training sequences
    const UInt visible_model_depth = (UInt)pilis_param.getValue("visible_model_depth");

    for (vector<RichPeakMap>::const_iterator it1 = exp.begin(); it1 != exp.end(); ++it1)
    {
      for (RichPeakMap::ConstIterator it2 = it1->begin(); it2 != it1->end(); ++it2)
      {
        const vector<PeptideIdentification> & spec_ids = it2->getPeptideIdentifications();
        if (spec_ids.empty())
        {
          continue;
        }

        // only the first identification and its first hit are considered
        const PeptideIdentification & first_id = spec_ids.front();
        const vector<PeptideHit> & first_hits = first_id.getHits();
        if (first_hits.empty())
        {
          continue;
        }
        const PeptideHit & hit = first_hits.front();

        // check whether the sequence contains a modification not modelled
        if (!mod_set.isCompatible(hit.getSequence()) || hit.getSequence().size() > visible_model_depth)
        {
          continue;
        }

        // drop hits on the wrong side of the threshold, respecting the score orientation
        if (score_filtering)
        {
          const bool higher_better = first_id.isHigherScoreBetter();
          if ((hit.getScore() < score_threshold && higher_better) ||
              (hit.getScore() > score_threshold && !higher_better))
          {
            continue;
          }
        }

        PILISCrossValidation::Peptide pep_struct;

        // check charges first, so the spectrum and hit list are only copied
        // for peptides that are actually kept
        pep_struct.charge = hit.getCharge();
        if (pep_struct.charge < min_charge || pep_struct.charge > max_charge)
        {
          continue;
        }

        pep_struct.sequence = hit.getSequence();
        pep_struct.spec = *it2;
        pep_struct.hits = first_hits;

        peptides.push_back(pep_struct);
      }
    }


    getUniquePeptides(peptides);
    writeDebug_("Number of (unique) peptides for training: " + String(peptides.size()), 1);

    //model.writeToFile("pilis_tmp.dat");

    model.setParameters(pilis_param);
    for (vector<PILISCrossValidation::Peptide>::const_iterator it = peptides.begin(); it != peptides.end(); ++it)
    {
      model.train(it->spec, it->sequence, it->charge);
    }
    model.evaluate();

    if (trained_model_file != "")
    {
      model.writeToFile(trained_model_file);
    }


    return EXECUTION_OK;
  }