예제 #1
0
/**
  @brief Divide-and-conquer step of the sequencing.

  Collects candidate sequences for the mass gap between the peaks at index
  @p left and @p right of @p CID_spec and stores them in @p sequences.

  Control flow:
  - If subspec_to_sequences_ already holds an entry for (left, right), that
    entry is copied into @p sequences and the function returns.
  - If the gap is smaller than min_aa_weight_, the function returns without
    touching @p sequences or the cache.
  - If the gap is at most max_decomp_weight_, the gap is decomposed via
    getDecompositions_(), every decomposition is permuted, the result is
    filtered/reduced and written to the cache.
  - Otherwise pivot peaks are selected, both sub-gaps are solved recursively,
    the partial results are concatenated (right part followed by left part),
    reduced, and written to the cache.

  @param sequences      Output set of candidate sequences (overwritten on a cache hit, extended otherwise)
  @param left           Index of the lower-position peak in @p CID_spec
  @param right          Index of the higher-position peak in @p CID_spec
  @param peptide_weight Weight used to derive the prefix offset of @p right
  @param CID_spec       Spectrum whose peak positions define the gaps
  @param ion_scores     Ion scores, forwarded to selectPivotIons_()
*/
void CompNovoIdentificationCID::getDecompositionsDAC_(set<String> & sequences, Size left, Size right, double peptide_weight, const PeakSpectrum & CID_spec, Map<double, CompNovoIonScoringCID::IonScore> & ion_scores)
{
    static const double oxonium_mass = EmpiricalFormula("H2O+").getMonoWeight();

    // distance of the left peak from the oxonium mass / of the right peak from the full peptide weight
    double offset_suffix(CID_spec[left].getPosition()[0] - oxonium_mass);
    double offset_prefix(peptide_weight - CID_spec[right].getPosition()[0]);

#ifdef DAC_DEBUG
    static Int depth_(0);
    ++depth_;
    String tabs_(depth_, '\t');
    cerr << tabs_ << "void getDecompositionsDAC(sequences[" << sequences.size() << "], " << left << ", " << right << ") ";
    cerr << CID_spec[left].getPosition()[0] << " " << CID_spec[right].getPosition()[0] << " diff=";
#endif

    // mass gap that has to be explained by residues
    double diff = CID_spec[right].getPosition()[0] - CID_spec[left].getPosition()[0];

#ifdef DAC_DEBUG
    cerr << diff << endl;
    cerr << "offset_prefix=" << offset_prefix << ", offset_suffix=" << offset_suffix << endl;
#endif

    // already solved for this pair of peaks?
    if (subspec_to_sequences_.has(left) && subspec_to_sequences_[left].has(right))
    {
        sequences = subspec_to_sequences_[left][right];

#ifdef DAC_DEBUG
        depth_--;
        cerr << tabs_ << "from cache DAC: " << CID_spec[left].getPosition()[0] << " " << CID_spec[right].getPosition()[0] << " " << sequences.size() << " " << left << " " << right << endl;
#endif
        return;
    }

    // no further solutions possible?
    if (diff < min_aa_weight_)
    {
#ifdef DAC_DEBUG
        depth_--;
#endif
        return;
    }

    // no further division needed?
    if (diff <= max_decomp_weight_)
    {
        vector<MassDecomposition> decomps;

        // if the right peak is within precursor_mass_tolerance_ of the peptide
        // weight, decompose with precursor_mass_tolerance_ and restore the
        // fragment tolerance afterwards
        if (offset_prefix < precursor_mass_tolerance_)
        {
            Param decomp_param(mass_decomp_algorithm_.getParameters());
            decomp_param.setValue("tolerance", precursor_mass_tolerance_);
            mass_decomp_algorithm_.setParameters(decomp_param);
            getDecompositions_(decomps, diff);
            decomp_param.setValue("tolerance", fragment_mass_tolerance_);
            mass_decomp_algorithm_.setParameters(decomp_param);
        }
        else
        {
            getDecompositions_(decomps, diff);
        }
        //filterDecomps_(decomps);

#ifdef DAC_DEBUG
        cerr << tabs_ << "Found " << decomps.size() << " decomps" << endl;
        cerr << tabs_ << "Permuting...";
#endif

        //static Map<String, set<String> > permute_cache;
        for (vector<MassDecomposition>::const_iterator it = decomps.begin(); it != decomps.end(); ++it)
        {
#ifdef DAC_DEBUG
            cerr << it->toString() << endl;
#endif

            String exp_string = it->toExpandedString();

            // Permute into a scratch set so that the cache entry for exp_string
            // holds the permutations of this decomposition only, independent of
            // what other decompositions already put into 'sequences'.
            // (assumes permute_() only adds to its output set -- TODO confirm)
            if (!permute_cache_.has(exp_string))
            {
                set<String> permutations;
                permute_("", exp_string, permutations);
                permute_cache_[exp_string] = permutations;
            }

            // Merge instead of assign: a cache hit must not discard the
            // permutations contributed by earlier decompositions of this gap.
            const set<String> & cached_permutations = permute_cache_[exp_string];
            sequences.insert(cached_permutations.begin(), cached_permutations.end());
        }

#ifdef DAC_DEBUG
        cerr << tabs_ << CID_spec[left].getPosition()[0] << " " << CID_spec[right].getPosition()[0] << " " << peptide_weight << endl;
        if (sequences.size() > max_subscore_number_)
        {
            cerr << tabs_ << "Reducing #sequences from " << sequences.size() << " to " << max_subscore_number_ << "(prefix=" << offset_prefix  << ", suffix=" << offset_suffix << ")...";
        }
#endif

        // C-terminus
        // NOTE(review): this branch compares against precursor_mass_tolerance_,
        // the corresponding check in the pivot loop below uses
        // fragment_mass_tolerance_ -- confirm which one is intended.
        if (offset_suffix <= precursor_mass_tolerance_)
        {
            filterPermuts_(sequences);
        }

        // reduce the sequences
        reducePermuts_(sequences, CID_spec, offset_prefix, offset_suffix);
#ifdef DAC_DEBUG
        cerr << "Writing to cache " << left << " " << right << endl;
#endif
        subspec_to_sequences_[left][right] = sequences;

#ifdef DAC_DEBUG
        cerr << "ended" << endl;
        cerr << tabs_ << "DAC: " << CID_spec[left].getPosition()[0] << " " << CID_spec[right].getPosition()[0] << " " << sequences.size() << endl;
        depth_--;
#endif

        return;
    }

    // select suitable pivot peaks; the last argument is true only if both
    // offsets are below the precursor tolerance
    vector<Size> pivots;
    const bool both_offsets_within_tolerance = (offset_suffix < precursor_mass_tolerance_ && offset_prefix < precursor_mass_tolerance_);
    selectPivotIons_(pivots, left, right, ion_scores, CID_spec, peptide_weight, both_offsets_within_tolerance);

    // run divide step
#ifdef DAC_DEBUG
    cerr << tabs_ << "Selected " << pivots.size() << " pivot ions: ";
    for (vector<Size>::const_iterator it = pivots.begin(); it != pivots.end(); ++it)
    {
        cerr << *it << "(" << CID_spec[*it].getPosition()[0] << ") ";
    }
    cerr << endl;
#endif

    for (vector<Size>::const_iterator it = pivots.begin(); it != pivots.end(); ++it)
    {
        // seq1: candidates for [left, pivot]; seq2: candidates for [pivot, right]
        set<String> seq1, seq2, new_sequences;

        // the smaller the 'gap' the greater the chance of not finding anything
        // so we compute the smaller gap first
        double diff1(CID_spec[*it].getPosition()[0] - CID_spec[left].getPosition()[0]);
        double diff2(CID_spec[right].getPosition()[0] - CID_spec[*it].getPosition()[0]);

        if (diff1 < diff2)
        {
            getDecompositionsDAC_(seq1, left, *it, peptide_weight, CID_spec, ion_scores);
            if (seq1.empty())
            {
#ifdef DAC_DEBUG
                cerr << tabs_ << "first call produced 0 candidates (" << diff1 << ")" << endl;
#endif
                continue;
            }

            getDecompositionsDAC_(seq2, *it, right, peptide_weight, CID_spec, ion_scores);
        }
        else
        {
            getDecompositionsDAC_(seq2, *it, right, peptide_weight, CID_spec, ion_scores);
            if (seq2.empty())
            {
#ifdef DAC_DEBUG
                cerr << tabs_ << "second call produced 0 candidates (" << diff2 << ")" << endl;
#endif
                continue;
            }

            getDecompositionsDAC_(seq1, left, *it, peptide_weight, CID_spec, ion_scores);
        }

#ifdef DAC_DEBUG
        cerr << tabs_ << "Found " << seq1.size() << " solutions (1) " << diff1 << endl;
        cerr << tabs_ << "Found " << seq2.size() << " solutions (2) " << diff2 << endl;
        cerr << tabs_ << "inserting " << seq1.size() * seq2.size()  << " sequences" << endl;
#endif

        // C-terminus
        if (offset_suffix <= fragment_mass_tolerance_)
        {
            filterPermuts_(seq1);
        }

        // test if we found enough sequence candidates
        // (seq1 may have been emptied by the filter above)
        if (seq1.empty() || seq2.empty())
        {
            continue;
        }

        // combine every right-hand candidate with every left-hand candidate
        for (set<String>::const_iterator it1 = seq1.begin(); it1 != seq1.end(); ++it1)
        {
            for (set<String>::const_iterator it2 = seq2.begin(); it2 != seq2.end(); ++it2)
            {
                new_sequences.insert(*it2 + *it1);
            }
        }

        if (seq1.size() * seq2.size() > max_subscore_number_ /* && (offset_prefix > fragment_mass_tolerance_ || offset_suffix > fragment_mass_tolerance_)*/)
        {
#ifdef DAC_DEBUG
            cerr << tabs_ << CID_spec[left].getPosition()[0] << " " << CID_spec[right].getPosition()[0] << " " << peptide_weight << endl;
            cerr << tabs_ << "Reducing #sequences from " << new_sequences.size() << " to " << max_subscore_number_ << "(prefix=" << offset_prefix  << ", suffix=" << offset_suffix << ")...";
#endif
            // only reduce if at least one end of the span is outside the precursor tolerance
            if (offset_prefix > precursor_mass_tolerance_ || offset_suffix > precursor_mass_tolerance_)
            {
                reducePermuts_(new_sequences, CID_spec, offset_prefix, offset_suffix);
            }

#ifdef DAC_DEBUG
            for (set<String>::const_iterator it1 = new_sequences.begin(); it1 != new_sequences.end(); ++it1)
            {
                cerr << tabs_ << *it1 << endl;
            }
            cerr << endl;
#endif
        }

        // accumulate the candidates of this pivot
        sequences.insert(new_sequences.begin(), new_sequences.end());
    }
#ifdef DAC_DEBUG
    cerr << tabs_ << "Found sequences for " << CID_spec[left].getPosition()[0] << " " << CID_spec[right].getPosition()[0] << endl;
    for (set<String>::const_iterator sit = sequences.begin(); sit != sequences.end(); ++sit)
    {
        cerr << tabs_ << *sit << endl;
    }
#endif

    // reduce the permuts once again to reduce complexity
    if (offset_prefix > precursor_mass_tolerance_ || offset_suffix > precursor_mass_tolerance_)
    {
        reducePermuts_(sequences, CID_spec, offset_prefix, offset_suffix);
    }

#ifdef DAC_DEBUG
    cerr << "Writing to cache " << left << " " << right << endl;
#endif

    subspec_to_sequences_[left][right] = sequences;

#ifdef DAC_DEBUG
    depth_--;
    cerr << tabs_ << "DAC: " << CID_spec[left].getPosition()[0] << " " << CID_spec[right].getPosition()[0] << " " << sequences.size() << endl;
#endif
}
  /**
    @brief Re-reads the settings from param_ and rebuilds the derived tables.

    Steps, in order:
    - fills aa_to_weight_ with the internal mono weight of every residue of
      the configured "residue_set", keyed by one-letter code;
    - copies the scalar settings from param_ into the member variables;
    - applies the fixed modifications to the weight of their origin residue;
    - maps each variable modification to one of the letters 'a'..'z' and
      stores its weight under that letter;
    - registers the modified residues in name_to_residue_ / residue_to_name_;
    - re-initialises the isotope distributions and the mass decomposition
      algorithm parameters;
    - recomputes min_aa_weight_ as the smallest value in aa_to_weight_.

    Modifications that cannot be handled are reported on cerr and skipped.
  */
  void CompNovoIdentificationBase::updateMembers_()
  {
    // init residue mass table
    // NOTE(review): aa_to_weight_ is not cleared here (unlike the two residue
    // maps below), so entries from a previous call may persist -- confirm
    // whether that is intended.
    String residue_set(param_.getValue("residue_set"));

    set<const Residue *> residues = ResidueDB::getInstance()->getResidues(residue_set);
    for (set<const Residue *>::const_iterator it = residues.begin(); it != residues.end(); ++it)
    {
      aa_to_weight_[(*it)->getOneLetterCode()[0]] = (*it)->getMonoWeight(Residue::Internal);
    }

    max_number_aa_per_decomp_ = param_.getValue("max_number_aa_per_decomp");
    tryptic_only_ = param_.getValue("tryptic_only").toBool();
    fragment_mass_tolerance_ = (DoubleReal)param_.getValue("fragment_mass_tolerance");
    max_number_pivot_ = param_.getValue("max_number_pivot");
    decomp_weights_precision_ = (DoubleReal)param_.getValue("decomp_weights_precision");
    min_mz_ = (DoubleReal)param_.getValue("min_mz");
    max_mz_ = (DoubleReal)param_.getValue("max_mz");
    max_decomp_weight_ = (DoubleReal)param_.getValue("max_decomp_weight");
    max_subscore_number_ = param_.getValue("max_subscore_number");
    max_isotope_ = param_.getValue("max_isotope");

    name_to_residue_.clear();
    residue_to_name_.clear();

    // now handle the modifications
    ModificationDefinitionsSet mod_set((StringList)param_.getValue("fixed_modifications"), (StringList)param_.getValue("variable_modifications"));

    // fixed modifications: the weight of the origin residue itself is replaced/shifted
    set<ModificationDefinition> fixed_mods = mod_set.getFixedModifications();
    for (set<ModificationDefinition>::const_iterator it = fixed_mods.begin(); it != fixed_mods.end(); ++it)
    {
      ResidueModification mod = ModificationsDB::getInstance()->getModification(it->getModification());

      // only modifications bound to exactly one concrete residue letter are supported
      if (mod.getOrigin().size() != 1 || mod.getOrigin() == "X")
      {
        cerr << "Warning: cannot handle modification " << it->getModification() << ", because aa is ambiguous (" << mod.getOrigin() << "), ignoring modification!" << endl;
        continue;
      }
      const char aa = mod.getOrigin()[0];

      // prefer the absolute mono mass; fall back to the mass difference
      if (mod.getMonoMass() != 0)
      {
        aa_to_weight_[aa] = mod.getMonoMass();
      }
      else if (mod.getDiffMonoMass() != 0)
      {
        aa_to_weight_[aa] += mod.getDiffMonoMass();
      }
      else
      {
        cerr << "Warning: cannot handle modification " << it->getModification() << ", because no monoisotopic mass value was found! Ignoring modification!" << endl;
        continue;
      }

      //cerr << "Setting fixed modification " << it->getModification() << " of amino acid '" << aa << "'; weight = " << aa_to_weight_[aa] << endl;

      const Residue * res = ResidueDB::getInstance()->getModifiedResidue(it->getModification());
      name_to_residue_[aa] = res;
      residue_to_name_[res] = aa;
    }

    // variable modifications: each one gets its own placeholder letter
    const StringList mod_names(StringList::create("a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z"));
    vector<String>::const_iterator actual_mod_name = mod_names.begin();
    set<ModificationDefinition> var_mods = mod_set.getVariableModifications();
    for (set<ModificationDefinition>::const_iterator it = var_mods.begin(); it != var_mods.end(); ++it)
    {
      // Guard against running past the last placeholder letter; dereferencing
      // the end iterator would be undefined behaviour.
      if (actual_mod_name == mod_names.end())
      {
        cerr << "CompNovoIdentificationBase: Warning: cannot handle modification " << it->getModification() << ", because all " << mod_names.size() << " placeholder letters are already in use! Ignoring modification!" << endl;
        continue;
      }

      ResidueModification mod = ModificationsDB::getInstance()->getModification(it->getModification());

      // the letter is consumed even if the modification is skipped below
      const char aa = (*actual_mod_name)[0];
      ++actual_mod_name;

      if (mod.getOrigin().size() != 1 || mod.getOrigin() == "X")
      {
        cerr << "CompNovoIdentificationBase: Warning: cannot handle modification " << it->getModification() << ", because aa is ambiguous (" << mod.getOrigin() << "), ignoring modification!" << endl;
        continue;
      }
      const char origin_aa = mod.getOrigin()[0];

      // prefer the absolute mono mass; otherwise shift the weight of the origin residue
      if (mod.getMonoMass() != 0)
      {
        aa_to_weight_[aa] = mod.getMonoMass();
      }
      else if (mod.getDiffMonoMass() != 0)
      {
        aa_to_weight_[aa] = aa_to_weight_[origin_aa] + mod.getDiffMonoMass();
      }
      else
      {
        cerr << "CompNovoIdentificationBase: Warning: cannot handle modification " << it->getModification() << ", because no monoisotopic mass value was found! Ignoring modification!" << endl;
        continue;
      }

      //cerr << "Mapping variable modification " << it->getModification() << " to letter '" << aa << "' (@" << origin_aa << "); weight = " << aa_to_weight_[aa] << endl;
      const Residue * res = ResidueDB::getInstance()->getModifiedResidue(it->getModification());
      name_to_residue_[aa] = res;
      residue_to_name_[res] = aa;
    }

    /*
    cerr << "Following masses are used for identification: " << endl;

    for (Map<char, DoubleReal>::const_iterator it = aa_to_weight_.begin(); it != aa_to_weight_.end(); ++it)
    {
        cerr << it->first << " " << precisionWrapper(it->second) << endl;
    }*/

    initIsotopeDistributions_();

    // hand tolerance and modification settings on to the mass decomposition algorithm
    Param decomp_param(mass_decomp_algorithm_.getParameters());
    decomp_param.setValue("tolerance", fragment_mass_tolerance_);
    decomp_param.setValue("fixed_modifications", (StringList)param_.getValue("fixed_modifications"));
    decomp_param.setValue("variable_modifications", (StringList)param_.getValue("variable_modifications"));
    mass_decomp_algorithm_.setParameters(decomp_param);

    // smallest weight in the table; stays at numeric max if the table is empty
    min_aa_weight_ = numeric_limits<DoubleReal>::max();
    for (Map<char, DoubleReal>::const_iterator it = aa_to_weight_.begin(); it != aa_to_weight_.end(); ++it)
    {
      if (min_aa_weight_ > it->second)
      {
        min_aa_weight_ = it->second;
      }
    }
  }