Пример #1
0
/**
  @brief Generates de novo peptide candidates for one CID spectrum and stores the ranked hits in @p id.

  Outline of what the code below does:
  - copy @p CID_spec, filter it with windowMower_() and normalize it ("to_one")
  - derive charge and [M+H]+ from the first precursor (charge defaults to 2, weight to 0)
  - cut off all peaks right of the precursor weight and append two unit-intensity peaks
    (one at the "H2O+" mass, one at the precursor weight)
  - score the ions, generate candidate sequences with getDecompositionsDAC_()
  - pre-score every candidate with zhang_, keep the best "number_of_prescoring_hits",
    re-score those with a SpectrumAlignmentScore and keep the best "number_of_hits"

  @param id        receives the final, ranked peptide hits
  @param CID_spec  spectrum to identify; it is not modified and is also the reference for both scoring rounds
*/
void CompNovoIdentificationCID::getIdentification(PeptideIdentification & id, const PeakSpectrum & CID_spec)
{
    // all preprocessing happens on a private copy
    PeakSpectrum work_spec(CID_spec);
    windowMower_(work_spec, 0.3, 1);

    // set up the member scorer that is used for the pre-scoring round
    Param zhang_settings(zhang_.getParameters());
    zhang_settings.setValue("tolerance", fragment_mass_tolerance_);
    zhang_settings.setValue("use_gaussian_factor", "true");
    zhang_settings.setValue("use_linear_factor", "false");
    zhang_.setParameters(zhang_settings);

    // this normalizer is reused further down for the simulated spectra of the second round
    Normalizer normalizer;
    Param normalizer_settings(normalizer.getParameters());
    normalizer_settings.setValue("method", "to_one");
    normalizer.setParameters(normalizer_settings);
    normalizer.filterSpectrum(work_spec);

    // charge and singly protonated weight, taken from the first precursor if there is one
    Size charge = 2;
    double precursor_weight = 0;     // [M+H]+
    if (!CID_spec.getPrecursors().empty())
    {
        // a charge of 0 means "not annotated": keep the default
        // TODO estimate charge state in that case
        if (CID_spec.getPrecursors().begin()->getCharge() != 0)
        {
            charge = CID_spec.getPrecursors().begin()->getCharge();
        }
        precursor_weight = CID_spec.getPrecursors().begin()->getMZ() * charge - ((charge - 1) * Constants::PROTON_MASS_U);
    }

    // count the leading peaks that are not right of the precursor weight and drop everything after them
    Size kept_peaks = 0;
    PeakSpectrum::ConstIterator peak_it = work_spec.begin();
    while (peak_it != work_spec.end() && !(peak_it->getPosition()[0] > precursor_weight))
    {
        ++peak_it;
        ++kept_peaks;
    }
    if (kept_peaks < work_spec.size())
    {
        work_spec.resize(kept_peaks);
    }

    static double oxonium_mass = EmpiricalFormula("H2O+").getMonoWeight();

    // append two artificial peaks of intensity 1
    Peak1D oxonium_peak;
    oxonium_peak.setIntensity(1);
    oxonium_peak.setPosition(oxonium_mass);
    work_spec.push_back(oxonium_peak);

    Peak1D precursor_peak(oxonium_peak);
    precursor_peak.setPosition(precursor_weight);
    work_spec.push_back(precursor_peak);

    // ion scoring; the spectrum is sorted only after the scores have been computed
    CompNovoIonScoringCID ion_scoring;
    Param ion_scoring_settings(ion_scoring.getParameters());
    ion_scoring_settings.setValue("fragment_mass_tolerance", fragment_mass_tolerance_);
    ion_scoring_settings.setValue("precursor_mass_tolerance", precursor_mass_tolerance_);
    ion_scoring_settings.setValue("decomp_weights_precision", decomp_weights_precision_);
    ion_scoring_settings.setValue("double_charged_iso_threshold", static_cast<double>(param_.getValue("double_charged_iso_threshold")));
    ion_scoring_settings.setValue("max_isotope_to_score", param_.getValue("max_isotope_to_score"));
    ion_scoring_settings.setValue("max_isotope", max_isotope_);
    ion_scoring.setParameters(ion_scoring_settings);

    Map<double, IonScore> ion_scores;
    ion_scoring.scoreSpectrum(ion_scores, work_spec, precursor_weight, charge);

    work_spec.sortByPosition();

#ifdef WRITE_SCORED_SPEC
    // debugging aid: dump the ion scores as a spectrum (position = m/z key, intensity = score)
    PeakSpectrum scored_spec(work_spec);
    scored_spec.clear();
    for (Map<double, CompNovoIonScoringCID::IonScore>::const_iterator score_it = ion_scores.begin(); score_it != ion_scores.end(); ++score_it)
    {
        Peak1D score_peak;
        score_peak.setIntensity(score_it->second.score);
        score_peak.setPosition(score_it->first);
        scored_spec.push_back(score_peak);
    }
    DTAFile().store("spec_scored.dta", scored_spec);
#endif

    // candidate generation over the whole peak range [0, size - 1]
    set<String> sequences;
    getDecompositionsDAC_(sequences, 0, work_spec.size() - 1, precursor_weight, work_spec, ion_scores);

#ifdef SPIKE_IN
    // fixed sequences that are added to the candidate set in SPIKE_IN builds
    static const char * const spike_in_sequences[] =
    {
        "AFCVDGEGR",
        "APEFAAPWPDFVPR",
        "AVKQFEESQGR",
        "CCTESLVNR",
        "DAFLGSFLYEYSR",
        "DAIPENLPPLTADFAEDK",
        "DDNKVEDIWSFLSK",
        "DDPHACYSTVFDK",
        "DEYELLCLDGSR",
        "DGAESYKELSVLLPNR",
        "DGASCWCVDADGR",
        "DLFIPTCLETGEFAR",
        "DTHKSEIAHR",
        "DVCKNYQEAK",
        "EACFAVEGPK",
        "ECCHGDLLECADDR",
        "EFLGDKFYTVISSLK",
        "EFTPVLQADFQK",
        "ELFLDSGIFQPMLQGR",
        "ETYGDMADCCEK",
        "EVGCPSSSVQEMVSCLR",
        "EYEATLEECCAK",
        "FADLIQSGTFQLHLDSK",
        "FFSASCVPGATIEQK",
        "FLANVSTVLTSK",
        "FLSGSDYAIR",
        "FTASCPPSIK",
        "GAIEWEGIESGSVEQAVAK",
        "GDVAFIQHSTVEENTGGK",
        "GEPPSCAEDQSCPSER",
        "GEYVPTSLTAR",
        "GQEFTITGQKR",
        "GTFAALSELHCDK",
        "HLVDEPQNLIK",
        "HQDCLVTTLQTQPGAVR",
        "HTTVNENAPDQK",
        "ILDCGSPDTEVR",
        "KCPSPCQLQAER",
        "KGTEFTVNDLQGK",
        "KQTALVELLK",
        "KVPQVSTPTLVEVSR",
        "LALQFTTNAKR",
        "LCVLHEKTPVSEK",
        "LFTFHADICTLPDTEK",
        "LGEYGFQNALIVR",
        "LHVDPENFK",
        "LKECCDKPLLEK",
        "LKHLVDEPQNLIK",
        "LKPDPNTLCDEFK",
        "LLGNVLVVVLAR",
        "LLVVYPWTQR",
        "LRVDPVNFK",
        "LTDEELAFPPLSPSR",
        "LVNELTEFAK",
        "MFLSFPTTK",
        "MPCTEDYLSLILNR",
        "NAPYSGYSGAFHCLK",
        "NECFLSHKDDSPDLPK",
        "NEPNKVPACPGSCEEVK",
        "NLQMDDFELLCTDGR",
        "QAGVQAEPSPK",
        "RAPEFAAPWPDFVPR",
        "RHPEYAVSVLLR",
        "RPCFSALTPDETYVPK",
        "RSLLLAPEEGPVSQR",
        "SAFPPEPLLCSVQR",
        "SAGWNIPIGTLLHR",
        "SCWCVDEAGQK",
        "SGNPNYPHEFSR",
        "SHCIAEVEK",
        "SISSGFFECER",
        "SKYLASASTMDHAR",
        "SLHTLFGDELCK",
        "SLLLAPEEGPVSQR",
        "SPPQCSPDGAFRPVQCK",
        "SREGDPLAVYLK",
        "SRQIPQCPTSCER",
        "TAGTPVSIPVCDDSSVK",
        "TCVADESHAGCEK",
        "TQFGCLEGFGR",
        "TVMENFVAFVDK",
        "TYFPHFDLSHGSAQVK",
        "TYMLAFDVNDEK",
        "VDEVGGEALGR",
        "VDLLIGSSQDDGLINR",
        "VEDIWSFLSK",
        "VGGHAAEYGAEALER",
        "VGTRCCTKPESER",
        "VKVDEVGGEALGR",
        "VKVDLLIGSSQDDGLINR",
        "VLDSFSNGMK",
        "VLSAADKGNVK",
        "VPQVSTPTLVEVSR",
        "VTKCCTESLVNR",
        "VVAASDASQDALGCVK",
        "VVAGVANALAHR",
        "YICDNQDTISSK",
        "YLASASTMDHAR",
        "YNGVFQECCQAEDK"
    };
    const Size spike_in_count = sizeof(spike_in_sequences) / sizeof(spike_in_sequences[0]);
    for (Size i = 0; i < spike_in_count; ++i)
    {
        sequences.insert(spike_in_sequences[i]);
    }
#endif

    // configured with the same parameters as zhang_, but not used for any scoring call below
    SpectrumAlignmentScore spectra_zhang;
    spectra_zhang.setParameters(zhang_settings);

    // round 1: score every admissible candidate against the unprocessed input spectrum
    vector<PeptideHit> hits;
    Size missed_cleavages = param_.getValue("missed_cleavages");
    for (set<String>::const_iterator seq_it = sequences.begin(); seq_it != sequences.end(); ++seq_it)
    {
        const String & candidate = *seq_it;

        const Size num_missed = countMissedCleavagesTryptic_(candidate);
        if (num_missed > missed_cleavages)
        {
            // more missed cleavages than allowed
            continue;
        }

        PeakSpectrum simulated_spec;
        getCIDSpectrum_(simulated_spec, candidate, charge);

        // note: the simulated spectrum is deliberately not normalized in this round
        PeptideHit hit;
        hit.setScore(zhang_(simulated_spec, CID_spec));
        hit.setSequence(getModifiedAASequence_(candidate));
        hit.setCharge(static_cast<Int>(charge));   //TODO unify charge interface: int or size?
        hits.push_back(hit);
    }

    // let the identification rank the pre-scored hits, then fetch them back in that state
    id.setHits(hits);
    id.assignRanks();
    hits = id.getHits();

    // scorer for round 2
    SpectrumAlignmentScore alignment_score;
    Param alignment_settings(alignment_score.getParameters());
    alignment_settings.setValue("tolerance", fragment_mass_tolerance_);
    alignment_settings.setValue("use_linear_factor", "true");
    alignment_score.setParameters(alignment_settings);

    // only the first number_of_prescoring_hits entries are re-scored
    Size number_of_prescoring_hits = param_.getValue("number_of_prescoring_hits");
    if (hits.size() > number_of_prescoring_hits)
    {
        hits.resize(number_of_prescoring_hits);
    }

    // round 2: normalized simulated spectrum vs. input spectrum, score replaces the round 1 score
    for (Size i = 0; i != hits.size(); ++i)
    {
        PeakSpectrum simulated_spec;
        getCIDSpectrum_(simulated_spec, getModifiedStringFromAASequence_(hits[i].getSequence()), charge);

        normalizer.filterSpectrum(simulated_spec);

        const double cid_score = alignment_score(simulated_spec, CID_spec);
        hits[i].setScore(cid_score);
    }

    id.setHits(hits);
    id.assignRanks();
    hits = id.getHits();

    // hits was just copied from id, so its size is the size of id.getHits()
    Size number_of_hits = param_.getValue("number_of_hits");
    if (hits.size() > number_of_hits)
    {
        hits.resize(number_of_hits);
    }

    id.setHits(hits);
    id.assignRanks();
}
Пример #2
0
/*
 * Compute the residue-level (backbone) alignment for one SSE-level map and
 * store it in that map.
 *
 * Steps, as implemented below:
 *   1. Fill the similarity matrix with closeness_score_for_sse_almt() using
 *      the rotation in map->q, align the residues (smith_waterman_2() when
 *      options.search_algorithm == SEQUENTIAL, out_of_order_alignment()
 *      otherwise) and derive a transformation with map2rotation().
 *   2. Repeat "score - align - map2rotation" for at most 20 steps, stopping
 *      early once the alignment score changes by less than 1% between two
 *      consecutive steps. The transformation with the highest
 *      alignment_score() seen in the loop is remembered.
 *   3. Rescore with closeness_score_for_bb_almt() under that best
 *      transformation, align once more and write the result to map.
 *
 * On success the following fields of map are overwritten: q, T,
 * x2y_residue_level, y2x_residue_level, x2y_residue_l_size,
 * y2x_residue_l_size, res_almt_length, aln_score and res_rmsd. The two
 * residue maps are allocated here and handed over to map; they are not
 * freed by this function.
 *
 * Returns 0 on success. Returns 1 if a matrix or the first residue map
 * cannot be allocated and 2 if the second residue map cannot be allocated;
 * in those cases everything allocated so far is released again. Failure to
 * allocate the coordinate buffers x / y terminates the process via exit(1).
 */
int single_map_align_backbone (Descr *descr1, Protein * protein1, Representation *rep1,
                               Descr *descr2, Protein * protein2, Representation *rep2,
                               Map * map) {

    /* for now, we will just postprocess the best map */

    int no_res_1 = protein1->length, no_res_2 = protein2->length;
    int resctr1, resctr2;
    int map_size;
    int retval = 1; /* pessimistic default, set to 0 only after everything succeeded */
    int max_no_steps = 20, no_steps = 0;
    int done = 0;
    int longest_element_length = (protein1->length > protein2->length) ?
        protein1->length : protein2->length;
    int *residue_map_i2j = NULL, *residue_map_j2i = NULL;
    /* aliases; "element" here means SSE */
    int *element_1_begin = protein1->element_begin;
    int *element_1_end   = protein1->element_end;
    int *element_2_begin = protein2->element_begin;
    int *element_2_end   = protein2->element_end;

    double d0 = options.distance_tol_in_bb_almt;
    double d_mc = d0; /* distance tolerance used inside the refinement loop */
    double aln_score, rmsd;
    double **similarity = NULL;
    double **sim_in_element = NULL;
    double **x = NULL, **y = NULL;
    double **R = NULL, T[3], q[4];
    double total_score = 0.0;
    /* transformation of the current refinement step and the best one seen so far */
    double current_q[4], current_T[3];
    double best_q[4], best_T[3];
    double old_score = 0.0, current_score = 0.0, max_score = 0.0;

    int alignment_size (int * residue_map_i2j, int no_res_1 );
    int closeness_score_for_sse_almt (Descr *descr1, Representation *rep1, Representation *rep2, Map * map,
                                      Protein *protein1, Protein *protein2,
                                      double **R, double *T, double d0, double ** similarity, double * score_ptr);
    int map2rotation (Protein *protein1, Protein *protein2, int *residue_map_i2j,
                      double **x, double **y, double *q, double *T, double *rmsd);
    int out_of_order_alignment (Descr *descr1, Descr *descr2, Map *map, int *element_1_begin, int *element_1_end,
                                int *element_2_begin, int *element_2_end,
                                int longest_element_length, double ** similarity, double ** sim_in_element,
                                int *residue_map_i2j, int *residue_map_j2i, double * score_ptr);

    if ( ! (R = dmatrix (3, 3)) ) return 1; /* nothing to release yet */

    construct_translation_vecs (rep1, rep2, map);

    /* residue x residue matrices: no_res_1 rows, no_res_2 columns */
    similarity = dmatrix (no_res_1, no_res_2);
    if ( !similarity ) goto cleanup;

    /* the original code re-tested "similarity" here, so a failed allocation
       of sim_in_element went unnoticed */
    sim_in_element = dmatrix (no_res_1, no_res_2);
    if ( !sim_in_element ) goto cleanup;

    for (resctr1 = 0; resctr1 < no_res_1; resctr1++) {
        for (resctr2 = 0; resctr2 < no_res_2; resctr2++) {
            similarity[resctr1][resctr2] = -1;
        }
    }

    if ( ! (residue_map_i2j = emalloc (no_res_1*sizeof(int))) ) goto cleanup;
    if ( ! (residue_map_j2i = emalloc (no_res_2*sizeof(int))) ) {
        retval = 2;
        goto cleanup;
    }

    /* NOTE(review): unlike the allocations above these two terminate the
       process on failure; kept as in the original */
    if ( ! (x = dmatrix (3, no_res_1+no_res_2)))  exit(1);
    if ( ! (y = dmatrix (3, no_res_1+no_res_2)))  exit(1);

    /************************************************************/
    /* ALIGNMENT, round 1                                       */
    /************************************************************/
    /* for all mapped blocks calculate similarity as exp (-d/d0) */
    total_score = 0.0;
    quat_to_R (map->q, R);

    closeness_score_for_sse_almt (descr1, rep1, rep2, map, protein1, protein2,
                                  R, NULL, d0, similarity, &total_score);
    /* run Smith-Waterman and use the mapped CA to find the transformation        */
    /* I have another copy of SW here (the first one is in struct_map.c)          */
    /* so I wouldn't fumble with parameters - the two should be joined eventually */
    if ( options.search_algorithm == SEQUENTIAL ) {
        smith_waterman_2 (no_res_1, no_res_2, similarity,
                          residue_map_i2j, residue_map_j2i, &aln_score);
    } else {
        out_of_order_alignment (descr1, descr2, map, element_1_begin, element_1_end,
                                element_2_begin, element_2_end, longest_element_length,
                                similarity, sim_in_element,
                                residue_map_i2j, residue_map_j2i, &aln_score);
    }

    map2rotation (protein1, protein2, residue_map_i2j, x, y, q, T, &rmsd);

    quat_to_R (q, R);
    current_score = alignment_score (protein1, protein2, residue_map_i2j, R, T, d0);

    /*********************************************************/
    /* fiddle iteratively with the transformation            */

    /* NOTE(review): nothing visible in this function draws from the drand48
       family; the reseed is kept because it changes global state that code
       elsewhere may rely on */
    srand48 (time (0));

    /* both the running and the best transformation start from round 1 */
    memcpy (current_q, q, sizeof(current_q));
    memcpy (   best_q, q, sizeof(best_q));
    memcpy (current_T, T, sizeof(current_T));
    memcpy (   best_T, T, sizeof(best_T));

    quat_to_R (current_q, R);

    while (no_steps < max_no_steps && !done ) {

        closeness_score_for_sse_almt (descr1, NULL, NULL, map, protein1, protein2,
                                      R, current_T, d_mc, similarity, &total_score);

        if ( options.search_algorithm == SEQUENTIAL ) {
            smith_waterman_2 (no_res_1, no_res_2, similarity,
                              residue_map_i2j, residue_map_j2i, &aln_score);
        } else {
            out_of_order_alignment (descr1, descr2, map, element_1_begin, element_1_end,
                                    element_2_begin, element_2_end, longest_element_length,
                                    similarity, sim_in_element,
                                    residue_map_i2j, residue_map_j2i, &aln_score);
        }

        map2rotation (protein1, protein2, residue_map_i2j, x, y, current_q, current_T, &rmsd);

        quat_to_R (current_q, R);
        current_score = alignment_score (protein1, protein2, residue_map_i2j, R, current_T, d_mc);

        if ( current_score > max_score ) {
            max_score = current_score;
            memcpy (best_q, current_q, sizeof(best_q));
            memcpy (best_T, current_T, sizeof(best_T));
        }

        /* converged when the relative change drops below 1%; the test is
           skipped while old_score is still 0 to avoid dividing by zero */
        if (old_score) done = ( fabs(old_score-current_score)/old_score < 0.01);
        old_score = current_score;
        no_steps++;
    }

    memcpy (q, best_q, sizeof(q));
    memcpy (T, best_T, sizeof(T));
    /* R still holds the rotation of the LAST step, which need not be the best
       one; rebuild it so that round 2 scores with a matching (R, T) pair */
    quat_to_R (q, R);

    /************************************************************/
    /* ALIGNMENT, round 2                                       */
    /************************************************************/
    /* find the similarity matrix for this new rotation  -- this*/
    /* time extending to neighboring elements                   */
    closeness_score_for_bb_almt (map, protein1, protein2, R, T, d0,
                                 similarity, &total_score);

    memset (residue_map_i2j, 0, no_res_1*sizeof(int));
    memset (residue_map_j2i, 0, no_res_2*sizeof(int));

    if ( options.search_algorithm == SEQUENTIAL ) {
        smith_waterman_2 (no_res_1, no_res_2, similarity,
                          residue_map_i2j, residue_map_j2i, &aln_score);
    } else {
        out_of_order_alignment (descr1, descr2, map, element_1_begin, element_1_end,
                                element_2_begin, element_2_end, longest_element_length,
                                similarity, sim_in_element,
                                residue_map_i2j, residue_map_j2i, &aln_score);
    }

    map2rotation (protein1, protein2, residue_map_i2j, x, y, q, T, &rmsd);

    quat_to_R (q, R);

    map_size = alignment_size (residue_map_i2j, protein1->length);

    /* publish the result */
    memcpy (&(map->q[0]), &q[0], 4*sizeof(double));
    memcpy (&(map->T[0]), &T[0], 3*sizeof(double));

    map->x2y_residue_level  = residue_map_i2j;
    map->y2x_residue_level  = residue_map_j2i;

    map->x2y_residue_l_size = no_res_1;
    map->y2x_residue_l_size = no_res_2;

    map->res_almt_length    = map_size;
    map->aln_score          = aln_score;
    map->res_rmsd           = rmsd;

    /* the map owns the residue maps from here on: keep cleanup away from them */
    residue_map_i2j = NULL;
    residue_map_j2i = NULL;
    retval = 0;

 cleanup:
    /* shared by the success and the failure path; only what was actually
       allocated (and not handed over to map) is released */
    free (residue_map_i2j); /* free (NULL) is a no-op */
    free (residue_map_j2i);
    if (y)              free_dmatrix (y);
    if (x)              free_dmatrix (x);
    if (sim_in_element) free_dmatrix (sim_in_element);
    if (similarity)     free_dmatrix (similarity);
    if (R)              free_dmatrix (R);

    return retval;
}