/**
 * Summarize a list of boxes as "<initials>\t<positions>".
 *
 * <initials> is the concatenation of getInitial() of every box, in order.
 * <positions> lists, for every box, its start then its end, each shifted
 * by FIRST_POS and each preceded by a single space.
 */
string posFromBoxes(vector <AlignBox*> boxes)
{
  string initials;
  string positions;

  for (vector<AlignBox*>::const_iterator it = boxes.begin(); it != boxes.end(); ++it)
    {
      AlignBox *box = *it;

      initials += box->getInitial();

      positions += " ";
      positions += string_of_int(box->start + FIRST_POS);
      positions += " ";
      positions += string_of_int(box->end + FIRST_POS);
    }

  return initials + "\t" + positions;
}
/**
 * Build the textual code of a chain of boxes.
 *
 * The result is the ref_label of every box, in order. Between two consecutive
 * boxes, the following junction is inserted:
 *   " <del_right of left box>/<insertion>/<del_left of right box> "
 * where <insertion> is the part of 'sequence' lying strictly between the two boxes.
 */
string codeFromBoxes(vector <AlignBox*> boxes, string sequence)
{
  string result;
  const int count = boxes.size();

  for (int idx = 0; idx < count; idx++)
    {
      AlignBox *current = boxes[idx];

      if (idx > 0)
        {
          AlignBox *previous = boxes[idx-1];

          result += " ";
          result += string_of_int(previous->del_right);
          result += "/";
          // From previous->end + 1 to current->start - 1, both positions included
          result += sequence.substr(previous->end + 1, current->start - previous->end - 1);
          result += "/";
          result += string_of_int(current->del_left);
          result += " ";
        }

      result += current->ref_label;
    }

  return result;
}
// Generate 10 sequences, and launch 10 times getRandom(1). // We should not have the same sequence 10 times (p < 10^{-10}) void testRandom() { list<Sequence> seqs; string seg_name = "seq"; char id = '0'; string sequence = "AA"; for (int i = 0; i < 10; i++) { seqs.push_back(create_sequence("seq" + string_of_int(id), "seq" + string_of_int(id), sequence, "")); sequence += "A"; id++; } SequenceSampler sampler(seqs); string first_random = sampler.getRandom(1).front().label; bool all_equal = true; for (int i = 0; i < 9 && all_equal; i++) { if (first_random != sampler.getRandom(1).front().label) all_equal = false; } TAP_TEST(all_equal == false, TEST_SAMPLER_RANDOM, "On the 10 trials, we drawn 10 times the same sequence"); }
/**
 * Build the one-line textual summary of this segmentation.
 *
 * Fields, in order: "! " prefix when the sequence is not segmented, info,
 * info_extra, germline code, message associated with 'because', then the
 * e-values that are above NO_LIMIT_VALUE, and finally the junction
 * description when CDR3start is positive.
 */
string Segmenter::getInfoLine() const
{
  string line;

  if (!segmented)
    line += "! ";
  line += info;

  line += " " + info_extra;
  line += " " + segmented_germline->code;
  line += " " + string(segmented_mesg[because]);

  // E-values: global, then left, then "/right"
  if (evalue > NO_LIMIT_VALUE)
    line += " " + scientific_string_of_double(evalue);

  if (evalue_left > NO_LIMIT_VALUE)
    line += " " + scientific_string_of_double(evalue_left);

  if (evalue_right > NO_LIMIT_VALUE)
    line += "/" + scientific_string_of_double(evalue_right);

  // Junction: {start(length)end <u|p> <amino acids>}
  if (CDR3start > 0)
    {
      // 'u' when JUNCTIONproductive is 0, 'p' when it is 1
      const char productive_flag = "up"[JUNCTIONproductive];

      line += " {";
      line += string_of_int(JUNCTIONstart);
      line += "(";
      line += string_of_int(JUNCTIONend-JUNCTIONstart+1);
      line += ")";
      line += string_of_int(JUNCTIONend);
      line += " ";
      line += productive_flag;
      line += " ";
      line += JUNCTIONaa;
      line += "}";
    }

  return line;
}
/**
 * Remove the overlap ("chevauchement") between box_V and box_J, if any.
 *
 * When box_V->end >= box_J->start, both boundaries are moved to the middle
 * point so that the boxes become adjacent.
 *
 * @pre   isSegmented()
 * @return " !ov <n>" where <n> is the size of the removed overlap,
 *         or the empty string when the boxes did not overlap.
 */
string Segmenter::removeChevauchement()
{
  assert(isSegmented());

  // Nothing to do when V ends strictly before J starts
  if (box_V->end < box_J->start)
    return "";

  // Both values must be computed before the boxes are updated
  const int overlap_size = box_V->end - box_J->start + 1;
  const int split_pos = (box_V->end + box_J->start) / 2;

  box_V->end = split_pos;
  box_J->start = split_pos + 1;

  return " !ov " + string_of_int(overlap_size);
}
/**
 * Segment 'seq' against 'germline' by aligning it to the germline's
 * rep_5 and rep_3 collections.
 *
 * @param seq         sequence to segment (its label and sequence are copied)
 * @param germline    germline to align against; with SEG_METHOD_MAX12 its
 *                    rep_5/rep_3 are overridden from the k-mer labels
 * @param segment_c   alignment cost, also used to turn scores into p-values
 * @param threshold   threshold passed to checkLeftRightEvaluesThreshold()
 * @param multiplier  factor applied to both e-values
 */
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,
                             double threshold, double multiplier)
{
  // Boxes
  box_V = new AlignBox("5");
  box_D = new AlignBox("4");
  box_J = new AlignBox("3");
  box_V->marked_pos = 0;
  box_J->marked_pos = 0;

  // Input data
  label = seq.label;
  sequence = seq.sequence;
  segmented_germline = germline;
  segment_cost = segment_c;

  // Initial state: nothing segmented, no e-value, no CDR3/junction
  segmented = false;
  dSegmented = false;
  because = NOT_PROCESSED;
  info_extra = "";
  evalue = NO_LIMIT_VALUE;
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
  CDR3start = -1;
  CDR3end = -1;
  JUNCTIONstart = -1;
  JUNCTIONend = -1;

  bool v_is_reversed = false;
  bool j_is_reversed = false;

  const bool needs_kmer_probe = (germline->seg_method == SEG_METHOD_MAX12)
    || (germline->seg_method == SEG_METHOD_MAX1U);

  if (needs_kmer_probe)
    {
      // MAX12 / MAX1U: the sequence must first be segmented by a KmerSegmenter
      // (with default e-value parameters)
      KmerSegmenter *probe = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);

      if (!probe->isSegmented())
        {
          delete probe;
          return;
        }

      reversed = probe->isReverse();

      // On a reversed sequence, the two affects are swapped and built
      // through the two-argument KmerAffect constructor
      KmerAffect left_affect = reversed ? KmerAffect(probe->after, true) : probe->before;
      KmerAffect right_affect = reversed ? KmerAffect(probe->before, true) : probe->after;

      delete probe;

      v_is_reversed = (left_affect.getStrand() == -1);
      j_is_reversed = (right_affect.getStrand() == -1);

      code = "Unexpected ";
      code += left_affect.toStringSigns() + germline->index->getLabel(left_affect).basename;
      code += "/";
      code += right_affect.toStringSigns() + germline->index->getLabel(right_affect).basename;

      info_extra += " " + left_affect.toString() + "/" + right_affect.toString() + " (" + code + ")";

      // MAX1U stops here, without any alignment
      if (germline->seg_method == SEG_METHOD_MAX1U)
        return;

      germline->override_rep5_rep3_from_labels(left_affect, right_affect);
    }

  // Strand determination, with a KmerSegmenter (with default e-value parameters).
  // Only its 'strand' component is used: when the KmerSegmenter fails,
  // we go on with the positive strand.
  // TODO: flag to force a strand / to test both strands ?
  KmerSegmenter *strand_probe = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);
  reversed = strand_probe->isReverse();
  delete strand_probe;

  // The sequence, possibly reverse-complemented
  sequence_or_rc = revcomp(sequence, reversed);

  /* Segmentation */
  align_against_collection(sequence_or_rc, germline->rep_5, NO_FORBIDDEN_ID,
                           v_is_reversed, v_is_reversed, false,
                           box_V, segment_cost);

  align_against_collection(sequence_or_rc, germline->rep_3, NO_FORBIDDEN_ID,
                           j_is_reversed, !j_is_reversed, false,
                           box_J, segment_cost);

  // J was aligned with '!j_is_reversed': copy the box information from right to left.
  // Should this directly be handled in align_against_collection() ?
  box_J->start = box_J->end;
  box_J->del_left = box_J->del_right;

  /* E-values */
  evalue_left = multiplier * sequence.size() * germline->rep_5.totalSize() * segment_cost.toPValue(box_V->score[0].first);
  evalue_right = multiplier * sequence.size() * germline->rep_3.totalSize() * segment_cost.toPValue(box_J->score[0].first);
  evalue = evalue_left + evalue_right;

  /* Unsegmentation causes */
  if (box_V->end == static_cast<int>(string::npos))
    evalue_left = BAD_EVALUE;

  if (box_J->start == static_cast<int>(string::npos))
    evalue_right = BAD_EVALUE;

  const int strand = reversed ? -1 : 1;
  checkLeftRightEvaluesThreshold(threshold, strand);

  if (because != NOT_PROCESSED)
    {
      // An unsegmentation cause was set: only report the two positions
      segmented = false;
      info = " @" + string_of_int (box_V->end + FIRST_POS) + " @" + string_of_int(box_J->start + FIRST_POS);
      return;
    }

  /* The sequence is segmented */
  segmented = true;
  if (reversed)
    because = SEG_MINUS;
  else
    because = SEG_PLUS;

  // Overlap between V and J
  seg_N = check_and_resolve_overlap(sequence_or_rc, 0, sequence_or_rc.length(),
                                    box_V, box_J, segment_cost);

  // Reset extreme positions
  box_V->start = 0;
  box_J->end = sequence.length()-1;

  // Why could this happen ?
  if (box_J->start >= static_cast<int>(sequence.length()))
    box_J->start = sequence.length()-1;

  // seg_N will be recomputed in finishSegmentation()
  boxes.clear();
  boxes.push_back(box_V);
  boxes.push_back(box_J);

  code = codeFromBoxes(boxes, sequence_or_rc);
  info = posFromBoxes(boxes);

  finishSegmentation();
}
/**
 * Format a deletion count for display.
 *
 * @param deletions  deletion count
 * @return "(<deletions> del)" when 'deletions' is non-zero,
 *         the empty string when it is zero.
 */
string format_del(int deletions)
{
  if (!deletions)
    return "";

  // A plain literal is enough here: const char* + std::string yields a std::string
  return "(" + string_of_int(deletions) + " del)";
}
void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffect after, double threshold, double multiplier) { // Try to segment, computing 'box_V->end' and 'box_J->start' // If not segmented, put the cause of unsegmentation in 'because' affect_infos max; max = kaa->getMaximum(before, after); // We did not find a good segmentation point if (!max.max_found) { // We labeled it detected if there were both enough affect_5 and enough affect_3 bool detected_before = (max.nb_before_left + max.nb_before_right >= DETECT_THRESHOLD); bool detected_after = (max.nb_after_left + max.nb_after_right >= DETECT_THRESHOLD); if (detected_before && detected_after) because = UNSEG_AMBIGUOUS ; else if ((strand == 1 && detected_before) || (strand == -1 && detected_after)) because = UNSEG_TOO_FEW_J ; else if ((strand == 1 && detected_after) || (strand == -1 && detected_before)) because = UNSEG_TOO_FEW_V ; else because = UNSEG_TOO_FEW_ZERO ; return ; } // E-values pair <double, double> pvalues = kaa->getLeftRightProbabilityAtLeastOrAbove(); evalue_left = pvalues.first * multiplier ; evalue_right = pvalues.second * multiplier ; evalue = evalue_left + evalue_right ; checkLeftRightEvaluesThreshold(threshold, strand); if (because != NOT_PROCESSED) return ; // There was a good segmentation point box_V->end = max.first_pos_max; box_J->start = max.last_pos_max + 1; if (strand == -1) { int tmp = sequence.size() - box_V->end - 1; box_V->end = sequence.size() - box_J->start - 1; box_J->start = tmp; } // Yes, it is segmented segmented = true; because = reversed ? SEG_MINUS : SEG_PLUS ; // TODO: this should also use possFromBoxes()... 
but 'boxes' is not defined here info = "VJ \t" + string_of_int(FIRST_POS) + " " + string_of_int(box_V->end + FIRST_POS) + " " + string_of_int(box_J->start + FIRST_POS) + " " + string_of_int(sequence.size() - 1 + FIRST_POS) ; // removeChevauchement is called once info was already computed: it is only to output info_extra info_extra += removeChevauchement(); finishSegmentation(); return ; }