/*
  Builds a FineSegmenter for 'seq' against 'germline'.

  - segment_c:  alignment costs, also used to turn alignment scores into p-values
  - threshold:  e-value threshold handed to checkLeftRightEvaluesThreshold()
  - multiplier: factor applied to both the left and the right e-values

  On return, 'segmented' tells whether the segmentation succeeded and 'because'
  holds either SEG_PLUS / SEG_MINUS or the value left by the e-value check.
*/
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c, double threshold, double multiplier)
{
  /* Alignment boxes */
  box_V = new AlignBox("5");
  box_D = new AlignBox("4");
  box_J = new AlignBox("3");
  box_V->marked_pos = 0;
  box_J->marked_pos = 0;

  /* Default state: nothing processed yet */
  segmented = false;
  dSegmented = false;
  because = NOT_PROCESSED ;
  segmented_germline = germline ;
  info_extra = "" ;
  label = seq.label ;
  sequence = seq.sequence ;
  segment_cost = segment_c;

  evalue = NO_LIMIT_VALUE;
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;

  CDR3start = -1;
  CDR3end = -1;
  JUNCTIONstart = -1;
  JUNCTIONend = -1;

  bool reverse_V = false ;
  bool reverse_J = false ;

  const bool pseudo_germline = (germline->seg_method == SEG_METHOD_MAX12)
    || (germline->seg_method == SEG_METHOD_MAX1U);

  if (pseudo_germline)
    {
      // We check whether this sequence is segmented with MAX12 or MAX1U
      // (with default e-value parameters)
      KmerSegmenter *probe = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);

      // Not segmented by the k-mer heuristic: nothing more to do
      if (!probe->isSegmented())
        {
          delete probe ;
          return ;
        }

      reversed = probe->isReverse();

      // On the reverse strand, 'before' and 'after' are swapped
      KmerAffect left = reversed ? KmerAffect(probe->after, true) : probe->before ;
      KmerAffect right = reversed ? KmerAffect(probe->before, true) : probe->after ;

      delete probe ;

      reverse_V = (left.getStrand() == -1);
      reverse_J = (right.getStrand() == -1);

      const string left_name = left.toStringSigns() + germline->index->getLabel(left).basename;
      const string right_name = right.toStringSigns() + germline->index->getLabel(right).basename;

      code = "Unexpected ";
      code += left_name;
      code += "/";
      code += right_name;

      info_extra += " " + left.toString() + "/" + right.toString() + " (" + code + ")";

      // MAX1U stops here, only MAX12 goes on with the alignments
      if (germline->seg_method == SEG_METHOD_MAX1U)
        return ;

      germline->override_rep5_rep3_from_labels(left, right);
    }

  // Strand determination, with KmerSegmenter (with default e-value parameters)
  // Note that we use only the 'strand' component
  // When the KmerSegmenter fails, continue with positive strand
  // TODO: flag to force a strand / to test both strands ?
  KmerSegmenter *strand_probe = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);
  reversed = strand_probe->isReverse();
  delete strand_probe ;

  sequence_or_rc = revcomp(sequence, reversed); // sequence, possibly reversed

  /* Segmentation */
  align_against_collection(sequence_or_rc, germline->rep_5, NO_FORBIDDEN_ID,
                           reverse_V, reverse_V, false,
                           box_V, segment_cost);

  align_against_collection(sequence_or_rc, germline->rep_3, NO_FORBIDDEN_ID,
                           reverse_J, !reverse_J, false,
                           box_J, segment_cost);

  // J was run with '!reverse_J', we copy the box informations from right to left
  // Should this directly be handled in align_against_collection() ?
  box_J->start = box_J->end ;
  box_J->del_left = box_J->del_right;

  /* E-values */
  const double size_factor = multiplier * sequence.size();
  evalue_left = size_factor * germline->rep_5.totalSize() * segment_cost.toPValue(box_V->score[0].first);
  evalue_right = size_factor * germline->rep_3.totalSize() * segment_cost.toPValue(box_J->score[0].first);
  evalue = evalue_left + evalue_right ;

  /* Unsegmentation causes */
  const int no_position = static_cast<int>(string::npos);

  if (box_V->end == no_position)
    evalue_left = BAD_EVALUE ;

  if (box_J->start == no_position)
    evalue_right = BAD_EVALUE ;

  const int strand = reversed ? -1 : 1;
  checkLeftRightEvaluesThreshold(threshold, strand);

  if (because != NOT_PROCESSED)
    {
      // The e-value check set a cause: report the positions and give up
      segmented = false;
      info = " @" + string_of_int (box_V->end + FIRST_POS) + " @" + string_of_int(box_J->start + FIRST_POS) ;
      return ;
    }

  /* The sequence is segmented */
  segmented = true ;
  if (reversed)
    because = SEG_MINUS ;
  else
    because = SEG_PLUS ;

  // Overlap between V and J
  seg_N = check_and_resolve_overlap(sequence_or_rc, 0, sequence_or_rc.length(),
                                    box_V, box_J, segment_cost);

  // Reset extreme positions
  const int last_pos = sequence.length() - 1;
  box_V->start = 0;
  box_J->end = last_pos;

  // Why could this happen ?
  if (box_J->start >= static_cast<int>(sequence.length()))
    box_J->start = last_pos;

  // seg_N will be recomputed in finishSegmentation()
  boxes.clear();
  boxes.push_back(box_V);
  boxes.push_back(box_J);

  code = codeFromBoxes(boxes, sequence_or_rc);
  info = posFromBoxes(boxes);

  finishSegmentation();
}
/* This test check the integrity of the getMultiResults function in AbstractACAutomaton class and its inherited classes. */ void testGetMultiResults(){ map<KmerAffect,int> results; PointerACAutomaton<KmerAffect> aho(false); const string errorOccurence = "KmerAffect doesn't have the good number of occurences."; const string errorSize = "Map has too many Kmers."; seqtype seq = "TTTTAATTAAGGGGCTACCCCCAATGTCCGTGGAGCTCTGGGGGGTTA"; affect_t affect[10]; seqtype seqs[10]; char c = 'a'; for(int i = 0; i < 10; ++i){ affect[i].c = c; c++; } seqs[0] = "AGCTCT"; seqs[1] = "TTTT"; seqs[2] = "AATT"; seqs[3] = "CGTGG"; seqs[4] = "CAATGTC"; seqs[5] = "AGGG"; seqs[6] = "GGGG"; seqs[7] = "TTAA"; seqs[8] = "GCTAC"; seqs[9] = "CCCC"; for(int i = 0;i < 10; ++i){ aho.insert(seqs[i], KmerAffect(affect[i])); } aho.build_failure_functions(); results = aho.getMultiResults(seq); /* Best situation: every sequences is found at least once in automaton. */ TAP_TEST(results.size() <= 11, TEST_AC_OCCURENCES, errorSize); TAP_TEST_EQUAL(results.at(aho.get(seqs[0])), 1, TEST_AC_OCCURENCES, errorOccurence); TAP_TEST_EQUAL(results.at(aho.get(seqs[1])), 1, TEST_AC_OCCURENCES, errorOccurence); TAP_TEST_EQUAL(results.at(aho.get(seqs[2])), 1, TEST_AC_OCCURENCES, errorOccurence); TAP_TEST_EQUAL(results.at(aho.get(seqs[3])), 1, TEST_AC_OCCURENCES, errorOccurence); TAP_TEST_EQUAL(results.at(aho.get(seqs[4])), 1, TEST_AC_OCCURENCES, errorOccurence); TAP_TEST_EQUAL(results.at(aho.get(seqs[5])), 1, TEST_AC_OCCURENCES, errorOccurence); TAP_TEST_EQUAL(results.at(aho.get(seqs[6])), 4, TEST_AC_OCCURENCES, errorOccurence); TAP_TEST_EQUAL(results.at(aho.get(seqs[7])), 2, TEST_AC_OCCURENCES, errorOccurence); TAP_TEST_EQUAL(results.at(aho.get(seqs[8])), 1, TEST_AC_OCCURENCES, errorOccurence); TAP_TEST_EQUAL(results.at(aho.get(seqs[9])), 2, TEST_AC_OCCURENCES, errorOccurence); /* Situation: Only one K-mer is in the sequence, appearing once. 
*/ seqtype seq2 = "AAAAAAAAAAAAAAAAAATTCAAAAAAAAA"; results = aho.getMultiResults(seq2); TAP_TEST(results.size() <= 2, TEST_AC_OCCURENCES, errorSize); TAP_TEST_EQUAL(results.at(aho.get(seqs[2])), 1, TEST_AC_OCCURENCES, errorOccurence); /* Situation: Only one K-mer is the sequence, appearing many times. */ seqtype seq3 = "GCTACGCTACGCTACGCTACGCTA"; results = aho.getMultiResults(seq3); TAP_TEST(results.size() <= 2, TEST_AC_OCCURENCES, errorSize); TAP_TEST_EQUAL(results.at(aho.get(seqs[8])), 4, TEST_AC_OCCURENCES, errorOccurence); /* Situation: No K-mer appear in the sequence. */ seqtype seq4 = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; results = aho.getMultiResults(seq4); TAP_TEST(results.size() <= 1, TEST_AC_OCCURENCES, errorSize); /* If there is K-mers in automaton doesn't match the sequence, the map must return only unknown K-mers. */ pair<KmerAffect, int> singleResult = *(results.begin()); KmerAffect unknownKmerAffect = singleResult.first; TAP_TEST_EQUAL(unknownKmerAffect, AFFECT_UNKNOWN, TEST_AC_OCCURENCES, "Unknown Kmer not found"); }
/*
  Builds a KmerSegmenter for 'seq' against 'germline'.

  The k-mer affectations of the sequence are used to choose a strand and the
  'before' / 'after' affectations, then computeSegmentation() is called with
  them. 'threshold' and 'multiplier' are only forwarded to
  computeSegmentation().

  The constructor returns early, with 'because' set, when:
   - the sequence is shorter than the seed span of the index (UNSEG_TOO_SHORT,
     in which case 'kaa' is NULL),
   - no usable affectation is found (UNSEG_TOO_FEW_ZERO),
   - both strands are represented without a clear majority
     (UNSEG_STRAND_NOT_CONSISTENT or UNSEG_TOO_FEW_ZERO).
*/
KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold, double multiplier)
{
  box_V = new AlignBox();
  box_D = new AlignBox();
  box_J = new AlignBox();

  CDR3start = -1;
  CDR3end = -1;

  JUNCTIONstart = -1;
  JUNCTIONend = -1;

  label = seq.label ;
  sequence = seq.sequence ;
  info = "" ;
  info_extra = "seed";
  segmented = false;
  segmented_germline = germline ;
  system = germline->code; // useful ?
  reversed = false;
  because = NOT_PROCESSED ; // Cause of unsegmentation
  score = 0 ;
  evalue = NO_LIMIT_VALUE;
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;

  const int s = static_cast<int>(germline->index->getS()) ;
  const int length = sequence.length() ;

  if (length < s)
    {
      because = UNSEG_TOO_SHORT;
      kaa = NULL;
      return ;
    }

  kaa = new KmerAffectAnalyser(*(germline->index), sequence);

  // Check strand consistency among the affectations.
  int strand;
  int nb_strand[2] = {0,0};     // In cell 0 we'll put the number of negative
                                // strand, while in cell 1 we'll put the
                                // positives
  for (int i = 0; i < kaa->count(); i++) {
    KmerAffect it = kaa->getAffectation(i);
    if (! it.isAmbiguous() && ! it.isUnknown()) {
      strand = affect_strand(it.affect);
      nb_strand[(strand + 1) / 2] ++; // (strand+1) / 2 → 0 if strand == -1; 1 if strand == 1
    }
  }

  score = nb_strand[0] + nb_strand[1] ; // Used only for non-segmented germlines

  reversed = (nb_strand[0] > nb_strand[1]) ;

  if ((germline->seg_method == SEG_METHOD_MAX12)
      || (germline->seg_method == SEG_METHOD_MAX1U))
    { // Pseudo-germline, MAX12 and MAX1U
      pair <KmerAffect, KmerAffect> max12 ;

      // A single analyser serves both the maximum search and sortLeftRight()
      CountKmerAffectAnalyser ckaa(*(germline->index), sequence);

      set<KmerAffect> forbidden;
      forbidden.insert(KmerAffect::getAmbiguous());
      forbidden.insert(KmerAffect::getUnknown());

      if (germline->seg_method == SEG_METHOD_MAX12)
        // MAX12: two maximum k-mers (no unknown)
        {
          max12 = ckaa.max12(forbidden);

          if (max12.first.isUnknown() || max12.second.isUnknown())
            {
              because = UNSEG_TOO_FEW_ZERO ;
              return ;
            }
        }
      else
        // MAX1U: the maximum k-mers (no unknown) + unknown
        {
          KmerAffect max = ckaa.max(forbidden);

          if (max.isUnknown())
            {
              because = UNSEG_TOO_FEW_ZERO ;
              return ;
            }
          max12 = make_pair(max, KmerAffect::getUnknown());
        }

      pair <KmerAffect, KmerAffect> before_after = ckaa.sortLeftRight(max12);

      before = before_after.first ;
      after = before_after.second ;

      // This strand computation is only a heuristic, especially for chimera +/- reads
      // Anyway, it allows to gather such reads and their reverse complement into a unique window...
      // ... except when the read is quite different outside the window
      strand = reversed ? -1 : 1 ;
    }
  else
    { // Regular germline

      // Test on which strand we are, select the before and after KmerAffects
      if (nb_strand[0] == 0 && nb_strand[1] == 0) {
        because = UNSEG_TOO_FEW_ZERO ;
        return ;
      } else if (nb_strand[0] > RATIO_STRAND * nb_strand[1]) {
        strand = -1;
        before = KmerAffect(germline->affect_3, -1);
        after = KmerAffect(germline->affect_5, -1);
      } else if (nb_strand[1] > RATIO_STRAND * nb_strand[0]) {
        strand = 1;
        before = KmerAffect(germline->affect_5, 1);
        after = KmerAffect(germline->affect_3, 1);
      } else {
        // Ambiguous information: we have positive and negative strands
        // and there is not enough difference to put them apart.
        if (nb_strand[0] + nb_strand[1] >= DETECT_THRESHOLD_STRAND)
          because = UNSEG_STRAND_NOT_CONSISTENT ;
        else
          because = UNSEG_TOO_FEW_ZERO ;
        return ;
      }

    } // endif Pseudo-germline

  computeSegmentation(strand, before, after, threshold, multiplier);
}