コード例 #1
0
/*
 * Fine segmentation of a read.
 *
 * Aligns the read against the 5' (germline->rep_5) and 3' (germline->rep_3)
 * collections, fills box_V / box_J, computes the left/right e-values and
 * sets the segmentation status ('segmented', 'because'), 'code' and 'info'.
 *
 * seq        - read to process; its label and sequence are copied
 * germline   - germline to test, kept as 'segmented_germline'.
 *              With SEG_METHOD_MAX12, override_rep5_rep3_from_labels()
 *              is called on it, so the pointed object is modified.
 * segment_c  - alignment cost, stored as 'segment_cost'
 * threshold  - passed to checkLeftRightEvaluesThreshold()
 * multiplier - factor applied to both e-values
 *
 * The constructor returns early (leaving 'segmented' false) when the
 * pseudo-germline k-mer probe does not segment the read, right after the
 * probe for SEG_METHOD_MAX1U, or when checkLeftRightEvaluesThreshold()
 * changes 'because'.
 */
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,  double threshold, double multiplier)
{
  box_V = new AlignBox("5");
  box_D = new AlignBox("4");
  box_J = new AlignBox("3");

  segmented = false;
  dSegmented = false;
  because = NOT_PROCESSED ;
  segmented_germline = germline ;
  info_extra = "" ;
  label = seq.label ;
  sequence = seq.sequence ;
  segment_cost=segment_c;
  evalue = NO_LIMIT_VALUE;
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
  box_V->marked_pos = 0;
  box_J->marked_pos = 0;

  CDR3start = -1;
  CDR3end = -1;

  JUNCTIONstart = -1;
  JUNCTIONend = -1;

  bool reverse_V = false ;
  bool reverse_J = false ;

  if ((germline->seg_method == SEG_METHOD_MAX12) || (germline->seg_method == SEG_METHOD_MAX1U))
    {
      // We check whether this sequence is segmented with MAX12 or MAX1U (with default e-value parameters)
      // The probe is a scoped object: it is released on every way out of this
      // block, without a 'delete' to keep in sync with each return.
      KmerSegmenter kseg(seq, germline, THRESHOLD_NB_EXPECTED, 1);

      if (! kseg.isSegmented())
        return ;

      reversed = kseg.isReverse();

      // On a reversed read, the 'after' affectation becomes the left one (and conversely)
      KmerAffect left = reversed ? KmerAffect(kseg.after, true) : kseg.before ;
      KmerAffect right = reversed ? KmerAffect(kseg.before, true) : kseg.after ;

      reverse_V = (left.getStrand() == -1);
      reverse_J = (right.getStrand() == -1);

      code = "Unexpected ";

      code += left.toStringSigns() + germline->index->getLabel(left).basename;
      code += "/";
      code += right.toStringSigns() + germline->index->getLabel(right).basename;
      info_extra += " " + left.toString() + "/" + right.toString() + " (" + code + ")";

      // MAX1U: nothing more is computed than 'code' and 'info_extra'
      if (germline->seg_method == SEG_METHOD_MAX1U)
        return ;

      germline->override_rep5_rep3_from_labels(left, right);
    }

  // Strand determination, with KmerSegmenter (with default e-value parameters)
  // Note that we use only the 'strand' component
  // When the KmerSegmenter fails, continue with positive strand
  // TODO: flag to force a strand / to test both strands ?
  {
    KmerSegmenter strand_probe(seq, germline, THRESHOLD_NB_EXPECTED, 1);
    reversed = strand_probe.isReverse();
  }

  sequence_or_rc = revcomp(sequence, reversed); // sequence, possibly reversed


  /* Segmentation */
  align_against_collection(sequence_or_rc, germline->rep_5, NO_FORBIDDEN_ID, reverse_V, reverse_V, false,
                                        box_V, segment_cost);

  align_against_collection(sequence_or_rc, germline->rep_3, NO_FORBIDDEN_ID, reverse_J, !reverse_J, false,
                                          box_J, segment_cost);

  // J was run with '!reverseJ', we copy the box informations from right to left
  // Should this directly be handled in align_against_collection() ?
  box_J->start = box_J->end ;
  box_J->del_left = box_J->del_right;

  /* E-values */
  // NOTE(review): score[0] is read without checking that 'score' is not empty -- TODO confirm
  // that align_against_collection() always fills at least one entry.
  evalue_left  = multiplier * sequence.size() * germline->rep_5.totalSize() * segment_cost.toPValue(box_V->score[0].first);
  evalue_right = multiplier * sequence.size() * germline->rep_3.totalSize() * segment_cost.toPValue(box_J->score[0].first);
  evalue = evalue_left + evalue_right ;

  /* Unsegmentation causes */
  // NOTE(review): 'evalue' was summed above and is not refreshed when one side
  // is replaced by BAD_EVALUE here.
  if (box_V->end == (int) string::npos)
    {
      evalue_left = BAD_EVALUE ;
    }

  if (box_J->start == (int) string::npos)
    {
      evalue_right = BAD_EVALUE ;
    }

  checkLeftRightEvaluesThreshold(threshold, reversed ? -1 : 1);

  // 'because' is still NOT_PROCESSED at this point unless the check above changed it
  if (because != NOT_PROCESSED)
    {
      segmented = false;
      info = " @" + string_of_int (box_V->end + FIRST_POS) + "  @" + string_of_int(box_J->start + FIRST_POS) ;
      return ;
    }

  /* The sequence is segmented */
  segmented = true ;
  because = reversed ? SEG_MINUS : SEG_PLUS ;

  //overlap VJ
  seg_N = check_and_resolve_overlap(sequence_or_rc, 0, sequence_or_rc.length(),
                                    box_V, box_J, segment_cost);

  // Reset extreme positions
  box_V->start = 0;
  box_J->end = sequence.length()-1;

  // Why could this happen ?
  if (box_J->start>=(int) sequence.length())
    box_J->start=sequence.length()-1;

  // seg_N will be recomputed in finishSegmentation()

  boxes.clear();
  boxes.push_back(box_V);
  boxes.push_back(box_J);
  code = codeFromBoxes(boxes, sequence_or_rc);
  info = posFromBoxes(boxes);

  finishSegmentation();
}
コード例 #2
0
ファイル: testAutomaton.cpp プロジェクト: vidjil/vidjil
/*
  This test checks the integrity of the getMultiResults function in
  AbstractACAutomaton class and its inherited classes (exercised here
  through PointerACAutomaton<KmerAffect>).

  Ten short words, each with its own affectation ('a' to 'j'), are inserted
  in the automaton. getMultiResults() is then run on four sequences and the
  number of occurrences reported for each affectation is checked.
*/
void testGetMultiResults(){
  map<KmerAffect,int> results;
  PointerACAutomaton<KmerAffect> aho(false);
  const string errorOccurence = "KmerAffect doesn't have the good number of occurences.";
  const string errorSize = "Map has too many Kmers.";
  seqtype seq = "TTTTAATTAAGGGGCTACCCCCAATGTCCGTGGAGCTCTGGGGGGTTA";

  // Words stored in the automaton
  seqtype seqs[10] = {
    "AGCTCT",
    "TTTT",
    "AATT",
    "CGTGG",
    "CAATGTC",
    "AGGG",
    "GGGG",
    "TTAA",
    "GCTAC",
    "CCCC"
  };

  // One distinct affectation per word: 'a' for seqs[0], 'b' for seqs[1], ...
  affect_t affect[10];
  for(int i = 0; i < 10; ++i){
    affect[i].c = static_cast<char>('a' + i);
    aho.insert(seqs[i], KmerAffect(affect[i]));
  }
  aho.build_failure_functions();
  results = aho.getMultiResults(seq);

  /* Best situation: every sequence is found at least once in automaton. */
  TAP_TEST(results.size() <= 11, TEST_AC_OCCURENCES, errorSize);
  TAP_TEST_EQUAL(results.at(aho.get(seqs[0])), 1, TEST_AC_OCCURENCES, errorOccurence);
  TAP_TEST_EQUAL(results.at(aho.get(seqs[1])), 1, TEST_AC_OCCURENCES, errorOccurence);
  TAP_TEST_EQUAL(results.at(aho.get(seqs[2])), 1, TEST_AC_OCCURENCES, errorOccurence);
  TAP_TEST_EQUAL(results.at(aho.get(seqs[3])), 1, TEST_AC_OCCURENCES, errorOccurence);
  TAP_TEST_EQUAL(results.at(aho.get(seqs[4])), 1, TEST_AC_OCCURENCES, errorOccurence);
  TAP_TEST_EQUAL(results.at(aho.get(seqs[5])), 1, TEST_AC_OCCURENCES, errorOccurence);
  TAP_TEST_EQUAL(results.at(aho.get(seqs[6])), 4, TEST_AC_OCCURENCES, errorOccurence);
  TAP_TEST_EQUAL(results.at(aho.get(seqs[7])), 2, TEST_AC_OCCURENCES, errorOccurence);
  TAP_TEST_EQUAL(results.at(aho.get(seqs[8])), 1, TEST_AC_OCCURENCES, errorOccurence);
  TAP_TEST_EQUAL(results.at(aho.get(seqs[9])), 2, TEST_AC_OCCURENCES, errorOccurence);

  /* Situation: Only one K-mer is in the sequence, appearing once. */
  seqtype seq2 = "AAAAAAAAAAAAAAAAAATTCAAAAAAAAA";
  results = aho.getMultiResults(seq2);
  TAP_TEST(results.size() <= 2, TEST_AC_OCCURENCES, errorSize);
  TAP_TEST_EQUAL(results.at(aho.get(seqs[2])), 1, TEST_AC_OCCURENCES, errorOccurence);

  /* Situation: Only one K-mer is in the sequence, appearing many times. */
  seqtype seq3 = "GCTACGCTACGCTACGCTACGCTA";
  results = aho.getMultiResults(seq3);
  TAP_TEST(results.size() <= 2, TEST_AC_OCCURENCES, errorSize);
  TAP_TEST_EQUAL(results.at(aho.get(seqs[8])), 4, TEST_AC_OCCURENCES, errorOccurence);

  /* Situation: No K-mer appears in the sequence. */
  seqtype seq4 = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
  results = aho.getMultiResults(seq4);
  TAP_TEST(results.size() <= 1, TEST_AC_OCCURENCES, errorSize);
  /*
    If no K-mer of the automaton matches the sequence, the map must
    contain only unknown K-mers.
  */
  // The size check above also accepts an empty map: report that case as a
  // failure instead of dereferencing begin() on an empty container.
  TAP_TEST(! results.empty(), TEST_AC_OCCURENCES, "Unknown Kmer not found");
  if (! results.empty()) {
    KmerAffect unknownKmerAffect = results.begin()->first;
    TAP_TEST_EQUAL(unknownKmerAffect, AFFECT_UNKNOWN, TEST_AC_OCCURENCES, "Unknown Kmer not found");
  }
}
コード例 #3
0
/*
 * Seed-based (k-mer) segmentation of a read.
 *
 * Counts, through a KmerAffectAnalyser built on germline->index, how many
 * non-ambiguous and known affectations fall on each strand, chooses the
 * 'before' and 'after' affectations, then calls computeSegmentation().
 *
 * seq        - read to process; its label and sequence are copied
 * germline   - germline to test, kept as 'segmented_germline'
 * threshold  - forwarded to computeSegmentation()
 * multiplier - forwarded to computeSegmentation()
 *
 * The constructor returns early, with 'because' set to the cause, when:
 *  - the read is shorter than the seed span (UNSEG_TOO_SHORT, 'kaa' is NULL),
 *  - no usable affectation is found (UNSEG_TOO_FEW_ZERO),
 *  - both strands are seen without a clear majority
 *    (UNSEG_STRAND_NOT_CONSISTENT, or UNSEG_TOO_FEW_ZERO below
 *    DETECT_THRESHOLD_STRAND affectations).
 */
KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold, double multiplier)
{
  box_V = new AlignBox();
  box_D = new AlignBox();
  box_J = new AlignBox();

  CDR3start = -1;
  CDR3end = -1;

  JUNCTIONstart = -1;
  JUNCTIONend = -1;

  label = seq.label ;
  sequence = seq.sequence ;
  info = "" ;
  info_extra = "seed";
  segmented = false;
  segmented_germline = germline ;
  system = germline->code; // useful ?
  reversed = false;
  because = NOT_PROCESSED ; // Cause of unsegmentation
  score = 0 ;
  evalue = NO_LIMIT_VALUE;
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;

  int s = static_cast<int>(germline->index->getS()) ;
  int length = sequence.length() ;

  if (length < s)
    {
      because = UNSEG_TOO_SHORT;
      kaa = NULL;
      return ;
    }

  kaa = new KmerAffectAnalyser(*(germline->index), sequence);

  // Check strand consistency among the affectations.
  int strand;
  int nb_strand[2] = {0,0};     // In cell 0 we'll put the number of negative
                                // strand, while in cell 1 we'll put the
                                // positives
  for (int i = 0; i < kaa->count(); i++) {
    KmerAffect it = kaa->getAffectation(i);
    if (! it.isAmbiguous() && ! it.isUnknown()) {
      strand = affect_strand(it.affect);
      nb_strand[(strand + 1) / 2] ++; // (strand+1) / 2 gives 0 if strand == -1; 1 if strand == 1
    }
  }

  score = nb_strand[0] + nb_strand[1] ; // Used only for non-segmented germlines

  reversed = (nb_strand[0] > nb_strand[1]) ;

  if ((germline->seg_method == SEG_METHOD_MAX12)
      || (germline->seg_method == SEG_METHOD_MAX1U))
    { // Pseudo-germline, MAX12 and MAX1U
      pair <KmerAffect, KmerAffect> max12 ;

      // A single analyser serves both the maximum search and sortLeftRight()
      CountKmerAffectAnalyser ckaa(*(germline->index), sequence);

      set<KmerAffect> forbidden;
      forbidden.insert(KmerAffect::getAmbiguous());
      forbidden.insert(KmerAffect::getUnknown());

      if (germline->seg_method == SEG_METHOD_MAX12)
        // MAX12: two maximum k-mers (no unknown)
        {
          max12 = ckaa.max12(forbidden);

          if (max12.first.isUnknown() || max12.second.isUnknown())
            {
              because = UNSEG_TOO_FEW_ZERO ;
              return ;
            }
        }

      else
        // MAX1U: the maximum k-mers (no unknown) + unknown
        {
          KmerAffect best = ckaa.max(forbidden);

          if (best.isUnknown())
            {
              because = UNSEG_TOO_FEW_ZERO ;
              return ;
            }
          max12 = make_pair(best, KmerAffect::getUnknown());
        }

      pair <KmerAffect, KmerAffect> before_after =  ckaa.sortLeftRight(max12);

      before = before_after.first ;
      after = before_after.second ;

      // This strand computation is only a heuristic, especially for chimera +/- reads
      // Anyway, it allows to gather such reads and their reverse complement into a unique window...
      // ... except when the read is quite different outside the window
      strand = reversed ? -1 : 1 ;
    }

  else
    { // Regular germline

      // Test on which strand we are, select the before and after KmerAffects
      if (nb_strand[0] == 0 && nb_strand[1] == 0) {
        because = UNSEG_TOO_FEW_ZERO ;
        return ;
      } else if (nb_strand[0] > RATIO_STRAND * nb_strand[1]) {
        // Negative strand: the 3' affectation comes first
        strand = -1;
        before = KmerAffect(germline->affect_3, -1);
        after = KmerAffect(germline->affect_5, -1);
      } else if (nb_strand[1] > RATIO_STRAND * nb_strand[0]) {
        // Positive strand: the 5' affectation comes first
        strand = 1;
        before = KmerAffect(germline->affect_5, 1);
        after = KmerAffect(germline->affect_3, 1);
      } else {
        // Ambiguous information: we have positive and negative strands
        // and there is not enough difference to put them apart.
        if (nb_strand[0] + nb_strand[1] >= DETECT_THRESHOLD_STRAND)
          because = UNSEG_STRAND_NOT_CONSISTENT ;
        else
          because = UNSEG_TOO_FEW_ZERO ;
        return ;
      }

    } // endif Pseudo-germline

  computeSegmentation(strand, before, after, threshold, multiplier);
}