Пример #1
0
/*
 * Read FASTA records pairwise from fp1 and fp2, align each pair with
 * aln_align() and print the result to stdout.
 *
 * For every pair a '>' header line is printed (names, lengths, the i/j
 * coordinates of the last and first path elements, and the score), followed
 * by whichever of out1/outm/out2 are non-NULL. A "//" separator line follows
 * unless type is ALN_BOUND_ALIGN. When misc_flag is non-zero the alignment
 * path is additionally passed to aln_output_segment().
 *
 * Reading stops as soon as either read_fasta() call returns a negative
 * length. Returns the number of pairs that were aligned.
 */
int aln_pair_align(FILE *fp1, FILE *fp2, AlnParam *ap, int type, int misc_flag)
{
	seq_t seq1, seq2;
	char name1[MAX_NAME_LEN], name2[MAX_NAME_LEN];
	int n_pairs = 0;

	INIT_SEQ(seq1);
	INIT_SEQ(seq2);

	while (1) {
		int len1, len2;
		path_t *first, *last;
		AlnAln *aln;

		/* Both files are always advanced together, even if the first one is exhausted. */
		len1 = read_fasta(fp1, &seq1, name1, 0);
		len2 = read_fasta(fp2, &seq2, name2, 0);
		if (len1 < 0 || len2 < 0)
			break;

		aln = aln_align((char*)seq1.s, (char*)seq2.s, ap, type);
		first = aln->path;
		last = aln->path + aln->path_len - 1;

		printf(">%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\n",
			   name1, len1, last->i, first->i,
			   name2, len2, last->j, first->j, aln->score);
		if (aln->out1)
			printf("%s\n", aln->out1);
		if (aln->outm)
			printf("%s\n", aln->outm);
		if (aln->out2)
			printf("%s\n", aln->out2);
		if (type != ALN_BOUND_ALIGN)
			printf("//\n");
		fflush(stdout);

		if (misc_flag)
			aln_output_segment((char*)seq1.s, (char*)seq2.s, aln->path, aln->path_len, name1, name2);

		aln_free_AlnAln(aln);
		++n_pairs;
	}

	MYFREE(seq1.s);
	MYFREE(seq2.s);
	return n_pairs;
}
Пример #2
0
/*
 * Align every unordered pair of contigs in db and collect the results.
 *
 * For each pair (i, j) with i < j the two contig sequences are aligned with
 * aln_stdaln(). Columns where neither aligned string has a gap are counted
 * as the overlap (mn); those of them that differ are counted as mismatches
 * (mm). One PWcontig record per pair is pushed onto both the heap and the
 * pair list of the returned PWDB.
 *
 * Returns a newly malloc'ed PWDB. With fewer than two contigs the returned
 * database contains no pair records.
 */
PWDB* pw_aln_contigs(CtgDB *db) {
	uint32_t i, j, n;
	int k, mn, mm, aln_len;
	PWDB *pwdb;
	Ctg *c0, *c1;
	pwdb = (PWDB*)malloc(sizeof(PWDB));

	pwdb->pwctgs = init_pwctglist(6);
	pwdb->hp = init_heap(aln_cmp, pwdb);
	pwdb->ctgv = db->ctgs;
	AlnParam ap = {10, 2, 2, aln_sm_nt, 16, 75};

	n = db->ctgnum;

	/* Written as i + 1 < n rather than i < n - 1: n is unsigned, so n - 1
	 * would wrap around to UINT32_MAX when there are no contigs at all. */
	for (i = 0; i + 1 < n; i++) {
		c0 = ref_ctglist(db->ctgs, i);
		for (j = i+1; j < n; j++) {
			c1 = ref_ctglist(db->ctgs, j);
			AlnAln *aa;
			mn = mm = 0;
			aa = aln_stdaln(c0->seq, c1->seq, &ap, 0, 1);
			aln_len = (int)strlen(aa->out1);
			for (k = 0; k < aln_len; k++) {
				/* Gapped columns contribute neither to the overlap nor to the mismatches. */
				if (aa->out1[k] == '-' || aa->out2[k] == '-') continue;
				if (aa->out1[k] != aa->out2[k]) mm++;
				mn++;
			}
			PWcontig *pwc = (PWcontig*)malloc(sizeof(PWcontig));
			pwc->id0 = c0->cls_id;
			pwc->id1 = c1->cls_id;
			pwc->overlap = mn;
			pwc->score = aa->score;
			/* NOTE(review): when no ungapped column exists mn is 0 and het becomes
			 * 0/0 (NaN); confirm that aln_cmp and downstream users tolerate this. */
			pwc->het = (float)mm/mn;
			push_heap(pwdb->hp, pwc);
			push_pwctglist(pwdb->pwctgs, pwc);
			//fprintf(stdout, "%d\t%d\t%d\t%d\t%d\t%d\t%.3f\n", c0->cls_id, c1->cls_id, pwc->id0, pwc->id1, mn, mm, pwc->het);
			//fprintf(stdout, "%s\n%s\n", c0->seq, c1->seq);
			//fprintf(stdout, "%d\t%d\t%d\t%d\t%d\t%d\n%s\n%s\n%s\n\n", aa->start1, aa->end1,aa->start2, aa->end2, pwc->score, pwc->overlap, aa->out1, aa->outm, aa->out2);

			//fprintf(stdout, "%s\n%s\n%s\n\n", aa->out1, aa->outm, aa->out2);
			fflush(stdout);
			aln_free_AlnAln(aa);
		}
	}

	return pwdb;
}
Пример #3
0
/*
 * Align the sequence s (length l, printed under the given name and strand)
 * against every sequence in ss and print each alignment that is reported.
 *
 * The band width of the global g_aln_param is set to the sum of both
 * sequence lengths before each alignment. An alignment is printed when its
 * score reaches g_thres or when g_is_global is set.
 */
static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand)
{
	int i, k;
	for (i = 0; i < ss->n_seqs; ++i) {
		AlnAln *aa;
		seq1_t *p = ss->seqs + i;
		g_aln_param.band_width = l + p->l;
		aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, g_thres, l, p->l);
		if (aa->score >= g_thres || g_is_global) {
			/* A start coordinate of 0 is printed as 1. */
			printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand,
				   aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo);
			// NB: I put the short sequence as the first sequence in SW, an insertion to
			// the reference becomes a deletion from the short sequence. Therefore, I use
			// "MDI" here rather than "MID", and print ->out2 first rather than ->out1.
			//
			// The CIGAR loop uses its own index k: reusing i here would clobber the
			// position of the outer loop over ss->seqs.
			for (k = 0; k != aa->n_cigar; ++k)
				printf("%d%c", aa->cigar32[k]>>4, "MDI"[aa->cigar32[k]&0xf]);
			printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1);
		}
		aln_free_AlnAln(aa);
	}
Пример #4
0
// Visit a single vertex and try to smooth variation that starts at it.
//
// For each edge direction in which the vertex has more than one edge, variant
// walks are searched with SGSearch::findVariantWalks. The walk with the
// highest summed coverage of its internal vertices is selected; the internal
// string of every other walk is aligned against the selected one. If none of
// the degenerate / gap / divergence / indel-size checks fail, all walks are
// written to m_outFile, the vertices that occur only on non-selected walks
// are colored GC_RED and the removal counters are updated.
//
// Returns false right away when the vertex itself, or the end vertex of one
// of its edges in a branching direction, is already colored GC_RED.
// pGraph is unused.
bool SGSmoothingVisitor::visit(StringGraph* pGraph, Vertex* pVertex)
{
    (void)pGraph;
    if(pVertex->getColor() == GC_RED)
        return false;

    bool found = false;
    for(size_t idx = 0; idx < ED_COUNT; idx++)
    {
        EdgeDir dir = EDGE_DIRECTIONS[idx];
        EdgePtrVec edges = pVertex->getEdges(dir);
        // Nothing to smooth unless the vertex branches in this direction
        if(edges.size() <= 1)
            continue;

        // Do not touch a region in which a neighbor has already been marked
        for(size_t i = 0; i < edges.size(); ++i)
        {
            if(edges[i]->getEnd()->getColor() == GC_RED)
                return false;
        }

        //std::cout << "Smoothing " << pVertex->getID() << "\n";

        // Limits passed to the variant walk search
        const int MAX_WALKS = 10;
        const int MAX_DISTANCE = 5000;
        bool bIsDegenerate = false;
        bool bFailGapCheck = false;
        bool bFailDivergenceCheck = false;
        bool bFailIndelSizeCheck = false;

        SGWalkVector variantWalks;
        SGSearch::findVariantWalks(pVertex, dir, MAX_DISTANCE, MAX_WALKS, variantWalks);

        if(variantWalks.size() > 0)
        {
            found = true;
            // (size_t)-1 is the "nothing selected yet" sentinel, asserted against below
            size_t selectedIdx = -1;
            size_t selectedCoverage = 0;

            // Calculate the minimum amount overlapped on the start/end vertex.
            // This is used to properly extract the sequences from walks that represent the variation.
            int minOverlapX = std::numeric_limits<int>::max();
            int minOverlapY = std::numeric_limits<int>::max();

            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                // A walk with at most one edge makes the whole set degenerate
                if(variantWalks[i].getNumEdges() <= 1)
                    bIsDegenerate = true;

                // Calculate the walk coverage using the internal vertices of the walk. 
                // The walk with the highest coverage will be retained
                size_t walkCoverage = 0;
                for(size_t j = 1; j < variantWalks[i].getNumVertices() - 1; ++j)
                    walkCoverage += variantWalks[i].getVertex(j)->getCoverage();

                // The "== 0" clause guarantees that some walk is selected even when
                // every walk has zero coverage (a later zero-coverage walk replaces
                // an earlier one in that case)
                if(walkCoverage > selectedCoverage || selectedCoverage == 0)
                {
                    selectedIdx = i;
                    selectedCoverage = walkCoverage;
                }
                
                Edge* pFirstEdge = variantWalks[i].getFirstEdge();
                Edge* pLastEdge = variantWalks[i].getLastEdge();

                if((int)pFirstEdge->getMatchLength() < minOverlapX)
                    minOverlapX = pFirstEdge->getMatchLength();

                if((int)pLastEdge->getTwin()->getMatchLength() < minOverlapY)
                    minOverlapY = pLastEdge->getTwin()->getMatchLength();
            }

            // Calculate the strings for each walk that represent the region of variation
            StringVector walkStrings;
            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                Vertex* pStartVertex = variantWalks[i].getStartVertex();
                Vertex* pLastVertex = variantWalks[i].getLastVertex();
                assert(pStartVertex != NULL && pLastVertex != NULL);
                
                std::string full = variantWalks[i].getString(SGWT_START_TO_END);
                int posStart = 0;
                int posEnd = 0;

                if(dir == ED_ANTISENSE)
                {
                    // pLast   -----------
                    // pStart          ------------
                    // full    --------------------
                    // out             ----
                    posStart = pLastVertex->getSeqLen() - minOverlapY;
                    posEnd = full.size() - (pStartVertex->getSeqLen() - minOverlapX);
                }
                else
                {
                    // pStart         --------------
                    // pLast   -----------
                    // full    ---------------------
                    // out            ----
                    posStart = pStartVertex->getSeqLen() - minOverlapX; // match start position
                    posEnd = full.size() - (pLastVertex->getSeqLen() - minOverlapY); // match end position
                }
                
                // An empty internal string is stored when the interval is empty or inverted
                std::string out;
                if(posEnd > posStart)
                    out = full.substr(posStart, posEnd - posStart);
                walkStrings.push_back(out);
            }

            assert(selectedIdx != (size_t)-1);
            SGWalk& selectedWalk = variantWalks[selectedIdx];
            assert(selectedWalk.isIndexed());

            // Check the divergence of the other walks to this walk
            StringVector cigarStrings;
            std::vector<int> maxIndel;
            std::vector<double> gapPercent; // percentage of matching that is gaps
            std::vector<double> totalPercent; // percent of total alignment that is mismatch or gap

            cigarStrings.resize(variantWalks.size());
            gapPercent.resize(variantWalks.size());
            totalPercent.resize(variantWalks.size());
            maxIndel.resize(variantWalks.size());

            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                if(i == selectedIdx)
                    continue;

                // We want to compute the total gap length, total mismatches and percent
                // divergence between the two paths.
                int matchLen = 0;
                int totalDiff = 0;
                int gapLength = 0;
                int maxGapLength = 0;
                // We have to handle the degenerate case where one internal string has zero length
                // this can happen when there is an isolated insertion/deletion and the walks are like:
                // x -> y -> z
                // x -> z
                if(walkStrings[selectedIdx].empty() || walkStrings[i].empty())
                {
                    // NOTE(review): maxGapLength stays 0 on this path, and if both strings
                    // are empty matchLen is 0, so the percentages below become 0/0 (NaN)
                    matchLen = std::max(walkStrings[selectedIdx].size(), walkStrings[i].size());
                    totalDiff = matchLen;
                    gapLength = matchLen;
                }
                else
                {
                    AlnAln *aln_global;
                    aln_global = aln_stdaln(walkStrings[selectedIdx].c_str(), walkStrings[i].c_str(), &aln_param_blast, 1, 1);

                    // Calculate the alignment parameters
                    // Every column of the match line is counted; a blank in it counts as a difference
                    while(aln_global->outm[matchLen] != '\0')
                    {
                        if(aln_global->outm[matchLen] == ' ')
                            totalDiff += 1;
                        matchLen += 1;
                    }

                    std::stringstream cigarSS;
                    for (int j = 0; j != aln_global->n_cigar; ++j)
                    {
                        // Low 4 bits of a cigar entry select the operation, the remaining bits the length
                        char cigarOp = "MID"[aln_global->cigar32[j]&0xf];
                        int cigarLen = aln_global->cigar32[j]>>4;
                        if(cigarOp == 'I' || cigarOp == 'D')
                        {
                            // NOTE(review): gapLength is cumulative over all indel operations, so
                            // maxGapLength ends up equal to the total gap length rather than the
                            // length of the longest single indel - confirm this is intended
                            gapLength += cigarLen;
                            if(gapLength > maxGapLength)
                                maxGapLength = gapLength;
                        }

                        cigarSS << cigarLen;
                        cigarSS << cigarOp;
                    }
                    cigarStrings[i] = cigarSS.str();
                    aln_free_AlnAln(aln_global);
                }

                double percentDiff = (double)totalDiff / matchLen;
                double percentGap = (double)gapLength / matchLen;

                if(percentDiff > m_maxTotalDivergence)
                    bFailDivergenceCheck = true;
                
                if(percentGap > m_maxGapDivergence)
                    bFailGapCheck = true;

                if(maxGapLength > m_maxIndelLength)
                    bFailIndelSizeCheck = true;

                gapPercent[i] = percentGap;
                totalPercent[i] = percentDiff;
                maxIndel[i] = maxGapLength;
            }

            // A single failing walk rejects the whole set for this direction
            if(bIsDegenerate || bFailGapCheck || bFailDivergenceCheck || bFailIndelSizeCheck)
                continue;

            // Write the selected path to the variants file as variant 0
            int variantIdx = 0;
            std::string selectedSequence = selectedWalk.getString(SGWT_START_TO_END);
            std::stringstream ss;
            ss << "variant-" << m_numRemovedTotal << "/" << variantIdx++;
            writeFastaRecord(&m_outFile, ss.str(), selectedSequence);


            // The vertex set for each walk is not necessarily disjoint,
            // the selected walk may contain vertices that are part
            // of other paths. We handle this by marking only those
            // vertices of the non-selected walks that do not also
            // lie on the selected walk.
            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                if(i == selectedIdx)
                    continue;

                SGWalk& currWalk = variantWalks[i];
                // The end vertex of the last edge is deliberately not visited
                for(size_t j = 0; j < currWalk.getNumEdges() - 1; ++j)
                {
                    Edge* currEdge = currWalk.getEdge(j);
                    
                    // If the vertex is also on the selected path, do not mark it
                    Vertex* currVertex = currEdge->getEnd();
                    if(!selectedWalk.containsVertex(currVertex->getID()))
                    {
                        currEdge->getEnd()->setColor(GC_RED);
                    }
                }

                // Write the variant to a file
                std::string variantSequence = currWalk.getString(SGWT_START_TO_END);
                std::stringstream variantID;
                std::stringstream ss;
                ss << "variant-" << m_numRemovedTotal << "/" << variantIdx++;
                ss << " IGD:" << (double)gapPercent[i] << " ITD:" << totalPercent[i] << " MID: " << maxIndel[i] << " InternalCigar:" << cigarStrings[i];
                writeFastaRecord(&m_outFile, ss.str(), variantSequence);
            }

            // Exactly two walks form a simple bubble, anything else a complex one
            if(variantWalks.size() == 2)
                m_simpleBubblesRemoved += 1;
            else
                m_complexBubblesRemoved += 1;
            ++m_numRemovedTotal;
        }
    }
Пример #5
0
/*
 * Copy the value of command-line option -opt into a fixed-size buffer.
 * Terminates the program with an error message when the value (plus its
 * terminating NUL) does not fit, instead of writing past the buffer.
 */
static void copy_option_value(char *dst, size_t dst_size, const char *src, int opt)
{
  if(strlen(src) >= dst_size){
    fprintf(stderr, "Argument to -%c is too long (limit is %lu characters)\n",
        opt, (unsigned long)(dst_size - 1));
    exit(EXIT_FAILURE);
  }
  strcpy(dst, src);
}

/*
 * Program entry point.
 *
 * Parses the command line, then walks through the forward/reverse FASTQ
 * files pair by pair. For each pair both reads are aligned against their
 * adapter/primer sequences:
 *   - if an adapter is detected the reads are trimmed; pairs that become too
 *     short are discarded, otherwise the trimmed reads are globally aligned
 *     against each other and either merged (-s), written out with blunt
 *     ends, or discarded when the read-read alignment scores too low;
 *   - if no adapter is detected the reads are merged by overlap (-s) or
 *     written out unchanged, subject to the minimum read length.
 * Discarded pairs are written untrimmed to the -3/-4 files when both are
 * given. Summary statistics are printed to stderr at the end.
 *
 * Returns 0 on success.
 */
int main( int argc, char* argv[] ) {
  unsigned long long num_pairs;
  unsigned long long num_merged;
  unsigned long long num_adapter;
  unsigned long long num_discarded;
  unsigned long long max_pretty_print = DEF_MAX_PRETTY_PRINT;
  unsigned long long num_pretty_print = 0;
  int adapter_thresh = DEF_ADAPTER_SCORE_THRES;
  int read_thresh = DEF_READ_SCORE_THRES;
  clock_t start, end;
  //init to 0
  num_pairs = num_merged = num_adapter = num_discarded = 0;
  extern char* optarg;
  bool p64 = false;
  char forward_fn[MAX_FN_LEN];
  char reverse_fn[MAX_FN_LEN];
  char forward_out_fn[MAX_FN_LEN];
  char reverse_out_fn[MAX_FN_LEN];
  char forward_discard_fn[MAX_FN_LEN];
  char reverse_discard_fn[MAX_FN_LEN];
  char merged_out_fn[MAX_FN_LEN];
  bool do_read_merging = false;
  bool print_overhang = false;
  bool write_discard=false;
  //-3 and -4 are tracked separately so that a lone one can be rejected
  bool have_forward_discard = false;
  bool have_reverse_discard = false;
  char forward_primer[MAX_SEQ_LEN+1];
  strcpy(forward_primer, DEF_FORWARD_PRIMER); //set default
  char forward_primer_dummy_qual[MAX_SEQ_LEN+1];
  char reverse_primer[MAX_SEQ_LEN+1];
  strcpy(reverse_primer, DEF_REVERSE_PRIMER); //set default
  char reverse_primer_dummy_qual[MAX_SEQ_LEN+1];
  int i;
  for(i=0;i<MAX_SEQ_LEN;i++){
    forward_primer_dummy_qual[i] = 'N';//phred score of 45
    reverse_primer_dummy_qual[i] = 'N';
  }
  //keep the dummy quality strings NUL-terminated like the primers they accompany
  forward_primer_dummy_qual[MAX_SEQ_LEN] = '\0';
  reverse_primer_dummy_qual[MAX_SEQ_LEN] = '\0';
  int ich;
  int min_ol_adapter = DEF_OL2MERGE_ADAPTER;
  int min_ol_reads = DEF_OL2MERGE_READS;
  unsigned short int min_read_len =DEF_MIN_READ_LEN;
  float min_match_adapter_frac = DEF_MIN_MATCH_ADAPTER;
  float min_match_reads_frac = DEF_MIN_MATCH_READS;
  float max_mismatch_adapter_frac = DEF_MAX_MISMATCH_ADAPTER;
  float max_mismatch_reads_frac = DEF_MAX_MISMATCH_READS;

  float read_frac_thresh = DEF_READ_GAP_FRAC_CUTOFF;
  unsigned short max_mismatch_adapter[MAX_SEQ_LEN+1];
  unsigned short max_mismatch_reads[MAX_SEQ_LEN+1];
  unsigned short min_match_adapter[MAX_SEQ_LEN+1];
  unsigned short min_match_reads[MAX_SEQ_LEN+1];
  char qcut = (char)DEF_QCUT+33;
  bool pretty_print = false;
  char pretty_print_fn[MAX_FN_LEN+1];
  SQP sqp = SQP_init();
  char untrim_fseq[MAX_SEQ_LEN+1];
  char untrim_fqual[MAX_SEQ_LEN+1];
  char untrim_rseq[MAX_SEQ_LEN+1];
  char untrim_rqual[MAX_SEQ_LEN+1];
  /* No args - help!  */
  if ( argc == 1 ) {
    help(argv[0]);
  }
  int req_args = 0;
  while( (ich=getopt( argc, argv, "f:r:1:2:3:4:q:A:s:y:B:O:E:x:M:N:L:o:m:b:w:W:p:P:X:Q:t:e:Z:n:6gh" )) != -1 ) {
    switch( ich ) {

    //REQUIRED ARGUMENTS
    case 'f' :
      req_args ++;
      copy_option_value(forward_fn, sizeof(forward_fn), optarg, ich);
      break;
    case 'r' :
      req_args ++;
      copy_option_value(reverse_fn, sizeof(reverse_fn), optarg, ich);
      break;
    case '1' :
      req_args ++;
      copy_option_value(forward_out_fn, sizeof(forward_out_fn), optarg, ich);
      break;
    case '2' :
      req_args ++;
      copy_option_value(reverse_out_fn, sizeof(reverse_out_fn), optarg, ich);
      break;

      //OPTIONAL GENERAL ARGUMENTS
    case '3' :
      write_discard=true;
      have_forward_discard = true;
      copy_option_value(forward_discard_fn, sizeof(forward_discard_fn), optarg, ich);
      break;
    case '4' :
      write_discard=true;
      have_reverse_discard = true;
      copy_option_value(reverse_discard_fn, sizeof(reverse_discard_fn), optarg, ich);
      break;
    case 'h' :
      help(argv[0]);
      break;
    case '6' :
      p64 = true;
      break;
    case 'q' :
      qcut = atoi(optarg)+33;
      break;
    case 'L' :
      min_read_len = atoi(optarg);
      break;

      //OPTIONAL ADAPTER/PRIMER TRIMMING ARGUMENTS
    case 'A':
      copy_option_value(forward_primer, sizeof(forward_primer), optarg, ich);
      break;
    case 'B':
      copy_option_value(reverse_primer, sizeof(reverse_primer), optarg, ich);
      break;
    case 'O':
      min_ol_adapter = atoi(optarg);
      break;
    case 'M':
      max_mismatch_adapter_frac = atof(optarg);
      break;
    case 'N':
      min_match_adapter_frac = atof(optarg);
      break;
    case 'b':
      aln_param_nt2nt.band_width = atoi(optarg);
      break;
    case 'Q':
      aln_param_nt2nt.gap_open = atoi(optarg);
      break;
    case 't':
      aln_param_nt2nt.gap_ext = atoi(optarg);
      break;
    case 'e':
      aln_param_nt2nt.gap_end = atoi(optarg);
      break;
    case 'Z':
      adapter_thresh = atoi(optarg);
      break;


    case 'w':
      aln_param_rd2rd.band_width = atoi(optarg);
      break;
    case 'W':
      aln_param_rd2rd.gap_open = atoi(optarg);
      break;
    case 'p':
      aln_param_rd2rd.gap_ext = atoi(optarg);
      break;
    case 'P':
      aln_param_rd2rd.gap_end = atoi(optarg);
      break;
    case 'X':
      read_frac_thresh = atof(optarg);
      break;

      //OPTIONAL MERGING ARGUMENTS
    case 'y' :
      maximum_quality = optarg[0];
      break;
    case 'g' :
      print_overhang = true;
      break;
    case 's' :
      do_read_merging = true;
      copy_option_value(merged_out_fn, sizeof(merged_out_fn), optarg, ich);
      break;
    case 'o':
      min_ol_reads = atoi(optarg);
      break;
    case 'm':
      max_mismatch_reads_frac = atof(optarg);
      break;
    case 'n':
      min_match_reads_frac = atof(optarg);
      break;
    case 'E':
      pretty_print = true;
      copy_option_value(pretty_print_fn, sizeof(pretty_print_fn), optarg, ich);
      break;
    case 'x':
      max_pretty_print = atol(optarg);
      break;


    default :
      help(argv[0]);
    }
  }
  if(req_args < 4){
    fprintf(stderr, "Missing a required argument!\n");
    help(argv[0]);
    //never continue with uninitialized file names, even if help() returns
    return EXIT_FAILURE;
  }
  if(write_discard && !(have_forward_discard && have_reverse_discard)){
    //both discard files are opened and written together below
    fprintf(stderr, "Options -3 and -4 must be given together!\n");
    help(argv[0]);
    return EXIT_FAILURE;
  }
  start = clock();
  //allocate alignment memory

  //  int min_match = 8;
  //  int ngaps = 1;
  //  int maxglen = 3;

  // AlnParam aln_param_adapter   = {  5, 13, 19, aln_sm_read, 16, 75 };
  //


  //Calculate table matching overlap length to min matches and max mismatches
  for(i=0;i<MAX_SEQ_LEN+1;i++){
    max_mismatch_reads[i] = floor(((float)i)*max_mismatch_reads_frac);
    max_mismatch_adapter[i] = floor(((float)i)*max_mismatch_adapter_frac);
    min_match_reads[i] = ceil(((float)i)*min_match_reads_frac);
    min_match_adapter[i] = ceil(((float)i)*min_match_adapter_frac);
  }
  //get length of forward and reverse primers
  int forward_primer_len = strlen(forward_primer);
  int reverse_primer_len = strlen(reverse_primer);


  gzFile ffq = fileOpen(forward_fn, "r");
  gzFile ffqw = fileOpen(forward_out_fn,"w");
  gzFile rfq = fileOpen(reverse_fn, "r");
  gzFile rfqw = fileOpen(reverse_out_fn,"w");
  gzFile mfqw = NULL;
  gzFile ppaw = NULL;
  gzFile dffqw = NULL;
  gzFile drfqw = NULL;
  if(do_read_merging)
    mfqw = fileOpen(merged_out_fn,"w");
  if(pretty_print)
    ppaw = fileOpen(pretty_print_fn,"w");
  if(write_discard){
    dffqw = fileOpen(forward_discard_fn,"w");
    drfqw = fileOpen(reverse_discard_fn,"w");
  }


  /**
   * Loop over all of the reads
   */
  while(next_fastqs( ffq, rfq, sqp, p64 )){ //returns false when done
    update_spinner(num_pairs++);


    AlnAln *faaln, *raaln, *fraln;

    //save a copy of the original sequences/qualities first
    strcpy(untrim_fseq,sqp->fseq);
    strcpy(untrim_fqual,sqp->fqual);
    strcpy(untrim_rseq,sqp->rseq);
    strcpy(untrim_rqual,sqp->rqual);

    faaln = aln_stdaln_aux(sqp->fseq, forward_primer, &aln_param_nt2nt,
        ALN_TYPE_LOCAL, adapter_thresh , sqp->flen, forward_primer_len);
    raaln = aln_stdaln_aux(sqp->rseq, reverse_primer, &aln_param_nt2nt,
        ALN_TYPE_LOCAL, adapter_thresh, sqp->rlen, reverse_primer_len);

    //check for direct adapter match.
    if(adapter_trim(sqp, min_ol_adapter,
        forward_primer, forward_primer_dummy_qual,
        forward_primer_len,
        reverse_primer, reverse_primer_dummy_qual,
        reverse_primer_len,
        min_match_adapter,
        max_mismatch_adapter,
        min_match_reads,
        max_mismatch_reads,
        qcut) ||
        faaln->score >= adapter_thresh ||
        raaln->score >= adapter_thresh){
      num_adapter++; //adapter present
      //print it if user wants
      if(pretty_print && num_pretty_print < max_pretty_print){
        //void pretty_print_alignment_stdaln(gzFile out, SQP sqp, AlnAln *aln, bool first_adapter, bool second_adapter)
        if(faaln->score >= adapter_thresh){
          num_pretty_print++;
          pretty_print_alignment_stdaln(ppaw,sqp,faaln,true,false,false);
        }
        if(raaln->score >= adapter_thresh){
          num_pretty_print++;
          pretty_print_alignment_stdaln(ppaw,sqp,raaln,false,true,false);
        }
      }

      //do stuff to it
      //assume full length adapter and squish it down to the read with no gaps
      //a negative position means "no adapter alignment found for this read"
      int rpos,fpos;
      rpos = fpos = (- MAX_SEQ_LEN);
      if(faaln->score >= adapter_thresh){
        fpos = max(faaln->start1 - faaln->start2,0);
      }
      if(raaln->score >= adapter_thresh){
        rpos = max(raaln->start1 - raaln->start2,0);
      }

      //make rlen the minimum of the two adapter search methods
      if(rpos >= 0){
        sqp->rlen = min(sqp->rlen,rpos);
      }

      //make flen the minimum of the two adapter search methods
      if(fpos >= 0){
        sqp->flen = min(sqp->flen,fpos);
      }

      if(sqp->flen < min_read_len || sqp->rlen < min_read_len){
        num_discarded++;
        if(write_discard){
          write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual);
          write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual);
        }
        //no read-read alignment exists yet, so skip its cleanup
        goto CLEAN_ADAPTERS;
      }else{ //trim the adapters
        sqp->fseq[sqp->flen] = '\0';
        sqp->fqual[sqp->flen] = '\0';
        sqp->rseq[sqp->rlen] = '\0';
        sqp->rqual[sqp->rlen] = '\0';
        strncpy(sqp->rc_rseq,sqp->rseq,sqp->rlen+1); //move regular reads now trimmed into RC read's place
        strncpy(sqp->rc_rqual,sqp->rqual,sqp->rlen+1);
        rev_qual(sqp->rc_rqual, sqp->rlen);        //and re-reverse the RC reads
        revcom_seq(sqp->rc_rseq, sqp->rlen);
      }


      //do a nice global alignment between two reads, and print consensus
      fraln = aln_stdaln_aux(sqp->fseq, sqp->rc_rseq, &aln_param_rd2rd,
          ALN_TYPE_GLOBAL, 1, sqp->flen, sqp->rlen);

      //calculate the minimum score we are willing to accept to merge the reads
      //basically this is saying that 7/8 of the read must overlap perfectly

      read_thresh = (((int)sqp->flen) + ((int)sqp->rlen)) -
          (((int)sqp->flen) * read_frac_thresh * aln_param_rd2rd.gap_ext) -
          (((int)sqp->rlen) * read_frac_thresh * aln_param_rd2rd.gap_ext) -
          (aln_param_rd2rd.gap_open*2) - (aln_param_rd2rd.gap_end*2);
      //now lets put something useful in the alignment suboptimal score thing since right now it
      //is just left blank:
      //fprintf(stderr, "rt:%d\tfl:%d\trl:%d\trft:%f\tgx:%d\tgo:%d\tge%d\n", read_thresh,((int)sqp->flen),((int)sqp->rlen),read_frac_thresh,aln_param_rd2rd.gap_ext,aln_param_rd2rd.gap_open,aln_param_rd2rd.gap_end);
      fraln->subo = read_thresh;

      if(do_read_merging && fraln->score > read_thresh){
        //if we want read merging,
        //and the alignment score is better than the threshold just calculated...

        //write the merged sequence
        fill_merged_sequence(sqp, fraln, true);
        if(pretty_print && num_pretty_print < max_pretty_print){
          num_pretty_print++;
          pretty_print_alignment_stdaln(ppaw,sqp,fraln,false,false,true);
        }
        if(strlen(sqp->merged_seq) >= min_read_len && strlen(sqp->merged_qual) >= min_read_len){
          num_merged++;
          write_fastq(mfqw,sqp->fid,sqp->merged_seq,sqp->merged_qual);
        }
        else{
          num_discarded++;
          if(write_discard){
            write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual);
            write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual);
          }
        }
      }else if(fraln->score > read_thresh){
        // we know that the adapters are present, trimmed, and the resulting
        // read lengths are both long enough to print.
        // We also know that we aren't doing merging.
        // Now we just need to print.
        if(pretty_print && num_pretty_print < max_pretty_print){
          num_pretty_print++;
          pretty_print_alignment_stdaln(ppaw,sqp,fraln,false,false,true);
        }




        //do end polishing to take care of examples like the following:
        //          Read Alignment Score:59, Suboptimal Score:-85
        //          ID:HWI-ST593:1:1101:14566:7002#ACA/1
        //          READ1: ------------ATACAACTCGCTGACTTTGTCCTGGCATTTGACATATGCCTCGTAGTCTGCAAAGACTTTAAACCGGTCATGGTGGAACAGCATGTTGA
        //                             ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
        //          READ2: CTCTTCCGATCTATACAACTCGCTGACTTTGTCCTGGCATTTGACATATGCCTCGTAGTCTGCAAAGACTTTAAACCGGTCATGGTGGAACAGCATGTTG-



        make_blunt_ends(sqp,fraln);

        if(strlen(sqp->fseq) >= min_read_len &&
            strlen(sqp->fqual) >= min_read_len &&
            strlen(sqp->rseq) >= min_read_len &&
            strlen(sqp->rqual) >= min_read_len){

          write_fastq(ffqw, sqp->fid, sqp->fseq, sqp->fqual);
          write_fastq(rfqw, sqp->rid, sqp->rseq, sqp->rqual);
        }else{
          num_discarded++;
          if(write_discard){
            write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual);
            write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual);
          }
        }


      }else{ //there was a bad looking read-read alignment, so lets not risk it and junk it
        num_discarded++;
        if(write_discard){
          //write_fastq(dffqw, sqp->fid, sqp->fseq, sqp->fqual);
          //write_fastq(drfqw, sqp->rid, sqp->rseq, sqp->rqual);
          write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual);
          write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual);
        }
      }
    }else{
      //no adapters present
      //check for strong read overlap to assist trimming ends of adapters from end of read
      if(do_read_merging){
        if(read_merge(sqp, min_ol_reads, min_match_reads, max_mismatch_reads, qcut)){
          //print merged output
          if(strlen(sqp->merged_seq) >= min_read_len &&
              strlen(sqp->merged_qual) >= min_read_len){
            num_merged++;
            write_fastq(mfqw,sqp->fid,sqp->merged_seq,sqp->merged_qual);
            if(pretty_print && num_pretty_print < max_pretty_print){
              num_pretty_print++;
              pretty_print_alignment(ppaw,sqp,qcut,false); //false b/c merged input in fixed order
            }
          }else{
            num_discarded++;
            if(write_discard){
              write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual);
              write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual);
            }
          }
        }else{
          //no significant overlap so just write them
          if(strlen(sqp->fseq) >= min_read_len &&
              strlen(sqp->fqual) >= min_read_len &&
              strlen(sqp->rseq) >= min_read_len &&
              strlen(sqp->rqual) >= min_read_len){
            write_fastq(ffqw, sqp->fid, sqp->fseq, sqp->fqual);
            write_fastq(rfqw, sqp->rid, sqp->rseq, sqp->rqual);
          }else{
            num_discarded++;
            if(write_discard){
              write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual);
              write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual);
            }
          }

        }
        //done
        goto CLEAN_ADAPTERS;
      }else{ //just write reads to output fastqs
        if(strlen(sqp->fseq) >= min_read_len &&
            strlen(sqp->fqual) >= min_read_len &&
            strlen(sqp->rseq) >= min_read_len &&
            strlen(sqp->rqual) >= min_read_len){
          write_fastq(ffqw, sqp->fid, sqp->fseq, sqp->fqual);
          write_fastq(rfqw, sqp->rid, sqp->rseq, sqp->rqual);
        }else{
          num_discarded++;
          if(write_discard){
            write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual);
            write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual);
          }
        }
        goto CLEAN_ADAPTERS;
      }
    }


    /**
     * Section for hierarchical cleanup
     *
     * In every case we will at least have to free up the alignment between the adapter and two reads.
     * however in some cases there will be an additional alignment between the two reads. We can do
     * good cleanup in this case with gotos
     */
    aln_free_AlnAln(fraln);

    CLEAN_ADAPTERS:
    aln_free_AlnAln(faaln);
    aln_free_AlnAln(raaln);

    //End the loop over reads
  }
  end = clock();
  double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
  //the counters are unsigned long long, so they are printed with %llu
  fprintf(stderr,"\nPairs Processed:\t%llu\n",num_pairs);
  fprintf(stderr,"Pairs Merged:\t%llu\n",num_merged);
  fprintf(stderr,"Pairs With Adapters:\t%llu\n",num_adapter);
  fprintf(stderr,"Pairs Discarded:\t%llu\n",num_discarded);
  fprintf(stderr,"CPU Time Used (Minutes):\t%lf\n",cpu_time_used/60.0);



  SQP_destroy(sqp);
  gzclose(ffq);
  gzclose(ffqw);
  gzclose(rfq);
  gzclose(rfqw);
  if(mfqw != NULL)
    gzclose(mfqw);
  if(ppaw != NULL)
    gzclose(ppaw);
  if(dffqw != NULL)
    gzclose(dffqw);
  if(drfqw != NULL)
    gzclose(drfqw);
  return 0;
}
Пример #6
0
// Extract the differences between a pair of strings.
//
// The haplotype is globally aligned against the reference and every maximal
// run of differing alignment columns becomes one Variant (ref_name is always
// "noctg"). A run that contains a gap is an indel; it is extended to the left
// by the preceding matching base when there is one. ref_position is the
// 0-based offset in the unpadded reference of the first column of the variant.
// An indel in the very first alignment column has no preceding base and is
// reported at reference position 0 without padding.
std::vector<Variant> extract_variants(const std::string& reference, 
                                      const std::string& haplotype)
{
    AlnParam par = aln_param_nt2nt;

    // The sizes are unsigned, so the absolute difference is computed by ordering
    // the operands instead of calling abs() on a subtraction that may wrap
    const size_t length_diff = reference.size() > haplotype.size()
                             ? reference.size() - haplotype.size()
                             : haplotype.size() - reference.size();
    par.band_width = std::max(20, static_cast<int>(length_diff) * 2);
    AlnAln* aln = aln_stdaln(reference.c_str(), haplotype.c_str(), &par, 1, 1);
    
    // Make aligned strings where gaps are padded with '-'
    std::string pad_ref(aln->out1);
    std::string pad_hap(aln->out2);

    assert(pad_ref.size() == pad_hap.size());
    
    //std::cout << "PR: " << pad_ref << "\n";
    //std::cout << "PH: " << pad_hap << "\n";

    // parse variants from the alignment
    std::vector<Variant> variants;

    // generate a map from padded bases to positions in the original reference sequence
    // (gap columns in the padded reference map to npos)
    std::vector<size_t> ref_positions(pad_ref.size(), 0);
    size_t pos = 0;
    for(size_t i = 0; i < pad_ref.size(); ++i) {
        ref_positions[i] = pad_ref[i] != '-' ? pos : std::string::npos;
        pos += pad_ref[i] != '-';
    }

    // diff_start iterates over the places where these sequences are different
    size_t diff_start = 0;
    while(1) {
        
        // find the start point of the next difference between the strings
        while(diff_start < pad_ref.size() && pad_ref[diff_start] == pad_hap[diff_start]) {
            diff_start++;
        }
 
        // check for end of alignment
        if(diff_start == pad_ref.size())
            break;

        // find the end point of the difference
        bool is_indel = false;
        size_t diff_end = diff_start;
        while(diff_end < pad_ref.size() && pad_ref[diff_end] != pad_hap[diff_end]) {
            is_indel = is_indel || pad_ref[diff_end] == '-' || pad_hap[diff_end] == '-';
            diff_end++;
        }

        // If the difference is an indel, we include the previous matching reference base.
        // An indel in the first column has no previous base; stepping back there would
        // wrap diff_start around and index far outside ref_positions.
        if(is_indel && diff_start > 0)
            diff_start--;
    
        Variant v;
        v.ref_name = "noctg";

        size_t ref_position = ref_positions[diff_start];
        if(ref_position == std::string::npos) {
            // Only an insertion in the first alignment column can start on a reference
            // gap; it lies before the first reference base
            assert(diff_start == 0);
            ref_position = 0;
        }
        v.ref_position = ref_position;
        v.ref_seq = remove_gaps(pad_ref.substr(diff_start, diff_end - diff_start).c_str());
        v.alt_seq = remove_gaps(pad_hap.substr(diff_start, diff_end - diff_start).c_str());
        
        variants.push_back(v);
        diff_start = diff_end;
    }

    aln_free_AlnAln(aln);
    return variants;
}