/*
 * Align the records of two FASTA streams pairwise: the k-th record of fp1
 * against the k-th record of fp2.
 *
 * For every pair a header line (names, lengths, path end points, score) is
 * printed to stdout, followed by whichever of the three alignment strings
 * aln_align() produced. A "//" separator is printed unless the alignment
 * type is ALN_BOUND_ALIGN. When misc_flag is non-zero the aligned segments
 * are additionally reported through aln_output_segment().
 *
 * Reading stops as soon as either stream fails to deliver a record.
 * Returns the number of pairs that were aligned.
 */
int aln_pair_align(FILE *fp1, FILE *fp2, AlnParam *ap, int type, int misc_flag)
{
	seq_t seq1, seq2;
	char name1[MAX_NAME_LEN], name2[MAX_NAME_LEN];
	int n_pairs = 0;

	INIT_SEQ(seq1);
	INIT_SEQ(seq2);

	while (1) {
		/* Both streams are always advanced together, fp1 first. */
		int len1 = read_fasta(fp1, &seq1, name1, 0);
		int len2 = read_fasta(fp2, &seq2, name2, 0);
		AlnAln *aa;
		path_t *path_head, *path_tail;

		if (len1 < 0 || len2 < 0)
			break;

		aa = aln_align((char*)seq1.s, (char*)seq2.s, ap, type);
		path_head = aa->path;
		path_tail = aa->path + aa->path_len - 1;

		/* Header: the last path element is printed before the first one. */
		printf(">%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\n",
		       name1, len1, path_tail->i, path_head->i,
		       name2, len2, path_tail->j, path_head->j, aa->score);

		if (aa->out1) printf("%s\n", aa->out1);
		if (aa->outm) printf("%s\n", aa->outm);
		if (aa->out2) printf("%s\n", aa->out2);
		if (type != ALN_BOUND_ALIGN)
			printf("//\n");
		fflush(stdout);

		if (misc_flag)
			aln_output_segment((char*)seq1.s, (char*)seq2.s, aa->path, aa->path_len, name1, name2);

		aln_free_AlnAln(aa);
		++n_pairs;
	}

	MYFREE(seq1.s);
	MYFREE(seq2.s);
	return n_pairs;
}
/*
 * Align every unordered pair of contigs in `db` and collect the results.
 *
 * For each pair (i, j) with i < j the two sequences are aligned with
 * aln_stdaln(). Columns of the alignment in which neither sequence has a gap
 * are counted as the overlap (`mn`); those of them that differ are counted as
 * mismatches (`mm`). A PWcontig record holding the two cluster ids, the
 * overlap, the alignment score and the mismatch fraction is pushed both onto
 * the heap and onto the list of the returned PWDB.
 *
 * Returns a newly allocated PWDB. It contains no pairs when `db` holds fewer
 * than two contigs.
 */
PWDB* pw_aln_contigs(CtgDB *db)
{
	uint32_t i, j, n;
	int k, mn, mm, aln_len;
	PWDB *pwdb;
	Ctg *c0, *c1;
	AlnParam ap = {10, 2, 2, aln_sm_nt, 16, 75};

	pwdb = (PWDB*)malloc(sizeof(PWDB));
	pwdb->pwctgs = init_pwctglist(6);
	pwdb->hp = init_heap(aln_cmp, pwdb);
	pwdb->ctgv = db->ctgs;

	n = db->ctgnum;
	/* `i + 1 < n` instead of `i < n - 1`: n is unsigned, so n - 1 would wrap
	 * around to UINT32_MAX for an empty contig list. */
	for (i = 0; i + 1 < n; i++) {
		c0 = ref_ctglist(db->ctgs, i);
		for (j = i + 1; j < n; j++) {
			AlnAln *aa;
			PWcontig *pwc;

			c1 = ref_ctglist(db->ctgs, j);
			mn = mm = 0;
			aa = aln_stdaln(c0->seq, c1->seq, &ap, 0, 1);

			/* Without both alignment strings there is nothing to count. */
			aln_len = (aa->out1 && aa->out2) ? (int)strlen(aa->out1) : 0;
			for (k = 0; k < aln_len; k++) {
				if (aa->out1[k] == '-' || aa->out2[k] == '-') continue;
				if (aa->out1[k] != aa->out2[k]) mm++;
				mn++;
			}

			pwc = (PWcontig*)malloc(sizeof(PWcontig));
			pwc->id0 = c0->cls_id;
			pwc->id1 = c1->cls_id;
			pwc->overlap = mn;
			pwc->score = aa->score;
			/* No ungapped column means no observed mismatch; this also avoids
			 * storing the NaN that 0/0 would produce. */
			pwc->het = mn > 0 ? (float)mm / mn : 0.0f;
			push_heap(pwdb->hp, pwc);
			push_pwctglist(pwdb->pwctgs, pwc);

			//fprintf(stdout, "%d\t%d\t%d\t%d\t%d\t%d\t%.3f\n", c0->cls_id, c1->cls_id, pwc->id0, pwc->id1, mn, mm, pwc->het);
			//fprintf(stdout, "%s\n%s\n", c0->seq, c1->seq);
			//fprintf(stdout, "%d\t%d\t%d\t%d\t%d\t%d\n%s\n%s\n%s\n\n", aa->start1, aa->end1,aa->start2, aa->end2, pwc->score, pwc->overlap, aa->out1, aa->outm, aa->out2);
			//fprintf(stdout, "%s\n%s\n%s\n\n", aa->out1, aa->outm, aa->out2);
			fflush(stdout);

			aln_free_AlnAln(aa);
		}
	}
	return pwdb;
}
static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand) { int i; for (i = 0; i < ss->n_seqs; ++i) { AlnAln *aa; seq1_t *p = ss->seqs + i; g_aln_param.band_width = l + p->l; aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, g_thres, l, p->l); if (aa->score >= g_thres || g_is_global) { printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand, aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo); // NB: I put the short sequence as the first sequence in SW, an insertion to // the reference becomes a deletion from the short sequence. Therefore, I use // "MDI" here rather than "MID", and print ->out2 first rather than ->out1. for (i = 0; i != aa->n_cigar; ++i) printf("%d%c", aa->cigar32[i]>>4, "MDI"[aa->cigar32[i]&0xf]); printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1); } aln_free_AlnAln(aa); }
// Try to collapse the variation ("bubble") that starts at pVertex.
//
// For each edge direction in which the vertex branches, the variant walks
// leaving the vertex are collected. The walk whose internal vertices have the
// highest summed coverage is kept. Every other walk is aligned against it and
// its divergence is measured. When no walk is degenerate and all walks pass
// the gap, total-divergence and indel-size limits, the kept walk and the
// alternative walks are written to the variants file, and the vertices that
// belong only to alternative walks are coloured GC_RED.
//
// Returns false immediately if the vertex or any of its neighbours in a
// branching direction is already GC_RED. Otherwise returns true if variant
// walks were found in at least one direction, even when they were
// subsequently rejected by the divergence checks.
bool SGSmoothingVisitor::visit(StringGraph* pGraph, Vertex* pVertex)
{
    (void)pGraph;
    if(pVertex->getColor() == GC_RED)
        return false;

    bool found = false;
    for(size_t idx = 0; idx < ED_COUNT; idx++)
    {
        EdgeDir dir = EDGE_DIRECTIONS[idx];
        EdgePtrVec edges = pVertex->getEdges(dir);
        if(edges.size() <= 1)
            continue;

        for(size_t i = 0; i < edges.size(); ++i)
        {
            if(edges[i]->getEnd()->getColor() == GC_RED)
                return false;
        }

        //std::cout << "Smoothing " << pVertex->getID() << "\n";

        const int MAX_WALKS = 10;
        const int MAX_DISTANCE = 5000;
        bool bIsDegenerate = false;
        bool bFailGapCheck = false;
        bool bFailDivergenceCheck = false;
        bool bFailIndelSizeCheck = false;

        SGWalkVector variantWalks;
        SGSearch::findVariantWalks(pVertex, dir, MAX_DISTANCE, MAX_WALKS, variantWalks);

        if(variantWalks.size() > 0)
        {
            found = true;
            size_t selectedIdx = -1;
            size_t selectedCoverage = 0;

            // Calculate the minimum amount overlapped on the start/end vertex.
            // This is used to properly extract the sequences from walks that represent the variation.
            int minOverlapX = std::numeric_limits<int>::max();
            int minOverlapY = std::numeric_limits<int>::max();

            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                if(variantWalks[i].getNumEdges() <= 1)
                    bIsDegenerate = true;

                // Calculate the walk coverage using the internal vertices of the walk.
                // The walk with the highest coverage will be retained
                size_t walkCoverage = 0;
                for(size_t j = 1; j < variantWalks[i].getNumVertices() - 1; ++j)
                    walkCoverage += variantWalks[i].getVertex(j)->getCoverage();

                if(walkCoverage > selectedCoverage || selectedCoverage == 0)
                {
                    selectedIdx = i;
                    selectedCoverage = walkCoverage;
                }

                Edge* pFirstEdge = variantWalks[i].getFirstEdge();
                Edge* pLastEdge = variantWalks[i].getLastEdge();

                if((int)pFirstEdge->getMatchLength() < minOverlapX)
                    minOverlapX = pFirstEdge->getMatchLength();

                if((int)pLastEdge->getTwin()->getMatchLength() < minOverlapY)
                    minOverlapY = pLastEdge->getTwin()->getMatchLength();
            }

            // Calculate the strings for each walk that represent the region of variation
            StringVector walkStrings;
            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                Vertex* pStartVertex = variantWalks[i].getStartVertex();
                Vertex* pLastVertex = variantWalks[i].getLastVertex();
                assert(pStartVertex != NULL && pLastVertex != NULL);

                std::string full = variantWalks[i].getString(SGWT_START_TO_END);
                int posStart = 0;
                int posEnd = 0;

                if(dir == ED_ANTISENSE)
                {
                    // pLast   -----------
                    // pStart          ------------
                    // full    --------------------
                    // out             ----
                    posStart = pLastVertex->getSeqLen() - minOverlapY;
                    posEnd = full.size() - (pStartVertex->getSeqLen() - minOverlapX);
                }
                else
                {
                    // pStart --------------
                    // pLast            -----------
                    // full   ---------------------
                    // out              ----
                    posStart = pStartVertex->getSeqLen() - minOverlapX; // match start position
                    posEnd = full.size() - (pLastVertex->getSeqLen() - minOverlapY); // match end position
                }

                std::string out;
                if(posEnd > posStart)
                    out = full.substr(posStart, posEnd - posStart);
                walkStrings.push_back(out);
            }

            assert(selectedIdx != (size_t)-1);
            SGWalk& selectedWalk = variantWalks[selectedIdx];
            assert(selectedWalk.isIndexed());

            // Check the divergence of the other walks to this walk
            StringVector cigarStrings;
            std::vector<int> maxIndel;
            std::vector<double> gapPercent; // percentage of matching that is gaps
            std::vector<double> totalPercent; // percent of total alignment that is mismatch or gap

            cigarStrings.resize(variantWalks.size());
            gapPercent.resize(variantWalks.size());
            totalPercent.resize(variantWalks.size());
            maxIndel.resize(variantWalks.size());

            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                if(i == selectedIdx)
                    continue;

                // We want to compute the total gap length, total mismatches and percent
                // divergence between the two paths.
                int matchLen = 0;
                int totalDiff = 0;
                int gapLength = 0;
                int maxGapLength = 0;

                // We have to handle the degenerate case where one internal string has zero length
                // this can happen when there is an isolated insertion/deletion and the walks are like:
                // x -> y -> z
                // x -> z
                if(walkStrings[selectedIdx].empty() || walkStrings[i].empty())
                {
                    matchLen = std::max(walkStrings[selectedIdx].size(), walkStrings[i].size());
                    totalDiff = matchLen;
                    gapLength = matchLen;
                    // The whole region is a single insertion/deletion
                    maxGapLength = matchLen;
                }
                else
                {
                    AlnAln *aln_global;
                    aln_global = aln_stdaln(walkStrings[selectedIdx].c_str(), walkStrings[i].c_str(), &aln_param_blast, 1, 1);

                    // Calculate the alignment parameters
                    while(aln_global->outm[matchLen] != '\0')
                    {
                        if(aln_global->outm[matchLen] == ' ')
                            totalDiff += 1;
                        matchLen += 1;
                    }

                    std::stringstream cigarSS;
                    for (int j = 0; j != aln_global->n_cigar; ++j)
                    {
                        char cigarOp = "MID"[aln_global->cigar32[j]&0xf];
                        int cigarLen = aln_global->cigar32[j]>>4;
                        if(cigarOp == 'I' || cigarOp == 'D')
                        {
                            gapLength += cigarLen;
                            // Track the longest single indel. Comparing the running
                            // total here would make the maximum equal to the total.
                            if(cigarLen > maxGapLength)
                                maxGapLength = cigarLen;
                        }

                        cigarSS << cigarLen;
                        cigarSS << cigarOp;
                    }
                    cigarStrings[i] = cigarSS.str();
                    aln_free_AlnAln(aln_global);
                }

                // Both strings may be empty, in which case there is nothing that
                // diverges; do not divide by zero.
                double percentDiff = matchLen > 0 ? (double)totalDiff / matchLen : 0.0;
                double percentGap = matchLen > 0 ? (double)gapLength / matchLen : 0.0;

                if(percentDiff > m_maxTotalDivergence)
                    bFailDivergenceCheck = true;

                if(percentGap > m_maxGapDivergence)
                    bFailGapCheck = true;

                if(maxGapLength > m_maxIndelLength)
                    bFailIndelSizeCheck = true;

                gapPercent[i] = percentGap;
                totalPercent[i] = percentDiff;
                maxIndel[i] = maxGapLength;
            }

            if(bIsDegenerate || bFailGapCheck || bFailDivergenceCheck || bFailIndelSizeCheck)
                continue;

            // Write the selected path to the variants file as variant 0
            int variantIdx = 0;
            std::string selectedSequence = selectedWalk.getString(SGWT_START_TO_END);
            std::stringstream ss;
            ss << "variant-" << m_numRemovedTotal << "/" << variantIdx++;
            writeFastaRecord(&m_outFile, ss.str(), selectedSequence);

            // The vertex set for each walk is not necessarily disjoint,
            // the selected walk may contain vertices that are part
            // of other paths. We handle this by only marking the vertices
            // that are not also on the selected walk.
            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                if(i == selectedIdx)
                    continue;

                SGWalk& currWalk = variantWalks[i];
                for(size_t j = 0; j < currWalk.getNumEdges() - 1; ++j)
                {
                    Edge* currEdge = currWalk.getEdge(j);

                    // If the vertex is also on the selected path, do not mark it
                    Vertex* currVertex = currEdge->getEnd();
                    if(!selectedWalk.containsVertex(currVertex->getID()))
                    {
                        currEdge->getEnd()->setColor(GC_RED);
                    }
                }

                // Write the variant to a file
                std::string variantSequence = currWalk.getString(SGWT_START_TO_END);
                std::stringstream variantHeader;
                variantHeader << "variant-" << m_numRemovedTotal << "/" << variantIdx++;
                variantHeader << " IGD:" << (double)gapPercent[i] << " ITD:" << totalPercent[i] << " MID: " << maxIndel[i] << " InternalCigar:" << cigarStrings[i];
                writeFastaRecord(&m_outFile, variantHeader.str(), variantSequence);
            }

            if(variantWalks.size() == 2)
                m_simpleBubblesRemoved += 1;
            else
                m_complexBubblesRemoved += 1;
            ++m_numRemovedTotal;
        }
    }
    return found;
}
int main( int argc, char* argv[] ) { unsigned long long num_pairs; unsigned long long num_merged; unsigned long long num_adapter; unsigned long long num_discarded; unsigned long long num_too_ambiguous_to_merge; unsigned long long max_pretty_print = DEF_MAX_PRETTY_PRINT; unsigned long long num_pretty_print = 0; int adapter_thresh = DEF_ADAPTER_SCORE_THRES; int read_thresh = DEF_READ_SCORE_THRES; clock_t start, end; //init to 0 num_pairs = num_merged = num_adapter = num_discarded = num_too_ambiguous_to_merge = 0; extern char* optarg; bool p64 = false; char forward_fn[MAX_FN_LEN]; char reverse_fn[MAX_FN_LEN]; char forward_out_fn[MAX_FN_LEN]; char reverse_out_fn[MAX_FN_LEN]; char forward_discard_fn[MAX_FN_LEN]; char reverse_discard_fn[MAX_FN_LEN]; char merged_out_fn[MAX_FN_LEN]; bool do_read_merging = false; bool print_overhang = false; bool write_discard=false; char forward_primer[MAX_SEQ_LEN+1]; strcpy(forward_primer, DEF_FORWARD_PRIMER); //set default char forward_primer_dummy_qual[MAX_SEQ_LEN+1]; char reverse_primer[MAX_SEQ_LEN+1]; strcpy(reverse_primer, DEF_REVERSE_PRIMER); //set default char reverse_primer_dummy_qual[MAX_SEQ_LEN+1]; int i; for(i=0;i<MAX_SEQ_LEN+1;i++){ forward_primer_dummy_qual[i] = 'N';//phred score of 45 reverse_primer_dummy_qual[i] = 'N'; } int ich; int min_ol_adapter = DEF_OL2MERGE_ADAPTER; int min_ol_reads = DEF_OL2MERGE_READS; unsigned short int min_read_len =DEF_MIN_READ_LEN; float min_match_adapter_frac = DEF_MIN_MATCH_ADAPTER; float min_match_reads_frac = DEF_MIN_MATCH_READS; float max_mismatch_adapter_frac = DEF_MAX_MISMATCH_ADAPTER; float max_mismatch_reads_frac = DEF_MAX_MISMATCH_READS; float read_frac_thresh = DEF_READ_GAP_FRAC_CUTOFF; unsigned short max_mismatch_adapter[MAX_SEQ_LEN+1]; unsigned short max_mismatch_reads[MAX_SEQ_LEN+1]; unsigned short min_match_adapter[MAX_SEQ_LEN+1]; unsigned short min_match_reads[MAX_SEQ_LEN+1]; char qcut = (char)DEF_QCUT+33; bool pretty_print = false; char pretty_print_fn[MAX_FN_LEN+1]; SQP sqp = 
SQP_init(); char untrim_fseq[MAX_SEQ_LEN+1]; char untrim_fqual[MAX_SEQ_LEN+1]; char untrim_rseq[MAX_SEQ_LEN+1]; char untrim_rqual[MAX_SEQ_LEN+1]; /* No args - help! */ if ( argc == 1 ) { help(argv[0]); } int req_args = 0; while( (ich=getopt( argc, argv, "f:r:1:2:3:4:q:A:s:y:B:O:E:x:M:N:L:o:m:b:w:W:p:P:X:Q:t:e:Z:n:6gh" )) != -1 ) { switch( ich ) { //REQUIRED ARGUMENTS case 'f' : req_args ++; strcpy( forward_fn, optarg ); break; case 'r' : req_args ++; strcpy( reverse_fn, optarg ); break; case '1' : req_args ++; strcpy(forward_out_fn, optarg); break; case '2' : req_args ++; strcpy(reverse_out_fn, optarg); break; //OPTIONAL GENERAL ARGUMENTS case '3' : write_discard=true; strcpy(forward_discard_fn, optarg); break; case '4' : write_discard=true; strcpy(reverse_discard_fn, optarg); break; case 'h' : help(argv[0]); break; case '6' : p64 = true; break; case 'q' : qcut = atoi(optarg)+33; break; case 'L' : min_read_len = atoi(optarg); break; //OPTIONAL ADAPTER/PRIMER TRIMMING ARGUMENTS case 'A': strcpy(forward_primer, optarg); break; case 'B': strcpy(reverse_primer, optarg); break; case 'O': min_ol_adapter = atoi(optarg); break; case 'M': max_mismatch_adapter_frac = atof(optarg); break; case 'N': min_match_adapter_frac = atof(optarg); break; case 'b': aln_param_nt2nt.band_width = atoi(optarg); break; case 'Q': aln_param_nt2nt.gap_open = atoi(optarg); break; case 't': aln_param_nt2nt.gap_ext = atoi(optarg); break; case 'e': aln_param_nt2nt.gap_end = atoi(optarg); break; case 'Z': adapter_thresh = atoi(optarg); break; case 'w': aln_param_rd2rd.band_width = atoi(optarg); break; case 'W': aln_param_rd2rd.gap_open = atoi(optarg); break; case 'p': aln_param_rd2rd.gap_ext = atoi(optarg); break; case 'P': aln_param_rd2rd.gap_end = atoi(optarg); break; case 'X': read_frac_thresh = atof(optarg); break; //OPTIONAL MERGING ARGUMENTS case 'y' : maximum_quality = optarg[0]; break; case 'g' : print_overhang = true; break; case 's' : do_read_merging = true; strcpy( merged_out_fn, optarg ); 
break; case 'o': min_ol_reads = atoi(optarg); break; case 'm': max_mismatch_reads_frac = atof(optarg); break; case 'n': min_match_reads_frac = atof(optarg); break; case 'E': pretty_print = true; strcpy(pretty_print_fn,optarg); break; case 'x': max_pretty_print = atol(optarg); break; default : help(argv[0]); } } if(req_args < 4){ fprintf(stderr, "Missing a required argument!\n"); help(argv[0]); } start = clock(); //allocate alignment memory // int min_match = 8; // int ngaps = 1; // int maxglen = 3; // AlnParam aln_param_adapter = { 5, 13, 19, aln_sm_read, 16, 75 }; // //Calculate table matching overlap length to min matches and max mismatches for(i=0;i<MAX_SEQ_LEN+1;i++){ max_mismatch_reads[i] = floor(((float)i)*max_mismatch_reads_frac); max_mismatch_adapter[i] = floor(((float)i)*max_mismatch_adapter_frac); min_match_reads[i] = ceil(((float)i)*min_match_reads_frac); min_match_adapter[i] = ceil(((float)i)*min_match_adapter_frac); } //get length of forward and reverse primers int forward_primer_len = strlen(forward_primer); int reverse_primer_len = strlen(reverse_primer); gzFile ffq = fileOpen(forward_fn, "r"); gzFile ffqw = fileOpen(forward_out_fn,"w"); gzFile rfq = fileOpen(reverse_fn, "r"); gzFile rfqw = fileOpen(reverse_out_fn,"w"); gzFile mfqw = NULL; gzFile ppaw = NULL; gzFile dffqw = NULL; gzFile drfqw = NULL; if(do_read_merging) mfqw = fileOpen(merged_out_fn,"w"); if(pretty_print) ppaw = fileOpen(pretty_print_fn,"w"); if(write_discard){ dffqw = fileOpen(forward_discard_fn,"w"); drfqw = fileOpen(reverse_discard_fn,"w"); } /** * Loop over all of the reads */ while(next_fastqs( ffq, rfq, sqp, p64 )){ //returns false when done update_spinner(num_pairs++); AlnAln *faaln, *raaln, *fraln; //save a copy of the original sequences/qualities first strcpy(untrim_fseq,sqp->fseq); strcpy(untrim_fqual,sqp->fqual); strcpy(untrim_rseq,sqp->rseq); strcpy(untrim_rqual,sqp->rqual); faaln = aln_stdaln_aux(sqp->fseq, forward_primer, &aln_param_nt2nt, ALN_TYPE_LOCAL, adapter_thresh 
, sqp->flen, forward_primer_len); raaln = aln_stdaln_aux(sqp->rseq, reverse_primer, &aln_param_nt2nt, ALN_TYPE_LOCAL, adapter_thresh, sqp->rlen, reverse_primer_len); //check for direct adapter match. if(adapter_trim(sqp, min_ol_adapter, forward_primer, forward_primer_dummy_qual, forward_primer_len, reverse_primer, reverse_primer_dummy_qual, reverse_primer_len, min_match_adapter, max_mismatch_adapter, min_match_reads, max_mismatch_reads, qcut) || faaln->score >= adapter_thresh || raaln->score >= adapter_thresh){ num_adapter++; //adapter present //print it if user wants if(pretty_print && num_pretty_print < max_pretty_print){ //void pretty_print_alignment_stdaln(gzFile out, SQP sqp, AlnAln *aln, bool first_adapter, bool second_adapter) if(faaln->score >= adapter_thresh){ num_pretty_print++; pretty_print_alignment_stdaln(ppaw,sqp,faaln,true,false,false); } if(raaln->score >= adapter_thresh){ num_pretty_print++; pretty_print_alignment_stdaln(ppaw,sqp,raaln,false,true,false); } } //do stuff to it //assume full length adapter and squish it down to the read with no gaps int rpos,fpos; rpos = fpos = (- MAX_SEQ_LEN); if(faaln->score >= adapter_thresh){ fpos = max(faaln->start1 - faaln->start2,0); } if(raaln->score >= adapter_thresh){ rpos = max(raaln->start1 - raaln->start2,0); } //make rlen the minimum of the two adapter search methods if(rpos >= 0){ sqp->rlen = min(sqp->rlen,rpos); } //make flen the minimum of the two adapter search methods if(fpos >= 0){ sqp->flen = min(sqp->flen,fpos); } if(sqp->flen < min_read_len || sqp->rlen < min_read_len){ num_discarded++; if(write_discard){ write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual); write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual); } goto CLEAN_ADAPTERS; }else{ //trim the adapters sqp->fseq[sqp->flen] = '\0'; sqp->fqual[sqp->flen] = '\0'; sqp->rseq[sqp->rlen] = '\0'; sqp->rqual[sqp->rlen] = '\0'; strncpy(sqp->rc_rseq,sqp->rseq,sqp->rlen+1); //move regular reads now trimmed into RC read's place 
strncpy(sqp->rc_rqual,sqp->rqual,sqp->rlen+1); rev_qual(sqp->rc_rqual, sqp->rlen); //amd re-reverse the RC reads revcom_seq(sqp->rc_rseq, sqp->rlen); } //do a nice global alignment between two reads, and print consensus fraln = aln_stdaln_aux(sqp->fseq, sqp->rc_rseq, &aln_param_rd2rd, ALN_TYPE_GLOBAL, 1, sqp->flen, sqp->rlen); //calculate the minimum score we are willing to accept to merge the reads //basically this is saying that 7/8 of the read must overlap perfectly read_thresh = (((int)sqp->flen) + ((int)sqp->rlen)) - (((int)sqp->flen) * read_frac_thresh * aln_param_rd2rd.gap_ext) - (((int)sqp->rlen) * read_frac_thresh * aln_param_rd2rd.gap_ext) - (aln_param_rd2rd.gap_open*2) - (aln_param_rd2rd.gap_end*2); //now lets put something useful in the alignment suboptimal score thing since right now it //is just left blank: //fprintf(stderr, "rt:%d\tfl:%d\trl:%d\trft:%f\tgx:%d\tgo:%d\tge%d\n", read_thresh,((int)sqp->flen),((int)sqp->rlen),read_frac_thresh,aln_param_rd2rd.gap_ext,aln_param_rd2rd.gap_open,aln_param_rd2rd.gap_end); fraln->subo = read_thresh; if(do_read_merging && fraln->score > read_thresh){ //if we want read merging, //and the alignment score is better than the threshold just calculated... //write the merged sequence fill_merged_sequence(sqp, fraln, true); if(pretty_print && num_pretty_print < max_pretty_print){ num_pretty_print++; pretty_print_alignment_stdaln(ppaw,sqp,fraln,false,false,true); } if(strlen(sqp->merged_seq) >= min_read_len && strlen(sqp->merged_qual) >= min_read_len){ num_merged++; write_fastq(mfqw,sqp->fid,sqp->merged_seq,sqp->merged_qual); } else{ num_discarded++; if(write_discard){ write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual); write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual); } } }else if(fraln->score > read_thresh){ // we know that the adapters are present, trimmed, and the resulting // read lengths are both long enough to print. // We also know that we aren't doing merging. // Now we just need to print. 
if(pretty_print && num_pretty_print < max_pretty_print){ num_pretty_print++; pretty_print_alignment_stdaln(ppaw,sqp,fraln,false,false,true); } //do end polishing to take care of examples like the following: // Read Alignment Score:59, Suboptimal Score:-85 // ID:HWI-ST593:1:1101:14566:7002#ACA/1 // READ1: ------------ATACAACTCGCTGACTTTGTCCTGGCATTTGACATATGCCTCGTAGTCTGCAAAGACTTTAAACCGGTCATGGTGGAACAGCATGTTGA // |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| // READ2: CTCTTCCGATCTATACAACTCGCTGACTTTGTCCTGGCATTTGACATATGCCTCGTAGTCTGCAAAGACTTTAAACCGGTCATGGTGGAACAGCATGTTG- make_blunt_ends(sqp,fraln); if(strlen(sqp->fseq) >= min_read_len && strlen(sqp->fqual) >= min_read_len && strlen(sqp->rseq) >= min_read_len && strlen(sqp->rqual) >= min_read_len){ write_fastq(ffqw, sqp->fid, sqp->fseq, sqp->fqual); write_fastq(rfqw, sqp->rid, sqp->rseq, sqp->rqual); }else{ num_discarded++; if(write_discard){ write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual); write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual); } } }else{ //there was a bad looking read-read alignment, so lets not risk it and junk it num_discarded++; if(write_discard){ //write_fastq(dffqw, sqp->fid, sqp->fseq, sqp->fqual); //write_fastq(drfqw, sqp->rid, sqp->rseq, sqp->rqual); write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual); write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual); } } }else{ //no adapters present //check for strong read overlap to assist trimming ends of adapters from end of read if(do_read_merging){ if(read_merge(sqp, min_ol_reads, min_match_reads, max_mismatch_reads, qcut)){ //print merged output if(strlen(sqp->merged_seq) >= min_read_len && strlen(sqp->merged_qual) >= min_read_len){ num_merged++; write_fastq(mfqw,sqp->fid,sqp->merged_seq,sqp->merged_qual); if(pretty_print && num_pretty_print < max_pretty_print){ num_pretty_print++; pretty_print_alignment(ppaw,sqp,qcut,false); //false b/c merged input in fixed order } }else{ num_discarded++; 
if(write_discard){ write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual); write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual); } } }else{ //no significant overlap so just write them if(strlen(sqp->fseq) >= min_read_len && strlen(sqp->fqual) >= min_read_len && strlen(sqp->rseq) >= min_read_len && strlen(sqp->rqual) >= min_read_len){ write_fastq(ffqw, sqp->fid, sqp->fseq, sqp->fqual); write_fastq(rfqw, sqp->rid, sqp->rseq, sqp->rqual); }else{ num_discarded++; if(write_discard){ write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual); write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual); } } } //done goto CLEAN_ADAPTERS; }else{ //just write reads to output fastqs if(strlen(sqp->fseq) >= min_read_len && strlen(sqp->fqual) >= min_read_len && strlen(sqp->rseq) >= min_read_len && strlen(sqp->rqual) >= min_read_len){ write_fastq(ffqw, sqp->fid, sqp->fseq, sqp->fqual); write_fastq(rfqw, sqp->rid, sqp->rseq, sqp->rqual); }else{ num_discarded++; if(write_discard){ write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual); write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual); } } goto CLEAN_ADAPTERS; } } /** * Section for heirarchial cleanup * * In every case we will at least have to free up the alignment between the adapter and two reads. * however in some cases there will be an additional alignment between the two reads. 
We can do * good cleanup in this case with gotos */ aln_free_AlnAln(fraln); CLEAN_ADAPTERS: aln_free_AlnAln(faaln); aln_free_AlnAln(raaln); //End the loop over reads } end = clock(); double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC; fprintf(stderr,"\nPairs Processed:\t%lld\n",num_pairs); fprintf(stderr,"Pairs Merged:\t%lld\n",num_merged); fprintf(stderr,"Pairs With Adapters:\t%lld\n",num_adapter); fprintf(stderr,"Pairs Discarded:\t%lld\n",num_discarded); fprintf(stderr,"CPU Time Used (Minutes):\t%lf\n",cpu_time_used/60.0); SQP_destroy(sqp); gzclose(ffq); gzclose(ffqw); gzclose(rfq); gzclose(rfqw); if(mfqw != NULL) gzclose(mfqw); if(ppaw != NULL) gzclose(ppaw); if(dffqw != NULL) gzclose(dffqw); if(drfqw != NULL) gzclose(drfqw); return 0; }
// extract differences between the pair of strings std::vector<Variant> extract_variants(const std::string& reference, const std::string& haplotype) { AlnParam par = aln_param_nt2nt; par.band_width = std::max(20, abs(reference.size() - haplotype.size()) * 2); AlnAln* aln = aln_stdaln(reference.c_str(), haplotype.c_str(), &par, 1, 1); // Make aligned strings where gaps are padded with '-' std::string pad_ref(aln->out1); std::string pad_hap(aln->out2); assert(pad_ref.size() == pad_hap.size()); //std::cout << "PR: " << pad_ref << "\n"; //std::cout << "PH: " << pad_hap << "\n"; // parse variants from the alignment std::vector<Variant> variants; // generate a map from padded bases to positions in the original reference sequence std::vector<size_t> ref_positions(pad_ref.size(), 0); size_t pos = 0; for(size_t i = 0; i < pad_ref.size(); ++i) { ref_positions[i] = pad_ref[i] != '-' ? pos : std::string::npos; pos += pad_ref[i] != '-'; } // diff_start iterates over the places where these sequences are different size_t diff_start = 0; while(1) { // find the start point of the next difference between the strings while(diff_start < pad_ref.size() && pad_ref[diff_start] == pad_hap[diff_start]) { diff_start++; } // check for end of alignment if(diff_start == pad_ref.size()) break; // find the end point of the difference bool is_indel = false; size_t diff_end = diff_start; while(diff_end < pad_ref.size() && pad_ref[diff_end] != pad_hap[diff_end]) { is_indel = is_indel || pad_ref[diff_end] == '-' || pad_hap[diff_end] == '-'; diff_end++; } // If the difference is an indel, we include the previous matching reference base diff_start -= is_indel; Variant v; v.ref_name = "noctg"; assert(ref_positions[diff_start] != std::string::npos); v.ref_position = ref_positions[diff_start]; v.ref_seq = remove_gaps(pad_ref.substr(diff_start, diff_end - diff_start).c_str()); v.alt_seq = remove_gaps(pad_hap.substr(diff_start, diff_end - diff_start).c_str()); variants.push_back(v); diff_start = diff_end; 
} aln_free_AlnAln(aln); return variants; }