/* next_fastqs
   Clears every sequence, quality and ID buffer of curr_sqp, then reads one
   record from the forward fastq (ffq) and one from the reverse fastq (rfq)
   into it. When both read_fastq calls return 1 and f_r_id_check accepts the
   two IDs, the reverse read and its qualities are additionally copied into
   rc_rseq / rc_rqual and passed through revcom_seq / rev_qual, and true is
   returned. In every other case false is returned. */
inline bool next_fastqs( gzFile ffq, gzFile rfq, SQP curr_sqp, bool p64 ) {
  size_t fwd_id_len = 0;
  size_t rev_id_len = 0;
  int fwd_status; /* result of reading the forward record */
  int rev_status; /* result of reading the reverse record */

  /* Start from a clean slate so nothing from the previous pair leaks through */
  memset(curr_sqp->fid,         '\0', MAX_SEQ_LEN);
  memset(curr_sqp->rid,         '\0', MAX_SEQ_LEN);
  memset(curr_sqp->fseq,        '\0', MAX_SEQ_LEN);
  memset(curr_sqp->fqual,       '\0', MAX_SEQ_LEN);
  memset(curr_sqp->rseq,        '\0', MAX_SEQ_LEN);
  memset(curr_sqp->rqual,       '\0', MAX_SEQ_LEN);
  memset(curr_sqp->rc_rseq,     '\0', MAX_SEQ_LEN);
  memset(curr_sqp->rc_rqual,    '\0', MAX_SEQ_LEN);
  memset(curr_sqp->merged_seq,  '\0', MAX_SEQ_LEN+MAX_SEQ_LEN);
  memset(curr_sqp->merged_qual, '\0', MAX_SEQ_LEN+MAX_SEQ_LEN);
  curr_sqp->flen = 0;
  curr_sqp->rlen = 0;

  /* Pull the next record from each file of the pair */
  fwd_status = read_fastq( ffq, curr_sqp->fid, curr_sqp->fseq, curr_sqp->fqual,
                           &fwd_id_len, &(curr_sqp->flen), p64 );
  rev_status = read_fastq( rfq, curr_sqp->rid, curr_sqp->rseq, curr_sqp->rqual,
                           &rev_id_len, &(curr_sqp->rlen), p64 );

  if ( fwd_status != 1 || rev_status != 1 ) {
    return false;
  }
  if ( !f_r_id_check( curr_sqp->fid, fwd_id_len, curr_sqp->rid, rev_id_len ) ) {
    return false;
  }

  /* Keep a reverse-complemented copy of the second read for the overlap steps */
  strncpy(curr_sqp->rc_rseq,  curr_sqp->rseq,  curr_sqp->rlen+1);
  strncpy(curr_sqp->rc_rqual, curr_sqp->rqual, curr_sqp->rlen+1);
  rev_qual(curr_sqp->rc_rqual, curr_sqp->rlen);
  revcom_seq(curr_sqp->rc_rseq, curr_sqp->rlen);
  return true;
}
/**
 * Look for adapters by read overlap.
 *
 * The reverse-complemented reverse read (query) is overlapped against the
 * forward read (subject) while demanding a high minimum overlap:
 *
 *   Round1:
 *        ----------   Subj
 *       ----------    Query
 *   Round2:
 *        ----------   Subj
 *      ----------     Query
 *   ...
 *
 * A non-zero position returned by compute_ol is treated as the shift of the
 * query to the left of the subject, i.e. both reads run past the insert into
 * adapter, and both reads are trimmed in place.
 *
 * @param sqp                  read pair; fseq/fqual/flen, rseq/rqual/rlen and
 *                             rc_rseq/rc_rqual are updated when trimming happens
 * @param min_ol_adapter       minimum adapter overlap; the overlap demanded from
 *                             compute_ol is min(rlen, flen) - min_ol_adapter - 1,
 *                             clamped at 0
 * @param min_match_adapter    not used by this function
 * @param max_mismatch_adapter not used by this function
 * @param min_match_reads      table handed to compute_ol
 * @param max_mismatch_reads   table handed to compute_ol
 * @param qcut                 quality cutoff handed to compute_ol
 * @return true when an overhang was found and trimmed; false when compute_ol
 *         reports CODE_NOMATCH, CODE_AMBIGUOUS or position 0
 */
bool read_olap_adapter_trim(SQP sqp, size_t min_ol_adapter,
    unsigned short min_match_adapter[MAX_SEQ_LEN+1],
    unsigned short max_mismatch_adapter[MAX_SEQ_LEN+1],
    unsigned short min_match_reads[MAX_SEQ_LEN+1],
    unsigned short max_mismatch_reads[MAX_SEQ_LEN+1],
    char qcut){
  char *queryseq  = sqp->rc_rseq;
  char *queryqual = sqp->rc_rqual;
  char *subjseq   = sqp->fseq;
  char *subjqual  = sqp->fqual;
  int querylen = sqp->rlen;
  int subjlen  = sqp->flen;

  // min_ol_adapter is a size_t, so subtracting it directly from an int is
  // carried out in unsigned arithmetic and wraps around to a huge value when
  // the reads are shorter than min_ol_adapter + 1. Do the arithmetic in int
  // and clamp explicitly so that the lower bound of 0 really applies.
  int shorter  = (querylen < subjlen) ? querylen : subjlen;
  int min_olap = shorter - (int)min_ol_adapter - 1;
  if(min_olap < 0){
    min_olap = 0;
  }

  // pass true here so ambiguous matches are avoided
  int ppos = compute_ol(
      queryseq, queryqual, querylen,
      subjseq, subjqual, subjlen,
      min_olap,
      min_match_reads, max_mismatch_reads,
      true, qcut );

  if(ppos == CODE_NOMATCH || ppos == CODE_AMBIGUOUS){
    return false;
  }
  if(ppos == 0){
    // no adapter
    return false;
  }

  // ppos gives us the shift to the left of the query
  // One case:
  //   ----X-------   fread
  //  -X----          rread
  // Another case:
  //   ---X-          fread
  //  -X---           rread
  // Another case:
  //   ----           fread
  //  -X----X-        rread

  // first calculate rlen after the first clip
  sqp->rlen -= ppos;

  if(sqp->rlen <= sqp->flen){
    // in the first two cases shown above, the other cut point is just the new rlen
    sqp->flen = sqp->rlen;
  }else{
    // Another case:
    //   ----         fread
    //  -X----X---    rread
    // make the initial cut on the reverse-complemented read
    sqp->rc_rqual[ppos + sqp->flen] = '\0';
    sqp->rc_rseq[ppos + sqp->flen] = '\0';
    // move the RC read back into the regular slot and reverse it again
    strncpy(sqp->rseq, sqp->rc_rseq, ppos + sqp->flen+1);
    strncpy(sqp->rqual, sqp->rc_rqual, ppos + sqp->flen+1);
    rev_qual(sqp->rqual, ppos + sqp->flen);
    revcom_seq(sqp->rseq, ppos + sqp->flen);
    // the end cut is now in place in the regular read
    sqp->rlen = sqp->flen;
  }

  // all cases handled and lengths determined: terminate the strings there
  sqp->fseq[sqp->flen] = '\0';
  sqp->fqual[sqp->flen] = '\0';
  sqp->rseq[sqp->rlen] = '\0';
  sqp->rqual[sqp->rlen] = '\0';

  // rebuild the reverse-complemented copy from the trimmed reverse read
  strncpy(sqp->rc_rseq, sqp->rseq, sqp->rlen+1);
  strncpy(sqp->rc_rqual, sqp->rqual, sqp->rlen+1);
  rev_qual(sqp->rc_rqual, sqp->rlen);
  revcom_seq(sqp->rc_rseq, sqp->rlen);
  return true;
}
/*
 * Reports whether every column of an alignment row from index `from` up to
 * (but excluding) `len` is the gap character '-'.
 */
static bool only_gaps_remain(const char *row, int from, int len){
  int k;
  for(k = from; k < len; k++){
    if(row[k] != '-'){
      return false;
    }
  }
  return true;
}

/**
 * After performing overlap post adapter trimming, this trims the ends of
 * the reads the same way as when they are merged, but otherwise leaves the
 * nucleotides the same as they were.
 *
 * Walks the two alignment rows (aln->out1 for the forward read, aln->out2
 * for the reverse-complemented reverse read) column by column and rewrites
 * fseq/fqual and rc_rseq/rc_rqual in place:
 *   - columns before the first column where both rows hold a base are dropped,
 *   - once one row has a base and the other row holds only gaps from there to
 *     the end, copying stops.
 * Finally flen/rlen are updated and rseq/rqual are regenerated from the
 * trimmed rc_rseq/rc_rqual.
 */
void make_blunt_ends(SQP sqp, AlnAln *aln){
  char *fwd_row = aln->out1;
  char *rev_row = aln->out2;
  int aln_len = strlen(aln->out1);
  int src_f = 0;  // next unread position in the original forward qualities
  int src_r = 0;  // next unread position in the original rc reverse qualities
  int dst_f = 0;  // next write position in the forward read
  int dst_r = 0;  // next write position in the rc reverse read
  bool in_leading_overhang = true;  // true until the first base/base column
  int col;

  for(col = 0; col < aln_len; col++){
    char fbase = toupper(fwd_row[col]);
    char rbase = toupper(rev_row[col]);
    char fq = sqp->fqual[src_f];
    char rq = sqp->rc_rqual[src_r];

    if(isXDNA(fbase)){
      if(isXDNA(rbase)){
        // both rows carry a base: keep both and advance everything
        sqp->fseq[dst_f] = fbase;
        sqp->fqual[dst_f] = fq;
        sqp->rc_rseq[dst_r] = rbase;
        sqp->rc_rqual[dst_r] = rq;
        in_leading_overhang = false;
        src_f++;
        src_r++;
        dst_f++;
        dst_r++;
      }else{
        // forward base opposite something that is not a base in the reverse row
        if(!in_leading_overhang){
          sqp->fseq[dst_f] = fbase;
          sqp->fqual[dst_f] = fq;
          if(only_gaps_remain(rev_row, col, aln_len)){
            // trailing overhang of the forward read: stop here
            break;
          }
          dst_f++;
        }
        src_f++;
      }
    }else if(isXDNA(rbase)){
      // reverse base opposite something that is not a base in the forward row
      if(!in_leading_overhang){
        sqp->rc_rseq[dst_r] = rbase;
        sqp->rc_rqual[dst_r] = rq;
        if(only_gaps_remain(fwd_row, col, aln_len)){
          // trailing overhang of the reverse read: stop here
          break;
        }
        dst_r++;
      }
      src_r++;
    }
  }

  // terminate the rewritten reads and record their new lengths
  sqp->fseq[dst_f] = '\0';
  sqp->fqual[dst_f] = '\0';
  sqp->flen = dst_f;
  sqp->rc_rseq[dst_r] = '\0';
  sqp->rc_rqual[dst_r] = '\0';
  sqp->rlen = dst_r;

  // regenerate the reverse read in its original orientation
  strncpy(sqp->rseq, sqp->rc_rseq, sqp->rlen+1);
  strncpy(sqp->rqual, sqp->rc_rqual, sqp->rlen+1);
  rev_qual( sqp->rqual, sqp->rlen );
  revcom_seq( sqp->rseq, sqp->rlen);
}
/**
 * adapter_trim:
 *
 * Searches both reads of a pair for the supplied primers/adapters using
 * compute_ol and trims the reads in place. Three stages, in order:
 *
 *  1. Primer overlapped onto the read (primer as first compute_ol sequence).
 *     If either read yields a position >= 0, both reads are emptied and
 *     true is returned.
 *  2. Read overlapped onto the primer with min_ol_adapter as minimum overlap.
 *     If either result differs from CODE_NOMATCH, every read with a
 *     position >= 0 is cut at that position, the reverse-complemented copy
 *     is rebuilt, and true is returned.
 *  3. Otherwise the decision is delegated to read_olap_adapter_trim.
 *
 * Adapters show up on reads when the insert size is less than the read
 * length; the adapter then appears at the end of the sequence.
 */
bool adapter_trim(SQP sqp, size_t min_ol_adapter,
    char *forward_primer, char *forward_primer_dummy_qual, int forward_primer_len,
    char *reverse_primer, char *reverse_primer_dummy_qual, int reverse_primer_len,
    unsigned short min_match_adapter[MAX_SEQ_LEN+1],
    unsigned short max_mismatch_adapter[MAX_SEQ_LEN+1],
    unsigned short min_match_reads[MAX_SEQ_LEN+1],
    unsigned short max_mismatch_reads[MAX_SEQ_LEN+1],
    char qcut){

  /* Stage 1: adapter match before the first position of the read */
  /* NOTE(review): if sqp->flen / sqp->rlen are unsigned, the "- 5" below cannot
     go negative and max(..., 0) would not clamp for reads shorter than 5 --
     confirm against the SQP declaration and the max/min definitions. */
  int fwd_head_hit = compute_ol(
      forward_primer, forward_primer_dummy_qual, forward_primer_len,
      sqp->fseq, sqp->fqual, sqp->flen,
      max(min(forward_primer_len,sqp->flen)-5,0),
      min_match_adapter, max_mismatch_adapter,
      false, qcut);
  int rev_head_hit = compute_ol(
      reverse_primer, reverse_primer_dummy_qual, reverse_primer_len,
      sqp->rseq, sqp->rqual, sqp->rlen,
      max(min(reverse_primer_len,sqp->rlen)-5,0),
      min_match_adapter, max_mismatch_adapter,
      false, qcut);

  if(fwd_head_hit >= 0 || rev_head_hit >= 0){
    /* the adapter matches at the very first position: nothing usable is left */
    sqp->flen = 0;
    sqp->rlen = 0;
    sqp->fseq[0] = '\0';
    sqp->fqual[0] = '\0';
    sqp->rseq[0] = '\0';
    sqp->rqual[0] = '\0';
    sqp->rc_rseq[0] = '\0';
    sqp->rc_rqual[0] = '\0';
    return true;
  }

  /* Stage 2: adapter somewhere after the first position of the read */
  int fwd_cut = compute_ol(
      sqp->fseq, sqp->fqual, sqp->flen,
      forward_primer, forward_primer_dummy_qual, forward_primer_len,
      min_ol_adapter,
      min_match_adapter, max_mismatch_adapter,
      false, qcut);
  int rev_cut = compute_ol(
      sqp->rseq, sqp->rqual, sqp->rlen,
      reverse_primer, reverse_primer_dummy_qual, reverse_primer_len,
      min_ol_adapter,
      min_match_adapter, max_mismatch_adapter,
      false, qcut);

  if(fwd_cut == CODE_NOMATCH && rev_cut == CODE_NOMATCH){
    /* Stage 3: no direct adapter hit, fall back to the read-overlap test */
    return read_olap_adapter_trim(sqp, min_ol_adapter,
        min_match_adapter, max_mismatch_adapter,
        min_match_reads, max_mismatch_reads,
        qcut);
  }

  /* cut each read where its adapter begins */
  if(fwd_cut >= 0){
    sqp->fseq[fwd_cut] = '\0';
    sqp->fqual[fwd_cut] = '\0';
    sqp->flen = fwd_cut;
  }
  if(rev_cut >= 0){
    sqp->rseq[rev_cut] = '\0';
    sqp->rqual[rev_cut] = '\0';
    sqp->rlen = rev_cut;
  }

  /* refresh the reverse-complemented copy of the reverse read */
  strncpy(sqp->rc_rseq, sqp->rseq, sqp->rlen+1);
  strncpy(sqp->rc_rqual, sqp->rqual, sqp->rlen+1);
  rev_qual(sqp->rc_rqual, sqp->rlen);
  revcom_seq(sqp->rc_rseq, sqp->rlen);

  /* adapters present */
  return true;
}
int main( int argc, char* argv[] ) { unsigned long long num_pairs; unsigned long long num_merged; unsigned long long num_adapter; unsigned long long num_discarded; unsigned long long num_too_ambiguous_to_merge; unsigned long long max_pretty_print = DEF_MAX_PRETTY_PRINT; unsigned long long num_pretty_print = 0; int adapter_thresh = DEF_ADAPTER_SCORE_THRES; int read_thresh = DEF_READ_SCORE_THRES; clock_t start, end; //init to 0 num_pairs = num_merged = num_adapter = num_discarded = num_too_ambiguous_to_merge = 0; extern char* optarg; bool p64 = false; char forward_fn[MAX_FN_LEN]; char reverse_fn[MAX_FN_LEN]; char forward_out_fn[MAX_FN_LEN]; char reverse_out_fn[MAX_FN_LEN]; char forward_discard_fn[MAX_FN_LEN]; char reverse_discard_fn[MAX_FN_LEN]; char merged_out_fn[MAX_FN_LEN]; bool do_read_merging = false; bool print_overhang = false; bool write_discard=false; char forward_primer[MAX_SEQ_LEN+1]; strcpy(forward_primer, DEF_FORWARD_PRIMER); //set default char forward_primer_dummy_qual[MAX_SEQ_LEN+1]; char reverse_primer[MAX_SEQ_LEN+1]; strcpy(reverse_primer, DEF_REVERSE_PRIMER); //set default char reverse_primer_dummy_qual[MAX_SEQ_LEN+1]; int i; for(i=0;i<MAX_SEQ_LEN+1;i++){ forward_primer_dummy_qual[i] = 'N';//phred score of 45 reverse_primer_dummy_qual[i] = 'N'; } int ich; int min_ol_adapter = DEF_OL2MERGE_ADAPTER; int min_ol_reads = DEF_OL2MERGE_READS; unsigned short int min_read_len =DEF_MIN_READ_LEN; float min_match_adapter_frac = DEF_MIN_MATCH_ADAPTER; float min_match_reads_frac = DEF_MIN_MATCH_READS; float max_mismatch_adapter_frac = DEF_MAX_MISMATCH_ADAPTER; float max_mismatch_reads_frac = DEF_MAX_MISMATCH_READS; float read_frac_thresh = DEF_READ_GAP_FRAC_CUTOFF; unsigned short max_mismatch_adapter[MAX_SEQ_LEN+1]; unsigned short max_mismatch_reads[MAX_SEQ_LEN+1]; unsigned short min_match_adapter[MAX_SEQ_LEN+1]; unsigned short min_match_reads[MAX_SEQ_LEN+1]; char qcut = (char)DEF_QCUT+33; bool pretty_print = false; char pretty_print_fn[MAX_FN_LEN+1]; SQP sqp = 
SQP_init(); char untrim_fseq[MAX_SEQ_LEN+1]; char untrim_fqual[MAX_SEQ_LEN+1]; char untrim_rseq[MAX_SEQ_LEN+1]; char untrim_rqual[MAX_SEQ_LEN+1]; /* No args - help! */ if ( argc == 1 ) { help(argv[0]); } int req_args = 0; while( (ich=getopt( argc, argv, "f:r:1:2:3:4:q:A:s:y:B:O:E:x:M:N:L:o:m:b:w:W:p:P:X:Q:t:e:Z:n:6gh" )) != -1 ) { switch( ich ) { //REQUIRED ARGUMENTS case 'f' : req_args ++; strcpy( forward_fn, optarg ); break; case 'r' : req_args ++; strcpy( reverse_fn, optarg ); break; case '1' : req_args ++; strcpy(forward_out_fn, optarg); break; case '2' : req_args ++; strcpy(reverse_out_fn, optarg); break; //OPTIONAL GENERAL ARGUMENTS case '3' : write_discard=true; strcpy(forward_discard_fn, optarg); break; case '4' : write_discard=true; strcpy(reverse_discard_fn, optarg); break; case 'h' : help(argv[0]); break; case '6' : p64 = true; break; case 'q' : qcut = atoi(optarg)+33; break; case 'L' : min_read_len = atoi(optarg); break; //OPTIONAL ADAPTER/PRIMER TRIMMING ARGUMENTS case 'A': strcpy(forward_primer, optarg); break; case 'B': strcpy(reverse_primer, optarg); break; case 'O': min_ol_adapter = atoi(optarg); break; case 'M': max_mismatch_adapter_frac = atof(optarg); break; case 'N': min_match_adapter_frac = atof(optarg); break; case 'b': aln_param_nt2nt.band_width = atoi(optarg); break; case 'Q': aln_param_nt2nt.gap_open = atoi(optarg); break; case 't': aln_param_nt2nt.gap_ext = atoi(optarg); break; case 'e': aln_param_nt2nt.gap_end = atoi(optarg); break; case 'Z': adapter_thresh = atoi(optarg); break; case 'w': aln_param_rd2rd.band_width = atoi(optarg); break; case 'W': aln_param_rd2rd.gap_open = atoi(optarg); break; case 'p': aln_param_rd2rd.gap_ext = atoi(optarg); break; case 'P': aln_param_rd2rd.gap_end = atoi(optarg); break; case 'X': read_frac_thresh = atof(optarg); break; //OPTIONAL MERGING ARGUMENTS case 'y' : maximum_quality = optarg[0]; break; case 'g' : print_overhang = true; break; case 's' : do_read_merging = true; strcpy( merged_out_fn, optarg ); 
break; case 'o': min_ol_reads = atoi(optarg); break; case 'm': max_mismatch_reads_frac = atof(optarg); break; case 'n': min_match_reads_frac = atof(optarg); break; case 'E': pretty_print = true; strcpy(pretty_print_fn,optarg); break; case 'x': max_pretty_print = atol(optarg); break; default : help(argv[0]); } } if(req_args < 4){ fprintf(stderr, "Missing a required argument!\n"); help(argv[0]); } start = clock(); //allocate alignment memory // int min_match = 8; // int ngaps = 1; // int maxglen = 3; // AlnParam aln_param_adapter = { 5, 13, 19, aln_sm_read, 16, 75 }; // //Calculate table matching overlap length to min matches and max mismatches for(i=0;i<MAX_SEQ_LEN+1;i++){ max_mismatch_reads[i] = floor(((float)i)*max_mismatch_reads_frac); max_mismatch_adapter[i] = floor(((float)i)*max_mismatch_adapter_frac); min_match_reads[i] = ceil(((float)i)*min_match_reads_frac); min_match_adapter[i] = ceil(((float)i)*min_match_adapter_frac); } //get length of forward and reverse primers int forward_primer_len = strlen(forward_primer); int reverse_primer_len = strlen(reverse_primer); gzFile ffq = fileOpen(forward_fn, "r"); gzFile ffqw = fileOpen(forward_out_fn,"w"); gzFile rfq = fileOpen(reverse_fn, "r"); gzFile rfqw = fileOpen(reverse_out_fn,"w"); gzFile mfqw = NULL; gzFile ppaw = NULL; gzFile dffqw = NULL; gzFile drfqw = NULL; if(do_read_merging) mfqw = fileOpen(merged_out_fn,"w"); if(pretty_print) ppaw = fileOpen(pretty_print_fn,"w"); if(write_discard){ dffqw = fileOpen(forward_discard_fn,"w"); drfqw = fileOpen(reverse_discard_fn,"w"); } /** * Loop over all of the reads */ while(next_fastqs( ffq, rfq, sqp, p64 )){ //returns false when done update_spinner(num_pairs++); AlnAln *faaln, *raaln, *fraln; //save a copy of the original sequences/qualities first strcpy(untrim_fseq,sqp->fseq); strcpy(untrim_fqual,sqp->fqual); strcpy(untrim_rseq,sqp->rseq); strcpy(untrim_rqual,sqp->rqual); faaln = aln_stdaln_aux(sqp->fseq, forward_primer, &aln_param_nt2nt, ALN_TYPE_LOCAL, adapter_thresh 
, sqp->flen, forward_primer_len); raaln = aln_stdaln_aux(sqp->rseq, reverse_primer, &aln_param_nt2nt, ALN_TYPE_LOCAL, adapter_thresh, sqp->rlen, reverse_primer_len); //check for direct adapter match. if(adapter_trim(sqp, min_ol_adapter, forward_primer, forward_primer_dummy_qual, forward_primer_len, reverse_primer, reverse_primer_dummy_qual, reverse_primer_len, min_match_adapter, max_mismatch_adapter, min_match_reads, max_mismatch_reads, qcut) || faaln->score >= adapter_thresh || raaln->score >= adapter_thresh){ num_adapter++; //adapter present //print it if user wants if(pretty_print && num_pretty_print < max_pretty_print){ //void pretty_print_alignment_stdaln(gzFile out, SQP sqp, AlnAln *aln, bool first_adapter, bool second_adapter) if(faaln->score >= adapter_thresh){ num_pretty_print++; pretty_print_alignment_stdaln(ppaw,sqp,faaln,true,false,false); } if(raaln->score >= adapter_thresh){ num_pretty_print++; pretty_print_alignment_stdaln(ppaw,sqp,raaln,false,true,false); } } //do stuff to it //assume full length adapter and squish it down to the read with no gaps int rpos,fpos; rpos = fpos = (- MAX_SEQ_LEN); if(faaln->score >= adapter_thresh){ fpos = max(faaln->start1 - faaln->start2,0); } if(raaln->score >= adapter_thresh){ rpos = max(raaln->start1 - raaln->start2,0); } //make rlen the minimum of the two adapter search methods if(rpos >= 0){ sqp->rlen = min(sqp->rlen,rpos); } //make flen the minimum of the two adapter search methods if(fpos >= 0){ sqp->flen = min(sqp->flen,fpos); } if(sqp->flen < min_read_len || sqp->rlen < min_read_len){ num_discarded++; if(write_discard){ write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual); write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual); } goto CLEAN_ADAPTERS; }else{ //trim the adapters sqp->fseq[sqp->flen] = '\0'; sqp->fqual[sqp->flen] = '\0'; sqp->rseq[sqp->rlen] = '\0'; sqp->rqual[sqp->rlen] = '\0'; strncpy(sqp->rc_rseq,sqp->rseq,sqp->rlen+1); //move regular reads now trimmed into RC read's place 
strncpy(sqp->rc_rqual,sqp->rqual,sqp->rlen+1); rev_qual(sqp->rc_rqual, sqp->rlen); //amd re-reverse the RC reads revcom_seq(sqp->rc_rseq, sqp->rlen); } //do a nice global alignment between two reads, and print consensus fraln = aln_stdaln_aux(sqp->fseq, sqp->rc_rseq, &aln_param_rd2rd, ALN_TYPE_GLOBAL, 1, sqp->flen, sqp->rlen); //calculate the minimum score we are willing to accept to merge the reads //basically this is saying that 7/8 of the read must overlap perfectly read_thresh = (((int)sqp->flen) + ((int)sqp->rlen)) - (((int)sqp->flen) * read_frac_thresh * aln_param_rd2rd.gap_ext) - (((int)sqp->rlen) * read_frac_thresh * aln_param_rd2rd.gap_ext) - (aln_param_rd2rd.gap_open*2) - (aln_param_rd2rd.gap_end*2); //now lets put something useful in the alignment suboptimal score thing since right now it //is just left blank: //fprintf(stderr, "rt:%d\tfl:%d\trl:%d\trft:%f\tgx:%d\tgo:%d\tge%d\n", read_thresh,((int)sqp->flen),((int)sqp->rlen),read_frac_thresh,aln_param_rd2rd.gap_ext,aln_param_rd2rd.gap_open,aln_param_rd2rd.gap_end); fraln->subo = read_thresh; if(do_read_merging && fraln->score > read_thresh){ //if we want read merging, //and the alignment score is better than the threshold just calculated... //write the merged sequence fill_merged_sequence(sqp, fraln, true); if(pretty_print && num_pretty_print < max_pretty_print){ num_pretty_print++; pretty_print_alignment_stdaln(ppaw,sqp,fraln,false,false,true); } if(strlen(sqp->merged_seq) >= min_read_len && strlen(sqp->merged_qual) >= min_read_len){ num_merged++; write_fastq(mfqw,sqp->fid,sqp->merged_seq,sqp->merged_qual); } else{ num_discarded++; if(write_discard){ write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual); write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual); } } }else if(fraln->score > read_thresh){ // we know that the adapters are present, trimmed, and the resulting // read lengths are both long enough to print. // We also know that we aren't doing merging. // Now we just need to print. 
if(pretty_print && num_pretty_print < max_pretty_print){ num_pretty_print++; pretty_print_alignment_stdaln(ppaw,sqp,fraln,false,false,true); } //do end polishing to take care of examples like the following: // Read Alignment Score:59, Suboptimal Score:-85 // ID:HWI-ST593:1:1101:14566:7002#ACA/1 // READ1: ------------ATACAACTCGCTGACTTTGTCCTGGCATTTGACATATGCCTCGTAGTCTGCAAAGACTTTAAACCGGTCATGGTGGAACAGCATGTTGA // |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| // READ2: CTCTTCCGATCTATACAACTCGCTGACTTTGTCCTGGCATTTGACATATGCCTCGTAGTCTGCAAAGACTTTAAACCGGTCATGGTGGAACAGCATGTTG- make_blunt_ends(sqp,fraln); if(strlen(sqp->fseq) >= min_read_len && strlen(sqp->fqual) >= min_read_len && strlen(sqp->rseq) >= min_read_len && strlen(sqp->rqual) >= min_read_len){ write_fastq(ffqw, sqp->fid, sqp->fseq, sqp->fqual); write_fastq(rfqw, sqp->rid, sqp->rseq, sqp->rqual); }else{ num_discarded++; if(write_discard){ write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual); write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual); } } }else{ //there was a bad looking read-read alignment, so lets not risk it and junk it num_discarded++; if(write_discard){ //write_fastq(dffqw, sqp->fid, sqp->fseq, sqp->fqual); //write_fastq(drfqw, sqp->rid, sqp->rseq, sqp->rqual); write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual); write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual); } } }else{ //no adapters present //check for strong read overlap to assist trimming ends of adapters from end of read if(do_read_merging){ if(read_merge(sqp, min_ol_reads, min_match_reads, max_mismatch_reads, qcut)){ //print merged output if(strlen(sqp->merged_seq) >= min_read_len && strlen(sqp->merged_qual) >= min_read_len){ num_merged++; write_fastq(mfqw,sqp->fid,sqp->merged_seq,sqp->merged_qual); if(pretty_print && num_pretty_print < max_pretty_print){ num_pretty_print++; pretty_print_alignment(ppaw,sqp,qcut,false); //false b/c merged input in fixed order } }else{ num_discarded++; 
if(write_discard){ write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual); write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual); } } }else{ //no significant overlap so just write them if(strlen(sqp->fseq) >= min_read_len && strlen(sqp->fqual) >= min_read_len && strlen(sqp->rseq) >= min_read_len && strlen(sqp->rqual) >= min_read_len){ write_fastq(ffqw, sqp->fid, sqp->fseq, sqp->fqual); write_fastq(rfqw, sqp->rid, sqp->rseq, sqp->rqual); }else{ num_discarded++; if(write_discard){ write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual); write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual); } } } //done goto CLEAN_ADAPTERS; }else{ //just write reads to output fastqs if(strlen(sqp->fseq) >= min_read_len && strlen(sqp->fqual) >= min_read_len && strlen(sqp->rseq) >= min_read_len && strlen(sqp->rqual) >= min_read_len){ write_fastq(ffqw, sqp->fid, sqp->fseq, sqp->fqual); write_fastq(rfqw, sqp->rid, sqp->rseq, sqp->rqual); }else{ num_discarded++; if(write_discard){ write_fastq(dffqw, sqp->fid, untrim_fseq, untrim_fqual); write_fastq(drfqw, sqp->rid, untrim_rseq, untrim_rqual); } } goto CLEAN_ADAPTERS; } } /** * Section for heirarchial cleanup * * In every case we will at least have to free up the alignment between the adapter and two reads. * however in some cases there will be an additional alignment between the two reads. 
We can do * good cleanup in this case with gotos */ aln_free_AlnAln(fraln); CLEAN_ADAPTERS: aln_free_AlnAln(faaln); aln_free_AlnAln(raaln); //End the loop over reads } end = clock(); double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC; fprintf(stderr,"\nPairs Processed:\t%lld\n",num_pairs); fprintf(stderr,"Pairs Merged:\t%lld\n",num_merged); fprintf(stderr,"Pairs With Adapters:\t%lld\n",num_adapter); fprintf(stderr,"Pairs Discarded:\t%lld\n",num_discarded); fprintf(stderr,"CPU Time Used (Minutes):\t%lf\n",cpu_time_used/60.0); SQP_destroy(sqp); gzclose(ffq); gzclose(ffqw); gzclose(rfq); gzclose(rfqw); if(mfqw != NULL) gzclose(mfqw); if(ppaw != NULL) gzclose(ppaw); if(dffqw != NULL) gzclose(dffqw); if(drfqw != NULL) gzclose(drfqw); return 0; }