int subsample(int nseqs, char *fastqfile, struct fastq *seqs[], int compressed) { // read the fastq file into a temporary hash struct fastq *allseqs[HASHSIZE] = {NULL}; int read_seqs = 0; if (compressed) read_seqs = read_fastq_gz(fastqfile, allseqs); else read_seqs = read_fastq(fastqfile, allseqs); if (read_seqs < nseqs) { fprintf(stderr, "You requested %d sequences but there are only %d in the file!\n", nseqs, read_seqs); nseqs = read_seqs; } // get all the ids from the sequences char *ids[read_seqs]; get_ids(ids, allseqs); // subsample those IDs char **subsample = subsample_n(read_seqs, ids, nseqs); for (int i=0; i<nseqs; i++) { char *info = get_seq_information(subsample[i], seqs); char *seq = get_sequence(subsample[i], seqs); char *qua = get_quality(subsample[i], seqs); } return nseqs; }
/*
 * filter_se_fastq_bz2
 *
 * Filter a single-end fastq file. Reads records from opts->r1, runs
 * check_read and filter_all on each one and writes the records for which
 * filter_all returns 1 to "<opts->output>.flt". A summary is printed to
 * stdout. Despite the _bz2 suffix the input is opened with gzopen_report
 * and handled as a gzFile.
 *
 * Returns 0 on success, -1 if the input or the output file could not be
 * opened.
 */
int filter_se_fastq_bz2(FLT_OPTS *opts){
	int index = 1;      /* number passed to read_fastq for the next record */
	int stat_left = 0;  /* records that passed the filter */
	int total = 0;      /* records actually processed */
	int ret = -1;
	gzFile fp = NULL;
	FILE *fo = NULL;
	SEQ_QUAL item = init_read();

	fp = gzopen_report(opts->r1,"r");
	if(!fp) goto cleanup;

	/* NOTE(review): strcat appends ".flt" to opts->output in place, so the
	 * caller's option is modified and its buffer must have room for four
	 * more characters. Kept as is because callers may read the new value. */
	fo = fopen_report(strcat(opts->output,".flt"),"w+");
	if(!fo) goto cleanup;

	/* The counter is advanced only after a record was really read, so that
	 * index-1 is the number of processed records once the loop ends. */
	while(read_fastq(fp,&item,index) > 0){
		check_read(&item,2);
		if(filter_all(&item, opts) == 1){
			output_fastq(fo, &item);
			stat_left++;
		}
		index++;
	}
	total = index-1;

	printf("Totally %d reads were processed\n",total);
	/* "%%" prints a literal percent sign; an empty input reports 0.00 */
	printf(" file [ %s ]: %d reads were left (%.2f%%)\n",opts->r1,stat_left,
	       total > 0 ? (float) stat_left*100/total : 0.0f);
	ret = 0;

cleanup:
	free_read(&item);
	if(fp) gzclose(fp);  /* a gzFile must be closed with gzclose, not fclose */
	if(fo) fclose(fo);
	return ret;
}
/* next_fastqs
   Fetch the next record from the forward stream (ffq) and from the reverse
   stream (rfq) into curr_sqp. All sequence, quality and id buffers of
   curr_sqp are zeroed first and both lengths are reset to 0.
   Returns true only when both read_fastq calls return 1 and f_r_id_check
   accepts the two ids; in that case rc_rseq / rc_rqual receive a copy of the
   reverse read, which is then handed to revcom_seq / rev_qual. Returns false
   otherwise. */
inline bool next_fastqs( gzFile ffq, gzFile rfq, SQP curr_sqp, bool p64 ) {
  size_t fwd_id_len = 0;
  size_t rev_id_len = 0;

  /* Start from clean buffers so nothing of the previous pair survives. */
  memset(curr_sqp->fid,         0, MAX_SEQ_LEN);
  memset(curr_sqp->rid,         0, MAX_SEQ_LEN);
  memset(curr_sqp->fseq,        0, MAX_SEQ_LEN);
  memset(curr_sqp->rseq,        0, MAX_SEQ_LEN);
  memset(curr_sqp->rc_rseq,     0, MAX_SEQ_LEN);
  memset(curr_sqp->fqual,       0, MAX_SEQ_LEN);
  memset(curr_sqp->rqual,       0, MAX_SEQ_LEN);
  memset(curr_sqp->merged_seq,  0, MAX_SEQ_LEN+MAX_SEQ_LEN);
  memset(curr_sqp->merged_qual, 0, MAX_SEQ_LEN+MAX_SEQ_LEN);
  memset(curr_sqp->rc_rqual,    0, MAX_SEQ_LEN);
  curr_sqp->rlen = 0;
  curr_sqp->flen = 0;

  /* Both streams are always advanced, even when the first read fails. */
  const int fwd_status = read_fastq( ffq, curr_sqp->fid, curr_sqp->fseq,
                                     curr_sqp->fqual, &fwd_id_len,
                                     &(curr_sqp->flen), p64 );
  const int rev_status = read_fastq( rfq, curr_sqp->rid, curr_sqp->rseq,
                                     curr_sqp->rqual, &rev_id_len,
                                     &(curr_sqp->rlen), p64 );

  if ( fwd_status != 1 || rev_status != 1 ) {
    return false;
  }
  if ( !f_r_id_check( curr_sqp->fid, fwd_id_len, curr_sqp->rid, rev_id_len ) ) {
    return false;
  }

  /* Copy the reverse read (rlen characters plus the terminator) and pass
     the copies on to the quality / sequence reversal helpers. */
  strncpy(curr_sqp->rc_rseq,  curr_sqp->rseq,  curr_sqp->rlen+1);
  strncpy(curr_sqp->rc_rqual, curr_sqp->rqual, curr_sqp->rlen+1);
  rev_qual(curr_sqp->rc_rqual, curr_sqp->rlen);
  revcom_seq(curr_sqp->rc_rseq, curr_sqp->rlen);
  return true;
}
int detect_datatype(char *file){ int dataType=0; int i = 0; int max = 0; int min = 999; int sample = 100; SEQ_QUAL item = init_read(); gzFile zfp = gzopen_report(file,"r"); if(gzgetc(zfp) == '>'){ dataType |= FILE_FASTA; }else{ gzseek(zfp, 0L, SEEK_SET); if(read_fastq(zfp,&item,i) >= 0){ dataType |= FILE_FASTQ; do{ for(i=0;i+1<strlen(item.qual);i++){ min = MIN(min, item.qual[i]); max = MAX(max, item.qual[i]); } if((sample--) == 0){ if(max >= 75) dataType|=FILE_PHRED64; else{ dataType|=FILE_PHRED33; if(min > 58) warning_msg("Can not identified quality score type in 100 read samples, assume phred+33\n"); } break; } }while(read_fastq(zfp,&item, i)); }else dataType |= FILE_UNKN; } gzclose(zfp); free_read(&item); return dataType; }
void CAReader_create_ovl_list(std::string fastq_filename, std::string gkp_store_name, std::string ovl_store_name, std::vector<Overlap_T>* ovl_list) { std::vector<std::string> fastq_names; std::vector<size_t> fastq_lengths; read_fastq(fastq_filename, &fastq_names, &fastq_lengths); gkStore* gkp_store = new gkStore(gkp_store_name.c_str(), false, false); OverlapStore *ovl_store = AS_OVS_openOverlapStore(ovl_store_name.c_str()); uint32_t iidMin = 0; uint32_t iidMax = gkp_store->gkStore_getNumFragments(); uint32_t ovl_len = 0; uint32_t ovl_max = 25000; OVSoverlap* overlaps = new OVSoverlap [ovl_max]; for(uint32_t iid=iidMin; iid < iidMax; iid++) { loadOverlaps(iid, overlaps, ovl_len, ovl_max, ovl_store, NULL); for(uint32_t i=0; i<ovl_len; i++) { Overlap_T ovl = {"","",0,0,0,0,0,0,false}; ovl.name_a = fastq_names[overlaps[i].a_iid - 1]; ovl.name_b = fastq_names[overlaps[i].b_iid - 1]; ovl.start_a = overlaps[i].dat.obt.a_beg; ovl.start_b = overlaps[i].dat.obt.b_beg; ovl.end_a = overlaps[i].dat.obt.a_end; ovl.end_b = overlaps[i].dat.obt.b_end_lo | (overlaps[i].dat.obt.b_end_hi << 9); if(ovl.end_b < ovl.start_b) { int tmp = ovl.start_b; ovl.start_b = ovl.end_b; ovl.end_b = tmp; } ovl.length_a = fastq_lengths[overlaps[i].a_iid - 1]; ovl.length_b = fastq_lengths[overlaps[i].b_iid - 1]; ovl.forward = overlaps[i].dat.obt.fwd; ovl_list->push_back(ovl); } } delete[] overlaps; AS_OVS_closeOverlapStore(ovl_store); delete gkp_store; }
inline bool produce(uint32_t i, sequence_list& buff) { stream_status& st = streams_[i]; switch(st.type) { case FASTA_TYPE: read_fasta(st, buff); break; case FASTQ_TYPE: read_fastq(st, buff); break; case DONE_TYPE: return true; } if(st.stream->good()) return false; // Reach the end of file, close current and try to open the next one open_next_file(st); return false; }
/*
 * Read the fastq file named by argv[1] and print its records to stdout as
 * ">id" / sequence line pairs, in the order given by sorted_sequence_ids.
 * Without a file argument, help() is called and the program exits with 0.
 */
int main(int argc, char *argv[]) {
	if (argc < 2) {
		help();
		exit(0);
	}

	struct fastq *seqs[HASHSIZE] = {NULL};
	int nseqs = read_fastq(argv[1], seqs);
	char **ids = sorted_sequence_ids(seqs, nseqs);

	for (int n = 0; n < nseqs; n++) {
		char *wanted = ids[n];
		int bucket = hash(wanted);

		/* walk the chain of this bucket and print every record whose id
		 * matches */
		struct fastq *node = seqs[bucket];
		while (node != NULL) {
			if (strcmp(node->seqid, wanted) == 0)
				printf(">%s\n%s\n", wanted, node->seq);
			node = node->next;
		}
	}

	return 0;
}
/// Constructs a ReferenceMap from the file at `filename`.
///
/// A FastqReader is created for that file and handed to read_fastq; the
/// reader is local to the constructor and is destroyed when it returns.
ReferenceMap::ReferenceMap(const std::string& filename) {
  FastqReader fastq_source{filename};
  read_fastq(fastq_source);
}
/* Format "<dir>/<base><suffix>" into dst (capacity cap).
 * Returns 0 on success, -1 (after a message on stderr) if it does not fit. */
static int build_flt_path(char *dst, size_t cap, const char *dir, const char *base, const char *suffix){
	int n = snprintf(dst, cap, "%s/%s%s", dir, base, suffix);
	if(n < 0 || (size_t) n >= cap){
		fprintf(stderr, "Output path %s/%s%s is too long\n", dir, base, suffix);
		return -1;
	}
	return 0;
}

/*
 * filter_pe_fastq
 *
 * Filter a pair of fastq files (opts->r1, opts->r2) record by record.
 * filter_all is applied to both mates of every pair:
 *   - both pass : the mates go to "<output>/<name of r1>.flt" and
 *                 "<output>/<name of r2>.flt"
 *   - one passes: that mate goes to "<output>/<name of r2>.flt.s"
 * where the names are produced by file_name(). A summary is printed to
 * stdout.
 *
 * Returns 0 on success, -1 if a file cannot be opened or an output path does
 * not fit its buffer. Everything opened so far is released on every path.
 */
int filter_pe_fastq(FLT_OPTS *opts){
	int left1=0,left2=0;
	int stat_single1 = 0;
	int stat_single2 = 0;
	int stat_paired = 0;
	int index=1;
	int pairs = 0;
	int ret = -1;
	char fn[128];
	char outfile[128];
	gzFile fp1 = NULL;
	gzFile fp2 = NULL;
	FILE *fo1 = NULL;
	FILE *fo2 = NULL;
	FILE *fos = NULL;
	SEQ_QUAL item1=init_read();
	SEQ_QUAL item2=init_read();

	fp1=gzopen_report(opts->r1,"r");
	if(!fp1) goto cleanup;
	fp2=gzopen_report(opts->r2,"r");
	if(!fp2) goto cleanup;

	file_name(outfile,opts->r1);
	if(build_flt_path(fn,sizeof(fn),opts->output,outfile,".flt") != 0) goto cleanup;
	fo1=fopen_report(fn,"w+");
	if(!fo1) goto cleanup;

	file_name(outfile,opts->r2);
	if(build_flt_path(fn,sizeof(fn),opts->output,outfile,".flt") != 0) goto cleanup;
	fo2=fopen_report(fn,"w+");
	if(!fo2) goto cleanup;

	/* the file for unpaired survivors is named after r2 */
	if(build_flt_path(fn,sizeof(fn),opts->output,outfile,".flt.s") != 0) goto cleanup;
	fos=fopen_report(fn,"w+");
	if(!fos) goto cleanup;

	while(read_fastq(fp1,&item1,index) >= 0 && read_fastq(fp2,&item2,index) >= 0) {
		left1=filter_all(&item1, opts);
		left2=filter_all(&item2, opts);
		if(left1 == 1 && left2 == 1){
			output_fastq(fo1, &item1);
			output_fastq(fo2, &item2);
			stat_paired++;
		}else{
			if(left1 == 1){
				output_fastq(fos, &item1);
				stat_single1++;
			}
			if(left2 == 1){
				output_fastq(fos, &item2);
				stat_single2++;
			}
		}
		index++;
	}
	pairs = index-1;

	/* "%%" prints a literal percent sign; with no pairs read the
	 * percentages are reported as 0.00 */
	printf("Totally %d reads were processed\n",pairs*2);
	printf(" file [ %s ]: %d reads were left (%.2f%%)\n",opts->r1,stat_paired+stat_single1,
	       pairs > 0 ? (float) (stat_paired+stat_single1)*100/pairs : 0.0f);
	printf(" file [ %s ]: %d reads were left (%.2f%%)\n",opts->r2,stat_paired+stat_single2,
	       pairs > 0 ? (float) (stat_paired+stat_single2)*100/pairs : 0.0f);
	printf("After filtering %d reads are paired in each file (%.2f%%)\n",stat_paired,
	       pairs > 0 ? (float) stat_paired*100/pairs : 0.0f);
	printf(" file [ %s ]: %d reads were left as single end\n",opts->r1,stat_single1);
	printf(" file [ %s ]: %d reads were left as single end\n",opts->r2,stat_single2);
	ret = 0;

cleanup:
	free_read(&item1);
	free_read(&item2);
	if(fp1) gzclose(fp1);
	if(fp2) gzclose(fp2);
	if(fo1) fclose(fo1);
	if(fo2) fclose(fo2);
	if(fos) fclose(fos);
	return ret;
}