/*
 * Check whether the read name format was generated by CASAVA 1.8.
 *
 * Reads the first line of the FASTQ file `f` and matches it against the
 * pattern "[A-Z0-9:]* [12]:[YN]:[0-9]*:.*".
 *
 * f: path of the FASTQ file to inspect.
 *
 * Returns 1 when the first line matches, 0 otherwise (including when no
 * line could be read).  Terminates the process with status 2 if the
 * regular expression cannot be compiled.
 */
int is_casava_1_8(char *f) {
  regex_t regex;
  int matched=0;

  int reti = regcomp(&regex,"[A-Z0-9:]* [12]:[YN]:[0-9]*:.*",0);
  if ( reti ) {
    fprintf(stderr, "Internal error: Could not compile regex\n");
    exit(2);
  }

  FILE *fd1=open_fastq(f);
  // NOTE(review): hdr is used after fclose(), so READ_LINE presumably
  // returns a buffer that is independent of the stream - confirm.
  char *hdr=READ_LINE(fd1);
  fclose(fd1);

  /* Execute regular expression (an empty file yields no header to test) */
  if ( hdr!=NULL ) {
    reti = regexec(&regex, hdr, 0, NULL, 0);
    if ( !reti ) {
      // match
      matched=1;
    }
  }

  regfree(&regex);
  return matched;
}
/*
 * Index every read of a FASTQ file by read name.
 *
 * Each iteration reads one 4-line entry (header, sequence, second header,
 * quality), checks it with validate_entry(), rejects read names already
 * present in `sn_index`, stores a new index entry (read name, length and
 * the entry's start offset in the file) and hands the entry to
 * replace_dots() together with the "fixed" output handle.
 *
 * filename:     FASTQ file to index.
 * sn_index:     hashtable that receives one entry per read.
 * start_offset: currently unused.
 * length:       must not be positive; partial indexing is not implemented
 *               and a positive value terminates with status 2.
 *
 * Every error is reported on stderr and terminates the process with
 * status 1; the function only returns after the whole file was indexed.
 */
void index_file(char *filename,hashtable sn_index,long start_offset,long length) {
  (void)start_offset;

  gzFile fd1=open_fastq(filename);
  if (fd1==NULL) {
    fprintf(stderr,"\nError: Unable to open %s\n",filename);
    exit(1);
  }
  // only create the fixed output once the input is known to be readable
  gzFile fdf=open_fixed_fastq(filename);

  // move to the right position
  if(length>0) {
    fprintf(stderr, "\nInternal error: Not implemented\n");
    exit(2);
  }

  long cline=1;
  // sn_index creation could be done in parallel
  while(!gzeof(fd1)) {
    long long start_pos=gztell(fd1);
    char *hdr=READ_LINE_HDR(fd1);
    if ( hdr==NULL) break;

    int len=0;
    // read the remaining three lines of the entry
    char *seq=READ_LINE_SEQ(fd1);
    char *hdr2=READ_LINE_HDR2(fd1);
    char *qual=READ_LINE_QUAL(fd1);
    char* readname=get_readname(hdr,&len,cline,filename);

    if (seq==NULL || hdr2==NULL || qual==NULL ) {
      fprintf(stderr,"\nError in file %s, line %ld: file truncated?\n",filename,cline);
      exit(1);
    }
    if (validate_entry(hdr,hdr2,seq,qual,cline,filename)!=0) {
      exit(1);
    }
    // check for duplicates
    if ( lookup_header(sn_index,readname)!=NULL ) {
      fprintf(stderr,"\nError in file %s, line %ld: duplicated sequence %s\n",filename,cline,readname);
      exit(1);
    }
    if ( new_indexentry(sn_index,readname,len,start_pos)==NULL) {
      fprintf(stderr,"\nError in file %s, line %ld: malloc failed?",filename,cline);
      exit(1);
    }
    replace_dots(start_pos,seq,hdr,hdr2,qual,fdf);
    PRINT_READS_PROCESSED(cline/4);
    // one FASTQ entry spans four lines
    cline+=4;
  }

  close_fixed_fastq(fdf);
  gzclose(fd1);
  return;
}
/*
 * Validate an interleaved paired-end FASTQ file.
 *
 * Entries are consumed two at a time (read 1 followed by read 2).  Both
 * entries are checked with validate_entry() and their read names, as
 * returned by get_readname(), must be identical.  Each entry is then
 * passed to replace_dots() together with the "fixed" output handle.
 *
 * f: path of the interleaved FASTQ file.
 *
 * Returns 1 on the first error (truncated pair, invalid entry or
 * mismatching read names; a message is written to stderr where this
 * function detects the problem itself).  Otherwise returns the number of
 * reads validated, which is incremented by two for every pair.
 * Both file handles are closed on every path.
 */
int validate_interleaved(char *f) {
  unsigned long cline=1;
  unsigned long nreads1=0;
  int failed=0;

  fprintf(stderr,"Paired-end interleaved\n");
  gzFile fd1=open_fastq(f);
  gzFile fdf=open_fixed_fastq(f);

  while(!gzeof(fd1)) {
    long start_pos=gztell(fd1);
    // Read 1
    char *hdr1=READ_LINE_HDR(fd1);
    if ( hdr1==NULL) break;
    int len=0;
    char *seq1=READ_LINE_SEQ(fd1);
    char *hdr1_2=READ_LINE_HDR2(fd1);
    char *qual1=READ_LINE_QUAL(fd1);
    // Read 2
    char *hdr2=READ_LINE_HDR2_1(fd1);
    char *seq2=READ_LINE_SEQ2(fd1);
    char *hdr2_2=READ_LINE_HDR2_2(fd1);
    char *qual2=READ_LINE_QUAL2(fd1);

    if ( seq1==NULL || hdr1_2==NULL || qual1==NULL ||
         hdr2==NULL || seq2==NULL || hdr2_2==NULL || qual2==NULL ) {
      fprintf(stderr,"\nError in file %s, line %lu: file truncated?\n",f,cline);
      failed=1;
      break;
    }
    if (validate_entry(hdr1,hdr1_2,seq1,qual1,cline,f)!=0 ||
        validate_entry(hdr2,hdr2_2,seq2,qual2,cline+4,f)!=0) {
      failed=1;
      break;
    }

    char* readname1=get_readname(hdr1,&len,cline,f);
    char* readname2=get_readname(hdr2,&len,cline+4,f);
    if ( strcmp(readname1,readname2) ) {
      fprintf(stderr,"\nError in file %s, line %lu: unpaired read - %s\n",f,cline,readname1);
      failed=1;
      break;
    }

    PRINT_READS_PROCESSED(cline/4);
    // NOTE(review): both mates are passed the offset of read 1 - confirm
    // that replace_dots() does not need the offset of read 2 here.
    replace_dots(start_pos,seq1,hdr1,hdr1_2,qual1,fdf);
    replace_dots(start_pos,seq2,hdr2,hdr2_2,qual2,fdf);
    // a pair spans eight lines
    cline+=8;
    nreads1+=2;
  }

  if ( !failed ) printf("\n");
  // release the handles on the error paths as well
  close_fixed_fastq(fdf);
  gzclose(fd1);
  if ( failed ) return(1);
  return(nreads1);
}
int main(int argc, char **argv ) { //printf("%d",sizeof(struct index_entry)); if (argc!=2) { fprintf(stderr,"Usage: fastq_filter_n fastq1\n"); exit(1); } FILE *fd1=open_fastq(argv[1]); // ************************************************************ unsigned long cline=1; unsigned long cur_read=0; //char tmp_buffer[MAX_READ_LENGTH]; // read the entry using another fd cline=1; while(!feof(fd1)) { char *hdr=READ_LINE(fd1,1); if ( hdr==NULL) break; if ( hdr[0]!='@' ) { fprintf(stderr,"line %lu: error in header %s",cline,hdr); return 1; } // char *seq=READ_LINE(fd1,2); READ_LINE(fd1,3); READ_LINE(fd1,4); short n_found=0; int k; for ( k=0;k<MAX_READ_LENGTH;k++) { if (seq[k]=='\n') break; if (seq[k]=='N' || seq[k]=='n' ) { n_found=1; break; } } if ( ! n_found ) WRITE_READ(stdout); cline+=4; cur_read++; } fclose(fd1); exit(0); }
int main(int argc, char **argv ) { long paired=0; //printf("%d",sizeof(struct index_entry)); if (argc!=6) { fprintf(stderr,"Usage: fastqinterleaved2pair.c fastq fastq1 fastq2\n"); //fprintf(stderr,"%d",argc); exit(1); } FILE *fd=open_fastq(argv[1]); // ************************************************************ off_t cur_offset=1; unsigned long cline=1; hashtable index=new_hashtable(HASHSIZE); index_mem+=sizeof(hashtable); index_file(argv[1],index,0,-1); printf("\n"); // print some info printf("Reads indexed: %ld\n",index->n_entries); printf("Memory used in indexing: %ld MB\n",index_mem/1024/1024); // char *p1=argv[3]; char *p2=argv[4]; char *p3=argv[5]; fd1=open_fastq(argv[1]); fd2=open_fastq(argv[2]); FILE *fdw1=fopen(p1,"w"); FILE *fdw2=fopen(p2,"w"); FILE *fdw3=fopen(p3,"w"); unsigned long up2=0; if ( fdw1==NULL || fdw2==NULL || fdw3==NULL ) { fprintf(stderr,"Unable to create output files\n"); exit(1); } // read the entry using another fd cline=1; while(!feof(fd2)) { long start_pos=ftell(fd2); char *hdr=READ_LINE(fd2); if ( hdr==NULL) break; if ( hdr[0]!='@' ) { fprintf(stderr,"line %ul: error in header %s",cline,hdr); return 1; } // discard @ (one less byte) hdr=&hdr[1]; int len=strlen(hdr); len--; hdr[len-1]='\0'; // // lookup hdr in index INDEX_ENTRY* e=lookup_header(index,hdr); if (e==NULL) { ++up2; copy_read(start_pos,fd2,fdw3); } else { long key=hashit(hdr); // pair found ++paired; copy_read(start_pos,fd2,fdw2); copy_read(e->entry_start,fd1,fdw1); // remove entry from index if (delete(index,key,e)!=e) { fprintf(stderr,"Unable to delete entry from index\n"); exit(1); } free_indexentry(e); } PRINT_READS_PROCESSED(cline/4); cline+=4; } printf("\n"); printf("Recording %ld unpaired reads from %s\n",index->n_entries,argv[1]);fflush(stdout); fclose(fd1); // record the unpaired from argv[1] fd1=open_fastq(argv[1]); #ifndef SEQDISKACCESS init_hash_traversal(index); INDEX_ENTRY* e; cline=1; while((e=(INDEX_ENTRY*)next_hash_object(index))!=NULL) { 
copy_read(e->entry_start,fd1,fdw3); PRINT_READS_PROCESSED(cline); ++cline; } // #else //sequential disk access // cline=1; unsigned long remaining=index->n_entries; while(!feof(fd1) && remaining ) { //long start_pos=ftell(fd2); char *hdr=READ_LINE(fd1); if ( hdr==NULL) break; if ( hdr[0]!='@' ) { fprintf(stderr,"line %ld %s: error in header %s",cline,argv[1],hdr); return 1; } // discard @ (one less byte) hdr=&hdr[1]; int len=strlen(hdr); len--; hdr[len-1]='\0'; // // lookup hdr in index INDEX_ENTRY* e=lookup_header(index,hdr); if (e!=NULL) { copy_read(e->entry_start,fd1,fdw3); remaining--; } else { READ_LINE(fd1);//seq READ_LINE(fd1);//qual READ_LINE(fd1);//qual } PRINT_READS_PROCESSED(cline/4); cline+=4; } fclose(fd1); #endif printf("\n"); printf("Unpaired from %s: %ld\n",argv[1],index->n_entries); printf("Unpaired from %s: %ld\n",argv[2],up2); printf("Paired: %ld\n",paired); /*fseek(fd2,start_pos,SEEK_SET); printf("%s",READ_LINE(fd2)); printf("%s",READ_LINE(fd2)); printf("%s",READ_LINE(fd2)); printf("%s",READ_LINE(fd2)); */ fclose(fdw1); fclose(fdw2); fclose(fdw3); if ( paired == 0 ) { fprintf(stderr,"!!!WARNING!!! 0 paired reads! are the headers ok?\n"); exit(1); } exit(0); }
int main(int argc, char **argv ) { //long paired=0; is_paired_data=0; is_interleaved=0; printf("Version iRAP %s\n",VERSION); if (argc<2 || argc>3) { fprintf(stderr,"Usage: fastq_validator fastq1 [fastq2 file|pe]\n"); //fprintf(stderr,"%d",argc); exit(1); } FILE *fd1=NULL; FILE *fd2=NULL; // open & close fd1=open_fastq(argv[1]); fclose(fd1); //fprintf(stderr,"%d\n",argc); //bin/fprintf(stderr,"%s\n",argv[0]); if (argc ==3) { is_paired_data=1; if ( !strncmp(argv[2],"pe",2) ) { is_interleaved=1; } else { fd2=open_fastq(argv[2]); fclose(fd2); } } // ************************************************************ // casava 1.8? is_casava_18=is_casava_1_8(argv[1]); if (is_casava_18) fprintf(stderr,"CASAVA=1.8\n"); // ************************************************************ //off_t cur_offset=1; // interleaved if ( is_interleaved ) { exit(validate_interleaved(argv[1])); } unsigned long cline=1; fprintf(stderr,"HASHSIZE=%lu\n",(long unsigned int)HASHSIZE); //memset(&collisions[0],0,HASHSIZE+1); hashtable sn_index=new_hashtable(HASHSIZE); index_mem+=sizeof(hashtable); index_file(argv[1],sn_index,0,-1); fprintf(stderr,"\n"); // print some info fprintf(stderr,"Reads processed: %ld\n",sn_index->n_entries); fprintf(stderr,"Memory used in indexing: ~%ld MB\n",index_mem/1024/1024); // pair-end if (argc ==3 ) { fprintf(stderr,"File %s processed\n",argv[1]); fprintf(stderr,"Next file %s\n",argv[2]); // validate the second file and check if all reads are paired fd2=open_fastq(argv[2]); INDEX_ENTRY* e; // read the entry using another fd cline=1; // TODO: improve code - mostly duplicated:( while(!feof(fd2)) { //long start_pos=ftell(fd2); char *hdr=READ_LINE_HDR(fd2); if ( hdr==NULL) break; int len; char *seq=READ_LINE_SEQ(fd2); char *hdr2=READ_LINE_HDR2(fd2); char *qual=READ_LINE_QUAL(fd2); char* readname=get_readname(hdr,&len,cline,argv[2]); if (seq==NULL || hdr2==NULL || qual==NULL ) { fprintf(stderr,"Error in file %s, line %lu: file truncated?\n",argv[2],cline); exit(1); } if 
(validate_entry(hdr,hdr2,seq,qual,cline,argv[2])!=0) { exit(1); } //fprintf(stderr,"Reads processed: %ld\n",sn_index->n_entries); // check for duplicates if ( (e=lookup_header(sn_index,readname))==NULL ) { fprintf(stderr,"Error in file %s, line %lu: unpaired read - %s\n",argv[2],cline,readname); exit(1); } else { ulong key=hashit(readname); // remove entry from sn_index if (delete(sn_index,key,e)!=e) { fprintf(stderr,"Error in file %s, line %lu: unable to delete entry from sn_index - %s\n",argv[2],cline,readname); exit(1); } free_indexentry(e); } PRINT_READS_PROCESSED(cline/4); // cline+=4; } printf("\n"); if (sn_index->n_entries>0 ) { fprintf(stderr,"Error in file %s: found %lu unpaired reads\n",argv[1],sn_index->n_entries); exit(1); } } printf("OK\n"); exit(0); }
int main(int argc, char **argv ) { //long paired=0; unsigned long num_reads1=0, num_reads2=0; is_paired_data=FALSE; is_interleaved=FALSE; fix_dot=FALSE; int nopt=0; int c; opterr = 0; fprintf(stderr,"Version iRAP %s\n",VERSION); while ((c = getopt (argc, argv, "f")) != -1) switch (c) { case 'f': fix_dot = TRUE; fprintf(stderr,"Fixing (-f) enabled: Replacing . by N (creating .fix.gz files)\n"); ++nopt; break; default: ++nopt; fprintf(stderr,"ERROR: Option -%c invalid\n",optopt); exit(1); } if (argc-nopt<2 || argc-nopt>3) { fprintf(stderr,"Usage: fastq_info [-f] fastq1 [fastq2 file|pe]\n"); //fprintf(stderr,"%d",argc); exit(1); } //gzFile fd1=NULL; gzFile fd2=NULL; if (argc-nopt ==3) { is_paired_data=TRUE; //fprintf(stderr,"%d %d %d %s\n",argc,nopt,argc-nopt,argv[2+nopt]); if ( !strncmp(argv[2+nopt],"pe",2) ) { is_interleaved=FALSE; } //else { // fd2=open_fastq(argv[2+nopt]); // gzclose(fd2); // } // ************************************************************ if ( is_interleaved ) { // interleaved num_reads1=validate_interleaved(argv[1+nopt]); } else { // single or pair of fastq file(s) unsigned long cline=1; fprintf(stderr,"HASHSIZE=%lu\n",(long unsigned int)HASHSIZE); //memset(&collisions[0],0,HASHSIZE+1); hashtable sn_index=new_hashtable(HASHSIZE); index_mem+=sizeof(hashtable); index_file(argv[1+nopt],sn_index,0,-1); num_reads1=sn_index->n_entries; fprintf(stderr,"\n"); // print some info fprintf(stderr,"Reads processed: %ld\n",sn_index->n_entries); fprintf(stderr,"Memory used in indexing: ~%ld MB\n",index_mem/1024/1024); // pair-end if (argc-nopt ==3 ) { fprintf(stderr,"File %s processed\n",argv[1+nopt]); fprintf(stderr,"Next file %s\n",argv[2+nopt]); // validate the second file and check if all reads are paired fd2=open_fastq(argv[2+nopt]); gzFile fdf=open_fixed_fastq(argv[2+nopt]); INDEX_ENTRY* e; // read the entry using another fd cline=1; // TODO: improve code - mostly duplicated:( while(!gzeof(fd2)) { long long start_pos=gztell(fd2); char 
*hdr=READ_LINE_HDR(fd2); if ( hdr==NULL) break; int len; char *seq=READ_LINE_SEQ(fd2); char *hdr2=READ_LINE_HDR2(fd2); char *qual=READ_LINE_QUAL(fd2); char* readname=get_readname(hdr,&len,cline,argv[2+nopt]); if (seq==NULL || hdr2==NULL || qual==NULL ) { fprintf(stderr,"\nError in file %s, line %lu: file truncated?\n",argv[2+nopt],cline); exit(1); } if (validate_entry(hdr,hdr2,seq,qual,cline,argv[2+nopt])!=0) { exit(1); } //fprintf(stderr,"Reads processed: %ld\n",sn_index->n_entries); // check for duplicates if ( (e=lookup_header(sn_index,readname))==NULL ) { fprintf(stderr,"\nError in file %s, line %lu: unpaired read - %s\n",argv[2+nopt],cline,readname); exit(1); } else { ulong key=hashit(readname); // remove entry from sn_index if (delete(sn_index,key,e)!=e) { fprintf(stderr,"\nError in file %s, line %lu: unable to delete entry from sn_index - %s\n",argv[2+nopt],cline,readname); exit(1); } free_indexentry(e); } PRINT_READS_PROCESSED(cline/4); ++num_reads2; // replace_dots(start_pos,seq,hdr,hdr2,qual,fdf); cline+=4; } printf("\n"); close_fixed_fastq(fdf); if (sn_index->n_entries>0 ) { fprintf(stderr,"\nError in file %s: found %lu unpaired reads\n",argv[1+nopt],sn_index->n_entries); exit(1); } } } FILE* out; if (fix_dot) { out=stderr; } else { out=stdout; } fprintf(out,"------------------------------------\n"); if ( num_reads2>0 ) { fprintf(out,"Number of reads: %lu %lu\n",num_reads1,num_reads2); } else { fprintf(out,"Number of reads: %lu\n",num_reads1); } fprintf(out,"Quality encoding range: %lu %lu\n",min_qual,max_qual); char *enc=qualRange2enc(min_qual,max_qual); if ( enc == NULL ) { fprintf(stderr,"\nERROR: Unable to determine quality encoding - unknown range [%lu,%lu]\n",min_qual,max_qual); exit(1); } fprintf(out,"Quality encoding: %s\n",qualRange2enc(min_qual,max_qual)); fprintf(out,"Read length: %lu %lu\n",min_rl,max_rl); fprintf(out,"OK\n"); exit(0); }