/*
 * Validate an interleaved paired-end FASTQ file, in which each pair occupies
 * eight consecutive lines (four for the first mate, four for the second).
 *
 * For every pair, both entries are checked with validate_entry(), the read
 * names returned by get_readname() must compare equal, and both entries are
 * handed to replace_dots() together with the fixed-output handle.
 *
 * f: path of the FASTQ file.
 *
 * Returns the number of reads processed (two per pair), or 1 after printing a
 * message to stderr when the file cannot be opened, is truncated, fails
 * validation or contains an unpaired read.  Reads are counted in pairs, so 1
 * is never a valid count.  The input and fixed-output handles are released on
 * every path, including the error paths.
 */
int validate_interleaved(char *f) {
  unsigned long cline=1;    /* 1-based line number of the current pair's first line */
  unsigned long nreads1=0;
  int status=1;             /* error code until the whole file has been validated */

  fprintf(stderr,"Paired-end interleaved\n");

  gzFile fd1=open_fastq(f);
  if (fd1==NULL) {
    /* same diagnostic index_file() prints for an unreadable input */
    fprintf(stderr,"\nError: Unable to open %s\n",f);
    return(1);
  }
  gzFile fdf=open_fixed_fastq(f);

  while (!gzeof(fd1)) {
    long start_pos=gztell(fd1);

    // Read 1
    char *hdr1=READ_LINE_HDR(fd1);
    if (hdr1==NULL) break;   /* no further record: normal end of input */
    int len;
    char *seq1=READ_LINE_SEQ(fd1);
    char *hdr1_2=READ_LINE_HDR2(fd1);
    char *qual1=READ_LINE_QUAL(fd1);

    // Read 2
    char *hdr2=READ_LINE_HDR2_1(fd1);
    char *seq2=READ_LINE_SEQ2(fd1);
    char *hdr2_2=READ_LINE_HDR2_2(fd1);
    char *qual2=READ_LINE_QUAL2(fd1);

    /* a first header without the remaining seven lines is an incomplete pair */
    if (seq1==NULL || hdr1_2==NULL || qual1==NULL ||
        hdr2==NULL || seq2==NULL || hdr2_2==NULL || qual2==NULL) {
      fprintf(stderr,"\nError in file %s, line %lu: file truncated?\n",f,cline);
      goto cleanup;
    }

    if (validate_entry(hdr1,hdr1_2,seq1,qual1,cline,f)!=0) {
      goto cleanup;
    }
    if (validate_entry(hdr2,hdr2_2,seq2,qual2,cline+4,f)!=0) {
      goto cleanup;
    }

    char *readname1=get_readname(hdr1,&len,cline,f);
    char *readname2=get_readname(hdr2,&len,cline+4,f);
    if (strcmp(readname1,readname2)!=0) {
      fprintf(stderr,"\nError in file %s, line %lu: unpaired read - %s\n",f,cline,readname1);
      goto cleanup;
    }

    PRINT_READS_PROCESSED(cline/4);

    /* NOTE(review): both mates are passed the offset of the first mate --
       confirm this is what replace_dots() expects */
    replace_dots(start_pos,seq1,hdr1,hdr1_2,qual1,fdf);
    replace_dots(start_pos,seq2,hdr2,hdr2_2,qual2,fdf);

    cline+=8;
    nreads1+=2;
  }

  printf("\n");
  status=(int)nreads1;

cleanup:
  close_fixed_fastq(fdf);
  gzclose(fd1);
  return(status);
}
/*
 * Read every entry of a FASTQ file, validate it and register its read name in
 * sn_index.
 *
 * filename:     path of the FASTQ file.
 * sn_index:     hashtable receiving one entry per read (name, the length
 *               reported by get_readname(), and the gztell() offset at which
 *               the entry starts).
 * start_offset: currently unused.
 * length:       must not be positive; reading a sub-range is not implemented
 *               and the program exits with status 2 if it is requested.
 *
 * Any failure (unreadable file, truncated entry, validation error, duplicated
 * read name, failed index insertion) prints a message to stderr and exits
 * with status 1.  Each entry is also handed to replace_dots() together with
 * the fixed-output handle.
 */
void index_file(char *filename,hashtable sn_index,long start_offset,long length) {
  (void)start_offset;

  gzFile fd1=open_fastq(filename);
  if (fd1==NULL) {
    fprintf(stderr,"\nError: Unable to open %s\n",filename);
    exit(1);
  }

  // move to the right position
  if (length>0) {
    fprintf(stderr, "\nInternal error: Not implemented\n");
    exit(2);
  }

  /* opened only once the input is known to be usable, so the early exits
     above do not leave a fixed-output file behind */
  gzFile fdf=open_fixed_fastq(filename);

  /* unsigned long matches the %lu conversions used in the messages below */
  unsigned long cline=1;

  // sn_index creation could be done in parallel
  while (!gzeof(fd1)) {
    long long start_pos=gztell(fd1);

    char *hdr=READ_LINE_HDR(fd1);
    if (hdr==NULL) break;   /* no further record: normal end of input */
    int len;

    // get seq
    char *seq=READ_LINE_SEQ(fd1);
    char *hdr2=READ_LINE_HDR2(fd1);
    char *qual=READ_LINE_QUAL(fd1);
    char *readname=get_readname(hdr,&len,cline,filename);

    if (seq==NULL || hdr2==NULL || qual==NULL) {
      fprintf(stderr,"\nError in file %s, line %lu: file truncated?\n",filename,cline);
      exit(1);
    }
    if (validate_entry(hdr,hdr2,seq,qual,cline,filename)!=0) {
      exit(1);
    }

    // check for duplicates
    if (lookup_header(sn_index,readname)!=NULL) {
      fprintf(stderr,"\nError in file %s, line %lu: duplicated sequence %s\n",filename,cline,readname);
      exit(1);
    }
    if (new_indexentry(sn_index,readname,len,start_pos)==NULL) {
      fprintf(stderr,"\nError in file %s, line %lu: malloc failed?",filename,cline);
      exit(1);
    }

    replace_dots(start_pos,seq,hdr,hdr2,qual,fdf);
    PRINT_READS_PROCESSED(cline/4);

    cline+=4;
  }

  close_fixed_fastq(fdf);
  gzclose(fd1);
  return;
}
/*
 * Generate a C header/source pair that embeds files as string constants.
 *
 * Usage: <prog> <out.h> <out.c> [resource files...]
 *
 * For every resource file the header gets an
 *   extern const char *resource_text_<name>;
 * declaration and the source gets the matching definition, with every byte of
 * the file written as a \xNN escape.  <name> is the file's basename after it
 * has been passed through replace_dots() (defined elsewhere; presumably it
 * makes the name usable as an identifier -- confirm).
 *
 * Returns 0 on success, 1 when fewer than two output paths are given, 2 when
 * an output file cannot be opened, and 3 when a resource cannot be opened or
 * read, memory runs out, or an output file cannot be completely written.
 */
int main(int argc, char *argv[]) {
    if (argc < 3) {
        printf("not enough arguments\n");
        return 1;
    }

    FILE *fp_h = fopen(argv[1], "w");
    if (!fp_h) {
        printf("can't open %s\n", argv[1]);
        return 2;
    }

    FILE *fp_c = fopen(argv[2], "w");
    if (!fp_c) {
        fclose(fp_h);
        printf("can't open %s\n", argv[2]);
        return 2;
    }

    fprintf(fp_h, "// generated file, all changes will be lost\n"
                  "#ifndef FPP_GENERATED_TXT_RESOURCES_H\n"
                  "#define FPP_GENERATED_TXT_RESOURCES_H\n"
                  "\n"
                  "\n");

    for (int k = 3; k < argc; k++) {
        /* basename() may modify its argument, so work on a private copy */
        char *fname = strdup(argv[k]);
        if (!fname) {
            printf("out of memory\n");
            fclose(fp_h);
            goto err_1;
        }
        char *bname = basename(fname);
        replace_dots(bname);
        fprintf(fp_h, "extern const char *resource_text_%s;\n", bname);
        free(fname);
    }

    fprintf(fp_h, "\n\n#endif\n");
    /* fclose() flushes; a failure here means the header is incomplete */
    if (fclose(fp_h) != 0) {
        printf("can't write %s\n", argv[1]);
        goto err_1;
    }

    fprintf(fp_c, "// generated file, all changes will be lost\n"
                  "#include \"%s\"\n"
                  "\n", argv[1]);

    for (int k = 3; k < argc; k++) {
        unsigned char buf[4096];

        char *fname = strdup(argv[k]);
        if (!fname) {
            printf("out of memory\n");
            goto err_1;
        }
        char *bname = basename(fname);
        replace_dots(bname);
        fprintf(fp_c, "const char *resource_text_%s = \"", bname);

        FILE *tmp = fopen(argv[k], "rb");
        if (!tmp) {
            printf("can't open %s\n", argv[k]);
            free(fname);
            goto err_1;
        }

        /* Loop on fread()'s result rather than feof(): after a read error
           fread() returns 0 but feof() stays false, which would spin forever. */
        size_t read_bytes;
        while ((read_bytes = fread(buf, 1, sizeof(buf), tmp)) > 0) {
            for (size_t j = 0; j < read_bytes; j++)
                fprintf(fp_c, "\\x%02x", buf[j]);
        }
        if (ferror(tmp)) {
            printf("can't read %s\n", argv[k]);
            fclose(tmp);
            free(fname);
            goto err_1;
        }

        fclose(tmp);
        fprintf(fp_c, "\";\n");
        free(fname);
    }

    if (fclose(fp_c) != 0) {
        printf("can't write %s\n", argv[2]);
        return 3;
    }
    return 0;

err_1:
    fclose(fp_c);
    return 3;
}
int main(int argc, char **argv ) { //long paired=0; unsigned long num_reads1=0, num_reads2=0; is_paired_data=FALSE; is_interleaved=FALSE; fix_dot=FALSE; int nopt=0; int c; opterr = 0; fprintf(stderr,"Version iRAP %s\n",VERSION); while ((c = getopt (argc, argv, "f")) != -1) switch (c) { case 'f': fix_dot = TRUE; fprintf(stderr,"Fixing (-f) enabled: Replacing . by N (creating .fix.gz files)\n"); ++nopt; break; default: ++nopt; fprintf(stderr,"ERROR: Option -%c invalid\n",optopt); exit(1); } if (argc-nopt<2 || argc-nopt>3) { fprintf(stderr,"Usage: fastq_info [-f] fastq1 [fastq2 file|pe]\n"); //fprintf(stderr,"%d",argc); exit(1); } //gzFile fd1=NULL; gzFile fd2=NULL; if (argc-nopt ==3) { is_paired_data=TRUE; //fprintf(stderr,"%d %d %d %s\n",argc,nopt,argc-nopt,argv[2+nopt]); if ( !strncmp(argv[2+nopt],"pe",2) ) { is_interleaved=FALSE; } //else { // fd2=open_fastq(argv[2+nopt]); // gzclose(fd2); // } // ************************************************************ if ( is_interleaved ) { // interleaved num_reads1=validate_interleaved(argv[1+nopt]); } else { // single or pair of fastq file(s) unsigned long cline=1; fprintf(stderr,"HASHSIZE=%lu\n",(long unsigned int)HASHSIZE); //memset(&collisions[0],0,HASHSIZE+1); hashtable sn_index=new_hashtable(HASHSIZE); index_mem+=sizeof(hashtable); index_file(argv[1+nopt],sn_index,0,-1); num_reads1=sn_index->n_entries; fprintf(stderr,"\n"); // print some info fprintf(stderr,"Reads processed: %ld\n",sn_index->n_entries); fprintf(stderr,"Memory used in indexing: ~%ld MB\n",index_mem/1024/1024); // pair-end if (argc-nopt ==3 ) { fprintf(stderr,"File %s processed\n",argv[1+nopt]); fprintf(stderr,"Next file %s\n",argv[2+nopt]); // validate the second file and check if all reads are paired fd2=open_fastq(argv[2+nopt]); gzFile fdf=open_fixed_fastq(argv[2+nopt]); INDEX_ENTRY* e; // read the entry using another fd cline=1; // TODO: improve code - mostly duplicated:( while(!gzeof(fd2)) { long long start_pos=gztell(fd2); char 
*hdr=READ_LINE_HDR(fd2); if ( hdr==NULL) break; int len; char *seq=READ_LINE_SEQ(fd2); char *hdr2=READ_LINE_HDR2(fd2); char *qual=READ_LINE_QUAL(fd2); char* readname=get_readname(hdr,&len,cline,argv[2+nopt]); if (seq==NULL || hdr2==NULL || qual==NULL ) { fprintf(stderr,"\nError in file %s, line %lu: file truncated?\n",argv[2+nopt],cline); exit(1); } if (validate_entry(hdr,hdr2,seq,qual,cline,argv[2+nopt])!=0) { exit(1); } //fprintf(stderr,"Reads processed: %ld\n",sn_index->n_entries); // check for duplicates if ( (e=lookup_header(sn_index,readname))==NULL ) { fprintf(stderr,"\nError in file %s, line %lu: unpaired read - %s\n",argv[2+nopt],cline,readname); exit(1); } else { ulong key=hashit(readname); // remove entry from sn_index if (delete(sn_index,key,e)!=e) { fprintf(stderr,"\nError in file %s, line %lu: unable to delete entry from sn_index - %s\n",argv[2+nopt],cline,readname); exit(1); } free_indexentry(e); } PRINT_READS_PROCESSED(cline/4); ++num_reads2; // replace_dots(start_pos,seq,hdr,hdr2,qual,fdf); cline+=4; } printf("\n"); close_fixed_fastq(fdf); if (sn_index->n_entries>0 ) { fprintf(stderr,"\nError in file %s: found %lu unpaired reads\n",argv[1+nopt],sn_index->n_entries); exit(1); } } } FILE* out; if (fix_dot) { out=stderr; } else { out=stdout; } fprintf(out,"------------------------------------\n"); if ( num_reads2>0 ) { fprintf(out,"Number of reads: %lu %lu\n",num_reads1,num_reads2); } else { fprintf(out,"Number of reads: %lu\n",num_reads1); } fprintf(out,"Quality encoding range: %lu %lu\n",min_qual,max_qual); char *enc=qualRange2enc(min_qual,max_qual); if ( enc == NULL ) { fprintf(stderr,"\nERROR: Unable to determine quality encoding - unknown range [%lu,%lu]\n",min_qual,max_qual); exit(1); } fprintf(out,"Quality encoding: %s\n",qualRange2enc(min_qual,max_qual)); fprintf(out,"Read length: %lu %lu\n",min_rl,max_rl); fprintf(out,"OK\n"); exit(0); }