Пример #1
0
// check if the read name format was generated by casava 1.8
int is_casava_1_8(char *f) {
  regex_t regex;
  int reti;
  int is_casava_1_8=0;
  reti = regcomp(&regex,"[A-Z0-9:]* [12]:[YN]:[0-9]*:.*",0);  
  if ( reti ) { 
    fprintf(stderr, "Internal error: Could not compile regex\n"); 
    exit(2); 
  }
  FILE *fd1=open_fastq(f);
  char *hdr=READ_LINE(fd1);
  fclose(fd1);
  /* Execute regular expression */
  //fprintf(stderr,"%s\n",hdr);
  reti = regexec(&regex, hdr, 0, NULL, 0);
  if ( !reti ) {    // match
    is_casava_1_8=1;
  } 
  /* else{
    char msgbuf[100];
    regerror(reti, &regex, msgbuf, sizeof(msgbuf));
    //fprintf(stderr, "Regex match failed: %s\n", msgbuf);
    } */
  regfree(&regex);
  return is_casava_1_8;
}
Пример #2
0
void index_file(char *filename,hashtable sn_index,long start_offset,long length) {
  gzFile fd1=open_fastq(filename);  
  gzFile fdf=open_fixed_fastq(filename);  
  if (fd1==NULL) {
    fprintf(stderr,"\nError: Unable to open %s\n",filename);
    exit(1);
  }
  // move to the right position
  if(length>0) {
    fprintf(stderr, "\nInternal error: Not implemented\n");
    exit(2);
  }
  long cline=1;
  // sn_index creation could be done in parallel
  while(!gzeof(fd1)) {
    long long start_pos=gztell(fd1);
    char *hdr=READ_LINE_HDR(fd1);

    if ( hdr==NULL) break;
    int len;
    //fprintf(stderr,"sn_index: =%s=\n",readname);
    // get seq
    //printf("cline=%ld\nLEN=%ld  hdr=%s\n",cline,len,hdr);
    char *seq=READ_LINE_SEQ(fd1);
    char *hdr2=READ_LINE_HDR2(fd1);
    char *qual=READ_LINE_QUAL(fd1);
    char* readname=get_readname(hdr,&len,cline,filename);
    if (seq==NULL || hdr2==NULL || qual==NULL ) {
      fprintf(stderr,"\nError in file %s, line %lu: file truncated?\n",filename,cline);
      exit(1);
    }
    if (validate_entry(hdr,hdr2,seq,qual,cline,filename)!=0) {
      exit(1);
    }
    // check for duplicates
    if ( lookup_header(sn_index,readname)!=NULL ) {
      fprintf(stderr,"\nError in file %s, line %lu: duplicated sequence %s\n",filename,cline,readname);
      exit(1);
    }
    if ( new_indexentry(sn_index,readname,len,start_pos)==NULL) {
      fprintf(stderr,"\nError in file %s, line %lu: malloc failed?",filename,cline);
      exit(1);
    }
    replace_dots(start_pos,seq,hdr,hdr2,qual,fdf);    
    PRINT_READS_PROCESSED(cline/4);
    //
    cline+=4;
  }
  close_fixed_fastq(fdf);
  gzclose(fd1);
  return;
}
Пример #3
0
int validate_interleaved(char *f) {
  unsigned long cline=1;
  unsigned long nreads1=0;
  gzFile fd1=NULL;  
  fprintf(stderr,"Paired-end interleaved\n");
  fd1=open_fastq(f);
  gzFile fdf=open_fixed_fastq(f);  
  while(!gzeof(fd1)) {
    long start_pos=gztell(fd1);
    // Read 1
    char *hdr1=READ_LINE_HDR(fd1);
    if ( hdr1==NULL) break;
    int len;
    char *seq1=READ_LINE_SEQ(fd1);
    char *hdr1_2=READ_LINE_HDR2(fd1);
    char *qual1=READ_LINE_QUAL(fd1);
    // Read 2
    char *hdr2=READ_LINE_HDR2_1(fd1);
    char *seq2=READ_LINE_SEQ2(fd1);
    char *hdr2_2=READ_LINE_HDR2_2(fd1);
    char *qual2=READ_LINE_QUAL2(fd1);
    
    if ( seq1==NULL || hdr1_2==NULL || qual1==NULL ||
	 hdr2==NULL || seq2==NULL || hdr2_2==NULL || qual2==NULL ) {
      fprintf(stderr,"\nError in file %s, line %lu: file truncated?\n",f,cline);
      return(1);
    }
    if (validate_entry(hdr1,hdr1_2,seq1,qual1,cline,f)!=0) {
      return(1);
    }
    if (validate_entry(hdr2,hdr2_2,seq2,qual2,cline+4,f)!=0) {
      return(1);
    }
    char* readname1=get_readname(hdr1,&len,cline,f);
    char* readname2=get_readname(hdr2,&len,cline+4,f);
    if ( strcmp(readname1,readname2) ) {
      fprintf(stderr,"\nError in file %s, line %lu: unpaired read - %s\n",f,cline,readname1);
      return(1);
    } 
    PRINT_READS_PROCESSED(cline/4);
    replace_dots(start_pos,seq1,hdr1,hdr1_2,qual1,fdf);    
    replace_dots(start_pos,seq2,hdr2,hdr2_2,qual2,fdf);    
    //
    cline+=8;
    nreads1+=2;
  }
  printf("\n");
  close_fixed_fastq(fdf);
  gzclose(fd1);
  return(nreads1);
}
Пример #4
0
int main(int argc, char **argv ) {

  //printf("%d",sizeof(struct index_entry)); 
  
  if (argc!=2) {
    fprintf(stderr,"Usage: fastq_filter_n fastq1\n");
    exit(1);
  }
  FILE *fd1=open_fastq(argv[1]);
  // ************************************************************
  unsigned long cline=1;
  unsigned long cur_read=0;
  //char tmp_buffer[MAX_READ_LENGTH];
  // read the entry using another fd
  cline=1;
  while(!feof(fd1)) {
    char *hdr=READ_LINE(fd1,1);
    if ( hdr==NULL) break;
    if ( hdr[0]!='@' ) {
      fprintf(stderr,"line %lu: error in header %s",cline,hdr);
      return 1;
    }
    //
    char *seq=READ_LINE(fd1,2);
    READ_LINE(fd1,3);
    READ_LINE(fd1,4);
    
    short n_found=0;
    int k;
    for ( k=0;k<MAX_READ_LENGTH;k++) {
      if (seq[k]=='\n') break;
      if (seq[k]=='N' || seq[k]=='n' ) {
	n_found=1; break;
      }
    }
    if ( ! n_found ) 
      WRITE_READ(stdout);
    cline+=4;
    cur_read++;    
  }
  fclose(fd1);
  exit(0);
}
Пример #5
0
int main(int argc, char **argv ) {
  long paired=0;

  //printf("%d",sizeof(struct index_entry)); 
  
  if (argc!=6) {
    fprintf(stderr,"Usage: fastqinterleaved2pair.c fastq fastq1 fastq2\n");
    //fprintf(stderr,"%d",argc);
    exit(1);
  }

  FILE *fd=open_fastq(argv[1]);
  // ************************************************************
  off_t cur_offset=1;
  unsigned long cline=1;
  hashtable index=new_hashtable(HASHSIZE);
  index_mem+=sizeof(hashtable);

  index_file(argv[1],index,0,-1);
  printf("\n");
  // print some info
  printf("Reads indexed: %ld\n",index->n_entries);
  printf("Memory used in indexing: %ld MB\n",index_mem/1024/1024);  
  // 

  char *p1=argv[3];
  char *p2=argv[4];
  char *p3=argv[5];
  fd1=open_fastq(argv[1]);
  fd2=open_fastq(argv[2]);
  FILE *fdw1=fopen(p1,"w");
  FILE *fdw2=fopen(p2,"w");
  FILE *fdw3=fopen(p3,"w");
  unsigned long up2=0;

  if ( fdw1==NULL || fdw2==NULL || fdw3==NULL ) {
    fprintf(stderr,"Unable to create output files\n");
    exit(1);
  }
  
  // read the entry using another fd
  cline=1;
  while(!feof(fd2)) {
    long start_pos=ftell(fd2);
    char *hdr=READ_LINE(fd2);

    if ( hdr==NULL) break;
    if ( hdr[0]!='@' ) {
      fprintf(stderr,"line %ul: error in header %s",cline,hdr);
      return 1;
    }
    // discard @ (one less byte)
    hdr=&hdr[1];
    int len=strlen(hdr);
    len--;
    hdr[len-1]='\0'; //
    // lookup hdr in index
    INDEX_ENTRY* e=lookup_header(index,hdr);
    if (e==NULL) {
      ++up2;
      copy_read(start_pos,fd2,fdw3);
    } else {
      long key=hashit(hdr);
      // pair found
      ++paired;
      copy_read(start_pos,fd2,fdw2);
      copy_read(e->entry_start,fd1,fdw1);
      // remove entry from index
      if (delete(index,key,e)!=e) {
	fprintf(stderr,"Unable to delete entry from index\n");
	exit(1);
      }
      free_indexentry(e);
    }
    PRINT_READS_PROCESSED(cline/4);

    cline+=4;
  }
  printf("\n");
  printf("Recording %ld unpaired reads from %s\n",index->n_entries,argv[1]);fflush(stdout);
  fclose(fd1);


  // record the unpaired from argv[1]
  fd1=open_fastq(argv[1]); 
#ifndef SEQDISKACCESS
  init_hash_traversal(index);
  INDEX_ENTRY* e;
  cline=1;
  while((e=(INDEX_ENTRY*)next_hash_object(index))!=NULL) {
          copy_read(e->entry_start,fd1,fdw3);
	  PRINT_READS_PROCESSED(cline);
	  ++cline;
  }
  //
#else
  //sequential disk access
  //
  cline=1;
  unsigned long remaining=index->n_entries;
  while(!feof(fd1) && remaining ) {
    //long start_pos=ftell(fd2);
    char *hdr=READ_LINE(fd1);

    if ( hdr==NULL) break;
    if ( hdr[0]!='@' ) {
      fprintf(stderr,"line %ld %s: error in header %s",cline,argv[1],hdr);
      return 1;
    }
    // discard @ (one less byte)
    hdr=&hdr[1];
    int len=strlen(hdr);
    len--;
    hdr[len-1]='\0'; //

    // lookup hdr in index
    INDEX_ENTRY* e=lookup_header(index,hdr);
    if (e!=NULL) {
      copy_read(e->entry_start,fd1,fdw3);
      remaining--;
    } else {
      READ_LINE(fd1);//seq
      READ_LINE(fd1);//qual
      READ_LINE(fd1);//qual
    }
    PRINT_READS_PROCESSED(cline/4);
    cline+=4;
  }
  fclose(fd1);
#endif
  printf("\n");
  printf("Unpaired from %s: %ld\n",argv[1],index->n_entries);
  printf("Unpaired from %s: %ld\n",argv[2],up2);
  printf("Paired: %ld\n",paired);
  /*fseek(fd2,start_pos,SEEK_SET);
    printf("%s",READ_LINE(fd2));
    printf("%s",READ_LINE(fd2));
    printf("%s",READ_LINE(fd2));
    printf("%s",READ_LINE(fd2));
  */
  fclose(fdw1);
  fclose(fdw2);
  fclose(fdw3);
  if ( paired == 0 ) {
    fprintf(stderr,"!!!WARNING!!! 0 paired reads! are the headers ok?\n");
    exit(1);
  }
  exit(0);
}
Пример #6
0
int main(int argc, char **argv ) {
  //long paired=0;
  is_paired_data=0;
  is_interleaved=0;
  printf("Version iRAP %s\n",VERSION);
  if (argc<2 || argc>3) {
    fprintf(stderr,"Usage: fastq_validator fastq1 [fastq2 file|pe]\n");
    //fprintf(stderr,"%d",argc);
    exit(1);
  }

  FILE *fd1=NULL;
  FILE *fd2=NULL;
  // open & close
  fd1=open_fastq(argv[1]);
  fclose(fd1);
  //fprintf(stderr,"%d\n",argc);
  //bin/fprintf(stderr,"%s\n",argv[0]);
  if (argc ==3) {
    is_paired_data=1;
    if ( !strncmp(argv[2],"pe",2) ) {
      is_interleaved=1;
    } else  {
      fd2=open_fastq(argv[2]);
      fclose(fd2);
    }
  }
  // ************************************************************
  // casava 1.8?
  is_casava_18=is_casava_1_8(argv[1]);
  if (is_casava_18) fprintf(stderr,"CASAVA=1.8\n");
  // ************************************************************
  //off_t cur_offset=1;
  // interleaved
  if ( is_interleaved ) {
    exit(validate_interleaved(argv[1]));
  }
  unsigned long cline=1;
  fprintf(stderr,"HASHSIZE=%lu\n",(long unsigned int)HASHSIZE);
  //memset(&collisions[0],0,HASHSIZE+1);
  hashtable sn_index=new_hashtable(HASHSIZE);
  index_mem+=sizeof(hashtable);

  index_file(argv[1],sn_index,0,-1);
  fprintf(stderr,"\n");
  // print some info
  fprintf(stderr,"Reads processed: %ld\n",sn_index->n_entries);
  fprintf(stderr,"Memory used in indexing: ~%ld MB\n",index_mem/1024/1024);  
  // pair-end
  if (argc ==3 ) {
    fprintf(stderr,"File %s processed\n",argv[1]);  
    fprintf(stderr,"Next file %s\n",argv[2]);  
    // validate the second file and check if all reads are paired
    fd2=open_fastq(argv[2]);
    INDEX_ENTRY* e;
    // read the entry using another fd
    cline=1;
    // TODO: improve code - mostly duplicated:(
    while(!feof(fd2)) {
      //long start_pos=ftell(fd2);
      char *hdr=READ_LINE_HDR(fd2);
      if ( hdr==NULL) break;
      int len;
      char *seq=READ_LINE_SEQ(fd2);
      char *hdr2=READ_LINE_HDR2(fd2);
      char *qual=READ_LINE_QUAL(fd2);
      char* readname=get_readname(hdr,&len,cline,argv[2]);
      if (seq==NULL || hdr2==NULL || qual==NULL ) {
	fprintf(stderr,"Error in file %s, line %lu: file truncated?\n",argv[2],cline);
	exit(1);
      }
      if (validate_entry(hdr,hdr2,seq,qual,cline,argv[2])!=0) {
	exit(1);
      }
      //fprintf(stderr,"Reads processed: %ld\n",sn_index->n_entries);
      // check for duplicates
      if ( (e=lookup_header(sn_index,readname))==NULL ) {
	fprintf(stderr,"Error in file %s, line %lu: unpaired read - %s\n",argv[2],cline,readname);
	exit(1);
      } else {
	ulong key=hashit(readname);
	// remove entry from sn_index
	if (delete(sn_index,key,e)!=e) {
	  fprintf(stderr,"Error in file %s, line %lu: unable to delete entry from sn_index - %s\n",argv[2],cline,readname);
	  exit(1);
	}
	free_indexentry(e);
      }
      PRINT_READS_PROCESSED(cline/4);
      //
      cline+=4;
    }
    printf("\n");
    if (sn_index->n_entries>0 ) {
      fprintf(stderr,"Error in file %s: found %lu unpaired reads\n",argv[1],sn_index->n_entries);
      exit(1);
    }
  }
  printf("OK\n");  
  exit(0);
}
Пример #7
0
int main(int argc, char **argv ) {
  //long paired=0;
  unsigned long num_reads1=0,
    num_reads2=0;
  
  is_paired_data=FALSE;
  is_interleaved=FALSE;
  fix_dot=FALSE;
  
  int nopt=0;
  int c;
  opterr = 0;

  fprintf(stderr,"Version iRAP %s\n",VERSION);
  
  while ((c = getopt (argc, argv, "f")) != -1)
    switch (c)
      {
      case 'f':
        fix_dot = TRUE;
	fprintf(stderr,"Fixing (-f) enabled: Replacing . by N (creating .fix.gz files)\n");
	++nopt;
        break;
      default:
	++nopt;
        fprintf(stderr,"ERROR: Option -%c invalid\n",optopt);
	exit(1);
      }
  
  if (argc-nopt<2 || argc-nopt>3) {
    fprintf(stderr,"Usage: fastq_info [-f] fastq1 [fastq2 file|pe]\n");
    //fprintf(stderr,"%d",argc);
    exit(1);
  }

  //gzFile fd1=NULL;
  gzFile fd2=NULL;

  if (argc-nopt ==3) {
    is_paired_data=TRUE;
    //fprintf(stderr,"%d %d %d %s\n",argc,nopt,argc-nopt,argv[2+nopt]);
    if ( !strncmp(argv[2+nopt],"pe",2) ) {
      is_interleaved=FALSE;
    } 
    //else  {
    //  fd2=open_fastq(argv[2+nopt]);
    //  gzclose(fd2);
    //
  }

  // ************************************************************
  if ( is_interleaved ) {
    // interleaved    
    num_reads1=validate_interleaved(argv[1+nopt]);
  } else {
    // single or pair of fastq file(s)
    unsigned long cline=1;
    fprintf(stderr,"HASHSIZE=%lu\n",(long unsigned int)HASHSIZE);
    //memset(&collisions[0],0,HASHSIZE+1);
    hashtable sn_index=new_hashtable(HASHSIZE);
    index_mem+=sizeof(hashtable);
    
    index_file(argv[1+nopt],sn_index,0,-1);
    num_reads1=sn_index->n_entries;
    fprintf(stderr,"\n");
    // print some info
    fprintf(stderr,"Reads processed: %ld\n",sn_index->n_entries);    
    fprintf(stderr,"Memory used in indexing: ~%ld MB\n",index_mem/1024/1024);  
    // pair-end
    if (argc-nopt ==3 ) {
      fprintf(stderr,"File %s processed\n",argv[1+nopt]);  
      fprintf(stderr,"Next file %s\n",argv[2+nopt]);  
      // validate the second file and check if all reads are paired
      fd2=open_fastq(argv[2+nopt]);
      gzFile fdf=open_fixed_fastq(argv[2+nopt]);  
      INDEX_ENTRY* e;
      // read the entry using another fd
      cline=1;
      // TODO: improve code - mostly duplicated:(
      while(!gzeof(fd2)) {
	long long start_pos=gztell(fd2);
	char *hdr=READ_LINE_HDR(fd2);
	if ( hdr==NULL) break;
	int len;
	char *seq=READ_LINE_SEQ(fd2);
	char *hdr2=READ_LINE_HDR2(fd2);
	char *qual=READ_LINE_QUAL(fd2);
	char* readname=get_readname(hdr,&len,cline,argv[2+nopt]);
	if (seq==NULL || hdr2==NULL || qual==NULL ) {
	  fprintf(stderr,"\nError in file %s, line %lu: file truncated?\n",argv[2+nopt],cline);
	  exit(1);
	}
	if (validate_entry(hdr,hdr2,seq,qual,cline,argv[2+nopt])!=0) {
	  exit(1);
	}
	//fprintf(stderr,"Reads processed: %ld\n",sn_index->n_entries);
	// check for duplicates
	if ( (e=lookup_header(sn_index,readname))==NULL ) {
	  fprintf(stderr,"\nError in file %s, line %lu: unpaired read - %s\n",argv[2+nopt],cline,readname);
	  exit(1);
	} else {
	  ulong key=hashit(readname);
	  // remove entry from sn_index
	  if (delete(sn_index,key,e)!=e) {
	    fprintf(stderr,"\nError in file %s, line %lu: unable to delete entry from sn_index - %s\n",argv[2+nopt],cline,readname);
	    exit(1);
	  }
	  free_indexentry(e);
	}
	PRINT_READS_PROCESSED(cline/4);
	++num_reads2;
	//
	replace_dots(start_pos,seq,hdr,hdr2,qual,fdf);
	cline+=4;
      }
      printf("\n");
      close_fixed_fastq(fdf);
      if (sn_index->n_entries>0 ) {
	fprintf(stderr,"\nError in file %s: found %lu unpaired reads\n",argv[1+nopt],sn_index->n_entries);
	exit(1);
      }
    }
  }
  FILE* out;  
  if (fix_dot) {
    out=stderr;
  } else {
    out=stdout;
  }
  fprintf(out,"------------------------------------\n");
  if ( num_reads2>0 ) {
    fprintf(out,"Number of reads: %lu %lu\n",num_reads1,num_reads2);
  } else {
    fprintf(out,"Number of reads: %lu\n",num_reads1);
  }
  fprintf(out,"Quality encoding range: %lu %lu\n",min_qual,max_qual);
  char *enc=qualRange2enc(min_qual,max_qual);
  if ( enc == NULL ) {
    fprintf(stderr,"\nERROR: Unable to determine quality encoding - unknown range [%lu,%lu]\n",min_qual,max_qual);
    exit(1);    
  }
  fprintf(out,"Quality encoding: %s\n",qualRange2enc(min_qual,max_qual));
  fprintf(out,"Read length: %lu %lu\n",min_rl,max_rl);
  fprintf(out,"OK\n");  
  exit(0);
}