Ejemplo n.º 1
0
/* Fetch the header called `headername` from the mapping and interpret its
 * value as an integer. `default_value` is returned instead when the header
 * is missing or its value is not of integer type.
 */
INLINE static int lookup_integer_header(char *headername, int default_value)
{
  struct svalue *value = NULL;
  THREAD_SAFE_RUN(value = lookup_header(headername), "header lookup");
  /* Only a present, integer-typed value is usable. */
  if(value && value->type == PIKE_T_INT)
    return value->u.integer;
  return default_value;
}
Ejemplo n.º 2
0
/* Fetch the header called `headername` from the mapping and hand back its
 * character data. `default_value` is returned instead when the header is
 * missing or its value is not of string type.
 *
 * The pointer returned for a found header refers to the string held by the
 * header value itself; no copy is made here.
 */
INLINE static char *lookup_string_header(char *headername, char *default_value)
{
  struct svalue *value = NULL;
  THREAD_SAFE_RUN(value = lookup_header(headername), "header lookup");
  /* Only a present, string-typed value is usable. */
  if(value && value->type == PIKE_T_STRING)
    return value->u.string->str;
  return default_value;
}
Ejemplo n.º 3
0
/*
 * Index every read of the FASTQ file `filename` into `sn_index`.
 *
 * Records are read four lines at a time (header, sequence, second header,
 * quality). Each record is checked with validate_entry(), rejected when its
 * read name is already present in the index, and then stored in the index
 * together with the stream position at which the record starts. Every
 * record is finally handed to replace_dots() along with the handle returned
 * by open_fixed_fastq().
 *
 * filename     - path of the FASTQ file to index
 * sn_index     - hash table receiving one entry per read name
 * start_offset - currently unused
 * length       - must be <= 0; a positive value (partial indexing) is not
 *                implemented and ends the program with exit status 2
 *
 * The function does not return on error: an unreadable file, a truncated
 * record, a duplicated read name or a failed index insertion is reported on
 * stderr and followed by exit(1); a record rejected by validate_entry()
 * also leads to exit(1).
 */
void index_file(char *filename,hashtable sn_index,long start_offset,long length) {
  gzFile fd1=open_fastq(filename);
  // check the input first so that nothing else is opened for a bad file
  if (fd1==NULL) {
    fprintf(stderr,"\nError: Unable to open %s\n",filename);
    exit(1);
  }
  gzFile fdf=open_fixed_fastq(filename);
  // indexing only part of the file (start_offset/length) is not supported
  if(length>0) {
    fprintf(stderr, "\nInternal error: Not implemented\n");
    exit(2);
  }
  // line number of the header of the record being processed (1-based)
  long cline=1;
  // sn_index creation could be done in parallel
  while(!gzeof(fd1)) {
    // remember where this record starts before consuming any of it
    long long start_pos=gztell(fd1);
    char *hdr=READ_LINE_HDR(fd1);

    // no more records
    if ( hdr==NULL) break;
    int len;
    char *seq=READ_LINE_SEQ(fd1);
    char *hdr2=READ_LINE_HDR2(fd1);
    char *qual=READ_LINE_QUAL(fd1);
    // get_readname() also fills in len, which is stored in the index entry
    char* readname=get_readname(hdr,&len,cline,filename);
    if (seq==NULL || hdr2==NULL || qual==NULL ) {
      fprintf(stderr,"\nError in file %s, line %ld: file truncated?\n",filename,cline);
      exit(1);
    }
    if (validate_entry(hdr,hdr2,seq,qual,cline,filename)!=0) {
      exit(1);
    }
    // check for duplicates
    if ( lookup_header(sn_index,readname)!=NULL ) {
      fprintf(stderr,"\nError in file %s, line %ld: duplicated sequence %s\n",filename,cline,readname);
      exit(1);
    }
    if ( new_indexentry(sn_index,readname,len,start_pos)==NULL) {
      fprintf(stderr,"\nError in file %s, line %ld: malloc failed?",filename,cline);
      exit(1);
    }
    replace_dots(start_pos,seq,hdr,hdr2,qual,fdf);
    PRINT_READS_PROCESSED(cline/4);
    // each record spans four lines
    cline+=4;
  }
  close_fixed_fastq(fdf);
  gzclose(fd1);
  return;
}
Ejemplo n.º 4
0
/*
 * Split the reads of two FASTQ files into paired and unpaired sets.
 *
 * Command line (exactly five arguments):
 *   argv[1] - first FASTQ file; all of its reads are indexed by name
 *   argv[2] - second FASTQ file; scanned record by record
 *   argv[3] - output: reads of argv[1] whose name is also found in argv[2]
 *   argv[4] - output: reads of argv[2] whose name is also found in argv[1]
 *   argv[5] - output: reads of either input without a match in the other
 *
 * fd1 and fd2 are not declared in this function; presumably they are
 * file-scope FILE* handles -- TODO confirm.
 *
 * Exit status is 0 on success and 1 on any error or when no pair was found.
 */
int main(int argc, char **argv ) {
  long paired=0;

  if (argc!=6) {
    fprintf(stderr,"Usage: fastqinterleaved2pair.c fastq1 fastq2 paired1 paired2 unpaired\n");
    exit(1);
  }

  // open & close: the handle itself is not needed in this function
  FILE *fd=open_fastq(argv[1]);
  if (fd!=NULL) fclose(fd);
  // ************************************************************
  hashtable index=new_hashtable(HASHSIZE);
  index_mem+=sizeof(hashtable);

  index_file(argv[1],index,0,-1);
  printf("\n");
  // print some info
  printf("Reads indexed: %ld\n",index->n_entries);
  printf("Memory used in indexing: %ld MB\n",index_mem/1024/1024);

  char *p1=argv[3];
  char *p2=argv[4];
  char *p3=argv[5];
  fd1=open_fastq(argv[1]);
  fd2=open_fastq(argv[2]);
  FILE *fdw1=fopen(p1,"w");
  FILE *fdw2=fopen(p2,"w");
  FILE *fdw3=fopen(p3,"w");
  // number of reads of argv[2] without a match in the index
  unsigned long up2=0;

  if ( fdw1==NULL || fdw2==NULL || fdw3==NULL ) {
    fprintf(stderr,"Unable to create output files\n");
    exit(1);
  }

  // walk the second file and look every header up in the index
  unsigned long cline=1;
  while(!feof(fd2)) {
    long start_pos=ftell(fd2);
    char *hdr=READ_LINE(fd2);

    if ( hdr==NULL) break;
    if ( hdr[0]!='@' ) {
      fprintf(stderr,"line %lu: error in header %s",cline,hdr);
      return 1;
    }
    // discard @ (one less byte)
    hdr=&hdr[1];
    // Cut the last two characters of the line. Assuming READ_LINE keeps the
    // trailing newline, that is the newline plus the final character of the
    // header -- TODO confirm intent.
    int len=strlen(hdr);
    len--;
    hdr[len-1]='\0';
    // lookup hdr in index
    INDEX_ENTRY* e=lookup_header(index,hdr);
    if (e==NULL) {
      // no match in the first file
      ++up2;
      copy_read(start_pos,fd2,fdw3);
    } else {
      long key=hashit(hdr);
      // pair found
      ++paired;
      copy_read(start_pos,fd2,fdw2);
      copy_read(e->entry_start,fd1,fdw1);
      // remove entry from index so that only unmatched reads remain in it
      if (delete(index,key,e)!=e) {
        fprintf(stderr,"Unable to delete entry from index\n");
        exit(1);
      }
      free_indexentry(e);
    }
    PRINT_READS_PROCESSED(cline/4);

    cline+=4;
  }
  fclose(fd2);
  printf("\n");
  printf("Recording %ld unpaired reads from %s\n",index->n_entries,argv[1]);fflush(stdout);
  fclose(fd1);


  // record the unpaired from argv[1]: whatever is still in the index
  fd1=open_fastq(argv[1]);
#ifndef SEQDISKACCESS
  // visit the remaining index entries and copy each read by its offset
  init_hash_traversal(index);
  INDEX_ENTRY* e;
  cline=1;
  while((e=(INDEX_ENTRY*)next_hash_object(index))!=NULL) {
    copy_read(e->entry_start,fd1,fdw3);
    PRINT_READS_PROCESSED(cline);
    ++cline;
  }
#else
  // sequential disk access: re-read argv[1] from the start and copy the
  // records whose header is still in the index
  cline=1;
  unsigned long remaining=index->n_entries;
  while(!feof(fd1) && remaining ) {
    char *hdr=READ_LINE(fd1);

    if ( hdr==NULL) break;
    if ( hdr[0]!='@' ) {
      fprintf(stderr,"line %lu %s: error in header %s",cline,argv[1],hdr);
      return 1;
    }
    // discard @ (one less byte)
    hdr=&hdr[1];
    // same two-character cut as applied to the headers of argv[2]
    int len=strlen(hdr);
    len--;
    hdr[len-1]='\0';

    // lookup hdr in index
    INDEX_ENTRY* e=lookup_header(index,hdr);
    if (e!=NULL) {
      copy_read(e->entry_start,fd1,fdw3);
      remaining--;
    } else {
      // skip the remaining three lines of this record
      READ_LINE(fd1);
      READ_LINE(fd1);
      READ_LINE(fd1);
    }
    PRINT_READS_PROCESSED(cline/4);
    cline+=4;
  }
#endif
  fclose(fd1);
  printf("\n");
  printf("Unpaired from %s: %ld\n",argv[1],index->n_entries);
  printf("Unpaired from %s: %lu\n",argv[2],up2);
  printf("Paired: %ld\n",paired);
  fclose(fdw1);
  fclose(fdw2);
  fclose(fdw3);
  if ( paired == 0 ) {
    fprintf(stderr,"!!!WARNING!!! 0 paired reads! are the headers ok?\n");
    exit(1);
  }
  exit(0);
}
Ejemplo n.º 5
0
/*
 * Validate one FASTQ file, a pair of FASTQ files, or an interleaved file.
 *
 * Usage: fastq_validator fastq1 [fastq2 file|pe]
 *   argv[1] - FASTQ file to validate
 *   argv[2] - optional: either the literal word "pe", meaning argv[1] is an
 *             interleaved file, or the second FASTQ file of a pair
 *
 * For a pair, argv[1] is indexed by read name and every read of argv[2]
 * must be found in that index; reads left in the index afterwards are
 * reported as unpaired.
 *
 * Prints "OK" and exits with 0 when validation succeeds, exits with 1 on
 * error. In interleaved mode the exit status is whatever
 * validate_interleaved() returns.
 */
int main(int argc, char **argv ) {
  is_paired_data=0;
  is_interleaved=0;
  printf("Version iRAP %s\n",VERSION);
  if (argc<2 || argc>3) {
    fprintf(stderr,"Usage: fastq_validator fastq1 [fastq2 file|pe]\n");
    exit(1);
  }

  FILE *fd1=NULL;
  FILE *fd2=NULL;
  // open & close
  fd1=open_fastq(argv[1]);
  fclose(fd1);
  if (argc ==3) {
    is_paired_data=1;
    // only the exact word "pe" selects interleaved mode; a file whose name
    // merely starts with "pe" is a regular second file
    if ( strcmp(argv[2],"pe")==0 ) {
      is_interleaved=1;
    } else  {
      fd2=open_fastq(argv[2]);
      fclose(fd2);
    }
  }
  // ************************************************************
  // casava 1.8?
  is_casava_18=is_casava_1_8(argv[1]);
  if (is_casava_18) fprintf(stderr,"CASAVA=1.8\n");
  // ************************************************************
  // interleaved
  if ( is_interleaved ) {
    exit(validate_interleaved(argv[1]));
  }
  unsigned long cline=1;
  fprintf(stderr,"HASHSIZE=%lu\n",(long unsigned int)HASHSIZE);
  hashtable sn_index=new_hashtable(HASHSIZE);
  index_mem+=sizeof(hashtable);

  index_file(argv[1],sn_index,0,-1);
  fprintf(stderr,"\n");
  // print some info
  fprintf(stderr,"Reads processed: %ld\n",sn_index->n_entries);
  fprintf(stderr,"Memory used in indexing: ~%ld MB\n",index_mem/1024/1024);
  // pair-end
  if (argc ==3 ) {
    fprintf(stderr,"File %s processed\n",argv[1]);
    fprintf(stderr,"Next file %s\n",argv[2]);
    // validate the second file and check if all reads are paired
    fd2=open_fastq(argv[2]);
    INDEX_ENTRY* e;
    // read the entry using another fd
    cline=1;
    // TODO: improve code - mostly duplicated:(
    while(!feof(fd2)) {
      char *hdr=READ_LINE_HDR(fd2);
      if ( hdr==NULL) break;
      int len;
      char *seq=READ_LINE_SEQ(fd2);
      char *hdr2=READ_LINE_HDR2(fd2);
      char *qual=READ_LINE_QUAL(fd2);
      char* readname=get_readname(hdr,&len,cline,argv[2]);
      if (seq==NULL || hdr2==NULL || qual==NULL ) {
        fprintf(stderr,"Error in file %s, line %lu: file truncated?\n",argv[2],cline);
        exit(1);
      }
      if (validate_entry(hdr,hdr2,seq,qual,cline,argv[2])!=0) {
        exit(1);
      }
      // every read of the second file must have a mate in the index
      if ( (e=lookup_header(sn_index,readname))==NULL ) {
        fprintf(stderr,"Error in file %s, line %lu: unpaired read - %s\n",argv[2],cline,readname);
        exit(1);
      } else {
        ulong key=hashit(readname);
        // remove entry from sn_index so that leftovers are the unpaired reads
        if (delete(sn_index,key,e)!=e) {
          fprintf(stderr,"Error in file %s, line %lu: unable to delete entry from sn_index - %s\n",argv[2],cline,readname);
          exit(1);
        }
        free_indexentry(e);
      }
      PRINT_READS_PROCESSED(cline/4);
      // each record spans four lines
      cline+=4;
    }
    fclose(fd2);
    printf("\n");
    // anything still indexed had no mate in the second file
    if (sn_index->n_entries>0 ) {
      fprintf(stderr,"Error in file %s: found %lu unpaired reads\n",argv[1],sn_index->n_entries);
      exit(1);
    }
  }
  printf("OK\n");
  exit(0);
}
Ejemplo n.º 6
0
int main(int argc, char **argv ) {
  //long paired=0;
  unsigned long num_reads1=0,
    num_reads2=0;
  
  is_paired_data=FALSE;
  is_interleaved=FALSE;
  fix_dot=FALSE;
  
  int nopt=0;
  int c;
  opterr = 0;

  fprintf(stderr,"Version iRAP %s\n",VERSION);
  
  while ((c = getopt (argc, argv, "f")) != -1)
    switch (c)
      {
      case 'f':
        fix_dot = TRUE;
	fprintf(stderr,"Fixing (-f) enabled: Replacing . by N (creating .fix.gz files)\n");
	++nopt;
        break;
      default:
	++nopt;
        fprintf(stderr,"ERROR: Option -%c invalid\n",optopt);
	exit(1);
      }
  
  if (argc-nopt<2 || argc-nopt>3) {
    fprintf(stderr,"Usage: fastq_info [-f] fastq1 [fastq2 file|pe]\n");
    //fprintf(stderr,"%d",argc);
    exit(1);
  }

  //gzFile fd1=NULL;
  gzFile fd2=NULL;

  if (argc-nopt ==3) {
    is_paired_data=TRUE;
    //fprintf(stderr,"%d %d %d %s\n",argc,nopt,argc-nopt,argv[2+nopt]);
    if ( !strncmp(argv[2+nopt],"pe",2) ) {
      is_interleaved=FALSE;
    } 
    //else  {
    //  fd2=open_fastq(argv[2+nopt]);
    //  gzclose(fd2);
    //
  }

  // ************************************************************
  if ( is_interleaved ) {
    // interleaved    
    num_reads1=validate_interleaved(argv[1+nopt]);
  } else {
    // single or pair of fastq file(s)
    unsigned long cline=1;
    fprintf(stderr,"HASHSIZE=%lu\n",(long unsigned int)HASHSIZE);
    //memset(&collisions[0],0,HASHSIZE+1);
    hashtable sn_index=new_hashtable(HASHSIZE);
    index_mem+=sizeof(hashtable);
    
    index_file(argv[1+nopt],sn_index,0,-1);
    num_reads1=sn_index->n_entries;
    fprintf(stderr,"\n");
    // print some info
    fprintf(stderr,"Reads processed: %ld\n",sn_index->n_entries);    
    fprintf(stderr,"Memory used in indexing: ~%ld MB\n",index_mem/1024/1024);  
    // pair-end
    if (argc-nopt ==3 ) {
      fprintf(stderr,"File %s processed\n",argv[1+nopt]);  
      fprintf(stderr,"Next file %s\n",argv[2+nopt]);  
      // validate the second file and check if all reads are paired
      fd2=open_fastq(argv[2+nopt]);
      gzFile fdf=open_fixed_fastq(argv[2+nopt]);  
      INDEX_ENTRY* e;
      // read the entry using another fd
      cline=1;
      // TODO: improve code - mostly duplicated:(
      while(!gzeof(fd2)) {
	long long start_pos=gztell(fd2);
	char *hdr=READ_LINE_HDR(fd2);
	if ( hdr==NULL) break;
	int len;
	char *seq=READ_LINE_SEQ(fd2);
	char *hdr2=READ_LINE_HDR2(fd2);
	char *qual=READ_LINE_QUAL(fd2);
	char* readname=get_readname(hdr,&len,cline,argv[2+nopt]);
	if (seq==NULL || hdr2==NULL || qual==NULL ) {
	  fprintf(stderr,"\nError in file %s, line %lu: file truncated?\n",argv[2+nopt],cline);
	  exit(1);
	}
	if (validate_entry(hdr,hdr2,seq,qual,cline,argv[2+nopt])!=0) {
	  exit(1);
	}
	//fprintf(stderr,"Reads processed: %ld\n",sn_index->n_entries);
	// check for duplicates
	if ( (e=lookup_header(sn_index,readname))==NULL ) {
	  fprintf(stderr,"\nError in file %s, line %lu: unpaired read - %s\n",argv[2+nopt],cline,readname);
	  exit(1);
	} else {
	  ulong key=hashit(readname);
	  // remove entry from sn_index
	  if (delete(sn_index,key,e)!=e) {
	    fprintf(stderr,"\nError in file %s, line %lu: unable to delete entry from sn_index - %s\n",argv[2+nopt],cline,readname);
	    exit(1);
	  }
	  free_indexentry(e);
	}
	PRINT_READS_PROCESSED(cline/4);
	++num_reads2;
	//
	replace_dots(start_pos,seq,hdr,hdr2,qual,fdf);
	cline+=4;
      }
      printf("\n");
      close_fixed_fastq(fdf);
      if (sn_index->n_entries>0 ) {
	fprintf(stderr,"\nError in file %s: found %lu unpaired reads\n",argv[1+nopt],sn_index->n_entries);
	exit(1);
      }
    }
  }
  FILE* out;  
  if (fix_dot) {
    out=stderr;
  } else {
    out=stdout;
  }
  fprintf(out,"------------------------------------\n");
  if ( num_reads2>0 ) {
    fprintf(out,"Number of reads: %lu %lu\n",num_reads1,num_reads2);
  } else {
    fprintf(out,"Number of reads: %lu\n",num_reads1);
  }
  fprintf(out,"Quality encoding range: %lu %lu\n",min_qual,max_qual);
  char *enc=qualRange2enc(min_qual,max_qual);
  if ( enc == NULL ) {
    fprintf(stderr,"\nERROR: Unable to determine quality encoding - unknown range [%lu,%lu]\n",min_qual,max_qual);
    exit(1);    
  }
  fprintf(out,"Quality encoding: %s\n",qualRange2enc(min_qual,max_qual));
  fprintf(out,"Read length: %lu %lu\n",min_rl,max_rl);
  fprintf(out,"OK\n");  
  exit(0);
}