示例#1
0
文件: fastq_info.c 项目: hjanime/irap
/*
 * Validate an interleaved paired-end FASTQ file.
 *
 * Records are consumed two at a time (a read followed by its mate). Each
 * record is checked with validate_entry() and the read names of the two
 * mates, as returned by get_readname(), must be identical. Every accepted
 * pair is handed to replace_dots() together with the handle obtained from
 * open_fixed_fastq().
 *
 * f: path of the FASTQ file.
 *
 * Returns the number of reads processed (two per pair, hence always even),
 * or 1 when the file cannot be opened or a validation error is found; the
 * error is reported on stderr. Both file handles are closed on every path.
 */
int validate_interleaved(char *f) {
  unsigned long cline=1;    // line number of the first line of the current pair
  unsigned long nreads1=0;
  int failed=0;
  gzFile fd1=NULL;
  fprintf(stderr,"Paired-end interleaved\n");
  fd1=open_fastq(f);
  if (fd1==NULL) {
    // same check and message as index_file()
    fprintf(stderr,"\nError: Unable to open %s\n",f);
    return(1);
  }
  gzFile fdf=open_fixed_fastq(f);
  while(!gzeof(fd1)) {
    long long start_pos=gztell(fd1);
    // Read 1
    char *hdr1=READ_LINE_HDR(fd1);
    if ( hdr1==NULL) break;
    int len;
    char *seq1=READ_LINE_SEQ(fd1);
    char *hdr1_2=READ_LINE_HDR2(fd1);
    char *qual1=READ_LINE_QUAL(fd1);
    // Read 2
    char *hdr2=READ_LINE_HDR2_1(fd1);
    char *seq2=READ_LINE_SEQ2(fd1);
    char *hdr2_2=READ_LINE_HDR2_2(fd1);
    char *qual2=READ_LINE_QUAL2(fd1);

    if ( seq1==NULL || hdr1_2==NULL || qual1==NULL ||
         hdr2==NULL || seq2==NULL || hdr2_2==NULL || qual2==NULL ) {
      fprintf(stderr,"\nError in file %s, line %lu: file truncated?\n",f,cline);
      failed=1;
      break;
    }
    // the mate starts four lines after the first read of the pair
    if (validate_entry(hdr1,hdr1_2,seq1,qual1,cline,f)!=0 ||
        validate_entry(hdr2,hdr2_2,seq2,qual2,cline+4,f)!=0) {
      failed=1;
      break;
    }
    char* readname1=get_readname(hdr1,&len,cline,f);
    char* readname2=get_readname(hdr2,&len,cline+4,f);
    if ( strcmp(readname1,readname2) ) {
      fprintf(stderr,"\nError in file %s, line %lu: unpaired read - %s\n",f,cline,readname1);
      failed=1;
      break;
    }
    PRINT_READS_PROCESSED(cline/4);
    // NOTE(review): start_pos is the offset of the first mate and is passed
    // for both records, as before - confirm replace_dots() does not need the
    // second record's own offset.
    replace_dots(start_pos,seq1,hdr1,hdr1_2,qual1,fdf);
    replace_dots(start_pos,seq2,hdr2,hdr2_2,qual2,fdf);
    //
    cline+=8;
    nreads1+=2;
  }
  // the trailing newline is only part of the successful output
  if (!failed) {
    printf("\n");
  }
  // release both handles on success and on error
  close_fixed_fastq(fdf);
  gzclose(fd1);
  if (failed) {
    return(1);
  }
  return(nreads1);
}
示例#2
0
文件: fastq_info.c 项目: hjanime/irap
/*
 * Validate a FASTQ file and index each read by name.
 *
 * Every four-line record is checked with validate_entry(); its read name
 * (from get_readname()) must not already be present in sn_index and is then
 * added with new_indexentry() together with the name length and the offset
 * of the record reported by gztell(). Each accepted record is also passed to
 * replace_dots() with the handle obtained from open_fixed_fastq().
 *
 * filename:     path of the FASTQ file.
 * sn_index:     index that receives one entry per read.
 * start_offset: not used by this function.
 * length:       must not be positive; partial indexing is not implemented.
 *
 * The function does not return on failure: it prints a message on stderr
 * and exits with status 1 (unreadable or invalid input, duplicated read,
 * failed index insertion) or 2 (length > 0).
 */
void index_file(char *filename,hashtable sn_index,long start_offset,long length) {
  gzFile fd1=open_fastq(filename);
  if (fd1==NULL) {
    fprintf(stderr,"\nError: Unable to open %s\n",filename);
    exit(1);
  }
  // move to the right position
  if(length>0) {
    fprintf(stderr, "\nInternal error: Not implemented\n");
    exit(2);
  }
  // opened only once the input and the arguments are known to be usable, so
  // a run that aborts above does not touch the fixed output
  gzFile fdf=open_fixed_fastq(filename);
  // unsigned long to match the %lu conversions below
  unsigned long cline=1;
  // sn_index creation could be done in parallel
  while(!gzeof(fd1)) {
    long long start_pos=gztell(fd1);
    char *hdr=READ_LINE_HDR(fd1);

    if ( hdr==NULL) break;
    int len;
    char *seq=READ_LINE_SEQ(fd1);
    char *hdr2=READ_LINE_HDR2(fd1);
    char *qual=READ_LINE_QUAL(fd1);
    char* readname=get_readname(hdr,&len,cline,filename);
    if (seq==NULL || hdr2==NULL || qual==NULL ) {
      fprintf(stderr,"\nError in file %s, line %lu: file truncated?\n",filename,cline);
      exit(1);
    }
    if (validate_entry(hdr,hdr2,seq,qual,cline,filename)!=0) {
      exit(1);
    }
    // check for duplicates
    if ( lookup_header(sn_index,readname)!=NULL ) {
      fprintf(stderr,"\nError in file %s, line %lu: duplicated sequence %s\n",filename,cline,readname);
      exit(1);
    }
    if ( new_indexentry(sn_index,readname,len,start_pos)==NULL) {
      fprintf(stderr,"\nError in file %s, line %lu: malloc failed?",filename,cline);
      exit(1);
    }
    replace_dots(start_pos,seq,hdr,hdr2,qual,fdf);
    PRINT_READS_PROCESSED(cline/4);
    // one FASTQ record spans four lines
    cline+=4;
  }
  close_fixed_fastq(fdf);
  gzclose(fd1);
  return;
}
/*
 * Generate a C header/source pair that embeds files as string constants.
 *
 * argv[1]    path of the header to write
 * argv[2]    path of the C source to write
 * argv[3..]  input files; for each one the basename is passed through
 *            replace_dots() and used as the suffix of a
 *            `resource_text_<name>` symbol
 *
 * The header declares every symbol as `extern const char *`; the source
 * defines it as a string literal in which each input byte is written as a
 * `\xNN` escape.
 *
 * Returns 0 on success, 1 when fewer than two output paths are given, 2 when
 * an output file cannot be opened, 3 when an input file cannot be opened or
 * read, or when duplicating a file name fails.
 */
int
main(int argc, char *argv[])
{
    if (argc < 3) {
        printf("not enough arguments\n");
        return 1;
    }

    FILE *fp_h = fopen(argv[1], "w");
    if (!fp_h) {
        printf("can't open %s\n", argv[1]);
        return 2;
    }

    FILE *fp_c = fopen(argv[2], "w");
    if (!fp_c) {
        fclose(fp_h);
        printf("can't open %s\n", argv[2]);
        return 2;
    }

    fprintf(fp_h,
        "// generated file, all changes will be lost\n"
        "#ifndef FPP_GENERATED_TXT_RESOURCES_H\n"
        "#define FPP_GENERATED_TXT_RESOURCES_H\n"
        "\n"
        "\n");
    for (int k = 3; k < argc; k ++) {
        // work on a private copy: basename() and replace_dots() get a
        // writable buffer instead of argv itself
        char *fname = strdup(argv[k]);
        if (!fname) {
            printf("out of memory\n");
            fclose(fp_h);
            goto err_1;
        }
        char *bname = basename(fname);

        replace_dots(bname);
        fprintf(fp_h, "extern const char *resource_text_%s;\n", bname);
        free(fname);
    }
    fprintf(fp_h, "\n\n#endif\n");
    fclose(fp_h);

    fprintf(fp_c,
        "// generated file, all changes will be lost\n"
        "#include \"%s\"\n"
        "\n", argv[1]);

    for (int k = 3; k < argc; k ++) {
        unsigned char buf[4096];
        char *fname = strdup(argv[k]);
        if (!fname) {
            printf("out of memory\n");
            goto err_1;
        }
        char *bname = basename(fname);

        replace_dots(bname);
        fprintf(fp_c, "const char *resource_text_%s = \"", bname);
        // the symbol name is no longer needed once it has been written
        free(fname);

        FILE *tmp = fopen(argv[k], "rb");
        if (!tmp) {
            printf("can't open %s\n", argv[k]);
            goto err_1;
        }

        // Loop on fread()'s result rather than on feof(): after a read error
        // feof() stays false and the old loop would spin forever.
        size_t read_bytes;
        while ((read_bytes = fread(buf, 1, sizeof(buf), tmp)) > 0) {
            for (size_t j = 0; j < read_bytes; j ++)
                fprintf(fp_c, "\\x%02x", buf[j]);
        }
        int read_failed = ferror(tmp);
        fclose(tmp);
        if (read_failed) {
            printf("can't read %s\n", argv[k]);
            goto err_1;
        }
        fprintf(fp_c, "\";\n");
    }

    fclose(fp_c);
    return 0;

err_1:
    fclose(fp_c);
    return 3;
}
示例#4
0
文件: fastq_info.c 项目: hjanime/irap
/*
 * Entry point of fastq_info.
 *
 * Usage: fastq_info [-f] fastq1 [fastq2 file|pe]
 *
 *   -f      enable fixing (sets fix_dot)
 *   fastq1  FASTQ file to validate
 *   fastq2  second file of a paired-end library: every read must have a mate
 *           with the same read name in fastq1
 *   pe      fastq1 is an interleaved paired-end file
 *
 * On success a summary (number of reads, quality range and encoding, read
 * length range) is printed to stdout, or to stderr when -f is given, and the
 * process exits with status 0. Any validation error exits with status 1.
 */
int main(int argc, char **argv ) {
  unsigned long num_reads1=0,
    num_reads2=0;

  is_paired_data=FALSE;
  is_interleaved=FALSE;
  fix_dot=FALSE;

  int nopt=0;
  int c;
  opterr = 0;

  fprintf(stderr,"Version iRAP %s\n",VERSION);

  while ((c = getopt (argc, argv, "f")) != -1)
    switch (c)
      {
      case 'f':
        fix_dot = TRUE;
        fprintf(stderr,"Fixing (-f) enabled: Replacing . by N (creating .fix.gz files)\n");
        ++nopt;
        break;
      default:
        ++nopt;
        fprintf(stderr,"ERROR: Option -%c invalid\n",optopt);
        exit(1);
      }

  if (argc-nopt<2 || argc-nopt>3) {
    fprintf(stderr,"Usage: fastq_info [-f] fastq1 [fastq2 file|pe]\n");
    exit(1);
  }

  gzFile fd2=NULL;

  if (argc-nopt ==3) {
    is_paired_data=TRUE;
    // "pe" in place of a second file selects interleaved mode. The flag used
    // to be set to FALSE here, which made the interleaved branch unreachable.
    // NOTE(review): only the first two characters are compared, so a second
    // file whose name starts with "pe" is also taken as the keyword.
    if ( !strncmp(argv[2+nopt],"pe",2) ) {
      is_interleaved=TRUE;
    }
  }

  // ************************************************************
  if ( is_interleaved ) {
    // interleaved
    int rc=validate_interleaved(argv[1+nopt]);
    // validate_interleaved() counts reads two at a time, so 1 can only be
    // its error code, never a read count
    if (rc==1) {
      exit(1);
    }
    num_reads1=rc;
  } else {
    // single or pair of fastq file(s)
    unsigned long cline=1;
    fprintf(stderr,"HASHSIZE=%lu\n",(long unsigned int)HASHSIZE);
    hashtable sn_index=new_hashtable(HASHSIZE);
    index_mem+=sizeof(hashtable);

    index_file(argv[1+nopt],sn_index,0,-1);
    num_reads1=sn_index->n_entries;
    fprintf(stderr,"\n");
    // print some info
    fprintf(stderr,"Reads processed: %ld\n",sn_index->n_entries);
    fprintf(stderr,"Memory used in indexing: ~%ld MB\n",index_mem/1024/1024);
    // pair-end
    if (argc-nopt ==3 ) {
      fprintf(stderr,"File %s processed\n",argv[1+nopt]);
      fprintf(stderr,"Next file %s\n",argv[2+nopt]);
      // validate the second file and check if all reads are paired
      fd2=open_fastq(argv[2+nopt]);
      if (fd2==NULL) {
        // same check and message as index_file()
        fprintf(stderr,"\nError: Unable to open %s\n",argv[2+nopt]);
        exit(1);
      }
      gzFile fdf=open_fixed_fastq(argv[2+nopt]);
      INDEX_ENTRY* e;
      // read the entry using another fd
      cline=1;
      // TODO: improve code - mostly duplicated:(
      while(!gzeof(fd2)) {
        long long start_pos=gztell(fd2);
        char *hdr=READ_LINE_HDR(fd2);
        if ( hdr==NULL) break;
        int len;
        char *seq=READ_LINE_SEQ(fd2);
        char *hdr2=READ_LINE_HDR2(fd2);
        char *qual=READ_LINE_QUAL(fd2);
        char* readname=get_readname(hdr,&len,cline,argv[2+nopt]);
        if (seq==NULL || hdr2==NULL || qual==NULL ) {
          fprintf(stderr,"\nError in file %s, line %lu: file truncated?\n",argv[2+nopt],cline);
          exit(1);
        }
        if (validate_entry(hdr,hdr2,seq,qual,cline,argv[2+nopt])!=0) {
          exit(1);
        }
        // every read of the second file must have its mate in the index
        if ( (e=lookup_header(sn_index,readname))==NULL ) {
          fprintf(stderr,"\nError in file %s, line %lu: unpaired read - %s\n",argv[2+nopt],cline,readname);
          exit(1);
        } else {
          ulong key=hashit(readname);
          // remove the matched entry, so whatever remains in sn_index after
          // the loop is a read of the first file without a mate
          if (delete(sn_index,key,e)!=e) {
            fprintf(stderr,"\nError in file %s, line %lu: unable to delete entry from sn_index - %s\n",argv[2+nopt],cline,readname);
            exit(1);
          }
          free_indexentry(e);
        }
        PRINT_READS_PROCESSED(cline/4);
        ++num_reads2;
        //
        replace_dots(start_pos,seq,hdr,hdr2,qual,fdf);
        cline+=4;
      }
      printf("\n");
      close_fixed_fastq(fdf);
      if (sn_index->n_entries>0 ) {
        fprintf(stderr,"\nError in file %s: found %lu unpaired reads\n",argv[1+nopt],sn_index->n_entries);
        exit(1);
      }
    }
  }
  // with -f the summary goes to stderr instead of stdout
  FILE* out;
  if (fix_dot) {
    out=stderr;
  } else {
    out=stdout;
  }
  fprintf(out,"------------------------------------\n");
  if ( num_reads2>0 ) {
    fprintf(out,"Number of reads: %lu %lu\n",num_reads1,num_reads2);
  } else {
    fprintf(out,"Number of reads: %lu\n",num_reads1);
  }
  fprintf(out,"Quality encoding range: %lu %lu\n",min_qual,max_qual);
  char *enc=qualRange2enc(min_qual,max_qual);
  if ( enc == NULL ) {
    fprintf(stderr,"\nERROR: Unable to determine quality encoding - unknown range [%lu,%lu]\n",min_qual,max_qual);
    exit(1);
  }
  // reuse the value checked above instead of computing it a second time
  fprintf(out,"Quality encoding: %s\n",enc);
  fprintf(out,"Read length: %lu %lu\n",min_rl,max_rl);
  fprintf(out,"OK\n");
  exit(0);
}