示例#1
0
文件: tabix.c 项目: Bratdaking/pysam
/*
 * Entry point of the "tabix" sub-command.
 *
 * Depending on the command line it does one of three things:
 *   - with -a: prints every line of the compressed input, no index needed;
 *   - with a file name only: builds an index next to the input file;
 *   - with a file name and regions: prints the records of each region using
 *     the existing index.
 *
 * argc/argv are the usual command-line arguments, parsed with getopt().
 *
 * Returns 0 on success and 1 on a usage error or any failure.
 */
int main_tabix(int argc, char *argv[])
{
    int c, min_shift = -1, is_force = 0, is_all = 0;
    tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;

    while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0) {
        switch (c) {
        case '0': conf.preset |= TBX_UCSC; break;
        case 'f': is_force = 1; break;
        case 'a': is_all = 1; break;
        case 'm': min_shift = atoi(optarg); break;
        case 's': conf.sc = atoi(optarg); break;
        case 'b': conf.bc = atoi(optarg); break;
        case 'e': conf.ec = atoi(optarg); break;
        case 'c': conf.meta_char = *optarg; break;
        case 'S': conf.line_skip = atoi(optarg); break;
        case 'p':
            if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff;
            else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
            else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
            else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
            else {
                fprintf(stderr, "The type '%s' not recognised\n", optarg);
                return 1;
            }
            break;
        default:
            // unrecognised options are ignored, as before
            break;
        }
    }

    if (optind == argc) {
        fprintf(stderr, "\nUsage: bcftools tabix [options] <in.gz> [reg1 [...]]\n\n");
        fprintf(stderr, "Options: -p STR    preset: gff, bed, sam or vcf [gff]\n");
        fprintf(stderr, "         -s INT    column number for sequence names (suppressed by -p) [1]\n");
        fprintf(stderr, "         -b INT    column number for region start [4]\n");
        fprintf(stderr, "         -e INT    column number for region end (if no end, set INT to -b) [5]\n");
        fprintf(stderr, "         -0        specify coordinates are zero-based\n");
        fprintf(stderr, "         -S INT    skip first INT lines [0]\n");
        fprintf(stderr, "         -c CHAR   skip lines starting with CHAR [null]\n");
        fprintf(stderr, "         -a        print all records\n");
        fprintf(stderr, "         -f        force to overwrite existing index\n");
        fprintf(stderr, "         -m INT    set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n");
        fprintf(stderr, "\n");
        return 1;
    }

    if (is_all) { // read without random access
        kstring_t s;
        BGZF *fp;
        s.l = s.m = 0; s.s = 0;
        fp = bgzf_open(argv[optind], "r");
        if (fp == NULL) {
            // previously a failed open was passed straight to bgzf_getline()
            fprintf(stderr, "[E::%s] could not open '%s'\n", __func__, argv[optind]);
            return 1;
        }
        while (bgzf_getline(fp, '\n', &s) >= 0) puts(s.s);
        bgzf_close(fp);
        free(s.s);
    } else if (optind + 2 > argc) { // create index
        if ( !conf_ptr )
        {
            // auto-detect file type by file name
            size_t l = strlen(argv[optind]);
            int strcasecmp(const char *s1, const char *s2);
            if (l >= 7) {
                const char *ext = argv[optind] + l - 7;
                if (strcasecmp(ext, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff;
                else if (strcasecmp(ext, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed;
                else if (strcasecmp(ext, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam;
                else if (strcasecmp(ext, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf;
            }
        }
        // A preset (explicit or auto-detected) replaces the whole configuration,
        // including any fields set through the individual options above.
        if ( conf_ptr ) conf = *conf_ptr;

        if (!is_force) {
            // Refuse to clobber an existing index unless -f was given.
            const char *suffix = min_shift <= 0 ? ".tbi" : ".csi";
            size_t fn_len = strlen(argv[optind]) + strlen(suffix) + 1;
            char *fn = malloc(fn_len);
            FILE *fp;
            if (fn == NULL) {
                fprintf(stderr, "[E::%s] out of memory\n", __func__);
                return 1;
            }
            snprintf(fn, fn_len, "%s%s", argv[optind], suffix);
            fp = fopen(fn, "rb");
            free(fn);
            if (fp != NULL) {
                fclose(fp);
                fprintf(stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__);
                return 1;
            }
        }
        if ( tbx_index_build(argv[optind], min_shift, &conf) )
        {
            fprintf(stderr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n");
            return 1;
        }
    } else { // read with random access
        tbx_t *tbx;
        BGZF *fp;
        kstring_t s;
        int i;
        if ((tbx = tbx_index_load(argv[optind])) == 0) return 1;
        if ((fp = bgzf_open(argv[optind], "r")) == 0) {
            tbx_destroy(tbx); // do not leak the loaded index on failure
            return 1;
        }
        s.s = 0; s.l = s.m = 0;
        for (i = optind + 1; i < argc; ++i) {
            hts_itr_t *itr;
            // regions that cannot be resolved are silently skipped
            if ((itr = tbx_itr_querys(tbx, argv[i])) == 0) continue;
            while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) puts(s.s);
            tbx_itr_destroy(itr);
        }
        free(s.s);
        bgzf_close(fp);
        tbx_destroy(tbx);
    }
    return 0;
}
示例#2
0
/*
 * Read a BGZF-readable regions file into `reg`.
 *
 * Each non-comment line is "<seq> <from> [<to>]"; when <to> is missing it is
 * set equal to <from>.  Lines starting with '#' are skipped.  Positions must
 * be sorted within a sequence and every sequence must form a single
 * contiguous block of lines.
 *
 * fname  path of the file to read
 * reg    output; nseqs, seq_names, pos and npos are (re)initialised here.
 *        On failure `reg` may be left partially filled; this function does
 *        not free it.
 *
 * Returns 1 on success, 0 on any error (I/O, parse, ordering, out of memory)
 * and also 0 when no line at all could be read from the file.
 */
int init_regions(const char *fname, regions_t *reg)
{
    int bgzf_getline(BGZF *fp, int delim, kstring_t *str);

    BGZF *zfp = bgzf_open(fname, "r");
    if ( !zfp )
    {
        fprintf(stderr,"%s: %s\n",fname,strerror(errno));
        return 0;
    }

    // All locals that live across the cleanup labels are declared up front.
    int ret = 0;
    int i, j, mseqs = 10, mpos = 0;
    kstring_t str = {0,0,0};
    ssize_t nread;

    reg->nseqs = 0;
    reg->pos   = (pos_t **)calloc(mseqs,sizeof(pos_t*));
    reg->npos  = (int*) calloc(mseqs,sizeof(int));
    reg->seq_names = (char **) calloc(mseqs,sizeof(char*));
    if ( !reg->pos || !reg->npos || !reg->seq_names ) goto nomem;

    // NOTE(review): the loop ends at the first read that returns <= 0, so a
    // zero-length line would also stop parsing - confirm this is intended.
    while ((nread = bgzf_getline(zfp, '\n', &str)) > 0)
    {
        char *line = str.s;
        if ( line[0] == '#' ) continue;

        // Find the end of the sequence name (first whitespace character).
        int k = 0;
        while ( k<nread && !isspace((unsigned char)line[k]) ) k++;
        if ( k>=nread )
        {
            fprintf(stderr,"Could not parse the file: %s [%s]\n", fname,line);
            goto cleanup;
        }
        line[k] = 0;

        if ( reg->nseqs==0 || strcmp(line,reg->seq_names[reg->nseqs-1]) )
        {
            // New sequence: make sure one spare slot exists past the new entry
            if ( reg->nseqs+1 >= mseqs )
            {
                int grown = mseqs + 1;
                pos_t **new_pos = (pos_t **) realloc(reg->pos,sizeof(pos_t*)*grown);
                if ( !new_pos ) goto nomem;
                reg->pos = new_pos;
                reg->pos[grown-1] = NULL;

                int *new_npos = (int *) realloc(reg->npos,sizeof(int)*grown);
                if ( !new_npos ) goto nomem;
                reg->npos = new_npos;
                reg->npos[grown-1] = 0;

                char **new_names = (char**) realloc(reg->seq_names,sizeof(char*)*grown);
                if ( !new_names ) goto nomem;
                reg->seq_names = new_names;

                mseqs = grown;
            }
            char *name = strdup(line);
            if ( !name ) goto nomem;
            reg->seq_names[reg->nseqs] = name;
            reg->nseqs++;
            mpos = 0;   // capacity of the position array of the new sequence
        }

        int iseq = reg->nseqs-1;
        if ( reg->npos[iseq] >= mpos )
        {
            pos_t *more = (pos_t*) realloc(reg->pos[iseq],sizeof(pos_t)*(mpos+100));
            if ( !more ) goto nomem;
            reg->pos[iseq] = more;
            mpos += 100;
        }
        int ipos = reg->npos[iseq];
        pos_t *pos = reg->pos[iseq];

        // One call covers both "<from> <to>" and the "<from>" only form.
        int nfields = sscanf(line+k+1,"%d %d",&pos[ipos].from,&pos[ipos].to);
        if ( nfields < 1 )
        {
            fprintf(stderr,"Could not parse the region [%s]\n",line+k+1);
            goto cleanup;
        }
        if ( nfields == 1 ) pos[ipos].to = pos[ipos].from;
        reg->npos[iseq]++;  // counted only once the entry is fully parsed

        // Check that the file is sorted
        if ( ipos>0 && (pos[ipos].from < pos[ipos-1].from || (pos[ipos].from==pos[ipos-1].from && pos[ipos].to<pos[ipos-1].to)) )
        {
            fprintf(stderr,"The file is not sorted: %s\n", fname);
            goto cleanup;
        }
    }

    // Check that chromosomes come in blocks
    for (i=0; i<reg->nseqs; i++)
    {
        for (j=0; j<i; j++)
        {
            if ( !strcmp(reg->seq_names[i],reg->seq_names[j]) )
            {
                fprintf(stderr,"The file is not sorted: %s\n", fname);
                goto cleanup;
            }
        }
    }

    // A line buffer that was never allocated means nothing was read: failure.
    ret = str.m ? 1 : 0;
    goto cleanup;

nomem:
    fprintf(stderr,"Out of memory while reading %s\n", fname);
cleanup:
    // Release the line buffer and the file handle on every exit path.
    free(str.s);
    bgzf_close(zfp);
    return ret;
}
示例#3
0
// Looks up a position in a memory-mapped index of Record entries, seeks the
// BGZF file to the stored offset and prints up to the first 50 characters of
// the line found there.
//
// Usage: <prog> <bgzf-file> <index-file> <pos>
//
// Returns 0 on success and 1 on a usage error, when the position is not in
// the index, or when opening/seeking/reading the BGZF file fails.
int main(int argc, char** argv) {
  if (argc < 4) {
    fprintf(stderr, "Usage: %s <bgzf-file> <index-file> <pos>\n",
            argc > 0 ? argv[0] : "prog");
    return 1;
  }
  const char* fBG = argv[1];
  const char* fIndex = argv[2];
  int64_t pos = strtol(argv[3], NULL, 0);

  // Map the whole index file.
  MmapFile mmapFile;
  mmapFile.open(fIndex);
  // NOTE(review): 16 is presumably sizeof(Record) and the "- 1" presumably
  // accounts for one non-searchable entry - confirm against the index writer.
  size_t nEntries = mmapFile.getFileSize() / 16;
  if (nEntries == 0) {
    // Without this guard "nEntries - 1" wraps around to a huge size_t.
    fprintf(stderr, "Index file too small!\n");
    return 1;
  }
  size_t Nrecord = nEntries - 1;
  Record* r = (Record*)mmapFile.data;

  // Binary search for the file position.
  int64_t offset = -1;
  Record query;
  query.pos = pos;
  Record* lb = std::lower_bound(r, r + Nrecord, query,
                                comparator);  // first entry with pos >= query
  Record* ub = std::upper_bound(lb, r + Nrecord, query,
                                comparator);  // first entry with pos > query
  if (lb != ub) {
    // (TODO) only store one virtual offset for now: use the first match.
    printf("%lld %lld\n", (long long)lb->pos, (long long)lb->offset);
    offset = lb->offset;
  }

  if (offset < 0) {
    // Seeking to a negative offset is meaningless, so stop here.
    fprintf(stderr, "Cannot find position!\n");
    return 1;
  }
  printf("found: %lld %lld\n", (long long)pos, (long long)offset);

  BGZF* fp2 = bgzf_open(fBG, "rb");
  if (fp2 == NULL) {
    fprintf(stderr, "Cannot open %s!\n", fBG);
    return 1;
  }
  if (bgzf_seek(fp2, offset, SEEK_SET)) {
    fprintf(stderr, "seek error!\n");
    bgzf_close(fp2);
    return 1;
  }

  int rc = 0;
  kstring_t s = {0, 0, 0};
  int ret = bgzf_getline(fp2, '\n', &s);
  if (ret <= 0) {
    fprintf(stderr, "getline error, ret = %d!\n", ret);
    rc = 1;
  }

  // Show at most the first 50 characters of the line.
  size_t shown = s.l < 50 ? s.l : 50;
  if (shown > 0) {
    fwrite(s.s, 1, shown, stdout);
  }
  printf("\n");

  free(s.s);  // the line buffer itself, which the old code never released
  bgzf_close(fp2);

  return rc;
}