int main_tabix(int argc, char *argv[]) { int c, min_shift = -1, is_force = 0, is_all = 0; tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL; while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0) if (c == '0') conf.preset |= TBX_UCSC; else if (c == 'f') is_force = 1; else if (c == 'a') is_all = 1; else if (c == 'm') min_shift = atoi(optarg); else if (c == 's') conf.sc = atoi(optarg); else if (c == 'b') conf.bc = atoi(optarg); else if (c == 'e') conf.ec = atoi(optarg); else if (c == 'c') conf.meta_char = *optarg; else if (c == 'S') conf.line_skip = atoi(optarg); else if (c == 'p') { if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff; else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed; else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam; else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf; else { fprintf(stderr, "The type '%s' not recognised\n", optarg); return 1; } } if (optind == argc) { fprintf(stderr, "\nUsage: bcftools tabix [options] <in.gz> [reg1 [...]]\n\n"); fprintf(stderr, "Options: -p STR preset: gff, bed, sam or vcf [gff]\n"); fprintf(stderr, " -s INT column number for sequence names (suppressed by -p) [1]\n"); fprintf(stderr, " -b INT column number for region start [4]\n"); fprintf(stderr, " -e INT column number for region end (if no end, set INT to -b) [5]\n"); fprintf(stderr, " -0 specify coordinates are zero-based\n"); fprintf(stderr, " -S INT skip first INT lines [0]\n"); fprintf(stderr, " -c CHAR skip lines starting with CHAR [null]\n"); fprintf(stderr, " -a print all records\n"); fprintf(stderr, " -f force to overwrite existing index\n"); fprintf(stderr, " -m INT set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n"); fprintf(stderr, "\n"); return 1; } if (is_all) { // read without random access kstring_t s; BGZF *fp; s.l = s.m = 0; s.s = 0; fp = bgzf_open(argv[optind], "r"); while (bgzf_getline(fp, '\n', &s) >= 0) puts(s.s); bgzf_close(fp); free(s.s); } else if (optind + 2 > argc) { // 
create index if ( !conf_ptr ) { // auto-detect file type by file name int l = strlen(argv[optind]); int strcasecmp(const char *s1, const char *s2); if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf; } if ( conf_ptr ) conf = *conf_ptr; if (!is_force) { char *fn; FILE *fp; fn = (char*)alloca(strlen(argv[optind]) + 5); strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi"); if ((fp = fopen(fn, "rb")) != 0) { fclose(fp); fprintf(stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__); return 1; } } if ( tbx_index_build(argv[optind], min_shift, &conf) ) { fprintf(stderr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n"); return 1; } } else { // read with random access tbx_t *tbx; BGZF *fp; kstring_t s; int i; if ((tbx = tbx_index_load(argv[optind])) == 0) return 1; if ((fp = bgzf_open(argv[optind], "r")) == 0) return 1; s.s = 0; s.l = s.m = 0; for (i = optind + 1; i < argc; ++i) { hts_itr_t *itr; if ((itr = tbx_itr_querys(tbx, argv[i])) == 0) continue; while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) puts(s.s); tbx_itr_destroy(itr); } free(s.s); bgzf_close(fp); tbx_destroy(tbx); } return 0; }
/*
 * init_regions - load a BGZF-readable regions file into `reg`.
 *
 * Each line holds a sequence name terminated by a whitespace character,
 * followed by either "from to" or a single "from" (in which case `to` is
 * set equal to `from`).  Lines whose first character is '#' are skipped.
 *
 * The input must be sorted: within a sequence by `from`, then by `to`, and
 * all lines of one sequence must be adjacent.
 *
 * Returns 1 on success and 0 on failure.  As the loop only continues while
 * bgzf_getline() returns a positive value, reading also stops at the first
 * line for which it returns 0.  A file for which the line buffer was never
 * allocated is reported as failure.  On failure `reg` may be left partially
 * filled; nothing stored in it is released here.
 */
int init_regions(const char *fname, regions_t *reg)
{
    int bgzf_getline(BGZF *fp, int delim, kstring_t *str);

    int ret = 0;                /* becomes 1 only once everything succeeded */
    int i, j, mseqs = 10, mpos = 0, nread;
    kstring_t str = {0,0,0};
    BGZF *zfp = bgzf_open(fname, "r");
    if ( !zfp )
    {
        fprintf(stderr,"%s: %s\n",fname,strerror(errno));
        return 0;
    }

    reg->nseqs = 0;
    reg->pos = (pos_t **) calloc(mseqs,sizeof(pos_t*));
    reg->npos = (int*) calloc(mseqs,sizeof(int));
    reg->seq_names = (char **) calloc(mseqs,sizeof(char*));
    if ( !reg->pos || !reg->npos || !reg->seq_names ) goto nomem;

    while ( (nread = bgzf_getline(zfp, '\n', &str)) > 0 )
    {
        char *line = str.s;
        int k, iseq, ipos;
        pos_t *pos;

        if ( line[0] == '#' ) continue;

        /* Locate the end of the sequence name.  The cast keeps isspace()
           well-defined for bytes that are negative as plain char. */
        k = 0;
        while ( k<nread && !isspace((unsigned char)line[k]) ) k++;
        if ( k>=nread )
        {
            fprintf(stderr,"Could not parse the file: %s [%s]\n", fname,line);
            goto done;
        }
        line[k] = 0;

        if ( reg->nseqs==0 || strcmp(line,reg->seq_names[reg->nseqs-1]) )
        {
            /* New sequence.  The arrays are grown first and nseqs is bumped
               last, so the counters never run ahead of initialised slots. */
            char *name;
            if ( reg->nseqs+1 >= mseqs )
            {
                pos_t **new_pos;
                int *new_npos;
                char **new_names;

                mseqs++;
                new_pos = (pos_t **) realloc(reg->pos,sizeof(pos_t*)*mseqs);
                if ( !new_pos ) goto nomem;
                reg->pos = new_pos;
                reg->pos[mseqs-1] = NULL;

                new_npos = (int *) realloc(reg->npos,sizeof(int)*mseqs);
                if ( !new_npos ) goto nomem;
                reg->npos = new_npos;
                reg->npos[mseqs-1] = 0;

                new_names = (char**) realloc(reg->seq_names,sizeof(char*)*mseqs);
                if ( !new_names ) goto nomem;
                reg->seq_names = new_names;
                reg->seq_names[mseqs-1] = NULL;
            }
            name = strdup(line);
            if ( !name ) goto nomem;
            reg->seq_names[reg->nseqs] = name;
            reg->nseqs++;
            mpos = 0;           /* capacity of the new sequence's position array */
        }

        iseq = reg->nseqs-1;
        if ( reg->npos[iseq] >= mpos )
        {
            pos_t *grown;
            mpos += 100;
            grown = (pos_t*) realloc(reg->pos[iseq],sizeof(pos_t)*mpos);
            if ( !grown ) goto nomem;
            reg->pos[iseq] = grown;
        }
        ipos = reg->npos[iseq];
        pos  = reg->pos[iseq];
        reg->npos[iseq]++;

        if ( (sscanf(line+k+1,"%d %d",&pos[ipos].from,&pos[ipos].to))!=2 )
        {
            if ( (sscanf(line+k+1,"%d",&pos[ipos].from))!=1 )
            {
                fprintf(stderr,"Could not parse the region [%s]\n",line+k+1);
                goto done;
            }
            pos[ipos].to = pos[ipos].from;
        }

        /* Check that the file is sorted */
        if ( ipos>0 && (pos[ipos].from < pos[ipos-1].from
                        || (pos[ipos].from==pos[ipos-1].from && pos[ipos].to<pos[ipos-1].to)) )
        {
            fprintf(stderr,"The file is not sorted: %s\n", fname);
            goto done;
        }
    }
    if ( nread < -1 )
    {
        /* bgzf_getline() reports end-of-file as -1; anything lower is a
           read error and must not be mistaken for a clean end of input. */
        fprintf(stderr,"Error reading the file: %s\n", fname);
        goto done;
    }

    /* Check that chromosomes come in blocks */
    for (i=0; i<reg->nseqs; i++)
    {
        for (j=0; j<i; j++)
        {
            if ( !strcmp(reg->seq_names[i],reg->seq_names[j]) )
            {
                fprintf(stderr,"The file is not sorted: %s\n", fname);
                goto done;
            }
        }
    }

    /* Success only if the line buffer was ever allocated. */
    if ( str.m ) ret = 1;
    goto done;

nomem:
    fprintf(stderr,"Could not allocate memory while reading: %s\n", fname);
done:
    /* Single exit: the line buffer and the file handle are released on every path. */
    free(str.s);
    bgzf_close(zfp);
    return ret;
}
int main(int argc, char** argv) { const char* fBG = argv[1]; const char* fIndex = argv[2]; int64_t pos = strtol(argv[3], NULL, 0); // const int Nrecord = 10; // read everything MmapFile mmapFile; mmapFile.open(fIndex); size_t Nrecord = mmapFile.getFileSize() / 16 - 1; Record* r = (Record*)mmapFile.data; // FILE* fp = fopen(fIndex, "rb"); // if (Nrecord != fread(r, sizeof(Record), Nrecord, fp)) { // fprintf(stderr, "Read error!\n"); // } // binary search for file position int64_t offset = -1; Record query; query.pos = pos; // Comparator comparator; Record* lb = std::lower_bound(r, r + Nrecord, query, comparator); // r[lb].pos >= query.pos Record* ub = std::upper_bound(lb, r + Nrecord, query, comparator); // r[ub].pos > query.pos for (Record* pi = lb; pi != ub; ++pi) { printf("%ld %ld\n", pi->pos, pi->offset); offset = pi->offset; // (TODO) only store one virtual offset for now. break; } // int64_t offset = -1; // for (int i = 0; i < Nrecord; ++i) { // if (r[i].pos == pos) { // offset = r[i].offset; // break; // } // } if (offset < 0) { fprintf(stderr, "Cannot find position!\n"); } else { printf("found: %ld %ld\n", pos, offset); } BGZF* fp2 = bgzf_open(fBG, "rb"); if (bgzf_seek(fp2, offset, SEEK_SET)) { fprintf(stderr, "seek error!\n"); } kstring_t* str; str = (kstring_t*)calloc(1, sizeof(kstring_t)); kstring_t& s = *str; int ret = bgzf_getline(fp2, '\n', &s); if (ret <= 0) { fprintf(stderr, "getline error, ret = %d!\n", ret); } for (size_t i = 0; i < s.l; ++i) { if (i >= 50) break; printf("%c", s.s[i]); } printf("\n"); free(str); bgzf_close(fp2); // fclose(fp); return 0; }