char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) { int l; char c; khiter_t iter; faidx1_t val; char *seq=NULL; // Adjust position iter = kh_get(s, fai->hash, c_name); if(iter == kh_end(fai->hash)) return 0; val = kh_value(fai->hash, iter); if(p_end_i < p_beg_i) p_beg_i = p_end_i; if(p_beg_i < 0) p_beg_i = 0; else if(val.len <= p_beg_i) p_beg_i = val.len - 1; if(p_end_i < 0) p_end_i = 0; else if(val.len <= p_end_i) p_end_i = val.len - 1; // Now retrieve the sequence int ret = bgzf_useek(fai->bgzf, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET); if ( ret<0 ) { *len = -1; fprintf(stderr, "[fai_fetch_seq] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n"); return NULL; } l = 0; seq = (char*)malloc(p_end_i - p_beg_i + 2); while ( (c=bgzf_getc(fai->bgzf))>=0 && l < p_end_i - p_beg_i + 1) if (isgraph(c)) seq[l++] = c; seq[l] = '\0'; *len = l; return seq; }
char *fai_fetch(const faidx_t *fai, const char *str, int *len) { char *s; int c, i, l, k, name_end; khiter_t iter; faidx1_t val; khash_t(s) *h; int beg, end; beg = end = -1; h = fai->hash; name_end = l = strlen(str); s = (char*)malloc(l+1); // remove space for (i = k = 0; i < l; ++i) if (!isspace(str[i])) s[k++] = str[i]; s[k] = 0; l = k; // determine the sequence name for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end if (i >= 0) name_end = i; if (name_end < l) { // check if this is really the end int n_hyphen = 0; for (i = name_end + 1; i < l; ++i) { if (s[i] == '-') ++n_hyphen; else if (!isdigit(s[i]) && s[i] != ',') break; } if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name s[name_end] = 0; iter = kh_get(s, h, s); if (iter == kh_end(h)) { // cannot find the sequence name iter = kh_get(s, h, str); // try str as the name if (iter == kh_end(h)) { *len = 0; free(s); return 0; } else s[name_end] = ':', name_end = l; } } else iter = kh_get(s, h, str); if(iter == kh_end(h)) { fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str); free(s); *len = -2; return 0; }; val = kh_value(h, iter); // parse the interval if (name_end < l) { for (i = k = name_end + 1; i < l; ++i) if (s[i] != ',') s[k++] = s[i]; s[k] = 0; beg = atoi(s + name_end + 1); for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; end = i < k? atoi(s + i + 1) : val.len; if (beg > 0) --beg; } else beg = 0, end = val.len; if (beg >= val.len) beg = val.len; if (end >= val.len) end = val.len; if (beg > end) beg = end; free(s); // now retrieve the sequence int ret = bgzf_useek(fai->bgzf, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); if ( ret<0 ) { *len = -1; fprintf(stderr, "[fai_fetch] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n"); return NULL; } l = 0; s = (char*)malloc(end - beg + 2); while ( (c=bgzf_getc(fai->bgzf))>=0 && l < end - beg ) if (isgraph(c)) s[l++] = c; s[l] = '\0'; *len = l; return s; }
int main(int argc, char **argv) { int c, compress, pstdout, is_forced, index = 0, rebgzip = 0, reindex = 0; BGZF *fp; void *buffer; long start, end, size; char *index_fname = NULL; int threads = 1; static const struct option loptions[] = { {"help", no_argument, NULL, 'h'}, {"offset", required_argument, NULL, 'b'}, {"stdout", no_argument, NULL, 'c'}, {"decompress", no_argument, NULL, 'd'}, {"force", no_argument, NULL, 'f'}, {"index", no_argument, NULL, 'i'}, {"index-name", required_argument, NULL, 'I'}, {"reindex", no_argument, NULL, 'r'}, {"rebgzip",no_argument,NULL,'g'}, {"size", required_argument, NULL, 's'}, {"threads", required_argument, NULL, '@'}, {"version", no_argument, NULL, 1}, {NULL, 0, NULL, 0} }; compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; while((c = getopt_long(argc, argv, "cdh?fb:@:s:iI:gr",loptions,NULL)) >= 0){ switch(c){ case 'd': compress = 0; break; case 'c': pstdout = 1; break; case 'b': start = atol(optarg); compress = 0; pstdout = 1; break; case 's': size = atol(optarg); pstdout = 1; break; case 'f': is_forced = 1; break; case 'i': index = 1; break; case 'I': index_fname = optarg; break; case 'g': rebgzip = 1; break; case 'r': reindex = 1; compress = 0; break; case '@': threads = atoi(optarg); break; case 1: printf( "bgzip (htslib) %s\n" "Copyright (C) 2017 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 'h': case '?': return bgzip_main_usage(); } } if (size >= 0) end = start + size; if (end >= 0 && end < start) { fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); return 1; } if (compress == 1) { struct stat sbuf; int f_src = fileno(stdin); if ( argc>optind ) { if ( stat(argv[optind],&sbuf)<0 ) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } if ((f_src = open(argv[optind], O_RDONLY)) < 0) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } if (pstdout) fp = bgzf_open("-", "w"); else { char *name = malloc(strlen(argv[optind]) + 5); strcpy(name, argv[optind]); strcat(name, ".gz"); fp = bgzf_open(name, is_forced? "w" : "wx"); if (fp == NULL && errno == EEXIST && confirm_overwrite(name)) fp = bgzf_open(name, "w"); if (fp == NULL) { fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno)); free(name); return 1; } free(name); } } else if (!pstdout && isatty(fileno((FILE *)stdout)) ) return bgzip_main_usage(); else if ( index && !index_fname ) { fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); return 1; } else fp = bgzf_open("-", "w"); if ( index && rebgzip ) { fprintf(stderr, "[bgzip] Can't produce a index and rebgzip simultaneously\n"); return 1; } if ( rebgzip && !index_fname ) { fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); return 1; } if (threads > 1) bgzf_mt(fp, threads, 256); if ( index ) bgzf_index_build_init(fp); buffer = malloc(WINDOW_SIZE); #ifdef _WIN32 _setmode(f_src, O_BINARY); #endif if (rebgzip){ if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) error("Could not load index: %s.gzi\n", argv[optind]); while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) if (bgzf_block_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); } else { while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); } if ( index ) { if (index_fname) { if (bgzf_index_dump(fp, index_fname, NULL) < 0) error("Could not write index to '%s'\n", index_fname); } else { if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0) error("Could not write index to '%s.gz.gzi'", argv[optind]); } } if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode); if (argc > optind && !pstdout) unlink(argv[optind]); free(buffer); close(f_src); return 0; } else if ( reindex ) { if ( argc>optind ) { fp = bgzf_open(argv[optind], "r"); if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]); } else { if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n"); fp = bgzf_open("-", "r"); if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno)); } buffer = malloc(BGZF_BLOCK_SIZE); bgzf_index_build_init(fp); int ret; while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ; free(buffer); if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n"); if ( index_fname ) { if (bgzf_index_dump(fp, index_fname, NULL) < 0) error("Could not write index to '%s'\n", index_fname); } else { if (bgzf_index_dump(fp, argv[optind], ".gzi") < 0) error("Could not write index to '%s.gzi'\n", argv[optind]); } if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode); return 0; } else { struct stat sbuf; int f_dst; if ( argc>optind ) { if ( stat(argv[optind],&sbuf)<0 ) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } char *name; int len = strlen(argv[optind]); if ( strcmp(argv[optind]+len-3,".gz") ) { fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); return 1; } fp = bgzf_open(argv[optind], "r"); if (fp == NULL) { fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); return 1; } if (pstdout) { f_dst = fileno(stdout); } else { const int wrflags = O_WRONLY | O_CREAT | O_TRUNC; name = strdup(argv[optind]); name[strlen(name) - 3] = '\0'; f_dst = open(name, is_forced? wrflags : wrflags|O_EXCL, 0666); if (f_dst < 0 && errno == EEXIST && confirm_overwrite(name)) f_dst = open(name, wrflags, 0666); if (f_dst < 0) { fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno)); free(name); return 1; } free(name); } } else if (!pstdout && isatty(fileno((FILE *)stdin)) ) return bgzip_main_usage(); else { f_dst = fileno(stdout); fp = bgzf_open("-", "r"); if (fp == NULL) { fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); return 1; } } if (threads > 1) bgzf_mt(fp, threads, 256); buffer = malloc(WINDOW_SIZE); if ( start>0 ) { if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]); if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start); } #ifdef _WIN32 _setmode(f_dst, O_BINARY); #endif while (1) { if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); if (c == 0) break; if (c < 0) error("Could not read %d bytes: Error %d\n", (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start), fp->errcode); start += c; if ( write(f_dst, buffer, c) != c ) { #ifdef _WIN32 if (GetLastError() != ERROR_NO_DATA) #endif error("Could not write %d bytes\n", c); } if (end >= 0 && start >= end) break; } free(buffer); if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode); if (!pstdout) unlink(argv[optind]); return 0; } }
int main(int argc, char **argv) { int c, compress, pstdout, is_forced, index = 0, reindex = 0; BGZF *fp; void *buffer; long start, end, size; char *index_fname = NULL; static struct option loptions[] = { {"help",0,0,'h'}, {"offset",1,0,'b'}, {"stdout",0,0,'c'}, {"decompress",0,0,'d'}, {"force",0,0,'f'}, {"index",0,0,'i'}, {"index-name",1,0,'I'}, {"reindex",0,0,'r'}, {"size",1,0,'s'}, {0,0,0,0} }; compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; while((c = getopt_long(argc, argv, "cdh?fb:s:iI:r",loptions,NULL)) >= 0){ switch(c){ case 'd': compress = 0; break; case 'c': pstdout = 1; break; case 'b': start = atol(optarg); compress = 0; pstdout = 1; break; case 's': size = atol(optarg); pstdout = 1; break; case 'f': is_forced = 1; break; case 'i': index = 1; break; case 'I': index_fname = optarg; break; case 'r': reindex = 1; compress = 0; break; case 'h': case '?': return bgzip_main_usage(); } } if (size >= 0) end = start + size; if (end >= 0 && end < start) { fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); return 1; } if (compress == 1) { struct stat sbuf; int f_src = fileno(stdin); int f_dst = fileno(stdout); if ( argc>optind ) { if ( stat(argv[optind],&sbuf)<0 ) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } if ((f_src = open(argv[optind], O_RDONLY)) < 0) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } if (pstdout) f_dst = fileno(stdout); else { char *name = malloc(strlen(argv[optind]) + 5); strcpy(name, argv[optind]); strcat(name, ".gz"); f_dst = write_open(name, is_forced); if (f_dst < 0) return 1; free(name); } } else if (!pstdout && isatty(fileno((FILE *)stdout)) ) return bgzip_main_usage(); else if ( index && !index_fname ) { fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); return 1; } fp = bgzf_fdopen(f_dst, "w"); if ( index ) bgzf_index_build_init(fp); buffer = malloc(WINDOW_SIZE); while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); // f_dst will be closed here if ( index ) { if ( index_fname ) bgzf_index_dump(fp, index_fname, NULL); else bgzf_index_dump(fp, argv[optind], ".gz.gzi"); } if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode); if (argc > optind && !pstdout) unlink(argv[optind]); free(buffer); close(f_src); return 0; } else if ( reindex ) { if ( argc>optind ) { fp = bgzf_open(argv[optind], "r"); if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]); } else { if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n"); fp = bgzf_fdopen(fileno(stdin), "r"); if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno)); } buffer = malloc(BGZF_BLOCK_SIZE); bgzf_index_build_init(fp); int ret; while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ; free(buffer); if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n"); if ( index_fname ) bgzf_index_dump(fp, index_fname, NULL); else bgzf_index_dump(fp, argv[optind], ".gzi"); if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode); return 0; } else { struct stat sbuf; int f_dst; if ( argc>optind ) { if ( stat(argv[optind],&sbuf)<0 ) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } char *name; int len = strlen(argv[optind]); if ( strcmp(argv[optind]+len-3,".gz") ) { fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); return 1; } fp = bgzf_open(argv[optind], "r"); if (fp == NULL) { fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); return 1; } if (pstdout) { f_dst = fileno(stdout); } else { name = strdup(argv[optind]); name[strlen(name) - 3] = '\0'; f_dst = write_open(name, is_forced); free(name); } } else if (!pstdout && isatty(fileno((FILE *)stdin)) ) return bgzip_main_usage(); else { f_dst = fileno(stdout); fp = bgzf_fdopen(fileno(stdin), "r"); if (fp == NULL) { fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); return 1; } } buffer = malloc(WINDOW_SIZE); if ( start>0 ) { if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]); if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start); } while (1) { if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); if (c == 0) break; if (c < 0) error("Could not read %d bytes: Error %d\n", (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start), fp->errcode); start += c; if ( write(f_dst, buffer, c) != c ) error("Could not write %d bytes\n", c); if (end >= 0 && start >= end) break; } free(buffer); if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode); if (!pstdout) unlink(argv[optind]); return 0; } return 0; }