char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len) { int l; char c; khiter_t iter; faidx1_t val; char *seq=NULL; // Adjust position iter = kh_get(s, fai->hash, c_name); if(iter == kh_end(fai->hash)) return 0; val = kh_value(fai->hash, iter); if(p_end_i < p_beg_i) p_beg_i = p_end_i; if(p_beg_i < 0) p_beg_i = 0; else if(val.len <= p_beg_i) p_beg_i = val.len - 1; if(p_end_i < 0) p_end_i = 0; else if(val.len <= p_end_i) p_end_i = val.len - 1; // Now retrieve the sequence l = 0; seq = (char*)malloc(p_end_i - p_beg_i + 2); razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET); while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1) if (isgraph(c)) seq[l++] = c; seq[l] = '\0'; *len = l; return seq; }
char *fai_fetch(const faidx_t *fai, const char *str, int *len) { char *s, *p, c; int i, l, k; khiter_t iter; faidx1_t val; khash_t(s) *h; int beg, end; beg = end = -1; h = fai->hash; l = strlen(str); p = s = (char*)malloc(l+1); /* squeeze out "," */ for (i = k = 0; i != l; ++i) if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; s[k] = 0; for (i = 0; i != k; ++i) if (s[i] == ':') break; s[i] = 0; iter = kh_get(s, h, s); /* get the ref_id */ if (iter == kh_end(h)) { *len = 0; fprintf(stderr,"faidx fetch %s failed",str); free(s); return 0; } val = kh_value(h, iter); if (i == k) { /* dump the whole sequence */ beg = 0; end = val.len; } else { for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; beg = atoi(p); if (i < k) { p = s + i + 1; end = atoi(p); } else end = val.len; } if (beg > 0) --beg; if (beg >= val.len) beg = val.len; if (end >= val.len) end = val.len; if (beg > end) beg = end; free(s); // now retrieve the sequence l = 0; s = (char*)malloc(end - beg + 2); razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err) if (isgraph(c)) s[l++] = c; s[l] = '\0'; *len = l; return s; }
char* faidx_fetch_seq_forced_lower( const faidx_t* fai, const char *c_name, int p_beg_i, int p_end_i ) { int l; char c; khiter_t iter; faidx1_t val; char* seq0; char* seq = NULL; iter = kh_get(s, fai->hash, c_name); if(iter == kh_end(fai->hash)) return 0; seq0 = seq = (char*)malloc( (p_end_i - p_beg_i + 2) * sizeof(char) ); if( seq0 == NULL ) fail( "Out of memory.\n" ); seq0[p_end_i-p_beg_i+1] = '\0'; val = kh_value(fai->hash, iter); /* entirely off the map: all Ns */ if( p_beg_i >= (int)val.len || p_end_i < 0 ) { while( p_beg_i <= p_end_i ) { *seq++ ='n'; } return seq0; } /* beginning is off the map */ while( p_beg_i < 0 && p_beg_i <= p_end_i ) { *seq++ = 'n'; p_beg_i++; } /* end is off the map */ while( p_end_i >= (int)val.len ) { seq[p_end_i-p_beg_i] = 'n'; p_end_i--; } /* retrieve the sequence */ l = 0; razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET); while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1) if (isgraph(c)) seq[l++] = tolower(c); while( p_beg_i+l <= p_end_i ) seq[l++] = 'n'; return seq0; }
char *fai_fetch(const faidx_t *fai, const char *str, int *len) { char *s, c; int i, l, k, name_end; khiter_t iter; faidx1_t val; khash_t(s) *h; int beg, end; beg = end = -1; h = fai->hash; name_end = l = strlen(str); s = (char*)malloc(l+1); // remove space for (i = k = 0; i < l; ++i) if (!isspace(str[i])) s[k++] = str[i]; s[k] = 0; l = k; // determine the sequence name for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end if (i >= 0) name_end = i; if (name_end < l) { // check if this is really the end int n_hyphen = 0; for (i = name_end + 1; i < l; ++i) { if (s[i] == '-') ++n_hyphen; else if (!isdigit(s[i]) && s[i] != ',') break; } if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name s[name_end] = 0; iter = kh_get(s, h, s); if (iter == kh_end(h)) { // cannot find the sequence name iter = kh_get(s, h, str); // try str as the name if (iter == kh_end(h)) { *len = 0; free(s); return 0; } else s[name_end] = ':', name_end = l; } } else iter = kh_get(s, h, str); if(iter == kh_end(h)) { fprintf(pysamerr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str); free(s); return 0; }; val = kh_value(h, iter); // parse the interval if (name_end < l) { for (i = k = name_end + 1; i < l; ++i) if (s[i] != ',') s[k++] = s[i]; s[k] = 0; beg = atoi(s + name_end + 1); for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; end = i < k? atoi(s + i + 1) : val.len; if (beg > 0) --beg; } else beg = 0, end = val.len; if (beg >= val.len) beg = val.len; if (end >= val.len) end = val.len; if (beg > end) beg = end; free(s); // now retrieve the sequence l = 0; s = (char*)malloc(end - beg + 2); razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err) if (isgraph(c)) s[l++] = c; s[l] = '\0'; *len = l; return s; }
int main(int argc, char **argv) { int c, compress, pstdout, is_forced; RAZF *rz; void *buffer; long start, end, size; compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){ switch(c){ case 'h': return razf_main_usage(); case 'd': compress = 0; break; case 'c': pstdout = 1; break; case 'l': compress = 2; break; case 'b': start = atol(optarg); break; case 's': size = atol(optarg); break; case 'f': is_forced = 1; break; } } if (size >= 0) end = start + size; if(end >= 0 && end < start){ fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end); return 1; } if(compress == 1){ int f_src, f_dst = -1; if(argc > optind){ if((f_src = open(argv[optind], O_RDONLY)) < 0){ fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]); return 1; } if(pstdout){ f_dst = fileno(stdout); } else { char *name = malloc(sizeof(strlen(argv[optind]) + 5)); strcpy(name, argv[optind]); strcat(name, ".rz"); f_dst = write_open(name, is_forced); if (f_dst < 0) return 1; free(name); } } else if(pstdout){ f_src = fileno(stdin); f_dst = fileno(stdout); } else return razf_main_usage(); rz = razf_dopen(f_dst, "w"); buffer = malloc(WINDOW_SIZE); while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c); razf_close(rz); // f_dst will be closed here if (argc > optind && !pstdout) unlink(argv[optind]); free(buffer); close(f_src); return 0; } else { if(argc <= optind) return razf_main_usage(); if(compress == 2){ rz = razf_open(argv[optind], "r"); if(rz->file_type == FILE_TYPE_RZ) { printf("%20s%20s%7s %s\n", "compressed", "uncompressed", "ratio", "name"); printf("%20lld%20lld%6.1f%% %s\n", (long long)rz->end, (long long)rz->src_end, rz->end * 100.0f / rz->src_end, argv[optind]); } else fprintf(stdout, "%s is not a regular rz file\n", argv[optind]); } else { int f_dst; if (argc > optind && !pstdout) { char *name; if (strstr(argv[optind], ".rz") - argv[optind] != strlen(argv[optind]) - 3) { printf("razip: %s: unknown suffix -- ignored\n", argv[optind]); return 1; } name = strdup(argv[optind]); name[strlen(name) - 3] = '\0'; f_dst = write_open(name, is_forced); free(name); } else f_dst = fileno(stdout); rz = razf_open(argv[optind], "r"); buffer = malloc(WINDOW_SIZE); razf_seek(rz, start, SEEK_SET); while(1){ if(end < 0) c = razf_read(rz, buffer, WINDOW_SIZE); else c = razf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); if(c <= 0) break; start += c; write(f_dst, buffer, c); if(end >= 0 && start >= end) break; } free(buffer); if (!pstdout) unlink(argv[optind]); } razf_close(rz); return 0; } }