Пример #1
0
char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len)
{
	int l;
	char c;
    khiter_t iter;
    faidx1_t val;
	char *seq=NULL;

    // Adjust position
    iter = kh_get(s, fai->hash, c_name);
    if(iter == kh_end(fai->hash)) return 0;
    val = kh_value(fai->hash, iter);
	if(p_end_i < p_beg_i) p_beg_i = p_end_i;
    if(p_beg_i < 0) p_beg_i = 0;
    else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
    if(p_end_i < 0) p_end_i = 0;
    else if(val.len <= p_end_i) p_end_i = val.len - 1;

    // Now retrieve the sequence 
	l = 0;
	seq = (char*)malloc(p_end_i - p_beg_i + 2);
	razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
	while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1)
		if (isgraph(c)) seq[l++] = c;
	seq[l] = '\0';
	*len = l;
	return seq;
}
Пример #2
0
char *fai_fetch(const faidx_t *fai, const char *str, int *len)
{
	char *s, *p, c;
	int i, l, k;
	khiter_t iter;
	faidx1_t val;
	khash_t(s) *h;
	int beg, end;

	beg = end = -1;
	h = fai->hash;
	l = strlen(str);
	p = s = (char*)malloc(l+1);
	/* squeeze out "," */
	for (i = k = 0; i != l; ++i)
		if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i];
	s[k] = 0;
	for (i = 0; i != k; ++i) if (s[i] == ':') break;
	s[i] = 0;
	iter = kh_get(s, h, s); /* get the ref_id */
	if (iter == kh_end(h)) {
		*len = 0;
		fprintf(stderr,"faidx fetch %s failed",str);
		free(s); return 0;
	}
	val = kh_value(h, iter);
	if (i == k) { /* dump the whole sequence */
		beg = 0; end = val.len;
	} else {
		for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
		beg = atoi(p);
		if (i < k) {
			p = s + i + 1;
			end = atoi(p);
		} else end = val.len;
	}
	if (beg > 0) --beg;
	if (beg >= val.len) beg = val.len;
	if (end >= val.len) end = val.len;
	if (beg > end) beg = end;
	free(s);

	// now retrieve the sequence
	l = 0;
	s = (char*)malloc(end - beg + 2);
	razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
	while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err)
		if (isgraph(c)) s[l++] = c;
	s[l] = '\0';
	*len = l;
	return s;
}
Пример #3
0
char* faidx_fetch_seq_forced_lower( const faidx_t* fai, const char *c_name, int p_beg_i, int p_end_i )
{
	int l;
	char c;
    khiter_t iter;
    faidx1_t val;
    char* seq0;
    char* seq = NULL;

    iter = kh_get(s, fai->hash, c_name);
    if(iter == kh_end(fai->hash)) return 0;

    seq0 = seq = (char*)malloc( (p_end_i - p_beg_i + 2) * sizeof(char) );
    if( seq0 == NULL ) fail( "Out of memory.\n" );
    seq0[p_end_i-p_beg_i+1] = '\0';

    val = kh_value(fai->hash, iter);

    /* entirely off the map: all Ns */
    if( p_beg_i >= (int)val.len || p_end_i < 0 ) {
        while( p_beg_i <= p_end_i ) {
            *seq++ ='n';
        }
        return seq0;
    }

    /* beginning is off the map */
    while( p_beg_i < 0 && p_beg_i <= p_end_i ) {
        *seq++ = 'n';
        p_beg_i++;
    }

    /* end is off the map */
    while( p_end_i >= (int)val.len ) {
        seq[p_end_i-p_beg_i] = 'n';
        p_end_i--;
    }

    /* retrieve the sequence */
	l = 0;
	razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
	while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1)
		if (isgraph(c)) seq[l++] = tolower(c);
    
    while( p_beg_i+l <= p_end_i ) seq[l++] = 'n';

    return seq0;
}
Пример #4
0
char *fai_fetch(const faidx_t *fai, const char *str, int *len)
{
	char *s, c;
	int i, l, k, name_end;
	khiter_t iter;
	faidx1_t val;
	khash_t(s) *h;
	int beg, end;

	beg = end = -1;
	h = fai->hash;
	name_end = l = strlen(str);
	s = (char*)malloc(l+1);
	// remove space
	for (i = k = 0; i < l; ++i)
		if (!isspace(str[i])) s[k++] = str[i];
	s[k] = 0; l = k;
	// determine the sequence name
	for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
	if (i >= 0) name_end = i;
	if (name_end < l) { // check if this is really the end
		int n_hyphen = 0;
		for (i = name_end + 1; i < l; ++i) {
			if (s[i] == '-') ++n_hyphen;
			else if (!isdigit(s[i]) && s[i] != ',') break;
		}
		if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
		s[name_end] = 0;
		iter = kh_get(s, h, s);
		if (iter == kh_end(h)) { // cannot find the sequence name
			iter = kh_get(s, h, str); // try str as the name
			if (iter == kh_end(h)) {
				*len = 0;
			free(s); return 0;
			} else s[name_end] = ':', name_end = l;
		}
	} else iter = kh_get(s, h, str);
	if(iter == kh_end(h)) {
		fprintf(pysamerr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
		free(s);
		return 0;
	};
	val = kh_value(h, iter);
	// parse the interval
	if (name_end < l) {
		for (i = k = name_end + 1; i < l; ++i)
			if (s[i] != ',') s[k++] = s[i];
		s[k] = 0;
		beg = atoi(s + name_end + 1);
		for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
		end = i < k? atoi(s + i + 1) : val.len;
		if (beg > 0) --beg;
	} else beg = 0, end = val.len;
	if (beg >= val.len) beg = val.len;
	if (end >= val.len) end = val.len;
	if (beg > end) beg = end;
	free(s);

	// now retrieve the sequence
	l = 0;
	s = (char*)malloc(end - beg + 2);
	razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
	while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err)
		if (isgraph(c)) s[l++] = c;
	s[l] = '\0';
	*len = l;
	return s;
}
Пример #5
0
int main(int argc, char **argv)
{
	int c, compress, pstdout, is_forced;
	RAZF *rz;
	void *buffer;
	long start, end, size;

	compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
	while((c  = getopt(argc, argv, "cdlhfb:s:")) >= 0){
		switch(c){
		case 'h': return razf_main_usage();
		case 'd': compress = 0; break;
		case 'c': pstdout = 1; break;
		case 'l': compress = 2; break;
		case 'b': start = atol(optarg); break;
		case 's': size = atol(optarg); break;
		case 'f': is_forced = 1; break;
		}
	}
	if (size >= 0) end = start + size;
	if(end >= 0 && end < start){
		fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end);
		return 1;
	}
	if(compress == 1){
		int f_src, f_dst = -1;
		if(argc > optind){
			if((f_src = open(argv[optind], O_RDONLY)) < 0){
				fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
				return 1;
			}
			if(pstdout){
				f_dst = fileno(stdout);
			} else {
				char *name = malloc(sizeof(strlen(argv[optind]) + 5));
				strcpy(name, argv[optind]);
				strcat(name, ".rz");
				f_dst = write_open(name, is_forced);
				if (f_dst < 0) return 1;
				free(name);
			}
		} else if(pstdout){ 
			f_src = fileno(stdin);
			f_dst = fileno(stdout);
		} else return razf_main_usage();
		rz = razf_dopen(f_dst, "w");
		buffer = malloc(WINDOW_SIZE);
		while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c);
		razf_close(rz); // f_dst will be closed here
		if (argc > optind && !pstdout) unlink(argv[optind]);
		free(buffer);
		close(f_src);
		return 0;
	} else {
		if(argc <= optind) return razf_main_usage();
		if(compress == 2){
			rz = razf_open(argv[optind], "r");
			if(rz->file_type == FILE_TYPE_RZ) {
							printf("%20s%20s%7s %s\n", "compressed", "uncompressed", "ratio", "name");
				printf("%20lld%20lld%6.1f%% %s\n", (long long)rz->end, (long long)rz->src_end, rz->end * 100.0f / rz->src_end,
					   argv[optind]);
			} else fprintf(stdout, "%s is not a regular rz file\n", argv[optind]);
		} else {
			int f_dst;
			if (argc > optind && !pstdout) {
				char *name;
				if (strstr(argv[optind], ".rz") - argv[optind] != strlen(argv[optind]) - 3) {
					printf("razip: %s: unknown suffix -- ignored\n", argv[optind]);
					return 1;
				}
				name = strdup(argv[optind]);
				name[strlen(name) - 3] = '\0';
				f_dst = write_open(name, is_forced);
				free(name);
			} else f_dst = fileno(stdout);
			rz = razf_open(argv[optind], "r");
			buffer = malloc(WINDOW_SIZE);
			razf_seek(rz, start, SEEK_SET);
			while(1){
				if(end < 0) c = razf_read(rz, buffer, WINDOW_SIZE);
				else c = razf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
				if(c <= 0) break;
				start += c;
				write(f_dst, buffer, c);
				if(end >= 0 && start >= end) break;
			}
			free(buffer);
			if (!pstdout) unlink(argv[optind]);
		}
		razf_close(rz);
		return 0;
	}
}