예제 #1
0
/*
 * Scan a FASTQ stream and fill `table` with one file position per record.
 *
 * For every '@' found, the position reported by bgzf_tell() immediately
 * after that character is stored in table[id], where id is the value
 * cmph_search() returns for the line read by read_one_line().
 *
 * table      - output array; must be large enough for every id the hash
 *              can return (not checked here).
 * hash       - perfect hash used to map a header line to a table slot.
 * fastq_file - BGZF handle, read sequentially from its current position.
 *
 * The loop ends at end of input / read error, when bgzf_tell() yields -1,
 * or when read_one_line() returns anything other than 1.
 */
void
populate_index(uint64_t *table, cmph_t *hash, BGZF *fastq_file)
{
    while( 1 )
    {
        /* Find @ */
        /* bgzf_getc() returns an int: a byte value, or a negative number at
         * end of file / on error.  Keeping it in a plain char either makes
         * the `c >= 0` test always true (unsigned char platforms) or turns
         * bytes >= 0x80 into false end-of-file markers (signed char). */
        int c;
        while( ( c = bgzf_getc( fastq_file ) ) != '@' && c >= 0 )
        {
        }

        if( c < 0 )
        {
            /* No further '@' in the stream. */
            break;
        }

        /* 64-bit on every platform; `long` may be only 32 bits wide. */
        int64_t pos = bgzf_tell( fastq_file );
        if( pos == -1 )
        {
            break;
        }

        /* NOTE(review): read_one_line() receives a NULL pointer and hands a
         * buffer back; if that buffer is heap-allocated it is never released
         * here - confirm ownership against read_one_line(). */
        char *accession = NULL;
        cmph_uint32 accession_length;
        if( read_one_line( &accession, &accession_length, fastq_file ) != 1 )
        {
            break;
        }

        /* pos was captured right after '@', i.e. before the header line was
         * consumed, so it points at the start of the header text. */
        unsigned int id = cmph_search( hash, accession, accession_length );
        table[ id ] = (uint64_t) pos;
    }
}
예제 #2
0
파일: faidx.c 프로젝트: pkrusche/vt
/*
 * Fetch the bases [p_beg_i, p_end_i] (0-based, inclusive) of sequence
 * `c_name` from an indexed FASTA file.
 *
 * fai      - index holding the name hash and the BGZF handle to read from.
 * c_name   - sequence name to look up in fai->hash.
 * p_beg_i  - first position; clamped to [0, val.len - 1].
 * p_end_i  - last position; clamped to [0, val.len - 1].  If it is smaller
 *            than p_beg_i, p_beg_i is lowered to p_end_i.
 * len      - out: number of bases stored in the returned string, or -1 when
 *            seeking or allocating fails.  Left untouched when c_name is
 *            not in the index.
 *
 * Returns a malloc'ed, NUL-terminated string the caller must free(), or
 * NULL when the name is unknown, the seek fails or memory runs out.
 */
char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
{
    int l;
    /* int, not char: bgzf_getc() signals end of data with a negative value,
     * which cannot be told apart from a byte once narrowed to char. */
    int c;
    khiter_t iter;
    faidx1_t val;
    char *seq = NULL;

    // Adjust position
    iter = kh_get(s, fai->hash, c_name);
    if (iter == kh_end(fai->hash)) return 0;
    val = kh_value(fai->hash, iter);
    if (p_end_i < p_beg_i) p_beg_i = p_end_i;
    if (p_beg_i < 0) p_beg_i = 0;
    else if (val.len <= p_beg_i) p_beg_i = val.len - 1;
    if (p_end_i < 0) p_end_i = 0;
    else if (val.len <= p_end_i) p_end_i = val.len - 1;

    // Now retrieve the sequence
    // Whole lines before p_beg_i are skipped using the on-disk line width
    // (line_len); the remainder within the line uses the base count.
    int ret = bgzf_useek(fai->bgzf, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
    if ( ret<0 )
    {
        *len = -1;
        fprintf(stderr, "[fai_fetch_seq] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
        return NULL;
    }
    l = 0;
    // Room for every requested base plus the terminating NUL.
    seq = (char*)malloc(p_end_i - p_beg_i + 2);
    if (seq == NULL)
    {
        *len = -1;
        fprintf(stderr, "[fai_fetch_seq] Error: out of memory.\n");
        return NULL;
    }
    // Line breaks and other non-printing bytes are skipped, not stored.
    while ( (c=bgzf_getc(fai->bgzf))>=0 && l < p_end_i - p_beg_i + 1)
        if (isgraph(c)) seq[l++] = (char)c;
    seq[l] = '\0';
    *len = l;
    return seq;
}
예제 #3
0
/*
 * Key-reader callback: advance to the next '@' in a BGZF stream and hand
 * back the line that follows it as a key.
 *
 * data   - the BGZF handle, passed as an opaque pointer.
 * key    - out: set to the buffer produced by read_one_line(), or NULL.
 * keylen - out: length reported by read_one_line(); 0 if no '@' was found.
 *
 * Returns the key length on success, -1 when no further '@' exists or
 * read_one_line() does not return 1.
 */
int key_fastq_read(void *data, char **key, cmph_uint32 *keylen)
{
    BGZF *fp = (BGZF *) data;

    /* Must be int: bgzf_getc() reports end of file / errors as negative
     * values, which a plain char cannot represent reliably (always
     * non-negative if char is unsigned, aliased with bytes >= 0x80 if
     * signed). */
    int c;
    *keylen = 0;
    *key = NULL;

    /* Find header start */
    while( ( c = bgzf_getc( fp ) ) != '@' && c >= 0 )
    { }

    if( c < 0 )
    {
        /* Input exhausted before another header was seen. */
        return -1;
    }

    if( read_one_line( key, keylen, fp ) == 1 )
    {
        return (int) *keylen;
    }
    else
    {
        return -1;
    }
}
예제 #4
0
/*
 * fgets()-like line reader on top of bgzf_getc().
 *
 * Copies bytes into `buffer` until a '\n' has been stored, buffer_size - 1
 * bytes have been stored, or bgzf_getc() returns a value <= 0 (a NUL byte,
 * end of file or an error).  The result is always NUL-terminated when
 * buffer_size > 0.
 *
 * Returns `buffer` when at least one byte was stored and the last value
 * obtained from bgzf_getc() was not -1; NULL otherwise.  Note that this
 * means a final line that is cut short by a -1 result yields NULL even
 * though its bytes are left in `buffer`.
 */
char *
bgzf_fgets(char *buffer, size_t buffer_size, BGZF *fp)
{
    /* int, so that negative status codes from bgzf_getc() stay distinct
     * from data bytes >= 0x80. */
    int c = 0;
    size_t i = 0;

    if( buffer_size == 0 )
    {
        /* buffer_size - 1 would wrap around to SIZE_MAX and the loop would
         * write past the (empty) buffer. */
        return NULL;
    }

    while( i < buffer_size - 1 && ((c = bgzf_getc( fp )) > 0) )
    {
        buffer[ i++ ] = (char) c;
        if( c == '\n' )
        {
            break;
        }
    }
    buffer[ i ] = '\0';

    if( i > 0 && c != -1 )
    {
        return buffer;
    }
    else
    {
        return NULL;
    }
}
예제 #5
0
파일: faidx.c 프로젝트: mtmorgan/Rhtslib
/*
 * Build a FASTA index by reading `bgzf` one character at a time.
 *
 * For each '>' header the name (the characters up to the first whitespace)
 * is collected; the sequence lines that follow are measured, and the entry
 * (name, len, line_len, line_blen, offset) is handed to fai_insert_index()
 * when the next header or the end of input is reached.
 *
 * Returns the new index, or 0 after printing a message to stderr when the
 * last header has no sequence, when line lengths differ inside a sequence,
 * or when an empty line appears inside a sequence.
 *
 * `state` tracks the line layout of the current sequence:
 *   1 - a header line has just been consumed; the next sequence line
 *       defines line_len / line_blen
 *   0 - reference line lengths are known (also the initial value)
 *   2 - a line whose length differs from the reference was seen; only
 *       acceptable if it turns out to be the last line of the sequence
 *   3 - more data followed a state-2 line
 */
faidx_t *fai_build_core(BGZF *bgzf)
{
    char *name;
    int c;
    int l_name, m_name;
    int line_len, line_blen, state;
    int l1, l2;
    faidx_t *idx;
    uint64_t offset;
    int64_t len;

    /* NOTE(review): neither allocation below is checked for failure. */
    idx = (faidx_t*)calloc(1, sizeof(faidx_t));
    name = (char*)calloc(1, sizeof(char)); /* at least 1 byte, for '\0' */
    idx->hash = kh_init(s);
    l_name = m_name = 0;
    /* len < 0 means "no header seen yet". */
    len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
    while ( (c=bgzf_getc(bgzf))>=0 ) {
        if (c == '\n') { // an empty line
            if (state == 1) {
                /* Blank line directly after the header: the sequence starts
                 * after it, so move the recorded offset forward. */
                offset = bgzf_utell(bgzf);
                continue;
            } else if ((state == 0 && len < 0) || state == 2) continue;
            /* Any other empty line falls through and is handled as a
             * sequence line below. */
        }
        if (c == '>') { // fasta header
            /* Flush the previous sequence, if there was one. */
            if (len >= 0)
                fai_insert_index(idx, name, len, line_len, line_blen, offset);
            l_name = 0;
            while ( (c=bgzf_getc(bgzf))>=0 && !isspace(c)) {
                /* Keep room for the next character and the trailing '\0'. */
                if (m_name < l_name + 2) {
                    m_name = l_name + 2;
                    kroundup32(m_name);
                    /* NOTE(review): a failing realloc() overwrites `name`
                     * with NULL, which is dereferenced right below. */
                    name = (char*)realloc(name, m_name);
                }
                name[l_name++] = c;
            }
            name[l_name] = '\0';
            if ( c<0 ) {
                fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
                free(name); fai_destroy(idx);
                return 0;
            }
            /* Skip the remainder of the header line (the description). */
            if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
            state = 1; len = 0;
            /* Position of the first byte after the header line. */
            offset = bgzf_utell(bgzf);
        } else {
            if (state == 3) {
                fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
                free(name); fai_destroy(idx);
                return 0;
            }
            if (state == 2) state = 3;
            /* l1 counts every byte of this line, l2 only the printable,
             * non-space ones (isgraph). */
            l1 = l2 = 0;
            do {
                ++l1;
                if (isgraph(c)) ++l2;
            } while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
            if (state == 3 && l2) {
                fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
                free(name); fai_destroy(idx);
                return 0;
            }
            /* The extra ++l1 accounts for the line terminator. */
            ++l1; len += l2;
            if (state == 1) line_len = l1, line_blen = l2, state = 0;
            else if (state == 0) {
                if (l1 != line_len || l2 != line_blen) state = 2;
            }
        }
    }
    /* Flush the final sequence.
     * NOTE(review): unlike the in-loop call this one is not guarded by
     * `len >= 0`, so input without any header still inserts an entry.
     * NOTE(review): `name` can only be NULL here if an allocation failed;
     * the else branch frees idx but not idx->hash - confirm intent. */
    if ( name )
        fai_insert_index(idx, name, len, line_len, line_blen, offset);
    else
    {
        free(idx);
        return NULL;
    }
    free(name);
    return idx;
}
예제 #6
0
파일: faidx.c 프로젝트: mtmorgan/Rhtslib
/*
 * Fetch the sequence described by a region string such as "name",
 * "name:beg" or "name:beg-end" (1-based, commas and whitespace ignored).
 *
 * fai - index holding the name hash and the BGZF handle to read from.
 * str - region string.  If the part after the last ':' is not a valid
 *       interval, or the part before it is not a known name, the whole
 *       string is tried as a sequence name.
 * len - out: number of bases returned on success;
 *            0  if neither the parsed name nor the whole string is a
 *               known sequence (region-style input),
 *           -2  if a plain name is not found (a warning is printed),
 *           -1  if seeking or memory allocation fails.
 *
 * Returns a malloc'ed, NUL-terminated string the caller must free(), or
 * NULL on any of the failures above.
 */
char *fai_fetch(const faidx_t *fai, const char *str, int *len)
{
    char *s;
    int c, i, l, k, name_end;
    khiter_t iter;
    faidx1_t val;
    khash_t(s) *h;
    int beg, end;

    beg = end = -1;
    h = fai->hash;
    name_end = l = (int)strlen(str);
    s = (char*)malloc(l+1);
    if (s == NULL) {
        *len = -1;
        return NULL;
    }
    // remove space
    // (the <ctype.h> functions are only defined for unsigned char values
    // and EOF, hence the casts)
    for (i = k = 0; i < l; ++i)
        if (!isspace((unsigned char)str[i])) s[k++] = str[i];
    s[k] = 0; l = k;
    // determine the sequence name
    for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
    if (i >= 0) name_end = i;
    if (name_end < l) { // check if this is really the end
        int n_hyphen = 0;
        for (i = name_end + 1; i < l; ++i) {
            if (s[i] == '-') ++n_hyphen;
            else if (!isdigit((unsigned char)s[i]) && s[i] != ',') break;
        }
        if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
        s[name_end] = 0;
        iter = kh_get(s, h, s);
        if (iter == kh_end(h)) { // cannot find the sequence name
            iter = kh_get(s, h, str); // try str as the name
            if (iter == kh_end(h)) {
                *len = 0;
                free(s);
                return 0;
            } else s[name_end] = ':', name_end = l;
        }
    } else iter = kh_get(s, h, str);
    if (iter == kh_end(h)) {
        fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
        free(s);
        *len = -2;
        return 0;
    }
    val = kh_value(h, iter);
    // parse the interval
    if (name_end < l) {
        // drop thousands separators in place
        for (i = k = name_end + 1; i < l; ++i)
            if (s[i] != ',') s[k++] = s[i];
        s[k] = 0;
        beg = atoi(s + name_end + 1);
        for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
        end = i < k? atoi(s + i + 1) : val.len;
        if (beg > 0) --beg; // 1-based input -> 0-based start
    } else beg = 0, end = val.len;
    // "name:-5" makes atoi() return a negative start, which would seek to
    // a position before the sequence; treat it as "from the beginning".
    if (beg < 0) beg = 0;
    if (beg >= val.len) beg = val.len;
    if (end >= val.len) end = val.len;
    if (beg > end) beg = end;
    free(s);

    // now retrieve the sequence
    // Whole lines before beg are skipped using the on-disk line width
    // (line_len); the remainder within the line uses the base count.
    int ret = bgzf_useek(fai->bgzf, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
    if ( ret<0 )
    {
        *len = -1;
        fprintf(stderr, "[fai_fetch] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
        return NULL;
    }
    l = 0;
    s = (char*)malloc(end - beg + 2);
    if (s == NULL) {
        *len = -1;
        return NULL;
    }
    // Line breaks and other non-printing bytes are skipped, not stored.
    while ( (c=bgzf_getc(fai->bgzf))>=0 && l < end - beg )
        if (isgraph(c)) s[l++] = (char)c;
    s[l] = '\0';
    *len = l;
    return s;
}
예제 #7
0
/*
 * OCaml stub around bgzf_getc().
 *
 * Unwraps the BGZF handle from the OCaml value with BGZF_val(), reads one
 * value with bgzf_getc() and returns it to OCaml as an integer via
 * Val_int().  Whatever bgzf_getc() yields - including its negative
 * end-of-data results - is passed through unchanged.
 */
value caml_bgzf_getc(value bgzf) {
	CAMLparam1(bgzf);
	int ch = bgzf_getc(BGZF_val(bgzf));
	CAMLreturn(Val_int(ch));
}