Exemple #1
0
int beds_database_add(struct beds_options *opts, const char *fname, char *columns)
{
    if ( opts->n_files == opts->m_files ) {
	opts->m_files = opts->m_files == 0 ? 2 : opts->m_files +2;
	opts->files = (struct beds_anno_file*)realloc(opts->files, opts->m_files*sizeof(struct beds_anno_file));	
    }
    struct beds_anno_file *file = &opts->files[opts->n_files];
    memset(file, 0, sizeof(struct beds_anno_file));
    file->id = opts->n_files;
    file->fname = strdup(fname);
    file->fp = hts_open(fname, "r");
    if (file->fp == NULL)
	error("Failed to open %s : %s", fname, strerror(errno));
    // int n;
    file->idx = tbx_index_load(fname);
    if ( file->idx == NULL)
	error("Failed to load index of %s.", fname);
    opts->n_files++;
    
    file->last_id = -1;
    file->last_start = -1;
    file->last_end = -1;
    kstring_t string = KSTRING_INIT;
    int no_columns = 0;
    int i;
    if ( columns == NULL && file->no_such_chrom == 0) {
	warnings("No columns string specified for %s. Will annotate all tags in this data.", fname);
        file->no_such_chrom = 1;
	no_columns = 1;
    } else {
	int *splits = NULL;
	kputs(columns, &string);
	int nfields;
	splits = ksplit(&string, ',', &nfields);
	file->m_cols = nfields;
	file->cols = (struct anno_col*)malloc(sizeof(struct anno_col) * file->m_cols);

	for ( i = 0; i < nfields; ++i ) {
	    char *ss = string.s + splits[i];
	    struct anno_col *col = &file->cols[file->n_cols];
	    col->icol = i;
	    col->replace = REPLACE_MISSING;
	    if (*ss == '+') {
		col->replace = REPLACE_MISSING;
		ss++;
	    } else if ( *ss == '-' ) {
		col->replace = REPLACE_EXISTING;
		ss++;
	    }
	    if (ss[0] == '\0')
		continue;
	    if ( strncmp(ss, "INFO/", 5) == 0)
		ss += 5;
	    col->hdr_key = strdup(ss);	    
	    col->icol = -1;
	    // debug_print("%s, %d", col->hdr_key, file->n_cols);
	    file->n_cols++;	    
	}
	string.l = 0;	    
    }

    while (1) {
	string.l =0;
	if ( hts_getline(file->fp, KS_SEP_LINE, &string) < 0 )
	    break;
	// only accept header line in the beginning for file
	if ( string.s[0] != '#' )
	    break;
	if ( strncmp(string.s, "##INFO=", 7) == 0) {
	    char *ss = string.s + 11;
	    char *se = ss;
	    while (se && *se != ',') se++;
	    struct anno_col *col = NULL;
	    // if no column string specified, init all header lines
	    if ( no_columns ) {
		if ( file->n_cols == file->m_cols ) {
		    file->m_cols = file->m_cols == 0 ? 2 : file->m_cols + 2;
		    file->cols = (struct anno_col *) realloc(file->cols, file->m_cols*sizeof(struct anno_col));
		}
		col = &file->cols[file->n_cols++];
		col->icol = -1;
		col->hdr_key = strndup(ss, se-ss+1);
		col->hdr_key[se-ss] = '\0';
	    } else {
		for ( i = 0; i < file->n_cols; ++i ) {		    
		    if ( strncmp(file->cols[i].hdr_key, ss, se-ss) == 0)
			break;
		}
		// if header line is not set in the column string, skip
		if ( i == file->n_cols )
		    continue;
		col = &file->cols[i];
	    }

	    // specify setter functions here
	    col->setter.bed = beds_setter_info_string;
	    
	    bcf_hdr_append(opts->hdr_out, string.s);
	    bcf_hdr_sync(opts->hdr_out);
	    int hdr_id = bcf_hdr_id2int(opts->hdr_out, BCF_DT_ID,col->hdr_key);
	    assert ( bcf_hdr_idinfo_exists(opts->hdr_out, BCF_HL_INFO, hdr_id) );
	}
	string.l = 0;
	// set column number for each col
	if ( strncasecmp(string.s, "#chr", 4) == 0) {
	    int nfields;	    
	    int *splits = ksplit(&string, '\t', &nfields);

	    if (nfields < 4) {
		fprintf(stderr, "[error] Bad header of bed database : %s. n_fields : %d, %s", fname, nfields, string.s);
		fprintf(stderr, "[notice] this error usually happened because the header line is seperated by spaces but not tab!");
		exit(1);
	    }
	    int k;
	    for ( k = 3; k < nfields; ++k ) {
		char *ss = string.s + splits[k];
		for (i = 0; i < file->n_cols; ++i ) {
		    struct anno_col *col = &file->cols[i];
		    if ( strcmp(col->hdr_key, ss) == 0)
			break;
		}
		// if name line specify more names than column string or header, skip
		if ( i == file->n_cols )
		    continue;

		struct anno_col *col = &file->cols[i];
		col->icol = k;
	    }
	}
    }
    for ( i = 0; i < file->n_cols; ++i ) {
	struct anno_col *col = &file->cols[i];
	if ( col->hdr_key && col->icol == -1 )
	    error("No column %s found in bed database : %s", col->hdr_key, fname);

	int hdr_id = bcf_hdr_id2int(opts->hdr_out, BCF_DT_ID, col->hdr_key);
        assert(hdr_id>-1);
	col->number = bcf_hdr_id2length(opts->hdr_out, BCF_HL_INFO, hdr_id);
	if ( col->number == BCF_VL_A || col->number == BCF_VL_R || col->number == BCF_VL_G)
	    error("Only support fixed INFO number for bed database. %s", col->hdr_key);
	col->ifile = file->id;
    }
    if ( string.m )
	free(string.s);
    if ( opts->beds_is_inited == 0 )
	opts->beds_is_inited = 1;
    return 0;
}
Exemple #2
0
/*
 * Remove alleles from a VCF/BCF record and trim every per-allele annotation.
 *
 * header  : header describing `line`
 * line    : record to modify in place (ALT, INFO, GT and FORMAT are updated)
 * rm_mask : bit i set means allele i (0 = REF, 1 = first ALT, ...) is removed.
 *           Bit 0 is expected to be clear; the REF allele is always kept.
 *
 * INFO and FORMAT tags declared with Number=A, Number=R or Number=G are
 * compacted so that only the values of the remaining alleles are kept.
 * Number=G values are interpreted as diploid (or haploid for FORMAT) genotype
 * orderings.  Genotype allele indexes are renumbered through `map`.
 *
 * The function returns nothing; unrecoverable htslib errors print a message
 * and call exit(1).  If no allele is selected by rm_mask the record is left
 * untouched.
 */
void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
{
    int *map = (int*) calloc(line->n_allele, sizeof(int));

    // create map of indexes from old to new ALT numbering and modify ALT
    kstring_t str = {0,0,0};
    kputs(line->d.allele[0], &str);

    int nrm = 0, i,j;  // i: ori alleles, j: new alleles
    for (i=1, j=1; i<line->n_allele; i++)
    {
        if ( rm_mask & 1<<i )
        {
            // remove this allele
            line->d.allele[i] = NULL;
            nrm++;
            continue;
        }
        kputc(',', &str);
        kputs(line->d.allele[i], &str);
        map[i] = j;
        j++;
    }
    if ( !nrm ) { free(map); free(str.s); return; }

    // Expected vector lengths before (ori) and after (new) the removal
    int nR_ori = line->n_allele;
    int nR_new = line->n_allele-nrm;
    assert(nR_new > 0); // should not be able to remove reference allele
    int nA_ori = nR_ori-1;
    int nA_new = nR_new-1;

    int nG_ori = nR_ori*(nR_ori + 1)/2;
    int nG_new = nR_new*(nR_new + 1)/2;

    bcf_update_alleles_str(header, line, str.s);

    // remove from Number=G, Number=R and Number=A INFO fields.
    // `dat` is one scratch buffer shared by all bcf_get_* calls; mdat_bytes
    // tracks its capacity in bytes so it can be reused across value types.
    uint8_t *dat = NULL;
    int mdat = 0, ndat = 0, mdat_bytes = 0, nret;
    for (i=0; i<line->n_info; i++)
    {
        bcf_info_t *info = &line->d.info[i];
        int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key);

        if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change

        int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key);
        if ( type==BCF_HT_FLAG ) continue;
        int size = 1;
        if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;

        mdat = mdat_bytes / size;
        nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type);
        mdat_bytes = mdat * size;
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
        if ( type==BCF_HT_STR )
        {
            // String values: walk the comma separated fields [ss,se) and copy
            // the ones that survive into `str`.
            str.l = 0;
            char *ss = (char*) dat, *se = (char*) dat;
            if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
            {
                int nexp, inc = 0;
                if ( vlen==BCF_VL_A )
                {
                    nexp = nA_ori;
                    inc  = 1;   // Number=A has no REF entry: field j belongs to allele j+1
                }
                else
                    nexp = nR_ori;
                for (j=0; j<nexp; j++)
                {
                    if ( !*se ) break;
                    while ( *se && *se!=',' ) se++;
                    if ( rm_mask & 1<<(j+inc) )
                    {
                        if ( *se ) se++;
                        ss = se;
                        continue;
                    }
                    if ( str.l ) kputc(',',&str);
                    kputsn(ss,se-ss,&str);
                    if ( *se ) se++;
                    ss = se;
                }
                assert( j==nexp );
            }
            else    // Number=G, assuming diploid genotype
            {
                int k = 0, n = 0;
                for (j=0; j<nR_ori; j++)
                {
                    for (k=0; k<=j; k++)
                    {
                        if ( !*se ) break;
                        while ( *se && *se!=',' ) se++;
                        n++;
                        if ( rm_mask & 1<<j || rm_mask & 1<<k )
                        {
                            if ( *se ) se++;
                            ss = se;
                            continue;
                        }
                        if ( str.l ) kputc(',',&str);
                        kputsn(ss,se-ss,&str);
                        if ( *se ) se++;
                        ss = se;
                    }
                    if ( !*se ) break;
                }
                // compare, do not assign: every genotype value must have been seen
                assert( n==nG_ori );
            }

            nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type);
            if ( nret<0 )
            {
                fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                        bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
                exit(1);
            }
            continue;
        }

        // Numeric values: compact the array in place inside `dat`.
        if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
        {
            int inc = 0, ntop;
            if ( vlen==BCF_VL_A )
            {
                assert( nret==nA_ori );
                ntop = nA_ori;
                ndat = nA_new;
                inc  = 1;
            }
            else
            {
                assert( nret==nR_ori );
                ntop = nR_ori;
                ndat = nR_new;
            }
            int k = 0;

            #define BRANCH(type_t,is_vector_end) \
            { \
                type_t *ptr = (type_t*) dat; \
                int size = sizeof(type_t); \
                for (j=0; j<ntop; j++) /* j:ori, k:new */ \
                { \
                    if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \
                    if ( rm_mask & 1<<(j+inc) ) continue; \
                    if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \
                    k++; \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break;
            }
            #undef BRANCH
        }
        else    // Number=G
        {
            assert( nret==nG_ori );
            int k, l_ori = -1, l_new = 0;
            ndat = nG_new;

            #define BRANCH(type_t,is_vector_end) \
            { \
                type_t *ptr = (type_t*) dat; \
                int size = sizeof(type_t); \
                for (j=0; j<nR_ori; j++) \
                { \
                    for (k=0; k<=j; k++) \
                    { \
                        l_ori++; \
                        if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \
                        if ( rm_mask & 1<<j || rm_mask & 1<<k ) continue; \
                        if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \
                        l_new++; \
                    } \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break;
            }
            #undef BRANCH
        }

        nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type);
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
    }

    // Update GT fields, the allele indexes might have changed
    // NOTE(review): line->n_allele is already the reduced count here, so only
    // the first nR_new entries of map are inspected - confirm this is intended.
    for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break;
    if ( i<line->n_allele )
    {
        mdat = mdat_bytes / 4;  // sizeof(int32_t)
        nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat);
        mdat_bytes = mdat * 4;
        if ( nret>0 )
        {
            nret /= line->n_sample;     // values per sample
            int32_t *ptr = (int32_t*) dat;
            for (i=0; i<line->n_sample; i++)
            {
                for (j=0; j<nret; j++)
                {
                    if ( ptr[j]==bcf_gt_missing ) continue;
                    if ( ptr[j]==bcf_int32_vector_end ) break;
                    int al = bcf_gt_allele(ptr[j]);
                    assert( al<nR_ori && map[al]>=0 );
                    // re-encode the new allele index, keeping the phase bit
                    ptr[j] = (map[al]+1)<<1 | (ptr[j]&1);
                }
                ptr += nret;
            }
            bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample);
        }
    }

    // Remove from Number=G, Number=R and Number=A FORMAT fields.
    // Assuming haploid or diploid GTs
    for (i=0; i<line->n_fmt; i++)
    {
        bcf_fmt_t *fmt = &line->d.fmt[i];
        int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id);

        if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change

        int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id);
        if ( type==BCF_HT_FLAG ) continue;

        int size = 1;
        if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;

        mdat = mdat_bytes / size;
        nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type);
        mdat_bytes = mdat * size;
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }

        if ( type==BCF_HT_STR )
        {
            int size = nret/line->n_sample;     // number of bytes per sample
            str.l = 0;
            if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
            {
                int nexp, inc = 0;
                if ( vlen==BCF_VL_A )
                {
                    nexp = nA_ori;
                    inc  = 1;
                }
                else
                    nexp = nR_ori;
                for (j=0; j<line->n_sample; j++)
                {
                    char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
                    int k_src = 0, k_dst = 0, l = str.l;
                    for (k_src=0; k_src<nexp; k_src++)
                    {
                        if ( ptr>=se || !*ptr) break;
                        while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                        if ( rm_mask & 1<<(k_src+inc) )
                        {
                            ss = ++ptr;
                            continue;
                        }
                        if ( k_dst ) kputc(',',&str);
                        kputsn(ss,ptr-ss,&str);
                        ss = ++ptr;
                        k_dst++;
                    }
                    assert( k_src==nexp );
                    // pad with NULs so that every sample keeps the same width
                    l = str.l - l;
                    for (; l<size; l++) kputc(0, &str);
                }
            }
            else    // Number=G, diploid or haploid
            {
                for (j=0; j<line->n_sample; j++)
                {
                    char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
                    int k_src = 0, k_dst = 0, l = str.l;
                    int nexp = 0; // diploid or haploid?
                    while ( ptr<se )
                    {
                        if ( !*ptr ) break;
                        if ( *ptr==',' ) nexp++;
                        ptr++;
                    }
                    if ( ptr!=ss ) nexp++;
                    assert( nexp==nG_ori || nexp==nR_ori );
                    ptr = ss;
                    if ( nexp==nG_ori ) // diploid
                    {
                        int ia, ib;
                        for (ia=0; ia<nR_ori; ia++)
                        {
                            for (ib=0; ib<=ia; ib++)
                            {
                                if ( ptr>=se || !*ptr ) break;
                                while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                                if ( rm_mask & 1<<ia || rm_mask & 1<<ib )
                                {
                                    ss = ++ptr;
                                    continue;
                                }
                                if ( k_dst ) kputc(',',&str);
                                kputsn(ss,ptr-ss,&str);
                                ss = ++ptr;
                                k_dst++;
                            }
                            if ( ptr>=se || !*ptr ) break;
                        }
                    }
                    else    // haploid
                    {
                        for (k_src=0; k_src<nR_ori; k_src++)
                        {
                            if ( ptr>=se || !*ptr ) break;
                            while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                            if ( rm_mask & 1<<k_src )
                            {
                                ss = ++ptr;
                                continue;
                            }
                            if ( k_dst ) kputc(',',&str);
                            kputsn(ss,ptr-ss,&str);
                            ss = ++ptr;
                            k_dst++;
                        }
                        assert( k_src==nR_ori );
                    }
                    // Pad both the diploid and the haploid case: samples are
                    // stored back to back and must all have the same width.
                    l = str.l - l;
                    for (; l<size; l++) kputc(0, &str);
                }
            }
            nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type);
            if ( nret<0 )
            {
                fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                        bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
                exit(1);
            }
            continue;
        }

        int nori = nret / line->n_sample;   // values per sample before the removal
        if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G
        {
            int ntop, nnew, inc = 0;
            if ( vlen==BCF_VL_A )
            {
                assert( nori==nA_ori );     // todo: will fail if all values are missing
                ntop = nA_ori;
                nnew = nA_new;
                inc  = 1;
            }
            else
            {
                assert( nori==nR_ori );     // todo: will fail if all values are missing
                ntop = nR_ori;
                nnew = nR_new;
            }
            ndat = nnew*line->n_sample;

            /* The destination stride is nnew, matching ndat above; a fixed nA_new
               stride would make Number=R samples overlap in the output. */
            #define BRANCH(type_t,is_vector_end) \
            { \
                for (j=0; j<line->n_sample; j++) \
                { \
                    type_t *ptr_src = ((type_t*)dat) + j*nori; \
                    type_t *ptr_dst = ((type_t*)dat) + j*nnew; \
                    int size = sizeof(type_t); \
                    int k_src, k_dst = 0; \
                    for (k_src=0; k_src<ntop; k_src++) \
                    { \
                        if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
                        if ( rm_mask & 1<<(k_src+inc) ) continue; \
                        if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                        k_dst++; \
                    } \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
            }
            #undef BRANCH
        }
        else    // Number=G, diploid or mixture of haploid+diploid
        {
            assert( nori==nG_ori );
            ndat = nG_new*line->n_sample;

            #define BRANCH(type_t,is_vector_end) \
            { \
                for (j=0; j<line->n_sample; j++) \
                { \
                    type_t *ptr_src = ((type_t*)dat) + j*nori; \
                    type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \
                    int size = sizeof(type_t); \
                    int ia, ib, k_dst = 0, k_src; \
                    int nset = 0;   /* haploid or diploid? */ \
                    for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \
                    if ( nset==nR_ori ) /* haploid */ \
                    { \
                        for (k_src=0; k_src<nR_ori; k_src++) \
                        { \
                            if ( rm_mask & 1<<k_src ) continue; \
                            if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                            k_dst++; \
                        } \
                        memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                    } \
                    else /* diploid */ \
                    { \
                        k_src = -1; \
                        for (ia=0; ia<nR_ori; ia++) \
                        { \
                            for (ib=0; ib<=ia; ib++) \
                            { \
                                k_src++; \
                                if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \
                                if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) continue; \
                                if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                                k_dst++; \
                            } \
                        } \
                    } \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
            }
            #undef BRANCH
        }
        nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type);
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
    }
    free(dat);
    free(str.s);
    free(map);
}
Exemple #3
0
// only if annotation database is VCF/BCF file, header_in has values or else header_in == NULL
anno_col_t *init_columns(const char *rules, bcf_hdr_t *header_in, bcf_hdr_t *header_out, int *ncols, enum anno_type type)
{
    assert(rules != NULL);
    if (type == anno_is_vcf && header_in == NULL) {
	error("Inconsistent file type!");
    }
    char *ss = (char*)rules, *se = ss;
    int nc = 0;
    anno_col_t *cols = NULL;
    kstring_t tmp = KSTRING_INIT;
    kstring_t str = KSTRING_INIT;
    int i = -1;

    while (*ss) {
	if ( *se && *se!=',' ) {
	    se++;
	    continue;
	}
	int replace = REPLACE_ALL;
	if ( *ss=='+') {
	    replace = REPLACE_MISSING;
	    ss++;
	} else if (*ss=='-') {
	    replace = REPLACE_EXISTING;
	    ss++;
	}
	i++;
	str.l = 0;
	kputsn(ss, se-ss, &str); 
	if ( !str.s[0] ) {
	    warnings("Empty tag in %s", rules);
	} else if ( !strcasecmp("CHROM", str.s) || !strcasecmp("POS", str.s) || !strcasecmp("FROM", str.s) || !strcasecmp("TO", str.s) || !strcasecmp("REF", str.s) || !strcasecmp("ALT", str.s) || !strcasecmp("FILTER", str.s) || !strcasecmp("QUAL", str.s)) {
	    warnings("Skip tag %s", str.s);
	} else if ( !strcasecmp("ID", str.s) ) {
	    nc++;
            cols = (struct anno_col*) realloc(cols, sizeof(struct anno_col)* (nc));
            struct anno_col *col = &cols[nc-1];
            col->icol = i;
            col->replace = replace;
            col->setter = type == anno_is_vcf ? vcf_setter_id : setter_id;
            col->hdr_key = strdup(str.s);
        } else if (!strcasecmp("INFO", str.s) || !strcasecmp("FORMAT", str.s) ) {
	    error("do not support annotate all INFO,FORMAT fields. todo INFO/TAG instead\n");
	} else if (!strncasecmp("FORMAT/", str.s, 7) || !strncasecmp("FMT/", str.s, 4)) {
            char *key = str.s + (!strncasecmp("FMT", str.s, 4) ? 4 : 7);
            if (!strcasecmp("GT", key)) 
		error("It is not allowed to change GT tag.");

	    int hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, str.s);
	    
	    if ( !bcf_hdr_idinfo_exists(header_out, BCF_HL_FMT, hdr_id) ) {
		
		if ( type == anno_is_vcf ) {
		    bcf_hrec_t *hrec = bcf_hdr_get_hrec(header_in, BCF_HL_FMT, "ID", str.s, NULL);
		    if ( !hrec )
			error("The tag \"%s\" is not defined in header: %s\n", str.s, rules);
		    tmp.l = 0;
		    bcf_hrec_format(hrec, &tmp);
		    bcf_hdr_append(header_out, tmp.s);
		    bcf_hdr_sync(header_out);
		    hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, str.s);
		    assert( bcf_hdr_idinfo_exists(header_out, BCF_HL_FMT, hdr_id) );
		} else {
		    error("The tag \"%s\" is not defined in header: %s\n", str.s, rules);
		}
	    }

            //int hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, key);
            nc++;
	    cols = (struct anno_col*) realloc(cols, sizeof(struct anno_col)*(nc));
            struct anno_col *col = &cols[nc-1];
            col->icol = -1;
            col->replace = replace;
            col->hdr_key = strdup(key);

            switch ( bcf_hdr_id2type(header_out, BCF_HL_FMT, hdr_id) ) {

		case BCF_HT_INT:
		    col->setter = type == anno_is_vcf ? vcf_setter_format_int : setter_format_int;
		    break;

		case BCF_HT_REAL:
		    col->setter = type == anno_is_vcf ? vcf_setter_format_real : setter_format_real;
		    break;

		case BCF_HT_STR:
		    col->setter = type == anno_is_vcf ? vcf_setter_format_str : setter_format_str;
		    break;

		default :
		    error("The type of %s not recognised (%d)\n", str.s, bcf_hdr_id2type(header_out, BCF_HL_FMT, hdr_id));
            }

	} else if ( !strncasecmp("INFO/", str.s, 5) ) {
	    memmove(str.s, str.s+5, str.l-4);
	    str.l -= 4;

	    int hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, str.s);

	    if ( !bcf_hdr_idinfo_exists(header_out, BCF_HL_INFO, hdr_id) ) {
		if ( type == anno_is_vcf ) {
		    bcf_hrec_t *hrec = bcf_hdr_get_hrec(header_in, BCF_HL_INFO, "ID", str.s, NULL);

		    if ( !hrec )
			error("The tag \"%s\" is not defined in header: %s\n", str.s, rules);
		    tmp.l = 0;

		    bcf_hrec_format(hrec, &tmp);
		    bcf_hdr_append(header_out, tmp.s);
		    bcf_hdr_sync(header_out);
		    hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, str.s);
		    assert( bcf_hdr_idinfo_exists(header_out, BCF_HL_INFO, hdr_id) );
		} else {
		    error("The tag \"%s\" is not defined in header: %s\n", str.s, rules);
		}
	    }
	    nc++;
	    cols = (struct anno_col*) realloc(cols, sizeof(struct anno_col)*(nc));
	    struct anno_col *col = &cols[nc-1];
	    col->icol = i;
	    col->replace = replace;
	    col->hdr_key = strdup(str.s);

	    col->number = bcf_hdr_id2length(header_out, BCF_HL_INFO, hdr_id);

	    switch ( bcf_hdr_id2type(header_out, BCF_HL_INFO, hdr_id) ) {

		case BCF_HT_FLAG:
		    col->setter = type == anno_is_vcf ? vcf_setter_info_flag : setter_info_flag;
		    break;

		case BCF_HT_INT:
		    col->setter = type == anno_is_vcf ? vcf_setter_info_int : setter_info_int;
		    break;

		case BCF_HT_REAL:
		    col->setter = type == anno_is_vcf ? vcf_setter_info_real : setter_info_real;
		    break;

		case BCF_HT_STR:
		    col->setter = type == anno_is_vcf ? vcf_setter_info_str : setter_info_str;
		    break;

		default:
		    error("The type of %s not recognised (%d)\n", str.s, bcf_hdr_id2type(header_out, BCF_HL_INFO, hdr_id));
	    }
	}
	if ( !*se ) break;
        ss = ++se;
    }
    *ncols = nc;
    if (str.m) free(str.s);
    if (tmp.m) free(tmp.s);
    return cols;
}