Пример #1
0
static void list_columns(args_t *args)
{
    void *has_sample = NULL;
    if ( args->sample_list )
    {
        has_sample = khash_str2int_init();
        int i, nsmpl;
        char **smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl);
        for (i=0; i<nsmpl; i++) khash_str2int_inc(has_sample, smpl[i]);
        free(smpl);
    }

    int i;
    bcf_sr_t *reader = &args->files->readers[0];
    for (i=0; i<bcf_hdr_nsamples(reader->header); i++)
    {
        if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue;
        printf("%s\n", reader->header->samples[i]);
    }

    if ( has_sample )
        khash_str2int_destroy_free(has_sample);
}
Пример #2
0
static void bsmpl_add_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char *smpl_name)
{
    int ismpl = -1;
    if ( smpl_name )
    {
        if ( khash_str2int_get(bsmpl->name2idx,smpl_name,&ismpl) < 0 )
        {
            // new sample
            bsmpl->nsmpl++;
            bsmpl->smpl = (char**) realloc(bsmpl->smpl,sizeof(char*)*bsmpl->nsmpl);
            bsmpl->smpl[bsmpl->nsmpl-1] = strdup(smpl_name);
            ismpl = khash_str2int_inc(bsmpl->name2idx,bsmpl->smpl[bsmpl->nsmpl-1]);
        }
    }
    if ( !strcmp("*",rg_id) )
    {
        // all read groups in the bam treated as the same sample
        file->default_idx = ismpl;
        return;
    }
    if ( !file->rg2idx ) file->rg2idx = khash_str2int_init();
    if ( khash_str2int_has_key(file->rg2idx,rg_id) ) return;    // duplicate @RG:ID
    khash_str2int_set(file->rg2idx, strdup(rg_id), ismpl);
}
Пример #3
0
int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
{
    int i, j, nsmpl, free_smpl = 0;
    char **smpl = NULL;

    void *exclude = (fname[0]=='^') ? khash_str2int_init() : NULL;
    if ( exclude || strcmp("-",fname) ) // "-" stands for all samples
    {
        smpl = hts_readlist(fname, is_file, &nsmpl);
        if ( !smpl )
        {
            fprintf(stderr,"Could not read the file: \"%s\"\n", fname);
            return 0;
        }
        if ( exclude )
        {
            for (i=0; i<nsmpl; i++)
                khash_str2int_inc(exclude, smpl[i]);
        }
        free_smpl = 1;
    }
    if ( !smpl )
    {
        smpl  = files->readers[0].header->samples;   // intersection of all samples
        nsmpl = bcf_hdr_nsamples(files->readers[0].header);
    }

    files->samples = NULL;
    files->n_smpl  = 0;
    for (i=0; i<nsmpl; i++)
    {
        if ( exclude && khash_str2int_has_key(exclude,smpl[i])  ) continue;

        int n_isec = 0;
        for (j=0; j<files->nreaders; j++)
        {
            if ( bcf_hdr_id2int(files->readers[j].header, BCF_DT_SAMPLE, smpl[i])<0 ) break;
            n_isec++;
        }
        if ( n_isec!=files->nreaders )
        {
            fprintf(stderr,"Warning: The sample \"%s\" was not found in %s, skipping\n", smpl[i], files->readers[n_isec].fname);
            continue;
        }

        files->samples = (char**) realloc(files->samples, (files->n_smpl+1)*sizeof(const char*));
        files->samples[files->n_smpl++] = strdup(smpl[i]);
    }

    if ( exclude ) khash_str2int_destroy(exclude);
    if ( free_smpl )
    {
        for (i=0; i<nsmpl; i++) free(smpl[i]);
        free(smpl);
    }

    if ( !files->n_smpl )
    {
        if ( files->nreaders>1 )
            fprintf(stderr,"No samples in common.\n");
        return 0;
    }
    for (i=0; i<files->nreaders; i++)
    {
        bcf_sr_t *reader = &files->readers[i];
        reader->samples  = (int*) malloc(sizeof(int)*files->n_smpl);
        reader->n_smpl   = files->n_smpl;
        for (j=0; j<files->n_smpl; j++)
            reader->samples[j] = bcf_hdr_id2int(reader->header, BCF_DT_SAMPLE, files->samples[j]);
    }
    return 1;
}
Пример #4
0
static void init_data(args_t *args)
{
    int i;
    args->hdr = args->files->readers[0].header;

    if (args->calc_ac && args->update_info)
    {
        bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">");
        bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
    }
    bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");

    // setup sample data
    if (args->sample_names)
    {
        void *hdr_samples = khash_str2int_init();
        for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
            khash_str2int_inc(hdr_samples, bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));

        void *exclude = (args->sample_names[0]=='^') ? khash_str2int_init() : NULL;
        int nsmpl;
        char **smpl = NULL;
        args->samples = NULL; args->n_samples = 0;
        smpl = hts_readlist(exclude ? &args->sample_names[1] : args->sample_names, args->sample_is_file, &nsmpl);
        if ( !smpl )
        {
            error("Could not read the list: \"%s\"\n", exclude ? &args->sample_names[1] : args->sample_names);
        }

        if ( exclude )
        {
            for (i=0; i<nsmpl; i++) {
                if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
                    if (args->force_samples) {
                        fprintf(stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
                    } else {
                        error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
                    }
                }
                khash_str2int_inc(exclude, smpl[i]);
            }

            for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
            {
                if ( exclude && khash_str2int_has_key(exclude,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i))  ) continue;
                args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
                args->samples[args->n_samples++] = strdup(bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
            }
            khash_str2int_destroy(exclude);
        }
        else
        {
            for (i=0; i<nsmpl; i++) {
                if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
                    if (args->force_samples) {
                        fprintf(stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
                        continue;
                    } else {
                        error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
                    }
                }
                args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
                args->samples[args->n_samples++] = strdup(smpl[i]);
            }
        }
        for (i=0; i<nsmpl; i++) free(smpl[i]);
        free(smpl);
        khash_str2int_destroy(hdr_samples);
        if (args->n_samples == 0) {
            fprintf(stderr, "Warn: subsetting has removed all samples\n");
            args->sites_only = 1;
        }
    }

    if (args->n_samples)
        args->imap = (int*)malloc(args->n_samples * sizeof(int));

    // determine variant types to include/exclude
    if (args->include_types || args->exclude_types) {
        if (args->include_types && args->exclude_types) {
            fprintf(stderr, "Error: only supply one of --include-types, --exclude-types options\n");
            exit(1);
        }
        char **type_list = 0;
        int m = 0, n = 0;
        const char *q, *p;
        for (q = p = args->include_types ? args->include_types : args->exclude_types;; ++p) {
            if (*p == ',' || *p == 0) {
                if (m == n) {
                    m = m? m<<1 : 16;
                    type_list = (char**)realloc(type_list, m * sizeof(char*));
                }
                type_list[n] = (char*)calloc(p - q + 1, 1);
                strncpy(type_list[n++], q, p - q);
                q = p + 1;
                if (*p == 0) break;
            }
        }
        type_list = (char**)realloc(type_list, n * sizeof(char*));

        if (args->include_types) {
            args->include = 0;
            for (i = 0; i < n; ++i) {
                if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP;
                else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL;
                else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP;
                else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER;
                else {
                    fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
                    fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
                    exit(1);
                }
            }
        }
        if (args->exclude_types) {
            args->exclude = 0;
            for (i = 0; i < n; ++i) {
                if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP;
                else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL;
                else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP;
                else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER;
                else {
                    fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
                    fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
                    exit(1);
                }
            }
        }
        for (i = 0; i < n; ++i)
            free(type_list[i]);
        free(type_list);
    }

    // setup output
    char modew[8];
    strcpy(modew, "w");
    if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel);
    if (args->output_type==FT_BCF) strcat(modew, "bu");         // uncompressed BCF
    else if (args->output_type & FT_BCF) strcat(modew, "b");    // compressed BCF
    else if (args->output_type & FT_GZ) strcat(modew,"z");      // compressed VCF
    args->out = hts_open(args->fn_out ? args->fn_out : "-", modew);
    if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));

    // headers: hdr=full header, hsub=subset header, hnull=sites only header
    if (args->sites_only){
        args->hnull = bcf_hdr_subset(args->hdr, 0, 0, 0);
        bcf_hdr_remove(args->hnull, BCF_HL_FMT, NULL);
    }
    if (args->n_samples > 0)
    {
        args->hsub = bcf_hdr_subset(args->hdr, args->n_samples, args->samples, args->imap);
        if ( !args->hsub ) error("Error occurred while subsetting samples\n");
        if ( args->n_samples != bcf_hdr_nsamples(args->hsub) )
        {
            int i;
            for (i=0; i<args->n_samples; i++)
                if ( args->imap[i]<0 ) error("Error: No such sample: \"%s\"\n", args->samples[i]);
        }
    }

    if ( args->filter_str )
        args->filter = filter_init(args->hdr, args->filter_str);
}
Пример #5
0
/*
    The logic of this function is a bit complicated because we want to work
    also with broken bams containing read groups that are not listed in the
    header. The desired behavior is as follows:
        - when -G is given, read groups which are not listed in the header must
          be given explicitly using the "?" symbol in -G.
          Otherwise:
        - if the bam has no header, all reads in the file are assigned to a
          single sample named after the file
        - if there is at least one sample defined in the header, reads with no
          read group id or with a read group id not listed in the header are
          assigned to the first sample encountered in the header
*/
int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
{
    bsmpl->nfiles++;
    bsmpl->files = (file_t*) realloc(bsmpl->files,bsmpl->nfiles*sizeof(file_t));
    file_t *file = &bsmpl->files[bsmpl->nfiles-1];
    memset(file,0,sizeof(file_t));
    file->fname  = strdup(fname);
    file->default_idx = -1;

    if ( bsmpl->ignore_rg || !bam_hdr )
    {
        // The option --ignore-RG is set or there is no BAM header: use the file name as the sample name
        bsmpl_add_readgroup(bsmpl,file,"*",file->fname);
        return bsmpl->nfiles-1;
    }

    void *bam_smpls = khash_str2int_init();
    int first_smpl = -1, nskipped = 0;
    const char *p = bam_hdr, *q, *r;
    while (p != NULL && (q = strstr(p, "@RG")) != 0)
    {
        char *eol = strchr(q + 3, '\n');
        if (q > bam_hdr && *(q - 1) != '\n') { // @RG must be at start of line
            p = eol;
            continue;
        }
        p = q + 3;
        if ((q = strstr(p, "\tID:")) != 0) q += 4;
        if ((r = strstr(p, "\tSM:")) != 0) r += 4;
        if (r && q)
        {
            char *u, *v;
            int ioq, ior;
            for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
            for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
            ioq = *u; ior = *v; *u = *v = '\0';

            // q now points to a null terminated read group id
            // r points to a null terminated sample name
            if ( !strcmp("*",q) || !strcmp("?",q) )
                error("Error: the read group IDs \"*\" and \"?\" have a special meaning in the mpileup code. Please fix the code or the bam: %s\n", fname);

            int accept_rg = 1;
            if ( bsmpl->sample_list )
            {
                // restrict samples based on the -s/-S options
                char *name = khash_str2str_get(bsmpl->sample_list,r);
                if ( bsmpl->sample_logic==0 )
                    accept_rg = name ? 0 : 1;
                else if ( !name )
                    accept_rg = 0;
                else
                    r = name;
            }
            if ( accept_rg && bsmpl->rg_list )
            {
                // restrict readgroups based on the -G option, possibly renaming the sample
                accept_rg = bsmpl_keep_readgroup(bsmpl,file,q,&r);
            }
            if ( accept_rg )
                bsmpl_add_readgroup(bsmpl,file,q,r);
            else
            {
                bsmpl_add_readgroup(bsmpl,file,q,NULL); // ignore this RG but note that it was seen in the header
                nskipped++;
            }

            if ( first_smpl<0 )
                khash_str2int_get(bsmpl->name2idx,r,&first_smpl);
            if ( !khash_str2int_has_key(bam_smpls,r) )
                khash_str2int_inc(bam_smpls,strdup(r));

            *u = ioq; *v = ior;
        }
        else
            break;
        p = eol;
    }
    int nsmpls = khash_str2int_size(bam_smpls);
    khash_str2int_destroy_free(bam_smpls);

    const char *smpl_name = NULL;
    int accept_null_rg = 1;
    if ( bsmpl->rg_list && !bsmpl_keep_readgroup(bsmpl,file,"?",&smpl_name) ) accept_null_rg = 0;
    if ( bsmpl->sample_list && first_smpl==-1 ) accept_null_rg = 0;

    if ( !accept_null_rg && first_smpl==-1 )
    {
        // no suitable read group is available in this bam: ignore the whole file.
        free(file->fname);
        if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
        bsmpl->nfiles--;
        return -1;
    }
    if ( !accept_null_rg ) return bsmpl->nfiles-1;
    if ( nsmpls==1 && !nskipped )
    {
        file->default_idx = first_smpl;
        return bsmpl->nfiles-1;
    }
    if ( !smpl_name ) smpl_name = first_smpl==-1 ? file->fname : bsmpl->smpl[first_smpl];

    bsmpl_add_readgroup(bsmpl,file,"?",smpl_name);
    return bsmpl->nfiles-1;
}