Ejemplo n.º 1
0
static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int idx)
{
    int i, j, n;

    // Are these samples "old-name new-name" pairs?
    void *hash = khash_str2str_init();
    for (i=0; i<nsamples; i++)
    {
        char *key, *value;
        key = value = samples[i];
        while ( *value && !isspace(*value) ) value++;
        if ( !*value ) break;
        *value = 0; value++;
        while ( isspace(*value) ) value++;
        khash_str2str_set(hash,key,value);
    }
    if ( i!=nsamples )  // not "old-name new-name" pairs
    {
        khash_str2str_destroy(hash);
        return 0;
    }

    while ( hdr->l>0 && isspace(hdr->s[hdr->l-1]) ) hdr->l--;  // remove trailing newlines
    hdr->s[hdr->l] = 0;

    kstring_t tmp = {0,0,0};
    i = j = n = 0;
    while ( hdr->s[idx+i] && hdr->s[idx+i])
    {
        if ( hdr->s[idx+i]=='\t' )
        {
            hdr->s[idx+i] = 0;

            if ( ++n>9 )
            {
                char *ori = khash_str2str_get(hash,hdr->s+idx+j);
                kputs(ori ? ori : hdr->s+idx+j, &tmp);
            }
            else
                kputs(hdr->s+idx+j, &tmp);

            kputc('\t',&tmp);

            j = ++i;
            continue;
        }
        i++;
    }
    char *ori = khash_str2str_get(hash,hdr->s+idx+j);
    kputs(ori ? ori : hdr->s+idx+j, &tmp);

    if ( hash ) khash_str2str_destroy(hash);

    hdr->l = idx;
    kputs(tmp.s, hdr);
    kputc('\n', hdr);
    free(tmp.s);

    return 1;
}
Ejemplo n.º 2
0
static int bsmpl_keep_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char **smpl_name)
{
    char *rg_smpl = khash_str2str_get(bsmpl->rg_list,rg_id);    // unique read group present in one bam only
    if ( !rg_smpl )
    {
        // read group specific to this bam
        bsmpl->tmp.l = 0;
        ksprintf(&bsmpl->tmp,"%s\t%s",rg_id,file->fname);
        rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s);
    }
    if ( !rg_smpl )
    {
        // any read group in this file?
        bsmpl->tmp.l = 0;
        ksprintf(&bsmpl->tmp,"*\t%s",file->fname);
        rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s);
    }
    if ( !rg_smpl && bsmpl->rg_logic ) return 0;
    if ( rg_smpl && !bsmpl->rg_logic ) return 0;

    if ( rg_smpl && rg_smpl[0]!='\t' ) *smpl_name = rg_smpl;    // rename the sample
    return 1;
}
Ejemplo n.º 3
0
int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file)
{
    if ( list[0]!='^' ) bsmpl->rg_logic = 1;
    else list++;

    int i, nrows  = 0;
    char **rows = hts_readlist(list, is_file, &nrows);
    if ( !nrows ) return 0;

    kstring_t fld1 = {0,0,0};
    kstring_t fld2 = {0,0,0};
    kstring_t fld3 = {0,0,0};

    bsmpl->rg_list = khash_str2str_init();
    for (i=0; i<nrows; i++)
    {
        char *ptr = rows[i];
        fld1.l = fld2.l = fld3.l = 0;
        int escaped = 0;
        while ( *ptr )
        {
            if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
            if ( isspace(*ptr) && !escaped ) break;
            kputc(*ptr, &fld1);
            escaped = 0;
            ptr++;
        }
        if ( *ptr )
        {
            while ( *ptr && isspace(*ptr) ) ptr++;
            while ( *ptr )
            {
                if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
                if ( isspace(*ptr) && !escaped ) break;
                kputc(*ptr, &fld2);
                escaped = 0;
                ptr++;
            }
        }
        if ( *ptr )
        {
            while ( *ptr && isspace(*ptr) ) ptr++;
            while ( *ptr )
            {
                if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
                if ( isspace(*ptr) && !escaped ) break;
                kputc(*ptr, &fld3);
                escaped = 0;
                ptr++;
            }
        }
        if ( fld3.l )
        {
            // ID FILE SAMPLE
            kputc('\t',&fld1);
            kputs(fld2.s,&fld1);
            fld2.l = 0;
            kputs(fld3.s,&fld2);
        }
        // fld2.s now contains a new sample name. If NULL, use \t to keep the bam header name
        char *value = khash_str2str_get(bsmpl->rg_list,fld1.s);
        if ( !value )
            khash_str2str_set(bsmpl->rg_list,strdup(fld1.s),strdup(fld2.l?fld2.s:"\t"));
        else if ( strcmp(value,fld2.l?fld2.s:"\t") )
            error("Error: The read group \"%s\" was assigned to two different samples: \"%s\" and \"%s\"\n", fld1.s,value,fld2.l?fld2.s:"\t");
        free(rows[i]);
    }
    free(rows);
    free(fld1.s);
    free(fld2.s);
    free(fld3.s);
    return nrows;
}
Ejemplo n.º 4
0
/*
    The logic of this function is a bit complicated because we want to work
    also with broken bams containing read groups that are not listed in the
    header. The desired behavior is as follows:
        - when -G is given, read groups which are not listed in the header must
          be given explicitly using the "?" symbol in -G.
          Otherwise:
        - if the bam has no header, all reads in the file are assigned to a
          single sample named after the file
        - if there is at least one sample defined in the header, reads with no
          read group id or with a read group id not listed in the header are
          assigned to the first sample encountered in the header
*/
int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
{
    bsmpl->nfiles++;
    bsmpl->files = (file_t*) realloc(bsmpl->files,bsmpl->nfiles*sizeof(file_t));
    file_t *file = &bsmpl->files[bsmpl->nfiles-1];
    memset(file,0,sizeof(file_t));
    file->fname  = strdup(fname);
    file->default_idx = -1;

    if ( bsmpl->ignore_rg || !bam_hdr )
    {
        // The option --ignore-RG is set or there is no BAM header: use the file name as the sample name
        bsmpl_add_readgroup(bsmpl,file,"*",file->fname);
        return bsmpl->nfiles-1;
    }

    void *bam_smpls = khash_str2int_init();
    int first_smpl = -1, nskipped = 0;
    const char *p = bam_hdr, *q, *r;
    while (p != NULL && (q = strstr(p, "@RG")) != 0)
    {
        char *eol = strchr(q + 3, '\n');
        if (q > bam_hdr && *(q - 1) != '\n') { // @RG must be at start of line
            p = eol;
            continue;
        }
        p = q + 3;
        if ((q = strstr(p, "\tID:")) != 0) q += 4;
        if ((r = strstr(p, "\tSM:")) != 0) r += 4;
        if (r && q)
        {
            char *u, *v;
            int ioq, ior;
            for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
            for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
            ioq = *u; ior = *v; *u = *v = '\0';

            // q now points to a null terminated read group id
            // r points to a null terminated sample name
            if ( !strcmp("*",q) || !strcmp("?",q) )
                error("Error: the read group IDs \"*\" and \"?\" have a special meaning in the mpileup code. Please fix the code or the bam: %s\n", fname);

            int accept_rg = 1;
            if ( bsmpl->sample_list )
            {
                // restrict samples based on the -s/-S options
                char *name = khash_str2str_get(bsmpl->sample_list,r);
                if ( bsmpl->sample_logic==0 )
                    accept_rg = name ? 0 : 1;
                else if ( !name )
                    accept_rg = 0;
                else
                    r = name;
            }
            if ( accept_rg && bsmpl->rg_list )
            {
                // restrict readgroups based on the -G option, possibly renaming the sample
                accept_rg = bsmpl_keep_readgroup(bsmpl,file,q,&r);
            }
            if ( accept_rg )
                bsmpl_add_readgroup(bsmpl,file,q,r);
            else
            {
                bsmpl_add_readgroup(bsmpl,file,q,NULL); // ignore this RG but note that it was seen in the header
                nskipped++;
            }

            if ( first_smpl<0 )
                khash_str2int_get(bsmpl->name2idx,r,&first_smpl);
            if ( !khash_str2int_has_key(bam_smpls,r) )
                khash_str2int_inc(bam_smpls,strdup(r));

            *u = ioq; *v = ior;
        }
        else
            break;
        p = eol;
    }
    int nsmpls = khash_str2int_size(bam_smpls);
    khash_str2int_destroy_free(bam_smpls);

    const char *smpl_name = NULL;
    int accept_null_rg = 1;
    if ( bsmpl->rg_list && !bsmpl_keep_readgroup(bsmpl,file,"?",&smpl_name) ) accept_null_rg = 0;
    if ( bsmpl->sample_list && first_smpl==-1 ) accept_null_rg = 0;

    if ( !accept_null_rg && first_smpl==-1 )
    {
        // no suitable read group is available in this bam: ignore the whole file.
        free(file->fname);
        if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
        bsmpl->nfiles--;
        return -1;
    }
    if ( !accept_null_rg ) return bsmpl->nfiles-1;
    if ( nsmpls==1 && !nskipped )
    {
        file->default_idx = first_smpl;
        return bsmpl->nfiles-1;
    }
    if ( !smpl_name ) smpl_name = first_smpl==-1 ? file->fname : bsmpl->smpl[first_smpl];

    bsmpl_add_readgroup(bsmpl,file,"?",smpl_name);
    return bsmpl->nfiles-1;
}