/*
 * Rename sample columns of a tab-delimited header line using
 * "old-name new-name" pairs.
 *
 *  samples  .. nsamples strings; each is interpreted as a pair when it
 *              contains whitespace separating two names
 *  nsamples .. number of entries in samples
 *  hdr      .. header text; rewritten in place from offset idx onwards
 *  idx      .. offset in hdr->s of the column line to be rewritten
 *
 * Returns 0 if at least one entry of samples is not a whitespace-separated
 * pair; in that case neither samples nor hdr is modified. Otherwise returns 1:
 * each entry of samples is split in place (the first whitespace character is
 * replaced by a NUL byte), trailing whitespace is trimmed from hdr, every
 * column after the ninth that matches an old name is replaced by its new
 * name, and the line is terminated by a single newline.
 */
static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int idx)
{
    int i, j, n;

    // Are these samples "old-name new-name" pairs? Check all of them before
    // touching any, so the caller gets its strings back intact when they are not.
    for (i=0; i<nsamples; i++)
    {
        const char *ptr = samples[i];
        while ( *ptr && !isspace((unsigned char)*ptr) ) ptr++;
        if ( !*ptr ) return 0;  // not "old-name new-name" pairs
    }

    // The keys and values point into the samples strings, nothing is copied
    void *hash = khash_str2str_init();
    for (i=0; i<nsamples; i++)
    {
        char *key, *value;
        key = value = samples[i];
        while ( *value && !isspace((unsigned char)*value) ) value++;
        *value = 0; value++;
        while ( isspace((unsigned char)*value) ) value++;
        khash_str2str_set(hash,key,value);
    }

    while ( hdr->l>0 && isspace((unsigned char)hdr->s[hdr->l-1]) ) hdr->l--;  // remove trailing newlines
    hdr->s[hdr->l] = 0;

    // i: current offset from idx, j: offset of the current field, n: number of tabs seen so far
    kstring_t tmp = {0,0,0};
    i = j = n = 0;
    while ( hdr->s[idx+i] )
    {
        if ( hdr->s[idx+i]=='\t' )
        {
            hdr->s[idx+i] = 0;
            if ( ++n>9 )
            {
                // the field just terminated is the n-th one; only fields after the ninth are renamed
                char *ori = khash_str2str_get(hash,hdr->s+idx+j);
                kputs(ori ? ori : hdr->s+idx+j, &tmp);
            }
            else
                kputs(hdr->s+idx+j, &tmp);

            kputc('\t',&tmp);

            j = ++i;
            continue;
        }
        i++;
    }

    // The last field has number n+1. Apply the same rule as inside the loop and
    // rename it only if it comes after the ninth column.
    char *ori = n>=9 ? khash_str2str_get(hash,hdr->s+idx+j) : NULL;
    kputs(ori ? ori : hdr->s+idx+j, &tmp);

    khash_str2str_destroy(hash);

    hdr->l = idx;
    kputs(tmp.s, hdr);
    kputc('\n', hdr);
    free(tmp.s);

    return 1;
}
/*
 * Decide whether the read group rg_id of the given file passes the read
 * group list stored in bsmpl->rg_list.
 *
 * The list is searched with three keys, in this order, stopping at the
 * first hit:
 *      "ID"          .. the read group ID alone
 *      "ID\tFILE"    .. the read group ID restricted to this file
 *      "*\tFILE"     .. any read group in this file
 *
 * With non-zero bsmpl->rg_logic only listed read groups are kept, with zero
 * rg_logic only unlisted ones are kept.
 *
 * Returns 1 if the read group is to be kept and 0 otherwise. When a kept
 * read group is listed with a value that does not start with a tab,
 * *smpl_name is set to that value (the sample is renamed); otherwise
 * *smpl_name is left untouched. bsmpl->tmp is used as scratch space.
 */
static int bsmpl_keep_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char **smpl_name)
{
    // unique read group present in one bam only
    char *hit = khash_str2str_get(bsmpl->rg_list, rg_id);

    if ( hit==NULL )
    {
        // read group specific to this bam
        bsmpl->tmp.l = 0;
        ksprintf(&bsmpl->tmp, "%s\t%s", rg_id, file->fname);
        hit = khash_str2str_get(bsmpl->rg_list, bsmpl->tmp.s);

        if ( hit==NULL )
        {
            // any read group in this file?
            bsmpl->tmp.l = 0;
            ksprintf(&bsmpl->tmp, "*\t%s", file->fname);
            hit = khash_str2str_get(bsmpl->rg_list, bsmpl->tmp.s);
        }
    }

    int is_listed   = hit!=NULL;
    int want_listed = bsmpl->rg_logic ? 1 : 0;
    if ( is_listed!=want_listed ) return 0;

    // rename the sample
    if ( is_listed && hit[0]!='\t' )
        *smpl_name = hit;

    return 1;
}
int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file) { if ( list[0]!='^' ) bsmpl->rg_logic = 1; else list++; int i, nrows = 0; char **rows = hts_readlist(list, is_file, &nrows); if ( !nrows ) return 0; kstring_t fld1 = {0,0,0}; kstring_t fld2 = {0,0,0}; kstring_t fld3 = {0,0,0}; bsmpl->rg_list = khash_str2str_init(); for (i=0; i<nrows; i++) { char *ptr = rows[i]; fld1.l = fld2.l = fld3.l = 0; int escaped = 0; while ( *ptr ) { if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; } if ( isspace(*ptr) && !escaped ) break; kputc(*ptr, &fld1); escaped = 0; ptr++; } if ( *ptr ) { while ( *ptr && isspace(*ptr) ) ptr++; while ( *ptr ) { if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; } if ( isspace(*ptr) && !escaped ) break; kputc(*ptr, &fld2); escaped = 0; ptr++; } } if ( *ptr ) { while ( *ptr && isspace(*ptr) ) ptr++; while ( *ptr ) { if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; } if ( isspace(*ptr) && !escaped ) break; kputc(*ptr, &fld3); escaped = 0; ptr++; } } if ( fld3.l ) { // ID FILE SAMPLE kputc('\t',&fld1); kputs(fld2.s,&fld1); fld2.l = 0; kputs(fld3.s,&fld2); } // fld2.s now contains a new sample name. If NULL, use \t to keep the bam header name char *value = khash_str2str_get(bsmpl->rg_list,fld1.s); if ( !value ) khash_str2str_set(bsmpl->rg_list,strdup(fld1.s),strdup(fld2.l?fld2.s:"\t")); else if ( strcmp(value,fld2.l?fld2.s:"\t") ) error("Error: The read group \"%s\" was assigned to two different samples: \"%s\" and \"%s\"\n", fld1.s,value,fld2.l?fld2.s:"\t"); free(rows[i]); } free(rows); free(fld1.s); free(fld2.s); free(fld3.s); return nrows; }
/*
    Register a BAM file and the read groups listed in its header.

    bsmpl   .. the sample/read group bookkeeping structure; a new entry is
               appended to bsmpl->files
    bam_hdr .. text of the BAM header, or NULL if there is none. While the
               @RG lines are parsed, the ID and SM values are NUL-terminated
               in place and the overwritten characters are restored
               afterwards.
    fname   .. name of the BAM file; a copy is stored in the new entry

    Returns the index of the new entry in bsmpl->files, or -1 if no suitable
    read group is available in this BAM and the file is to be ignored.

    The logic of this function is a bit complicated because we want to work
    also with broken bams containing read groups that are not listed in the
    header. The desired behavior is as follows:
        - when -G is given, read groups which are not listed in the header must
          be given explicitly using the "?" symbol in -G.

    Otherwise:
        - if the bam has no header, all reads in the file are assigned to a
          single sample named after the file
        - if there is at least one sample defined in the header, reads with no
          read group id or with a read group id not listed in the header are
          assigned to the first sample encountered in the header
*/
int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
{
    // Append a zeroed file entry.
    // NOTE(review): the results of realloc() and strdup() are not checked.
    bsmpl->nfiles++;
    bsmpl->files = (file_t*) realloc(bsmpl->files,bsmpl->nfiles*sizeof(file_t));
    file_t *file = &bsmpl->files[bsmpl->nfiles-1];
    memset(file,0,sizeof(file_t));
    file->fname = strdup(fname);
    file->default_idx = -1;

    if ( bsmpl->ignore_rg || !bam_hdr )
    {
        // The option --ignore-RG is set or there is no BAM header: use the file name as the sample name
        bsmpl_add_readgroup(bsmpl,file,"*",file->fname);
        return bsmpl->nfiles-1;
    }

    // bam_smpls:  set of distinct sample names seen in the @RG lines of this header
    // first_smpl: index of the first sample for which the name2idx lookup succeeded, -1 if none
    // nskipped:   number of read groups rejected by the sample or read group lists
    void *bam_smpls = khash_str2int_init();
    int first_smpl = -1, nskipped = 0;
    const char *p = bam_hdr, *q, *r;
    while (p != NULL && (q = strstr(p, "@RG")) != 0)
    {
        // eol is NULL when this is the last line; assigning it to p then ends the loop
        char *eol = strchr(q + 3, '\n');
        if (q > bam_hdr && *(q - 1) != '\n')
        {
            // @RG must be at start of line
            p = eol;
            continue;
        }
        p = q + 3;
        // NOTE(review): these searches run to the end of the header text, they are
        // not limited to the current line (eol) - confirm this is intended.
        if ((q = strstr(p, "\tID:")) != 0) q += 4;
        if ((r = strstr(p, "\tSM:")) != 0) r += 4;
        if (r && q)
        {
            // Temporarily terminate the ID and SM values; ioq and ior keep the
            // overwritten characters so that the header can be restored below.
            char *u, *v;
            int ioq, ior;
            for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
            for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
            ioq = *u; ior = *v; *u = *v = '\0';

            // q now points to a null terminated read group id
            // r points to a null terminated sample name
            if ( !strcmp("*",q) || !strcmp("?",q) )
                error("Error: the read group IDs \"*\" and \"?\" have a special meaning in the mpileup code. Please fix the code or the bam: %s\n", fname);

            int accept_rg = 1;
            if ( bsmpl->sample_list )
            {
                // restrict samples based on the -s/-S options
                char *name = khash_str2str_get(bsmpl->sample_list,r);
                if ( bsmpl->sample_logic==0 )
                    accept_rg = name ? 0 : 1;   // zero sample_logic: listed samples are rejected
                else if ( !name )
                    accept_rg = 0;              // non-zero sample_logic: unlisted samples are rejected
                else
                    r = name;                   // listed: continue with the value stored in the list
            }
            if ( accept_rg && bsmpl->rg_list )
            {
                // restrict readgroups based on the -G option, possibly renaming the sample
                accept_rg = bsmpl_keep_readgroup(bsmpl,file,q,&r);
            }
            if ( accept_rg )
                bsmpl_add_readgroup(bsmpl,file,q,r);
            else
            {
                bsmpl_add_readgroup(bsmpl,file,q,NULL); // ignore this RG but note that it was seen in the header
                nskipped++;
            }

            // Both lookups below are done for skipped read groups as well.
            // first_smpl is presumably left at -1 when r is not in name2idx - TODO confirm
            if ( first_smpl<0 )
                khash_str2int_get(bsmpl->name2idx,r,&first_smpl);
            if ( !khash_str2int_has_key(bam_smpls,r) )
                khash_str2int_inc(bam_smpls,strdup(r));
            *u = ioq; *v = ior;     // restore the header text
        }
        else
            break;  // no ID or no SM found from here on: stop parsing
        p = eol;
    }
    int nsmpls = khash_str2int_size(bam_smpls);
    khash_str2int_destroy_free(bam_smpls);

    // Decide what to do with reads that have no read group or a read group
    // not listed in the header; these are represented by the read group "?".
    const char *smpl_name = NULL;
    int accept_null_rg = 1;
    if ( bsmpl->rg_list && !bsmpl_keep_readgroup(bsmpl,file,"?",&smpl_name) ) accept_null_rg = 0;
    if ( bsmpl->sample_list && first_smpl==-1 ) accept_null_rg = 0;

    if ( !accept_null_rg && first_smpl==-1 )
    {
        // no suitable read group is available in this bam: ignore the whole file.
        // Only the counter is decremented, the files array is not shrunk.
        free(file->fname);
        if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
        bsmpl->nfiles--;
        return -1;
    }
    if ( !accept_null_rg ) return bsmpl->nfiles-1;
    if ( nsmpls==1 && !nskipped )
    {
        // The header names a single sample and no read group was skipped:
        // record it as the default sample of this file instead of adding "?"
        file->default_idx = first_smpl;
        return bsmpl->nfiles-1;
    }
    // Unless renamed via the read group list, use the first sample found in
    // the header, or the file name if there is none
    if ( !smpl_name ) smpl_name = first_smpl==-1 ? file->fname : bsmpl->smpl[first_smpl];

    bsmpl_add_readgroup(bsmpl,file,"?",smpl_name);
    return bsmpl->nfiles-1;
}