コード例 #1
0
ファイル: vcf.c プロジェクト: 9beckert/TIR
int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn)
{
	vcf_t *v;
	gzFile fp;
	kstream_t *ks;
	kstring_t s, rn;
	int dret;
	if (bp == 0) return -1;
	if (!bp->is_vcf) return 0;
	s.l = s.m = 0; s.s = 0;
	rn.m = rn.l = h->l_nm; rn.s = h->name;
	v = (vcf_t*)bp->v;
	fp = gzopen(fn, "r");
	ks = ks_init(fp);
	while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
		bcf_str2id_add(v->refhash, strdup(s.s));
		kputs(s.s, &rn); kputc('\0', &rn);
		if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
	}
	ks_destroy(ks);
	gzclose(fp);
	h->l_nm = rn.l; h->name = rn.s;
	bcf_hdr_sync(h);
	free(s.s);
	return 0;
}
コード例 #2
0
static bcf_hdr_t *strip_header(bcf_hdr_t *src, bcf_hdr_t *dst)
{
    bcf_hrec_t *src_hrec, *dst_hrec, *tmp;
    bcf_hdr_t *out = bcf_hdr_init("r");
    int i;
    for (i=0; i<dst->nhrec; i++)
    {
        // first insert lines which do not code BCF ids, their order does not matter
        dst_hrec = dst->hrec[i];
        if ( dst_hrec->type==BCF_HL_FLT || dst_hrec->type==BCF_HL_INFO || dst_hrec->type==BCF_HL_FMT || dst_hrec->type== BCF_HL_CTG ) continue;
        bcf_hdr_add_hrec(out, bcf_hrec_dup(dst_hrec));
    }
    for (i=0; i<src->nhrec; i++)
    {
        // now transfer header lines which define BCF ids
        src_hrec = src->hrec[i];

        if ( src_hrec->type==BCF_HL_FLT || src_hrec->type==BCF_HL_INFO || src_hrec->type==BCF_HL_FMT || src_hrec->type== BCF_HL_CTG )
        {
            int j = bcf_hrec_find_key(src_hrec, "ID");
            dst_hrec = bcf_hdr_get_hrec(dst, src_hrec->type, "ID", src_hrec->vals[j], NULL);
            if ( !dst_hrec ) continue;

            tmp = bcf_hrec_dup(dst_hrec);

            j = bcf_hrec_find_key(src_hrec, "IDX");
            if ( j>=0 )
            {
                j = atoi(src_hrec->vals[j]);
                hrec_add_idx(tmp, j);
            }
            bcf_hdr_add_hrec(out, tmp);
        }
    }
    bcf_hdr_sync(out);
    for (i=0; i<dst->nhrec; i++)
    {
        // finally add new structured fields
        dst_hrec = dst->hrec[i];
        if ( dst_hrec->type==BCF_HL_FLT || dst_hrec->type==BCF_HL_INFO || dst_hrec->type==BCF_HL_FMT || dst_hrec->type== BCF_HL_CTG )
        {
            int j = bcf_hrec_find_key(dst_hrec, "ID");
            tmp = bcf_hdr_get_hrec(out, dst_hrec->type, "ID", dst_hrec->vals[j], NULL);
            if ( !tmp )
                bcf_hdr_add_hrec(out, bcf_hrec_dup(dst_hrec));
        }
    }
    for (i=0; i<dst->n[BCF_DT_SAMPLE]; i++) bcf_hdr_add_sample(out, dst->samples[i]);
    bcf_hdr_add_sample(out, NULL);
    bcf_hdr_destroy(dst);
    return out;
}
コード例 #3
0
ファイル: vcf.c プロジェクト: goshng/cocoa
int bcf_hdr_parse(bcf_hdr_t *hdr)
{
    int len, needs_sync = 0;
    bcf_hrec_t *hrec;
    char *p = hdr->text;
    while ( (hrec=bcf_hdr_parse_line(hdr,p,&len)) )
    {
        // bcf_hrec_debug(hrec);
        needs_sync += bcf_hdr_add_hrec(hdr, hrec);
        p += len;
    }
    hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
    needs_sync += bcf_hdr_add_hrec(hdr, hrec);
    bcf_hdr_parse_sample_line(hdr,p);
    if ( needs_sync ) bcf_hdr_sync(hdr);
	return 0;
}
コード例 #4
0
ファイル: vcf.c プロジェクト: goshng/cocoa
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
{
    int i, n;
    char **lines = hts_readlines(fname, &n);
    if ( !lines ) return 1;
    for (i=0; i<n-1; i++)
    {
        int k;
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
        bcf_hdr_add_hrec(hdr, hrec);
        free(lines[i]);
    }
    bcf_hdr_parse_sample_line(hdr,lines[n-1]);
    free(lines[n-1]);
    free(lines);
    bcf_hdr_sync(hdr);
    bcf_hdr_fmt_text(hdr);
    return 0;
}
コード例 #5
0
static void remove_hdr_lines(bcf_hdr_t *hdr, int type)
{
    int i = 0, nrm = 0;
    while ( i<hdr->nhrec )
    {
        if ( hdr->hrec[i]->type!=type ) { i++; continue; }
        bcf_hrec_t *hrec = hdr->hrec[i];
        if ( type==BCF_HL_FMT )
        {
            // everything except FORMAT/GT
            int id = bcf_hrec_find_key(hrec, "ID");
            if ( id>=0 && !strcmp(hrec->vals[id],"GT") ) { i++; continue; }
        }
        nrm++;
        hdr->nhrec--;
        if ( i < hdr->nhrec )
            memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
        bcf_hrec_destroy(hrec);
    }
    if ( nrm ) bcf_hdr_sync(hdr);
}
コード例 #6
0
ファイル: vcf.c プロジェクト: 9beckert/TIR
bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
{
	kstring_t meta, smpl;
	int dret;
	vcf_t *v;
	bcf_hdr_t *h;
	if (!bp->is_vcf) return bcf_hdr_read(bp);
	h = calloc(1, sizeof(bcf_hdr_t));
	v = (vcf_t*)bp->v;
	v->line.l = 0;
	memset(&meta, 0, sizeof(kstring_t));
	memset(&smpl, 0, sizeof(kstring_t));
	while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) {
		if (v->line.l < 2) continue;
		if (v->line.s[0] != '#') return 0; // no sample line
		if (v->line.s[0] == '#' && v->line.s[1] == '#') {
			kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta);
		} else if (v->line.s[0] == '#') {
			int k;
			ks_tokaux_t aux;
			char *p;
			for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
				if (k >= 9) {
					kputsn(p, aux.p - p, &smpl);
					kputc('\0', &smpl);
				}
			}
			break;
		}
	}
	kputc('\0', &meta);
	h->name = 0;
	h->sname = smpl.s; h->l_smpl = smpl.l;
	h->txt = meta.s; h->l_txt = meta.l;
	bcf_hdr_sync(h);
	return h;
}
コード例 #7
0
ファイル: bam_plcmd.c プロジェクト: 9beckert/TIR
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
	extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
	extern void bcf_call_del_rghash(void *rghash);
	mplp_aux_t **data;
	int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
	const bam_pileup1_t **plp;
	bam_mplp_t iter;
	bam_header_t *h = 0;
	char *ref;
	void *rghash = 0;

	bcf_callaux_t *bca = 0;
	bcf_callret1_t *bcr = 0;
	bcf_call_t bc;
	bcf_t *bp = 0;
	bcf_hdr_t *bh = 0;

	bam_sample_t *sm = 0;
	kstring_t buf;
	mplp_pileup_t gplp;

	memset(&gplp, 0, sizeof(mplp_pileup_t));
	memset(&buf, 0, sizeof(kstring_t));
	memset(&bc, 0, sizeof(bcf_call_t));
	data = calloc(n, sizeof(void*));
	plp = calloc(n, sizeof(void*));
	n_plp = calloc(n, sizeof(int*));
	sm = bam_smpl_init();

	// read the header and initialize data
	for (i = 0; i < n; ++i) {
		bam_header_t *h_tmp;
		data[i] = calloc(1, sizeof(mplp_aux_t));
		data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
		data[i]->conf = conf;
		h_tmp = bam_header_read(data[i]->fp);
		data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
		bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
		rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
		if (conf->reg) {
			int beg, end;
			bam_index_t *idx;
			idx = bam_index_load(fn[i]);
			if (idx == 0) {
				fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
				fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
			data[i]->iter = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
		if (i == 0) h = h_tmp;
		else {
			// FIXME: to check consistency
			bam_header_destroy(h_tmp);
		}
	}
	gplp.n = sm->n;
	gplp.n_plp = calloc(sm->n, sizeof(int));
	gplp.m_plp = calloc(sm->n, sizeof(int));
	gplp.plp = calloc(sm->n, sizeof(void*));

	fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
	// write the VCF header
	if (conf->flag & MPLP_GLF) {
		kstring_t s;
		bh = calloc(1, sizeof(bcf_hdr_t));
		s.l = s.m = 0; s.s = 0;
		bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
		for (i = 0; i < h->n_targets; ++i) {
			kputs(h->target_name[i], &s);
			kputc('\0', &s);
		}
		bh->l_nm = s.l;
		bh->name = malloc(s.l);
		memcpy(bh->name, s.s, s.l);
		s.l = 0;
		for (i = 0; i < sm->n; ++i) {
			kputs(sm->smpl[i], &s); kputc('\0', &s);
		}
		bh->l_smpl = s.l;
		bh->sname = malloc(s.l);
		memcpy(bh->sname, s.s, s.l);
		bh->txt = malloc(strlen(BAM_VERSION) + 64);
		bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
		free(s.s);
		bcf_hdr_sync(bh);
		bcf_hdr_write(bp, bh);
		bca = bcf_call_init(-1., conf->min_baseQ);
		bcr = calloc(sm->n, sizeof(bcf_callret1_t));
		bca->rghash = rghash;
		bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
		bca->min_frac = conf->min_frac;
		bca->min_support = conf->min_support;
	}
	if (tid0 >= 0 && conf->fai) { // region is set
		ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
		ref_tid = tid0;
		for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
	} else ref_tid = -1, ref = 0;
	iter = bam_mplp_init(n, mplp_func, (void**)data);
	max_depth = conf->max_depth;
	if (max_depth * sm->n > 1<<20)
		fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
	if (max_depth * sm->n < 8000) {
		max_depth = 8000 / sm->n;
		fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
	}
	max_indel_depth = conf->max_indel_depth * sm->n;
	bam_mplp_set_maxcnt(iter, max_depth);
	while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
		if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
		if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
		if (tid != ref_tid) {
			free(ref); ref = 0;
			if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
			for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
			ref_tid = tid;
		}
		if (conf->flag & MPLP_GLF) {
			int total_depth, _ref0, ref16;
			bcf1_t *b = calloc(1, sizeof(bcf1_t));
			for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
			group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
			_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
			ref16 = bam_nt16_table[_ref0];
			for (i = 0; i < gplp.n; ++i)
				bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
			bcf_call_combine(gplp.n, bcr, ref16, &bc);
			bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
						 (conf->flag&MPLP_FMT_SP), 0, 0);
			bcf_write(bp, bh, b);
			bcf_destroy(b);
			// call indels
			if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
				for (i = 0; i < gplp.n; ++i)
					bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
				if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
					b = calloc(1, sizeof(bcf1_t));
					bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
								 (conf->flag&MPLP_FMT_SP), bca, ref);
					bcf_write(bp, bh, b);
					bcf_destroy(b);
				}
			}
		} else {
			printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
			for (i = 0; i < n; ++i) {
				int j;
				printf("\t%d\t", n_plp[i]);
				if (n_plp[i] == 0) {
					printf("*\t*"); // FIXME: printf() is very slow...
					if (conf->flag & MPLP_PRINT_POS) printf("\t*");
				} else {
					for (j = 0; j < n_plp[i]; ++j)
						pileup_seq(plp[i] + j, pos, ref_len, ref);
					putchar('\t');
					for (j = 0; j < n_plp[i]; ++j) {
						const bam_pileup1_t *p = plp[i] + j;
						int c = bam1_qual(p->b)[p->qpos] + 33;
						if (c > 126) c = 126;
						putchar(c);
					}
					if (conf->flag & MPLP_PRINT_MAPQ) {
						putchar('\t');
						for (j = 0; j < n_plp[i]; ++j) {
							int c = plp[i][j].b->core.qual + 33;
							if (c > 126) c = 126;
							putchar(c);
						}
					}
					if (conf->flag & MPLP_PRINT_POS) {
						putchar('\t');
						for (j = 0; j < n_plp[i]; ++j) {
							if (j > 0) putchar(',');
							printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
						}
					}
				}
			}
			putchar('\n');
		}
	}

	bcf_close(bp);
	bam_smpl_destroy(sm); free(buf.s);
	for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
	free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
	bcf_call_del_rghash(rghash);
	bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
	bam_mplp_destroy(iter);
	bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(plp); free(ref); free(n_plp);
	return 0;
}
コード例 #8
0
int ingest1(const char *input,const char *output,char *ref,bool exit_on_mismatch=true) {
  cerr << "Input: " << input << "\tOutput: "<<output<<endl;

  kstream_t *ks;
  kstring_t str = {0,0,0};    
  gzFile fp = gzopen(input, "r");
  VarBuffer vbuf(1000);
  int prev_rid = -1;
  if(fp==NULL) {
    fprintf(stderr,"problem opening %s\n",input);
    exit(1);
  }

  char *out_fname = (char *)malloc(strlen(output)+5);
  strcpy(out_fname,output);
  strcat(out_fname,".tmp");
  if(fileexists(out_fname)) {
    fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname);
    exit(1);
  }
  printf("depth: %s\n",out_fname);
  gzFile depth_fp = gzopen(out_fname, "wb1");
  strcpy(out_fname,output);
  strcat(out_fname,".bcf");
  if(fileexists(out_fname)) {
    fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname);
    exit(1);
  }
  printf("variants: %s\n",out_fname);
  htsFile *variant_fp=hts_open(out_fname,"wb1");
  if(variant_fp==NULL) {
    fprintf(stderr,"problem opening %s\n",input);
    exit(1);    
  }

  ks = ks_init(fp);
  htsFile *hfp=hts_open(input, "r");
  bcf_hdr_t *hdr_in =  bcf_hdr_read(hfp);
  hts_close(hfp);
  //this is a hack to fix gvcfs where AD is incorrectly defined in the header. (vcf4.2 does not technically allow Number=R)
  bcf_hdr_remove(hdr_in,BCF_HL_FMT,"AD");
  assert(  bcf_hdr_append(hdr_in,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed. For indels this value only includes reads which confidently support each allele (posterior prob 0.999 or higher that read contains indicated allele vs all other intersecting indel alleles)\">") == 0);

  //this is a hack to fix broken gvcfs where GQ is incorrectly labelled as float (v4.3 spec says it should be integer)
  bcf_hdr_remove(hdr_in,BCF_HL_FMT,"GQ");
  assert(  bcf_hdr_append(hdr_in,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">") == 0);


  //  bcf_hdr_t  *hdr_out=hdr_in;
  bcf_hdr_t *hdr_out =  bcf_hdr_dup(hdr_in);
  remove_hdr_lines(hdr_out,BCF_HL_INFO);
  remove_hdr_lines(hdr_out,BCF_HL_FLT);
  bcf_hdr_sync(hdr_out);

  //here we add FORMAT/PF. which is the pass filter flag for alts.
  assert(  bcf_hdr_append(hdr_out,"##FORMAT=<ID=PF,Number=A,Type=Integer,Description=\"variant was PASS filter in original sample gvcf\">") == 0);

  args_t *norm_args = init_vcfnorm(hdr_out,ref);
  norm_args->check_ref |= CHECK_REF_WARN;
  bcf1_t *bcf_rec = bcf_init();
  bcf_hdr_write(variant_fp, hdr_out);
  kstring_t work1 = {0,0,0};            
  int buf[5];
  ks_tokaux_t aux;
  int ndec=0;
  int ref_len,alt_len;
  while(    ks_getuntil(ks, '\n', &str, 0) >=0) {
    //    fprintf(stderr,"%s\n",str.s);
    if(str.s[0]!='#')  {
      char *ptr = kstrtok(str.s,"\t",&aux);//chrom
      ptr = kstrtok(NULL,NULL,&aux);//pos
      work1.l=0;
      kputsn(str.s,ptr-str.s-1, &work1);   
      buf[0] =  bcf_hdr_name2id(hdr_in, work1.s);
      assert(      buf[0]>=0);
      buf[1]=atoi(ptr)-1;
      ptr = kstrtok(NULL,NULL,&aux);//ID
      ptr = kstrtok(NULL,NULL,&aux);//REF

      ref_len=0;
      while(ptr[ref_len]!='\t') ref_len++;

      ptr = kstrtok(NULL,NULL,&aux);//ALT

      bool is_variant=false;
      alt_len=0;
      while(ptr[alt_len]!='\t') alt_len++;
      if(ptr[0]!='.') 
	is_variant=true;

      char * QUAL_ptr = kstrtok(NULL, NULL, &aux);
      assert (QUAL_ptr != NULL);
      
      for(int i=0;i<2;i++)  ptr = kstrtok(NULL,NULL,&aux);// gets us to INFO

      //find END if it is there
      char *end_ptr=strstr(ptr,"END=") ;
      if(end_ptr!=NULL) 
	buf[2]=atoi(end_ptr+4)-1;
      else
	buf[2]=buf[1]+alt_len-1;

      ptr  = kstrtok(NULL,NULL,&aux);//FORMAT
      //find index of DP (if present)
      //if not present, dont output anything (indels ignored)

      char *DP_ptr = find_format(ptr,"DP");
      int GQX = 0;
      int QUAL = 0;

      // AH: change code to use the minimum of GQ and QUAL fields if
      // GQX is not defined. See here:
      // https://support.basespace.illumina.com/knowledgebase/articles/144844-vcf-file
      // "GQXGenotype quality. GQX is the minimum of the GQ value
      // and the QUAL column. In general, these are similar values;
      // taking the minimum makes GQX the more conservative measure of
      // genotype quality."
      if(DP_ptr!=NULL) {
	buf[3]=atoi(DP_ptr);
	char *GQX_ptr = find_format(ptr,"GQX");
	if (GQX_ptr == NULL) 
	  {
	    GQX_ptr = find_format(ptr,"GQ");
	    GQX = atoi(GQX_ptr);
	    if (QUAL_ptr[0] != '.') 
	      {
		QUAL = atoi(QUAL_ptr);
		if (QUAL < GQX)
		  GQX = QUAL;
	      }
	  }
	else
	  {
	    GQX = atoi(GQX_ptr);
	  }
	
	//trying to reduce entropy on GQ to get better compression performance.
	//1. rounds down to nearest 10. 
	//2. sets gq to min(gq,100). 
	buf[4]=GQX/10;
	buf[4]*=10;
	if(buf[4]>100) buf[4]=100;

	//	printf("%d\t%d\t%d\t%d\t%d\n",buf[0],buf[1],buf[2],buf[3],buf[4]);
	if(gzwrite(depth_fp,buf,5*sizeof(int))!=(5*sizeof(int)))
	  die("ERROR: problem writing "+(string)out_fname+".tmp");
      }
      if(is_variant) {//wass this a variant? if so write it out to the bcf
	norm_args->ntotal++;
	vcf_parse(&str,hdr_in,bcf_rec);
	//	cerr<<bcf_rec->rid<<":"<<bcf_rec->pos<<endl;
	if(prev_rid!=bcf_rec->rid) 
	  vbuf.flush(variant_fp,hdr_out);
	else
	  vbuf.flush(bcf_rec->pos,variant_fp,hdr_out);
	prev_rid=bcf_rec->rid;
	int32_t pass = bcf_has_filter(hdr_in, bcf_rec, ".");
	bcf_update_format_int32(hdr_out,bcf_rec,"PF",&pass,1);
	bcf_update_filter(hdr_out,bcf_rec,NULL,0);
	if(bcf_rec->n_allele>2) {//split multi-allelics (using vcfnorm.c from bcftools1.3
	  norm_args->nsplit++;
	  split_multiallelic_to_biallelics(norm_args,bcf_rec );
	  for(int i=0;i<norm_args->ntmp_lines;i++){
	    remove_info(norm_args->tmp_lines[i]);
	    if(realign(norm_args,norm_args->tmp_lines[i]) != ERR_REF_MISMATCH)
	      ndec+=decompose(norm_args->tmp_lines[i],hdr_out,vbuf);
	    else
	      if(exit_on_mismatch)
		die("vcf did not match the reference");
	      else
		norm_args->nskipped++;
	  }
	}
	else {
	  remove_info(bcf_rec);
	  if( realign(norm_args,bcf_rec) !=  ERR_REF_MISMATCH)
	    ndec+=decompose(bcf_rec,hdr_out,vbuf);
	  else
	    if(exit_on_mismatch)
	      die("vcf did not match the reference");
	    else
	      norm_args->nskipped++;
	}
	vbuf.flush(bcf_rec->pos,variant_fp,hdr_out);
      }
    }
  }
  vbuf.flush(variant_fp,hdr_out);
  bcf_hdr_destroy(hdr_in);
  bcf_hdr_destroy(hdr_out);
  bcf_destroy1(bcf_rec);
  ks_destroy(ks);
  gzclose(fp);
  gzclose(depth_fp);  
  free(str.s);
  free(work1.s);
  hts_close(variant_fp);
  destroy_data(norm_args);
  fprintf(stderr,"Variant lines   total/split/realigned/skipped:\t%d/%d/%d/%d\n", norm_args->ntotal,norm_args->nsplit,norm_args->nchanged,norm_args->nskipped);
  fprintf(stderr,"Decomposed %d MNPs\n", ndec);


  fprintf(stderr,"Indexing %s\n",out_fname);
  bcf_index_build(out_fname, BCF_LIDX_SHIFT);
  free(out_fname);
  return 0;
}
コード例 #9
0
ファイル: vcf.c プロジェクト: goshng/cocoa
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
{
	if (!fp->is_bin) {
		kstring_t txt, *s = &fp->line;
		bcf_hdr_t *h;
		h = bcf_hdr_init();
		txt.l = txt.m = 0; txt.s = 0;
		while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
			if (s->l == 0) continue;
			if (s->s[0] != '#') {
				if (hts_verbose >= 2)
					fprintf(stderr, "[E::%s] no sample line\n", __func__);
				free(txt.s);
				bcf_hdr_destroy(h);
				return 0;
			}
			if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
				int dret;
				gzFile f;
				kstream_t *ks;
				kstring_t tmp;
				tmp.l = tmp.m = 0; tmp.s = 0;
				f = gzopen(fp->fn_aux, "r");
				ks = ks_init(f);
				while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
					int c;
					kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
					ks_getuntil(ks, 0, &tmp, &dret);
					kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
					kputsn(">\n", 2, &txt);
					if (dret != '\n')
						while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
				}
				free(tmp.s);
				ks_destroy(ks);
				gzclose(f);
			}
			kputsn(s->s, s->l, &txt);
			if (s->s[1] != '#') break;
			kputc('\n', &txt);
		}
		h->l_text = txt.l + 1; // including NULL
		h->text = txt.s;
		bcf_hdr_parse(h);
        // check tabix index, are all contigs listed in the header? add the missing ones
        tbx_t *idx = tbx_index_load(fp->fn);
        if ( idx )
        {
			int i, n, need_sync = 0;
			const char **names = tbx_seqnames(idx, &n);
			for (i=0; i<n; i++)
			{
                bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]);
                if ( hrec ) continue;
                hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
                hrec->key = strdup("contig");
                bcf_hrec_add_key(hrec, "ID", strlen("ID"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
                bcf_hrec_add_key(hrec, "length", strlen("length"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0);   // what is a good default value?
                bcf_hdr_add_hrec(h, hrec);
                need_sync = 1;
			}
			free(names);
			tbx_destroy(idx);
            if ( need_sync )
            {
                bcf_hdr_sync(h);
                bcf_hdr_fmt_text(h);
            }
		}
		return h;
	} else return bcf_hdr_read((BGZF*)fp->fp);
}
コード例 #10
0
ファイル: vcf.c プロジェクト: 9beckert/TIR
int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
{
	int dret, k, i, sync = 0;
	vcf_t *v = (vcf_t*)bp->v;
	char *p, *q;
	kstring_t str, rn;
	ks_tokaux_t aux, a2;
	if (!bp->is_vcf) return bcf_read(bp, h, b);
	v->line.l = 0;
	str.l = 0; str.m = b->m_str; str.s = b->str;
	rn.l = rn.m = h->l_nm; rn.s = h->name;
	if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1;
	b->n_smpl = h->n_smpl;
	for (p = kstrtok(v->line.s, "\t", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
		*(char*)aux.p = 0;
		if (k == 0) { // ref
			int tid = bcf_str2id(v->refhash, p);
			if (tid < 0) {
				tid = bcf_str2id_add(v->refhash, strdup(p));
				kputs(p, &rn); kputc('\0', &rn);
				sync = 1;
			}
			b->tid = tid;
		} else if (k == 1) { // pos
			b->pos = atoi(p) - 1;
		} else if (k == 5) { // qual
			b->qual = (p[0] >= '0' && p[0] <= '9')? atof(p) : 0;
		} else if (k <= 8) { // variable length strings
			kputs(p, &str); kputc('\0', &str);
			b->l_str = str.l; b->m_str = str.m; b->str = str.s;
			if (k == 8) bcf_sync(b);
		} else { // k > 9
			if (strncmp(p, "./.", 3) == 0) {
				for (i = 0; i < b->n_gi; ++i) {
					if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
						((uint8_t*)b->gi[i].data)[k-9] = 1<<7;
					} else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
						((uint8_t*)b->gi[i].data)[k-9] = 0;
					} else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
						((int32_t*)b->gi[i].data)[k-9] = 0;
					} else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
						((uint16_t*)b->gi[i].data)[k-9] = 0;
					} else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
						int y = b->n_alleles * (b->n_alleles + 1) / 2;
						memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y);
					} else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
						int y = b->n_alleles * (b->n_alleles + 1) / 2;
						memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4);
					}
				}
				goto endblock;
			}
			for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) {
				if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
					((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6;
				} else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
					double _x = strtod(q, &q);
					int x = (int)(_x + .499);
					if (x > 255) x = 255;
					((uint8_t*)b->gi[i].data)[k-9] = x;
				} else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
					int x = strtol(q, &q, 10);
					if (x > 0xffff) x = 0xffff;
					((uint32_t*)b->gi[i].data)[k-9] = x;
				} else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
					int x = strtol(q, &q, 10);
					if (x > 0xffff) x = 0xffff;
					((uint16_t*)b->gi[i].data)[k-9] = x;
				} else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
					int x, y, j;
					uint8_t *data = (uint8_t*)b->gi[i].data;
					y = b->n_alleles * (b->n_alleles + 1) / 2;
					for (j = 0; j < y; ++j) {
						x = strtol(q, &q, 10);
						if (x > 255) x = 255;
						data[(k-9) * y + j] = x;
						++q;
					}
				} else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
					int j, y;
					float x, *data = (float*)b->gi[i].data;
					y = b->n_alleles * (b->n_alleles + 1) / 2;
					for (j = 0; j < y; ++j) {
						x = strtod(q, &q);
						data[(k-9) * y + j] = x > 0? -x/10. : x;
						++q;
					}
				}
			}
		endblock: i = i;
		}
	}
	h->l_nm = rn.l; h->name = rn.s;
	if (sync) bcf_hdr_sync(h);
	return v->line.l + 1;
}
コード例 #11
0
ファイル: bam_plcmd.c プロジェクト: mwilkers/unceqr
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
	extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
	extern void bcf_call_del_rghash(void *rghash);
	mplp_aux_t **data;
	int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
	const bam_pileup1_t **plp;
	bam_mplp_t iter;
	bam_header_t *h = 0;
	char *ref;
	void *rghash = 0;

	bcf_callaux_t *bca = 0;
	bcf_callret1_t *bcr = 0;
	bcf_call_t bc;
	bcf_t *bp = 0;
	bcf_hdr_t *bh = 0;

	bam_sample_t *sm = 0;
	kstring_t buf;
	mplp_pileup_t gplp;

	memset(&gplp, 0, sizeof(mplp_pileup_t));
	memset(&buf, 0, sizeof(kstring_t));
	memset(&bc, 0, sizeof(bcf_call_t));
	data = calloc(n, sizeof(void*));
	plp = calloc(n, sizeof(void*));
	n_plp = calloc(n, sizeof(int*));
	sm = bam_smpl_init();

	// read the header and initialize data
	for (i = 0; i < n; ++i) {
		bam_header_t *h_tmp;
		data[i] = calloc(1, sizeof(mplp_aux_t));
		data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
		data[i]->conf = conf;
		h_tmp = bam_header_read(data[i]->fp);
		data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
		bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
		rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
		if (conf->reg) {
			int beg, end;
			bam_index_t *idx;
			idx = bam_index_load(fn[i]);
			if (idx == 0) {
				fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
				fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
			data[i]->iter = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
		if (i == 0) h = h_tmp;
		else {
			// FIXME: to check consistency
			bam_header_destroy(h_tmp);
		}
	}
	gplp.n = sm->n;
	gplp.n_plp = calloc(sm->n, sizeof(int));
	gplp.m_plp = calloc(sm->n, sizeof(int));
	gplp.plp = calloc(sm->n, sizeof(void*));

	fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
	// write the VCF header
	if (conf->flag & MPLP_GLF) {
		kstring_t s;
		bh = calloc(1, sizeof(bcf_hdr_t));
		s.l = s.m = 0; s.s = 0;
		bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
		for (i = 0; i < h->n_targets; ++i) {
			kputs(h->target_name[i], &s);
			kputc('\0', &s);
		}
		bh->l_nm = s.l;
		bh->name = malloc(s.l);
		memcpy(bh->name, s.s, s.l);
		s.l = 0;
		for (i = 0; i < sm->n; ++i) {
			kputs(sm->smpl[i], &s); kputc('\0', &s);
		}
		bh->l_smpl = s.l;
		bh->sname = malloc(s.l);
		memcpy(bh->sname, s.s, s.l);
		bh->txt = malloc(strlen(BAM_VERSION) + 64);
		bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
		free(s.s);
		bcf_hdr_sync(bh);
		bcf_hdr_write(bp, bh);
		bca = bcf_call_init(-1., conf->min_baseQ);
		bcr = calloc(sm->n, sizeof(bcf_callret1_t));
		bca->rghash = rghash;
		bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
		bca->min_frac = conf->min_frac;
		bca->min_support = conf->min_support;
	}
	if (tid0 >= 0 && conf->fai) { // region is set
		ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
		ref_tid = tid0;
		for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
	} else ref_tid = -1, ref = 0;
	iter = bam_mplp_init(n, mplp_func, (void**)data);
	max_depth = conf->max_depth;
	if (max_depth * sm->n > 1<<20)
		fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
	if (max_depth * sm->n < 8000) {
		max_depth = 8000 / sm->n;
		fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
	}
	max_indel_depth = conf->max_indel_depth * sm->n;
	bam_mplp_set_maxcnt(iter, max_depth);


	int storeSize = 100;

	int delStore[2][100] = {{0},{0}};

	typedef char * mstring;

	while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
		if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
		if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
		if (tid != ref_tid) {
			free(ref); ref = 0;
			if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
			for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
			ref_tid = tid;
		}
		if (conf->flag & MPLP_GLF) {
			int total_depth, _ref0, ref16;
			bcf1_t *b = calloc(1, sizeof(bcf1_t));
			for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
			group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
			_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
			ref16 = bam_nt16_table[_ref0];
			for (i = 0; i < gplp.n; ++i)
				bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
			bcf_call_combine(gplp.n, bcr, ref16, &bc);
			bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
						 (conf->flag&MPLP_FMT_SP), 0, 0);
			bcf_write(bp, bh, b);
			bcf_destroy(b);
			// call indels
			if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
				for (i = 0; i < gplp.n; ++i)
					bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
				if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
					b = calloc(1, sizeof(bcf1_t));
					bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
								 (conf->flag&MPLP_FMT_SP), bca, ref);
					bcf_write(bp, bh, b);
					bcf_destroy(b);
				}
			}
		} else {
			printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
			for (i = 0; i < n; ++i) {
				int j;
				printf("\t%d\t", n_plp[i]);
				if (n_plp[i] == 0) {
					printf("*\t*"); // FIXME: printf() is very slow...
					if (conf->flag & MPLP_PRINT_POS) printf("\t*");
				} else {
					//MDW start					
					//for each position in the pileup column
					int charLen = 16;
					int countChars[ charLen ][2];
					int countiChars[ charLen ][2];

					int countGap[2]={0,0};

					//double qvTotal=0;
					int numStruck=0;
					int numGood=0;
					int tti;
					int ttj;
					mstring insAllele[100];
					int insAlleleCnt[100];
					int sf=0;
					int flag=0;

					//typedef char * string;
					char insStr0[10000];
					int iCnt0=0;

					char insStr1[10000];
					int iCnt1=0;

					char delStr0[10000];
					int dCnt0=0;

					char delStr1[10000];
					int dCnt1=0;


					float qposP[10000];
					int qposCnt=0;



					//initialize with zeros
						for(tti=0;tti<charLen;tti++){
						  countChars[tti][0]=0;
						  countChars[tti][1]=0;
						}

					// define repeat length here; look back up to 10 prior positions
					// start one position away.
					int replC=0; //
					for(tti=1;tti<=15;tti++){
						// check for greater than zero
						if(toupper(ref[pos-1])==toupper(ref[pos-tti])){
							replC++;
						}else{ // breaks the chain at first non identical to current position not strict homopolymer
							break;
						}
					}					
					int reprC=0; // 
					for(tti=1;tti<=15;tti++){
						// check for greater than zero
						if(toupper(ref[pos+1])==toupper(ref[pos+tti])){
							reprC++;
						}else{ // breaks the chain at first non identical to current position not strict homopolymer
							break;
						}
					}		
					int repT = replC;
					if(replC < reprC){
						repT=reprC;
					}



					for (j = 0; j < n_plp[i]; ++j){
						const bam_pileup1_t *p = plp[i] + j;
									
						/*
						SAME LOGIC AS pileup_seq()
						*/

						if(p->is_refskip){ // never count intron gaps in numStruck
							continue;
						}

						if(p->is_del){ // skip deletion gap, after first position which is the first aligned char
							continue;
						}

						if( 	p->b->core.qual < conf->min_mqToCount  || // mapping quality
							conf->maxrepC < (repT) || // max homopolymer run, this will not 
							(!p->is_del && bam1_qual(p->b)[p->qpos] < conf->min_baseQ) || // base quality for matches
							p->alignedQPosBeg <= (conf->trimEnd ) || p->alignedQPosEnd <= (conf->trimEnd ) ||  // trimEnd is 1-based
							p->zf == 1 || // fusion tag
							p->ih > conf->maxIH  || // max hit index
							(p->nmd > conf->maxNM) || // max mismatch
							(conf->flagFilter == 1 && !(p->b->core.flag&BAM_FPROPER_PAIR)) || // optionally keep only proper pairs
							(conf->flagFilter == 2 && p->b->core.flag&BAM_FSECONDARY) || // optionally strike secondary
							(conf->flagFilter == 3 && p->b->core.flag&BAM_FDUP) || // optionally strike dup
							(conf->flagFilter == 4 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY))  || // optionally strike secondary or dup
							(conf->flagFilter == 5 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY || p->b->core.flag&BAM_FQCFAIL || !(p->b->core.flag&BAM_FPROPER_PAIR) ))   // optionally strike secondary, dup and QCfail


						){
							numStruck++;
							continue;
						}

						
						//printf("repT=%d: %d %c %c %c %c \n",repT,p->indel,ref[pos],ref[pos-1],ref[pos-2],ref[pos-3]);


						if(!p->is_del && p->indel==0){
  						  countChars[ bam1_seqi(bam1_seq(p->b), p->qpos) ][ bam1_strand(p->b) ] ++;
						  numGood++;			

						}else if(p->is_refskip){
						  countGap[ bam1_strand(p->b) ]++;
						}
						
						if(p->indel<0){
    						  numGood++;			
						  if(bam1_strand(p->b) ==0){
							  for(tti=1;tti<= -p->indel; tti++) {
							    // current spot, starting at 0 in store, because indel<0 refers to next position
							   delStr0[dCnt0] =  ref[pos+tti];
							   dCnt0++;
							  }	
							  delStr0[dCnt0] = ',';
							  dCnt0++;
						  }else{
							  for(tti=1;tti<= -p->indel; tti++) {
							    // current spot, starting at 0 in store, because indel<0 refers to next position
							   delStr1[dCnt1] = ref[pos+tti];
							   dCnt1++;
							  }	
							  delStr1[dCnt1] = ',';
							  dCnt1++;
						  }



						}else if(p->indel>0){
						  numGood++;			

						  if(bam1_strand(p->b) ==0){
							  for(tti=1;tti<= p->indel; tti++) {
							    // current spot, starting at 0 in store, because indel<0 refers to next position
							   insStr0[iCnt0] = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)];
							   iCnt0++;
							  }	
							  insStr0[iCnt0] = ',';
							  iCnt0++;
						  }else{
							  for(tti=1;tti<= p->indel; tti++) {
							    // current spot, starting at 0 in store, because indel<0 refers to next position
							   insStr1[iCnt1] = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)];
							   iCnt1++;
							  }	
							  insStr1[iCnt1] = ',';
							  iCnt1++;
						  }


						}
						//calculate position of variant within aligned read - no soft clips
						if( toupper(ref[pos]) != toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]) || p->indel>0 || p->indel<0  ){

						//distance to end; calculate distance to end of aligned read.  removes soft clips.
						int distToEnd = (p->alignedQPosBeg < p->alignedQPosEnd) ? p->alignedQPosBeg : p->alignedQPosEnd;
						qposP[qposCnt] = distToEnd;						  
						qposCnt++;	
						// printf("id=%s, pos=%d",bam1_qname(p->b),distToEnd);
						}	
					}

					//

					//print A,C,G,T, by +/-
				        printf("\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d", 	countChars[1][0],countChars[1][1],
											countChars[2][0],countChars[2][1],
											countChars[4][0],countChars[4][1],
											countChars[8][0],countChars[8][1],
											countChars[7][0],countChars[7][1]);
					
					putchar('\t');
					for(tti=0;tti<dCnt0;tti++){
					  putchar(delStr0[tti]);
					}

					putchar('\t');
					for(tti=0;tti<dCnt1;tti++){
					  putchar(delStr1[tti]);
					}

					putchar('\t');
					for(tti=0;tti<iCnt0;tti++){
					  putchar(insStr0[tti]);
					}

					putchar('\t');
					for(tti=0;tti<iCnt1;tti++){
					  putchar(insStr1[tti]);
					}

					printf("\t%d\t%d",numGood,numStruck);					

					// get non-ref qpos variation

					float medqpos = -1;
					float medAbsDev = -1;
					if(qposCnt>0){
					  medqpos = median(qposCnt,qposP);
					  float absDev[qposCnt];
					  for(tti=0;tti<qposCnt;tti++){
						absDev[tti] = abs(medqpos - qposP[tti]);
					  }
					  medAbsDev = median(qposCnt-1,absDev);
					}
					printf("\t%f",medAbsDev);

					///END MDW
				}



			}
			putchar('\n');
		}
	}

	bcf_close(bp);
	bam_smpl_destroy(sm); free(buf.s);
	for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
	free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
	bcf_call_del_rghash(rghash);
	bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
	bam_mplp_destroy(iter);
	bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(plp); free(ref); free(n_plp);
	return 0;
}
コード例 #12
0
ファイル: test-bcf-translate.c プロジェクト: atks/vt
int main(int argc, char **argv)
{
    char *fname = argc>1 ? argv[1] : "/dev/null";
    htsFile *fp = hts_open(fname, "w");
    bcf_hdr_t *hdr1, *hdr2;

    hdr1 = bcf_hdr_init("w");
    hdr2 = bcf_hdr_init("w");

    // Add two shared and two private annotations
    bcf_hdr_append(hdr1, "##contig=<ID=1>");
    bcf_hdr_append(hdr1, "##contig=<ID=2>");
    bcf_hdr_append(hdr2, "##contig=<ID=2>");
    bcf_hdr_append(hdr2, "##contig=<ID=1>");
    bcf_hdr_append(hdr1, "##FILTER=<ID=FLT1,Description=\"Filter 1\">");
    bcf_hdr_append(hdr1, "##FILTER=<ID=FLT2,Description=\"Filter 2\">");
    bcf_hdr_append(hdr1, "##FILTER=<ID=FLT3,Description=\"Filter 3\">");
    bcf_hdr_append(hdr2, "##FILTER=<ID=FLT4,Description=\"Filter 4\">");
    bcf_hdr_append(hdr2, "##FILTER=<ID=FLT3,Description=\"Filter 3\">");
    bcf_hdr_append(hdr2, "##FILTER=<ID=FLT2,Description=\"Filter 2\">");
    bcf_hdr_append(hdr1, "##INFO=<ID=INF1,Number=.,Type=Integer,Description=\"Info 1\">");
    bcf_hdr_append(hdr1, "##INFO=<ID=INF2,Number=.,Type=Integer,Description=\"Info 2\">");
    bcf_hdr_append(hdr1, "##INFO=<ID=INF3,Number=.,Type=Integer,Description=\"Info 3\">");
    bcf_hdr_append(hdr2, "##INFO=<ID=INF4,Number=.,Type=Integer,Description=\"Info 4\">");
    bcf_hdr_append(hdr2, "##INFO=<ID=INF3,Number=.,Type=Integer,Description=\"Info 3\">");
    bcf_hdr_append(hdr2, "##INFO=<ID=INF2,Number=.,Type=Integer,Description=\"Info 2\">");
    bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT1,Number=.,Type=Integer,Description=\"FMT 1\">");
    bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT2,Number=.,Type=Integer,Description=\"FMT 2\">");
    bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT3,Number=.,Type=Integer,Description=\"FMT 3\">");
    bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT4,Number=.,Type=Integer,Description=\"FMT 4\">");
    bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT3,Number=.,Type=Integer,Description=\"FMT 3\">");
    bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT2,Number=.,Type=Integer,Description=\"FMT 2\">");
    bcf_hdr_add_sample(hdr1,"SMPL1");
    bcf_hdr_add_sample(hdr1,"SMPL2");
    bcf_hdr_add_sample(hdr2,"SMPL1");
    bcf_hdr_add_sample(hdr2,"SMPL2");
    bcf_hdr_sync(hdr1);
    bcf_hdr_sync(hdr2);

    hdr2 = bcf_hdr_merge(hdr2,hdr1);
    bcf_hdr_sync(hdr2);
    if ( bcf_hdr_write(fp, hdr2)!=0 ) error("Failed to write to %s\n", fname);

    bcf1_t *rec = bcf_init1();
    rec->rid = bcf_hdr_name2id(hdr1, "1");
    rec->pos = 0;
    bcf_update_alleles_str(hdr1, rec, "G,A");
    int32_t tmpi[3];
    tmpi[0] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT1");
    tmpi[1] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT2");
    tmpi[2] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT3");
    bcf_update_filter(hdr1, rec, tmpi, 3);
    tmpi[0] = 1; bcf_update_info_int32(hdr1, rec, "INF1", tmpi, 1);
    tmpi[0] = 2; bcf_update_info_int32(hdr1, rec, "INF2", tmpi, 1);
    tmpi[0] = 3; bcf_update_info_int32(hdr1, rec, "INF3", tmpi, 1);
    tmpi[0] = tmpi[1] = 1; bcf_update_format_int32(hdr1, rec, "FMT1", tmpi, 2);
    tmpi[0] = tmpi[1] = 2; bcf_update_format_int32(hdr1, rec, "FMT2", tmpi, 2);
    tmpi[0] = tmpi[1] = 3; bcf_update_format_int32(hdr1, rec, "FMT3", tmpi, 2);

    bcf_remove_filter(hdr1, rec, bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT2"), 0);
    bcf_update_info_int32(hdr1, rec, "INF2", NULL, 0);
    bcf_update_format_int32(hdr1, rec, "FMT2", NULL, 0);

    bcf_translate(hdr2, hdr1, rec);
    if ( bcf_write(fp, hdr2, rec)!=0 ) error("Faild to write to %s\n", fname);

    // Clean
    bcf_destroy1(rec);
    bcf_hdr_destroy(hdr1);
    bcf_hdr_destroy(hdr2);
    int ret;
    if ( (ret=hts_close(fp)) )
    {
        fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
        exit(ret);
    }
    return 0;
}
コード例 #13
0
ファイル: anno_bed.c プロジェクト: shiquan/vcfanno
int beds_database_add(struct beds_options *opts, const char *fname, char *columns)
{
    if ( opts->n_files == opts->m_files ) {
	opts->m_files = opts->m_files == 0 ? 2 : opts->m_files +2;
	opts->files = (struct beds_anno_file*)realloc(opts->files, opts->m_files*sizeof(struct beds_anno_file));	
    }
    struct beds_anno_file *file = &opts->files[opts->n_files];
    memset(file, 0, sizeof(struct beds_anno_file));
    file->id = opts->n_files;
    file->fname = strdup(fname);
    file->fp = hts_open(fname, "r");
    if (file->fp == NULL)
	error("Failed to open %s : %s", fname, strerror(errno));
    // int n;
    file->idx = tbx_index_load(fname);
    if ( file->idx == NULL)
	error("Failed to load index of %s.", fname);
    opts->n_files++;
    
    file->last_id = -1;
    file->last_start = -1;
    file->last_end = -1;
    kstring_t string = KSTRING_INIT;
    int no_columns = 0;
    int i;
    if ( columns == NULL && file->no_such_chrom == 0) {
	warnings("No columns string specified for %s. Will annotate all tags in this data.", fname);
        file->no_such_chrom = 1;
	no_columns = 1;
    } else {
	int *splits = NULL;
	kputs(columns, &string);
	int nfields;
	splits = ksplit(&string, ',', &nfields);
	file->m_cols = nfields;
	file->cols = (struct anno_col*)malloc(sizeof(struct anno_col) * file->m_cols);

	for ( i = 0; i < nfields; ++i ) {
	    char *ss = string.s + splits[i];
	    struct anno_col *col = &file->cols[file->n_cols];
	    col->icol = i;
	    col->replace = REPLACE_MISSING;
	    if (*ss == '+') {
		col->replace = REPLACE_MISSING;
		ss++;
	    } else if ( *ss == '-' ) {
		col->replace = REPLACE_EXISTING;
		ss++;
	    }
	    if (ss[0] == '\0')
		continue;
	    if ( strncmp(ss, "INFO/", 5) == 0)
		ss += 5;
	    col->hdr_key = strdup(ss);	    
	    col->icol = -1;
	    // debug_print("%s, %d", col->hdr_key, file->n_cols);
	    file->n_cols++;	    
	}
	string.l = 0;	    
    }

    while (1) {
	string.l =0;
	if ( hts_getline(file->fp, KS_SEP_LINE, &string) < 0 )
	    break;
	// only accept header line in the beginning for file
	if ( string.s[0] != '#' )
	    break;
	if ( strncmp(string.s, "##INFO=", 7) == 0) {
	    char *ss = string.s + 11;
	    char *se = ss;
	    while (se && *se != ',') se++;
	    struct anno_col *col = NULL;
	    // if no column string specified, init all header lines
	    if ( no_columns ) {
		if ( file->n_cols == file->m_cols ) {
		    file->m_cols = file->m_cols == 0 ? 2 : file->m_cols + 2;
		    file->cols = (struct anno_col *) realloc(file->cols, file->m_cols*sizeof(struct anno_col));
		}
		col = &file->cols[file->n_cols++];
		col->icol = -1;
		col->hdr_key = strndup(ss, se-ss+1);
		col->hdr_key[se-ss] = '\0';
	    } else {
		for ( i = 0; i < file->n_cols; ++i ) {		    
		    if ( strncmp(file->cols[i].hdr_key, ss, se-ss) == 0)
			break;
		}
		// if header line is not set in the column string, skip
		if ( i == file->n_cols )
		    continue;
		col = &file->cols[i];
	    }

	    // specify setter functions here
	    col->setter.bed = beds_setter_info_string;
	    
	    bcf_hdr_append(opts->hdr_out, string.s);
	    bcf_hdr_sync(opts->hdr_out);
	    int hdr_id = bcf_hdr_id2int(opts->hdr_out, BCF_DT_ID,col->hdr_key);
	    assert ( bcf_hdr_idinfo_exists(opts->hdr_out, BCF_HL_INFO, hdr_id) );
	}
	string.l = 0;
	// set column number for each col
	if ( strncasecmp(string.s, "#chr", 4) == 0) {
	    int nfields;	    
	    int *splits = ksplit(&string, '\t', &nfields);

	    if (nfields < 4) {
		fprintf(stderr, "[error] Bad header of bed database : %s. n_fields : %d, %s", fname, nfields, string.s);
		fprintf(stderr, "[notice] this error usually happened because the header line is seperated by spaces but not tab!");
		exit(1);
	    }
	    int k;
	    for ( k = 3; k < nfields; ++k ) {
		char *ss = string.s + splits[k];
		for (i = 0; i < file->n_cols; ++i ) {
		    struct anno_col *col = &file->cols[i];
		    if ( strcmp(col->hdr_key, ss) == 0)
			break;
		}
		// if name line specify more names than column string or header, skip
		if ( i == file->n_cols )
		    continue;

		struct anno_col *col = &file->cols[i];
		col->icol = k;
	    }
	}
    }
    for ( i = 0; i < file->n_cols; ++i ) {
	struct anno_col *col = &file->cols[i];
	if ( col->hdr_key && col->icol == -1 )
	    error("No column %s found in bed database : %s", col->hdr_key, fname);

	int hdr_id = bcf_hdr_id2int(opts->hdr_out, BCF_DT_ID, col->hdr_key);
        assert(hdr_id>-1);
	col->number = bcf_hdr_id2length(opts->hdr_out, BCF_HL_INFO, hdr_id);
	if ( col->number == BCF_VL_A || col->number == BCF_VL_R || col->number == BCF_VL_G)
	    error("Only support fixed INFO number for bed database. %s", col->hdr_key);
	col->ifile = file->id;
    }
    if ( string.m )
	free(string.s);
    if ( opts->beds_is_inited == 0 )
	opts->beds_is_inited = 1;
    return 0;
}
コード例 #14
0
ファイル: anno_setter.c プロジェクト: shiquan/vcfanno
// only if annotation database is VCF/BCF file, header_in has values or else header_in == NULL
anno_col_t *init_columns(const char *rules, bcf_hdr_t *header_in, bcf_hdr_t *header_out, int *ncols, enum anno_type type)
{
    assert(rules != NULL);
    if (type == anno_is_vcf && header_in == NULL) {
	error("Inconsistent file type!");
    }
    char *ss = (char*)rules, *se = ss;
    int nc = 0;
    anno_col_t *cols = NULL;
    kstring_t tmp = KSTRING_INIT;
    kstring_t str = KSTRING_INIT;
    int i = -1;

    while (*ss) {
	if ( *se && *se!=',' ) {
	    se++;
	    continue;
	}
	int replace = REPLACE_ALL;
	if ( *ss=='+') {
	    replace = REPLACE_MISSING;
	    ss++;
	} else if (*ss=='-') {
	    replace = REPLACE_EXISTING;
	    ss++;
	}
	i++;
	str.l = 0;
	kputsn(ss, se-ss, &str); 
	if ( !str.s[0] ) {
	    warnings("Empty tag in %s", rules);
	} else if ( !strcasecmp("CHROM", str.s) || !strcasecmp("POS", str.s) || !strcasecmp("FROM", str.s) || !strcasecmp("TO", str.s) || !strcasecmp("REF", str.s) || !strcasecmp("ALT", str.s) || !strcasecmp("FILTER", str.s) || !strcasecmp("QUAL", str.s)) {
	    warnings("Skip tag %s", str.s);
	} else if ( !strcasecmp("ID", str.s) ) {
	    nc++;
            cols = (struct anno_col*) realloc(cols, sizeof(struct anno_col)* (nc));
            struct anno_col *col = &cols[nc-1];
            col->icol = i;
            col->replace = replace;
            col->setter = type == anno_is_vcf ? vcf_setter_id : setter_id;
            col->hdr_key = strdup(str.s);
        } else if (!strcasecmp("INFO", str.s) || !strcasecmp("FORMAT", str.s) ) {
	    error("do not support annotate all INFO,FORMAT fields. todo INFO/TAG instead\n");
	} else if (!strncasecmp("FORMAT/", str.s, 7) || !strncasecmp("FMT/", str.s, 4)) {
            char *key = str.s + (!strncasecmp("FMT", str.s, 4) ? 4 : 7);
            if (!strcasecmp("GT", key)) 
		error("It is not allowed to change GT tag.");

	    int hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, str.s);
	    
	    if ( !bcf_hdr_idinfo_exists(header_out, BCF_HL_FMT, hdr_id) ) {
		
		if ( type == anno_is_vcf ) {
		    bcf_hrec_t *hrec = bcf_hdr_get_hrec(header_in, BCF_HL_FMT, "ID", str.s, NULL);
		    if ( !hrec )
			error("The tag \"%s\" is not defined in header: %s\n", str.s, rules);
		    tmp.l = 0;
		    bcf_hrec_format(hrec, &tmp);
		    bcf_hdr_append(header_out, tmp.s);
		    bcf_hdr_sync(header_out);
		    hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, str.s);
		    assert( bcf_hdr_idinfo_exists(header_out, BCF_HL_FMT, hdr_id) );
		} else {
		    error("The tag \"%s\" is not defined in header: %s\n", str.s, rules);
		}
	    }

            //int hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, key);
            nc++;
	    cols = (struct anno_col*) realloc(cols, sizeof(struct anno_col)*(nc));
            struct anno_col *col = &cols[nc-1];
            col->icol = -1;
            col->replace = replace;
            col->hdr_key = strdup(key);

            switch ( bcf_hdr_id2type(header_out, BCF_HL_FMT, hdr_id) ) {

		case BCF_HT_INT:
		    col->setter = type == anno_is_vcf ? vcf_setter_format_int : setter_format_int;
		    break;

		case BCF_HT_REAL:
		    col->setter = type == anno_is_vcf ? vcf_setter_format_real : setter_format_real;
		    break;

		case BCF_HT_STR:
		    col->setter = type == anno_is_vcf ? vcf_setter_format_str : setter_format_str;
		    break;

		default :
		    error("The type of %s not recognised (%d)\n", str.s, bcf_hdr_id2type(header_out, BCF_HL_FMT, hdr_id));
            }

	} else if ( !strncasecmp("INFO/", str.s, 5) ) {
	    memmove(str.s, str.s+5, str.l-4);
	    str.l -= 4;

	    int hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, str.s);

	    if ( !bcf_hdr_idinfo_exists(header_out, BCF_HL_INFO, hdr_id) ) {
		if ( type == anno_is_vcf ) {
		    bcf_hrec_t *hrec = bcf_hdr_get_hrec(header_in, BCF_HL_INFO, "ID", str.s, NULL);

		    if ( !hrec )
			error("The tag \"%s\" is not defined in header: %s\n", str.s, rules);
		    tmp.l = 0;

		    bcf_hrec_format(hrec, &tmp);
		    bcf_hdr_append(header_out, tmp.s);
		    bcf_hdr_sync(header_out);
		    hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, str.s);
		    assert( bcf_hdr_idinfo_exists(header_out, BCF_HL_INFO, hdr_id) );
		} else {
		    error("The tag \"%s\" is not defined in header: %s\n", str.s, rules);
		}
	    }
	    nc++;
	    cols = (struct anno_col*) realloc(cols, sizeof(struct anno_col)*(nc));
	    struct anno_col *col = &cols[nc-1];
	    col->icol = i;
	    col->replace = replace;
	    col->hdr_key = strdup(str.s);

	    col->number = bcf_hdr_id2length(header_out, BCF_HL_INFO, hdr_id);

	    switch ( bcf_hdr_id2type(header_out, BCF_HL_INFO, hdr_id) ) {

		case BCF_HT_FLAG:
		    col->setter = type == anno_is_vcf ? vcf_setter_info_flag : setter_info_flag;
		    break;

		case BCF_HT_INT:
		    col->setter = type == anno_is_vcf ? vcf_setter_info_int : setter_info_int;
		    break;

		case BCF_HT_REAL:
		    col->setter = type == anno_is_vcf ? vcf_setter_info_real : setter_info_real;
		    break;

		case BCF_HT_STR:
		    col->setter = type == anno_is_vcf ? vcf_setter_info_str : setter_info_str;
		    break;

		default:
		    error("The type of %s not recognised (%d)\n", str.s, bcf_hdr_id2type(header_out, BCF_HL_INFO, hdr_id));
	    }
	}
	if ( !*se ) break;
        ss = ++se;
    }
    *ncols = nc;
    if (str.m) free(str.s);
    if (tmp.m) free(tmp.s);
    return cols;
}