Exemple #1
0
/*
 * minidot entry point: read hits from a PAF file and write an EPS dot plot
 * to stdout.
 *
 * Command-line options (see the usage text below): -m, -i and -s filter the
 * hits, -w and -f control the image width and font size, -L suppresses the
 * sequence labels and -d disables the placement of target sequences along
 * the diagonal.
 *
 * Returns 0 on success and 1 on a usage error, when the input cannot be
 * opened, or when nothing is left to plot.
 */
int main(int argc, char *argv[])
{
	int min_span = 1000, min_match = 100, width = 600, height, diagonal = 1;
	int color[2] = { 0xFF0000, 0x0080FF }, font_size = 11, no_label = 0;
	float min_iden = .1;
	paf_file_t *f;
	sdict_t *d[2];
	paf_rec_t r;
	int32_t c, i, j;
	uint64_t *acclen[2], totlen[2];
	srtaux_t *a[2];
	kvec_t(dt_hit_t) h = {0,0,0};
	double sx, sy;

	while ((c = getopt(argc, argv, "m:i:s:w:f:Ld")) >= 0) {
		if (c == 'm') min_match = atoi(optarg);
		else if (c == 'i') min_iden = atof(optarg);
		else if (c == 's') min_span = atoi(optarg);
		else if (c == 'w') width = atoi(optarg);
		else if (c == 'f') font_size = atoi(optarg);
		else if (c == 'L') no_label = 1;
		else if (c == 'd') diagonal = 0;
	}
	if (argc == optind) {
		fprintf(stderr, "Usage: minidot [options] <in.paf>\n");
		fprintf(stderr, "Options:\n");
		fprintf(stderr, "  -m INT      min match length [%d]\n", min_match);
		fprintf(stderr, "  -i FLOAT    min identity [%.2f]\n", min_iden);
		fprintf(stderr, "  -s INT      min span [%d]\n", min_span);
		fprintf(stderr, "  -w INT      image width [%d]\n", width);
		fprintf(stderr, "  -f INT      font size [%d]\n", font_size);
		fprintf(stderr, "  -L          don't print labels\n");
		// the option string above accepts lower-case 'd' only
		fprintf(stderr, "  -d          don't try to put hits onto the diagonal\n");
		return 1;
	}

	// open the input first so that a bad path fails before anything is allocated
	f = paf_open(argv[optind]);
	if (f == 0) {
		fprintf(stderr, "ERROR: failed to open file '%s'\n", argv[optind]);
		return 1;
	}

	d[0] = sd_init(); // query sequences (x axis)
	d[1] = sd_init(); // target sequences (y axis)

	// collect the hits that pass the span, match-length and identity filters
	while (paf_read(f, &r) >= 0) {
		dt_hit_t *s;
		if (r.qe - r.qs < min_span || r.te - r.ts < min_span || r.ml < min_match) continue;
		if (r.ml < r.bl * min_iden) continue;
		kv_pushp(dt_hit_t, h, &s);
		s->qn = sd_put(d[0], r.qn, r.ql), s->qs = r.qs, s->qe = r.qe;
		s->tn = sd_put(d[1], r.tn, r.tl);
		// reverse-strand hits are stored with ts > te; the drawing loop relies on this
		s->ts = r.rev? r.te : r.ts, s->te = r.rev? r.ts : r.te;
		s->ml = r.ml;
	}
	paf_close(f);

	// decide the order of sequences on each axis and their accumulated offsets
	for (i = 0; i < 2; ++i) {
		uint32_t n = d[i]->n_seq;
		uint64_t l = 0;
		a[i] = (srtaux_t*)calloc(n + 1, sizeof(srtaux_t));
		if (i == 0 || !diagonal) {
			for (j = 0; j < n; ++j)
				a[i][j].name = d[i]->seq[j].name, a[i][j].i = j;
			ks_introsort_dtx(n, a[i]);
		} else {
			// order targets by the weighted mean query coordinate of their hits
			srtaux_t *b = a[i];
			uint32_t *inv;
			inv = (uint32_t*)calloc(d[0]->n_seq, 4);
			for (j = 0; j < d[0]->n_seq; ++j)
				inv[a[0][j].i] = j;
			for (j = 0; j < n; ++j)
				b[j].name = d[i]->seq[j].name, b[j].tot = b[j].w = 0, b[j].i = j;
			for (j = 0; j < h.n; ++j) {
				uint64_t w, coor;
				dt_hit_t *p = &h.a[j];
				srtaux_t *q = &b[p->tn];
				coor = acclen[0][inv[p->qn]] + (p->qs + p->qe) / 2;
				w = (uint64_t)(.01 * p->ml * p->ml + .499);
				q->tot += (double)coor * w;
				q->w += w;
			}
			free(inv);
			// a target whose hit weights all rounded to zero keeps tot == 0
			// instead of turning into a NaN sort key
			for (j = 0; j < n; ++j)
				if (b[j].w != 0) b[j].tot /= b[j].w;
			ks_introsort_dty(n, b);
		}
		acclen[i] = (uint64_t*)calloc(n, 8);
		for (j = 0; j < n; ++j)
			acclen[i][a[i][j].i] = l, l += d[i]->seq[a[i][j].i].len;
		totlen[i] = l;
	}

	// without any plotted length the scale factors below would divide by zero
	if (totlen[0] == 0 || totlen[1] == 0) {
		fprintf(stderr, "ERROR: nothing to plot (no hits passed the filters or the total sequence length is zero)\n");
		for (i = 0; i < 2; ++i) {
			free(acclen[i]);
			free(a[i]);
			sd_destroy(d[i]);
		}
		free(h.a);
		return 1;
	}

	height = (int)((double)width / totlen[0] * totlen[1] + .499);
	sx = (double)width / totlen[0];
	sy = (double)height / totlen[1];

	eps_header(stdout, width, height, .2);
	eps_font(stdout, "Helvetica-Narrow", font_size);
	eps_gray(stdout, .8);

	if (!no_label) {
		// write x labels
		for (i = 0; i < d[0]->n_seq; ++i)
			eps_Mstr(stdout, (acclen[0][a[0][i].i] + .5 * d[0]->seq[a[0][i].i].len) * sx, font_size*.5, a[0][i].name);
		eps_stroke(stdout);
		fprintf(stdout, "gsave %g 0 translate 90 rotate\n", font_size*1.25);
		// write y labels; after the rotation these positions run along the y axis,
		// so they use the y scale
		for (i = 0; i < d[1]->n_seq; ++i)
			eps_Mstr(stdout, (acclen[1][a[1][i].i] + .5 * d[1]->seq[a[1][i].i].len) * sy, 0, a[1][i].name);
		fprintf(stdout, "grestore\n");
		eps_stroke(stdout);
	}

	// write grid lines
	eps_linewidth(stdout, .1);
	for (i = 0; i < d[1]->n_seq; ++i)
		eps_linex(stdout, 1, width, i == 0? 1 : acclen[1][a[1][i].i] * sy);
	eps_linex(stdout, 1, width, totlen[1] * sy);
	for (i = 0; i < d[0]->n_seq; ++i)
		eps_liney(stdout, 1, height, i == 0? 1 : acclen[0][a[0][i].i] * sx);
	eps_liney(stdout, 1, height, totlen[0] * sx);
	eps_stroke(stdout);

	// write hits: pass 0 draws hits with ts <= te, pass 1 those with ts >= te,
	// each pass in its own colour
	eps_linewidth(stdout, .1);
	for (j = 0; j < 2; ++j) {
		eps_color(stdout, color[j]);
		for (i = 0; i < h.n; ++i) {
			dt_hit_t *p = &h.a[i];
			double x0, y0, x1, y1;
			uint64_t xo = acclen[0][p->qn], yo = acclen[1][p->tn];
			if (j == 0 && p->ts > p->te) continue;
			if (j == 1 && p->ts < p->te) continue;
			x0 = (p->qs + xo) * sx, y0 = (p->ts + yo) * sy;
			x1 = (p->qe + xo) * sx, y1 = (p->te + yo) * sy;
			eps_line(stdout, x0, y0, x1, y1);
		}
		eps_stroke(stdout);
	}
	eps_bottom(stdout);

	for (i = 0; i < 2; ++i) {
		free(acclen[i]);
		free(a[i]);
		sd_destroy(d[i]);
	}

	free(h.a);
	return 0;
}
int do_grep() {
#ifdef DEBUGa
	printf("[!]do_grep\n");
#endif
	BamInfo_t *pbam;
	kh_cstr_t BamID;
	khiter_t ki, bami;
	kstring_t ks1 = { 0, 0, NULL };
	kstring_t ks2 = { 0, 0, NULL };
	kstring_t ks3 = { 0, 0, NULL };

	samFile *in;
	bam_hdr_t *h;
	hts_idx_t *idx;
	bam1_t *b, *d, *d2, *bR1, *bR2, *bR3;
	bR1 = bam_init1(); bR2 = bam_init1(); bR3 = bam_init1();
	//htsFile *out;
	//hts_opt *in_opts = NULL, *out_opts = NULL;
	int r = 0, exit_code = 0;

	kvec_t(bam1_t) R1, R2, RV;
	pierCluster_t *pierCluster;
	//samdat_t tmp_samdat;
	FILE *fs = fopen("./test.txt","w");

	for (bami = kh_begin(bamNFOp); bami != kh_end(bamNFOp); ++bami) {
		//printf(">[%d]:\n",bami);
		if (kh_exist(bamNFOp, bami)) {
			kv_init(R1); kv_init(R2); kv_init(RV);
			//tmp_samdat = (const samdat_t){ 0 };
			//memset(&tmp_samdat,0,sizeof(samdat_t));
			//printf("-[%d]:\n",bami);
			BamID = kh_key(bamNFOp, bami);
			pbam = &kh_value(bamNFOp, bami);
			fprintf(stderr, "%u [%s]=%s\t%u %u\n",bami,BamID,pbam->fileName,pbam->insertSize,pbam->SD);

			in = sam_open(pbam->fileName, "r");
			if (in == NULL) {
				fprintf(stderr, "[x]Error opening \"%s\"\n", pbam->fileName);
				return EXIT_FAILURE;
			}
			h = sam_hdr_read(in);
/*			out = hts_open("-", "w");
			if (out == NULL) {
				fprintf(stderr, "[x]Error opening standard output\n");
				return EXIT_FAILURE;
			}
			if (sam_hdr_write(out, h) < 0) {
				fprintf(stderr, "[!]Error writing output header.\n");
				exit_code = 1;
			}
*/
			int8_t *ChrIsHum;
			if (h == NULL) {
				fprintf(stderr, "[x]Couldn't read header for \"%s\"\n", pbam->fileName);
				return EXIT_FAILURE;
			} else {
				ChrIsHum = malloc(h->n_targets * sizeof(int8_t));
				for (int32_t i=0; i < h->n_targets; ++i) {
					//ChrIsHum[i] = -1;
					ki = kh_get(chrNFO, chrNFOp, h->target_name[i]);
					if (ki == kh_end(chrNFOp)) {
						errx(4,"[x]Cannot find ChrID for [%s] !",h->target_name[i]);
					} else {
						ChrInfo_t * tmp = &kh_value(chrNFOp, ki);
						ChrIsHum[i] = tmp->isHum;
						//printf(">>> %d Chr:%s %d\n",i,h->target_name[i],ChrIsHum[i]);
					}
				}
			}
			h->ignore_sam_err = 0;
			b = bam_init1();
			d = bam_init1();
			d2 = bam_init1();
			if ((idx = sam_index_load(in, pbam->fileName)) == 0) {
				fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__);
				return 1;
			}
			pierCluster = sam_plp_init();
			while ((r = sam_read1(in, h, b)) >= 0) {
				int8_t flag = false;
				const bam1_core_t *c = &b->core;
				if (c->flag & BAM_FSECONDARY) continue;
				if (c->n_cigar) {
					uint32_t *cigar = bam_get_cigar(b);
					for (int i = 0; i < c->n_cigar; ++i) {
						if (bam_cigar_opchr(cigar[i])=='S') {	// soft clipping
							if ( bam_cigar_oplen(cigar[i]) >= myConfig.minGrepSlen ) {
								flag = true;
							}
						}
					}
				}
				if (flag && ChrIsHum[c->tid]) {	// Now, skip Virus items.
					//bam_copy1(bR1, b);
					flag = 0;	// recycle
					//int enoughMapQ = 0;
					//kstring_t ks = { 0, 0, NULL };
					/*if (sam_format1(h, b, &ks1) < 0) {
						fprintf(stderr, "Error writing output.\n");
						exit_code = 1;
						break;
					} else*/ if ((c->mtid == c->tid && ChrIsHum[c->tid]) || (ChrIsHum[c->tid] ^ ChrIsHum[c->mtid])) {	// Only grep those mapped on same Human ChrID, or diff species/一方在病毒的情况.
						//printf(">[%s]\n",ks_str(&ks1));
						flag |= 1;
						//tmp_samdat.b = bam_dup1(b);
						//kv_push(samdat_t,R1,tmp_samdat);
						/*if (checkMapQ(ChrIsHum, b, true)) {
							++enoughMapQ;
						}*/
					}
					if (getPairedSam(in, idx, b, d) != 0) {
						flag &= ~1;
						continue;
					} else {
						flag |= 2;
						/*if (checkMapQ(ChrIsHum, d, false)) {
							++enoughMapQ;
						}*/
						/*if (c->flag & BAM_FSECONDARY) {
							if (getPairedSam(in, idx, d, d2) == 0) {
								//sam_format1(h, d2, &ks3);
								flag |= 4;
								if (checkMapQ(ChrIsHum, d2, false)) {
									++enoughMapQ;
								}
							}
						}*/
					}
/*
对于 BAM_FSECONDARY(256) 的 Read,跳两次 与 读 SA 项,效果一样。
>[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90	353	chr2	13996555	0	50S40M	chr18	48245109	0ACACAACAATGTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAA	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	NM:i:0	MD:Z:40	AS:i:40	XS:i:40	RG:Z:Fsimout_mB	SA:Z:rgi|59585|emb|X04615.1|,2000,-,40S46M4S,60,0;	YC:Z:CT	YD:Z:f]
-[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90	177	chr18	48245109	9	40S50M	gi|59585|emb|X04615.1|2000	0	GTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAAAGGAATTCAA	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	NM:i:0	MD:Z:50	AS:i:50	XS:i:46	RG:Z:Fsimout_mB	SA:Z:rgi|59585|emb|X04615.1|,2000,+,50S40M,9,0;	YC:Z:GA	YD:Z:f]
+[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90	113	gi|59585|emb|X04615.1|	2000	60	40S46M4S	chr18	48245109	0	TTTTTTGGCTGAATAGTATTCCATGGTGTGTGTGTGTGTGGCCTCTGCTCTGTATCGGGAGGCCTTAGAGTCTCCGGAACATTGTTGTGT	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	NM:i:0	MD:Z:46	AS:i:46	XS:i:27	RG:Z:Fsimout_mB	SA:Z:fchr2,13996555,+,50S40M,0,0;	YC:Z:CT	YD:Z:r]
*/
					/*if (sam_format1(h, d, &ks2) < 0) {
						fprintf(stderr, "Error writing output.\n");
						exit_code = 1;
						break;
					}*/
					if (((flag & 3) == 3) /*&& enoughMapQ >= myConfig.samples*/) {
						/*printf(">%d[%s]\n",checkMapQ(ChrIsHum, b, true),ks_str(&ks1));
						printf("-%d[%s]\n",checkMapQ(ChrIsHum, d, false),ks_str(&ks2));
						if (flag & 4) {
							printf("+%d[%s]\n",checkMapQ(ChrIsHum, d2, false),ks_str(&ks3));
						}
						printf("<--%d\n",enoughMapQ);*/
						if (sam_plp_push(ChrIsHum, pierCluster, b) == 0) {
							//printf("--HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos);
							if ((!ChrIsHum[(d->core).tid]) && (flag & 2)) sam_plp_push(ChrIsHum, pierCluster, d);
							//if ((!ChrIsHum[(d2->core).tid]) && (flag & 4)) sam_plp_push(ChrIsHum, pierCluster, d2);
						} else {
							//print
							fprintf(fs,"[%s]\nHumRange=%s:%d-%d\n", BamID, h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos);
							fprintf(fs,"VirRange=%s:%d-%d\n", h->target_name[(pierCluster->VirusRange).tid], (pierCluster->VirusRange).pos, (pierCluster->VirusRange).endpos);
							for (size_t i=0; i<kv_size(pierCluster->Reads);++i) {
								bam1_t *bi = kv_A(pierCluster->Reads, i);
								if (sam_format1(h, bi, &ks1) < 0) {
									fprintf(stderr, "Error writing output.\n");
									exit_code = 1;
									break;
								} else {
									fprintf(fs,"%s\n",ks1.s);
								}
							}
							fprintf(fs,"\n");
							//printf("HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos);
							//fflush(fs);
							sam_plp_dectroy(pierCluster);
							pierCluster = sam_plp_init();
						}
					}
				}
				/*char *qname = bam_get_qname(b);
				if (sam_write1(out, h, b) < 0) {
					fprintf(stderr, "[x]Error writing output.\n");
					exit_code = 1;
					break;
				}*/
			}
/*			r = sam_close(out);   // stdout can only be closed once
			if (r < 0) {
				fprintf(stderr, "Error closing output.\n");
				exit_code = 1;
			}
*/
			hts_idx_destroy(idx);
			bam_destroy1(b);
			bam_destroy1(d);
			bam_destroy1(d2);
			bam_hdr_destroy(h);
			r = sam_close(in);
			free(ChrIsHum);
#ifdef DEBUGa
			fflush(NULL);
			//pressAnyKey();
#endif
			sam_plp_dectroy(pierCluster);
			//printf("<[%d]:\n",bami);
		}
	}
	fclose(fs);
	getPairedSam(NULL, NULL, NULL, NULL);	// sam_close(fp2);
	//printf("---[%d]---\n",exit_code);
	bam_destroy1(bR1); bam_destroy1(bR2); bam_destroy1(bR3);
	ks_release(&ks1);
	ks_release(&ks2);
	ks_release(&ks3);
	return exit_code;
}
Exemple #3
0
/*
 * Select alignment regions of one read by an F-measure over pairs of regions.
 *
 * For every pair (i, j), i < j, of regions in `a` whose query intervals
 * [qb, qe) do not contain one another, a sensitivity and a specificity are
 * derived from how the two intervals cover the l_seq query bases, and
 * FMEAS = 2*spec*sens / (spec+sens).  Pairs with FMEAS > 0.95 are kept and
 * sorted with ks_introsort(ff_mem_flt, ...); the code treats element 0 of the
 * sorted list as the best pair (presumably the sort is descending - confirm
 * against the ff_mem_flt comparison macro).
 *
 * The result always starts with a.a[0].  Pairs are then appended as follows:
 *   mode != 0: every kept pair that involves region 0, followed by the
 *              best-scoring pairs that do not involve region 0;
 *   mode == 0: only the pairs that tie with the best FMEAS.
 * The first appended pair involving region 0 contributes only its partner,
 * because a.a[0] is already at the head of the result; later ones contribute
 * a.a[0] again followed by the partner.
 *
 * a      candidate regions
 * n      an empty vector is returned when this is 0
 * l_seq  query length used in the sensitivity/specificity formulas
 * mode   selection mode, see above
 *
 * Returns a newly built vector; its storage belongs to the caller.
 */
mem_alnreg_v mem_fmeas_fliter_se(mem_alnreg_v a , int n , int l_seq , int mode)
{
	mem_alnreg_v  aa  ;
	int i , j ;
	kvec_t(FF_t)  k_ff_t ;
	kv_init(k_ff_t);
	kv_init(aa);
	// nothing to select from; a.a[0] below must not be read for an empty vector
	if(n == 0 || a.n == 0) return aa ;

	// calculate the FMEAS value of every pair
	for( i = 0 ;  i <  a.n ; i++){
		mem_alnreg_t  *p_ar =  a.a + i ;
		for( j = i + 1 ; j < a.n ; j++){
			FF_t  tmp ;
			mem_alnreg_t  *q_ar =  a.a + j ;
			double  sens  ,  spec ;
			int FN =  0 , TP = 0 ,TN = 0 , FP = 0 ;
			int A,B,C,D;
			// [A,B] is the interval that starts first (on a tie, the longer one),
			// [C,D] the other one; both are inclusive
			if( p_ar->qb < q_ar->qb || (p_ar->qb  ==  q_ar->qb &&  p_ar->qe >=  q_ar->qe)){
				A =  p_ar->qb ;
				B =  p_ar->qe - 1 ;
				C =  q_ar->qb ;
				D =  q_ar->qe - 1 ;
			}else {
				A =  q_ar->qb ;
				B =  q_ar->qe - 1;
				C =  p_ar->qb ;
				D =  p_ar->qe - 1;
			}
			if(B < C){ // disjoint intervals
				TP = B - A + D - C + 2 ;
				FN = l_seq - D - 1  + A  + C - B - 1 ;
				TN = l_seq ;
				FP = 0 ;
			}else if( D <= B){ // [C,D] is contained in [A,B]: pair is not scored
				continue ;
			}else{ // partial overlap; the overlapping bases count as false positives
				TP = D - A + 1 ;
				FN = l_seq - D - 1 + A  ;
				FP = B - C + 1 ;
				TN = l_seq - FP;
			}
			sens = (double)TP/(double)(TP+FN);
			spec = (double)TN/(double)(TN+FP);
			tmp.FMEAS =  (2*spec*sens)/(spec+sens);
			tmp.score =  p_ar->score + q_ar->score;
			tmp.x =  i  , tmp.y = j ;
			if(tmp.FMEAS > 0.95) kv_push(FF_t,k_ff_t,tmp);
		}
	}
	ks_introsort(ff_mem_flt, k_ff_t.n, k_ff_t.a);
	kv_push(mem_alnreg_t,aa,a.a[0]);

	if( k_ff_t.n > 0 ){
		double max_feas = k_ff_t.a[0].FMEAS ;
		int cnt = 0 ; // becomes 1 once the leading a.a[0] has received its partner
		if(mode){
			// all kept pairs that involve region 0
			for( i = 0 ;  i <  kv_size(k_ff_t) ; i++){
				FF_t  p  = kv_A(k_ff_t,i);
				if(p.x != 0) continue ;
				if(cnt == 0){
					kv_push(mem_alnreg_t,aa,a.a[p.y]);
					cnt = 1 ;
				}else{
					kv_push(mem_alnreg_t,aa,a.a[0]);
					kv_push(mem_alnreg_t,aa,a.a[p.y]);
				}
			}
			// best-scoring pairs that do not involve region 0
			for( i = 0 ;  i  < kv_size(k_ff_t); i++){
				FF_t  p  = kv_A(k_ff_t,i);
				if(max_feas != p.FMEAS )  break;
				if(p.x == 0) continue ;
				kv_push(mem_alnreg_t,aa,a.a[p.x]);
				kv_push(mem_alnreg_t,aa,a.a[p.y]);
			}
		}else{
			// only the pairs that tie with the best FMEAS
			for( i = 0 ;  i  < kv_size(k_ff_t); i++){
				FF_t  p  = kv_A(k_ff_t,i);
				if(max_feas != p.FMEAS )  break;
				if(p.x == 0 && cnt == 0){
					kv_push(mem_alnreg_t,aa,a.a[p.y]);
					// remember that the leading a.a[0] is used up, as in the mode != 0
					// branch; without this the next branch could never be taken
					cnt = 1 ;
				}else if( p.x == 0 ){
					kv_push(mem_alnreg_t,aa,a.a[0]);
					kv_push(mem_alnreg_t,aa,a.a[p.y]);
				}else{
					kv_push(mem_alnreg_t,aa,a.a[p.x]);
					kv_push(mem_alnreg_t,aa,a.a[p.y]);
				}
			}
		}
	}
	kv_destroy(k_ff_t);

#if 0
	for( i = 0 ;  i < kv_size(aa); i++){
		mem_alnreg_t  *q = aa.a + i;
		printf("%db: %d  %de:%d \t" , i, q->qb , i, q->qe);
		if( i == kv_size(aa) -1 )  printf("\n");
	}
#endif

	return  aa ;
}