int main(int argc, char *argv[]) { int min_span = 1000, min_match = 100, width = 600, height, diagonal = 1; int color[2] = { 0xFF0000, 0x0080FF }, font_size = 11, no_label = 0; float min_iden = .1; paf_file_t *f; sdict_t *d[2]; paf_rec_t r; int32_t c, i, j; uint64_t *acclen[2], totlen[2]; srtaux_t *a[2]; kvec_t(dt_hit_t) h = {0,0,0}; double sx, sy; while ((c = getopt(argc, argv, "m:i:s:w:f:Ld")) >= 0) { if (c == 'm') min_match = atoi(optarg); else if (c == 'i') min_iden = atof(optarg); else if (c == 's') min_span = atoi(optarg); else if (c == 'w') width = atoi(optarg); else if (c == 'f') font_size = atoi(optarg); else if (c == 'L') no_label = 1; else if (c == 'd') diagonal = 0; } if (argc == optind) { fprintf(stderr, "Usage: minidot [options] <in.paf>\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -m INT min match length [%d]\n", min_match); fprintf(stderr, " -i FLOAT min identity [%.2f]\n", min_iden); fprintf(stderr, " -s INT min span [%d]\n", min_span); fprintf(stderr, " -w INT image width [%d]\n", width); fprintf(stderr, " -f INT font size [%d]\n", font_size); fprintf(stderr, " -L don't print labels\n"); fprintf(stderr, " -D don't try to put hits onto the diagonal\n"); return 1; } d[0] = sd_init(); d[1] = sd_init(); f = paf_open(argv[optind]); while (paf_read(f, &r) >= 0) { dt_hit_t *s; if (r.qe - r.qs < min_span || r.te - r.ts < min_span || r.ml < min_match) continue; if (r.ml < r.bl * min_iden) continue; kv_pushp(dt_hit_t, h, &s); s->qn = sd_put(d[0], r.qn, r.ql), s->qs = r.qs, s->qe = r.qe; s->tn = sd_put(d[1], r.tn, r.tl); s->ts = r.rev? r.te : r.ts, s->te = r.rev? 
r.ts : r.te; s->ml = r.ml; } paf_close(f); for (i = 0; i < 2; ++i) { uint32_t n = d[i]->n_seq; uint64_t l = 0; a[i] = (srtaux_t*)calloc(n + 1, sizeof(srtaux_t)); if (i == 0 || !diagonal) { for (j = 0; j < n; ++j) a[i][j].name = d[i]->seq[j].name, a[i][j].i = j; ks_introsort_dtx(n, a[i]); } else { srtaux_t *b = a[i]; uint32_t *inv; inv = (uint32_t*)calloc(d[0]->n_seq, 4); for (j = 0; j < d[0]->n_seq; ++j) inv[a[0][j].i] = j; for (j = 0; j < n; ++j) b[j].name = d[i]->seq[j].name, b[j].tot = b[j].w = 0, b[j].i = j; for (j = 0; j < h.n; ++j) { uint64_t w, coor; dt_hit_t *p = &h.a[j]; srtaux_t *q = &b[p->tn]; coor = acclen[0][inv[p->qn]] + (p->qs + p->qe) / 2; w = (uint64_t)(.01 * p->ml * p->ml + .499); q->tot += (double)coor * w; q->w += w; } free(inv); for (j = 0; j < n; ++j) b[j].tot /= b[j].w; ks_introsort_dty(n, b); } acclen[i] = (uint64_t*)calloc(n, 8); for (j = 0; j < n; ++j) acclen[i][a[i][j].i] = l, l += d[i]->seq[a[i][j].i].len; totlen[i] = l; } height = (int)((double)width / totlen[0] * totlen[1] + .499); sx = (double)width / totlen[0]; sy = (double)height / totlen[1]; eps_header(stdout, width, height, .2); eps_font(stdout, "Helvetica-Narrow", font_size); eps_gray(stdout, .8); if (!no_label) { // write x labels for (i = 0; i < d[0]->n_seq; ++i) eps_Mstr(stdout, (acclen[0][a[0][i].i] + .5 * d[0]->seq[a[0][i].i].len) * sx, font_size*.5, a[0][i].name); eps_stroke(stdout); fprintf(stdout, "gsave %g 0 translate 90 rotate\n", font_size*1.25); // write y labels for (i = 0; i < d[1]->n_seq; ++i) eps_Mstr(stdout, (acclen[1][a[1][i].i] + .5 * d[1]->seq[a[1][i].i].len) * sx, 0, a[1][i].name); fprintf(stdout, "grestore\n"); eps_stroke(stdout); } // write grid lines eps_linewidth(stdout, .1); for (i = 0; i < d[1]->n_seq; ++i) eps_linex(stdout, 1, width, i == 0? 1 : acclen[1][a[1][i].i] * sy); eps_linex(stdout, 1, width, totlen[1] * sy); for (i = 0; i < d[0]->n_seq; ++i) eps_liney(stdout, 1, height, i == 0? 
1 : acclen[0][a[0][i].i] * sx); eps_liney(stdout, 1, height, totlen[0] * sx); eps_stroke(stdout); // write hits eps_linewidth(stdout, .1); for (j = 0; j < 2; ++j) { eps_color(stdout, color[j]); for (i = 0; i < h.n; ++i) { dt_hit_t *p = &h.a[i]; double x0, y0, x1, y1; uint64_t xo = acclen[0][p->qn], yo = acclen[1][p->tn]; if (j == 0 && p->ts > p->te) continue; if (j == 1 && p->ts < p->te) continue; x0 = (p->qs + xo) * sx, y0 = (p->ts + yo) * sy; x1 = (p->qe + xo) * sx, y1 = (p->te + yo) * sy; eps_line(stdout, x0, y0, x1, y1); } eps_stroke(stdout); } eps_bottom(stdout); for (i = 0; i < 2; ++i) { free(acclen[i]); free(a[i]); sd_destroy(d[i]); } free(h.a); return 0; }
int do_grep() { #ifdef DEBUGa printf("[!]do_grep\n"); #endif BamInfo_t *pbam; kh_cstr_t BamID; khiter_t ki, bami; kstring_t ks1 = { 0, 0, NULL }; kstring_t ks2 = { 0, 0, NULL }; kstring_t ks3 = { 0, 0, NULL }; samFile *in; bam_hdr_t *h; hts_idx_t *idx; bam1_t *b, *d, *d2, *bR1, *bR2, *bR3; bR1 = bam_init1(); bR2 = bam_init1(); bR3 = bam_init1(); //htsFile *out; //hts_opt *in_opts = NULL, *out_opts = NULL; int r = 0, exit_code = 0; kvec_t(bam1_t) R1, R2, RV; pierCluster_t *pierCluster; //samdat_t tmp_samdat; FILE *fs = fopen("./test.txt","w"); for (bami = kh_begin(bamNFOp); bami != kh_end(bamNFOp); ++bami) { //printf(">[%d]:\n",bami); if (kh_exist(bamNFOp, bami)) { kv_init(R1); kv_init(R2); kv_init(RV); //tmp_samdat = (const samdat_t){ 0 }; //memset(&tmp_samdat,0,sizeof(samdat_t)); //printf("-[%d]:\n",bami); BamID = kh_key(bamNFOp, bami); pbam = &kh_value(bamNFOp, bami); fprintf(stderr, "%u [%s]=%s\t%u %u\n",bami,BamID,pbam->fileName,pbam->insertSize,pbam->SD); in = sam_open(pbam->fileName, "r"); if (in == NULL) { fprintf(stderr, "[x]Error opening \"%s\"\n", pbam->fileName); return EXIT_FAILURE; } h = sam_hdr_read(in); /* out = hts_open("-", "w"); if (out == NULL) { fprintf(stderr, "[x]Error opening standard output\n"); return EXIT_FAILURE; } if (sam_hdr_write(out, h) < 0) { fprintf(stderr, "[!]Error writing output header.\n"); exit_code = 1; } */ int8_t *ChrIsHum; if (h == NULL) { fprintf(stderr, "[x]Couldn't read header for \"%s\"\n", pbam->fileName); return EXIT_FAILURE; } else { ChrIsHum = malloc(h->n_targets * sizeof(int8_t)); for (int32_t i=0; i < h->n_targets; ++i) { //ChrIsHum[i] = -1; ki = kh_get(chrNFO, chrNFOp, h->target_name[i]); if (ki == kh_end(chrNFOp)) { errx(4,"[x]Cannot find ChrID for [%s] !",h->target_name[i]); } else { ChrInfo_t * tmp = &kh_value(chrNFOp, ki); ChrIsHum[i] = tmp->isHum; //printf(">>> %d Chr:%s %d\n",i,h->target_name[i],ChrIsHum[i]); } } } h->ignore_sam_err = 0; b = bam_init1(); d = bam_init1(); d2 = bam_init1(); if ((idx = 
sam_index_load(in, pbam->fileName)) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } pierCluster = sam_plp_init(); while ((r = sam_read1(in, h, b)) >= 0) { int8_t flag = false; const bam1_core_t *c = &b->core; if (c->flag & BAM_FSECONDARY) continue; if (c->n_cigar) { uint32_t *cigar = bam_get_cigar(b); for (int i = 0; i < c->n_cigar; ++i) { if (bam_cigar_opchr(cigar[i])=='S') { // soft clipping if ( bam_cigar_oplen(cigar[i]) >= myConfig.minGrepSlen ) { flag = true; } } } } if (flag && ChrIsHum[c->tid]) { // Now, skip Virus items. //bam_copy1(bR1, b); flag = 0; // recycle //int enoughMapQ = 0; //kstring_t ks = { 0, 0, NULL }; /*if (sam_format1(h, b, &ks1) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } else*/ if ((c->mtid == c->tid && ChrIsHum[c->tid]) || (ChrIsHum[c->tid] ^ ChrIsHum[c->mtid])) { // Only grep those mapped on same Human ChrID, or diff species/一方在病毒的情况. //printf(">[%s]\n",ks_str(&ks1)); flag |= 1; //tmp_samdat.b = bam_dup1(b); //kv_push(samdat_t,R1,tmp_samdat); /*if (checkMapQ(ChrIsHum, b, true)) { ++enoughMapQ; }*/ } if (getPairedSam(in, idx, b, d) != 0) { flag &= ~1; continue; } else { flag |= 2; /*if (checkMapQ(ChrIsHum, d, false)) { ++enoughMapQ; }*/ /*if (c->flag & BAM_FSECONDARY) { if (getPairedSam(in, idx, d, d2) == 0) { //sam_format1(h, d2, &ks3); flag |= 4; if (checkMapQ(ChrIsHum, d2, false)) { ++enoughMapQ; } } }*/ } /* 对于 BAM_FSECONDARY(256) 的 Read,跳两次 与 读 SA 项,效果一样。 >[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 353 chr2 13996555 0 50S40M chr18 48245109 0ACACAACAATGTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:40 AS:i:40 XS:i:40 RG:Z:Fsimout_mB SA:Z:rgi|59585|emb|X04615.1|,2000,-,40S46M4S,60,0; YC:Z:CT YD:Z:f] -[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 177 chr18 48245109 9 40S50M gi|59585|emb|X04615.1|2000 
0 GTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAAAGGAATTCAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:50 AS:i:50 XS:i:46 RG:Z:Fsimout_mB SA:Z:rgi|59585|emb|X04615.1|,2000,+,50S40M,9,0; YC:Z:GA YD:Z:f] +[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 113 gi|59585|emb|X04615.1| 2000 60 40S46M4S chr18 48245109 0 TTTTTTGGCTGAATAGTATTCCATGGTGTGTGTGTGTGTGGCCTCTGCTCTGTATCGGGAGGCCTTAGAGTCTCCGGAACATTGTTGTGT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:46 AS:i:46 XS:i:27 RG:Z:Fsimout_mB SA:Z:fchr2,13996555,+,50S40M,0,0; YC:Z:CT YD:Z:r] */ /*if (sam_format1(h, d, &ks2) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; }*/ if (((flag & 3) == 3) /*&& enoughMapQ >= myConfig.samples*/) { /*printf(">%d[%s]\n",checkMapQ(ChrIsHum, b, true),ks_str(&ks1)); printf("-%d[%s]\n",checkMapQ(ChrIsHum, d, false),ks_str(&ks2)); if (flag & 4) { printf("+%d[%s]\n",checkMapQ(ChrIsHum, d2, false),ks_str(&ks3)); } printf("<--%d\n",enoughMapQ);*/ if (sam_plp_push(ChrIsHum, pierCluster, b) == 0) { //printf("--HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); if ((!ChrIsHum[(d->core).tid]) && (flag & 2)) sam_plp_push(ChrIsHum, pierCluster, d); //if ((!ChrIsHum[(d2->core).tid]) && (flag & 4)) sam_plp_push(ChrIsHum, pierCluster, d2); } else { //print fprintf(fs,"[%s]\nHumRange=%s:%d-%d\n", BamID, h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); fprintf(fs,"VirRange=%s:%d-%d\n", h->target_name[(pierCluster->VirusRange).tid], (pierCluster->VirusRange).pos, (pierCluster->VirusRange).endpos); for (size_t i=0; i<kv_size(pierCluster->Reads);++i) { bam1_t *bi = kv_A(pierCluster->Reads, i); if (sam_format1(h, bi, &ks1) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code 
= 1; break; } else { fprintf(fs,"%s\n",ks1.s); } } fprintf(fs,"\n"); //printf("HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); //fflush(fs); sam_plp_dectroy(pierCluster); pierCluster = sam_plp_init(); } } } /*char *qname = bam_get_qname(b); if (sam_write1(out, h, b) < 0) { fprintf(stderr, "[x]Error writing output.\n"); exit_code = 1; break; }*/ } /* r = sam_close(out); // stdout can only be closed once if (r < 0) { fprintf(stderr, "Error closing output.\n"); exit_code = 1; } */ hts_idx_destroy(idx); bam_destroy1(b); bam_destroy1(d); bam_destroy1(d2); bam_hdr_destroy(h); r = sam_close(in); free(ChrIsHum); #ifdef DEBUGa fflush(NULL); //pressAnyKey(); #endif sam_plp_dectroy(pierCluster); //printf("<[%d]:\n",bami); } } fclose(fs); getPairedSam(NULL, NULL, NULL, NULL); // sam_close(fp2); //printf("---[%d]---\n",exit_code); bam_destroy1(bR1); bam_destroy1(bR2); bam_destroy1(bR3); ks_release(&ks1); ks_release(&ks2); ks_release(&ks3); return exit_code; }
/*
 * Select alignment regions from `a` by a pairwise F-measure over query
 * coverage.
 *
 * For every pair (i, j), i < j, the two query intervals [qb, qe) are ordered
 * and classified as disjoint, contained (pair skipped) or partially
 * overlapping.  Sensitivity and specificity are derived from the covered /
 * doubly covered positions of a read of length l_seq, and pairs whose
 * F-measure 2*spec*sens/(spec+sens) exceeds 0.95 are kept and sorted with
 * ks_introsort(ff_mem_flt, ...).
 *
 * The result always starts with a.a[0].  With mode != 0, every kept pair that
 * involves region 0 is appended, followed by the other pairs tying the first
 * sorted F-measure; with mode == 0 only pairs tying the first sorted F-measure
 * are appended.
 *
 * a      candidate regions (not modified)
 * n      caller-side region count; 0 yields an empty result
 * l_seq  read length used in the sensitivity/specificity formulas
 * mode   selects the appending strategy described above
 *
 * Returns a newly filled vector; the caller owns its storage.
 */
mem_alnreg_v mem_fmeas_fliter_se(mem_alnreg_v a , int n , int l_seq , int mode)
{
	mem_alnreg_v aa ;
	int i , j ;
	kvec_t(FF_t) k_ff_t ;

	kv_init(k_ff_t);
	kv_init(aa);
	/* a.a[0] is read unconditionally below, so an empty input must stop here
	 * even if the caller passed a non-zero n */
	if (n == 0 || a.n == 0) return aa ;

	// calculate the FMEAS value of every pair
	for (i = 0 ; i < a.n ; i++) {
		mem_alnreg_t *p_ar = a.a + i ;
		for (j = i + 1 ; j < a.n ; j++) {
			FF_t tmp ;
			mem_alnreg_t *q_ar = a.a + j ;
			double sens , spec ;
			int FN = 0 , TP = 0 , TN = 0 , FP = 0 ;
			int A , B , C , D ;	/* [A,B] is the interval that starts first, [C,D] the other */

			if (p_ar->qb < q_ar->qb || (p_ar->qb == q_ar->qb && p_ar->qe >= q_ar->qe)) {
				A = p_ar->qb ; B = p_ar->qe - 1 ;
				C = q_ar->qb ; D = q_ar->qe - 1 ;
			} else {
				A = q_ar->qb ; B = q_ar->qe - 1 ;
				C = p_ar->qb ; D = p_ar->qe - 1 ;
			}

			if (B < C) {
				/* disjoint intervals: nothing is covered twice */
				TP = B - A + D - C + 2 ;
				FN = l_seq - D - 1 + A + C - B - 1 ;
				TN = l_seq ;
				FP = 0 ;
			} else if (D <= B) {
				/* second interval contained in the first: pair is skipped */
				continue ;
			} else {
				/* partial overlap: the overlap [C,B] counts as false positives */
				TP = D - A + 1 ;
				FN = l_seq - D - 1 + A ;
				FP = B - C + 1 ;
				TN = l_seq - FP ;
			}

			sens = (double)TP/(double)(TP+FN);
			spec = (double)TN/(double)(TN+FP);
			tmp.FMEAS = (2*spec*sens)/(spec+sens);
			tmp.score = p_ar->score + q_ar->score;
			tmp.x = i , tmp.y = j ;
			if (tmp.FMEAS > 0.95) kv_push(FF_t,k_ff_t,tmp);
		}
	}

	ks_introsort(ff_mem_flt, k_ff_t.n, k_ff_t.a);
	kv_push(mem_alnreg_t,aa,a.a[0]);
	/* nothing was pushed to k_ff_t, so there is no storage to release */
	if (k_ff_t.n == 0) return aa ;

	double max_feas = k_ff_t.a[0].FMEAS ;

	if (mode) {
		int cnt = 0 ;
		/* first pass: every kept pair that contains region 0; a.a[0] is
		 * already in aa, so the first such pair only adds its partner */
		for (i = 0 ; i < kv_size(k_ff_t) ; i++) {
			FF_t p = kv_A(k_ff_t,i);
			if (p.x == 0 && cnt == 0) {
				kv_push(mem_alnreg_t,aa,a.a[p.y]);
				cnt = 1 ;
			} else if (p.x == 0) {
				kv_push(mem_alnreg_t,aa,a.a[0]);
				kv_push(mem_alnreg_t,aa,a.a[p.y]);
			}
		}
		/* second pass: remaining pairs tying the first sorted F-measure */
		for (i = 0 ; i < kv_size(k_ff_t) ; i++) {
			FF_t p = kv_A(k_ff_t,i);
			if (max_feas != p.FMEAS) break ;
			if (p.x == 0) continue ;
			kv_push(mem_alnreg_t,aa,a.a[p.x]);
			kv_push(mem_alnreg_t,aa,a.a[p.y]);
		}
	} else {
		int cnt = 0 ;
		for (i = 0 ; i < kv_size(k_ff_t) ; i++) {
			FF_t p = kv_A(k_ff_t,i);
			if (max_feas != p.FMEAS) break ;
			if (p.x == 0 && cnt == 0) {
				kv_push(mem_alnreg_t,aa,a.a[p.y]);
				continue ;
			} else if (p.x == 0) {
				/* NOTE(review): cnt is never set in this branch, so this arm
				 * is unreachable -- confirm whether `cnt = 1` was intended
				 * above, as in the mode != 0 branch. */
				kv_push(mem_alnreg_t,aa,a.a[0]);
				kv_push(mem_alnreg_t,aa,a.a[p.y]);
				continue ;
			}
			kv_push(mem_alnreg_t,aa,a.a[p.x]);
			kv_push(mem_alnreg_t,aa,a.a[p.y]);
		}
	}
	kv_destroy(k_ff_t);

#if 0
	for (i = 0 ; i < kv_size(aa); i++) {
		mem_alnreg_t *q = aa.a + i;
		printf("%db: %d %de:%d \t" , i, q->qb , i, q->qe);
		if (i == kv_size(aa) -1) printf("\n");
	}
#endif
	return aa ;
}