/*
 * Group the n seeds in z[] into chains and write one summary record per
 * chain into chain[].
 *
 * z[] is first sorted in place with ks_introsort(hsaip, ...). Each seed is
 * then compared against the chains opened so far, newest first, and joins
 * the first one that starts strictly before it on the target and whose
 * diagonal differs from the seed's by at most opt->bw. A seed that fits
 * nowhere opens a new chain.
 *
 * On return every z[j].chain holds shift + (index of its chain); each
 * chain[k] has .idx = shift + k, .chain = number of member seeds, and
 * qend/tend raised to the maximum over its members.
 *
 * Returns the number of chains written to chain[].
 */
static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t *chain)
{
    int n_chains = 0;
    int s;

    ks_introsort(hsaip, n, z);
    for (s = 0; s < n; ++s) {
        hsaip_t *seed = &z[s];
        int joined = 0;
        int c = n_chains;

        while (--c >= 0) {
            hsaip_t *head = &chain[c];
            int dq = seed->qbeg - head->qbeg; /* always positive (per the original author) */
            int dt = seed->tbeg - head->tbeg;

            if (dt <= 0 || dq - dt > opt->bw || dt - dq > opt->bw)
                continue; /* not collinear within the band */

            if (seed->qend > head->qend) head->qend = seed->qend;
            if (seed->tend > head->tend) head->tend = seed->tend;
            ++head->chain;
            seed->chain = shift + c;
            joined = 1;
            break;
        }

        if (!joined) {
            /* start a new chain from a copy of this seed */
            chain[n_chains] = *seed;
            chain[n_chains].chain = 1;
            seed->chain = shift + n_chains;
            chain[n_chains].idx = seed->chain;
            ++n_chains;
        }
    }
    return n_chains;
}
// qual:6, strand:1, base:4 int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q) { call_aux_t aux; int i, j, k, w[32]; if (m > m) return -1; memset(q, 0, m * m * sizeof(float)); if (n == 0) return 0; // calculate aux.esum and aux.fsum if (n > 255) { // then sample 255 bases ks_shuffle(uint16_t, n, bases); n = 255; } ks_introsort(uint16_t, n, bases); memset(w, 0, 32 * sizeof(int)); memset(&aux, 0, sizeof(call_aux_t)); for (j = n - 1; j >= 0; --j) { // calculate esum and fsum uint16_t b = bases[j]; int q = b>>5 < 4? 4 : b>>5; if (q > 63) q = 63; k = b&0x1f; aux.fsum[k&0xf] += em->coef->fk[w[k]]; aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[q<<16|n<<8|aux.c[k&0xf]]; ++aux.c[k&0xf]; ++w[k]; } // generate likelihood for (j = 0; j != m; ++j) { float tmp1, tmp3; int tmp2, bar_e; // homozygous for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != m; ++k) { if (k == j) continue; tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k]; } if (tmp2) { bar_e = (int)(tmp1 / tmp3 + 0.499); if (bar_e > 63) bar_e = 63; q[j*m+j] = tmp1; } // heterozygous for (k = j + 1; k < m; ++k) { int cjk = aux.c[j] + aux.c[k]; for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) { if (i == j || i == k) continue; tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i]; } if (tmp2) { bar_e = (int)(tmp1 / tmp3 + 0.499); if (bar_e > 63) bar_e = 63; q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1; } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k } for (k = 0; k != m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0; } return 0; }
/*
 * Try to extend every hit in b towards the left (lower query and target
 * coordinates).
 *
 * The hits are sorted with ks_introsort(hit, ...) first. A hit is skipped
 * when its `l` field is non-zero, when it starts at target position 0, or
 * when an earlier hit in the sorted order contains it on both query and
 * target; each such containing hit has its n_seeds counter bumped (capped
 * just below 1<<14).
 *
 * For the remaining hits the target bases to the left of p->k are unpacked
 * from the 2-bit array pac (through __rpac when is_rev is set), the query
 * prefix is reversed, and aln_extend_core() is run on the two. If the
 * returned score beats p->G, the hit's G, len, beg and k are updated from
 * the alignment path.
 */
void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, uint32_t l_pac, int is_rev, uint8_t *_mem)
{
    int matrix[25];
    int i;
    bwtint_t k;
    uint8_t *rev_query;
    uint8_t *target = 0;
    AlnParam par;

    par.matrix = matrix;
    __gen_ap(par, opt);

    rev_query = calloc(lq, 1);
    /* sort according to the descending order of query end */
    ks_introsort(hit, b->n, b->hits);
    target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);

    /* reverse _query */
    for (i = 0; i < lq; ++i)
        rev_query[lq - i - 1] = _query[i];

    /* core loop */
    for (i = 0; i < b->n; ++i) {
        bsw2hit_t *p = b->hits + i;
        int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
        int n_containing = 0;
        int score, j;
        path_t path;

        p->n_seeds = 1;
        if (p->l || p->k == 0)
            continue;

        /* count earlier hits that fully contain this one */
        for (j = 0; j < i; ++j) {
            bsw2hit_t *q = b->hits + j;
            if (q->beg <= p->beg && q->k <= p->k && q->k + q->len >= p->k + p->len) {
                if (q->n_seeds < (1<<14) - 2)
                    ++q->n_seeds;
                ++n_containing;
            }
        }
        if (n_containing)
            continue;

        if (lt > p->k)
            lt = p->k;

        /* unpack the target leftwards from p->k - 1 */
        j = 0;
        k = p->k - 1;
        if (is_rev) {
            while (k > 0 && j < lt) { /* FIXME: k=0 not considered! */
                target[j++] = __rpac(pac, l_pac, k);
                --k;
            }
        } else {
            while (k > 0 && j < lt) { /* FIXME: k=0 not considered! */
                target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
                --k;
            }
        }
        lt = j;

        score = aln_extend_core(target, lt, rev_query + lq - p->beg, p->beg, &par, &path, 0, p->G, _mem);
        if (score > p->G) { /* extensible */
            p->G = score;
            p->len += path.i;
            p->beg -= path.j;
            p->k -= path.i;
        }
    }

    free(rev_query);
    free(target);
}
/*
 * Trim low-scoring alignments from the tail of vec.
 *
 * The entries from index `offset` onward are sorted with
 * ks_introsort(cdec_score, ...). The score of the entry that ends up at
 * `offset`, minus max_drop, becomes the cut-off: at the first entry scoring
 * below it, vec->n is truncated to that index and the cigar of that entry
 * and of every later one is freed.
 *
 * Nothing is returned; the new size is left in vec->n. When there are no
 * entries at or after `offset` the vector is left untouched.
 */
void drop_low_scores(aln_v *vec, int offset, int max_drop)
{
    const int size = kv_size(*vec);

    /* Nothing to rank: reading the entry at `offset` would be out of bounds. */
    if (offset >= size)
        return;

    ks_introsort(cdec_score, size - offset, vec->a + offset);

    const int min_score = kv_A(*vec, offset).loc.score - max_drop;

    for (int i = offset; i < size; i++) {
        if (kv_A(*vec, i).loc.score < min_score) {
            vec->n = i;

            /* Free remaining */
            for (int j = i; j < size; j++)
                free(kv_A(*vec, j).cigar);
            return;
        }
    }
}
/*
 * Sort the entries of every row of the sparse matrix in place.
 *
 * For each row, the (index, value) pairs are copied into a scratch array,
 * sorted with ks_introsort(column_value_array, ...) and written back to
 * self->indices / self->data over the same range.
 *
 * The scratch array is emptied before each row, so a row is only ever
 * sorted against its own entries and only row_len entries are written
 * back. It is released before returning.
 */
void sparse_matrix_sort_indices(sparse_matrix_t *self) {
    uint32_t row, row_start, row_len, i;

    column_value_array *col_vals = column_value_array_new();
    if (col_vals == NULL) {
        return;
    }

    sparse_matrix_foreach_row(self, row, row_start, row_len, {
        /* start each row from an empty scratch buffer */
        col_vals->n = 0;

        for (i = row_start; i < row_start + row_len; i++) {
            column_value_array_push(col_vals, (column_value_t){self->indices->a[i], self->data->a[i]});
        }

        ks_introsort(column_value_array, col_vals->n, col_vals->a);

        for (i = 0; i < col_vals->n; i++) {
            column_value_t col_val = col_vals->a[i];
            self->indices->a[row_start + i] = col_val.col;
            self->data->a[row_start + i] = col_val.val;
        }
    })

    /* NOTE(review): assumes column_value_array_destroy is the counterpart of
     * column_value_array_new generated alongside it; confirm against the
     * array's declaration. */
    column_value_array_destroy(col_vals);
}
/*
 * Try to extend every hit in b towards the left (lower query and target
 * coordinates) using ksw_extend().
 *
 * The hits are sorted with ks_introsort(hit, ...) first. A hit is skipped
 * when its `l` field is non-zero, when it starts at target position 0, or
 * when an earlier hit in the sorted order contains it on both query and
 * target; each such containing hit has its n_seeds counter bumped (capped
 * just below 1<<13).
 *
 * For the remaining hits the target bases to the left of p->k are unpacked
 * from the 2-bit array pac and aligned against the reversed query prefix.
 * If the returned score beats p->G, the hit's G, k, len and beg are updated
 * from the reported query/target extension lengths.
 *
 * l_pac and _mem are not used by this implementation.
 */
void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem)
{
    int8_t mat[25];
    int i;
    bwtint_t k;
    uint8_t *rev_query;
    uint8_t *target = 0;

    bwa_fill_scmat(opt->a, opt->b, mat);

    rev_query = calloc(lq, 1);
    /* sort according to the descending order of query end */
    ks_introsort(hit, b->n, b->hits);
    target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);

    /* reverse _query */
    for (i = 0; i < lq; ++i)
        rev_query[lq - i - 1] = _query[i];

    /* core loop */
    for (i = 0; i < b->n; ++i) {
        bsw2hit_t *p = b->hits + i;
        int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
        int n_containing = 0;
        int score, j, qle, tle;

        p->n_seeds = 1;
        if (p->l || p->k == 0)
            continue;

        /* count earlier hits that fully contain this one */
        for (j = 0; j < i; ++j) {
            bsw2hit_t *q = b->hits + j;
            if (q->beg <= p->beg && q->k <= p->k && q->k + q->len >= p->k + p->len) {
                if (q->n_seeds < (1<<13) - 2)
                    ++q->n_seeds;
                ++n_containing;
            }
        }
        if (n_containing)
            continue;

        if (lt > p->k)
            lt = p->k;

        /* unpack the target leftwards from p->k - 1 */
        j = 0;
        k = p->k - 1;
        while (k > 0 && j < lt) { /* FIXME: k=0 not considered! */
            target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
            --k;
        }
        lt = j;

        score = ksw_extend(p->beg, &rev_query[lq - p->beg], lt, target, 5, mat,
                           opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0);
        if (score > p->G) { /* extensible */
            p->G = score;
            p->k -= tle;
            p->len += tle;
            p->beg -= qle;
        }
    }

    free(rev_query);
    free(target);
}
/*
 * Sort the k records in buf with the algorithm chosen by sort_type:
 *   0 - merge sort, 1 - introsort, 2 - comb sort,
 *   any other value (including 3) - heap sort (heapmake + heapsort).
 */
static void sort_aux_core(int k, bam1_p *buf, int sort_type)
{
    if (sort_type == 0) {
        ks_mergesort(sort, k, buf, 0);
    } else if (sort_type == 1) {
        ks_introsort(sort, k, buf);
    } else if (sort_type == 2) {
        ks_combsort(sort, k, buf);
    } else {
        ks_heapmake(sort, k, buf);
        ks_heapsort(sort, k, buf);
    }
}
static aln_v align_read(const kseq_t *read, const kseq_v targets, const align_config_t *conf) { kseq_t *r; const int32_t read_len = read->seq.l; aln_v result; kv_init(result); kv_resize(aln_t, result, kv_size(targets)); uint8_t *read_num = calloc(read_len, sizeof(uint8_t)); for(size_t k = 0; k < read_len; ++k) read_num[k] = conf->table[(int)read->seq.s[k]]; // Align to each target kswq_t *qry = NULL; for(size_t j = 0; j < kv_size(targets); j++) { // Encode target r = &kv_A(targets, j); uint8_t *ref_num = calloc(r->seq.l, sizeof(uint8_t)); for(size_t k = 0; k < r->seq.l; ++k) ref_num[k] = conf->table[(int)r->seq.s[k]]; aln_t aln; aln.target_idx = j; aln.loc = ksw_align(read_len, read_num, r->seq.l, ref_num, conf->m, conf->mat, conf->gap_o, conf->gap_e, KSW_XSTART, &qry); ksw_global(aln.loc.qe - aln.loc.qb + 1, &read_num[aln.loc.qb], aln.loc.te - aln.loc.tb + 1, &ref_num[aln.loc.tb], conf->m, conf->mat, conf->gap_o, conf->gap_e, 50, /* TODO: Magic number - band width */ &aln.n_cigar, &aln.cigar); aln.nm = 0; size_t qi = aln.loc.qb, ri = aln.loc.tb; for(size_t k = 0; k < aln.n_cigar; k++) { const int32_t oplen = bam_cigar_oplen(aln.cigar[k]), optype = bam_cigar_type(aln.cigar[k]); if(optype & 3) { // consumes both - check for mismatches for(size_t j = 0; j < oplen; j++) { if(UNLIKELY(read_num[qi + j] != ref_num[ri + j])) aln.nm++; } } else { aln.nm += oplen; } if(optype & 1) qi += oplen; if(optype & 2) ri += oplen; } kv_push(aln_t, result, aln); free(ref_num); } free(qry); free(read_num); ks_introsort(dec_score, kv_size(result), result.a); return result; }
// // em: error model to fit to data // m: number of alleles across all samples // n: number of bases observed in sample // bases[i]: bases observed in pileup [6 bit quality|1 bit strand|4 bit base] // q[i*m+j]: (Output) phred-scaled likelihood of each genotype (i,j) int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q) { // Aux // aux.c is total count of each base observed (ignoring strand) call_aux_t aux; // Loop variables int i, j, k; // The total count of each base observed per strand int w[32]; /* zero out q */ memset(q, 0, m * m * sizeof(float)); if (n == 0) return 0; // calculate aux.esum and aux.fsum if (n > 255) { // then sample 255 bases ks_shuffle(uint16_t, n, bases); n = 255; } ks_introsort(uint16_t, n, bases); /* zero out w and aux */ memset(w, 0, 32 * sizeof(int)); memset(&aux, 0, sizeof(call_aux_t)); for (j = n - 1; j >= 0; --j) { // calculate esum and fsum uint16_t b = bases[j]; /* extract quality and cap at 63 */ int qual = b>>5 < 4? 4 : b>>5; if (qual > 63) qual = 63; /* extract base ORed with strand */ int basestrand = b&0x1f; /* extract base */ int base = b&0xf; aux.fsum[base] += em->coef->fk[w[basestrand]]; aux.bsum[base] += em->coef->fk[w[basestrand]] * em->coef->beta[qual<<16|n<<8|aux.c[base]]; ++aux.c[base]; ++w[basestrand]; } // generate likelihood for (j = 0; j < m; ++j) { float tmp1, tmp3; int tmp2; // homozygous for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k < m; ++k) { if (k == j) continue; tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k]; } if (tmp2) { q[j*m+j] = tmp1; } // heterozygous for (k = j + 1; k < m; ++k) { int cjk = aux.c[j] + aux.c[k]; for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) { if (i == j || i == k) continue; tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i]; } if (tmp2) { q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1; } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k } /* clamp to greater than 0 
*/ for (k = 0; k < m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0; } return 0; }
static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) { bam_lplbuf_t *tv = (bam_lplbuf_t*)data; freenode_t *p; int i, l, max_level; // allocate memory if necessary if (tv->max < n) { // enlarge tv->max = n; kroundup32(tv->max); tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max); tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max); } tv->n_cur = n; // update cnt for (p = tv->head; p->next; p = p->next) if (p->cnt > 0) --p->cnt; // calculate cur_level[] max_level = 0; for (i = l = 0; i < n; ++i) { const bam_pileup1_t *p = pl + i; if (p->is_head) { if (tv->head->next && tv->head->cnt == 0) { // then take a free slot freenode_t *p = tv->head->next; tv->cur_level[i] = tv->head->level; mp_free(tv->mp, tv->head); tv->head = p; --tv->n_nodes; } else tv->cur_level[i] = ++tv->max_level; } else { tv->cur_level[i] = tv->pre_level[l++]; if (p->is_tail) { // then return a free slot tv->tail->level = tv->cur_level[i]; tv->tail->next = mp_alloc(tv->mp); tv->tail = tv->tail->next; ++tv->n_nodes; } } if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i]; ((bam_pileup1_t*)p)->level = tv->cur_level[i]; } assert(l == tv->n_pre); tv->func(tid, pos, n, pl, tv->user_data); // sort the linked list if (tv->n_nodes) { freenode_t *q; if (tv->n_nodes + 1 > tv->m_aux) { // enlarge tv->m_aux = tv->n_nodes + 1; kroundup32(tv->m_aux); tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux); } for (p = tv->head, i = l = 0; p->next;) { if (p->level > max_level) { // then discard this entry q = p->next; mp_free(tv->mp, p); p = q; } else { tv->aux[i++] = p; p = p->next; } } tv->aux[i] = tv->tail; // add a proper tail for the loop below tv->n_nodes = i; if (tv->n_nodes) { ks_introsort(node, tv->n_nodes, tv->aux); for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; tv->head = tv->aux[0]; } else tv->head = tv->tail; } // clean up tv->max_level = max_level; memcpy(tv->pre_level, tv->cur_level, 
tv->n_cur * 4); // squeeze out terminated levels for (i = l = 0; i < n; ++i) { const bam_pileup1_t *p = pl + i; if (!p->is_tail) tv->pre_level[l++] = tv->pre_level[i]; } tv->n_pre = l; /* fprintf(stderr, "%d\t", pos+1); for (i = 0; i < n; ++i) { const bam_pileup1_t *p = pl + i; if (p->is_head) fprintf(stderr, "^"); if (p->is_tail) fprintf(stderr, "$"); fprintf(stderr, "%d,", p->level); } fprintf(stderr, "\n"); */ return 0; }
// qual:6, strand:1, base:4 int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q) { // fprintf(stderr,"n=%d\n",n); if (n == 0) return 0; //no data //reset results array; memset(q, 0, m * m * sizeof(float)); //set "counts" array int w[32]; memset(w, 0, 32 * sizeof(int)); for(int i=0;0&&i<32;i++) fprintf(stderr,"pre w[%d]=%d\n",i,w[i]); call_aux_t aux; memset(&aux, 0, sizeof(call_aux_t)); //sample 255 if depth >255 if ((n > 255)) { // then sample 255 bases //THIS MAKES ks_shuffle(uint16_t, n, bases); n = 255; } ks_introsort(uint16_t, n, bases); for (int j = n - 1; j >= 0; --j) { uint16_t b = bases[j]; // fprintf(stderr,"j=%d q=%d strand=%d base=%d\n",j,b>>5,0x1&(b>>4),b&0xf); // fprintf(stderr,"j=%d q=%d\tc=%c\t",j,(b>>5) + 33,(b>>5) + 33); //cap quality at [4,63] int q = b>>5 < 4? 4 : b>>5; if (q > 63) q = 63; int k = b&0x1f; aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[q<<16|n<<8|aux.c[k&0xf]]; ++aux.c[k&0xf]; ++w[k]; for(int i=0;0&&i<32;i++) fprintf(stderr,"w[%d]=%d\n",i,w[i]); } if(0){ //floating point inprecision compared with samtools binary output. But is correct //the genotype like p(data|A1=g1,A2=g2) = p(data|A1=g2,A2=g1) for (int g1 = 0; g1 <5; ++g1) {//allele1=0,1,2,3,4 for (int g2 = g1; g2<5; ++g2) {//allele2=0,1,2,3,4 if(g1!=g2){ // A1!=A2 - heterozygoues int cjk = aux.c[g1] + aux.c[g2];//total depth for allele g1+allele g2 //binomial when ignoring non A1/A2 alleles: Bin (n,k,p);n=cjk=#g1+#g2 , k= aux.c[g2] = #g2 ; p=0.5. returns log q[g1*5+g2] = -4.343 * em->coef->lhet[cjk<<8|aux.c[g2]]; } for (int k = 0; k <5; ++k){ if(k!=g1 && k!=g2) //if a read has a non A1/A2 alleles it is an error. 
add the log of the prob of these reads q[g1*5+g2] += aux.bsum[k]; } //mirror if(g1!=g2) q[g2*5+g1] = q[g1*5+g2]; if (q[g1*5+g2] < 0.0) q[g1*5+g2] = 0.0; } } return 0; } // generate likelihood THIS WORKS PERFECTLY june 4 ande for (int g1 = 0; g1 <5; ++g1) {//j=0,1,2,3,4 for (int g2 = g1; g2<5; ++g2) {//j=0,1,2,3,4 if(g1==g2){ for (int k = 0; k <5; ++k){ if(k!=g1) q[g1*5+g2] += aux.bsum[k]; } } else{ int other=0; float tmp1=0; for (int k = 0; k < 5; ++k) if (k != g1 && k != g2) { tmp1 += aux.bsum[k]; other = 1; } int cjk = aux.c[g1] + aux.c[g2]; if (other) q[g1*5+g2] = q[g2*5+g1] = -4.343 * em->coef->lhet[cjk<<8|aux.c[g2]] + tmp1; else q[g1*5+g2] = q[g2*5+g1] = -4.343 * em->coef->lhet[cjk<<8|aux.c[g2]]; // all the bases are either j or k } if (q[g1*5+g2] < 0.0) q[g1*5+g2] = 0.0; } } return 0; //old original almost for (int j = 0; j != m; ++j) {//j=0,1,2,3,4 // homozygous for (int k = 0; k != m; ++k){//only updates if k!=j and aux.c[k]!=0 fprintf(stderr,"\t-> j=%d aux.c[%d]=%d aux.bsum[%d]=%f res=%d ",j,k,aux.c[k],k,aux.bsum[k],j*m+j); if (k != j && aux.c[k]) { fprintf(stderr,"USING \n"); q[j*m+j] += aux.bsum[k]; }else fprintf(stderr,"skipping\n"); } // heterozygous for (int k = j + 1; k < m; ++k) {//k=1,...,4 float tmp1=0.0; int isHe=0; for (int i = 0; i < m; ++i) if (i != j && i != k) { tmp1 += aux.bsum[i]; isHe += aux.c[i]; } int cjk = aux.c[j] + aux.c[k]; fprintf(stderr,"j=%d k=%d RES=%d\n",j,k,j*m+k); if(1) { if (isHe) q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1; else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k }else{ q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; if(isHe) q[j*m+k] = q[k*m+j] = q[k*m+j]+tmp1; } } //set to zero if negative, shoulnd't happen for (int k = 0; k != m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0; } // exit(0); return 0; }
/*
 * Chain the hits of both strands and drop hits that belong to chains
 * dominated by a much better-supported chain.
 *
 * Every hit in b[0] and b[1] is turned into a seed record; each strand is
 * chained separately with chaining(). The query coordinates of the second
 * strand's chains are flipped relative to `len`, all chains are sorted
 * together, and a chain is flagged when an earlier, unflagged chain in the
 * sorted order reaches at least as far on the query and has more than
 * 2 * opt->t_seeds times as many seeds. Hits belonging to flagged chains
 * get G = 0, and finally all hits with G == 0 are squeezed out of b[0] and
 * b[1] (updating their n).
 */
void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2])
{
    hsaip_t *seeds[2], *chains[2];
    int n_seeds[2], n_chains[2];
    int strand, i, j, k, total_chains;
    char *dropped;

    /* initialization: both strands share one seed buffer and one chain buffer */
    n_seeds[0] = b[0]->n;
    n_seeds[1] = b[1]->n;
    seeds[0] = (hsaip_t*)calloc(n_seeds[0] + n_seeds[1], sizeof(hsaip_t));
    seeds[1] = seeds[0] + n_seeds[0];
    chains[0] = (hsaip_t*)calloc(n_seeds[0] + n_seeds[1], sizeof(hsaip_t));

    for (strand = 0; strand < 2; ++strand) {
        for (i = 0; i < b[strand]->n; ++i) {
            const bsw2hit_t *hit = b[strand]->hits + i;
            hsaip_t *sd = seeds[strand] + i;

            sd->flag = strand;
            sd->idx = i;
            sd->chain = -1;
            sd->tbeg = hit->k;
            sd->tend = hit->k + hit->len;
            sd->qbeg = hit->beg;
            sd->qend = hit->end;
        }
    }

    /* chaining */
    n_chains[0] = chaining(opt, 0, n_seeds[0], seeds[0], chains[0]);
    chains[1] = chains[0] + n_chains[0];
    n_chains[1] = chaining(opt, n_chains[0], n_seeds[1], seeds[1], chains[1]);
    total_chains = n_chains[0] + n_chains[1];

    /* change query coordinate on the reverse strand */
    for (k = 0; k < n_chains[1]; ++k) {
        hsaip_t *ch = chains[1] + k;
        int old_qbeg = ch->qbeg;

        ch->qbeg = len - ch->qend;
        ch->qend = len - old_qbeg;
    }

    /* filtering */
    dropped = (char*)calloc(total_chains, 1);
    ks_introsort(hsaip, total_chains, chains[0]);
    for (k = 1; k < total_chains; ++k) {
        hsaip_t *cur = chains[0] + k;

        for (j = 0; j < k; ++j) {
            hsaip_t *prev = chains[0] + j;

            if (dropped[prev->idx])
                continue;
            if (prev->qend >= cur->qend && prev->chain > cur->chain * opt->t_seeds * 2) {
                dropped[cur->idx] = 1;
                break;
            }
        }
    }

    /* zero the score of every hit whose chain was dropped */
    for (k = 0; k < n_seeds[0] + n_seeds[1]; ++k) {
        hsaip_t *sd = seeds[0] + k;

        if (dropped[sd->chain])
            b[sd->flag]->hits[sd->idx].G = 0;
    }
    free(dropped);

    /* squeeze out filtered elements in b[2] */
    for (strand = 0; strand < 2; ++strand) {
        int kept = 0;

        for (j = 0; j < n_seeds[strand]; ++j) {
            bsw2hit_t *hit = b[strand]->hits + j;

            if (!hit->G)
                continue;
            if (kept != j)
                b[strand]->hits[kept] = *hit;
            ++kept;
        }
        b[strand]->n = kept;
    }

    /* free */
    free(seeds[0]);
    free(chains[0]);
}
/*
 * Build a string tree of alternative spellings for str.
 *
 * The input is tokenized (keeping whitespace) and searched for dictionary
 * phrases in each of options.languages and in ALL_LANGUAGES. The phrases
 * found are sorted with ks_introsort(phrase_language_array, ...) and walked
 * in that order:
 *   - tokens between phrases are added to the tree one token at a time;
 *   - a phrase whose components overlap options.address_components is
 *     looked up under the key "<language><NAMESPACE_SEPARATOR_CHAR><phrase
 *     text>", and every expansion found contributes either its canonical
 *     string or the original phrase text as an alternative;
 *   - any other phrase is added token by token unchanged.
 * If no phrase is found at all, the whole string becomes the only entry.
 *
 * Returns the new tree, or NULL when tokenization fails.
 */
string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
    log_debug("input=%s\n", str);
    token_array *tokens = tokenize_keep_whitespace(str);

    if (tokens == NULL) {
        return NULL;
    }

    size_t len = strlen(str);

    log_debug("tokenized, num tokens=%zu\n", tokens->n);

    phrase_language_array *phrases = NULL;
    phrase_array *found = NULL;

    /* collect the phrases of every requested language */
    for (int i = 0; i < options.num_languages; i++) {
        char *lang = options.languages[i];
        log_debug("lang=%s\n", lang);
        found = search_address_dictionaries_tokens(str, tokens, lang);

        if (found == NULL) {
            log_debug("lang_phrases NULL\n");
            continue;
        }

        log_debug("lang_phrases->n = %zu\n", found->n);

        if (phrases == NULL) {
            phrases = phrase_language_array_new_size(found->n);
        }

        for (int j = 0; j < found->n; j++) {
            phrase_t p = found->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){lang, p});
        }

        phrase_array_destroy(found);
    }

    /* ... and the language-independent ones */
    found = search_address_dictionaries_tokens(str, tokens, ALL_LANGUAGES);
    if (found != NULL) {
        if (phrases == NULL) {
            phrases = phrase_language_array_new_size(found->n);
        }

        for (int j = 0; j < found->n; j++) {
            phrase_t p = found->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p});
        }
        phrase_array_destroy(found);
    }

    string_tree_t *tree = string_tree_new_size(len);

    if (phrases == NULL) {
        /* nothing matched: the input itself is the only alternative */
        string_tree_add_string(tree, str);
        string_tree_finalize_token(tree);
        token_array_destroy(tokens);
        return tree;
    }

    log_debug("phrases not NULL, n=%zu\n", phrases->n);
    ks_introsort(phrase_language_array, phrases->n, phrases->a);

    int start = 0;  /* first token not yet emitted */
    int end = 0;

    char_array *key = char_array_new_size(DEFAULT_KEY_LEN);

    for (int i = 0; i < phrases->n; i++) {
        phrase_language_t phrase_lang = phrases->a[i];
        phrase_t phrase = phrase_lang.phrase;

        /* key prefix: language + namespace separator */
        char_array_clear(key);
        char_array_cat(key, phrase_lang.language);
        char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);
        size_t namespace_len = key->n;

        /* emit the plain tokens that precede this phrase */
        end = phrase.start;
        for (int j = start; j < end; j++) {
            token_t token = tokens->a[j];
            if (token.type == WHITESPACE) {
                log_debug("Adding space\n");
                string_tree_add_string(tree, " ");
            } else {
                log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
                string_tree_add_string_len(tree, str + token.offset, token.len);
            }
            string_tree_finalize_token(tree);
        }

        expansion_value_t value;
        value.value = phrase.data;

        if (value.components & options.address_components) {
            /* append the phrase text to the key, whitespace collapsed to " " */
            key->n = namespace_len;
            for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
                token_t token = tokens->a[j];
                if (token.type == WHITESPACE) {
                    char_array_cat(key, " ");
                } else {
                    char_array_cat_len(key, str + token.offset, token.len);
                }
            }

            char *key_str = char_array_get_string(key);
            log_debug("key_str=%s\n", key_str);
            address_expansion_array *expansions = address_dictionary_get_expansions(key_str);

            if (expansions != NULL) {
                for (int j = 0; j < expansions->n; j++) {
                    address_expansion_t expansion = expansions->a[j];

                    if (expansion.canonical_index == NULL_CANONICAL_INDEX) {
                        /* no canonical form: keep the phrase as written */
                        for (int k = phrase.start; k < phrase.start + phrase.len; k++) {
                            token_t token = tokens->a[k];
                            if (token.type == WHITESPACE) {
                                string_tree_add_string(tree, " ");
                            } else {
                                string_tree_add_string_len(tree, str + token.offset, token.len);
                            }
                        }
                        continue;
                    }

                    char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
                    int followed_by_numeric = 0;

                    if (phrase.start + phrase.len < tokens->n - 1) {
                        token_t next_token = tokens->a[phrase.start + phrase.len];
                        followed_by_numeric = is_numeric_token(next_token.type);
                    }

                    if (!followed_by_numeric) {
                        string_tree_add_string(tree, canonical);
                    } else {
                        /* canonical form plus a trailing space, stored as one string */
                        (void)cstring_array_start_token(tree->strings);
                        cstring_array_append_string(tree->strings, canonical);
                        cstring_array_append_string(tree->strings, " ");
                        cstring_array_terminate(tree->strings);
                    }
                }
                string_tree_finalize_token(tree);
            }
        } else {
            /* phrase not relevant for the requested components: copy it through */
            for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
                token_t token = tokens->a[j];
                if (token.type == WHITESPACE) {
                    string_tree_add_string(tree, " ");
                } else {
                    log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
                    string_tree_add_string_len(tree, str + token.offset, token.len);
                }
                string_tree_finalize_token(tree);
            }
        }

        start = phrase.start + phrase.len;
    }

    char_array_destroy(key);

    /* emit whatever follows the last phrase */
    end = (int)tokens->n;
    for (int j = start; j < end; j++) {
        token_t token = tokens->a[j];
        if (token.type == WHITESPACE) {
            log_debug("Adding space\n");
            string_tree_add_string(tree, " ");
        } else {
            log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
            string_tree_add_string_len(tree, str + token.offset, token.len);
        }
        string_tree_finalize_token(tree);
    }

    phrase_language_array_destroy(phrases);
    token_array_destroy(tokens);

    return tree;
}
int main(int argc, char *argv[]) { int c, dret, lineno = 0, n_rows = 0, m_rows = 0, n_cols = 0, max_hap = 0; int64_t n_missing = 0, n_tot = 0; gzFile fp; kstream_t *ks; kstring_t str = {0,0,0}; int8_t **C = 0; double **M, *X, min_maf = 0.0; char **names = 0; // _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~(_MM_MASK_INVALID | _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_DIV_ZERO)); _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~(_MM_MASK_INVALID | _MM_MASK_DIV_ZERO)); while ((c = getopt(argc, argv, "m:")) >= 0) { if (c == 'm') min_maf = atof(optarg); } if (argc - optind == 0) { fprintf(stderr, "Usage: naivepca [-m min_maf] <in.txt>\n"); return 1; } fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "[E::%s] failed to open file '%s'. Abort.\n", __func__, argv[optind]); return 2; } ks = ks_init(fp); // read the matrix into C while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { int8_t *q; char *p, *name = str.s; int i; ++lineno; for (p = str.s; *p && *p != '\t' && *p != ' '; ++p); if (*p) { *p++ = 0; for (; *p && (*p == '\t' || *p == ' '); ++p); } if (*p == 0) { fprintf(stderr, "[W::%s] line %d has one field; skipped.\n", __func__, lineno); continue; } if (n_cols != 0) { if (n_cols != str.s + str.l - p) { fprintf(stderr, "[W::%s] line %d has a different number of columns; skipped.\n", __func__, lineno); continue; } } else n_cols = str.s + str.l - p; if (n_rows == m_rows) { m_rows = m_rows? m_rows<<1 : 16; C = (int8_t**)realloc(C, m_rows * sizeof(int8_t*)); names = (char**)realloc(names, m_rows * sizeof(char*)); } names[n_rows] = strdup(name); q = C[n_rows++] = (int8_t*)calloc(n_cols, sizeof(double)); for (i = 0; i < n_cols; ++i) { if (p[i] >= '0' && p[i] <= '9') q[i] = p[i] - '0'; else q[i] = -1, ++n_missing; max_hap = max_hap > q[i]? 
max_hap : q[i]; } n_tot += n_cols; } free(str.s); fprintf(stderr, "[M::%s] read %d samples and %d sites; ploidy is %d\n", __func__, n_rows, n_cols, max_hap); fprintf(stderr, "[M::%s] %.3f%% of genotypes are missing\n", __func__, (double)n_missing / n_tot); { // normalize the matrix into M int i, j, *sum, *cnt, n_dropped = 0; double *mu, *pp; sum = (int*)calloc(n_cols, sizeof(int)); cnt = (int*)calloc(n_cols, sizeof(int)); mu = (double*)calloc(n_cols, sizeof(double)); pp = (double*)calloc(n_cols, sizeof(double)); for (i = 0; i < n_rows; ++i) { int8_t *q = C[i]; for (j = 0; j < n_cols; ++j) if (q[j] >= 0) sum[j] += q[j], ++cnt[j]; } for (j = 0; j < n_cols; ++j) { if (cnt[j] > 0) { mu[j] = (double)sum[j] / cnt[j]; pp[j] = mu[j] / max_hap; if (pp[j] < min_maf || 1. - pp[j] < min_maf) ++n_dropped; } else ++n_dropped; } fprintf(stderr, "[M::%s] %d rare sites are dropped\n", __func__, n_dropped); M = (double**)calloc(n_rows, sizeof(double*)); for (i = 0; i < n_rows; ++i) { int8_t *q = C[i]; double *r; r = M[i] = (double*)calloc(n_cols, sizeof(double)); for (j = 0; j < n_cols; ++j) r[j] = q[j] < 0 || pp[j] < min_maf || 1. - pp[j] < min_maf || pp[j] == 0. || 1 - pp[j] == 0. ? 0. : (q[j] - mu[j]) / sqrt(pp[j] * (1. 
- pp[j])); } free(sum); free(cnt); free(mu); free(pp); for (i = 0; i < n_rows; ++i) free(C[i]); free(C); } { // multiplication int i, j, k; X = (double*)calloc(n_rows * n_rows, sizeof(double)); for (i = 0; i < n_rows; ++i) { double *zi = M[i]; for (j = 0; j <= i; ++j) { double t = 0., *zj = M[j]; for (k = 0; k < n_cols; ++k) t += zi[k] * zj[k]; X[i*n_rows + j] = X[j*n_rows + i] = t / n_cols; } } for (i = 0; i < n_rows; ++i) free(M[i]); free(M); } { // print eigan vectors double *ev; int i, j; evsrt_t *evsrt; ev = (double*)calloc(n_rows, sizeof(double)); evsrt = (evsrt_t*)calloc(n_rows, sizeof(evsrt_t)); n_eigen_symm(X, n_rows, ev); for (i = 0; i < n_rows; ++i) evsrt[i].ev = ev[i], evsrt[i].i = i; ks_introsort(ev, n_rows, evsrt); for (i = 0; i < n_rows; ++i) { printf("%s", names[i]); for (j = 0; j < n_rows; ++j) printf("\t%.6f", X[i*n_rows + evsrt[j].i] * evsrt[j].ev); putchar('\n'); free(names[i]); } free(ev); free(evsrt); free(X); free(names); } ks_destroy(ks); gzclose(fp); return 0; }
/*
 * Select alignment regions of `a` that combine well on the query.
 *
 * For every pair of regions (i < j) an F-measure is computed from how their
 * query intervals cover a read of length l_seq: pairs where one interval
 * contains the other are ignored, and only pairs scoring above 0.95 are
 * kept. The kept pairs are sorted with ks_introsort(ff_mem_flt, ...).
 *
 * The result always starts with a.a[0] (unless n == 0, in which case it is
 * empty). Partners are then appended from the sorted pair list:
 *   mode != 0: every pair involving region 0 contributes, followed by the
 *              pairs not involving region 0 that tie with the first
 *              F-measure in the sorted list;
 *   mode == 0: only the pairs tying with the first F-measure contribute.
 *
 * The returned vector is owned by the caller.
 */
mem_alnreg_v mem_fmeas_fliter_se(mem_alnreg_v a , int n , int l_seq , int mode)
{
    mem_alnreg_v picked;
    kvec_t(FF_t) pairs;
    int i, j;
    double best_fmeas;

    kv_init(pairs);
    kv_init(picked);

    if (n == 0)
        return picked;

    /* calculate the F-measure of every pair of regions */
    for (i = 0; i < a.n; i++) {
        mem_alnreg_t *p_ar = a.a + i;

        for (j = i + 1; j < a.n; j++) {
            mem_alnreg_t *q_ar = a.a + j;
            mem_alnreg_t *first, *second;
            int FN = 0, TP = 0, TN = 0, FP = 0;
            int A, B, C, D; /* [A,B] = first interval, [C,D] = second, inclusive */
            double sens, spec;
            FF_t cand;

            /* order the two by query start; on a tie the longer one goes first */
            if (p_ar->qb < q_ar->qb || (p_ar->qb == q_ar->qb && p_ar->qe >= q_ar->qe)) {
                first = p_ar;
                second = q_ar;
            } else {
                first = q_ar;
                second = p_ar;
            }
            A = first->qb;
            B = first->qe - 1;
            C = second->qb;
            D = second->qe - 1;

            if (B < C) {
                /* disjoint intervals */
                TP = B - A + D - C + 2;
                FN = l_seq - D - 1 + A + C - B - 1;
                TN = l_seq;
                FP = 0;
            } else if (D <= B) {
                /* second interval contained in the first */
                continue;
            } else {
                /* partial overlap */
                TP = D - A + 1;
                FN = l_seq - D - 1 + A;
                FP = B - C + 1;
                TN = l_seq - FP;
            }

            sens = (double)TP/(double)(TP+FN);
            spec = (double)TN/(double)(TN+FP);
            cand.FMEAS = (2*spec*sens)/(spec+sens);
            cand.score = p_ar->score + q_ar->score;
            cand.x = i;
            cand.y = j;
            if (cand.FMEAS > 0.95) {
                kv_push(FF_t, pairs, cand);
            }
        }
    }

    ks_introsort(ff_mem_flt, pairs.n, pairs.a);
    kv_push(mem_alnreg_t, picked, a.a[0]);

    if (pairs.n == 0)
        return picked;

    best_fmeas = pairs.a[0].FMEAS;

    if (mode) {
        int seen_first = 0;

        /* every pair that involves region 0 */
        for (i = 0; i < kv_size(pairs); i++) {
            FF_t pr = kv_A(pairs, i);

            if (pr.x != 0)
                continue;
            if (seen_first) {
                kv_push(mem_alnreg_t, picked, a.a[0]);
            }
            /* region 0 already heads the result, so the first pair adds only its partner */
            seen_first = 1;
            kv_push(mem_alnreg_t, picked, a.a[pr.y]);
        }

        /* pairs without region 0 that tie with the first F-measure */
        for (i = 0; i < kv_size(pairs); i++) {
            FF_t pr = kv_A(pairs, i);

            if (best_fmeas != pr.FMEAS)
                break;
            if (pr.x == 0)
                continue;
            kv_push(mem_alnreg_t, picked, a.a[pr.x]);
            kv_push(mem_alnreg_t, picked, a.a[pr.y]);
        }
    } else {
        for (i = 0; i < kv_size(pairs); i++) {
            FF_t pr = kv_A(pairs, i);

            if (best_fmeas != pr.FMEAS)
                break;
            if (pr.x == 0) {
                /* NOTE(review): unlike the mode != 0 path, region 0 is never
                 * re-added for later pairs here; the original had a branch
                 * for that which could never run. Confirm that is intended. */
                kv_push(mem_alnreg_t, picked, a.a[pr.y]);
                continue;
            }
            kv_push(mem_alnreg_t, picked, a.a[pr.x]);
            kv_push(mem_alnreg_t, picked, a.a[pr.y]);
        }
    }

    kv_destroy(pairs);
    return picked;
}