Beispiel #1
0
/*
 * Group the n seeds in z[] into chains.
 *
 * The seeds are first sorted with the hsaip ordering. Each seed is then
 * compared against the existing chains, most recent first; it joins the first
 * chain whose start lies strictly before it on the target and whose diagonal
 * offset is within opt->bw. A seed that joins nothing opens a new chain.
 *
 * On return every seed's `chain` field holds shift + <chain index>, each
 * chain[i].chain holds the number of seeds merged into it, and chain[i].idx
 * holds shift + i. Returns the number of chains written to chain[].
 */
static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t *chain)
{
	int n_chains = 0;
	int i;

	ks_introsort(hsaip, n, z);
	for (i = 0; i < n; ++i) {
		hsaip_t *seed = &z[i];
		int merged = 0;
		int c;

		for (c = n_chains - 1; c >= 0 && !merged; --c) {
			hsaip_t *cand = &chain[c];
			// query offset; presumably non-negative given the sort order -- TODO confirm
			int dq = seed->qbeg - cand->qbeg;
			int dt = seed->tbeg - cand->tbeg;

			if (dt <= 0) continue;          // seed must start after the chain on the target
			if (dq - dt > opt->bw) continue; // too far off the diagonal
			if (dt - dq > opt->bw) continue;

			// absorb the seed: grow the chain's end coordinates
			if (cand->qend < seed->qend) cand->qend = seed->qend;
			if (cand->tend < seed->tend) cand->tend = seed->tend;
			++cand->chain;
			seed->chain = shift + c;
			merged = 1;
		}
		if (merged) continue;

		// no compatible chain: this seed starts a new one
		chain[n_chains] = *seed;
		chain[n_chains].chain = 1;
		chain[n_chains].idx = seed->chain = shift + n_chains;
		++n_chains;
	}
	return n_chains;
}
Beispiel #2
0
/*
 * Compute genotype scores for one set of observed bases.
 *
 * em     error model; em->coef->fk, ->beta and ->lhet are read
 * n      number of entries in bases[]; if larger than 255 the array is
 *        shuffled and only the first 255 entries are used
 * m      number of alleles; q is treated as an m*m matrix
 * bases  observations, packed as qual:6, strand:1, base:4 (reordered in place)
 * q      output: q[j*m+k] is the score of genotype (j,k); the matrix is
 *        symmetric and negative values are clamped to 0
 *
 * Always returns 0. The former `m > m` guard could never trigger and has been
 * removed; callers must keep m within the capacity of call_aux_t.
 */
int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
{
	call_aux_t aux;
	int i, j, k, w[32];

	memset(q, 0, m * m * sizeof(float));
	if (n == 0) return 0;
	if (n > 255) { // then sample 255 bases
		ks_shuffle(uint16_t, n, bases);
		n = 255;
	}
	ks_introsort(uint16_t, n, bases);
	memset(w, 0, 32 * sizeof(int));
	memset(&aux, 0, sizeof(call_aux_t));
	for (j = n - 1; j >= 0; --j) { // accumulate fsum, bsum and the per-base counts
		uint16_t b = bases[j];
		// quality limited to [4,63]; named `qual` so it no longer shadows the output `q`
		int qual = b>>5 < 4? 4 : b>>5;
		if (qual > 63) qual = 63;
		k = b&0x1f; // strand and base; k&0xf is the base alone
		aux.fsum[k&0xf] += em->coef->fk[w[k]];
		aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[qual<<16|n<<8|aux.c[k&0xf]];
		++aux.c[k&0xf];
		++w[k];
	}
	// generate likelihood
	for (j = 0; j != m; ++j) {
		float tmp1;
		int tmp2;
		// homozygous: every base other than j counts against the genotype
		for (k = 0, tmp1 = 0.0, tmp2 = 0; k != m; ++k) {
			if (k == j) continue;
			tmp1 += aux.bsum[k]; tmp2 += aux.c[k];
		}
		if (tmp2) q[j*m+j] = tmp1;
		// heterozygous
		for (k = j + 1; k < m; ++k) {
			int cjk = aux.c[j] + aux.c[k];
			for (i = 0, tmp2 = 0, tmp1 = 0.0; i < m; ++i) {
				if (i == j || i == k) continue;
				tmp1 += aux.bsum[i]; tmp2 += aux.c[i];
			}
			if (tmp2) {
				q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1;
			} else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k
		}
		for (k = 0; k != m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0;
	}
	return 0;
}
Beispiel #3
0
/*
 * Try to extend every hit in b towards the start of the query.
 *
 * A reversed copy of _query is built, the hits are sorted, and for each hit
 * that is not skipped the target bases preceding p->k are read from pac
 * (through __rpac when is_rev is set) and aligned against the reversed query
 * prefix with aln_extend_core(). When the returned score exceeds p->G the
 * hit's G, len, beg and k are updated in place.
 *
 * Side effects: b->hits is reordered and every hit's n_seeds is reset to 1
 * (and then possibly incremented when a later hit is contained in it).
 * NOTE(review): neither calloc() result is checked before use.
 */
void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, uint32_t l_pac, int is_rev, uint8_t *_mem)
{
	int i, matrix[25];
	bwtint_t k;
	uint8_t *target = 0, *query;
	AlnParam par;

	par.matrix = matrix;
	__gen_ap(par, opt); // macro defined elsewhere; presumably fills par from opt -- TODO confirm
	query = calloc(lq, 1);
	// sort according to the descending order of query end
	ks_introsort(hit, b->n, b->hits);
	// sized with the same formula as lt below, using lq in place of p->beg
	target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);
	// reverse _query
	for (i = 0; i < lq; ++i) query[lq - i - 1] = _query[i];
	// core loop
	for (i = 0; i < b->n; ++i) {
		bsw2hit_t *p = b->hits + i;
		// upper bound on how many target bases to fetch for this hit
		int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
		int score, j;
		path_t path;
		p->n_seeds = 1;
		if (p->l || p->k == 0) continue;
		// count earlier hits that contain this one; `score` doubles as that counter here
		for (j = score = 0; j < i; ++j) {
			bsw2hit_t *q = b->hits + j;
			if (q->beg <= p->beg && q->k <= p->k && q->k + q->len >= p->k + p->len) {
				if (q->n_seeds < (1<<14) - 2) ++q->n_seeds;
				++score;
			}
		}
		if (score) continue; // contained in an earlier hit: do not extend
		if (lt > p->k) lt = p->k; // cannot fetch more bases than precede p->k
		// copy target bases walking backwards from p->k - 1
		if (is_rev) {
			for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered!
				target[j++] = __rpac(pac, l_pac, k);
		} else {
			for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered!
				target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
		}
		lt = j; // number of bases actually fetched
		score = aln_extend_core(target, lt, query + lq - p->beg, p->beg, &par, &path, 0, p->G, _mem);
		if (score > p->G) { // extensible
			p->G = score;
			p->len += path.i;
			p->beg -= path.j;
			p->k -= path.i;
		}
	}
	free(query); free(target);
}
Beispiel #4
0
/*
 * Sort the alignments in vec from index `offset` onward with the cdec_score
 * ordering, then truncate vec at the first of those alignments whose score is
 * more than max_drop below the score of the alignment now at `offset`.
 * The cigar of every removed alignment is freed.
 *
 * The truncation relies on cdec_score placing higher scores first
 * (presumably a descending-score order -- confirm against its definition).
 * Nothing is returned; the new size is available as vec->n.
 * Does nothing when there is no alignment at or after `offset`.
 */
void drop_low_scores(aln_v *vec, int offset, int max_drop) {
  const int size = kv_size(*vec);

  /* Empty range: reading vec->a[offset] below would be out of bounds. */
  if (offset < 0 || offset >= size)
    return;

  ks_introsort(cdec_score, size - offset, vec->a + offset);
  const int min_score = kv_A(*vec, offset).loc.score - max_drop;
  for (int i = offset; i < size; i++) {
    if (kv_A(*vec, i).loc.score < min_score) {
      vec->n = i;

      /* Free the cigars of everything that was cut off */
      for (int j = i; j < size; j++)
        free(kv_A(*vec, j).cigar);
      return;
    }
  }
}
Beispiel #5
0
/*
 * Sort the entries of every row of the matrix in place, using the
 * column_value_array ordering, keeping each value paired with its column.
 *
 * Each row is copied into a scratch array of (column, value) pairs, sorted,
 * and written back over the same row. The scratch array is emptied before
 * each row so that a row is only ever sorted against itself, and it is
 * released before returning.
 */
void sparse_matrix_sort_indices(sparse_matrix_t *self) {
    uint32_t row, row_start, row_len, i;

    column_value_array *col_vals = column_value_array_new();
    if (col_vals == NULL) return;

    sparse_matrix_foreach_row(self, row, row_start, row_len, {
        /* Start from an empty scratch array: entries left over from the
           previous row would otherwise be sorted in and written past the
           end of this row. */
        column_value_array_clear(col_vals);

        for (i = row_start; i < row_start + row_len; i++) {
            column_value_array_push(col_vals, (column_value_t){self->indices->a[i], self->data->a[i]});
        }
        ks_introsort(column_value_array, col_vals->n, col_vals->a);

        for (i = 0; i < col_vals->n; i++) {
            column_value_t col_val = col_vals->a[i];
            self->indices->a[row_start + i] = col_val.col;
            self->data->a[row_start + i] = col_val.val;
        }
    })

    column_value_array_destroy(col_vals);
}
Beispiel #6
0
/*
 * Try to extend every hit in b towards the start of the query.
 *
 * The hits are sorted, then each hit that is neither skipped nor contained in
 * an earlier hit is extended with ksw_extend() against the target bases that
 * precede it in pac. A hit whose extension scores above its current G has its
 * G, k, len and beg updated in place. Every hit's n_seeds is reset to 1 and
 * incremented once for each later hit it contains.
 */
void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem)
{
	int8_t mat[25];
	uint8_t *rev_query, *target = 0;
	int h;

	bwa_fill_scmat(opt->a, opt->b, mat);
	rev_query = calloc(lq, 1);
	// order the hits before looking for containment
	ks_introsort(hit, b->n, b->hits);
	target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);
	// rev_query is _query read back to front
	for (h = 0; h < lq; ++h) rev_query[lq - h - 1] = _query[h];

	for (h = 0; h < b->n; ++h) {
		bsw2hit_t *cur = &b->hits[h];
		int max_tlen = ((cur->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
		int n_containing = 0, n_fetched = 0;
		int g, ext_score, qle, tle;
		bwtint_t pos;

		cur->n_seeds = 1;
		if (cur->l || cur->k == 0) continue;

		// earlier hits that fully contain this one absorb it as an extra seed
		for (g = 0; g < h; ++g) {
			bsw2hit_t *prev = &b->hits[g];
			if (prev->beg > cur->beg) continue;
			if (prev->k > cur->k) continue;
			if (prev->k + prev->len < cur->k + cur->len) continue;
			if (prev->n_seeds < (1<<13) - 2) ++prev->n_seeds;
			++n_containing;
		}
		if (n_containing) continue;

		if (max_tlen > cur->k) max_tlen = cur->k;
		// fetch target bases walking backwards from cur->k - 1
		pos = cur->k - 1;
		while (pos > 0 && n_fetched < max_tlen) { // FIXME: pos=0 not considered!
			target[n_fetched++] = pac[pos>>2] >> (~pos&3)*2 & 0x3;
			--pos;
		}

		ext_score = ksw_extend(cur->beg, &rev_query[lq - cur->beg], n_fetched, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, cur->G, &qle, &tle, 0, 0, 0);
		if (ext_score <= cur->G) continue; // not extensible
		cur->G = ext_score;
		cur->k -= tle;
		cur->len += tle;
		cur->beg -= qle;
	}
	free(rev_query); free(target);
}
Beispiel #7
0
/*
 * Sort the k records in buf using the algorithm selected by sort_type:
 *   0 -> merge sort, 1 -> introsort, 2 -> comb sort,
 *   anything else (including 3) -> heap sort.
 */
static void
sort_aux_core(int k, bam1_p *buf, int sort_type)
{
  if (sort_type == 0) {
    ks_mergesort(sort, k, buf, 0);
  } else if (sort_type == 1) {
    ks_introsort(sort, k, buf);
  } else if (sort_type == 2) {
    ks_combsort(sort, k, buf);
  } else {
    /* heap sort is the fallback for every other value */
    ks_heapmake(sort, k, buf);
    ks_heapsort(sort, k, buf);
  }
}
Beispiel #8
0
/*
 * Align one read against every target sequence.
 *
 * For each target the read is aligned with ksw_align(), the aligned region is
 * re-aligned with ksw_global() to obtain a CIGAR, and aln.nm is computed from
 * that CIGAR: mismatching positions inside operations that consume both
 * sequences, plus the full length of every other operation.
 *
 * Returns one aln_t per target, sorted with the dec_score ordering. Each
 * returned aln_t carries the cigar produced by ksw_global(); nothing in this
 * function frees it.
 */
static aln_v align_read(const kseq_t *read,
                        const kseq_v targets,
                        const align_config_t *conf)
{
    kseq_t *r;
    const int32_t read_len = read->seq.l;

    aln_v result;
    kv_init(result);
    kv_resize(aln_t, result, kv_size(targets));

    // Encode the read once; it is reused for every target
    uint8_t *read_num = calloc(read_len, sizeof(uint8_t));

    for(size_t k = 0; k < read_len; ++k)
        read_num[k] = conf->table[(int)read->seq.s[k]];

    // Align to each target
    kswq_t *qry = NULL;
    for(size_t j = 0; j < kv_size(targets); j++) {
        // Encode target
        r = &kv_A(targets, j);
        uint8_t *ref_num = calloc(r->seq.l, sizeof(uint8_t));
        for(size_t k = 0; k < r->seq.l; ++k)
            ref_num[k] = conf->table[(int)r->seq.s[k]];

        aln_t aln;
        aln.target_idx = j;
        aln.loc = ksw_align(read_len, read_num,
                            r->seq.l, ref_num,
                            conf->m,
                            conf->mat,
                            conf->gap_o,
                            conf->gap_e,
                            KSW_XSTART,
                            &qry);
        ksw_global(aln.loc.qe - aln.loc.qb + 1,
                   &read_num[aln.loc.qb],
                   aln.loc.te - aln.loc.tb + 1,
                   &ref_num[aln.loc.tb],
                   conf->m,
                   conf->mat,
                   conf->gap_o,
                   conf->gap_e,
                   50, /* TODO: Magic number - band width */
                   &aln.n_cigar,
                   &aln.cigar);

        aln.nm = 0;
        size_t qi = aln.loc.qb, ri = aln.loc.tb;
        for(size_t k = 0; k < aln.n_cigar; k++) {
            const int32_t oplen = bam_cigar_oplen(aln.cigar[k]),
                          optype = bam_cigar_type(aln.cigar[k]);

            // Bit 1 = consumes query, bit 2 = consumes reference. Only an
            // operation with BOTH bits set lines bases up against each other;
            // testing `optype & 3` alone also matched insertions/deletions and
            // compared unrelated (possibly out-of-range) positions.
            if((optype & 3) == 3) { // consumes both - check for mismatches
                for(size_t off = 0; off < oplen; off++) {
                    if(UNLIKELY(read_num[qi + off] != ref_num[ri + off]))
                        aln.nm++;
                }
            } else {
                // everything else counts with its full length
                aln.nm += oplen;
            }
            if(optype & 1) qi += oplen;
            if(optype & 2) ri += oplen;
        }

        kv_push(aln_t, result, aln);
        free(ref_num);
    }
    free(qry);
    free(read_num);
    ks_introsort(dec_score, kv_size(result), result.a);

    return result;
}
Beispiel #9
0
/*
 * Compute genotype scores for the bases observed in one sample.
 *
 * em        error model to fit to the data
 * n         number of bases observed; above 255 the array is shuffled and
 *           only the first 255 entries are used
 * m         number of alleles; q is an m*m matrix
 * bases[i]  observations packed as [6 bit quality|1 bit strand|4 bit base];
 *           the array is reordered in place
 * q[i*m+j]  (output) phred-scaled likelihood of genotype (i,j); symmetric,
 *           never negative
 *
 * Always returns 0.
 */
int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
{
    call_aux_t aux;     /* aux.c: count of each base, strands pooled */
    int per_strand[32]; /* count of each base on each strand */
    int a1, a2, other, idx;

    memset(q, 0, m * m * sizeof(float));
    if (n == 0) return 0;

    if (n > 255) { /* down-sample to 255 observations */
        ks_shuffle(uint16_t, n, bases);
        n = 255;
    }
    ks_introsort(uint16_t, n, bases);

    memset(per_strand, 0, sizeof(per_strand));
    memset(&aux, 0, sizeof(aux));

    /* walk the sorted observations from last to first, accumulating sums */
    for (idx = n; idx-- > 0; ) {
        const uint16_t obs = bases[idx];
        const int strand_base = obs & 0x1f;
        const int base = obs & 0xf;
        int qual = obs >> 5;

        /* keep the quality inside [4,63] */
        if (qual < 4) qual = 4;
        else if (qual > 63) qual = 63;

        aux.fsum[base] += em->coef->fk[per_strand[strand_base]];
        aux.bsum[base] += em->coef->fk[per_strand[strand_base]] * em->coef->beta[qual<<16|n<<8|aux.c[base]];
        ++aux.c[base];
        ++per_strand[strand_base];
    }

    for (a1 = 0; a1 < m; ++a1) {
        float err_sum, f_sum;
        int n_other;

        /* homozygous a1/a1: all other bases are errors */
        err_sum = f_sum = 0.0;
        n_other = 0;
        for (other = 0; other < m; ++other) {
            if (other != a1) {
                err_sum += aux.bsum[other];
                n_other += aux.c[other];
                f_sum += aux.fsum[other];
            }
        }
        if (n_other) q[a1*m+a1] = err_sum;

        /* heterozygous a1/a2 */
        for (a2 = a1 + 1; a2 < m; ++a2) {
            int cjk = aux.c[a1] + aux.c[a2];

            n_other = 0;
            err_sum = f_sum = 0.0;
            for (other = 0; other < m; ++other) {
                if (other != a1 && other != a2) {
                    err_sum += aux.bsum[other];
                    n_other += aux.c[other];
                    f_sum += aux.fsum[other];
                }
            }
            if (n_other == 0) {
                /* every observed base is a1 or a2 */
                q[a1*m+a2] = q[a2*m+a1] = -4.343 * em->coef->lhet[cjk<<8|aux.c[a2]];
            } else {
                q[a1*m+a2] = q[a2*m+a1] = -4.343 * em->coef->lhet[cjk<<8|aux.c[a2]] + err_sum;
            }
        }

        /* no negative scores in row a1 */
        for (other = 0; other < m; ++other) {
            if (q[a1*m+other] < 0.0) q[a1*m+other] = 0.0;
        }
    }

    return 0;
}
Beispiel #10
0
static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
{
	bam_lplbuf_t *tv = (bam_lplbuf_t*)data;
	freenode_t *p;
	int i, l, max_level;
	// allocate memory if necessary
	if (tv->max < n) { // enlarge
		tv->max = n;
		kroundup32(tv->max);
		tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max);
		tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max);
	}
	tv->n_cur = n;
	// update cnt
	for (p = tv->head; p->next; p = p->next)
		if (p->cnt > 0) --p->cnt;
	// calculate cur_level[]
	max_level = 0;
	for (i = l = 0; i < n; ++i) {
		const bam_pileup1_t *p = pl + i;
		if (p->is_head) {
			if (tv->head->next && tv->head->cnt == 0) { // then take a free slot
				freenode_t *p = tv->head->next;
				tv->cur_level[i] = tv->head->level;
				mp_free(tv->mp, tv->head);
				tv->head = p;
				--tv->n_nodes;
			} else tv->cur_level[i] = ++tv->max_level;
		} else {
			tv->cur_level[i] = tv->pre_level[l++];
			if (p->is_tail) { // then return a free slot
				tv->tail->level = tv->cur_level[i];
				tv->tail->next = mp_alloc(tv->mp);
				tv->tail = tv->tail->next;
				++tv->n_nodes;
			}
		}
		if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i];
		((bam_pileup1_t*)p)->level = tv->cur_level[i];
	}
	assert(l == tv->n_pre);
	tv->func(tid, pos, n, pl, tv->user_data);
	// sort the linked list
	if (tv->n_nodes) {
		freenode_t *q;
		if (tv->n_nodes + 1 > tv->m_aux) { // enlarge
			tv->m_aux = tv->n_nodes + 1;
			kroundup32(tv->m_aux);
			tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux);
		}
		for (p = tv->head, i = l = 0; p->next;) {
			if (p->level > max_level) { // then discard this entry
				q = p->next;
				mp_free(tv->mp, p);
				p = q;
			} else {
				tv->aux[i++] = p;
				p = p->next;
			}
		}
		tv->aux[i] = tv->tail; // add a proper tail for the loop below
		tv->n_nodes = i;
		if (tv->n_nodes) {
			ks_introsort(node, tv->n_nodes, tv->aux);
			for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1];
			tv->head = tv->aux[0];
		} else tv->head = tv->tail;
	}
	// clean up
	tv->max_level = max_level;
	memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4);
	// squeeze out terminated levels
	for (i = l = 0; i < n; ++i) {
		const bam_pileup1_t *p = pl + i;
		if (!p->is_tail)
			tv->pre_level[l++] = tv->pre_level[i];
	}
	tv->n_pre = l;
/*
	fprintf(stderr, "%d\t", pos+1);
	for (i = 0; i < n; ++i) {
		const bam_pileup1_t *p = pl + i;
		if (p->is_head) fprintf(stderr, "^");
		if (p->is_tail) fprintf(stderr, "$");
		fprintf(stderr, "%d,", p->level);
	}
	fprintf(stderr, "\n");
*/
	return 0;
}
Beispiel #11
0
// Compute genotype scores for one set of observed bases.
//
//   em     error model; em->coef->fk, ->beta and ->lhet are read
//   n      number of entries in bases[]; above 255 the array is shuffled and
//          only the first 255 entries are used
//   m      number of alleles; q is an m*m matrix
//   bases  observations packed as qual:6, strand:1, base:4 (reordered in place)
//   q      output: q[g1*m+g2] is the score of genotype (g1,g2); symmetric,
//          never negative
//
// Always returns 0. q is zeroed even when n == 0 so that callers never see
// stale values.
int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
{
  //reset results array before any early return
  memset(q, 0, m * m * sizeof(float));
  if (n == 0)
    return 0; //no data

  //per strand+base counts
  int w[32];
  memset(w, 0, 32 * sizeof(int));

  call_aux_t aux;
  memset(&aux, 0, sizeof(call_aux_t));

  //sample 255 if depth >255
  if (n > 255) {
    ks_shuffle(uint16_t, n, bases);
    n = 255;
  }
  ks_introsort(uint16_t, n, bases);

  for (int j = n - 1; j >= 0; --j) {
    uint16_t b = bases[j];
    //cap quality at [4,63]; named qual so the output array q is not shadowed
    int qual = b>>5 < 4? 4 : b>>5;
    if (qual > 63)
      qual = 63;

    int k = b&0x1f; //strand and base; k&0xf is the base alone

    aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[qual<<16|n<<8|aux.c[k&0xf]];
    ++aux.c[k&0xf];
    ++w[k];
  }

  // generate likelihood; the matrix stride is m, matching the memset above
  for (int g1 = 0; g1 < m; ++g1) {
    for (int g2 = g1; g2 < m; ++g2) {
      if (g1 == g2) {
        //homozygous: every other allele counts as error
        for (int k = 0; k < m; ++k) {
          if (k != g1)
            q[g1*m+g2] += aux.bsum[k];
        }
      } else {
        //heterozygous: binomial term for g1/g2 plus the error of all other alleles
        int other = 0;
        float tmp1 = 0;
        for (int k = 0; k < m; ++k)
          if (k != g1 && k != g2) {
            tmp1 += aux.bsum[k];
            other = 1;
          }
        int cjk = aux.c[g1] + aux.c[g2];
        if (other)
          q[g1*m+g2] = q[g2*m+g1] = -4.343 * em->coef->lhet[cjk<<8|aux.c[g2]] + tmp1;
        else
          q[g1*m+g2] = q[g2*m+g1] = -4.343 * em->coef->lhet[cjk<<8|aux.c[g2]]; // all the bases are either g1 or g2
      }
      //set to zero if negative; clamp the mirrored cell too so q stays symmetric
      if (q[g1*m+g2] < 0.0)
        q[g1*m+g2] = q[g2*m+g1] = 0.0;
    }
  }

  return 0;
}
Beispiel #12
0
/*
 * Remove hits that belong to weak chains shadowed by a much stronger chain.
 *
 * The hits of both entries of b[] are turned into seeds and chained per entry
 * with chaining(). Query coordinates of the chains of b[1] are mirrored with
 * respect to len, all chains are sorted together, and a chain is flagged when
 * an earlier, unflagged chain ends at or after it on the query and holds more
 * than 2 * opt->t_seeds times as many seeds. Every hit belonging to a flagged
 * chain gets G = 0, and finally all hits with G == 0 are squeezed out of
 * b[0] and b[1] (b[k]->n is updated).
 */
void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2])
{
	hsaip_t *seeds[2], *chains[2];
	int n_hits[2], n_chains[2];
	int s, i, c, total_hits, total_chains;
	char *dropped;

	// one seed per hit, both entries stored back to back
	n_hits[0] = b[0]->n;
	n_hits[1] = b[1]->n;
	total_hits = n_hits[0] + n_hits[1];
	seeds[0] = (hsaip_t*)calloc(total_hits, sizeof(hsaip_t));
	seeds[1] = seeds[0] + n_hits[0];
	chains[0] = (hsaip_t*)calloc(total_hits, sizeof(hsaip_t));
	for (s = 0; s < 2; ++s) {
		for (i = 0; i < b[s]->n; ++i) {
			const bsw2hit_t *hit = &b[s]->hits[i];
			hsaip_t *seed = &seeds[s][i];
			seed->flag = s;
			seed->idx = i;
			seed->tbeg = hit->k;
			seed->tend = hit->k + hit->len;
			seed->chain = -1;
			seed->qbeg = hit->beg;
			seed->qend = hit->end;
		}
	}

	// chain each entry separately; chain ids of b[1] continue after those of b[0]
	n_chains[0] = chaining(opt, 0, n_hits[0], seeds[0], chains[0]);
	chains[1] = chains[0] + n_chains[0];
	n_chains[1] = chaining(opt, n_chains[0], n_hits[1], seeds[1], chains[1]);
	total_chains = n_chains[0] + n_chains[1];

	// change query coordinate on the reverse strand
	for (c = 0; c < n_chains[1]; ++c) {
		hsaip_t *rc = &chains[1][c];
		int old_beg = rc->qbeg;
		rc->qbeg = len - rc->qend;
		rc->qend = len - old_beg;
	}

	// flag chains dominated by an earlier, unflagged chain
	dropped = (char*)calloc(total_chains, 1);
	ks_introsort(hsaip, total_chains, chains[0]);
	for (c = 1; c < total_chains; ++c) {
		const hsaip_t *cur = &chains[0][c];
		for (i = 0; i < c; ++i) {
			const hsaip_t *prev = &chains[0][i];
			if (!dropped[prev->idx] && prev->qend >= cur->qend && prev->chain > cur->chain * opt->t_seeds * 2) {
				dropped[cur->idx] = 1;
				break;
			}
		}
	}

	// zero the score of every hit whose chain was flagged
	for (i = 0; i < total_hits; ++i) {
		const hsaip_t *seed = &seeds[0][i];
		if (dropped[seed->chain])
			b[seed->flag]->hits[seed->idx].G = 0;
	}
	free(dropped);

	// squeeze out filtered elements in b[2]
	for (s = 0; s < 2; ++s) {
		int kept = 0;
		for (i = 0; i < n_hits[s]; ++i) {
			bsw2hit_t *hit = &b[s]->hits[i];
			if (!hit->G) continue;
			if (kept != i) b[s]->hits[kept] = *hit;
			++kept;
		}
		b[s]->n = kept;
	}

	free(seeds[0]);
	free(chains[0]);
}
Beispiel #13
0
/*
 * Build a string tree for str in which dictionary phrases are replaced by
 * their alternatives.
 *
 * str is tokenized (whitespace kept). Phrases are looked up once for each
 * language in options.languages and once for ALL_LANGUAGES, collected with
 * the language they were found under, and sorted. The tokens are then walked
 * in order: tokens outside any phrase are added to the tree as they are
 * (whitespace tokens as a single " "), and each phrase is either expanded
 * through the address dictionary or copied token by token, depending on
 * whether its components intersect options.address_components.
 *
 * Returns NULL when tokenization fails; otherwise a newly created tree that
 * the caller owns. When no phrase is found at all the tree holds str as its
 * only token.
 */
string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
    char_array *key = NULL;

    log_debug("input=%s\n", str);
    token_array *tokens = tokenize_keep_whitespace(str);

    if (tokens == NULL) {
        return NULL;
    }

    size_t len = strlen(str);

    log_debug("tokenized, num tokens=%zu\n", tokens->n);

    phrase_language_array *phrases = NULL;
    phrase_array *lang_phrases = NULL;

    // Collect the phrases found for each requested language
    for (int i = 0; i < options.num_languages; i++)  {
        char *lang = options.languages[i];
        log_debug("lang=%s\n", lang);
        lang_phrases = search_address_dictionaries_tokens(str, tokens, lang);
        
        if (lang_phrases == NULL) { 
            log_debug("lang_phrases NULL\n");
            continue;
        }

        log_debug("lang_phrases->n = %zu\n", lang_phrases->n);

        // phrases is created lazily, on the first language that yields results
        phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);

        for (int j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){lang, p});
        }

        phrase_array_destroy(lang_phrases);
    }


    // Same lookup once more under the ALL_LANGUAGES namespace
    lang_phrases = search_address_dictionaries_tokens(str, tokens, ALL_LANGUAGES);
    if (lang_phrases != NULL) {
        phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);

        for (int j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p});
        }
        phrase_array_destroy(lang_phrases);
    }

    string_tree_t *tree = string_tree_new_size(len);

    if (phrases != NULL) {
        log_debug("phrases not NULL, n=%zu\n", phrases->n);
        ks_introsort(phrase_language_array, phrases->n, phrases->a);

        phrase_language_t phrase_lang;

        // [start, end) is the run of tokens between the previous phrase and the current one
        int start = 0;
        int end = 0;

        key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN);

        for (int i = 0; i < phrases->n; i++) {
            phrase_lang = phrases->a[i];
            char_array_clear(key);

            // The dictionary key starts with "<language><separator>"
            char_array_cat(key, phrase_lang.language);
            char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);

            size_t namespace_len = key->n;

            phrase_t phrase = phrase_lang.phrase;

            end = phrase.start;

            // Emit the tokens that precede this phrase unchanged
            // NOTE(review): if phrases overlap, start can already be past phrase.start
            // and this loop is skipped -- confirm how overlaps are meant to be handled
            for (int j = start; j < end; j++) {
                token_t token = tokens->a[j]; 
                if (token.type != WHITESPACE) {
                    log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);

                    string_tree_add_string_len(tree, str + token.offset, token.len);
                } else {
                    log_debug("Adding space\n");
                    string_tree_add_string(tree, " ");
                }
                string_tree_finalize_token(tree);       
            }

            expansion_value_t value;
            value.value = phrase.data;

            token_t token;

            if (value.components & options.address_components) {
                // Rebuild the key as namespace prefix + the phrase text
                key->n = namespace_len;
                for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
                    token = tokens->a[j];
                    if (token.type != WHITESPACE) {
                        char_array_cat_len(key, str + token.offset, token.len);
                    } else {
                        char_array_cat(key, " ");
                    }
                }

                char *key_str = char_array_get_string(key);
                log_debug("key_str=%s\n", key_str);
                address_expansion_array *expansions = address_dictionary_get_expansions(key_str);

                // NOTE(review): when expansions is NULL nothing is added for this phrase's tokens
                if (expansions != NULL) {
                    for (int j = 0; j < expansions->n; j++) {
                        address_expansion_t expansion = expansions->a[j];
                        if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
                            char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
                            // NOTE(review): tokens->n - 1 wraps around if tokens->n is 0 and unsigned -- confirm its type
                            if (phrase.start + phrase.len < tokens->n - 1) {
                                token_t next_token = tokens->a[phrase.start + phrase.len];
                                if (!is_numeric_token(next_token.type)) {
                                    string_tree_add_string(tree, canonical);
                                } else {
                                    // A numeric token follows: append the canonical form and a " " as one entry
                                    // start_index is not read afterwards in this function
                                    uint32_t start_index = cstring_array_start_token(tree->strings);
                                    cstring_array_append_string(tree->strings, canonical);
                                    cstring_array_append_string(tree->strings, " ");
                                    cstring_array_terminate(tree->strings);
                                }
                            } else {
                                string_tree_add_string(tree, canonical);

                            }
                        } else {
                            // No canonical form: add the phrase's own tokens
                            for (int k = phrase.start; k < phrase.start + phrase.len; k++) {
                                token = tokens->a[k];
                                if (token.type != WHITESPACE) {
                                    string_tree_add_string_len(tree, str + token.offset, token.len);
                                } else {
                                    string_tree_add_string(tree, " ");
                                }
                            }

                        }
                    }

                    // All expansions added above belong to one tree token
                    string_tree_finalize_token(tree);

                }
            } else {
                // Phrase does not apply to the requested components: copy its tokens, one tree token each
                for (int j = phrase.start; j < phrase.start + phrase.len; j++) {
                    token = tokens->a[j];
                    if (token.type != WHITESPACE) {
                        log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
                        string_tree_add_string_len(tree, str + token.offset, token.len);
                    } else {
                        string_tree_add_string(tree, " ");
                    }
                    string_tree_finalize_token(tree);

                }
            }

            start = phrase.start + phrase.len;

        }

        char_array_destroy(key);

        end = (int)tokens->n;

        // Emit whatever tokens follow the last phrase
        for (int j = start; j < end; j++) {
            token_t token = tokens->a[j]; 
            if (token.type != WHITESPACE) {
                log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);

                string_tree_add_string_len(tree, str + token.offset, token.len);
            } else {
                log_debug("Adding space\n");
                string_tree_add_string(tree, " ");
            }
            string_tree_finalize_token(tree);       
        }


    } else {
        // No phrase found anywhere: the whole input is the single token
        string_tree_add_string(tree, str);
        string_tree_finalize_token(tree);
    }

    if (phrases != NULL) {
        phrase_language_array_destroy(phrases);
    }

    token_array_destroy(tokens);

    return tree;
}
Beispiel #14
0
/*
 * naivepca: read a genotype matrix and print, for every sample, its
 * coordinates on the eigenvectors of the sample covariance matrix.
 *
 * Input (a file name or "-" for stdin, optionally gzip-compressed): one
 * sample per line, a name followed by blanks and a string with one character
 * per site; '0'..'9' is the allele count, anything else is missing.
 * Option -m sets the minimum allele frequency; sites below it contribute 0.
 *
 * Exit status: 0 on success, 1 on usage error, 2 if the input cannot be
 * opened.
 */
int main(int argc, char *argv[])
{
	int c, dret, lineno = 0, n_rows = 0, m_rows = 0, n_cols = 0, max_hap = 0;
	int64_t n_missing = 0, n_tot = 0;
	gzFile fp;
	kstream_t *ks;
	kstring_t str = {0,0,0};
	int8_t **C = 0;
	double **M, *X, min_maf = 0.0;
	char **names = 0;

	// Invalid operations and divisions by zero raise a floating-point
	// exception from here on, so every division below must be guarded.
	_MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~(_MM_MASK_INVALID | _MM_MASK_DIV_ZERO));
	while ((c = getopt(argc, argv, "m:")) >= 0) {
		if (c == 'm') min_maf = atof(optarg);
	}
	if (argc - optind == 0) {
		fprintf(stderr, "Usage: naivepca [-m min_maf] <in.txt>\n");
		return 1;
	}

	fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
	if (fp == 0) {
		fprintf(stderr, "[E::%s] failed to open file '%s'. Abort.\n", __func__, argv[optind]);
		return 2;
	}
	ks = ks_init(fp);

	// read the matrix into C
	while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
		int8_t *q;
		char *p, *name = str.s;
		int i;
		++lineno;
		// split the line into the sample name and the genotype string
		for (p = str.s; *p && *p != '\t' && *p != ' '; ++p);
		if (*p) {
			*p++ = 0;
			for (; *p && (*p == '\t' || *p == ' '); ++p);
		}
		if (*p == 0) {
			fprintf(stderr, "[W::%s] line %d has one field; skipped.\n", __func__, lineno);
			continue;
		}
		// the first accepted line fixes the number of sites
		if (n_cols != 0) {
			if (n_cols != str.s + str.l - p) {
				fprintf(stderr, "[W::%s] line %d has a different number of columns; skipped.\n", __func__, lineno);
				continue;
			}
		} else n_cols = str.s + str.l - p;
		if (n_rows == m_rows) {
			m_rows = m_rows? m_rows<<1 : 16;
			C = (int8_t**)realloc(C, m_rows * sizeof(int8_t*));
			names = (char**)realloc(names, m_rows * sizeof(char*));
		}
		names[n_rows] = strdup(name);
		// one int8_t per site (was allocated with sizeof(double), 8x too large)
		q = C[n_rows++] = (int8_t*)calloc(n_cols, sizeof(int8_t));
		for (i = 0; i < n_cols; ++i) {
			if (p[i] >= '0' && p[i] <= '9') q[i] = p[i] - '0';
			else q[i] = -1, ++n_missing;
			max_hap = max_hap > q[i]? max_hap : q[i];
		}
		n_tot += n_cols;
	}
	free(str.s);
	fprintf(stderr, "[M::%s] read %d samples and %d sites; ploidy is %d\n", __func__, n_rows, n_cols, max_hap);
	// report a real percentage, and avoid 0/0 when nothing was read
	fprintf(stderr, "[M::%s] %.3f%% of genotypes are missing\n", __func__, n_tot > 0? 100. * (double)n_missing / n_tot : 0.);

	{ // normalize the matrix into M
		int i, j, *sum, *cnt, n_dropped = 0;
		double *mu, *pp;
		sum = (int*)calloc(n_cols, sizeof(int));
		cnt = (int*)calloc(n_cols, sizeof(int));
		mu = (double*)calloc(n_cols, sizeof(double));
		pp = (double*)calloc(n_cols, sizeof(double));
		for (i = 0; i < n_rows; ++i) {
			int8_t *q = C[i];
			for (j = 0; j < n_cols; ++j)
				if (q[j] >= 0) sum[j] += q[j], ++cnt[j];
		}
		for (j = 0; j < n_cols; ++j) {
			if (cnt[j] > 0) {
				mu[j] = (double)sum[j] / cnt[j];
				// max_hap is 0 when no non-zero genotype was seen; treat the
				// frequency as 0 instead of dividing by zero
				pp[j] = max_hap > 0? mu[j] / max_hap : 0.;
				if (pp[j] < min_maf || 1. - pp[j] < min_maf) ++n_dropped;
			} else ++n_dropped;
		}
		fprintf(stderr, "[M::%s] %d rare sites are dropped\n", __func__, n_dropped);
		M = (double**)calloc(n_rows, sizeof(double*));
		for (i = 0; i < n_rows; ++i) {
			int8_t *q = C[i];
			double *r;
			r = M[i] = (double*)calloc(n_cols, sizeof(double));
			// missing genotypes, rare sites and monomorphic sites contribute 0
			for (j = 0; j < n_cols; ++j)
				r[j] = q[j] < 0 || pp[j] < min_maf || 1. - pp[j] < min_maf || pp[j] == 0. || 1 - pp[j] == 0. ? 0. : (q[j] - mu[j]) / sqrt(pp[j] * (1. - pp[j]));
		}
		free(sum); free(cnt); free(mu); free(pp);
		for (i = 0; i < n_rows; ++i) free(C[i]);
		free(C);
	}

	{ // multiplication: X = M * M^T / n_cols (symmetric, so only j <= i is computed)
		int i, j, k;
		X = (double*)calloc(n_rows * n_rows, sizeof(double));
		for (i = 0; i < n_rows; ++i) {
			double *zi = M[i];
			for (j = 0; j <= i; ++j) {
				double t = 0., *zj = M[j];
				for (k = 0; k < n_cols; ++k)
					t += zi[k] * zj[k];
				X[i*n_rows + j] = X[j*n_rows + i] = t / n_cols;
			}
		}
		for (i = 0; i < n_rows; ++i) free(M[i]);
		free(M);
	}

	{ // print eigenvectors
		double *ev;
		int i, j;
		evsrt_t *evsrt;
		ev = (double*)calloc(n_rows, sizeof(double));
		evsrt = (evsrt_t*)calloc(n_rows, sizeof(evsrt_t));
		n_eigen_symm(X, n_rows, ev);
		// sort the eigenvalues while remembering their original column in X
		for (i = 0; i < n_rows; ++i)
			evsrt[i].ev = ev[i], evsrt[i].i = i;
		ks_introsort(ev, n_rows, evsrt);
		for (i = 0; i < n_rows; ++i) {
			printf("%s", names[i]);
			for (j = 0; j < n_rows; ++j)
				printf("\t%.6f", X[i*n_rows + evsrt[j].i] * evsrt[j].ev);
			putchar('\n');
			free(names[i]);
		}
		free(ev); free(evsrt);
		free(X); free(names);
	}
	
	ks_destroy(ks);
	gzclose(fp);
	return 0;
}
Beispiel #15
0
/*
 * Select alignment regions of a single read that pair up well on the query.
 *
 * Every pair (i, j), i < j, of regions in a is scored from the layout of the
 * two query intervals [qb, qe): TP/FN/TN/FP counts are derived from the
 * interval end points and l_seq, turned into sens and spec, and combined into
 * FMEAS = 2*spec*sens/(spec+sens). Pairs where one interval contains the
 * other are skipped; pairs with FMEAS > 0.95 are kept and sorted with the
 * ff_mem_flt ordering.
 *
 * The result always starts with a.a[0]; which further regions are appended
 * depends on mode (see the two branches below).
 * Returns an empty vector when n == 0.
 *
 * NOTE(review): a.a[0] is read whenever n != 0; this assumes a.n > 0 in that
 * case -- confirm against the callers.
 */
mem_alnreg_v mem_fmeas_fliter_se(mem_alnreg_v a , int n , int l_seq , int mode)
{
	mem_alnreg_v  aa  ;
	int i , j ;
	kvec_t(FF_t)  k_ff_t ;
	kv_init(k_ff_t);
	kv_init(aa);
	//   calculate the FMEAS value of every pair of regions
	if(n == 0) return aa ;
	for( i = 0 ;  i <  a.n ; i++){
		mem_alnreg_t  *p_ar =  a.a + i ;
		for( j = i + 1 ; j < a.n ; j++){
			FF_t  tmp ;
			mem_alnreg_t  *q_ar =  a.a + j ;
			double  sens  ,  spec ;
			int FN =  0 , TP = 0 ,TN = 0 , FP = 0 ;
			int A,B,C,D;
			// order the two intervals: [A,B] starts first (or, on a tie, ends last), [C,D] is the other
			if( p_ar->qb < q_ar->qb || (p_ar->qb  ==  q_ar->qb &&  p_ar->qe >=  q_ar->qe)){ //   p  q
				A =  p_ar->qb ;
				B =  p_ar->qe - 1 ;
				C =  q_ar->qb ;
				D =  q_ar->qe - 1 ;
			}else {  //   p   q  
				A =  q_ar->qb ;
				B =  q_ar->qe - 1;
				C =  p_ar->qb ;
				D =  p_ar->qe - 1;
			}
			if(B < C){ // disjoint intervals
				TP = B - A + D - C + 2 ;
				FN = l_seq - D - 1  + A  + C - B - 1 ; 
				TN = l_seq ;
				FP = 0 ;
			}else if( D <= B){ // contain
				continue ;
			}else{ // partial overlap: the overlapping part counts as FP
				TP = D - A + 1 ;
				FN = l_seq - D - 1 + A  ;
				FP = B - C + 1 ;
				TN = l_seq - FP;

			}
			// NOTE(review): no guard against TP+FN, TN+FP or spec+sens being 0
			sens = (double)TP/(double)(TP+FN);
			spec = (double)TN/(double)(TN+FP);
			tmp.FMEAS =  (2*spec*sens)/(spec+sens);
			tmp.score =  p_ar->score + q_ar->score;
			tmp.x =  i  , tmp.y = j ;
			if(tmp.FMEAS > 0.95) kv_push(FF_t,k_ff_t,tmp);
		}
	}
	ks_introsort(ff_mem_flt, k_ff_t.n, k_ff_t.a);	
	kv_push(mem_alnreg_t,aa,a.a[0]);
	double max_feas ;
//	int   score ;
	// no pair qualified: the result is a.a[0] alone
	if( k_ff_t.n == 0 ) return aa;
	
	// FMEAS of the first pair after sorting; pairs are compared against it below
	max_feas = k_ff_t.a[0].FMEAS ;
//	score =  k_ff_t.a[0].score ;
	if(mode){
		int cnt = 0 ;
		// first pass: every pair that involves region 0, whatever its FMEAS
		for( i = 0 ;  i <  kv_size(k_ff_t) ; i++){
			FF_t  p  = kv_A(k_ff_t,i);
			if(p.x == 0 && cnt == 0){
				// region 0 is already in aa, so only its partner is added
				kv_push(mem_alnreg_t,aa,a.a[p.y]);
				cnt = 1 ;
			}else if(p.x == 0){
				kv_push(mem_alnreg_t,aa,a.a[0]);
				kv_push(mem_alnreg_t,aa,a.a[p.y]);

			}


		}

		// second pass: leading pairs whose FMEAS equals max_feas and that do not involve region 0
		for( i = 0 ;  i  < kv_size(k_ff_t); i++){
			FF_t  p  = kv_A(k_ff_t,i);
			if(max_feas != p.FMEAS )  break;
			if(p.x == 0) continue ;
			kv_push(mem_alnreg_t,aa,a.a[p.x]);
			kv_push(mem_alnreg_t,aa,a.a[p.y]);
		}
	}else{
		// NOTE(review): cnt is never changed in this branch, so the `else if` below cannot be reached
		int cnt = 0 ;
		// only the leading pairs whose FMEAS equals max_feas are used
		for( i = 0 ;  i  < kv_size(k_ff_t); i++){
			FF_t  p  = kv_A(k_ff_t,i);
			if(max_feas != p.FMEAS )  break;
			if(p.x == 0 && cnt == 0){
				kv_push(mem_alnreg_t,aa,a.a[p.y]);
				continue ;
			}else if( p.x == 0 ){
				kv_push(mem_alnreg_t,aa,a.a[0]);
				kv_push(mem_alnreg_t,aa,a.a[p.y]);
				continue ;
			}
			kv_push(mem_alnreg_t,aa,a.a[p.x]);
			kv_push(mem_alnreg_t,aa,a.a[p.y]);
		}

	}
	kv_destroy(k_ff_t);

#if 0
	for( i = 0 ;  i < kv_size(aa); i++){
		mem_alnreg_t  *q = aa.a + i;
		printf("%db: %d  %de:%d \t" , i, q->qb , i, q->qe);
		if( i == kv_size(aa) -1 )  printf("\n");
	}
#endif


	return  aa ; 
}