pool *pet_cvg(const char *pet_fn, const ass_opt *opt) { bwa_seq_t *pets, *query, *p, *p2; int i = 0, j = 0, k = 0; index64 mate_i = 0; pool *good_pets = new_pool(), *repeat_pets = new_pool(); alignarray *align, *align_2; alg *a; hash_table *ht; ht = pe_load_hash(pet_fn); pets = ht->seqs; fprintf(stderr, "[pe_cvg] Converging RNA-PETs... \n"); // for (i = n_pets - 1; i >= 0; i -= 2) { for (i = 0; i < ht->n_seqs; i += 2) { p = &pets[i]; p2 = &pets[i + 1]; if (binary_exists(repeat_pets->reads, p) || binary_exists(good_pets->reads, p)) continue; for (k = p->len - opt->ol; k >= 0; k--) { query = new_seq(p, opt->ol, k); // p_query(query); pe_aln_query(query, query->seq, ht, opt->nm + 2, opt->ol, 0, align); pool_sort_ins(good_pets, p); // p_align(align); query = new_seq(p2, opt->ol, k); // p_query(query); pe_aln_query(query, query->seq, ht, opt->nm + 2, opt->ol, 0, align_2); pool_sort_ins(good_pets, p2); // p_align(align_2); for (j = 0; j < align->len; j++) { a = g_ptr_array_index(align, j); // The aligned seq is the query itself if (a->r_id == atoll(p->name)) continue; mate_i = get_mate_index(a->r_id); // If the right mate is also aligned if (!aligned(align_2, mate_i)) continue; pool_sort_ins(repeat_pets, &pets[a->r_id]); pool_sort_ins(repeat_pets, &pets[mate_i]); } } // p_pool("Good Pets: ", good_pets); // p_pool("Repeat Pets: ", repeat_pets); } fprintf(stderr, "[pet_cvg] Converged to %zd RNA-PETs... \n", (good_pets->n)); fprintf(stderr, "[pet_cvg] ------------------------------ \n"); // p_pool("Good Pets: ", good_pets); return good_pets; }
static void *correct_thread(void *data) { correct_aux_t *d = (correct_aux_t*) data; int i = 0; bwa_seq_t *s = NULL, *query = NULL, *seqs = d->ht->seqs; readarray *low_kmer_reads = d->low_kmer_reads; alignarray *aligns = NULL; aligns = g_ptr_array_sized_new(N_DEFAULT_ALIGNS); for (i = d->start; i < d->end; i++) { if (i % 10000 == 0) show_msg(__func__, "Thread %d correction progress: [%d,%d,%d]... \n", d->tid, d->start, i, d->end); s = g_ptr_array_index(low_kmer_reads, i); if (is_repetitive_q(s)) { s->status = USED; continue; } // Only the fresh reads, or the reads tried once would be corrected. if (s->status != FRESH) continue; query = new_seq(s, s->len - 8, 0); pe_aln_query(s, s->seq, d->ht, MISMATCHES, s->len, 0, aligns); pe_aln_query(s, s->rseq, d->ht, MISMATCHES, s->len, 1, aligns); if (aligns->len >= 4) correct_bases(seqs, s, aligns, d->tid); s->status = TRIED; reset_alg(aligns); bwa_free_read_seq(1, query); //if (i > 10000) // break; } free_alg(aligns); show_msg(__func__, "Thread %d finished. \n", d->tid); }
// Reads the file named by `filename` and wraps its bytes in an object.
//
// Returns the `nothing` symbol when file_read() reports a size of -1.
// Otherwise returns a `just`-tagged sequence holding one integer object per
// byte (each byte taken as uint8); a size of 0 yields a tagged empty sequence.
// The environment argument is unused.
OBJ FileRead_P(OBJ filename, generated::ENV &) {
  char *path = obj_to_str(filename);
  int size;  // set by file_read() through its second argument
  char *bytes = file_read(path, size);
  // The path string is only needed for the read itself.
  delete_byte_array(path, strlen(path) + 1);

  if (size == -1)
    return make_symb(symb_idx_nothing);

  OBJ content = make_empty_seq();
  if (size <= 0)
    return make_tag_obj(symb_idx_just, content);

  SEQ_OBJ *seq = new_seq(size);
  for (uint32 idx = 0; idx < size; ++idx) {
    const uint8 byte = static_cast<uint8>(bytes[idx]);
    seq->buffer[idx] = make_int(byte);
  }
  // The raw buffer has been copied element by element; release it.
  delete_byte_array(bytes, size);

  return make_tag_obj(symb_idx_just, make_seq(seq, size));
}
void Caller::create_node_calls(const NodePileup& np) { int n = _node->sequence().length(); const string& seq = _node->sequence(); int cur = 0; int cat = call_cat(_node_calls[cur]); NodePair prev_nodes(-1, -1); // scan contiguous chunks of a node with same call // (note: snps will always be 1-base -- never merged) for (int next = 1; next <= n; ++next) { int next_cat = next == n ? -1 : call_cat(_node_calls[next]); if (cat == 2 || cat != next_cat) { NodePair new_nodes(-1, -1); bool secondary_snp = false; // process first genotype if it's not missing if (_node_calls[cur].first == '.') { // add single node for stretch of reference node string new_seq = seq.substr(cur, next - cur); new_nodes.first = ++_max_id; _call_graph.create_node(new_seq, new_nodes.first); } else if (_node_calls[cur].first != '-') { // add snp node assert(next - cur == 1); string new_seq(1, _node_calls[cur].first); new_nodes.first = ++_max_id; _call_graph.create_node(new_seq, new_nodes.first); create_snp_path(new_nodes.first, secondary_snp); secondary_snp = true; } // process second genotype if difference from first if (_node_calls[cur].second != _node_calls[cur].first) { if (_node_calls[cur].second == '.') { // add single node for stretch of reference node string new_seq = seq.substr(cur, next - cur); new_nodes.second = ++_max_id; _call_graph.create_node(new_seq, new_nodes.second); } else if (_node_calls[cur].second != '-') { // add snp node assert(next - cur == 1); string new_seq(1, _node_calls[cur].second); new_nodes.second = ++_max_id; _call_graph.create_node(new_seq, new_nodes.second); create_snp_path(new_nodes.second, secondary_snp); } } // update maps if new node abuts end of original node // so that edges can be updated later on: if (new_nodes.first != -1 || new_nodes.second != -1) { if (cur == 0) { _start_node_map[_node->id()] = new_nodes; } if (next == n) { _end_node_map[_node->id()] = new_nodes; } } // add edges if (prev_nodes.first != -1 && new_nodes.first != -1) { 
_call_graph.create_edge(prev_nodes.first, new_nodes.first); } if (prev_nodes.first != -1 && new_nodes.second != -1) { _call_graph.create_edge(prev_nodes.first, new_nodes.second); } if (prev_nodes.second != -1 && new_nodes.first != -1) { _call_graph.create_edge(prev_nodes.second, new_nodes.first); } if (prev_nodes.second != -1 && new_nodes.second != -1) { _call_graph.create_edge(prev_nodes.second, new_nodes.second); } // shift right cur = next; cat = next_cat; prev_nodes = new_nodes; } } }