// Returns 1 if a read is a substring of ANY read in the list or a complete // match with a read before it in the list. Returns <= 0 otherwise. // 1 => is substr // 0 => not substr // -1 => not enough bases of ACGT static int _is_substr(const ReadBuffer *rbuf, size_t idx, const KOGraph *kograph, const dBGraph *db_graph) { const size_t kmer_size = db_graph->kmer_size; const read_t *r = &rbuf->b[idx], *r2; size_t contig_start; contig_start = seq_contig_start(r, 0, kmer_size, 0, 0); if(contig_start >= r->seq.end) return -1; // No kmers in this sequence dBNode node = db_graph_find_str(db_graph, r->seq.b+contig_start); ctx_assert(node.key != HASH_NOT_FOUND); // expect at least one hit (for this read!) ctx_assert(kograph_occurs(kograph, node.key)); KOccur *hit; for(hit = kograph_get(kograph, node.key); 1; hit++) { if(hit->chrom != idx) { r2 = &rbuf->b[hit->chrom]; // A read is a duplicate (i.e. return 1) if it is a substring of ANY // read in the list or a complete match with a read before it in the list. // That is why we have: (hit->chrom < idx || r->seq.end < r2->seq.end) // since identical strings have equal length if(hit->chrom < idx || r->seq.end < r2->seq.end) { if(hit->orient == node.orient) { // potential FORWARD match if(hit->offset >= contig_start && hit->offset + r->seq.end <= r2->seq.end && strncasecmp(r->seq.b, r2->seq.b+hit->offset-contig_start, r->seq.end) == 0) { return 1; } } else { // potential REVERSE match // if read is '<NNNN>[kmer]<rem>' rX_rem is the number of chars after // the first valid kmer size_t r1_rem = r->seq.end - (contig_start + kmer_size); size_t r2_rem = r2->seq.end - (hit->offset + kmer_size); if(r1_rem <= hit->offset && r2_rem >= contig_start && dna_revncasecmp(r->seq.b, r2->seq.b+hit->offset-r1_rem, r->seq.end) == 0) { return 1; } } } } if(!hit->next) break; } return 0; }
// Report whether at least one kmer of read `r` is present in `db_graph`.
//
// Contigs of the read are scanned in order (as delimited by
// seq_contig_start() / seq_contig_end() with both cutoffs set to 0) and the
// search stops at the first kmer found in the graph.
//
// `stats` is updated with atomic adds:
//   total_bases_loaded += length of every contig that was examined
//   total_bases_read   += length of the read
//   num_kmers_loaded   += number of kmers looked up
//   num_kmers_novel    += number of kmers looked up, minus one if a hit occurred
//   num_good_reads / num_bad_reads += 1 depending on whether any contig was seen
//
// Returns true if a kmer from the read was found in the graph.
static bool read_touches_graph(const read_t *r, const dBGraph *db_graph,
                               LoadingStats *stats)
{
  const size_t kmer_size = db_graph->kmer_size;
  const size_t read_len = r->seq.end;

  size_t contigs_seen = 0, lookups = 0;
  size_t resume_at = 0, begin, stop, pos;
  bool hit = false;

  BinaryKmer bkmer;
  Nucleotide nuc;
  dBNode node;

  // Reads shorter than a kmer cannot contain any kmers
  if(read_len >= kmer_size)
  {
    for(;;)
    {
      begin = seq_contig_start(r, resume_at, kmer_size, 0, 0);
      if(begin >= read_len || hit) break;

      stop = seq_contig_end(r, begin, kmer_size, 0, 0, &resume_at);

      __sync_fetch_and_add((volatile size_t*)&stats->total_bases_loaded,
                           stop - begin);
      contigs_seen++;

      // First kmer of the contig is built directly from the string
      bkmer = binary_kmer_from_str(r->seq.b + begin, kmer_size);
      lookups++;
      node = db_graph_find(db_graph, bkmer);

      if(node.key != HASH_NOT_FOUND) {
        hit = true;
        break;
      }

      // Remaining kmers: shift in one base at a time until a hit or contig end
      pos = begin + kmer_size;
      while(pos < stop && !hit)
      {
        nuc = dna_char_to_nuc(r->seq.b[pos]);
        bkmer = binary_kmer_left_shift_add(bkmer, kmer_size, nuc);
        lookups++;
        node = db_graph_find(db_graph, bkmer);
        hit = (node.key != HASH_NOT_FOUND);
        pos++;
      }
    }
  }

  // Update stats
  __sync_fetch_and_add((volatile size_t*)&stats->total_bases_read, read_len);
  __sync_fetch_and_add((volatile size_t*)&stats->num_kmers_loaded, lookups);
  __sync_fetch_and_add((volatile size_t*)&stats->num_kmers_novel, lookups - hit);
  __sync_fetch_and_add((volatile size_t*)&stats->num_good_reads, contigs_seen != 0);
  __sync_fetch_and_add((volatile size_t*)&stats->num_bad_reads, contigs_seen == 0);

  return hit;
}
// if colour is -1 aligns to all colours, otherwise aligns to given colour only // Returns number of kmers lost from the end static size_t db_alignment_from_read(dBAlignment *aln, const read_t *r, uint8_t qcutoff, uint8_t hp_cutoff, const dBGraph *db_graph, int colour) { size_t contig_start, contig_end = 0, search_start = 0; const size_t kmer_size = db_graph->kmer_size; BinaryKmer bkmer, tmp_key; Nucleotide nuc; hkey_t node; size_t i, offset, nxtbse; dBNodeBuffer *nodes = &aln->nodes; Int32Buffer *rpos = &aln->rpos; ctx_assert(nodes->len == rpos->len); size_t n = nodes->len, init_len = n; db_node_buf_capacity(nodes, n + r->seq.end); int32_buf_capacity(rpos, n + r->seq.end); while((contig_start = seq_contig_start(r, search_start, kmer_size, qcutoff, hp_cutoff)) < r->seq.end) { contig_end = seq_contig_end(r, contig_start, kmer_size, qcutoff, hp_cutoff, &search_start); const char *contig = r->seq.b + contig_start; size_t contig_len = contig_end - contig_start; bkmer = binary_kmer_from_str(contig, kmer_size); bkmer = binary_kmer_right_shift_one_base(bkmer); for(offset=contig_start, nxtbse=kmer_size-1; nxtbse < contig_len; nxtbse++,offset++) { nuc = dna_char_to_nuc(contig[nxtbse]); bkmer = binary_kmer_left_shift_add(bkmer, kmer_size, nuc); tmp_key = binary_kmer_get_key(bkmer, kmer_size); node = hash_table_find(&db_graph->ht, tmp_key); if(node != HASH_NOT_FOUND && (colour == -1 || db_node_has_col(db_graph, node, colour))) { nodes->b[n].key = node; nodes->b[n].orient = bkmer_get_orientation(bkmer, tmp_key); rpos->b[n] = offset; n++; } } } // Return number of bases from the last kmer found until read end size_t ret = (n == init_len ? r->seq.end /* No kmers found */ : r->seq.end - (rpos->b[n-1] + kmer_size)); nodes->len = rpos->len = n; // Check for sequence gaps for(i = init_len; i+1 < nodes->len; i++) { if(rpos->b[i]+1 < rpos->b[i+1]) { aln->seq_gaps = true; break; } } return ret; }