Alignment* BWA::generate_single_alignment(const char* bases, const unsigned read_length) { bwa_seq_t* sequence = create_sequence(bases,read_length); // Calculate paths. bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options); // Check for no alignments found and return null. if(sequence->n_aln == 0) { bwa_free_read_seq(1,sequence); return NULL; } // bwa_cal_sa_reg_gap destroys the bases / read length. Copy them back in. copy_bases_into_sequence(sequence,bases,read_length); // Pick best alignment and propagate its information into the sequence. bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); // Generate the best alignment from the sequence. Alignment* alignment = new Alignment; *alignment = generate_final_alignment_from_sequence(sequence); bwa_free_read_seq(1,sequence); return alignment; }
void pe_clean_core(char *fa_fn, clean_opt *opt) { bwa_seq_t *seqs, *s = NULL; int n_seqs = 0, i = 0; char *item = (char*) malloc(BUFSIZE), *solid = malloc(BUFSIZE); FILE *solid_file; clock_t t = clock(); GPtrArray *solid_reads = NULL; show_debug_msg(__func__, "Loading library %s...\n", fa_fn); seqs = load_reads(fa_fn, &n_seqs); show_debug_msg(__func__, "Saving k-mer frequencies: %.2f sec...\n", (float) (clock() - t) / CLOCKS_PER_SEC); sprintf(solid, "%s.solid", opt->lib_name); solid_file = xopen(solid, "w"); solid_reads = calc_solid_reads(seqs, n_seqs, opt, n_seqs * opt->stop_thre, 0, 1); for (i = 0; i < solid_reads->len; i++) { s = g_ptr_array_index(solid_reads, i); sprintf(item, "%s\n", s->name); fputs(item, solid_file); } free(item); free(solid); g_ptr_array_free(solid_reads, TRUE); bwa_free_read_seq(n_seqs, seqs); fclose(solid_file); }
static void *correct_thread(void *data) { correct_aux_t *d = (correct_aux_t*) data; int i = 0; bwa_seq_t *s = NULL, *query = NULL, *seqs = d->ht->seqs; readarray *low_kmer_reads = d->low_kmer_reads; alignarray *aligns = NULL; aligns = g_ptr_array_sized_new(N_DEFAULT_ALIGNS); for (i = d->start; i < d->end; i++) { if (i % 10000 == 0) show_msg(__func__, "Thread %d correction progress: [%d,%d,%d]... \n", d->tid, d->start, i, d->end); s = g_ptr_array_index(low_kmer_reads, i); if (is_repetitive_q(s)) { s->status = USED; continue; } // Only the fresh reads, or the reads tried once would be corrected. if (s->status != FRESH) continue; query = new_seq(s, s->len - 8, 0); pe_aln_query(s, s->seq, d->ht, MISMATCHES, s->len, 0, aligns); pe_aln_query(s, s->rseq, d->ht, MISMATCHES, s->len, 1, aligns); if (aligns->len >= 4) correct_bases(seqs, s, aligns, d->tid); s->status = TRIED; reset_alg(aligns); bwa_free_read_seq(1, query); //if (i > 10000) // break; } free_alg(aligns); show_msg(__func__, "Thread %d finished. \n", d->tid); }
bwa_alg *pe_aln_core(bwa_seq_t *seqs, const gap_opt_t *opt, const bwt_t *bwt_0, const bwt_t *bwt_1) { clock_t t; bwt_t *bwt[2]; bwa_alg *align = (bwa_alg*) malloc(sizeof(bwa_alg)); bwt[0] = bwt_0; bwt[1] = bwt_1; t = clock(); int n_seqs = 1; #ifdef HAVE_PTHREAD if (opt->n_threads <= 1) { // no multi-threading at all bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); } else { pthread_t *tid; pthread_attr_t attr; thread_aux_t *data; int j; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); data = (thread_aux_t*) calloc(opt->n_threads, sizeof(thread_aux_t)); tid = (pthread_t*) calloc(opt->n_threads, sizeof(pthread_t)); for (j = 0; j < opt->n_threads; ++j) { data[j].tid = j; data[j].bwt[0] = bwt[0]; data[j].bwt[1] = bwt[1]; data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; pthread_create(&tid[j], &attr, worker, data + j); } for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); free(data); free(tid); } #else bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); #endif t = clock(); bwa_seq_t *p = seqs; align->n_algs = p->n_aln; if (p->n_aln) { align->algs = (bwt_aln1_t*) calloc(p->n_aln, sizeof(bwt_aln1_t)); memcpy(align->algs, p->aln, sizeof(bwt_aln1_t) * p->n_aln); } bwa_free_read_seq(n_seqs, seqs); return align; }
void BWA::find_paths(const char* bases, const unsigned read_length, bwt_aln1_t*& paths, unsigned& num_paths, unsigned& best_path_count, unsigned& second_best_path_count) { bwa_seq_t* sequence = create_sequence(bases, read_length); // Calculate the suffix array interval for each sequence, storing the result in sequence->aln (and sequence->n_aln). // This method will destroy the contents of seq and rseq. bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options); paths = new bwt_aln1_t[sequence->n_aln]; memcpy(paths,sequence->aln,sequence->n_aln*sizeof(bwt_aln1_t)); num_paths = sequence->n_aln; // Call aln2seq to initialize the type of match present. bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); best_path_count = sequence->c1; second_best_path_count = sequence->c2; bwa_free_read_seq(1,sequence); }
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) { int i, n_seqs, tot_seqs = 0; bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; bwt_t *bwt; // initialization ks = bwa_open_reads(opt->mode, fn_fa); { // load BWT char *str = (char*)calloc(strlen(prefix) + 10, 1); strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); free(str); } // core loop err_fwrite(opt, sizeof(gap_opt_t), 1, stdout); while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) { tot_seqs += n_seqs; t = clock(); fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... "); #ifdef HAVE_PTHREAD if (opt->n_threads <= 1) { // no multi-threading at all bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); } else { pthread_t *tid; pthread_attr_t attr; thread_aux_t *data; int j; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (j = 0; j < opt->n_threads; ++j) { data[j].tid = j; data[j].bwt = bwt; data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; pthread_create(&tid[j], &attr, worker, data + j); } for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); free(data); free(tid); } #else bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); #endif fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); t = clock(); fprintf(stderr, "[bwa_aln_core] write to the disk... "); for (i = 0; i < n_seqs; ++i) { bwa_seq_t *p = seqs + i; err_fwrite(&p->n_aln, 4, 1, stdout); if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); } fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); bwa_free_read_seq(n_seqs, seqs); fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); } // destroy bwt_destroy(bwt); bwa_seq_close(ks); }
void BWA::generate_alignments_from_paths(const char* bases, const unsigned read_length, bwt_aln1_t* paths, const unsigned num_paths, const unsigned best_count, const unsigned second_best_count, Alignment*& alignments, unsigned& num_alignments) { bwa_seq_t* sequence = create_sequence(bases,read_length); sequence->aln = paths; sequence->n_aln = num_paths; // (Ab)use bwa_aln2seq to propagate values stored in the path out into the sequence itself. bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); // But overwrite key parts of the sequence in case the user passed back only a smaller subset // of the paths. sequence->c1 = best_count; sequence->c2 = second_best_count; sequence->type = sequence->c1 > 1 ? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; num_alignments = 0; for(unsigned i = 0; i < (unsigned)sequence->n_aln; i++) num_alignments += (sequence->aln + i)->l - (sequence->aln + i)->k + 1; alignments = new Alignment[num_alignments]; unsigned alignment_idx = 0; for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) { // Stub in a 'working' path, so that only the desired alignment is local-aligned. const bwt_aln1_t* path = paths + path_idx; bwt_aln1_t working_path = *path; // Loop through all alignments, aligning each one individually. for(unsigned sa_idx = path->k; sa_idx <= path->l; sa_idx++) { working_path.k = working_path.l = sa_idx; sequence->aln = &working_path; sequence->n_aln = 1; sequence->sa = sa_idx; sequence->strand = path->a; sequence->score = path->score; // Each time through bwa_refine_gapped, seq gets reversed. Revert the reverse. // TODO: Fix the interface to bwa_refine_gapped so its easier to work with. if(alignment_idx > 0) seq_reverse(sequence->len, sequence->seq, 0); // Copy the local alignment data into the alignment object. *(alignments + alignment_idx) = generate_final_alignment_from_sequence(sequence); alignment_idx++; } } sequence->aln = NULL; sequence->n_aln = 0; bwa_free_read_seq(1,sequence); }
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) { int i, n_seqs, tot_seqs = 0; bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; bwt_t *bwt; // initialization ks = bwa_open_reads(opt->mode, fn_fa); { // load BWT char *str = (char*)calloc(strlen(prefix) + 10, 1); strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); free(str); } // core loop err_fwrite(opt, sizeof(gap_opt_t), 1, stdout); while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) { tot_seqs += n_seqs; t = clock(); fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... "); #ifdef THREAD if (opt->n_threads <= 1) { // no multi-threading at all bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); } else { DWORD ThreadID; HANDLE *tid; thread_aux_t *data; int j; data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); tid = (HANDLE*)calloc(opt->n_threads, sizeof(HANDLE)); for (j = 0; j < opt->n_threads; ++j) { data[j].tid = j; data[j].bwt = bwt; data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; //create threads tid[j] = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) worker, data + j, 0, &ThreadID); if (tid[j] == NULL) { printf("CreateThread error: %d\n", GetLastError()); return; } } // Wait for all threads to terminate WaitForMultipleObjects(opt->n_threads, &tid[0], TRUE, INFINITE); free(data); free(tid); } #else bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); #endif fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); t = clock(); fprintf(stderr, "[bwa_aln_core] write to the disk... "); for (i = 0; i < n_seqs; ++i) { bwa_seq_t *p = seqs + i; err_fwrite(&p->n_aln, 4, 1, stdout); if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); } fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); bwa_free_read_seq(n_seqs, seqs); fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); } // destroy bwt_destroy(bwt); bwa_seq_close(ks); }
int bwa_read_seq1(bwa_seqio_t *bs, int iter, int tid, int thrds, bwa_seq_t **_seqs, int *n_avail, int mode, int trim_qual) { bwa_seq_t *p; bwa_seq_t *seqs = *_seqs; kseq_t *seq = bs->ks; int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24; long n_trimmed = 0, n_tot = 0; bool first; if (l_bc > BWA_MAX_BCLEN) { fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN); return 0; } if (bs->is_bam) { fprintf (stderr, "IS BAM! --- Port bwa_read_bam function\n"); bwa_free_read_seq(*n_avail, seqs); // return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input exit(0); } if (*n_avail == 0) { if (*n_avail) bwa_free_read_seq(*n_avail, seqs); seqs = (bwa_seq_t*)calloc(READ_SEQ_SIZE, sizeof(bwa_seq_t)); *_seqs = seqs; *n_avail = READ_SEQ_SIZE; } n_seqs = 0; first = true; //err_fwrite("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", strlen("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"), 1, stdout); //long cksm = 0; while ((l = kseq_read1(seq, iter, tid, thrds, &first)) >= 0) { if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) { // skip reads that are marked to be filtered by Casava char *s = index(seq->comment.s, ':'); if (s && *(++s) == 'Y') { continue; } } if (is_64 && seq->qual.l) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length p = &seqs[n_seqs++]; if(n_seqs > READ_SEQ_SIZE) { fprintf (stderr, "READ_SEQ_SIZE not big enough\n"); abort(); } init_bwa_seq_t(p); if (l_bc) { // then trim barcode for (i = 0; i < l_bc; ++i) p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]); p->bc[i] = 0; for (; i < seq->seq.l; ++i) seq->seq.s[i - l_bc] = seq->seq.s[i]; seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0; if (seq->qual.l) { for (i = l_bc; i < seq->qual.l; ++i) seq->qual.s[i - l_bc] = seq->qual.s[i]; seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0; } l = seq->seq.l; } else p->bc[0] = 0; //ComputeChecksum(seq->seq.s,seq->seq.l,&cksm); p->tid = -1; // no assigned to a thread p->full_len = p->clip_len = p->len = l; n_tot += p->full_len; if (p->llen < p->len) { if(p->seq) free(p->seq); p->llen = p->len; p->seq = (ubyte_t*)calloc(p->len, 1); p->rseq = (ubyte_t*)calloc(p->full_len, 1); } for (i = 0; i != p->full_len; ++i) p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]]; if (seq->qual.l) { // copy quality if(p->qual) free(p->qual); p->qual = (ubyte_t*)strdup((char*)seq->qual.s); if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); } memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); if(p->name) free(p->name); p->name = strdup((const char*)seq->name.s); { // trim /[12]$ int t = strlen(p->name); if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0'; } //if (n_seqs == n_needed) break; if(kseq_end(seq)) break; } if (n_seqs && trim_qual >= 1) fprintf(stderr, "[bwa_read_seq1] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); //fprintf(stderr, "%d tid: %d sequences: %d cksum: %lx\n", iter, tid, n_seqs, cksm); return n_seqs; }
extern "C" void bwa_seed2genome_cleanup_seq(bwa_seq_t *seq) { bwa_free_read_seq(1, seq) ; }