void nw_test_no_mismatches() { nw_aligner_t *nw = needleman_wunsch_new(); alignment_t *result = alignment_create(256); int match = 1; int mismatch = -2; int gap_open = -4; int gap_extend = -1; bool no_start_gap_penalty = false, no_end_gap_penalty = false; bool no_gaps_in_a = false, no_gaps_in_b = false; bool no_mismatches = true, case_sensitive = true; scoring_t scoring; scoring_init(&scoring, match, mismatch, gap_open, gap_extend, no_start_gap_penalty, no_end_gap_penalty, no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive); needleman_wunsch_align("atc", "ac", &scoring, nw, result); ASSERT(strcmp(result->result_a, "atc") == 0 && strcmp(result->result_b, "a-c") == 0); needleman_wunsch_align("cgatcga", "catcctcga", &scoring, nw, result); ASSERT(strcmp(result->result_a, "cgatc---ga") == 0 && strcmp(result->result_b, "c-atcctcga") == 0); alignment_free(result); needleman_wunsch_free(nw); }
/**
 * Writes one unmapped read to a BAM file.
 *
 * A temporary alignment is built from the read (a strdup'ed copy of its id,
 * plus its sequence and quality pointers and an empty cigar), converted with
 * convert_to_bam() and written with bam_fwrite().
 *
 * @param fq_read   read to write; its sequence and quality are only borrowed
 * @param bam_file  destination BAM file
 */
void write_unmapped_read(fastq_read_t *fq_read, bam_file_t *bam_file) {
  alignment_t *alig = alignment_new();

  alignment_init_single_end(strdup(fq_read->id), fq_read->sequence, fq_read->quality,
                            0, -1, -1, "", 0, 0, 0, 0, 0, NULL, alig);

  bam1_t *bam1 = convert_to_bam(alig, 33);
  bam_fwrite(bam1, bam_file);
  bam_destroy1(bam1);

  /* sequence/quality still belong to fq_read and the cigar is a string
   * literal: detach them before releasing the alignment (presumably so that
   * alignment_free() does not free memory it does not own). */
  alig->sequence = NULL;
  alig->quality = NULL;
  alig->cigar = NULL;
  alignment_free(alig);
}
/**
 * Writes every alignment stored in a list to a BAM file, then frees the list.
 *
 * Each non-NULL alignment is converted with convert_to_bam(), written with
 * bam_fwrite() and released with alignment_free(). A NULL entry is reported
 * through LOG_FATAL_F.
 *
 * @param array_list  list of alignment_t pointers; a NULL list is a no-op
 * @param bam_file    destination BAM file
 */
void write_mapped_read(array_list_t *array_list, bam_file_t *bam_file) {
  /* Check the list before it is handed to array_list_size(); the original
   * only tested it for NULL after it had already been used. */
  if (array_list == NULL) {
    return;
  }

  size_t num_items = array_list_size(array_list);

  for (size_t j = 0; j < num_items; j++) {
    alignment_t *alig = (alignment_t *) array_list_get(j, array_list);

    LOG_DEBUG("writting bam..\n");

    if (alig != NULL) {
      bam1_t *bam1 = convert_to_bam(alig, 33);
      bam_fwrite(bam1, bam_file);
      bam_destroy1(bam1);
      alignment_free(alig);
    } else {
      /* %lu expects unsigned long; size_t is converted explicitly */
      LOG_FATAL_F("alig is NULL, num_items = %lu\n", (unsigned long) num_items);
    }
  }

  array_list_free(array_list, NULL);
}
/* No gap is expected in the longer sequence */ void nw_test_no_gaps_in_longer() { nw_aligner_t *nw = needleman_wunsch_new(); alignment_t *aln = alignment_create(256); const char* seq_a = "aaaaacg"; const char* seq_b = "acgt"; int match = 1; int mismatch = -2; int gap_open = -4; int gap_extend = -1; bool no_start_gap_penalty = false, no_end_gap_penalty = false; bool no_gaps_in_a = true, no_gaps_in_b = false; bool no_mismatches = false, case_sensitive = true; scoring_t scoring; scoring_init(&scoring, match, mismatch, gap_open, gap_extend, no_start_gap_penalty, no_end_gap_penalty, no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive); needleman_wunsch_align(seq_a, seq_b, &scoring, nw, aln); // ASSERT(strcmp(aln->result_a, "aaaaacg") == 0 && // strcmp(aln->result_b, "acgt---") == 0); ASSERT(strcmp(aln->result_a, "aaaaacg-") == 0 && strcmp(aln->result_b, "a----cgt") == 0); alignment_free(aln); needleman_wunsch_free(nw); }
/* First sequence is aligned to the corresponding (equal) substring of the second * sequence because both gaps at start and at end are free */ void nw_test_free_gaps_at_ends() { nw_aligner_t *nw = needleman_wunsch_new(); alignment_t *result = alignment_create(256); const char* seq_a = "acg"; const char* seq_b = "tttacgttt"; int match = 1; int mismatch = -1; int gap_open = -4; int gap_extend = -1; bool no_start_gap_penalty = true, no_end_gap_penalty = true; bool no_gaps_in_a = false, no_gaps_in_b = false; bool no_mismatches = false, case_sensitive = true; scoring_t scoring; scoring_init(&scoring, match, mismatch, gap_open, gap_extend, no_start_gap_penalty, no_end_gap_penalty, no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive); needleman_wunsch_align(seq_a, seq_b, &scoring, nw, result); ASSERT(strcmp(result->result_a, "---acg---") == 0 && strcmp(result->result_b, "tttacgttt") == 0); alignment_free(result); needleman_wunsch_free(nw); }
/*
 * Releases a CallDecomp and everything it holds: the alignment buffer, the
 * Needleman-Wunsch aligner, the scoring object, the BCF record and the string
 * buffer, and finally the struct itself.
 */
void call_decomp_destroy(CallDecomp *dc)
{
  /* Alignment machinery */
  alignment_free(dc->aln);
  needleman_wunsch_free(dc->nw_aligner);
  ctx_free(dc->scoring);

  /* Output record and scratch buffer */
  bcf_destroy(dc->v);
  strbuf_dealloc(&dc->sbuf);

  /* The container goes last */
  ctx_free(dc);
}
void align(char* seq_a, char* seq_b) { // Variables to store alignment result sw_aligner_t *sw = smith_waterman_new(); alignment_t *result = alignment_create(256); // Decide on scoring int match = 1; int mismatch = -2; int gap_open = -4; int gap_extend = -1; // Don't penalise gaps at the start // ACGATTT // ----TTT would score +3 (when match=+1) char no_start_gap_penalty = 1; // ..or gaps at the end e.g. // ACGATTT // ACGA--- would score +4 (when match=+1) char no_end_gap_penalty = 1; char no_gaps_in_a = 0, no_gaps_in_b = 0; char no_mismatches = 0; // Compare character case-sensitively (usually set to 0 for DNA etc) char case_sensitive = 0; scoring_t scoring; scoring_init(&scoring, match, mismatch, gap_open, gap_extend, no_start_gap_penalty, no_end_gap_penalty, no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive); // Add some special cases // x -> y means x in seq1 changing to y in seq2 scoring_add_mutation(&scoring, 'a', 'c', -2); // a -> c give substitution score -2 scoring_add_mutation(&scoring, 'c', 'a', -1); // c -> a give substitution score -1 // We could also prohibit the aligning of characters not given as special cases // scoring.use_match_mismatch = 0; smith_waterman_align(seq_a, seq_b, &scoring, sw); while(smith_waterman_fetch(sw, result)) { printf("seqA: %s [start:%zu]\n", result->result_a, result->pos_a); printf("seqB: %s [start:%zu]\n", result->result_b, result->pos_b); printf("alignment score: %i\n\n", result->score); } // Free memory for storing alignment results smith_waterman_free(sw); alignment_free(result); }
void nw_test_no_mismatches_rand() { nw_aligner_t *nw = needleman_wunsch_new(); alignment_t *aln = alignment_create(256); int match = 1; int mismatch = -2; int gap_open = -4; int gap_extend = -1; bool no_start_gap_penalty = false, no_end_gap_penalty = false; bool no_gaps_in_a = false, no_gaps_in_b = false; bool no_mismatches = true, case_sensitive = true; scoring_t scoring; scoring_init(&scoring, match, mismatch, gap_open, gap_extend, no_start_gap_penalty, no_end_gap_penalty, no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive); char seqa[100], seqb[100]; size_t i; // Run 50 random alignments for(i = 0; i < 50; i++) { make_rand_seq(seqa, sizeof(seqa)); make_rand_seq(seqb, sizeof(seqb)); needleman_wunsch_align(seqa, seqb, &scoring, nw, aln); // Check no mismatches char *a = aln->result_a, *b = aln->result_b; while(1) { ASSERT(*a == '-' || *b == '-' || *a == *b); // printf("Seq: '%s' '%s'\n", aln->result_a, aln->result_b); // exit(EXIT_FAILURE); if(!*a && !*b) break; a++; b++; } } alignment_free(aln); needleman_wunsch_free(nw); }
int main(int argc, char **argv) { if(argc != 2) print_usage(argv); char *seq = argv[1]; size_t seqlen = strlen(seq); // Go int match = 1, mismatch = -1, gap_open = -4, gap_extend = -1; bool no_start_gap_penalty = false, no_end_gap_penalty = false; bool no_gaps_in_a = true, no_gaps_in_b = true; bool no_mismatches = true, case_sensitive = true; scoring_t scoring; scoring_init(&scoring, match, mismatch, gap_open, gap_extend, no_start_gap_penalty, no_end_gap_penalty, no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive); // Alignment results stored here sw_aligner_t *sw = smith_waterman_new(); alignment_t *aln = alignment_create(seqlen+1); smith_waterman_align(seq, seq, &scoring, sw); // Loop over results while(smith_waterman_fetch(sw, aln)) { if(aln->pos_a < aln->pos_b) { fputs(aln->result_a, stdout); printf(" [%zu,%zu]\n", aln->pos_a, aln->pos_b); } } smith_waterman_free(sw); alignment_free(aln); return EXIT_SUCCESS; }
int main(int argc, char* argv[]) { #ifdef SEQ_ALIGN_VERBOSE printf("VERBOSE: on\n"); #endif sw_set_default_scoring(); cmd = cmdline_new(argc, argv, &scoring, SEQ_ALIGN_SW_CMD); // Align! sw = smith_waterman_new(); result = alignment_create(256); if(cmd->seq1 != NULL) { // Align seq1 and seq2 align(cmd->seq1, cmd->seq2, NULL, NULL); } // Align from files size_t i, num_of_file_pairs = cmdline_get_num_of_file_pairs(cmd); for(i = 0; i < num_of_file_pairs; i++) { const char *file1 = cmdline_get_file1(cmd, i); const char *file2 = cmdline_get_file2(cmd, i); if(file1 != NULL && *file1 == '\0' && file2 == NULL) { wait_on_keystroke = 1; file1 = "-"; } align_from_file(file1, file2, &align_pair_from_file, !cmd->interactive); } // Free memory for storing alignment results smith_waterman_free(sw); alignment_free(result); cmdline_free(cmd); return EXIT_SUCCESS; }
void sw_test_no_gaps_smith_waterman() { sw_aligner_t *sw = smith_waterman_new(); alignment_t *result = alignment_create(256); const char* seq_a = "gacag"; const char* seq_b = "tgaagt"; int match = 1; int mismatch = -2; int gap_open = -4; int gap_extend = -1; bool no_start_gap_penalty = false, no_end_gap_penalty = false; bool no_gaps_in_a = true, no_gaps_in_b = true; bool no_mismatches = false, case_sensitive = true; scoring_t scoring; scoring_init(&scoring, match, mismatch, gap_open, gap_extend, no_start_gap_penalty, no_end_gap_penalty, no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive); smith_waterman_align(seq_a, seq_b, &scoring, sw); smith_waterman_fetch(sw, result); ASSERT(strcmp(result->result_a, "ga") == 0 && strcmp(result->result_b, "ga") == 0); smith_waterman_fetch(sw, result); ASSERT(strcmp(result->result_a, "ag") == 0 && strcmp(result->result_b, "ag") == 0); alignment_free(result); smith_waterman_free(sw); }
int sa_sam_writer(void *data) { sa_wf_batch_t *wf_batch = (sa_wf_batch_t *) data; sa_mapping_batch_t *mapping_batch = (sa_mapping_batch_t *) wf_batch->mapping_batch; if (mapping_batch == NULL) { printf("bam_writer1: error, NULL mapping batch\n"); return 0; } /* for (int i = 0; i < NUM_COUNTERS; i++) { counters[i] += mapping_batch->counters[i]; } */ #ifdef _TIMING for (int i = 0; i < NUM_TIMING; i++) { func_times[i] += mapping_batch->func_times[i]; } #endif int num_mismatches, num_cigar_ops; size_t flag, pnext = 0, tlen = 0; char *cigar_string, *cigar_M_string, *rnext = "*"; fastq_read_t *read; array_list_t *read_list = mapping_batch->fq_reads; array_list_t *mapping_list, *mate_list; FILE *out_file = (FILE *) wf_batch->writer_input->bam_file; sa_genome3_t *genome = wf_batch->sa_index->genome; size_t num_reads, num_mappings, num_mate_mappings; num_reads = mapping_batch->num_reads; if (mapping_batch->options->pair_mode != SINGLE_END_MODE) { // PAIR MODE int len; char *sequence, *quality; char *seq, *opt_fields; alignment_t *alig; for (size_t i = 0; i < num_reads; i++) { read = (fastq_read_t *) array_list_get(i, read_list); // seq = read->sequence; /* if (i % 2 == 0) { mate_list = mapping_batch->mapping_lists[i+1]; num_mate_mappings = array_list_size(mate_list); } else { mate_list = mapping_list; num_mate_mappings = num_mappings; } */ mapping_list = mapping_batch->mapping_lists[i]; num_mappings = array_list_size(mapping_list); num_total_mappings += num_mappings; #ifdef _VERBOSE if (num_mappings > 1) { num_dup_reads++; num_total_dup_reads += num_mappings; } #endif if (num_mappings > 0) { num_mapped_reads++; if (num_mappings > 1) { num_multihit_reads++; } for (size_t j = 0; j < num_mappings; j++) { alig = (alignment_t *) array_list_get(j, mapping_list); /* // update alignment alig->secondary_alignment = 0; if (num_mate_mappings != 1) { alig->is_mate_mapped = 0; alig->is_paired_end_mapped = 0; alig->mate_strand = 0; } */ if (alig->optional_fields) { opt_fields = (char *) 
calloc(strlen(alig->optional_fields) + 100, sizeof(char)); sprintf(opt_fields, "NH:i:%i\t%s", num_mappings, alig->optional_fields); // sprintf(opt_fields, "NH:i:%i\t%s\tXU:i:%i", num_mappings, alig->optional_fields, mapping_batch->status[i]); } else { opt_fields = (char *) calloc(100, sizeof(char)); sprintf(opt_fields, "NH:i:%i", num_mappings); // sprintf(opt_fields, "NH:i:%i\tXU:i:%i", num_mappings, mapping_batch->status[i]); } /* // update alignment alig->secondary_alignment = 0; if (num_mate_mappings != 1) { alig->is_mate_mapped = 0; alig->is_paired_end_mapped = 0; alig->mate_strand = 0; } */ flag = 0; if (alig->is_paired_end) flag += BAM_FPAIRED; if (alig->is_paired_end_mapped) flag += BAM_FPROPER_PAIR; if (!alig->is_seq_mapped) flag += BAM_FUNMAP; if ((!alig->is_mate_mapped) && (alig->is_paired_end)) flag += BAM_FMUNMAP; if (alig->mate_strand) flag += BAM_FMREVERSE; if (alig->pair_num == 1) flag += BAM_FREAD1; if (alig->pair_num == 2) flag += BAM_FREAD2; if (alig->secondary_alignment) flag += BAM_FSECONDARY; if (alig->fails_quality_check) flag += BAM_FQCFAIL; if (alig->pc_optical_duplicate) flag += BAM_FDUP; if (alig->seq_strand) flag += BAM_FREVERSE; fprintf(out_file, "%s\t%lu\t%s\t%i\t%i\t%s\t%s\t%i\t%i\t%s\t%s\t%s\n", read->id, flag, genome->chrom_names[alig->chromosome], alig->position + 1, (num_mappings > 1 ? 0 : alig->mapq), //60, //(alig->map_quality > 3 ? 0 : alig->map_quality), alig->cigar, (alig->chromosome == alig->mate_chromosome ? 
"=" : genome->chrom_names[alig->mate_chromosome]), alig->mate_position + 1, alig->template_length, alig->sequence, alig->quality, opt_fields ); // free memory free(opt_fields); alignment_free(alig); } // end for num_mappings } else { num_unmapped_reads++; opt_fields = (char *) calloc(100, sizeof(char)); sprintf(opt_fields, "XM:i:%i XU:i:%i", num_mappings, mapping_batch->status[i]); if (read->adapter) { len = read->length + abs(read->adapter_length); sequence = (char *) malloc(len + 1); quality = (char *) malloc(len + 1); if (read->adapter_length < 0) { strcpy(quality, read->adapter_quality); strcat(quality, read->quality); } else { strcpy(quality, read->quality); strcat(quality, read->adapter_quality); } if ((read->adapter_strand == 0 && read->adapter_length < 0) || (read->adapter_strand == 1 && read->adapter_length > 0)) { strcpy(sequence, read->adapter); strcat(sequence, read->sequence); } else { strcpy(sequence, read->sequence); strcat(sequence, read->adapter); } sequence[len] = 0; quality[len] = 0; } else { sequence = read->sequence; quality = read->quality; } fprintf(out_file, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\t%s\n", read->id, sequence, quality, opt_fields ); free(opt_fields); if (read->adapter) { free(sequence); free(quality); } } array_list_free(mapping_list, (void *) NULL); } } else { // SINGLE MODE int len, mapq; char *seq; seed_cal_t *cal; cigar_t *cigar; char *sequence, *revcomp, *quality; for (size_t i = 0; i < num_reads; i++) { read = (fastq_read_t *) array_list_get(i, read_list); mapping_list = mapping_batch->mapping_lists[i]; num_mappings = array_list_size(mapping_list); num_total_mappings += num_mappings; #ifdef _VERBOSE if (num_mappings > 1) { num_dup_reads++; num_total_dup_reads += num_mappings; } #endif if (num_mappings > 0) { num_mapped_reads++; if (num_mappings > 1) { num_multihit_reads++; } for (size_t j = 0; j < num_mappings; j++) { cal = (seed_cal_t *) array_list_get(j, mapping_list); if (read->adapter) { // sequences and cigar len = 
read->length + abs(read->adapter_length); sequence = (char *) malloc(len + 1); revcomp = (char *) malloc(len + 1); quality = (char *) malloc(len + 1); cigar = cigar_new_empty(); if (read->adapter_length < 0) { strcpy(quality, read->adapter_quality); strcat(quality, read->quality); } else { strcpy(quality, read->quality); strcat(quality, read->adapter_quality); } if ( (cal->strand == 1 && ((read->adapter_strand == 0 && read->adapter_length > 0) || (read->adapter_strand == 1 && read->adapter_length < 0))) || (cal->strand == 0 && ((read->adapter_strand == 0 && read->adapter_length < 0) || (read->adapter_strand == 1 && read->adapter_length > 0))) ) { strcpy(sequence, read->adapter); strcat(sequence, read->sequence); strcpy(revcomp, read->adapter_revcomp); strcat(revcomp, read->revcomp); cigar_append_op(abs(read->adapter_length), 'S', cigar); cigar_concat(&cal->cigar, cigar); } else { strcpy(sequence, read->sequence); strcat(sequence, read->adapter); strcpy(revcomp, read->revcomp); strcat(revcomp, read->adapter_revcomp); cigar_concat(&cal->cigar, cigar); cigar_append_op(read->adapter_length, 'S', cigar); } sequence[len] = 0; revcomp[len] = 0; quality[len] = 0; } else { // sequences and cigar sequence = read->sequence; revcomp = read->revcomp; quality = read->quality; cigar = &cal->cigar; } if (cal->strand) { flag = 16; seq = revcomp; } else { flag = 0; seq = sequence; } /* if (i == 0) { flag += BAM_FSECONDARY; } */ cigar_string = cigar_to_string(cigar); cigar_M_string = cigar_to_M_string(&num_mismatches, &num_cigar_ops, cigar); if (num_mappings > 1) { cal->mapq = 0; } fprintf(out_file, "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%lu\t%i\t%s\t%s\tNH:i:%i\tNM:i:%i\n", read->id, flag, genome->chrom_names[cal->chromosome_id], cal->start + 1, (num_mappings == 1 ? 
cal->mapq : 0), cigar_M_string, rnext, pnext, tlen, seq, quality, num_mappings, num_mismatches ); // free memory free(cigar_M_string); free(cigar_string); seed_cal_free(cal); if (read->adapter) { free(sequence); free(revcomp); free(quality); cigar_free(cigar); } } } else { num_unmapped_reads++; if (read->adapter) { // sequences and cigar len = read->length + abs(read->adapter_length); sequence = (char *) malloc(len + 1); quality = (char *) malloc(len + 1); if (read->adapter_length < 0) { strcpy(quality, read->adapter_quality); strcat(quality, read->quality); } else { strcpy(quality, read->quality); strcat(quality, read->adapter_quality); } if ((read->adapter_strand == 0 && read->adapter_length < 0) || (read->adapter_strand == 1 && read->adapter_length > 0)) { strcpy(sequence, read->adapter); strcat(sequence, read->sequence); } else { strcpy(sequence, read->sequence); strcat(sequence, read->adapter); } sequence[len] = 0; quality[len] = 0; } else { // sequences sequence = read->sequence; quality = read->quality; } fprintf(out_file, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", read->id, sequence, quality ); if (read->adapter) { free(sequence); free(quality); } } array_list_free(mapping_list, (void *) NULL); } // end for num_reads } // free memory sa_mapping_batch_free(mapping_batch); if (wf_batch) sa_wf_batch_free(wf_batch); return 0; }
int sam_writer(void *data) { batch_t *batch = (batch_t *) data; batch_writer_input_t *writer_input = batch->writer_input; //bam_file_t *bam_file = writer_input->bam_file; FILE *out_file = (FILE *) writer_input->bam_file; genome_t *genome = writer_input->genome; fastq_read_t *read; mapping_batch_t *mapping_batch = (mapping_batch_t *) batch->mapping_batch; size_t num_reads = array_list_size(mapping_batch->fq_batch); //linked_list_t *linked_list = writer_input->list_p; array_list_t *read_list = mapping_batch->fq_batch; array_list_t *mapping_list; size_t num_mappings; size_t num_mapped_reads = 0; size_t total_mappings = 0; alignment_t *alig; int flag, pnext = 0, tlen = 0; char rnext[4] = "*\0"; extern st_bwt_t st_bwt; st_bwt.total_reads += num_reads; for (size_t i = 0; i < num_reads; i++) { read = (fastq_read_t *) array_list_get(i, read_list); mapping_list = mapping_batch->mapping_lists[i]; num_mappings = array_list_size(mapping_list); total_mappings += num_mappings; //printf("%i.Read %s (num_mappings %i)\n", i, read->id, num_mappings); if (num_mappings > 0) { if (num_mappings == 1) { st_bwt.single_alig++; } else { st_bwt.multi_alig++; } num_mapped_reads++; for (size_t j = 0; j < num_mappings; j++) { alig = (alignment_t *) array_list_get(j, mapping_list); flag = (alig->seq_strand ? 16 : 0); fprintf(out_file, "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%i\t%i\t%s\t%s\n", alig->query_name, flag, genome->chr_name[alig->chromosome], alig->position + 1, alig->map_quality, alig->cigar, rnext, pnext, tlen, alig->sequence, alig->quality); //alig->optional_fields); alignment_free(alig); } array_list_free(mapping_list, NULL); } else { total_mappings++; if (mapping_list) { array_list_free(mapping_list, NULL); } fprintf(out_file, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", read->id, read->sequence, read->quality); } } if (mapping_batch) { mapping_batch_free(mapping_batch); } if (batch) batch_free(batch); basic_statistics_add(num_reads, num_mapped_reads, total_mappings, 0, basic_st); }
/*
 * Python entry point for Needleman-Wunsch alignment.
 *
 * Arguments: seq1, seq2 (strings), then optionally `matrix` and the integer
 * options match, mismatch, gap_open, gap_extend, no_start_gap_penalty,
 * no_end_gap_penalty, no_gaps_in_a, no_gaps_in_b, no_mismatches and
 * case_sensitive. `matrix`, when given, is a mapping whose keys are
 * 2-tuples of one-character strings (a, b) and whose values are integer
 * scores for a in seq1 changing to b in seq2.
 *
 * Returns a tuple (aligned seq1, aligned seq2, score), or NULL on error.
 *
 * Fixes over the previous version:
 *  - in case-sensitive mode the second mutation character was taken from
 *    char_a instead of char_b;
 *  - the list returned by PyMapping_Items() was never released on success;
 *  - tolower() now receives the character converted to unsigned char.
 */
static PyObject * nw_align_wrapper(PyObject *self, PyObject *args, PyObject *kw)
{
  const char *seq1, *seq2;

  // Default scoring
  int match = 1;
  int mismatch = -2;
  int gap_open = -4;
  int gap_extend = -1;

  // Don't penalise gaps at the start
  // ACGATTT
  // ----TTT would score +3 (when match=+1)
  int no_start_gap_penalty = 0;

  // ..or gaps at the end e.g.
  // ACGATTT
  // ACGA--- would score +4 (when match=+1)
  int no_end_gap_penalty = 0;

  int no_gaps_in_a = 0, no_gaps_in_b = 0;
  int no_mismatches = 0;

  // Compare character case-sensitively (usually set to 0 for DNA etc)
  int case_sensitive = 0;

  PyObject * matrix = NULL;

  static char *kwlist[] = {"seq1","seq2", "matrix", "match", "mismatch",
                           "gap_open","gap_extend", "no_start_gap_penalty",
                           "no_end_gap_penalty", "no_gaps_in_a", "no_gaps_in_b",
                           "no_mismatches", "case_sensitive", NULL};

  PyObject *res = NULL;

  if(!PyArg_ParseTupleAndKeywords(args, kw, "ss|Oiiiiiiiiii", kwlist,
                                  &seq1, &seq2, &matrix, &match, &mismatch,
                                  &gap_open, &gap_extend,
                                  &no_start_gap_penalty, &no_end_gap_penalty,
                                  &no_gaps_in_a, &no_gaps_in_b,
                                  &no_mismatches, &case_sensitive))
    return NULL;

  // Variables to store alignment result
  alignment_t *result = alignment_create(256);
  nw_aligner_t *nw = needleman_wunsch_new();

  scoring_t scoring;
  scoring_init(&scoring, match, mismatch, gap_open, gap_extend,
               no_start_gap_penalty, no_end_gap_penalty,
               no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive);

  // Add the special cases from `matrix`
  // x -> y means x in seq1 changing to y in seq2
  if(matrix != NULL)
  {
    PyObject * mapping = PyMapping_Items(matrix);
    if(mapping == NULL) goto error;

    int n = PySequence_Size(mapping);
    PyObject *item;
    int value;
    PyObject *key;
    char * char_a;
    char * char_b;
    int i;

    for(i = 0; i < n; i++)
    {
      item = PySequence_GetItem(mapping, i);
      if(item == NULL || !PyTuple_Check(item))
      {
        Py_XDECREF(item);
        Py_DECREF(mapping);
        goto error;
      }
      if(!PyArg_ParseTuple(item, "Oi", &key, &value))
      {
        PyErr_SetString(PyExc_RuntimeError, "Values of matrix dict should be integers");
        Py_XDECREF(item);
        Py_DECREF(mapping);
        goto error;
      }
      if(!PyTuple_Check(key))
      {
        PyErr_SetString(PyExc_RuntimeError, "Keys of matrix dict should be tuples");
        Py_XDECREF(item);
        Py_DECREF(mapping);
        goto error;
      }
      if(!PyArg_ParseTuple(key, "ss", &char_a, &char_b))
      {
        PyErr_SetString(PyExc_RuntimeError, "Keys of matrix dict should be tuples with 2 characters as elements.");
        Py_XDECREF(item);
        Py_DECREF(mapping);
        goto error;
      }
      if(strlen(char_a) != 1 || strlen(char_b) != 1)
      {
        PyErr_SetString(PyExc_RuntimeError, "Character length should be 1");
        Py_XDECREF(item);
        Py_DECREF(mapping);
        goto error;
      }

      // char_a/char_b point into `item`, so they are used before it is released
      scoring_add_mutation(&scoring,
                           case_sensitive ? *char_a : tolower((unsigned char) *char_a),
                           case_sensitive ? *char_b : tolower((unsigned char) *char_b),
                           value);

      Py_DECREF(item);
    }

    // The items list is our own reference; release it once consumed
    Py_DECREF(mapping);
  }

  // We could also prohibit the aligning of characters not given as special cases
  // scoring.use_match_mismatch = 0;

  needleman_wunsch_align(seq1, seq2, &scoring, nw, result);

  res = Py_BuildValue("ssi", result->result_a, result->result_b, result->score);

error:
  // Free memory for storing alignment results (res stays NULL on failure)
  needleman_wunsch_free(nw);
  alignment_free(result);

  return res;
}
/**
 * Writer loop: drains write batches from input_p->list_p until
 * list_remove_item() returns NULL.
 *
 * Batches flagged MATCH_FLAG or MISMATCH_FLAG hold an array of alignments;
 * each is converted to BAM, written to the match file and freed. Batches
 * flagged SPLICE_EXACT_FLAG or SPLICE_EXTEND_FLAG are written as raw bytes to
 * the corresponding splice file. Batches with any other flag are dropped.
 *
 * Changes from the previous version: a splice file that cannot be opened is
 * reported on stderr and never passed to fclose() (its batches are skipped,
 * as the existing NULL test on `fd` already did); the unused timespec local
 * was removed.
 *
 * @param input_p  writer configuration (file names and the batch list)
 */
void batch_writer(batch_writer_input_t* input_p) {
  alignment_t **buffer_p;
  bam1_t *bam1_p;
  bam_header_t *bam_header_p;
  bam_file_t *bam_file_p;

  char *match_filename = input_p->match_filename;
  char *splice_exact_filename = input_p->splice_exact_filename;
  char *splice_extend_filename = input_p->splice_extend_filename;

  list_t *list_p = input_p->list_p;

  printf("batch_writer (%i): START\n", omp_get_thread_num());

  list_item_t *item_p = NULL;
  write_batch_t *batch_p;

  FILE *fd;
  FILE *splice_exact_fd = fopen(splice_exact_filename, "w");
  if (splice_exact_fd == NULL) {
    fprintf(stderr, "batch_writer: cannot open %s for writing\n", splice_exact_filename);
  }
  FILE *splice_extend_fd = fopen(splice_extend_filename, "w");
  if (splice_extend_fd == NULL) {
    fprintf(stderr, "batch_writer: cannot open %s for writing\n", splice_extend_filename);
  }

  bam_header_p = bam_header_new(HUMAN, NCBI37, input_p->header_filename);
  bam_file_p = bam_fopen_mode(match_filename, bam_header_p, "w");
  bam_fwrite_header(bam_header_p, bam_file_p);

  // main loop
  while ( (item_p = list_remove_item(list_p)) != NULL ) {

    if (time_on) { timing_start(BATCH_WRITER, 0, timing_p); }

    batch_p = (write_batch_t *) item_p->data_p;

    if (batch_p->flag == MATCH_FLAG || batch_p->flag == MISMATCH_FLAG) {
      // alignments: convert, write and release each one
      buffer_p = (alignment_t **) batch_p->buffer_p;
      for (int i = 0; i < batch_p->size; i++) {
        bam1_p = convert_to_bam(buffer_p[i], 33);
        bam_fwrite(bam1_p, bam_file_p);
        bam_destroy1(bam1_p);
        alignment_free(buffer_p[i]);
      }
    } else {
      // raw splice data: pick the destination from the batch flag
      if (batch_p->flag == SPLICE_EXACT_FLAG) {
        fd = splice_exact_fd;
      } else if (batch_p->flag == SPLICE_EXTEND_FLAG) {
        fd = splice_extend_fd;
      } else {
        fd = NULL;
      }

      if (fd != NULL) {
        fwrite((char *) batch_p->buffer_p, batch_p->size, 1, fd);
      }
    }

    write_batch_free(batch_p);
    list_item_free(item_p);

    if (time_on) { timing_stop(BATCH_WRITER, 0, timing_p); }
  } // end of batch loop

  // fclose() must not be given a stream that failed to open
  if (splice_exact_fd != NULL) {
    fclose(splice_exact_fd);
  }
  if (splice_extend_fd != NULL) {
    fclose(splice_extend_fd);
  }

  bam_fclose(bam_file_p);

  printf("batch_writer: END\n");
}
// Clean up pairwise aligner static void nw_aligner_destroy() { alignment_free(aln); needleman_wunsch_free(nw_aligner); }
/*
 * Runs the batch decoder over the control file.
 *
 * Starts both searches, then reads the control file line by line. Lines
 * before -ctloffset, or not on a -ctlincr stride from it, are skipped;
 * reading stops once -ctlcount lines past the offset have been seen (unless
 * -ctlcount is -1). When an alignment file is open it is advanced in step
 * with the control file and its current line is parsed into an alignment
 * that is handed to the decoder. Each control line holds up to four words:
 * file, start frame, end frame and utterance id.
 *
 * Always returns 0.
 */
int batch_decoder_run(batch_decoder_t *bd)
{
    int32 ctloffset, ctlcount, ctlincr;
    lineiter_t *li, *ali = NULL;

    search_run(bd->fwdtree);
    search_run(bd->fwdflat);

    ctloffset = cmd_ln_int32_r(bd->config, "-ctloffset");
    ctlcount = cmd_ln_int32_r(bd->config, "-ctlcount");
    ctlincr = cmd_ln_int32_r(bd->config, "-ctlincr");

    if (bd->alignfh)
        ali = lineiter_start(bd->alignfh);

    for (li = lineiter_start(bd->ctlfh); li; li = lineiter_next(li)) {
        alignment_t *al = NULL;
        char *wptr[4];
        int32 nf;

        /* Skipped lines still consume one line of the alignment file. */
        if (li->lineno < ctloffset
            || (li->lineno - ctloffset) % ctlincr != 0) {
            if (ali)
                ali = lineiter_next(ali);
            continue;
        }

        if (ctlcount != -1 && li->lineno >= ctloffset + ctlcount)
            break;

        if (ali)
            al = parse_alignment(ali->buf, search_factory_d2p(bd->sf));

        nf = str2words(li->buf, wptr, 4);
        if (nf < 0) {
            E_ERROR("Unexpected extra data in control file at line %d\n", li->lineno);
        }
        else if (nf > 0) {
            /* Missing fields fall back to: start 0, end -1, no utterance id. */
            char *file = wptr[0];
            int32 sf = (nf > 1) ? atoi(wptr[1]) : 0;
            int32 ef = (nf > 2) ? atoi(wptr[2]) : -1;
            char *uttid = (nf > 3) ? wptr[3] : NULL;

            /* Do actual decoding. */
            batch_decoder_decode(bd, file, uttid, sf, ef, al);
        }
        /* nf == 0: empty line, nothing to decode. */

        alignment_free(al);
        if (ali)
            ali = lineiter_next(ali);
    }

    featbuf_producer_shutdown(search_factory_featbuf(bd->sf));
    return 0;
}
int sa_bam_writer(void *data) { sa_wf_batch_t *wf_batch = (sa_wf_batch_t *) data; sa_mapping_batch_t *mapping_batch = (sa_mapping_batch_t *) wf_batch->mapping_batch; if (mapping_batch == NULL) { printf("bam_writer1: error, NULL mapping batch\n"); return 0; } // for (int i = 0; i < NUM_COUNTERS; i++) { // counters[i] += mapping_batch->counters[i]; // } #ifdef _TIMING for (int i = 0; i < NUM_TIMING; i++) { func_times[i] += mapping_batch->func_times[i]; } #endif int flag, len; char *sequence, *quality; fastq_read_t *read; array_list_t *read_list = mapping_batch->fq_reads; bam1_t *bam1; alignment_t *alig; array_list_t *mapping_list; bam_file_t *out_file = wf_batch->writer_input->bam_file; sa_genome3_t *genome = wf_batch->sa_index->genome; size_t num_reads, num_mappings, num_mate_mappings; num_reads = mapping_batch->num_reads; for (size_t i = 0; i < num_reads; i++) { read = (fastq_read_t *) array_list_get(i, read_list); mapping_list = mapping_batch->mapping_lists[i]; num_mappings = array_list_size(mapping_list); num_total_mappings += num_mappings; #ifdef _VERBOSE if (num_mappings > 1) { num_dup_reads++; num_total_dup_reads += num_mappings; } #endif if (num_mappings > 0) { num_mapped_reads++; if (num_mappings > 1) { num_multihit_reads++; } for (size_t j = 0; j < num_mappings; j++) { alig = (alignment_t *) array_list_get(j, mapping_list); // update alignment if (num_mappings > 1) { alig->map_quality = 0; } else { alig->map_quality = alig->mapq; } bam1 = convert_to_bam(alig, 33); bam_fwrite(bam1, out_file); bam_destroy1(bam1); alignment_free(alig); } } else { num_unmapped_reads++; if (read->adapter) { // sequences and cigar len = read->length + abs(read->adapter_length); sequence = (char *) malloc(len + 1); quality = (char *) malloc(len + 1); if (read->adapter_length < 0) { strcpy(quality, read->adapter_quality); strcat(quality, read->quality); } else { strcpy(quality, read->quality); strcat(quality, read->adapter_quality); } if ((read->adapter_strand == 0 && 
read->adapter_length < 0) || (read->adapter_strand == 1 && read->adapter_length > 0)) { strcpy(sequence, read->adapter); strcat(sequence, read->sequence); } else { strcpy(sequence, read->sequence); strcat(sequence, read->adapter); } sequence[len] = 0; quality[len] = 0; } else { // sequences sequence = read->sequence; quality = read->quality; } alig = alignment_new(); alignment_init_single_end(strdup(read->id), sequence, quality, 0, -1, -1, /*strdup(aux)*/"", 0, 0, 0, 0, 0, NULL, alig); bam1 = convert_to_bam(alig, 33); bam_fwrite(bam1, out_file); // free memory bam_destroy1(bam1); alig->sequence = NULL; alig->quality = NULL; alig->cigar = NULL; alignment_free(alig); if (read->adapter) { free(sequence); free(quality); } } array_list_free(mapping_list, (void *) NULL); } // free memory sa_mapping_batch_free(mapping_batch); if (wf_batch) sa_wf_batch_free(wf_batch); return 0; }
/**
 * Writer loop: drains aligner batches from input->list_p until
 * list_remove_item() returns NULL and writes them to the match BAM file.
 *
 * For each read of a batch, every non-NULL alignment in its mapping list is
 * converted to BAM, written and freed. A read whose list is NULL or empty is
 * written as a temporary alignment that borrows its name, sequence and
 * quality from the fastq batch and uses "<read_len>X" as cigar.
 *
 * Changes from the previous version:
 *  - the cigar scratch buffer was a 10-byte static array filled with
 *    sprintf(); it is now a local buffer written with snprintf(), so a large
 *    read_len cannot overflow it and calls do not share it;
 *  - total_mappings was printed but never updated; it now counts the
 *    alignments written from mapping lists;
 *  - size_t values passed to "%lu" are converted to unsigned long;
 *  - the unused `fd` local was removed.
 *
 * @param input  writer configuration (file names and the batch list)
 */
void batch_writer2(batch_writer_input_t* input) {
  printf("START: batch_writer (%i): START, for file %s\n",
         omp_get_thread_num(), input->match_filename);

  bam1_t *bam1;
  bam_header_t *bam_header;
  bam_file_t *bam_file;
  alignment_t *alig;

  char *match_filename = input->match_filename;

  list_t *write_list = input->list_p;
  array_list_t *array_list;

  list_item_t *item = NULL;
  aligner_batch_t *batch = NULL;
  fastq_batch_t *fq_batch = NULL;

  char aux[32];  // room for any 64-bit length plus 'X' and the terminator
  size_t read_len;

  bam_header = bam_header_new(HUMAN, NCBI37, input->header_filename);
  bam_file = bam_fopen_mode(match_filename, bam_header, "w");
  bam_fwrite_header(bam_header, bam_file);

  size_t num_reads = 0, num_items = 0, total_mappings = 0;

  // main loop
  while ( (item = list_remove_item(write_list)) != NULL ) {
    batch = (aligner_batch_t *) item->data_p;
    fq_batch = batch->fq_batch;
    num_reads = batch->num_mapping_lists;

    for (size_t i = 0; i < num_reads; i++) {
      array_list = batch->mapping_lists[i];
      num_items = (array_list == NULL ? 0 : array_list_size(array_list));

      read_len = fq_batch->data_indices[i + 1] - fq_batch->data_indices[i] - 1;

      // mapped or not mapped ?
      if (num_items == 0) {
        // calculating cigar
        snprintf(aux, sizeof aux, "%luX", (unsigned long) read_len);

        alig = alignment_new();
        alignment_init_single_end(&(fq_batch->header[fq_batch->header_indices[i]])+1,
                                  &(fq_batch->seq[fq_batch->data_indices[i]]),
                                  &(fq_batch->quality[fq_batch->data_indices[i]]),
                                  0, 0, 0, aux, 1, 255, 0, 0, alig);

        bam1 = convert_to_bam(alig, 33);
        bam_fwrite(bam1, bam_file);
        bam_destroy1(bam1);

        // these fields point into the fastq batch or at aux: detach them
        // before freeing the alignment (in order to not free twice some fields)
        alig->query_name = NULL;
        alig->sequence = NULL;
        alig->quality = NULL;
        alig->cigar = NULL;

        alignment_free(alig);
      } else {
        for (size_t j = 0; j < num_items; j++) {
          alig = (alignment_t *) array_list_get(j, array_list);
          if (alig != NULL) {
            bam1 = convert_to_bam(alig, 33);
            bam_fwrite(bam1, bam_file);
            bam_destroy1(bam1);
            alignment_free(alig);
            total_mappings++;
          }
        }
      }

      if (array_list != NULL) array_list_free(array_list, NULL);
    }

    if (batch != NULL) aligner_batch_free(batch);
    list_item_free(item);

    // NOTE(review): there is no matching timing_start() in this function -
    // confirm whether one is expected at the top of the loop.
    if (time_on) { timing_stop(BATCH_WRITER, 0, timing_p); }
  } // end of batch loop

  bam_fclose(bam_file);

  printf("END: batch_writer (total mappings %lu)\n", (unsigned long) total_mappings);
}