/*!
  Completes a read group record and appends it to the SAM header.

  For SFF input, the @RG.KS and @RG.FO tags are taken from the SFF input whose
  index equals the number of records of this record's type already present in
  the header.  A missing @RG.SM is filled with "NOSM" and a missing @RG.PG
  with PACKAGE_NAME.  The record must already carry an ID tag.

  @param  io_in   the input sequence readers
  @param  header  the SAM header receiving the record
  @param  record  the read group record to complete and add
 */
static void
tmap_seqs_io_init2_fs_and_add(tmap_seqs_io_t *io_in, sam_header_t *header, sam_header_record_t *record)
{
  // two-character tag name, kept NUL-terminated so it is also a valid C string
  char tag[3] = {'\0', '\0', '\0'};

  // add @RG.KS and @RG.FO
  if(io_in->type == TMAP_SEQ_TYPE_SFF) {
      // the number of records already added selects the matching input file;
      // other callers treat a NULL result as "no records of this type yet"
      sam_header_records_t *records = sam_header_get_records(header, record->tag);
      int32_t n_added = (NULL == records) ? 0 : records->n;

      if(io_in->n <= n_added) tmap_error("Too many read groups specified", Exit, OutOfRange);

      // @RG.KS
      tag[0]='K'; tag[1]='S';
      if(0 == sam_header_record_add(record, tag, tmap_sff_io_get_rg_ks(io_in->seqios[n_added]->io.sffio))) {
          tmap_error("Could not add the KS tag; most likely it is already present", Exit, OutOfRange);
      }

      // @RG.FO
      tag[0]='F'; tag[1]='O';
      if(0 == sam_header_record_add(record, tag, tmap_sff_io_get_rg_fo(io_in->seqios[n_added]->io.sffio))) {
          tmap_error("Could not add the FO tag; most likely it is already present", Exit, OutOfRange);
      }
  }

  // check for the @RG.ID and @RG.SM tags
  if(NULL == sam_header_record_get(record, "ID")) tmap_bug(); // should not happen
  if(NULL == sam_header_record_get(record, "SM")) {
      if(0 == sam_header_record_add(record, "SM", "NOSM")) tmap_bug(); // dummy SM, for Picard validation
  }
  if(NULL == sam_header_record_get(record, "PG")) {
      if(0 == sam_header_record_add(record, "PG", PACKAGE_NAME)) tmap_bug(); // dummy PG
  }

  // add the read group
  if(0 == sam_header_add_record(header, record)) tmap_bug();
}
/*!
  Looks up a packed-sequence position for suffix array index `k`.

  The implementation is chosen at compile time by TMAP_SA_RUN_TYPE:
    - 1: only the optimized routine (tmap_sa_pac_pos_aux) is called;
    - 0: only the original routine (tmap_sa_pac_pos_orig) is called;
    - 2: both are called, tmap_bug() is raised when they disagree, and the
         original's value is returned;
    - any other value: both are called and the original's value is returned.

  @param  sa   the suffix array
  @param  bwt  the BWT
  @param  k    the suffix array index to look up
  @return      the value computed by the selected routine
 */
tmap_bwt_int_t
tmap_sa_pac_pos(const tmap_sa_t *sa, const tmap_bwt_t *bwt, tmap_bwt_int_t k)
{
#if TMAP_SA_RUN_TYPE == 1
  // optimized only
  return tmap_sa_pac_pos_aux(sa, bwt, k);
#elif TMAP_SA_RUN_TYPE == 0
  // original only
  return tmap_sa_pac_pos_orig(sa, bwt, k);
#else
  // run both, original first
  tmap_bwt_int_t from_orig = tmap_sa_pac_pos_orig(sa, bwt, k);
  tmap_bwt_int_t from_aux = tmap_sa_pac_pos_aux(sa, bwt, k);
#if TMAP_SA_RUN_TYPE == 2
  // cross-check the two implementations
  if(from_orig != from_aux) {
      tmap_bug();
  }
#else
  (void)from_aux;
#endif
  return from_orig;
#endif
}
/*!
  Reads the integer value of the "ZA" auxiliary tag of a SAM record.

  @param  sam  the SAM record wrapper; its BAM record must not be NULL
  @return      the value of the ZA tag, or -1 when the tag is absent
 */
int32_t
tmap_sam_get_za(tmap_sam_t *sam)
{
  uint8_t *za = NULL;

  if(NULL == sam->b) tmap_bug(); // nothing to look the tag up in

  za = bam_aux_get(sam->b, "ZA");
  if(NULL == za) {
      return -1;
  }
  return bam_aux2i(za);
}
// NB: would require sorting to get the commands sorted void tmap_help_unknown_cmd(const char *cmd) { int32_t i, n, best=INT32_MAX, best_n=0; uint64_t *distances = NULL; tmap_command_t *c = NULL; // get # of commands n = 0; c = commands; while(0 <= c->type) { n++; c++; } distances = tmap_malloc(sizeof(uint64_t)*n, "distances"); for(i=0;0 <= commands[i].type;i++) { if(0 == strcmp(cmd, commands[i].name)) tmap_bug(); if(!tmap_prefixcmp(commands[i].name, cmd)) { distances[i] = ((uint64_t)i << 32); // zero score } else { distances[i] = ((uint64_t)i << 32) | (tmap_levenshtein(cmd, commands[i].name, 0, 2, 1, 4) + 1); // pack } if(__get_distance(distances[i]) < best) { best = __get_distance(distances[i]); best_n = 0; } else if(__get_distance(distances[i]) == best) { best_n++; } //fprintf(stderr, "%s -> %u\n", commands[distances[i]>>32].name, (__get_distance(distances[i]))); } if(0 == best && n == best_n) { // matches everything best = TMAP_HELP_SIMILARITY_FLOOR + 1; } n = i; // output similar matches fprintf(stderr, "%s: '%s' is not a tmap command. See 'tmap --help'.\n", PACKAGE, cmd); if(TMAP_HELP_SIMILAR_ENOUGH(best)) { fprintf(stderr, "\nDid you mean %s?\n", i < 2 ? "this": "one of these"); for (i = 0; i < n; i++) if(best == __get_distance(distances[i])) { fprintf(stderr, "\t%s\n", commands[__get_name_idx(distances[i])].name); } } free(distances); }
/*!
  Removes and returns the most recently pushed entry of the best-scoring bin.

  After the removal `best_score` is updated: INT32_MAX when the stack became
  empty, otherwise the index of the first non-empty bin at or after the old
  best score.

  @param  stack  the stack to pop from
  @return        the removed entry, or NULL when the stack holds no entries
 */
static inline tmap_map1_aux_stack_entry_t *
tmap_map1_aux_stack_pop(tmap_map1_aux_stack_t *stack)
{
  tmap_map1_aux_bin_t *top_bin = NULL;
  tmap_map1_aux_stack_entry_t *popped = NULL;
  int32_t score;

  if(0 == stack->n_entries) {
      return NULL;
  }

  // the best bin must hold at least one entry while the stack is non-empty
  top_bin = &stack->bins[stack->best_score];
  if(0 == top_bin->n_entries) {
      tmap_bug();
  }

  // detach the last entry of that bin
  top_bin->n_entries--;
  popped = top_bin->entries[top_bin->n_entries];
  top_bin->entries[top_bin->n_entries] = NULL;
  stack->n_entries--;

  if(0 == stack->n_entries) {
      // nothing left anywhere
      stack->best_score = INT32_MAX;
      return popped;
  }
  if(0 != top_bin->n_entries) {
      // the current best bin still has entries
      return popped;
  }

  // the best bin ran dry: scan upwards for the next non-empty bin
  score = stack->best_score;
  while(score < stack->n_bins && !(0 < stack->bins[score].n_entries)) {
      score++;
  }
  if(score < stack->n_bins) {
      stack->best_score = score;
  }
  else if(score == stack->n_bins) {
      tmap_bug(); // entries remain, so some bin must be non-empty
  }

  return popped;
}
static int tmap_usage(int argc, char *argv[]) { tmap_command_t *c = NULL; int i, t; tmap_version(argc, argv); c = commands; t = -1; while(0 <= c->type) { if(c->type != t) { if(0 <= t && TMAP_COMMAND_NONE != c->type) fprintf(stderr, "\n"); switch(c->type) { case TMAP_COMMAND_PREPROCESSING: fprintf(stderr, "%sPre-processing:%s\n", KRED, KNRM); break; case TMAP_COMMAND_SERVER: fprintf(stderr, "%sServer:%s\n", KRED, KNRM); break; case TMAP_COMMAND_MAPPING: fprintf(stderr, "%sMapping:%s\n", KRED, KNRM); break; case TMAP_COMMAND_UTILITIES: fprintf(stderr, "%sUtilities:%s\n", KRED, KNRM); break; #ifdef ENABLE_TMAP_DEBUG_FUNCTIONS case TMAP_COMMAND_DEBUG: fprintf(stderr, "%sDebugging:%s\n", KRED, KNRM); break; #endif case TMAP_COMMAND_NONE: break; default: fprintf(stderr, "c->type=%d\n", c->type); tmap_bug(); } t = c->type; } if(c->type != TMAP_COMMAND_NONE) { fprintf(stderr, " %s%s%s", KCYN, c->name, KNRM); for(i=strlen(c->name);i<16;i++) fputc(' ', stderr); fprintf(stderr, "%s%s%s\n", KWHT, c->help, KNRM); } c++; } return 1; }
/*!
  Assigns mapping qualities to a set of map1 alignments.

  The alignments are sorted with tmap_map1_sam_sort_score, after which every
  alignment tied with the first one receives the value returned by
  tmap_map1_sam_mapq, and all remaining alignments receive zero.

  @param  sams     the alignments to update (sorted in place)
  @param  seq_len  unused
  @param  opt      the mapping options (max_mm is read)
  @return          always 0
 */
static int32_t
tmap_map1_mapq(tmap_map_sams_t *sams, int32_t seq_len, tmap_map_opt_t *opt)
{
  int32_t idx;
  int32_t n_top = 0, n_top_sa = 0;

  if(0 == sams->n) {
      return 0;
  }

  // sort by decreasing score
  tmap_sort_introsort(tmap_map1_sam_sort_score, sams->n, sams->sams);

  // count the alignments that share the top score
  while(n_top < sams->n) {
      if(0 < n_top && sams->sams[n_top-1].score < sams->sams[n_top].score) { // must be non-increasing
          tmap_bug();
      }
      if(sams->sams[n_top].score < sams->sams[0].score) {
          break;
      }
      n_top++;
      n_top_sa++;
  }

  // top-scoring hits get a computed quality, everything else gets zero
  for(idx=0;idx<sams->n;idx++) {
      if(idx < n_top) {
          sams->sams[idx].mapq = tmap_map1_sam_mapq(n_top_sa,
                                                   sams->sams[idx].aux.map1_aux->num_all_sa,
                                                   opt->max_mm,
                                                   sams->sams[idx].aux.map1_aux->n_mm);
      }
      else {
          sams->sams[idx].mapq = 0;
      }
  }

  return 0;
}
/*!
  Compares two BWTs by exhaustively matching every k-mer, for k = 1..length.

  For each length, all sequences over the codes 0..3 are enumerated in
  seqs[0]; seqs[1] holds the same sequence reversed with each code c replaced
  by 3 - c (presumably the reverse complement in a 2-bit encoding).  Both
  sequences are matched against both BWTs with tmap_bwt_match_exact_reverse;
  any disagreement in the hit count or the returned interval is reported and
  raises tmap_bug().  The hit counts of the forward sequences are summed and
  compared with (seq_len - k + 1).

  @param  bwt        the two BWTs to compare
  @param  length     the largest k-mer length to test
  @param  print_msg  1 to print a pass/fail message for each length
  @param  warn       0 to exit with an error when the sum check fails
 */
void
tmap_bwt_compare_core2(tmap_bwt_t *bwt[2], int32_t length, int32_t print_msg, int32_t warn)
{
  uint8_t *seqs[2] = {NULL,NULL};
  int32_t i, asymmetric, k, m, failed;
  int64_t sum, j;
  tmap_bwt_match_occ_t sa[2];
  tmap_bwt_int_t n[2][2];

  for(i=1;i<=length;i++) {
      seqs[0] = tmap_calloc(i, sizeof(uint8_t), "seqs[0]");
      seqs[1] = tmap_calloc(i, sizeof(uint8_t), "seqs[1]");

      for(j=0;j<i;j++) {
          seqs[1][j] = 3;
      }

      asymmetric = 0;
      j = 0;
      sum = 0;
      while(1) {
          if(i == j) { // a full sequence of length i is ready in seqs[0]
              for(k=0;k<i;k++) {
                  seqs[1][k] = 3 - seqs[0][i-k-1];
              }
              for(k=0;k<2;k++) {
                  for(m=0;m<2;m++) {
                      n[m][k] = tmap_bwt_match_exact_reverse(bwt[m], i, seqs[k], &sa[m]);
                  }
                  if(n[0][k] != n[1][k]
                     || sa[0].k != sa[1].k
                     || sa[0].l != sa[1].l
                     || sa[0].hi != sa[1].hi
                     || sa[0].offset != sa[1].offset) {
                      // the casts make the arguments match %llu whatever the field widths are
                      tmap_progress_print2("BWTs did not match");
                      tmap_progress_print2("n=[%llu,%llu]", (unsigned long long)n[0][k], (unsigned long long)n[1][k]);
                      tmap_progress_print2("k=[%llu,%llu]", (unsigned long long)sa[0].k, (unsigned long long)sa[1].k);
                      tmap_progress_print2("l=[%llu,%llu]", (unsigned long long)sa[0].l, (unsigned long long)sa[1].l);
                      tmap_progress_print2("hi=[%llu,%llu]", (unsigned long long)sa[0].hi, (unsigned long long)sa[1].hi);
                      tmap_progress_print2("offset=[%llu,%llu]", (unsigned long long)sa[0].offset, (unsigned long long)sa[1].offset);
                      tmap_bug();
                  }
              }
              // count the hits of the forward sequence on the first BWT
              // NOTE(review): sa[0] here holds the interval of the last match
              // above (k == 1), not of the forward sequence -- confirm intended
              if(0 < n[0][0] && TMAP_BWT_INT_MAX != sa[0].k && sa[0].k <= sa[0].l) {
                  sum += n[0][0];
              }
              if(0 == asymmetric && n[0][0] != n[0][1]) { // warn once per length
                  asymmetric = 1;
                  tmap_error("Asymmetry found", Warn, OutOfRange);
              }
              // advance to the next sequence: roll over trailing 3s, then
              // increment the last position that is not 3
              j--;
              while(0 <= j && 3 == seqs[0][j]) {
                  seqs[0][j] = 0;
                  j--;
              }
              if(j < 0) break; // every sequence of this length has been visited
              seqs[0][j]++;
              j++;
          }
          else { // keep extending with the current value at position j
              j++;
          }
      }

      free(seqs[0]);
      free(seqs[1]);
      seqs[0] = seqs[1] = NULL;

      failed = (sum == (bwt[0]->seq_len - i + 1)) ? 0 : 1;
      if(1 == print_msg) {
          if(0 == failed) tmap_progress_print2("%d-mer validation passed", i);
          else tmap_progress_print2("%d-mer validation failed: observed (%llu) != expected (%llu)\n",
                                    i,
                                    (unsigned long long)sum,
                                    (unsigned long long)(bwt[0]->seq_len - i + 1));
      }
      if(0 == warn && 1 == failed) {
          tmap_error("inconsistency found in the BWT", Exit, OutOfRange);
      }
  }
}
int tmap_seqs_io_sff2sam_main(int argc, char *argv[]) { int c, help = 0; tmap_seqs_io_t *io_in = NULL; tmap_seqs_t *seqs = NULL; char **sam_rg = NULL; int32_t sam_rg_num = 0; int bidirectional = 0, sam_flowspace_tags = 0; int out_type = 0; tmap_sam_io_t *io_out = NULL; bam_header_t *header = NULL; // BAM Header int32_t i; /* uint8_t *key_seq = NULL; int key_seq_len = 0; */ while((c = getopt(argc, argv, "DGR:Yvh")) >= 0) { switch(c) { case 'D': bidirectional = 1; break; case 'G': break; case 'R': sam_rg = tmap_realloc(sam_rg, (1+sam_rg_num) * sizeof(char*), "sam_rg"); sam_rg[sam_rg_num] = tmap_strdup(optarg); sam_rg_num++; break; case 'Y': sam_flowspace_tags = 1; break; case 'v': tmap_progress_set_verbosity(1); break; case 'h': help = 1; break; default: return 1; } } if(1 != argc - optind || 1 == help) { tmap_file_fprintf(tmap_file_stderr, "Usage: %s %s [-R -Y -v -h] <in.sff>\n", PACKAGE, argv[0]); return 1; } // input io_in = tmap_seqs_io_init(&argv[optind], 1, TMAP_SEQ_TYPE_SFF, TMAP_FILE_NO_COMPRESSION, 0l, 0l); // BAM Header header = tmap_seqs_io_to_bam_header(NULL, io_in, sam_rg, sam_rg_num, argc, argv); // open the output file switch(out_type) { case 0: // SAM io_out = tmap_sam_io_init2("-", "wh", header); break; case 1: io_out = tmap_sam_io_init2("-", "wb", header); break; case 2: io_out = tmap_sam_io_init2("-", "wbu", header); break; default: tmap_bug(); } // destroy the BAM Header bam_header_destroy(header); header = NULL; seqs = tmap_seqs_init(TMAP_SEQ_TYPE_SFF); while(0 < tmap_seqs_io_read(io_in, seqs, io_out->fp->header->header)) { bam1_t *b = NULL; tmap_seq_t *seq = seqs->seqs[0]; b = tmap_sam_convert_unmapped(seq, sam_flowspace_tags, bidirectional, NULL, 0, 0, 0, 0, 0, 0, "\tlq:i:%d\trq:i:%d\tla:i:%d\trq:i:%d", seq->data.sff->rheader->clip_qual_left, seq->data.sff->rheader->clip_qual_right, seq->data.sff->rheader->clip_adapter_left, seq->data.sff->rheader->clip_adapter_right); if(samwrite(io_out->fp, b) <= 0) { tmap_error("Error writing the SAM file", 
Exit, WriteFileError); } bam_destroy1(b); tmap_seqs_destroy(seqs); seqs = tmap_seqs_init(TMAP_SEQ_TYPE_SFF); } tmap_seqs_destroy(seqs); // free memory tmap_seqs_io_destroy(io_in); tmap_sam_io_destroy(io_out); for(i=0;i<sam_rg_num;i++) { free(sam_rg[i]); } free(sam_rg); return 0; }
/*!
  Builds the output BAM header for a run.

  - @HD: for SAM/BAM input the input header is parsed or cloned; otherwise a
    new header with @HD VN:1.4 is created.
  - @PG: a program ID is chosen that is not yet present in the header
    (PACKAGE_NAME, then PACKAGE_NAME.1, PACKAGE_NAME.2, ...); the ID tried
    just before it becomes @PG.PP.  VN and CL (the joined argv) are added.
  - @SQ: when `refseq` is given, existing @SQ records are removed and one
    record per reference sequence is added.
  - @RG: taken from the command line tags (not allowed with SAM/BAM input),
    or dummy records are created (one per input file), or, for SAM/BAM input,
    the existing records get default SM/PG tags when missing.

  @param  refseq      the reference sequence, or NULL to leave @SQ untouched
  @param  io_in       the input sequence readers
  @param  rg_sam      the read group tags from the command line ("XX:value")
  @param  rg_sam_num  the number of entries in rg_sam
  @param  argc        the number of command line arguments
  @param  argv        the command line arguments, recorded in @PG.CL
  @return             the newly created BAM header
 */
bam_header_t *
tmap_seqs_io_to_bam_header(tmap_refseq_t *refseq,
                           tmap_seqs_io_t *io_in,
                           char **rg_sam, int32_t rg_sam_num,
                           int32_t argc, char *argv[])
{
  bam_header_t *bam_header = NULL;
  sam_header_t *header = NULL; // the output header
  sam_header_record_t *record = NULL;
  sam_header_record_t **record_list = NULL;
  char tag[3] = {'\0', '\0', '\0'}; // two-character tag name, NUL-terminated
  char *command_line = NULL;
  size_t command_line_size = 0;
  char *id = NULL;
  char *id_pp = NULL;
  int32_t i, j;

  // @HD
  if(io_in->type == TMAP_SEQ_TYPE_SAM || io_in->type == TMAP_SEQ_TYPE_BAM) {
      // should be only one input file
      if(1 != io_in->n) {
          tmap_bug();
      }
      // get the current header
      if(NULL == io_in->seqios[0]) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio->fp->header) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio->fp->header->header) {
          // no parsed header yet: parse the text
          header = sam_header_parse2(io_in->seqios[0]->io.samio->fp->header->text);
      }
      else {
          // clone the header so the input's copy is left untouched
          header = sam_header_clone(io_in->seqios[0]->io.samio->fp->header->header);
      }
      if(NULL == header) tmap_bug();
  }
  else {
      // empty header
      header = sam_header_init();
      // @HD - header line
      record = sam_header_record_init("HD"); // new header line
      if(0 == sam_header_record_add(record, "VN", "1.4")) tmap_bug(); // version number
      if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the header line
      // nullify
      record = NULL;
  }

  // Get the TMAP program ID
  id = tmap_malloc(sizeof(char) * (1 + strlen(PACKAGE_NAME)), "id");
  strcpy(id, PACKAGE_NAME); // default
  for(i=j=0;NULL != (record_list = sam_header_get_record(header, "PG", "ID", id, &i)) && 0 < i;i=0) { // while the id is found
      char *ptr = NULL;
      size_t id_size;
      // swap id and id_pp: the ID that was found becomes the previous program
      ptr = id_pp;
      id_pp = id;
      id = ptr;
      // create the new ID: name + '.' + digits of j + NUL
      j++;
      id_size = 1 + (int)log10(j) + 1 + strlen(PACKAGE_NAME) + 1;
      id = tmap_realloc(id, sizeof(char) * id_size, "id");
      if(snprintf(id, id_size, "%s.%d", PACKAGE_NAME, j) < 0) tmap_bug();
      free(record_list);
      record_list = NULL;
  }
  // release a list that ended the loop without matches (no-op when NULL)
  free(record_list);
  record_list = NULL;

  // @SQ
  if(NULL != refseq) {
      // Existing @SQ records are not checked against the reference; they are
      // removed and replaced by the records of the reference.
      if(NULL != sam_header_get_records(header, "SQ")) {
          sam_header_remove_records(header, "SQ");
      }
      for(i=0;i<refseq->num_annos;i++) { // for each reference sequence
          char num[32];
          record = sam_header_record_init("SQ"); // new reference sequence record
          if(0 == sam_header_record_add(record, "SN", refseq->annos[i].name->s)) tmap_bug(); // reference sequence name
          if(sprintf(num, "%u", (uint32_t)refseq->annos[i].len) < 0) tmap_bug(); // integer to string
          if(0 == sam_header_record_add(record, "LN", num)) tmap_bug(); // reference sequence length
          if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the reference sequence record
      }
  }

  // @RG - read group
  if(0 < rg_sam_num) { // @RG specified on the command line
      sam_header_records_t *records = NULL;
      // Check for SAM/BAM
      // TODO: this should be possible...
      if(io_in->type == TMAP_SEQ_TYPE_SAM || io_in->type == TMAP_SEQ_TYPE_BAM) {
          tmap_error("Cannot specify the read groups on the command line when using SAM/BAM as input."
                     "  Please embed in the SAM/BAM header instead.", Exit, OutOfRange);
      }
      record = NULL;
      // go through the command line arguments
      for(i=0;i<rg_sam_num;i++) {
          if(strlen(rg_sam[i]) < 4) tmap_error("Read group too small", Exit, OutOfRange);
          if(':' != rg_sam[i][2]) tmap_error("Read group improperly formatted (no colon)", Exit, OutOfRange);

          // an ID tag starts a new read group
          if('I' == rg_sam[i][0] && 'D' == rg_sam[i][1]) {
              if(NULL != record) { // finish the previous read group
                  tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
              }
              record = sam_header_record_init("RG"); // new read group
          }
          // add the tag/value to the record
          if(NULL == record) {
              tmap_error("The read group ID must be specified first", Exit, OutOfRange);
          }
          tag[0]=rg_sam[i][0]; tag[1]=rg_sam[i][1]; // setup the tag
          if(0 == sam_header_record_add(record, tag, rg_sam[i]+3)) tmap_bug(); // add the tag/value
      }
      if(NULL != record) { // finish the last read group
          tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
      }
      // check that the # of read groups added was the same as the # of input files...
      records = sam_header_get_records(header, "RG");
      if(NULL == records || records->n != io_in->n) {
          tmap_error("The number of read groups did not match the number of input files", Exit, OutOfRange);
      }
  }
  else if(io_in->type != TMAP_SEQ_TYPE_SAM && io_in->type != TMAP_SEQ_TYPE_BAM) { // dummy...
      for(i=0;i<io_in->n;i++) { // for each input file
          char buf[32];
          record = sam_header_record_init("RG"); // new read group
          if(1 == io_in->n) strcpy(buf, "NOID");
          else if(sprintf(buf, "NOID.%d", i+1) < 0) tmap_bug();
          if(0 == sam_header_record_add(record, "ID", buf)) tmap_bug(); // dummy ID
          if(0 == sam_header_record_add(record, "SM", "NOSM")) tmap_bug(); // dummy SM, for Picard validation
          if(0 == sam_header_record_add(record, "PG", id)) tmap_bug(); // dummy PG
          tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
      }
  }
  else { // check that SM/PG are present
      // the input header may carry no @RG records at all
      sam_header_records_t *records = sam_header_get_records(header, "RG");
      if(NULL != records) {
          for(i=0;i<records->n;i++) {
              record = records->records[i];
              if(NULL == sam_header_record_get(record, "ID")) tmap_error("Missing @RG.ID in the SAM/BAM Header", Exit, OutOfRange);
              if(NULL == sam_header_record_get(record, "SM")) {
                  if(0 == sam_header_record_add(record, "SM", "NOSM")) tmap_bug(); // dummy SM, for Picard validation
              }
              if(NULL == sam_header_record_get(record, "PG")) {
                  if(0 == sam_header_record_add(record, "PG", id)) tmap_bug(); // dummy PG
              }
          }
      }
  }

  // @PG - program group
  record = sam_header_record_init("PG"); // new program group
  if(0 == sam_header_record_add(record, "ID", id)) tmap_bug(); // @PG.ID
  if(0 == sam_header_record_add(record, "VN", PACKAGE_VERSION)) tmap_bug(); // @PG.VN
  // @PG.CL: the arguments joined by single spaces
  command_line_size = 1; // for the terminating NUL
  for(i=0;i<argc;i++) {
      if(0 < i) command_line_size++; // separator
      command_line_size += strlen(argv[i]);
  }
  command_line = tmap_malloc(sizeof(char) * command_line_size, "command_line");
  command_line[0] = '\0';
  for(i=0;i<argc;i++) {
      if(0 < i) strcat(command_line, " ");
      strcat(command_line, argv[i]);
  }
  if(0 == sam_header_record_add(record, "CL", command_line)) tmap_bug(); // @PG.CL
  if(NULL != id_pp) { // a previous program exists
      if(0 == sam_header_record_add(record, "PP", id_pp)) tmap_bug(); // @PG.PP
  }
  if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the record
  free(command_line);

  // Check the new SAM Header
  if(0 == sam_header_check(header)) {
      tmap_error("SAM Header was not consistent", Exit, OutOfRange);
  }

  // Create a BAM Header from the SAM Header
  bam_header = bam_header_init(); // empty
  bam_header->header = header; // soft-copy the header
  bam_header = sam_header_to_bam_header(bam_header); // convert

  // free memory
  free(id);
  free(id_pp);

  return bam_header;
}
/*!
  Pushes a new search state onto the stack.

  The entry is taken from the stack's entry pool (which is grown when
  exhausted), filled in from the arguments and stored in the bin indexed by
  its alignment score.  Bins are grown when the score exceeds the current
  number of bins, and `best_score` is lowered when the new score is smaller.

  @param  stack          the stack to push onto
  @param  offset         the offset stored in the entry
  @param  match_sa_prev  the SA interval copied into the entry
  @param  n_mm           the number of mismatches
  @param  n_gapo         the number of gap opens
  @param  n_gape         the number of gap extensions
  @param  state          the state stored in the entry
  @param  is_diff        1 if this entry introduces a difference at `offset`
  @param  prev_entry     the entry this one was derived from, or NULL
  @param  opt            the options used to compute the score
 */
static inline void
tmap_map1_aux_stack_push(tmap_map1_aux_stack_t *stack,
                         int32_t offset,
                         tmap_bwt_match_occ_t *match_sa_prev,
                         int32_t n_mm, int32_t n_gapo, int32_t n_gape,
                         int32_t state, int32_t is_diff,
                         tmap_map1_aux_stack_entry_t *prev_entry,
                         const tmap_map_opt_t *opt)
{
  int32_t i;
  int32_t n_bins_needed = 0;
  tmap_map1_aux_stack_entry_t *entry = NULL;
  tmap_map1_aux_bin_t *bin = NULL;

  // check to see if we need more memory
  if(stack->entry_pool_length <= stack->entry_pool_i) {
      int32_t old_length = stack->entry_pool_length;
      stack->entry_pool_length <<= 2;
      stack->entry_pool = tmap_realloc(stack->entry_pool,
                                       sizeof(tmap_map1_aux_stack_entry_t*)*stack->entry_pool_length,
                                       "stack->entry_pool");
      // allocate the entries for the newly added slots
      for(i=old_length;i<stack->entry_pool_length;i++) {
          stack->entry_pool[i] = tmap_malloc(sizeof(tmap_map1_aux_stack_entry_t), "stack->entry_pool[i]");
      }
  }

  // fill in the next free entry of the pool
  entry = stack->entry_pool[stack->entry_pool_i];
  entry->score = aln_score(n_mm, n_gapo, n_gape, opt);
  entry->n_mm = n_mm;
  entry->n_gapo = n_gapo;
  entry->n_gape = n_gape;
  entry->state = state;
  entry->match_sa = (*match_sa_prev);
  entry->i = stack->entry_pool_i;
  entry->offset = offset;
  if(NULL == prev_entry) {
      entry->last_diff_offset = offset;
      entry->prev_i = -1;
  }
  else {
      entry->last_diff_offset = (1 == is_diff) ? (offset) : prev_entry->last_diff_offset;
      entry->prev_i = prev_entry->i;
  }

  // resize the bins if necessary
  if(stack->n_bins <= entry->score) {
      n_bins_needed = entry->score + 1;
      tmap_roundup32(n_bins_needed);
      stack->bins = tmap_realloc(stack->bins, sizeof(tmap_map1_aux_bin_t) * n_bins_needed, "stack->bins");
      // initialize the new bins
      for(i=stack->n_bins;i<n_bins_needed;i++) {
          stack->bins[i].n_entries = stack->bins[i].m_entries = 0;
          stack->bins[i].entries = NULL;
      }
      stack->n_bins = n_bins_needed;
  }
  if(stack->n_bins <= entry->score) {
      tmap_bug();
  }
  bin = &stack->bins[entry->score];

  // NB: duplicates (most likely formed by tandem repeats or indels) are not
  // removed here

  // update best score
  if(stack->best_score > entry->score) stack->best_score = entry->score;

  // grow the bin; its elements are entry pointers, so size by the element type
  if(bin->m_entries <= bin->n_entries) {
      bin->m_entries++;
      tmap_roundup32(bin->m_entries);
      bin->entries = tmap_realloc(bin->entries, sizeof(*bin->entries) * bin->m_entries, "bin->entries");
  }
  bin->entries[bin->n_entries] = entry;
  bin->n_entries++;

  stack->entry_pool_i++;
  stack->n_entries++;
}
/*!
  Attaches header records and flow space information to a read.

  - Read group: SAM/BAM reads are matched by their read group ID (exactly one
    @RG record must carry it); reads without an ID use the `idx`-th @RG
    record when the header has @RG records.
  - Program group: the last @PG record of the header, or NULL.
  - Flow space: when a read group record is set, the key sequence (@RG.KS)
    and flow order (@RG.FO) are looked up, the flow order start index is
    derived and the flowgram is fetched.

  @param  seq     the read to update
  @param  idx     the index of the @RG record to use when the read has no ID
  @param  header  the SAM header to look the records up in
 */
void
tmap_seq_update(tmap_seq_t *seq, int32_t idx, sam_header_t *header)
{
  char *rg_id = NULL;
  sam_header_records_t *rg_records = NULL;
  sam_header_records_t *pg_records = NULL;

  // Read Group: only SAM/BAM reads can carry their own ID
  if(TMAP_SEQ_TYPE_SAM == seq->type || TMAP_SEQ_TYPE_BAM == seq->type) {
      rg_id = tmap_sam_get_rg_id(seq->data.sam);
  }
  else if(TMAP_SEQ_TYPE_FQ != seq->type && TMAP_SEQ_TYPE_SFF != seq->type) {
      tmap_error("type is unrecognized", Exit, OutOfRange);
  }

  if(NULL != rg_id) { // found in SAM/BAM
      int32_t n_found = 0;
      sam_header_record_t **found = sam_header_get_record(header, "RG", "ID", rg_id, &n_found);
      if(0 == n_found) {
          fprintf(stderr, "Read Group Identifier: [%s]\n", rg_id);
          tmap_error("Did not find the @RG.ID in the SAM/BAM Header", Exit, OutOfRange);
      }
      else if(1 < n_found) {
          fprintf(stderr, "Read Group Identifier: [%s]\n", rg_id);
          tmap_error("Found more than one @RG.ID in the SAM/BAM Header", Exit, OutOfRange);
      }
      seq->rg_record = found[0];
      free(found); // only the list is released; the record is shared
  }
  else { // did not find in SAM/BAM
      // NB: assume that it is from the ith record in the header
      rg_records = sam_header_get_records(header, "RG");
      if(NULL != rg_records) {
          if(idx < 0 || rg_records->n <= idx) {
              tmap_error("RG records index was out of bounds", Exit, OutOfRange);
          }
          seq->rg_record = rg_records->records[idx];
          if(NULL == seq->rg_record) tmap_bug();
      }
  }

  // Program Group
  // NB: assumes the last item in the header
  pg_records = sam_header_get_records(header, "PG");
  seq->pg_record = (NULL != pg_records && 0 < pg_records->n)
    ? pg_records->records[pg_records->n-1]
    : NULL;

  // key sequence and flow order
  seq->fo_start_idx = -1;
  if(NULL == seq->rg_record) {
      return; // no read group, no flow space information
  }

  seq->ks = sam_header_record_get(seq->rg_record, "KS");
  seq->fo = sam_header_record_get(seq->rg_record, "FO");

  // flow order index start
  if(NULL != seq->ks && NULL != seq->fo && TMAP_SEQ_TYPE_SFF == seq->type) {
      // SFF only: also removes the key sequence and trimming
      seq->fo_start_idx = tmap_seq_remove_key_sequence(seq, 1);
  }
  else if(TMAP_SEQ_TYPE_SAM == seq->type || TMAP_SEQ_TYPE_BAM == seq->type) {
      // Try the ZF tag...
      seq->fo_start_idx = tmap_sam_get_fo_start_idx(seq->data.sam);
  }

  // flowgram information...
  seq->flowgram_len = tmap_seq_get_flowgram(seq, &seq->flowgram);
}
static inline void tmap_map3_aux_core_seed_helper(uint8_t *query, int32_t query_length, int32_t offset, uint8_t *flow_order, uint8_t flow_i, int32_t hp_diff, tmap_refseq_t *refseq, tmap_bwt_t *bwt, tmap_sa_t *sa, tmap_bwt_match_hash_t *hash, tmap_map_opt_t *opt, tmap_map3_aux_seed_t **seeds, int32_t *n_seeds, int32_t *m_seeds, int32_t seed_length) { int32_t i, k; int32_t n_bases;; tmap_bwt_match_occ_t prev_sa, cur_sa, next_sa, tmp_sa; if(query_length <= offset) return; if(flow_order[flow_i] != query[offset]) { tmap_bug(); } // initialize prev prev_sa.k = 0; prev_sa.l = bwt->seq_len; prev_sa.hi = 0; prev_sa.offset = 0; i = offset; while(i < query_length) { // reached the seed length if(seed_length < i - offset) { break; } // get the homopolymer length n_bases = 0; if(flow_order[flow_i] == query[i]) { // non-empty flow n_bases = 1; while(i + n_bases < query_length && query[i] == query[i+n_bases]) { n_bases++; } } // move through the homopolymer, trying deletions if possible next_sa = prev_sa; for(k=0;k<n_bases;k++) { // reached the seed length if(seed_length < i - offset + k) { break; } // only delete if there are bases available and we are not deleting // the entire first flow int32_t bases_to_align = seed_length - (i - offset + k); int32_t bases_left = query_length - i - n_bases; if(0 < n_bases - k // bases to delete && n_bases - k <= hp_diff // not too many to delete && (i != offset || 0 != k) // do not delete the entire flow && bases_to_align <= bases_left) { // enough bases // match exactly from here onwards tmp_sa = next_sa; if(0 < tmap_bwt_match_hash_exact_alt(bwt, bases_to_align, query + i + n_bases, &tmp_sa, hash) && (tmp_sa.l - tmp_sa.k + 1) <= opt->max_seed_hits) { tmap_map3_aux_seed_add(seeds, n_seeds, m_seeds, tmp_sa.k, tmp_sa.l, offset, seed_length + n_bases - k); } } // move past this base in the hp tmp_sa = next_sa; tmap_bwt_match_hash_2occ(bwt, &tmp_sa, flow_order[flow_i], &next_sa, hash); if(next_sa.l < next_sa.k) { // no match, return return; } } 
// insert hp bases if(i + n_bases < offset + seed_length) { // not the last flow cur_sa = next_sa; // already considered the 'n_bases' of this flow // insert for(k=1;k<=hp_diff;k++) { // # of bases to insert tmap_bwt_match_hash_2occ(bwt, &cur_sa, flow_order[flow_i], &tmp_sa, hash); if(tmp_sa.l < tmp_sa.k) { // no match, do not continue break; } // match exactly from here onwards if(0 < tmap_bwt_match_hash_exact_alt(bwt, seed_length - (i - offset) - n_bases, query + i + n_bases, &tmp_sa, hash) && (tmp_sa.l - tmp_sa.k + 1) <= opt->max_seed_hits) { tmap_map3_aux_seed_add(seeds, n_seeds, m_seeds, tmp_sa.k, tmp_sa.l, offset, seed_length + k); } // move to the next cur_sa = tmp_sa; } } // next flow flow_i = (1+flow_i) & 3; i += n_bases; prev_sa = next_sa; } if(i - offset < seed_length) tmap_bug(); // add in the seed with no hp indels if((next_sa.l - next_sa.k + 1) <= opt->max_seed_hits) { tmap_map3_aux_seed_add(seeds, n_seeds, m_seeds, next_sa.k, next_sa.l, offset, seed_length); } }
/*
 * Collects seeds for a query and appends them to the seed list.
 *
 * When opt->hp_diff is positive, seeds are generated in flow space: for the
 * start of every homopolymer run at which a full seed still fits, the
 * matching flow is located and tmap_map3_aux_core_seed_helper is called.
 *
 * Otherwise exact seeds of seed_length bases are searched, scanning the query
 * left to right (fwd_search == 1) or right to left.  A seed is kept when its
 * SA interval holds at most opt->max_seed_hits hits; if it holds more and
 * seed_step is positive, further exact matches are tried in steps of
 * seed_step bases until one is small enough.  After the scan all seeds are
 * discarded when the fraction of kept seeds (j) among seed positions with an
 * exact hit (count) is below opt->hit_frac.
 *
 * refseq and sa are only passed through to the helper.
 */
static inline void
tmap_map3_aux_core_seed(uint8_t *query,
                        int32_t query_length,
                        uint8_t *flow_order,
                        int32_t hp_diff,
                        tmap_refseq_t *refseq,
                        tmap_bwt_t *bwt,
                        tmap_sa_t *sa,
                        tmap_bwt_match_hash_t *hash,
                        tmap_map_opt_t *opt,
                        tmap_map3_aux_seed_t **seeds,
                        int32_t *n_seeds,
                        int32_t *m_seeds,
                        int32_t seed_length,
                        int32_t seed_step,
                        int32_t fwd_search)
{
  int32_t i, j, flow_i;

  if(0 < opt->hp_diff) {
      // flow space seeding
      i=flow_i=0;
      while(i<query_length - seed_length + 1) {
          // move to the next flow
          j=0;
          while(query[i] != flow_order[flow_i]) {
              flow_i = (flow_i + 1) & 3;
              // sanity check: one of the four flows must match the base
              j++;
              if(4 <= j) tmap_bug();
          }

          // add seeds
          tmap_map3_aux_core_seed_helper(query, query_length, i, flow_order, flow_i, hp_diff,
                                         refseq, bwt, sa, hash, opt, seeds, n_seeds, m_seeds,
                                         seed_length);

          // skip over this hp
          i++;
          while(i<query_length) {
              if(query[i] != query[i-1]) {
                  break;
              }
              i++;
          }
      }
  }
  else {
      int k, count;
      tmap_bwt_match_occ_t cur_sa, prev_sa;

      // j counts the seeds added, count the positions with an exact hit
      j = count = 0;
      if(1 == fwd_search) {
          for(i=0;i<query_length-seed_length+1;i++) {
              if(0 < tmap_bwt_match_hash_exact(bwt, seed_length, query + i, &cur_sa, hash)) {
                  count++;
                  if((cur_sa.l - cur_sa.k + 1) <= opt->max_seed_hits) {
                      // extend further
                      prev_sa = cur_sa;
                      k = i + 1;
                      while(k < query_length - seed_length) {
                          tmap_bwt_match_hash_2occ(bwt, &prev_sa, query[k], &cur_sa, hash);
                          if(cur_sa.l < cur_sa.k) { // use prev
                              cur_sa = prev_sa;
                              break;
                          }
                          else { // keep going
                              prev_sa = cur_sa;
                              k++;
                          }
                      }
                      k--; // k is always one greater

                      // NOTE(review): k (not i) is passed in the position where
                      // the other calls in this function pass i -- confirm
                      tmap_map3_aux_seed_add(seeds, n_seeds, m_seeds, cur_sa.k, cur_sa.l, k, seed_length + k - i);
                      j++;

                      // skip over
                      if(0 < opt->skip_seed_frac) {
                          i += opt->skip_seed_frac * (seed_length + k - i - 1); // - 1 since i will be incremented
                      }
                  }
                  else {
                      // seed stepping
                      if(0 < seed_step) {
                          k = i + seed_length;
                          int32_t n = 0;
                          // NOTE(review): this branch matches each step with
                          // tmap_bwt_match_hash_exact, while the right-to-left
                          // branch uses tmap_bwt_match_hash_exact_alt -- confirm
                          while(k + seed_step < query_length
                                && 0 < tmap_bwt_match_hash_exact(bwt, seed_step, query + k, &cur_sa, hash)) {
                              if((cur_sa.l - cur_sa.k + 1) <= opt->max_seed_hits) {
                                  tmap_map3_aux_seed_add(seeds, n_seeds, m_seeds, cur_sa.k, cur_sa.l, i, seed_length + k - i);
                                  j++;
                                  // skip over
                                  if(0 < opt->skip_seed_frac) {
                                      i += opt->skip_seed_frac * (seed_length + k - i - 1); // - 1 since i will be incremented
                                  }
                                  break;
                              }
                              k += seed_step;
                              n++;
                          }
                      }
                  }
              }
          }
      }
      else {
          for(i=query_length-seed_length;0<=i;i--) {
              if(0 < tmap_bwt_match_hash_exact(bwt, seed_length, query + i, &cur_sa, hash)) {
                  count++;
                  if((cur_sa.l - cur_sa.k + 1) <= opt->max_seed_hits) {
                      tmap_map3_aux_seed_add(seeds, n_seeds, m_seeds, cur_sa.k, cur_sa.l, i, seed_length);
                      j++;
                      if(0 < opt->skip_seed_frac) {
                          i -= opt->skip_seed_frac * (seed_length - 1); // -1 since i will be decremented by the loop
                      }
                  }
                  else {
                      // seed stepping
                      if(0 < seed_step) {
                          int32_t k = i + seed_length;
                          int32_t n = 0;
                          while(k + seed_step < query_length
                                && 0 < tmap_bwt_match_hash_exact_alt(bwt, seed_step, query + k, &cur_sa, hash)) {
                              if((cur_sa.l - cur_sa.k + 1) <= opt->max_seed_hits) {
                                  tmap_map3_aux_seed_add(seeds, n_seeds, m_seeds, cur_sa.k, cur_sa.l, i, seed_length + k - i);
                                  j++;
                                  if(0 < opt->skip_seed_frac) {
                                      i -= opt->skip_seed_frac * (seed_length + k - i - 1); // -1 since i will be decremented by the loop
                                  }
                                  // break when we find the first hit
                                  break;
                              }
                              k += seed_step;
                              n++;
                          }
                      }
                  }
              }
              else {
                  // skip over if we came up short
                  i -= (seed_length - cur_sa.offset);
              }
          }
      }

      // remove seeds if there were too many repetitive hits
      // NB: does count seed steps
      // NB: when count is zero the quotient is not a number, the comparison
      // is false and the seed list is left as is
      if(j / (double)count < opt->hit_frac) {
          (*n_seeds) = 0;
          //(*n_seeds) -= j;
      }
  }
}
/*!
  Builds the MD string and the edit distance of an alignment.

  The reference bases spanned by the alignment are fetched once, then the
  cigar is walked: matching runs are counted, mismatches and deletions emit
  the reference bases, insertions and deletions add to the edit distance.
  Processing stops early when the alignment runs past the end of the
  reference.

  @param  refseq         the reference sequence
  @param  read_bases     the read bases, as characters
  @param  seqid          the zero-based reference sequence index
  @param  pos            the zero-based start position
  @param  cigar          the cigar operators (length << 4 | op)
  @param  n_cigar        the number of cigar operators; must not be zero
  @param  nm             receives the edit distance
  @param  read_bases_eq  if not NULL, receives the read bases with matching
                         bases replaced by '=', NUL-terminated
  @return                the MD string
 */
static inline tmap_string_t *
tmap_sam_md(tmap_refseq_t *refseq, char *read_bases, // read bases are characters
            uint32_t seqid, uint32_t pos, // seqid and pos are 0-based
            uint32_t *cigar, int32_t n_cigar, int32_t *nm, char *read_bases_eq)
{
  int32_t c, b;
  int32_t span, kind;
  int32_t run = 0; // the length of the current run of matches
  int32_t past_end = 0; // set once the alignment leaves the reference
  uint32_t ref_off = 0, read_off = 0;
  uint32_t ref_first, ref_last;
  uint8_t read_code, ref_code;
  tmap_string_t *md = NULL;
  uint8_t *target = NULL;

  md = tmap_string_init(32);
  (*nm) = 0;

  // one-based, inclusive reference range covered by the alignment
  ref_first = pos + 1;
  ref_last = pos;
  for(c=0;c<n_cigar;c++) {
      span = cigar[c] >> 4;
      kind = cigar[c] & 0xf;
      if(BAM_CMATCH == kind || BAM_CDEL == kind || BAM_CREF_SKIP == kind) {
          ref_last += span; // these operators consume reference bases
      }
  }

  target = tmap_refseq_subseq2(refseq, seqid+1, ref_first, ref_last, NULL, 0, NULL);
  if(NULL == target) {
      tmap_bug();
  }
  if(0 == n_cigar) {
      tmap_bug();
  }

  for(c=0;c<n_cigar && 0 == past_end;c++) { // go through each cigar operator
      span = cigar[c] >> 4;
      kind = cigar[c] & 0xf;

      switch(kind) {
        case BAM_CMATCH:
          for(b=0;b<span;b++) {
              if(refseq->len <= refseq->annos[seqid].offset + pos + ref_off) { // out of boundary
                  past_end = 1;
                  break;
              }
              read_code = tmap_nt_char_to_int[(int)read_bases[read_off]];
              ref_code = target[ref_off];
              if(read_code == ref_code) { // a match
                  if(NULL != read_bases_eq) read_bases_eq[read_off] = '=';
                  run++;
              }
              else { // a mismatch: flush the run and emit the reference base
                  if(NULL != read_bases_eq) read_bases_eq[read_off] = read_bases[read_off];
                  tmap_string_lsprintf(md, md->l, "%d%c", run, tmap_iupac_int_to_char[ref_code]);
                  run = 0;
                  (*nm)++;
              }
              read_off++;
              ref_off++;
          }
          break;
        case BAM_CINS:
          if(NULL != read_bases_eq) {
              for(b=0;b<span;b++) {
                  read_bases_eq[read_off+b] = read_bases[read_off+b];
              }
          }
          read_off += span;
          (*nm) += span;
          break;
        case BAM_CDEL:
          // flush the run, then list the deleted reference bases
          tmap_string_lsprintf(md, md->l, "%d^", run);
          for(b=0;b<span;b++) {
              if(refseq->len <= refseq->annos[seqid].offset + pos + ref_off) { // out of boundary
                  past_end = 1;
                  break;
              }
              ref_code = target[ref_off];
              tmap_string_lsprintf(md, md->l, "%c", tmap_iupac_int_to_char[ref_code]);
              ref_off++;
          }
          if(0 == past_end) {
              (*nm) += span;
              run = 0;
          }
          break;
        case BAM_CREF_SKIP:
          ref_off += span;
          break;
        case BAM_CSOFT_CLIP:
          if(NULL != read_bases_eq) {
              for(b=0;b<span;b++) {
                  read_bases_eq[read_off+b] = read_bases[read_off+b];
              }
          }
          read_off += span;
          break;
        case BAM_CHARD_CLIP:
        case BAM_CPAD:
          // ignore
          break;
        default:
          tmap_error("could not understand the cigar operator", Exit, OutOfRange);
          break;
      }
  }

  // flush the final run
  tmap_string_lsprintf(md, md->l, "%d", run);
  if(NULL != read_bases_eq) read_bases_eq[read_off] = '\0';

  free(target);

  return md;
}