/*!
  Maps one read with the map1 algorithm inside a worker thread.
  @param  data     pointer to the thread data (cast to tmap_map1_thread_data_t*)
  @param  seqs     the read in four forms; seqs[2] (reversed) feeds the width
                   calculation and seqs[1] (reverse complemented) is searched
  @param  seq_len  the read length
  @param  index    the reference index; its BWT is used for the width calculation
  @param  hash     the occurrence hash handed to the search
  @param  opt      the program options; only a local copy is adjusted
  @return          the mappings, or an empty set when the read is filtered out
 */
static tmap_map_sams_t*
tmap_map1_thread_map_core(void **data, tmap_seq_t *seqs[4], int32_t seq_len, tmap_index_t *index, tmap_bwt_match_hash_t *hash, tmap_map_opt_t *opt)
{
  tmap_map1_thread_data_t *d = (tmap_map1_thread_data_t*)(*data);
  tmap_map_opt_t opt_local = (*opt); // private copy, the edit limits below are per read
  tmap_string_t *rev_bases = NULL;
  int32_t seed2_len = 0;

  // the read length must lie within the configured window
  if(0 < opt->min_seq_len && seq_len < opt->min_seq_len) {
      return tmap_map_sams_init(NULL);
  }
  if(0 < opt->max_seq_len && opt->max_seq_len < seq_len) {
      return tmap_map_sams_init(NULL);
  }
  // the read must be long enough to hold a full seed
  if(0 < opt->seed_length && seq_len < opt->seed_length) {
      return tmap_map_sams_init(NULL);
  }

  if(0 <= opt->seed2_length && opt->seed2_length <= seq_len) {
      // the secondary seed fits: use the limits stored with the thread data
      seed2_len = opt->seed2_length;
      opt_local.max_mm = d->max_mm;
      opt_local.max_gapo = d->max_gapo;
      opt_local.max_gape = d->max_gape;
  }
  else {
      // search the whole read; negative limits are derived from the
      // fractions, adding 0.99 so that the truncation rounds up
      seed2_len = seq_len;
      if(opt->max_mm < 0) {
          opt_local.max_mm = (int)(0.99 + opt->max_mm_frac * seed2_len);
      }
      else {
          opt_local.max_mm = opt->max_mm;
      }
      if(opt->max_gapo < 0) {
          opt_local.max_gapo = (int)(0.99 + opt->max_gapo_frac * seed2_len);
      }
      else {
          opt_local.max_gapo = opt->max_gapo;
      }
      if(opt->max_gape < 0) {
          opt_local.max_gape = (int)(0.99 + opt->max_gape_frac * seed2_len);
      }
      else {
          opt_local.max_gape = opt->max_gape;
      }
  }

  // the widths are computed over the reversed read
  rev_bases = tmap_seq_get_bases(seqs[2]);

  // grow (and clear) the primary width buffer when this read needs more room
  if(d->width_length < seed2_len) {
      size_t n_bytes = 0;
      d->width_length = seed2_len;
      n_bytes = (1+d->width_length) * sizeof(tmap_bwt_match_width_t);
      d->width = tmap_realloc(d->width, n_bytes, "d->width");
      memset(d->width, 0, n_bytes);
  }

  // primary width over the last seed2_len bases
  tmap_bwt_match_cal_width_reverse(index->bwt, seed2_len, rev_bases->s + (seq_len - seed2_len), d->width);

  // seed width over the last seed_length bases, only when seeding is enabled
  if(0 < opt->seed_length) {
      tmap_bwt_match_cal_width_reverse(index->bwt, opt->seed_length, rev_bases->s + (seq_len - opt->seed_length), d->seed_width);
  }

  // search with the reverse complemented read
  if(0 < opt_local.seed_length) {
      return tmap_map1_aux_core(seqs[1], index, hash, d->width, d->seed_width, &opt_local, d->stack, seed2_len);
  }
  return tmap_map1_aux_core(seqs[1], index, hash, d->width, NULL, &opt_local, d->stack, seed2_len);
}
/*!
  Appends a sequence pointer to the collection.
  @param  seqs  the collection to append to
  @param  seq   the sequence to store; the pointer itself is stored, not a copy
  @details      when the array is full it grows by exactly one slot
 */
void
tmap_seqs_add(tmap_seqs_t *seqs, tmap_seq_t *seq)
{
  // no free slot left: make room for one more pointer
  if(seqs->n >= seqs->m) {
      seqs->m += 1;
      seqs->seqs = tmap_realloc(seqs->seqs, seqs->m * sizeof(tmap_seq_t*), "seqs->seqs");
  }
  // store in the first unused slot, then count it
  seqs->seqs[seqs->n] = seq;
  seqs->n += 1;
}
/*!
  Copies the flowgram of an SFF read into a caller-owned buffer.
  @param  sff       the SFF read; the flow count comes from its global header
  @param  flowgram  pointer to the destination buffer, reallocated when too small
  @param  mem       the number of values the destination currently holds
  @return           the number of flow values copied
 */
int32_t
tmap_sff_get_flowgram(tmap_sff_t *sff, uint16_t **flowgram, int32_t mem)
{
  int32_t flow_i = 0;

  // resize the destination to exactly the flow count when it may be too small
  if(sff->gheader->flow_length >= mem) {
      (*flowgram) = tmap_realloc((*flowgram), sizeof(uint16_t) * sff->gheader->flow_length, "flowgram");
  }

  // copy value by value
  while(flow_i < sff->gheader->flow_length) {
      (*flowgram)[flow_i] = sff->read->flowgram[flow_i];
      flow_i++;
  }

  return sff->gheader->flow_length;
}
/*!
  Returns the i-th sequence, creating empty sequences up to that index if needed.
  @param  seqs  the collection
  @param  i     the zero-based index of the wanted sequence
  @return       the sequence stored at index i
  @details      only the capacity (m) is extended here; the count (n) is untouched
 */
tmap_seq_t *
tmap_seqs_get(tmap_seqs_t *seqs, int32_t i)
{
  // fast path: the slot already exists
  if(i < seqs->m) {
      return seqs->seqs[i];
  }

  // make room for indexes 0..i and fill every new slot with a fresh sequence
  seqs->seqs = tmap_realloc(seqs->seqs, (i+1) * sizeof(tmap_seq_t*), "seqs->seqs");
  for(;seqs->m <= i;seqs->m++) {
      seqs->seqs[seqs->m] = tmap_seq_init(seqs->type);
  }

  return seqs->seqs[i];
}
/*!
  Prepares the search stack for a new read.
  @param  stack     the stack to reset
  @param  max_mm    the maximum number of mismatches
  @param  max_gapo  the maximum number of gap opens
  @param  max_gape  the maximum number of gap extensions
  @param  opt       the program options, forwarded to aln_score
  @return           the same stack, emptied and with enough bins for the
                    score of (max_mm+1, max_gapo+1, max_gape+1)
 */
static tmap_map1_aux_stack_t*
tmap_map1_aux_stack_reset(tmap_map1_aux_stack_t *stack,
                          int32_t max_mm, int32_t max_gapo, int32_t max_gape,
                          const tmap_map_opt_t *opt)
{
  int32_t bin_i;
  int32_t n_bins_wanted = 0;

  // rewind the entry pool and forget the previous best score
  stack->entry_pool_i = 0;
  stack->best_score = INT32_MAX;

  // a pool that grew past the limit is rebuilt from scratch
  if(TMAP_MAP1_AUX_STACK_TOO_BIG < stack->entry_pool_length) {
      tmap_map1_aux_stack_destroy_helper(stack, 0);
      tmap_map1_aux_stack_init_helper(stack);
  }

  // empty every bin; the entry arrays are kept for reuse
  bin_i = 0;
  while(bin_i < stack->n_bins) {
      stack->bins[bin_i].n_entries = 0;
      bin_i++;
  }

  // add bins when the limits of this read need more than are available
  n_bins_wanted = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt);
  if(stack->n_bins < n_bins_wanted) {
      tmap_roundup32(n_bins_wanted);
      stack->bins = tmap_realloc(stack->bins, sizeof(tmap_map1_aux_bin_t) * n_bins_wanted, "stack->bins");
      // only the new bins need initializing
      for(bin_i=stack->n_bins;bin_i<n_bins_wanted;bin_i++) {
          stack->bins[bin_i].entries = NULL;
          stack->bins[bin_i].m_entries = 0;
          stack->bins[bin_i].n_entries = 0;
      }
      stack->n_bins = n_bins_wanted;
  }

  stack->n_entries = 0;

  return stack;
}
/*!
  Shrinks the number of hits, or grows the hit capacity, of an alignment set.
  @param  a  the alignment set; NULL is ignored
  @param  n  the wanted number of hits
  @details   shrinking nullifies the dropped hits and lowers the count;
             growing raises only the capacity and nullifies the new slots
 */
void
tmap_map2_aln_realloc(tmap_map2_aln_t *a, int32_t n)
{
  int32_t hit_i;

  if(NULL == a) {
      return;
  }

  if(n < a->n) {
      // shrink: release the hits beyond the new count
      for(hit_i=n;hit_i<a->n;hit_i++) {
          tmap_map2_hit_nullify(&a->hits[hit_i]);
      }
      a->n = n;
      return;
  }

  if(n <= a->max) {
      // the current capacity is sufficient
      return;
  }

  // grow: remember where the new slots start
  hit_i = a->max;
  if(0 == a->max && n < 4) {
      a->max = 4; // small initial capacity
  }
  else {
      a->max = tmap_roundup32(n);
  }
  a->hits = tmap_realloc(a->hits, sizeof(tmap_map2_hit_t) * a->max, "a->hits");
  for(;hit_i<a->max;hit_i++) {
      tmap_map2_hit_nullify(&a->hits[hit_i]);
  }
}
/*!
  Appends one seed to a growable seed array.
  @param  seeds        pointer to the seed array (may be reallocated)
  @param  n_seeds      pointer to the number of seeds stored
  @param  m_seeds      pointer to the number of seeds allocated
  @param  k            the lower SA interval bound of the seed
  @param  l            the upper SA interval bound of the seed
  @param  start        the start of the seed
  @param  seed_length  the seed length
  @details             the capacity starts at 64 and doubles whenever it is exhausted
 */
static inline void
tmap_map3_aux_seed_add(tmap_map3_aux_seed_t **seeds,
                       int32_t *n_seeds, int32_t *m_seeds,
                       tmap_bwt_int_t k, tmap_bwt_int_t l,
                       int32_t start, int16_t seed_length)
{
  tmap_map3_aux_seed_t *slot = NULL;

  // grow when full
  if((*n_seeds) >= (*m_seeds)) {
      if(0 == (*m_seeds)) {
          (*m_seeds) = 64;
      }
      else {
          (*m_seeds) <<= 1;
      }
      (*seeds) = tmap_realloc((*seeds), sizeof(tmap_map3_aux_seed_t)*(*m_seeds), "(*seeds)");
  }

  // fill the next free slot
  slot = &(*seeds)[(*n_seeds)];
  slot->k = k;
  slot->l = l;
  slot->start = start;
  slot->seed_length = seed_length;

  (*n_seeds)++;
}
tmap_map_sams_t * tmap_map2_aux_core(tmap_map_opt_t *_opt, tmap_seq_t *seqs[4], tmap_refseq_t *refseq, tmap_bwt_t *bwt, tmap_sa_t *sa, tmap_bwt_match_hash_t *hash, tmap_rand_t *rand, tmap_map2_global_mempool_t *pool) { tmap_map_opt_t opt; tmap_seq_t *orig_seq = NULL; tmap_string_t *seq[2]={NULL, NULL}; tmap_string_t *rseq[2]={NULL, NULL}; tmap_map_sams_t *sams = NULL; tmap_map2_aln_t *b[2]={NULL,NULL}; tmap_string_t *bases = NULL; int32_t i, k, l, num_n; opt = (*_opt); // sequence length bases = tmap_seq_get_bases(seqs[0]); l = bases->l; // update the local opt tmap_map2_aux_core_update_opt(&opt, _opt, l); // set opt->score_thr if(pool->max_l < l) { // then enlarge working space for tmap_sw_extend_core() int32_t tmp; if(0 == opt.pen_gape) { tmp = ((l + 1) / 2 * opt.score_match + opt.pen_gape) + l; } else { tmp = ((l + 1) / 2 * opt.score_match + opt.pen_gape) / opt.pen_gape + l; } pool->max_l = l; pool->aln_mem = tmap_realloc(pool->aln_mem, sizeof(uint8_t) * (tmp + 2) * 24, "pool->aln_mem"); } // get the number of Ns for(i=num_n=0;i<l;i++) { uint8_t c = (uint8_t)tmap_nt_char_to_int[(int)bases->s[i]]; if(c >= 4) num_n++; // FIXME: ambiguous bases are not properly handled } // will we always be lower than the score threshold if((l*opt.score_match) + (num_n*opt.pen_mm) < opt.score_thr) { return tmap_map_sams_init(NULL); } // save sequences seq[0] = tmap_seq_get_bases(seqs[0]); seq[1] = tmap_seq_get_bases(seqs[1]); rseq[0] = tmap_seq_get_bases(seqs[2]); rseq[1] = tmap_seq_get_bases(seqs[3]); // handle ambiguous bases if(0 < num_n) { // save original to de-randomize later orig_seq = tmap_seq_clone(seqs[0]); // randomize for(i=0;i<l;i++) { uint8_t c = (uint8_t)bases->s[i]; if(c >= 4) { c = (int)(tmap_rand_get(rand) * 4); // FIXME: ambiguous bases are not properly handled seq[0]->s[i] = c; // original seq[1]->s[l-1-i] = 3 - c; // reverse compliment rseq[0]->s[l-1-i] = 3 - c; // reverse compliment rseq[1]->s[i] = c; // original //rseq[0]->s[l-1-i] = c; // reverse 
//rseq[1]->s[i] = 3 - c; // compliment } } } // alignment b[0] = tmap_map2_aux_aln(&opt, refseq, bwt, sa, hash, seq, 0, pool); for(k = 0; k < b[0]->n; ++k) { if(b[0]->hits[k].n_seeds < opt.seeds_rev) break; } if(k < b[0]->n) { b[1] = tmap_map2_aux_aln(&opt, refseq, bwt, sa, hash, rseq, 1, pool); for(i = 0; i < b[1]->n; ++i) { tmap_map2_hit_t *p = b[1]->hits + i; int x = p->beg; p->flag ^= 0x10, p->is_rev ^= 1; // flip the strand p->beg = l - p->end; p->end = l - x; if(p->l == 0) { if(refseq->len * 2 < (p->k + p->tlen)) p->k = 0; else p->k = 2 * refseq->len - (p->k + p->tlen); } } tmap_map2_aux_merge_hits(b, l, 0); } else b[1] = 0; // set the flag to forward/reverse tmap_map2_aux_flag_fr(b); // tlen may overestimated due to not counting insertions properly, bound it! for(i = 0; i < b[0]->n; ++i) { if(refseq->len * 2 <= b[0]->hits[i].k + b[0]->hits[i].tlen) { b[0]->hits[i].tlen = (refseq->len * 2) - b[0]->hits[i].k; } else if(b[0]->hits[i].k < refseq->len && refseq->len <= b[0]->hits[i].k + b[0]->hits[i].tlen) { b[0]->hits[i].tlen = refseq->len - b[0]->hits[i].k; } } // make one-based for pac2real for(i = 0; i < b[0]->n; ++i) { b[0]->hits[i].k++; } // store in SAM sams = tmap_map2_aux_store_hits(refseq, &opt, b[0], l); // free tmap_map2_aln_destroy(b[0]); // revert ambiguous bases if(0 < num_n) { // de-randomize bases = tmap_seq_get_bases(orig_seq); for(i=0;i<l;i++) { uint8_t c = (uint8_t)bases->s[i]; if(c >= 4) { // NB: always keep them at "4" seq[0]->s[i] = c; // original seq[1]->s[l-1-i] = c; // reverse compliment rseq[0]->s[l-1-i] = c; // reverse compliment rseq[1]->s[i] = c; // original //rseq[0]->s[l-1-i] = c; // reverse //rseq[1]->s[i] = 3 - c; // compliment } } tmap_seq_destroy(orig_seq); } return sams; }
static tmap_map2_aln_t * tmap_map2_aux_aln(tmap_map_opt_t *opt, tmap_refseq_t *target_refseq, tmap_bwt_t *target_bwt, tmap_sa_t *target_sa, tmap_bwt_match_hash_t *target_hash, tmap_string_t *seq[2], int32_t is_rev, tmap_map2_global_mempool_t *pool) { tmap_map2_aln_t *b[2], **bb[2], **_b, *p; int32_t j, k; tmap_bwtl_t *query = tmap_bwtl_seq2bwtl(seq[0]->l, (uint8_t*)seq[0]->s); _b = tmap_map2_core_aln(opt, query, target_refseq, target_bwt, target_sa, target_hash, pool); tmap_bwtl_destroy(query); for(k = 0; k < 2; ++k) { bb[k] = tmap_calloc(2, sizeof(void*), "bb[k]"); bb[k][0] = tmap_calloc(1, sizeof(tmap_map2_aln_t), "bb[k][0]"); bb[k][1] = tmap_calloc(1, sizeof(tmap_map2_aln_t), "bb[k][1]"); } for(k = 0; k < 2; ++k) { // separate _b into bb[2] based on the strand // resolve duplicates // _b[0] are "wide SA hits" // _b[1] are "narrow SA hits" if(1 == k || 0 != opt->narrow_rmdup) { tmap_map2_aux_resolve_duphits(target_refseq, target_bwt, target_sa, target_hash, _b[k], opt->max_seed_hits, opt->max_seed_intv, 0); } else { // only to packed reference coordinates if(0 == tmap_map2_aux_sa_pac_pos(target_refseq, target_bwt, target_sa, target_hash, _b[k], opt->max_seed_hits, INT32_MAX, INT32_MIN)) { // revert to resolving duplicates narrowly tmap_map2_aux_resolve_duphits(target_refseq, target_bwt, target_sa, target_hash, _b[k], opt->max_seed_hits, opt->max_seed_intv, 0); } } for(j = 0; j < _b[k]->n; ++j) { tmap_map2_hit_t *q; p = bb[_b[k]->hits[j].is_rev][k]; if (p->n == p->max) { p->max = p->max? 
p->max<<1 : 8; p->hits = tmap_realloc(p->hits, p->max * sizeof(tmap_map2_hit_t), "p->hits"); } q = &p->hits[p->n++]; *q = _b[k]->hits[j]; if (_b[k]->hits[j].is_rev) { int32_t x = q->beg; q->beg = seq[0]->l - q->end; q->end = seq[0]->l - x; } } } // free for(k = 0; k < 2; ++k) { free(_b[k]->hits); free(_b[k]); } free(_b); // resolve duplicates for(k = 0; k < 2; ++k) { // bb[*][0] are "wide SA hits" tmap_map2_aux_resolve_duphits(NULL, NULL, NULL, NULL, bb[k][0], opt->max_seed_hits, opt->max_seed_intv, 0); // bb[*][1] are "narrow SA hits" if(0 != opt->narrow_rmdup) { tmap_map2_aux_resolve_duphits(NULL, NULL, NULL, NULL, bb[k][1], opt->max_seed_hits, opt->max_seed_intv, 0); } } b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits" tmap_map2_chain_filter(opt, seq[0]->l, b); // NB: only unique seeds are chained // merge all hits for(k = 0; k < 2; ++k) { tmap_map2_aux_merge_hits(bb[k], seq[k]->l, 0); // bb[k][1] and bb[k][0] are merged into bb[k][0] b[k] = bb[k][0]; free(bb[k]); } tmap_map2_aux_merge_hits(b, seq[0]->l, 1); // b[1] and b[0] are merged into b[0] return b[0]; }
int tmap_seqs_io_sff2sam_main(int argc, char *argv[]) { int c, help = 0; tmap_seqs_io_t *io_in = NULL; tmap_seqs_t *seqs = NULL; char **sam_rg = NULL; int32_t sam_rg_num = 0; int bidirectional = 0, sam_flowspace_tags = 0; int out_type = 0; tmap_sam_io_t *io_out = NULL; bam_header_t *header = NULL; // BAM Header int32_t i; /* uint8_t *key_seq = NULL; int key_seq_len = 0; */ while((c = getopt(argc, argv, "DGR:Yvh")) >= 0) { switch(c) { case 'D': bidirectional = 1; break; case 'G': break; case 'R': sam_rg = tmap_realloc(sam_rg, (1+sam_rg_num) * sizeof(char*), "sam_rg"); sam_rg[sam_rg_num] = tmap_strdup(optarg); sam_rg_num++; break; case 'Y': sam_flowspace_tags = 1; break; case 'v': tmap_progress_set_verbosity(1); break; case 'h': help = 1; break; default: return 1; } } if(1 != argc - optind || 1 == help) { tmap_file_fprintf(tmap_file_stderr, "Usage: %s %s [-R -Y -v -h] <in.sff>\n", PACKAGE, argv[0]); return 1; } // input io_in = tmap_seqs_io_init(&argv[optind], 1, TMAP_SEQ_TYPE_SFF, TMAP_FILE_NO_COMPRESSION, 0l, 0l); // BAM Header header = tmap_seqs_io_to_bam_header(NULL, io_in, sam_rg, sam_rg_num, argc, argv); // open the output file switch(out_type) { case 0: // SAM io_out = tmap_sam_io_init2("-", "wh", header); break; case 1: io_out = tmap_sam_io_init2("-", "wb", header); break; case 2: io_out = tmap_sam_io_init2("-", "wbu", header); break; default: tmap_bug(); } // destroy the BAM Header bam_header_destroy(header); header = NULL; seqs = tmap_seqs_init(TMAP_SEQ_TYPE_SFF); while(0 < tmap_seqs_io_read(io_in, seqs, io_out->fp->header->header)) { bam1_t *b = NULL; tmap_seq_t *seq = seqs->seqs[0]; b = tmap_sam_convert_unmapped(seq, sam_flowspace_tags, bidirectional, NULL, 0, 0, 0, 0, 0, 0, "\tlq:i:%d\trq:i:%d\tla:i:%d\trq:i:%d", seq->data.sff->rheader->clip_qual_left, seq->data.sff->rheader->clip_qual_right, seq->data.sff->rheader->clip_adapter_left, seq->data.sff->rheader->clip_adapter_right); if(samwrite(io_out->fp, b) <= 0) { tmap_error("Error writing the SAM file", 
Exit, WriteFileError); } bam_destroy1(b); tmap_seqs_destroy(seqs); seqs = tmap_seqs_init(TMAP_SEQ_TYPE_SFF); } tmap_seqs_destroy(seqs); // free memory tmap_seqs_io_destroy(io_in); tmap_sam_io_destroy(io_out); for(i=0;i<sam_rg_num;i++) { free(sam_rg[i]); } free(sam_rg); return 0; }
/*!
  Builds the output BAM header from the input, the reference and the command line.
  @param  refseq      the reference; when not NULL all @SQ lines are replaced by
                      one line per reference sequence
  @param  io_in       the input; for SAM/BAM input the existing header is reused
  @param  rg_sam      the read group strings from the command line ("XX:value")
  @param  rg_sam_num  the number of read group strings
  @param  argc        the number of command line arguments
  @param  argv        the command line arguments, stored in @PG.CL
  @return             the new BAM header
  @details            the @PG.ID is PACKAGE_NAME, or PACKAGE_NAME.<j> with the first
                      j for which no @PG line with that ID exists; in the latter
                      case the previously tried ID is stored in @PG.PP
 */
bam_header_t *
tmap_seqs_io_to_bam_header(tmap_refseq_t *refseq, tmap_seqs_io_t *io_in, char **rg_sam, int32_t rg_sam_num, int32_t argc, char *argv[])
{
  bam_header_t *bam_header = NULL;
  sam_header_t *header = NULL; // the output header
  sam_header_record_t *record = NULL;
  sam_header_record_t **record_list = NULL;
  char tag[3] = {'\0', '\0', '\0'}; // two letters and the terminating NUL
  char *command_line = NULL;
  char *id = NULL;
  char *id_pp = NULL;
  int32_t i, j;

  // @HD
  if(io_in->type == TMAP_SEQ_TYPE_SAM || io_in->type == TMAP_SEQ_TYPE_BAM) {
      // should be only one input file
      if(1 != io_in->n) {
          tmap_bug();
      }
      // get the current header
      if(NULL == io_in->seqios[0]) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio->fp->header) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio->fp->header->header) {
          // not parsed yet: parse the header text
          header = sam_header_parse2(io_in->seqios[0]->io.samio->fp->header->text);
      }
      else {
          header = io_in->seqios[0]->io.samio->fp->header->header;
          if(NULL == header) tmap_bug();
          header = sam_header_clone(header); // work on a clone
      }
      if(NULL == header) tmap_bug();
  }
  else {
      // empty header with only the @HD line
      header = sam_header_init();
      record = sam_header_record_init("HD"); // new header line
      if(0 == sam_header_record_add(record, "VN", "1.4")) tmap_bug(); // version number
      if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the header line
      record = NULL;
  }

  // Get the TMAP program ID
  id = tmap_malloc(sizeof(char) * (1 + strlen(PACKAGE_NAME)), "id");
  strcpy(id, PACKAGE_NAME); // default
  for(i=j=0;NULL != (record_list = sam_header_get_record(header, "PG", "ID", id, &i)) && 0 < i;i=0) { // while the id is found
      char *ptr = NULL;
      // swap id and id_pp, so that id_pp holds the ID that is already taken
      ptr = id_pp;
      id_pp = id;
      id = ptr;
      // create the new ID: name, '.', the digits of j and the NUL
      j++;
      id = tmap_realloc(id, sizeof(char) * (1 + (int)log10(j) + 1 + strlen(PACKAGE_NAME) + 1), "id");
      if(sprintf(id, "%s.%d", PACKAGE_NAME, j) < 0) tmap_bug();
      free(record_list);
      record_list = NULL;
  }
  // the loop may end with a list that holds no match; free(NULL) is a no-op
  free(record_list);
  record_list = NULL;

  // @SQ
  if(NULL != refseq) {
      sam_header_records_t *records = NULL;
      // ZZ: the old @SQ lines are not checked, they are all removed instead
      records = sam_header_get_records(header, "SQ");
      if(NULL != records) {
          sam_header_remove_records(header, "SQ");
          records = NULL;
      }
      // ZZ: now add one @SQ line per reference sequence
      for(i=0;i<refseq->num_annos;i++) {
          char num[32];
          record = sam_header_record_init("SQ"); // new reference sequence record
          if(0 == sam_header_record_add(record, "SN", refseq->annos[i].name->s)) tmap_bug(); // reference sequence name
          if(sprintf(num, "%u", (uint32_t)refseq->annos[i].len) < 0) tmap_bug(); // integer to string
          if(0 == sam_header_record_add(record, "LN", num)) tmap_bug(); // reference sequence length
          if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the reference sequence record
      }
  }

  // @RG - read group
  if(0 < rg_sam_num) { // @RG specified on the command line
      sam_header_records_t *records = NULL;
      // Check for SAM/BAM
      // TODO: this should be possible...
      if(io_in->type == TMAP_SEQ_TYPE_SAM || io_in->type == TMAP_SEQ_TYPE_BAM) {
          tmap_error("Cannot specify the read groups on the command line when using SAM/BAM as input."
                     " Please embed in the SAM/BAM header instead.", Exit, OutOfRange);
      }
      record = NULL;
      // go through the command line arguments
      for(i=0;i<rg_sam_num;i++) {
          if(strlen(rg_sam[i]) < 4) tmap_error("Read group too small", Exit, OutOfRange);
          if(':' != rg_sam[i][2]) tmap_error("Read group improperly formatted (no colon)", Exit, OutOfRange);

          // an ID starts a new read group
          if('I' == rg_sam[i][0] && 'D' == rg_sam[i][1]) {
              if(NULL != record) { // add the previous record
                  tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
              }
              record = sam_header_record_init("RG"); // new read group
          }
          // add the tag/value to the record
          if(NULL == record) {
              tmap_error("The read group ID must be specified first", Exit, OutOfRange);
          }
          // setup the tag as a proper NUL-terminated string
          tag[0] = rg_sam[i][0];
          tag[1] = rg_sam[i][1];
          tag[2] = '\0';
          if(0 == sam_header_record_add(record, tag, rg_sam[i]+3)) tmap_bug(); // add the tag/value
      }
      if(NULL != record) { // add the last record
          tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
      }
      // check that the # of read groups added was the same as the # of input files;
      // a missing @RG list cannot match either
      records = sam_header_get_records(header, "RG");
      if(NULL == records || records->n != io_in->n) {
          tmap_error("The number of read groups did not match the number of input files", Exit, OutOfRange);
      }
  }
  else if(io_in->type != TMAP_SEQ_TYPE_SAM && io_in->type != TMAP_SEQ_TYPE_BAM) { // dummy...
      for(i=0;i<io_in->n;i++) { // for each input file
          char buf[32];
          record = sam_header_record_init("RG"); // new read group
          if(1 == io_in->n) strcpy(buf, "NOID");
          else if(sprintf(buf, "NOID.%d", i+1) < 0) tmap_bug();
          if(0 == sam_header_record_add(record, "ID", buf)) tmap_bug(); // dummy ID
          if(0 == sam_header_record_add(record, "SM", "NOSM")) tmap_bug(); // dummy SM, for Picard validation
          if(0 == sam_header_record_add(record, "PG", id)) tmap_bug(); // dummy PG
          tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
      }
  }
  else {
      // SAM/BAM input: check that SM/PG are present in every read group;
      // a header without any @RG line has nothing to check
      sam_header_records_t *records = sam_header_get_records(header, "RG");
      if(NULL != records) {
          for(i=0;i<records->n;i++) {
              record = records->records[i];
              if(NULL == sam_header_record_get(record, "ID")) tmap_error("Missing @RG.ID in the SAM/BAM Header", Exit, OutOfRange);
              if(NULL == sam_header_record_get(record, "SM")) {
                  if(0 == sam_header_record_add(record, "SM", "NOSM")) tmap_bug(); // dummy SM, for Picard validation
              }
              if(NULL == sam_header_record_get(record, "PG")) {
                  if(0 == sam_header_record_add(record, "PG", id)) tmap_bug(); // dummy PG
              }
          }
      }
  }

  // @PG - program group
  record = sam_header_record_init("PG"); // new program group
  if(0 == sam_header_record_add(record, "ID", id)) tmap_bug(); // @PG.ID
  if(0 == sam_header_record_add(record, "VN", PACKAGE_VERSION)) tmap_bug(); // @PG.VN
  // @PG.CL: the arguments joined by single spaces
  command_line = NULL;
  j = 1; // for the EOL
  command_line = tmap_realloc(command_line, sizeof(char) * j, "command_line");
  command_line[j-1] = '\0';
  for(i=0;i<argc;i++) {
      if(0 < i) j++; // room for the separating space
      j += strlen(argv[i]);
      command_line = tmap_realloc(command_line, sizeof(char) * j, "command_line");
      if(0 < i) strcat(command_line, " ");
      strcat(command_line, argv[i]);
      command_line[j-1] = '\0';
  }
  if(0 == sam_header_record_add(record, "CL", command_line)) tmap_bug(); // @PG.CL
  if(NULL != id_pp) {
      if(0 == sam_header_record_add(record, "PP", id_pp)) tmap_bug(); // @PG.PP
  }
  if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the record
  free(command_line);

  // Check the new SAM Header
  if(0 == sam_header_check(header)) {
      tmap_error("SAM Header was not consistent", Exit, OutOfRange);
  }

  // Create a BAM Header from the SAM Header
  bam_header = bam_header_init(); // empty
  bam_header->header = header; // soft-copy the header
  bam_header = sam_header_to_bam_header(bam_header); // convert

  // free memory
  free(id);
  free(id_pp);

  return bam_header;
}
tmap_map_sams_t * tmap_map1_aux_core(tmap_seq_t *seq, tmap_index_t *index, tmap_bwt_match_hash_t *hash, tmap_bwt_match_width_t *width, tmap_bwt_match_width_t *seed_width, tmap_map_opt_t *opt, tmap_map1_aux_stack_t *stack, int32_t seed2_len) { int32_t max_mm = opt->max_mm, max_gapo = opt->max_gapo, max_gape = opt->max_gape, seed_max_diff = opt->seed_max_diff; int32_t best_score, next_best_score; int32_t best_cnt = 0; int32_t i, j, num_n = 0; int32_t max_edit_score; tmap_bwt_match_occ_t match_sa_start; tmap_string_t *bases=NULL; tmap_map_sams_t *sams = NULL; int32_t max_diff, best_diff; tmap_bwt_int_t k, l; tmap_refseq_t *refseq = index->refseq; tmap_bwt_t *bwt = index->bwt; tmap_sa_t *sa = index->sa; tmap_map1_aux_occ_t *occs = NULL; max_edit_score = opt->pen_mm; //if(max_edit_score < opt->pen_gapo + opt->pen_gape) max_edit_score = opt->pen_gapo + opt->pen_gape; //if(max_edit_score < opt->pen_gape) max_edit_score = opt->pen_gape; bases = tmap_seq_get_bases(seq); /* fputc('\n', stderr); for(i=0;i<bases->l;i++) { fputc("ACGTN"[(int)bases->s[i]], stderr); } fputc('\n', stderr); */ // the maximum # of differences if(bases->l <= TMAP_MAP_OPT_MAX_DIFF_READ_LENGTH) { best_diff = max_diff = opt->max_diff_table[bases->l]; } else { best_diff = max_diff = opt->max_diff_table[TMAP_MAP_OPT_MAX_DIFF_READ_LENGTH]; } // bound differenes by the maximum # of differences if(max_diff < max_mm) max_mm = max_diff; if(max_diff < max_gapo) max_gapo = max_diff; //if(max_diff < max_gape) max_gape = max_diff; best_score = next_best_score = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt); // check whether there are too many N for(j=bases->l-seed2_len,num_n=0;j<bases->l;j++) { if(3 < bases->s[j]) { num_n++; } } if(max_mm < num_n || max_diff < num_n) { return tmap_map_sams_init(NULL); } // initialize sams = tmap_map_sams_init(NULL); occs = NULL; match_sa_start.offset = 0; match_sa_start.hi = 0; match_sa_start.k = 0; match_sa_start.l = bwt->seq_len; stack = tmap_map1_aux_stack_reset(stack, 
max_mm, max_gapo, max_gape, opt); // reset stack tmap_map1_aux_stack_push(stack, bases->l, &match_sa_start, 0, 0, 0, STATE_M, 0, NULL, opt); while(0 < tmap_map1_aux_stack_size(stack) && tmap_map1_aux_stack_size(stack) < opt->max_entries) { tmap_map1_aux_stack_entry_t *e = NULL; int32_t len=-1; int32_t n_seed_mm=0, offset, width_cur_i; const uint8_t *str=NULL; int32_t sam_found, m; tmap_bwt_match_width_t *width_cur = NULL; const tmap_bwt_match_width_t *seed_width_cur = NULL; tmap_bwt_match_occ_t match_sa_cur, match_sa_next[4]; // get the best entry e = tmap_map1_aux_stack_pop(stack); // bound with best score if(best_score + max_edit_score < e->score) { break; // no need to continue } // some more information match_sa_cur = e->match_sa; // check if we have too many edits m = max_diff - (e->n_mm + e->n_gapo + e->n_gape); if(m < 0) { continue; // too many edits } // get the rest of the information offset = e->offset; // zero-based str = (uint8_t*)bases->s; len = bases->l; width_cur = width; width_cur_i = seed2_len - (len - offset); if(NULL != seed_width) { seed_width_cur = seed_width; n_seed_mm = seed_max_diff - (e->n_mm + e->n_gapo + e->n_gape); // consider only mismatches in the seed } else { seed_width_cur = NULL; } if(0 < width_cur_i && m < width_cur[width_cur_i-1].bid) { // too many edits continue; } // check whether a sam is found sam_found = 0; if(len - seed2_len == offset) { sam_found = 1; } else if(max_mm == e->n_mm // no mismatches from any state && ((e->state == STATE_M && max_gapo == e->n_gapo) // in STATE_M but no more gap opens || (e->state != STATE_M && max_gape == e->n_gape))) { // in STATE_I/STATE_D but no more extensions if(0 < tmap_bwt_match_hash_exact_alt_reverse(bwt, offset, str, &match_sa_cur, hash)) { // the alignment must match exactly to sam sam_found = 2; } else { continue; // no sam, skip } } if(0 < sam_found) { // alignment found // check for duplicates if(0 < sams->n) { for(i=0;i<sams->n;i++) { // check contained if(match_sa_cur.k <= 
occs[i].k && occs[i].k <= match_sa_cur.l) { // MK <= SK <= ML if(occs[i].l <= match_sa_cur.l) { // MK <= SK <= SL <= ML // Want (SK - MK) + (ML - SL) k = occs[i].k - match_sa_cur.k; // (SK - MK) k += match_sa_cur.l - occs[i].l; // (ML - SL) occs[i].l = match_sa_cur.l; // Make SL = ML } else { // MK <= SK <= ML <= SL k = occs[i].k - match_sa_cur.k; // (SK - MK) } occs[i].k = match_sa_cur.k; // Make SK = MK break; } else if(match_sa_cur.k <= occs[i].l && occs[i].l <= match_sa_cur.l) { // MK <= SL <= ML if(match_sa_cur.k <= occs[i].k) { // MK <= SK <= SL <= ML // Want (SK - MK) + (ML - SL) k = occs[i].k - match_sa_cur.k; // (SK - MK) k += match_sa_cur.l - occs[i].l; // (ML - SL) occs[i].k = match_sa_cur.k; // Make SK = MK } else { // SK <= MK <= SL <= ML k = match_sa_cur.l - occs[i].l; // (ML - SL) } occs[i].l = match_sa_cur.l; // Make SL = ML break; } } if(i < sams->n) { // shadow if(0 < k) { //tmap_map1_aux_stack_shadow(k, bwt->seq_len, e->last_diff_offset, width_cur); width_cur_i = seed2_len - (len - e->last_diff_offset); tmap_map1_aux_stack_shadow(k, seed2_len, width_cur_i, width_cur); } sam_found = 0; continue; } } int32_t score = aln_score(e->n_mm, e->n_gapo, e->n_gape, opt); int32_t do_add = 1; if(sams->n == 0) { best_score = score; best_cnt = 0; best_diff = e->n_mm + e->n_gapo + e->n_gape; } if(score == best_score) { best_cnt += match_sa_cur.l - match_sa_cur.k + 1; } else { if(best_diff + 1 <= max_diff) { max_diff = best_diff + 1; } if(score < next_best_score) { next_best_score = score; } else if(next_best_score < score) { // no need to examine further break; } } if(do_add) { // append uint32_t op, op_len, cigar_i; tmap_map_sam_t *sam = NULL; tmap_map1_aux_stack_entry_t *cur = NULL; tmap_map_sams_realloc(sams, sams->n+1); occs = tmap_realloc(occs, sizeof(tmap_map1_aux_occ_t) * sams->n, "occs"); sam = &sams->sams[sams->n-1]; sam->algo_id = TMAP_MAP_ALGO_MAP1; sam->algo_stage = 0; sam->score = e->score; // aux data tmap_map_sam_malloc_aux(sam); k = 
occs[sams->n-1].k = match_sa_cur.k; l = occs[sams->n-1].l= match_sa_cur.l; sam->aux.map1_aux->n_mm = e->n_mm; sam->aux.map1_aux->n_gapo = e->n_gapo; sam->aux.map1_aux->n_gape = e->n_gape; // aux data: reference length cur = e; i = e->i; sam->aux.map1_aux->aln_ref = 0; cigar_i = 0; if(2 == sam_found) { // we used 'tmap_bwt_match_exact_alt_reverse' op = STATE_M; op_len = offset; } else { op = -1; op_len = 0; } while(0 <= i) { cur = stack->entry_pool[i]; if(len == cur->offset) break; if(op != cur->state) { if(STATE_M == op || STATE_D == op) { sam->aux.map1_aux->aln_ref += op_len; } op = cur->state; op_len = 1; } else { op_len++; } //fprintf(stderr, "cur->state=%c op_len=%d cur->prev_i=%d k=%u l=%u\n", "MIDS"[cur->state], op_len, cur->prev_i, cur->match_sa.k, cur->match_sa.l); i = cur->prev_i; } if(STATE_M == op || STATE_D == op) { sam->aux.map1_aux->aln_ref += op_len; } /* fprintf(stderr, "shadow 2 k=%u l=%u len=%d offset=%d last_diff_offset=%d\n", k, l, len, offset, e->last_diff_offset); fprintf(stderr, "e->n_mm=%d e->n_gapo=%d e->n_gape=%d\n", e->n_mm, e->n_gapo, e->n_gape); */ //tmap_map1_aux_stack_shadow(l - k + 1, bwt->seq_len, e->last_diff_offset, width_cur); width_cur_i = seed2_len - (len - e->last_diff_offset); tmap_map1_aux_stack_shadow(l - k + 1, seed2_len, width_cur_i, width_cur); if(opt->max_best_cals < best_cnt) { // ignore if too many "best" have been found occs[sams->n-1].l -= (best_cnt - opt->max_best_cals); // only save the maximum break; } } } else { int32_t allow_diff = 1, allow_mm = (e->n_mm < max_mm) ? 
1 : 0; // decrement the offset offset--; // use a bound for mismatches if(0 < offset) { int32_t seed_width_cur_i = offset - (len - opt->seed_length); width_cur_i = seed2_len - (len - offset); if(0 < width_cur_i) { if(m-1 < width_cur[width_cur_i-1].bid) { allow_diff = 0; } else if(width_cur[width_cur_i-1].bid == m-1 && width_cur[width_cur_i].bid == m-1 && width_cur[width_cur_i-1].w == width_cur[width_cur_i].w) { allow_mm = 0; } } if(0 < seed_width_cur_i) { if(NULL != seed_width_cur && 0 < seed_width_cur_i) { if(n_seed_mm-1 < seed_width_cur[seed_width_cur_i-1].bid) { allow_diff = 0; } else if(seed_width_cur[seed_width_cur_i-1].bid == n_seed_mm-1 && seed_width_cur[seed_width_cur_i].bid == n_seed_mm-1 && seed_width_cur[seed_width_cur_i-1].w == seed_width_cur[seed_width_cur_i].w) { allow_mm = 0; } } } } // retrieve the next SA interval tmap_bwt_match_hash_2occ4(bwt, &e->match_sa, match_sa_next, hash); // insertions/deletions if(allow_diff && opt->indel_ends_bound + e->n_gapo + e->n_gape <= offset && opt->indel_ends_bound + e->n_gapo + e->n_gape <= len - offset) { // check to add gaps if(STATE_M == e->state) { // gap open if(e->n_gapo < max_gapo) { // gap open is allowed // insertion tmap_map1_aux_stack_push(stack, offset, &match_sa_cur, e->n_mm, e->n_gapo + 1, e->n_gape, STATE_I, 1, e, opt); // deletion for(j = 0; j != 4; ++j) { if(match_sa_next[j].k <= match_sa_next[j].l) { // remember that a gap deletion does not consume a // read base, so use 'offset+1' tmap_map1_aux_stack_push(stack, offset+1, &match_sa_next[j], e->n_mm, e->n_gapo + 1, e->n_gape, STATE_D, 1, e, opt); } } } } else if(STATE_I == e->state) { // extension of an insertion if(e->n_gape < max_gape) { // gap extension is allowed tmap_map1_aux_stack_push(stack, offset, &match_sa_cur, e->n_mm, e->n_gapo, e->n_gape + 1, STATE_I, 1, e, opt); } } else if(STATE_D == e->state) { // extension of a deletion if(e->n_gape < max_gape) { if(e->n_gape + e->n_gapo < max_diff || e->match_sa.l - e->match_sa.k + 1 < 
opt->max_cals_del) { // gap extension is allowed for(j = 0; j != 4; ++j) { if(match_sa_next[j].k <= match_sa_next[j].l) { // remember that a gap deletion does not consume a // read base, so use 'offset+1' tmap_map1_aux_stack_push(stack, offset+1, &match_sa_next[j], e->n_mm, e->n_gapo, e->n_gape + 1, STATE_D, 1, e, opt); } } } } } } // mismatches if(1 == allow_mm && 1 == allow_diff) { // mismatches allowed for(j=0;j<4;j++) { int32_t c = (str[offset] + j) & 3; int32_t is_mm = (0 < j || 3 < str[offset]); if(match_sa_next[c].k <= match_sa_next[c].l) { tmap_map1_aux_stack_push(stack, offset, &match_sa_next[c], e->n_mm + is_mm, e->n_gapo, e->n_gape, STATE_M, is_mm, e, opt); } } } else if(str[offset] < 4) { // try exact match only int32_t c = str[offset] & 3; if(match_sa_next[c].k <= match_sa_next[c].l) { tmap_map1_aux_stack_push(stack, offset, &match_sa_next[c], e->n_mm, e->n_gapo, e->n_gape, STATE_M, 0, e, opt); } } } } return tmap_map1_sam_to_real(sams, occs, bases, seed2_len, refseq, bwt, sa, hash, opt); }
/**
 * Pushes a new search state onto the stack, filing it in the bin that
 * corresponds to its alignment score.
 *
 * The entry is taken from the stack's entry pool (which is grown when it is
 * exhausted), filled in from the arguments, and appended to
 * stack->bins[score], where score = aln_score(n_mm, n_gapo, n_gape, opt).
 * The bin array is grown when the score does not fit, and stack->best_score
 * is lowered when the new score is smaller.
 *
 * @param stack          the stack to push onto
 * @param offset         stored as the entry's offset
 * @param match_sa_prev  SA interval copied (by value) into the entry
 * @param n_mm           number of mismatches recorded in the entry
 * @param n_gapo         number of gap opens recorded in the entry
 * @param n_gape         number of gap extensions recorded in the entry
 * @param state          state recorded in the entry (callers pass STATE_M,
 *                       STATE_I or STATE_D)
 * @param is_diff        when 1, the entry's last_diff_offset is set to offset;
 *                       otherwise it is inherited from prev_entry
 * @param prev_entry     the entry this one extends, or NULL for a root entry
 *                       (prev_i is then -1 and last_diff_offset is offset)
 * @param opt            options forwarded to aln_score
 */
static inline void
tmap_map1_aux_stack_push(tmap_map1_aux_stack_t *stack, int32_t offset, tmap_bwt_match_occ_t *match_sa_prev, int32_t n_mm, int32_t n_gapo, int32_t n_gape, int32_t state, int32_t is_diff, tmap_map1_aux_stack_entry_t *prev_entry, const tmap_map_opt_t *opt)
{
    int32_t i;
    int32_t n_bins_needed = 0;
    tmap_map1_aux_stack_entry_t *entry = NULL;
    tmap_map1_aux_bin_t *bin = NULL;

    // check to see if we need more memory
    if(stack->entry_pool_length <= stack->entry_pool_i) {
        // NOTE(review): growing by a shift assumes entry_pool_length is
        // non-zero here -- TODO confirm against the stack initializer
        i = stack->entry_pool_length;
        stack->entry_pool_length <<= 2;
        stack->entry_pool = tmap_realloc(stack->entry_pool, sizeof(tmap_map1_aux_stack_entry_t*)*stack->entry_pool_length, "stack->entry_pool");
        // allocate the entries for the newly added pool slots
        while(i<stack->entry_pool_length) {
            stack->entry_pool[i] = tmap_malloc(sizeof(tmap_map1_aux_stack_entry_t), "stack->entry_pool[i]");
            i++;
        }
    }

    // fill in the next free entry of the pool
    entry = stack->entry_pool[stack->entry_pool_i];
    entry->score = aln_score(n_mm, n_gapo, n_gape, opt);
    entry->n_mm = n_mm;
    entry->n_gapo = n_gapo;
    entry->n_gape = n_gape;
    entry->state = state;
    entry->match_sa = (*match_sa_prev);
    entry->i = stack->entry_pool_i;
    entry->offset = offset;
    if(NULL == prev_entry) {
        entry->last_diff_offset = offset;
        entry->prev_i = -1;
    }
    else {
        entry->last_diff_offset = (1 == is_diff) ? (offset) : prev_entry->last_diff_offset;
        entry->prev_i = prev_entry->i;
    }

    // resize the bins if necessary
    if(stack->n_bins <= entry->score) {
        n_bins_needed = entry->score + 1;
        // realloc
        tmap_roundup32(n_bins_needed);
        stack->bins = tmap_realloc(stack->bins, sizeof(tmap_map1_aux_bin_t) * n_bins_needed, "stack->bins");
        // initialize
        for(i=stack->n_bins;i<n_bins_needed;i++) {
            stack->bins[i].n_entries = stack->bins[i].m_entries = 0;
            stack->bins[i].entries = NULL;
        }
        stack->n_bins = n_bins_needed;
    }
    // sanity check: the bin for this score must exist after the resize
    if(stack->n_bins <= entry->score) {
        tmap_bug();
    }
    bin = &stack->bins[entry->score];

    // Duplicate entries (same SA interval, offset and state; most likely
    // formed by tandem repeats or indels) are deliberately not removed here:
    // scanning the bin is too computationally expensive, and not necessary.

    // update best score
    if(stack->best_score > entry->score) {
        stack->best_score = entry->score;
    }

    if(bin->m_entries <= bin->n_entries) {
        bin->m_entries++;
        tmap_roundup32(bin->m_entries);
        // the array stores pointers to entries, so size it by its element
        // type rather than by the bin structure
        bin->entries = tmap_realloc(bin->entries, sizeof(*bin->entries) * bin->m_entries, "bin->entries");
    }
    bin->entries[bin->n_entries] = entry;
    bin->n_entries++;

    stack->entry_pool_i++;
    stack->n_entries++;
}
uint64_t tmap_refseq_fasta2pac(const char *fn_fasta, int32_t compression) { tmap_file_t *fp_pac = NULL, *fp_anno = NULL; tmap_seq_io_t *seqio = NULL; tmap_seq_t *seq = NULL; tmap_refseq_t *refseq = NULL; char *fn_pac = NULL, *fn_anno = NULL; uint8_t buffer[TMAP_REFSEQ_BUFFER_SIZE]; int32_t i, j, l, buffer_length; uint32_t num_IUPAC_found= 0, amb_bases_mem = 0; uint8_t x = 0; uint64_t ref_len; tmap_progress_print("packing the reference FASTA"); refseq = tmap_calloc(1, sizeof(tmap_refseq_t), "refseq"); refseq->version_id = TMAP_VERSION_ID; refseq->package_version = tmap_string_clone2(PACKAGE_VERSION); refseq->seq = buffer; // IMPORTANT: must nullify later refseq->annos = NULL; refseq->num_annos = 0; refseq->len = 0; refseq->is_rev = 0; refseq->is_shm = 0; memset(buffer, 0, TMAP_REFSEQ_BUFFER_SIZE); buffer_length = 0; // input files seqio = tmap_seq_io_init(fn_fasta, TMAP_SEQ_TYPE_FQ, 0, compression); seq = tmap_seq_init(TMAP_SEQ_TYPE_FQ); // output files fn_pac = tmap_get_file_name(fn_fasta, TMAP_PAC_FILE); fp_pac = tmap_file_fopen(fn_pac, "wb", TMAP_PAC_COMPRESSION); // read in sequences while(0 <= (l = tmap_seq_io_read(seqio, seq))) { tmap_anno_t *anno = NULL; tmap_progress_print2("packing contig [%s:1-%d]", seq->data.fq->name->s, l); refseq->num_annos++; refseq->annos = tmap_realloc(refseq->annos, sizeof(tmap_anno_t)*refseq->num_annos, "refseq->annos"); anno = &refseq->annos[refseq->num_annos-1]; anno->name = tmap_string_clone(seq->data.fq->name); anno->len = l; anno->offset = (1 == refseq->num_annos) ? 
0 : refseq->annos[refseq->num_annos-2].offset + refseq->annos[refseq->num_annos-2].len; anno->amb_positions_start = NULL; anno->amb_positions_end = NULL; anno->amb_bases = NULL; anno->num_amb = 0; amb_bases_mem = 0; // fill the buffer for(i=0;i<l;i++) { uint8_t c = tmap_nt_char_to_int[(int)seq->data.fq->seq->s[i]]; // handle IUPAC codes if(4 <= c) { int32_t k; // warn users about IUPAC codes if(0 == num_IUPAC_found) { tmap_error("IUPAC codes were found and will be converted to non-matching DNA bases", Warn, OutOfRange); for(j=4;j<15;j++) { c = tmap_iupac_char_to_bit_string[(int)tmap_iupac_int_to_char[j]]; // get the lexicographically smallest base not compatible with this code for(k=0;k<4;k++) { if(!(c & (0x1 << k))) { break; } } tmap_progress_print2("IUPAC code %c will be converted to %c", tmap_iupac_int_to_char[j], "ACGTN"[k & 3]); } } num_IUPAC_found++; // change it to a mismatched base than the IUPAC code c = tmap_iupac_char_to_bit_string[(int)seq->data.fq->seq->s[i]]; // store IUPAC bases if(amb_bases_mem <= anno->num_amb) { // allocate more memory if necessary amb_bases_mem = anno->num_amb + 1; tmap_roundup32(amb_bases_mem); anno->amb_positions_start = tmap_realloc(anno->amb_positions_start, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_start"); anno->amb_positions_end = tmap_realloc(anno->amb_positions_end, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_end"); anno->amb_bases = tmap_realloc(anno->amb_bases, sizeof(uint8_t) * amb_bases_mem, "anno->amb_bases"); } // encode stretches of the same base if(0 < anno->num_amb && anno->amb_positions_end[anno->num_amb-1] == i && anno->amb_bases[anno->num_amb-1] == tmap_iupac_char_to_int[(int)seq->data.fq->seq->s[i]]) { anno->amb_positions_end[anno->num_amb-1]++; // expand the range } else { // new ambiguous base and range anno->num_amb++; anno->amb_positions_start[anno->num_amb-1] = i+1; // one-based anno->amb_positions_end[anno->num_amb-1] = i+1; // one-based anno->amb_bases[anno->num_amb-1] = 
tmap_iupac_char_to_int[(int)seq->data.fq->seq->s[i]]; } // get the lexicographically smallest base not compatible with // this code for(j=0;j<4;j++) { if(!(c & (0x1 << j))) { break; } } c = j & 3; // Note: Ns will go to As } if(3 < c) { tmap_error("bug encountered", Exit, OutOfRange); } if(buffer_length == (TMAP_REFSEQ_BUFFER_SIZE << 2)) { // 2-bit if(tmap_refseq_seq_memory(buffer_length) != tmap_file_fwrite(buffer, sizeof(uint8_t), tmap_refseq_seq_memory(buffer_length), fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } memset(buffer, 0, TMAP_REFSEQ_BUFFER_SIZE); buffer_length = 0; } tmap_refseq_seq_store_i(refseq, buffer_length, c); buffer_length++; } refseq->len += l; // re-size the amibiguous bases if(anno->num_amb < amb_bases_mem) { amb_bases_mem = anno->num_amb; anno->amb_positions_start = tmap_realloc(anno->amb_positions_start, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_start"); anno->amb_positions_end = tmap_realloc(anno->amb_positions_end, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_end"); anno->amb_bases = tmap_realloc(anno->amb_bases, sizeof(uint8_t) * amb_bases_mem, "anno->amb_bases"); } } // write out the buffer if(tmap_refseq_seq_memory(buffer_length) != tmap_file_fwrite(buffer, sizeof(uint8_t), tmap_refseq_seq_memory(buffer_length), fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } if(refseq->len % 4 == 0) { // add an extra byte if we completely filled all bits if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } } // store number of unused bits at the last byte x = refseq->len % 4; if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } refseq->seq = NULL; // IMPORTANT: nullify this ref_len = refseq->len; // save for return tmap_progress_print2("total genome length [%u]", refseq->len); if(0 < num_IUPAC_found) { if(1 == num_IUPAC_found) { tmap_progress_print("%u IUPAC base was found and converted to a DNA base", 
num_IUPAC_found); } else { tmap_progress_print("%u IUPAC bases were found and converted to DNA bases", num_IUPAC_found); } } // write annotation file fn_anno = tmap_get_file_name(fn_fasta, TMAP_ANNO_FILE); fp_anno = tmap_file_fopen(fn_anno, "wb", TMAP_ANNO_COMPRESSION); tmap_refseq_write_anno(fp_anno, refseq); // close files tmap_file_fclose(fp_pac); tmap_file_fclose(fp_anno); // check sequence name uniqueness for(i=0;i<refseq->num_annos;i++) { for(j=i+1;j<refseq->num_annos;j++) { if(0 == strcmp(refseq->annos[i].name->s, refseq->annos[j].name->s)) { tmap_file_fprintf(tmap_file_stderr, "Contigs have the same name: #%d [%s] and #%d [%s]\n", i+1, refseq->annos[i].name->s, j+1, refseq->annos[j].name->s); tmap_error("Contig names must be unique", Exit, OutOfRange); } } } tmap_refseq_destroy(refseq); tmap_seq_io_destroy(seqio); tmap_seq_destroy(seq); free(fn_pac); free(fn_anno); tmap_progress_print2("packed the reference FASTA"); tmap_refseq_pac2revpac(fn_fasta); return ref_len; }
/**
 * Rebuilds the CIGAR of a BAM record from an alignment, then updates its MD
 * tag.
 *
 * The operation type of every alignment column is obtained from
 * tmap_sam_get_type(ref[i], read[i]); consecutive columns of the same type
 * are merged into one CIGAR operation.  A soft clip found in the first and/or
 * last position of the existing CIGAR is kept, and the new operations are
 * written between them.  The record's data block is shrunk or grown in place
 * (reallocating when needed) to fit the new number of operations.
 *
 * @param b     the BAM record to modify
 * @param ref   the reference side of the alignment, at least len characters
 *              (presumably gap-padded -- confirm against tmap_sam_get_type)
 * @param read  the read side of the alignment, at least len characters
 * @param len   the number of alignment columns; ref[0] and read[0] are read
 *              unconditionally, so it is expected to be at least 1
 */
void
tmap_sam_update_cigar_and_md(bam1_t *b, char *ref, char *read, int32_t len)
{
    int32_t i, n_cigar, last_type;
    uint32_t *cigar;
    int32_t diff;
    int32_t soft_clip_start_i, soft_clip_end_i;
    int32_t shift_start;

    if(b->data_len - b->l_aux != bam1_aux(b) - b->data) {
        tmap_error("b->data_len - b->l_aux != bam1_aux(b) - b->data", Exit, OutOfRange);
    }

    // keep track of soft clipping
    n_cigar = soft_clip_start_i = soft_clip_end_i = 0;
    cigar = bam1_cigar(b);
    // only inspect cigar[0] when the record actually has a cigar
    if(0 < b->core.n_cigar && BAM_CSOFT_CLIP == TMAP_SW_CIGAR_OP(cigar[0])) {
        soft_clip_start_i = 1;
        n_cigar++;
    }
    if(1 < b->core.n_cigar && BAM_CSOFT_CLIP == TMAP_SW_CIGAR_OP(cigar[b->core.n_cigar-1])) {
        soft_clip_end_i = 1;
        n_cigar++;
    }
    cigar = NULL;

    // get the # of cigar operators
    last_type = tmap_sam_get_type(ref[0], read[0]);
    n_cigar++;
    for(i=1;i<len;i++) {
        int32_t cur_type = tmap_sam_get_type(ref[i], read[i]);
        if(cur_type != last_type) {
            n_cigar++;
        }
        last_type = cur_type;
    }

    // The data after the query name is shifted to make the cigar fit.  The
    // shift starts after a leading soft clip so that the clip stays in
    // cigar[0]; starting at l_qname would overwrite it when shrinking.  A
    // trailing soft clip sits at the end of the old cigar and is carried to
    // the end of the new one by the shift itself.
    shift_start = (int32_t)b->core.l_qname + (int32_t)sizeof(uint32_t) * soft_clip_start_i;

    // resize the data field if necessary
    if(n_cigar < b->core.n_cigar) {
        diff = sizeof(uint32_t) * (b->core.n_cigar - n_cigar);
        // shift down
        for(i=shift_start;i<b->data_len - diff;i++) {
            b->data[i] = b->data[i + diff];
        }
        b->data_len -= diff;
        b->core.n_cigar = n_cigar;
    }
    else if(b->core.n_cigar < n_cigar) {
        diff = sizeof(uint32_t) * (n_cigar - b->core.n_cigar);
        // realloc
        if(b->m_data <= (b->data_len + diff)) {
            b->m_data = b->data_len + diff + 1;
            tmap_roundup32(b->m_data);
            b->data = tmap_realloc(b->data, sizeof(uint8_t) * b->m_data, "b->data");
        }
        // shift up (from the end, so that no byte is overwritten before it
        // has been moved)
        for(i=b->data_len-1;shift_start<=i;i--) {
            b->data[i + diff] = b->data[i];
        }
        b->data_len += diff;
        b->core.n_cigar = n_cigar;
    }
    if(b->data_len - b->l_aux != bam1_aux(b) - b->data) {
        tmap_error("b->data_len - b->l_aux != bam1_aux(b) - b->data", Exit, OutOfRange);
    }

    // create the cigar
    cigar = bam1_cigar(b);
    // clear every slot between the preserved soft clips
    for(i=soft_clip_start_i;i<n_cigar-soft_clip_end_i;i++) {
        cigar[i] = 0;
    }
    n_cigar = soft_clip_start_i; // skip over soft clipping etc.
    last_type = tmap_sam_get_type(ref[0], read[0]);
    TMAP_SW_CIGAR_STORE(cigar[n_cigar], last_type, 1);
    for(i=1;i<len;i++) {
        int32_t cur_type = tmap_sam_get_type(ref[i], read[i]);
        if(cur_type == last_type) {
            // add to the cigar length
            TMAP_SW_CIGAR_ADD_LENGTH(cigar[n_cigar], 1);
        }
        else {
            // add to the cigar
            n_cigar++;
            TMAP_SW_CIGAR_STORE(cigar[n_cigar], cur_type, 1);
        }
        last_type = cur_type;
    }

    // Note: the md tag must be updated
    tmap_sam_md1(b, ref, len);
}