/**
 * Prepares an existing stack for reuse.
 *
 * Rewinds the entry pool index, resets the best score to INT32_MAX, empties
 * every bin, and grows the bin array when the score computed from the supplied
 * limits needs more bins than are currently allocated.  If the entry pool has
 * grown beyond TMAP_MAP1_AUX_STACK_TOO_BIG it is torn down and re-initialized
 * through the destroy/init helpers.
 *
 * @param  stack     the stack to reset
 * @param  max_mm    limit passed (plus one) as the first count to aln_score
 * @param  max_gapo  limit passed (plus one) as the second count to aln_score
 * @param  max_gape  limit passed (plus one) as the third count to aln_score
 * @param  opt       the options forwarded to aln_score
 * @return           the same pointer that was passed in as `stack`
 */
static tmap_map1_aux_stack_t*
tmap_map1_aux_stack_reset(tmap_map1_aux_stack_t *stack,
                          int32_t max_mm, int32_t max_gapo, int32_t max_gape,
                          const tmap_map_opt_t *opt)
{
  int32_t bin_i;
  int32_t bins_wanted;

  // rewind to the start of the memory pool
  stack->entry_pool_i = 0;
  stack->best_score = INT32_MAX;

  // an oversized pool is released and rebuilt
  if(stack->entry_pool_length > TMAP_MAP1_AUX_STACK_TOO_BIG) {
      tmap_map1_aux_stack_destroy_helper(stack, 0);
      tmap_map1_aux_stack_init_helper(stack);
  }

  // empty each bin; only the entry count is reset, the arrays are left alone
  bin_i = 0;
  while(bin_i < stack->n_bins) {
      stack->bins[bin_i].n_entries = 0;
      bin_i++;
  }

  // one bin per possible score: grow the bin array if it is too short
  bins_wanted = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt);
  if(bins_wanted > stack->n_bins) {
      tmap_roundup32(bins_wanted);
      stack->bins = tmap_realloc(stack->bins, sizeof(tmap_map1_aux_bin_t) * bins_wanted, "stack->bins");
      // the newly added bins start out empty
      for(bin_i=stack->n_bins;bin_i<bins_wanted;bin_i++) {
          stack->bins[bin_i].m_entries = 0;
          stack->bins[bin_i].n_entries = 0;
          stack->bins[bin_i].entries = NULL;
      }
      stack->n_bins = bins_wanted;
  }

  stack->n_entries = 0;

  return stack;
}
/**
 * Shrinks or grows the hit array of an alignment container.
 *
 * - n below the current hit count: the hits at indices [n, a->n) are
 *   nullified and a->n becomes n.
 * - n above the current capacity: the array is reallocated (capacity 4 when
 *   the container had no capacity and n < 4, otherwise n rounded up by
 *   tmap_roundup32) and every newly added slot is nullified.  a->n is left
 *   untouched on this path.
 * - otherwise nothing happens.
 *
 * @param  a  the container to resize; a NULL pointer is ignored
 * @param  n  the requested number of hits
 */
void
tmap_map2_aln_realloc(tmap_map2_aln_t *a, int32_t n)
{
  int32_t idx;

  if(NULL == a) return;

  if(a->n > n) { // shrink: drop the trailing hits
      idx = n;
      while(idx < a->n) {
          tmap_map2_hit_nullify(&a->hits[idx]);
          idx++;
      }
      a->n = n;
      return;
  }

  if(n <= a->max) return; // capacity is already sufficient

  // grow: remember the old capacity so only the new slots are initialized
  idx = a->max;
  if(0 == a->max && n < 4) {
      a->max = 4;
  }
  else {
      a->max = tmap_roundup32(n);
  }
  a->hits = tmap_realloc(a->hits, sizeof(tmap_map2_hit_t) * a->max, "a->hits");
  for(;idx<a->max;idx++) {
      tmap_map2_hit_nullify(&a->hits[idx]);
  }
}
/**
 * Takes the next entry from the stack's entry pool, fills it in, and appends
 * it to the bin indexed by its score.
 *
 * The entry pool is enlarged (length shifted left by two, i.e. quadrupled)
 * when it is exhausted, the bin array is enlarged when the score does not fit,
 * and the stack's best (smallest) score is updated.
 *
 * @param  stack          the stack to push onto
 * @param  offset         stored as the entry's offset
 * @param  match_sa_prev  copied by value into the entry's match_sa
 * @param  n_mm           stored in the entry; first count given to aln_score
 * @param  n_gapo         stored in the entry; second count given to aln_score
 * @param  n_gape         stored in the entry; third count given to aln_score
 * @param  state          stored as the entry's state
 * @param  is_diff        when 1, `offset` becomes the entry's last_diff_offset;
 *                        otherwise the value is inherited from prev_entry
 * @param  prev_entry     the preceding entry, or NULL when there is none
 * @param  opt            the options forwarded to aln_score
 */
static inline void
tmap_map1_aux_stack_push(tmap_map1_aux_stack_t *stack, int32_t offset,
                         tmap_bwt_match_occ_t *match_sa_prev,
                         int32_t n_mm, int32_t n_gapo, int32_t n_gape,
                         int32_t state, int32_t is_diff,
                         tmap_map1_aux_stack_entry_t *prev_entry,
                         const tmap_map_opt_t *opt)
{
  int32_t i;
  int32_t n_bins_needed = 0;
  tmap_map1_aux_stack_entry_t *entry = NULL;
  tmap_map1_aux_bin_t *bin = NULL;

  // check to see if we need more memory
  // NOTE(review): assumes entry_pool_length is non-zero here (presumably set
  // by the init helper); a zero length cannot grow by shifting
  if(stack->entry_pool_length <= stack->entry_pool_i) {
      i = stack->entry_pool_length; // first slot without an allocated entry
      stack->entry_pool_length <<= 2;
      stack->entry_pool = tmap_realloc(stack->entry_pool,
                                       sizeof(tmap_map1_aux_stack_entry_t*)*stack->entry_pool_length,
                                       "stack->entry_pool");
      for(;i<stack->entry_pool_length;i++) {
          stack->entry_pool[i] = tmap_malloc(sizeof(tmap_map1_aux_stack_entry_t), "stack->entry_pool[i]");
      }
  }

  entry = stack->entry_pool[stack->entry_pool_i];
  entry->score = aln_score(n_mm, n_gapo, n_gape, opt);
  entry->n_mm = n_mm;
  entry->n_gapo = n_gapo;
  entry->n_gape = n_gape;
  entry->state = state;
  entry->match_sa = (*match_sa_prev);
  entry->i = stack->entry_pool_i;
  entry->offset = offset;
  if(NULL == prev_entry) {
      entry->last_diff_offset = offset;
      entry->prev_i = -1;
  }
  else {
      entry->last_diff_offset = (1 == is_diff) ? (offset) : prev_entry->last_diff_offset;
      entry->prev_i = prev_entry->i;
  }

  // resize the bins if necessary; bins are indexed by score
  if(stack->n_bins <= entry->score) {
      n_bins_needed = entry->score + 1;
      // realloc
      tmap_roundup32(n_bins_needed);
      stack->bins = tmap_realloc(stack->bins, sizeof(tmap_map1_aux_bin_t) * n_bins_needed, "stack->bins");
      // initialize the new bins as empty
      for(i=stack->n_bins;i<n_bins_needed;i++) {
          stack->bins[i].n_entries = stack->bins[i].m_entries = 0;
          stack->bins[i].entries = NULL;
      }
      stack->n_bins = n_bins_needed;
  }
  // sanity check: the score must index a valid bin after the resize
  if(stack->n_bins <= entry->score) {
      tmap_bug();
  }
  bin = &stack->bins[entry->score];

  // Note: duplicate entries (most likely formed by tandem repeats or indels)
  // are intentionally not removed; checking for them is too computationally
  // expensive and not necessary.

  // update best score
  if(stack->best_score > entry->score) stack->best_score = entry->score;

  if(bin->m_entries <= bin->n_entries) {
      bin->m_entries++;
      tmap_roundup32(bin->m_entries);
      // bin->entries is an array of entry pointers, so size it by its own
      // element type rather than by the (larger) bin structure
      bin->entries = tmap_realloc(bin->entries, sizeof(*bin->entries) * bin->m_entries, "bin->entries");
  }
  bin->entries[bin->n_entries] = entry;
  bin->n_entries++;

  stack->entry_pool_i++;
  stack->n_entries++;
}
uint64_t tmap_refseq_fasta2pac(const char *fn_fasta, int32_t compression) { tmap_file_t *fp_pac = NULL, *fp_anno = NULL; tmap_seq_io_t *seqio = NULL; tmap_seq_t *seq = NULL; tmap_refseq_t *refseq = NULL; char *fn_pac = NULL, *fn_anno = NULL; uint8_t buffer[TMAP_REFSEQ_BUFFER_SIZE]; int32_t i, j, l, buffer_length; uint32_t num_IUPAC_found= 0, amb_bases_mem = 0; uint8_t x = 0; uint64_t ref_len; tmap_progress_print("packing the reference FASTA"); refseq = tmap_calloc(1, sizeof(tmap_refseq_t), "refseq"); refseq->version_id = TMAP_VERSION_ID; refseq->package_version = tmap_string_clone2(PACKAGE_VERSION); refseq->seq = buffer; // IMPORTANT: must nullify later refseq->annos = NULL; refseq->num_annos = 0; refseq->len = 0; refseq->is_rev = 0; refseq->is_shm = 0; memset(buffer, 0, TMAP_REFSEQ_BUFFER_SIZE); buffer_length = 0; // input files seqio = tmap_seq_io_init(fn_fasta, TMAP_SEQ_TYPE_FQ, 0, compression); seq = tmap_seq_init(TMAP_SEQ_TYPE_FQ); // output files fn_pac = tmap_get_file_name(fn_fasta, TMAP_PAC_FILE); fp_pac = tmap_file_fopen(fn_pac, "wb", TMAP_PAC_COMPRESSION); // read in sequences while(0 <= (l = tmap_seq_io_read(seqio, seq))) { tmap_anno_t *anno = NULL; tmap_progress_print2("packing contig [%s:1-%d]", seq->data.fq->name->s, l); refseq->num_annos++; refseq->annos = tmap_realloc(refseq->annos, sizeof(tmap_anno_t)*refseq->num_annos, "refseq->annos"); anno = &refseq->annos[refseq->num_annos-1]; anno->name = tmap_string_clone(seq->data.fq->name); anno->len = l; anno->offset = (1 == refseq->num_annos) ? 
0 : refseq->annos[refseq->num_annos-2].offset + refseq->annos[refseq->num_annos-2].len; anno->amb_positions_start = NULL; anno->amb_positions_end = NULL; anno->amb_bases = NULL; anno->num_amb = 0; amb_bases_mem = 0; // fill the buffer for(i=0;i<l;i++) { uint8_t c = tmap_nt_char_to_int[(int)seq->data.fq->seq->s[i]]; // handle IUPAC codes if(4 <= c) { int32_t k; // warn users about IUPAC codes if(0 == num_IUPAC_found) { tmap_error("IUPAC codes were found and will be converted to non-matching DNA bases", Warn, OutOfRange); for(j=4;j<15;j++) { c = tmap_iupac_char_to_bit_string[(int)tmap_iupac_int_to_char[j]]; // get the lexicographically smallest base not compatible with this code for(k=0;k<4;k++) { if(!(c & (0x1 << k))) { break; } } tmap_progress_print2("IUPAC code %c will be converted to %c", tmap_iupac_int_to_char[j], "ACGTN"[k & 3]); } } num_IUPAC_found++; // change it to a mismatched base than the IUPAC code c = tmap_iupac_char_to_bit_string[(int)seq->data.fq->seq->s[i]]; // store IUPAC bases if(amb_bases_mem <= anno->num_amb) { // allocate more memory if necessary amb_bases_mem = anno->num_amb + 1; tmap_roundup32(amb_bases_mem); anno->amb_positions_start = tmap_realloc(anno->amb_positions_start, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_start"); anno->amb_positions_end = tmap_realloc(anno->amb_positions_end, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_end"); anno->amb_bases = tmap_realloc(anno->amb_bases, sizeof(uint8_t) * amb_bases_mem, "anno->amb_bases"); } // encode stretches of the same base if(0 < anno->num_amb && anno->amb_positions_end[anno->num_amb-1] == i && anno->amb_bases[anno->num_amb-1] == tmap_iupac_char_to_int[(int)seq->data.fq->seq->s[i]]) { anno->amb_positions_end[anno->num_amb-1]++; // expand the range } else { // new ambiguous base and range anno->num_amb++; anno->amb_positions_start[anno->num_amb-1] = i+1; // one-based anno->amb_positions_end[anno->num_amb-1] = i+1; // one-based anno->amb_bases[anno->num_amb-1] = 
tmap_iupac_char_to_int[(int)seq->data.fq->seq->s[i]]; } // get the lexicographically smallest base not compatible with // this code for(j=0;j<4;j++) { if(!(c & (0x1 << j))) { break; } } c = j & 3; // Note: Ns will go to As } if(3 < c) { tmap_error("bug encountered", Exit, OutOfRange); } if(buffer_length == (TMAP_REFSEQ_BUFFER_SIZE << 2)) { // 2-bit if(tmap_refseq_seq_memory(buffer_length) != tmap_file_fwrite(buffer, sizeof(uint8_t), tmap_refseq_seq_memory(buffer_length), fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } memset(buffer, 0, TMAP_REFSEQ_BUFFER_SIZE); buffer_length = 0; } tmap_refseq_seq_store_i(refseq, buffer_length, c); buffer_length++; } refseq->len += l; // re-size the amibiguous bases if(anno->num_amb < amb_bases_mem) { amb_bases_mem = anno->num_amb; anno->amb_positions_start = tmap_realloc(anno->amb_positions_start, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_start"); anno->amb_positions_end = tmap_realloc(anno->amb_positions_end, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_end"); anno->amb_bases = tmap_realloc(anno->amb_bases, sizeof(uint8_t) * amb_bases_mem, "anno->amb_bases"); } } // write out the buffer if(tmap_refseq_seq_memory(buffer_length) != tmap_file_fwrite(buffer, sizeof(uint8_t), tmap_refseq_seq_memory(buffer_length), fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } if(refseq->len % 4 == 0) { // add an extra byte if we completely filled all bits if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } } // store number of unused bits at the last byte x = refseq->len % 4; if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } refseq->seq = NULL; // IMPORTANT: nullify this ref_len = refseq->len; // save for return tmap_progress_print2("total genome length [%u]", refseq->len); if(0 < num_IUPAC_found) { if(1 == num_IUPAC_found) { tmap_progress_print("%u IUPAC base was found and converted to a DNA base", 
num_IUPAC_found); } else { tmap_progress_print("%u IUPAC bases were found and converted to DNA bases", num_IUPAC_found); } } // write annotation file fn_anno = tmap_get_file_name(fn_fasta, TMAP_ANNO_FILE); fp_anno = tmap_file_fopen(fn_anno, "wb", TMAP_ANNO_COMPRESSION); tmap_refseq_write_anno(fp_anno, refseq); // close files tmap_file_fclose(fp_pac); tmap_file_fclose(fp_anno); // check sequence name uniqueness for(i=0;i<refseq->num_annos;i++) { for(j=i+1;j<refseq->num_annos;j++) { if(0 == strcmp(refseq->annos[i].name->s, refseq->annos[j].name->s)) { tmap_file_fprintf(tmap_file_stderr, "Contigs have the same name: #%d [%s] and #%d [%s]\n", i+1, refseq->annos[i].name->s, j+1, refseq->annos[j].name->s); tmap_error("Contig names must be unique", Exit, OutOfRange); } } } tmap_refseq_destroy(refseq); tmap_seq_io_destroy(seqio); tmap_seq_destroy(seq); free(fn_pac); free(fn_anno); tmap_progress_print2("packed the reference FASTA"); tmap_refseq_pac2revpac(fn_fasta); return ref_len; }
/**
 * Rebuilds the CIGAR of a BAM record from a pair of aligned strings and then
 * refreshes its MD tag.
 *
 * A soft-clip operator at the very start and/or very end of the existing
 * CIGAR is kept; every operator in between is regenerated by classifying each
 * aligned column with tmap_sam_get_type and run-length encoding the result.
 * The record's data block is shrunk or grown in place (reallocating when
 * needed) so that it holds exactly the new number of operators.
 *
 * @param  b     the BAM record to update
 * @param  ref   the aligned reference string; indices [0, len) are read
 * @param  read  the aligned read string; indices [0, len) are read
 * @param  len   the number of aligned columns; must be at least 1, because
 *               column 0 is read unconditionally
 */
void
tmap_sam_update_cigar_and_md(bam1_t *b, char *ref, char *read, int32_t len)
{
  int32_t i, n_cigar, last_type;
  uint32_t *cigar;
  int32_t diff;
  int32_t soft_clip_start_i, soft_clip_end_i;
  int32_t shift_start;

  if(b->data_len - b->l_aux != bam1_aux(b) - b->data) {
      tmap_error("b->data_len - b->l_aux != bam1_aux(b) - b->data", Exit, OutOfRange);
  }

  // keep track of soft clipping
  n_cigar = soft_clip_start_i = soft_clip_end_i = 0;
  cigar = bam1_cigar(b);
  // only inspect cigar[0] when the record actually has a cigar operator
  if(0 < b->core.n_cigar && BAM_CSOFT_CLIP == TMAP_SW_CIGAR_OP(cigar[0])) {
      soft_clip_start_i = 1;
      n_cigar++;
  }
  if(1 < b->core.n_cigar && BAM_CSOFT_CLIP == TMAP_SW_CIGAR_OP(cigar[b->core.n_cigar-1])) {
      soft_clip_end_i = 1;
      n_cigar++;
  }
  cigar = NULL; // the data block may be reallocated below

  // get the # of cigar operators
  last_type = tmap_sam_get_type(ref[0], read[0]);
  n_cigar++;
  for(i=1;i<len;i++) {
      int32_t cur_type = tmap_sam_get_type(ref[i], read[i]);
      if(cur_type != last_type) {
          n_cigar++;
      }
      last_type = cur_type;
  }

  // Byte offset of the first cigar operator that may be discarded or
  // displaced.  A leading soft clip must stay in cigar[0], so the shift
  // begins right after it; shifting from the very start of the cigar would
  // overwrite the leading soft clip when the cigar shrinks.
  shift_start = b->core.l_qname + (int32_t)sizeof(uint32_t) * soft_clip_start_i;

  // resize the data field if necessary
  if(n_cigar < b->core.n_cigar) {
      diff = sizeof(uint32_t) * (b->core.n_cigar - n_cigar);
      // shift down
      memmove(b->data + shift_start, b->data + shift_start + diff,
              (size_t)(b->data_len - diff - shift_start));
      b->data_len -= diff;
      b->core.n_cigar = n_cigar;
  }
  else if(b->core.n_cigar < n_cigar) {
      diff = sizeof(uint32_t) * (n_cigar - b->core.n_cigar);
      // realloc
      if(b->m_data <= (b->data_len + diff)) {
          b->m_data = b->data_len + diff + 1;
          tmap_roundup32(b->m_data);
          b->data = tmap_realloc(b->data, sizeof(uint8_t) * b->m_data, "b->data");
      }
      // shift up
      memmove(b->data + shift_start + diff, b->data + shift_start,
              (size_t)(b->data_len - shift_start));
      b->data_len += diff;
      b->core.n_cigar = n_cigar;
  }
  if(b->data_len - b->l_aux != bam1_aux(b) - b->data) {
      tmap_error("b->data_len - b->l_aux != bam1_aux(b) - b->data", Exit, OutOfRange);
  }

  // create the cigar: clear everything between the preserved soft clips
  cigar = bam1_cigar(b);
  for(i=soft_clip_start_i;i<n_cigar-soft_clip_end_i;i++) {
      cigar[i] = 0;
  }
  n_cigar = soft_clip_start_i; // skip over soft clipping etc.
  last_type = tmap_sam_get_type(ref[0], read[0]);
  TMAP_SW_CIGAR_STORE(cigar[n_cigar], last_type, 1);
  for(i=1;i<len;i++) {
      int32_t cur_type = tmap_sam_get_type(ref[i], read[i]);
      if(cur_type == last_type) {
          // add to the cigar length
          TMAP_SW_CIGAR_ADD_LENGTH(cigar[n_cigar], 1);
      }
      else {
          // add to the cigar
          n_cigar++;
          TMAP_SW_CIGAR_STORE(cigar[n_cigar], cur_type, 1);
      }
      last_type = cur_type;
  }

  // Note: the md tag must be updated
  tmap_sam_md1(b, ref, len);
}