/*!
  Frees a sequence collection: every entry in seqs->seqs[0 .. seqs->m-1] is
  handed to tmap_seq_destroy(), then the pointer array and the container
  itself are released.
  @param  seqs  the collection to destroy (dereferenced unconditionally)
 */
void
tmap_seqs_destroy(tmap_seqs_t *seqs)
{
  int32_t idx = 0;

  // the loop bound is seqs->m, so all m slots are destroyed
  while(idx < seqs->m) {
    tmap_seq_destroy(seqs->seqs[idx]);
    idx++;
  }

  free(seqs->seqs);
  free(seqs);
}
tmap_map_sams_t * tmap_map2_aux_core(tmap_map_opt_t *_opt, tmap_seq_t *seqs[4], tmap_refseq_t *refseq, tmap_bwt_t *bwt, tmap_sa_t *sa, tmap_bwt_match_hash_t *hash, tmap_rand_t *rand, tmap_map2_global_mempool_t *pool) { tmap_map_opt_t opt; tmap_seq_t *orig_seq = NULL; tmap_string_t *seq[2]={NULL, NULL}; tmap_string_t *rseq[2]={NULL, NULL}; tmap_map_sams_t *sams = NULL; tmap_map2_aln_t *b[2]={NULL,NULL}; tmap_string_t *bases = NULL; int32_t i, k, l, num_n; opt = (*_opt); // sequence length bases = tmap_seq_get_bases(seqs[0]); l = bases->l; // update the local opt tmap_map2_aux_core_update_opt(&opt, _opt, l); // set opt->score_thr if(pool->max_l < l) { // then enlarge working space for tmap_sw_extend_core() int32_t tmp; if(0 == opt.pen_gape) { tmp = ((l + 1) / 2 * opt.score_match + opt.pen_gape) + l; } else { tmp = ((l + 1) / 2 * opt.score_match + opt.pen_gape) / opt.pen_gape + l; } pool->max_l = l; pool->aln_mem = tmap_realloc(pool->aln_mem, sizeof(uint8_t) * (tmp + 2) * 24, "pool->aln_mem"); } // get the number of Ns for(i=num_n=0;i<l;i++) { uint8_t c = (uint8_t)tmap_nt_char_to_int[(int)bases->s[i]]; if(c >= 4) num_n++; // FIXME: ambiguous bases are not properly handled } // will we always be lower than the score threshold if((l*opt.score_match) + (num_n*opt.pen_mm) < opt.score_thr) { return tmap_map_sams_init(NULL); } // save sequences seq[0] = tmap_seq_get_bases(seqs[0]); seq[1] = tmap_seq_get_bases(seqs[1]); rseq[0] = tmap_seq_get_bases(seqs[2]); rseq[1] = tmap_seq_get_bases(seqs[3]); // handle ambiguous bases if(0 < num_n) { // save original to de-randomize later orig_seq = tmap_seq_clone(seqs[0]); // randomize for(i=0;i<l;i++) { uint8_t c = (uint8_t)bases->s[i]; if(c >= 4) { c = (int)(tmap_rand_get(rand) * 4); // FIXME: ambiguous bases are not properly handled seq[0]->s[i] = c; // original seq[1]->s[l-1-i] = 3 - c; // reverse compliment rseq[0]->s[l-1-i] = 3 - c; // reverse compliment rseq[1]->s[i] = c; // original //rseq[0]->s[l-1-i] = c; // reverse 
//rseq[1]->s[i] = 3 - c; // compliment } } } // alignment b[0] = tmap_map2_aux_aln(&opt, refseq, bwt, sa, hash, seq, 0, pool); for(k = 0; k < b[0]->n; ++k) { if(b[0]->hits[k].n_seeds < opt.seeds_rev) break; } if(k < b[0]->n) { b[1] = tmap_map2_aux_aln(&opt, refseq, bwt, sa, hash, rseq, 1, pool); for(i = 0; i < b[1]->n; ++i) { tmap_map2_hit_t *p = b[1]->hits + i; int x = p->beg; p->flag ^= 0x10, p->is_rev ^= 1; // flip the strand p->beg = l - p->end; p->end = l - x; if(p->l == 0) { if(refseq->len * 2 < (p->k + p->tlen)) p->k = 0; else p->k = 2 * refseq->len - (p->k + p->tlen); } } tmap_map2_aux_merge_hits(b, l, 0); } else b[1] = 0; // set the flag to forward/reverse tmap_map2_aux_flag_fr(b); // tlen may overestimated due to not counting insertions properly, bound it! for(i = 0; i < b[0]->n; ++i) { if(refseq->len * 2 <= b[0]->hits[i].k + b[0]->hits[i].tlen) { b[0]->hits[i].tlen = (refseq->len * 2) - b[0]->hits[i].k; } else if(b[0]->hits[i].k < refseq->len && refseq->len <= b[0]->hits[i].k + b[0]->hits[i].tlen) { b[0]->hits[i].tlen = refseq->len - b[0]->hits[i].k; } } // make one-based for pac2real for(i = 0; i < b[0]->n; ++i) { b[0]->hits[i].k++; } // store in SAM sams = tmap_map2_aux_store_hits(refseq, &opt, b[0], l); // free tmap_map2_aln_destroy(b[0]); // revert ambiguous bases if(0 < num_n) { // de-randomize bases = tmap_seq_get_bases(orig_seq); for(i=0;i<l;i++) { uint8_t c = (uint8_t)bases->s[i]; if(c >= 4) { // NB: always keep them at "4" seq[0]->s[i] = c; // original seq[1]->s[l-1-i] = c; // reverse compliment rseq[0]->s[l-1-i] = c; // reverse compliment rseq[1]->s[i] = c; // original //rseq[0]->s[l-1-i] = c; // reverse //rseq[1]->s[i] = 3 - c; // compliment } } tmap_seq_destroy(orig_seq); } return sams; }
uint64_t tmap_refseq_fasta2pac(const char *fn_fasta, int32_t compression) { tmap_file_t *fp_pac = NULL, *fp_anno = NULL; tmap_seq_io_t *seqio = NULL; tmap_seq_t *seq = NULL; tmap_refseq_t *refseq = NULL; char *fn_pac = NULL, *fn_anno = NULL; uint8_t buffer[TMAP_REFSEQ_BUFFER_SIZE]; int32_t i, j, l, buffer_length; uint32_t num_IUPAC_found= 0, amb_bases_mem = 0; uint8_t x = 0; uint64_t ref_len; tmap_progress_print("packing the reference FASTA"); refseq = tmap_calloc(1, sizeof(tmap_refseq_t), "refseq"); refseq->version_id = TMAP_VERSION_ID; refseq->package_version = tmap_string_clone2(PACKAGE_VERSION); refseq->seq = buffer; // IMPORTANT: must nullify later refseq->annos = NULL; refseq->num_annos = 0; refseq->len = 0; refseq->is_rev = 0; refseq->is_shm = 0; memset(buffer, 0, TMAP_REFSEQ_BUFFER_SIZE); buffer_length = 0; // input files seqio = tmap_seq_io_init(fn_fasta, TMAP_SEQ_TYPE_FQ, 0, compression); seq = tmap_seq_init(TMAP_SEQ_TYPE_FQ); // output files fn_pac = tmap_get_file_name(fn_fasta, TMAP_PAC_FILE); fp_pac = tmap_file_fopen(fn_pac, "wb", TMAP_PAC_COMPRESSION); // read in sequences while(0 <= (l = tmap_seq_io_read(seqio, seq))) { tmap_anno_t *anno = NULL; tmap_progress_print2("packing contig [%s:1-%d]", seq->data.fq->name->s, l); refseq->num_annos++; refseq->annos = tmap_realloc(refseq->annos, sizeof(tmap_anno_t)*refseq->num_annos, "refseq->annos"); anno = &refseq->annos[refseq->num_annos-1]; anno->name = tmap_string_clone(seq->data.fq->name); anno->len = l; anno->offset = (1 == refseq->num_annos) ? 
0 : refseq->annos[refseq->num_annos-2].offset + refseq->annos[refseq->num_annos-2].len; anno->amb_positions_start = NULL; anno->amb_positions_end = NULL; anno->amb_bases = NULL; anno->num_amb = 0; amb_bases_mem = 0; // fill the buffer for(i=0;i<l;i++) { uint8_t c = tmap_nt_char_to_int[(int)seq->data.fq->seq->s[i]]; // handle IUPAC codes if(4 <= c) { int32_t k; // warn users about IUPAC codes if(0 == num_IUPAC_found) { tmap_error("IUPAC codes were found and will be converted to non-matching DNA bases", Warn, OutOfRange); for(j=4;j<15;j++) { c = tmap_iupac_char_to_bit_string[(int)tmap_iupac_int_to_char[j]]; // get the lexicographically smallest base not compatible with this code for(k=0;k<4;k++) { if(!(c & (0x1 << k))) { break; } } tmap_progress_print2("IUPAC code %c will be converted to %c", tmap_iupac_int_to_char[j], "ACGTN"[k & 3]); } } num_IUPAC_found++; // change it to a mismatched base than the IUPAC code c = tmap_iupac_char_to_bit_string[(int)seq->data.fq->seq->s[i]]; // store IUPAC bases if(amb_bases_mem <= anno->num_amb) { // allocate more memory if necessary amb_bases_mem = anno->num_amb + 1; tmap_roundup32(amb_bases_mem); anno->amb_positions_start = tmap_realloc(anno->amb_positions_start, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_start"); anno->amb_positions_end = tmap_realloc(anno->amb_positions_end, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_end"); anno->amb_bases = tmap_realloc(anno->amb_bases, sizeof(uint8_t) * amb_bases_mem, "anno->amb_bases"); } // encode stretches of the same base if(0 < anno->num_amb && anno->amb_positions_end[anno->num_amb-1] == i && anno->amb_bases[anno->num_amb-1] == tmap_iupac_char_to_int[(int)seq->data.fq->seq->s[i]]) { anno->amb_positions_end[anno->num_amb-1]++; // expand the range } else { // new ambiguous base and range anno->num_amb++; anno->amb_positions_start[anno->num_amb-1] = i+1; // one-based anno->amb_positions_end[anno->num_amb-1] = i+1; // one-based anno->amb_bases[anno->num_amb-1] = 
tmap_iupac_char_to_int[(int)seq->data.fq->seq->s[i]]; } // get the lexicographically smallest base not compatible with // this code for(j=0;j<4;j++) { if(!(c & (0x1 << j))) { break; } } c = j & 3; // Note: Ns will go to As } if(3 < c) { tmap_error("bug encountered", Exit, OutOfRange); } if(buffer_length == (TMAP_REFSEQ_BUFFER_SIZE << 2)) { // 2-bit if(tmap_refseq_seq_memory(buffer_length) != tmap_file_fwrite(buffer, sizeof(uint8_t), tmap_refseq_seq_memory(buffer_length), fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } memset(buffer, 0, TMAP_REFSEQ_BUFFER_SIZE); buffer_length = 0; } tmap_refseq_seq_store_i(refseq, buffer_length, c); buffer_length++; } refseq->len += l; // re-size the amibiguous bases if(anno->num_amb < amb_bases_mem) { amb_bases_mem = anno->num_amb; anno->amb_positions_start = tmap_realloc(anno->amb_positions_start, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_start"); anno->amb_positions_end = tmap_realloc(anno->amb_positions_end, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_end"); anno->amb_bases = tmap_realloc(anno->amb_bases, sizeof(uint8_t) * amb_bases_mem, "anno->amb_bases"); } } // write out the buffer if(tmap_refseq_seq_memory(buffer_length) != tmap_file_fwrite(buffer, sizeof(uint8_t), tmap_refseq_seq_memory(buffer_length), fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } if(refseq->len % 4 == 0) { // add an extra byte if we completely filled all bits if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } } // store number of unused bits at the last byte x = refseq->len % 4; if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } refseq->seq = NULL; // IMPORTANT: nullify this ref_len = refseq->len; // save for return tmap_progress_print2("total genome length [%u]", refseq->len); if(0 < num_IUPAC_found) { if(1 == num_IUPAC_found) { tmap_progress_print("%u IUPAC base was found and converted to a DNA base", 
num_IUPAC_found); } else { tmap_progress_print("%u IUPAC bases were found and converted to DNA bases", num_IUPAC_found); } } // write annotation file fn_anno = tmap_get_file_name(fn_fasta, TMAP_ANNO_FILE); fp_anno = tmap_file_fopen(fn_anno, "wb", TMAP_ANNO_COMPRESSION); tmap_refseq_write_anno(fp_anno, refseq); // close files tmap_file_fclose(fp_pac); tmap_file_fclose(fp_anno); // check sequence name uniqueness for(i=0;i<refseq->num_annos;i++) { for(j=i+1;j<refseq->num_annos;j++) { if(0 == strcmp(refseq->annos[i].name->s, refseq->annos[j].name->s)) { tmap_file_fprintf(tmap_file_stderr, "Contigs have the same name: #%d [%s] and #%d [%s]\n", i+1, refseq->annos[i].name->s, j+1, refseq->annos[j].name->s); tmap_error("Contig names must be unique", Exit, OutOfRange); } } } tmap_refseq_destroy(refseq); tmap_seq_io_destroy(seqio); tmap_seq_destroy(seq); free(fn_pac); free(fn_anno); tmap_progress_print2("packed the reference FASTA"); tmap_refseq_pac2revpac(fn_fasta); return ref_len; }