/*!
  Reads one annotation record from an annotation file into `anno`.

  The fields are read in this order: the name length (uint32_t, counting the
  null-terminator), the name bytes, the sequence length (uint64_t), the
  sequence offset (uint64_t), the number of ambiguity entries (uint32_t) and,
  when that number is positive, the start positions, end positions (both
  uint32_t arrays) and bases (uint8_t array) of the ambiguity entries.

  Any short read is reported through tmap_error with ReadFileError.

  @param  fp    the file to read from
  @param  anno  the annotation to fill in; its name string and ambiguity
                arrays are allocated here (the arrays are set to NULL when
                there are no ambiguity entries)
 */
static inline void
tmap_refseq_read_annos(tmap_file_t *fp, tmap_anno_t *anno)
{
  uint32_t len = 0; // includes the null-terminator

  if(1 != tmap_file_fread(&len, sizeof(uint32_t), 1, fp)) {
      tmap_error(NULL, Exit, ReadFileError);
  }
  // the stored length counts the terminator, so zero can only come from a
  // damaged file; without this check "len-1" below would wrap around
  if(0 == len) {
      tmap_error(NULL, Exit, ReadFileError);
  }

  anno->name = tmap_string_init(len);

  if(len != tmap_file_fread(anno->name->s, sizeof(char), len, fp)
     || 1 != tmap_file_fread(&anno->len, sizeof(uint64_t), 1, fp)
     || 1 != tmap_file_fread(&anno->offset, sizeof(uint64_t), 1, fp)
     || 1 != tmap_file_fread(&anno->num_amb, sizeof(uint32_t), 1, fp)) {
      tmap_error(NULL, Exit, ReadFileError);
  }

  if(0 < anno->num_amb) {
      anno->amb_positions_start = tmap_malloc(sizeof(uint32_t) * anno->num_amb, "anno->amb_positions_start");
      anno->amb_positions_end = tmap_malloc(sizeof(uint32_t) * anno->num_amb, "anno->amb_positions_end");
      anno->amb_bases = tmap_malloc(sizeof(uint8_t) * anno->num_amb, "anno->amb_bases");
      if(anno->num_amb != tmap_file_fread(anno->amb_positions_start, sizeof(uint32_t), anno->num_amb, fp)
         || anno->num_amb != tmap_file_fread(anno->amb_positions_end, sizeof(uint32_t), anno->num_amb, fp)
         || anno->num_amb != tmap_file_fread(anno->amb_bases, sizeof(uint8_t), anno->num_amb, fp)) {
          // this is a failed read, so report it as such (was WriteFileError)
          tmap_error(NULL, Exit, ReadFileError);
      }
  }
  else {
      anno->amb_positions_start = NULL;
      anno->amb_positions_end = NULL;
      anno->amb_bases = NULL;
  }

  // set name length (excludes the null-terminator)
  anno->name->l = len-1;
}
/*!
  Fetches the bases of contig `seqid` (one-based) between `start` and `end`
  inclusive, then overwrites every position that tmap_refseq_amb_bases
  reports as ambiguous.

  @param  refseq  the reference sequence
  @param  seqid   the one-based contig index
  @param  start   the first position to fetch
  @param  end     the last position to fetch
  @param  target  the destination buffer, or NULL to have one allocated here
  @param  to_n    0 to store the recorded ambiguity code, otherwise store 4
  @param  conv    if not NULL, receives the number of positions overwritten
  @return         the destination buffer, or NULL when seqid is out of
                  range, end < start, or tmap_refseq_subseq did not return
                  the requested number of bases
  NOTE(review): on that last failure the buffer is freed even when it was
  supplied by the caller - confirm that callers expect this.
 */
inline uint8_t*
tmap_refseq_subseq2(const tmap_refseq_t *refseq, uint32_t seqid, uint32_t start, uint32_t end, uint8_t *target, int32_t to_n, int32_t *conv)
{
  uint32_t pos, amb_idx;

  // reject a bad contig index or an inverted range
  if(0 == seqid) return NULL;
  if(refseq->num_annos < seqid) return NULL;
  if(end < start) return NULL;

  if(NULL == target) {
      target = tmap_malloc(sizeof(char) * (end - start + 1), "target");
  }

  if((end - start + 1) != tmap_refseq_subseq(refseq, refseq->annos[seqid-1].offset + start, end - start + 1, target)) {
      free(target);
      return NULL;
  }

  if(NULL != conv) {
      (*conv) = 0;
  }

  // nothing more to do unless some IUPAC base falls within the range
  // NB: this could be done more efficiently, since we know start <= end
  if(!(0 < tmap_refseq_amb_bases(refseq, seqid, start, end))) {
      return target;
  }

  for(pos=start;pos<=end;pos++) {
      amb_idx = tmap_refseq_amb_bases(refseq, seqid, pos, pos); // one-based
      if(!(0 < amb_idx)) continue;
      if(0 == to_n) {
          target[pos-start] = refseq->annos[seqid-1].amb_bases[amb_idx-1];
      }
      else {
          target[pos-start] = 4;
      }
      if(NULL != conv) {
          (*conv)++;
      }
  }

  return target;
}
tmap_refseq_t * tmap_refseq_read(const char *fn_fasta, uint32_t is_rev) { tmap_file_t *fp_pac = NULL, *fp_anno = NULL; char *fn_pac = NULL, *fn_anno = NULL; tmap_refseq_t *refseq = NULL; // allocate some memory refseq = tmap_calloc(1, sizeof(tmap_refseq_t), "refseq"); refseq->is_rev = is_rev; refseq->is_shm = 0; // read annotation file fn_anno = tmap_get_file_name(fn_fasta, TMAP_ANNO_FILE); fp_anno = tmap_file_fopen(fn_anno, "rb", TMAP_ANNO_COMPRESSION); tmap_refseq_read_anno(fp_anno, refseq); tmap_file_fclose(fp_anno); free(fn_anno); // read the sequence fn_pac = tmap_get_file_name(fn_fasta, (0 == is_rev) ? TMAP_PAC_FILE : TMAP_REV_PAC_FILE); fp_pac = tmap_file_fopen(fn_pac, "rb", (0 == is_rev) ? TMAP_PAC_COMPRESSION : TMAP_REV_PAC_COMPRESSION); refseq->seq = tmap_malloc(sizeof(uint8_t)*tmap_refseq_seq_memory(refseq->len), "refseq->seq"); // allocate if(tmap_refseq_seq_memory(refseq->len) != tmap_file_fread(refseq->seq, sizeof(uint8_t), tmap_refseq_seq_memory(refseq->len), fp_pac)) { tmap_error(NULL, Exit, ReadFileError); } tmap_file_fclose(fp_pac); free(fn_pac); return refseq; }
/*!
  Reads the data section of one SFF read: the flowgram, the flow index, the
  bases and the qualities, followed by the padding handled by
  tmap_sff_read_padding.

  After reading, the base and quality strings are null-terminated, each
  quality is passed through QUAL2CHAR and each flowgram value through ntohs.

  @param  fp  the file to read from
  @param  gh  the global SFF header (supplies flow_length)
  @param  rh  the read header (supplies n_bases)
  @return     the newly allocated read
 */
tmap_sff_read_t *
tmap_sff_read_read(tmap_file_t *fp, tmap_sff_header_t *gh, tmap_sff_read_header_t *rh)
{
  tmap_sff_read_t *read = NULL;
  uint32_t k, n_bytes = 0;
  int32_t ok;

  read = tmap_calloc(1, sizeof(tmap_sff_read_t), "r");

  read->flowgram = tmap_malloc(sizeof(uint16_t)*gh->flow_length, "r->flowgram");
  read->flow_index = tmap_malloc(sizeof(uint8_t)*rh->n_bases, "r->flow_index");
  read->bases = tmap_string_init(rh->n_bases+1);
  read->quality = tmap_string_init(rh->n_bases+1);

  // the four reads must happen in this order; stop at the first short one
  ok = (gh->flow_length == tmap_file_fread(read->flowgram, sizeof(uint16_t), gh->flow_length, fp));
  ok = ok && (rh->n_bases == tmap_file_fread(read->flow_index, sizeof(uint8_t), rh->n_bases, fp));
  ok = ok && (rh->n_bases == tmap_file_fread(read->bases->s, sizeof(char), rh->n_bases, fp));
  ok = ok && (rh->n_bases == tmap_file_fread(read->quality->s, sizeof(char), rh->n_bases, fp));
  if(!ok) {
      tmap_error("tmap_file_fread", Exit, ReadFileError);
  }
  n_bytes += sizeof(uint16_t)*gh->flow_length + 3*sizeof(uint8_t)*rh->n_bases;

  // lengths and terminators
  read->bases->l = rh->n_bases;
  read->bases->s[read->bases->l]='\0';
  read->quality->l = rh->n_bases;
  read->quality->s[read->quality->l]='\0';

  // qualities: int to char
  k = 0;
  while(k < read->quality->l) {
      read->quality->s[k] = QUAL2CHAR(read->quality->s[k]);
      k++;
  }

  // flowgram: network to host order
  k = 0;
  while(k < gh->flow_length) {
      read->flowgram[k] = ntohs(read->flowgram[k]);
      k++;
  }

  n_bytes += tmap_sff_read_padding(fp, n_bytes);

#ifdef TMAP_SFF_DEBUG
  tmap_sff_read_print(stderr, read, gh, rh);
#endif

  return read;
}
/*!
  Allocates an array holding the key sequence of the global SFF header, with
  each character translated through the tmap_nt_char_to_int table.

  @param  sff      the SFF structure whose global header holds the key
  @param  key_seq  receives the newly allocated array
  @return          the number of entries in the array
 */
int32_t
tmap_sff_get_key_seq_int(tmap_sff_t *sff, uint8_t **key_seq)
{
  const int32_t n_key = sff->gheader->key->l;
  int32_t k = 0;

  (*key_seq) = tmap_malloc(sizeof(uint8_t) * n_key, "key_seq");
  while(k < n_key) {
      (*key_seq)[k] = tmap_nt_char_to_int[(int)sff->gheader->key->s[k]];
      k++;
  }

  return n_key;
}
/*!
  Allocates an array holding the flow order of the global SFF header, with
  each character translated through the tmap_nt_char_to_int table.

  @param  sff         the SFF structure whose global header holds the flow
  @param  flow_order  receives the newly allocated array
  @return             the number of entries in the array
 */
int32_t
tmap_sff_get_flow_order_int(tmap_sff_t *sff, uint8_t **flow_order)
{
  const int32_t n_flow = sff->gheader->flow->l;
  int32_t k = 0;

  (*flow_order) = tmap_malloc(sizeof(uint8_t) * n_flow, "flow_order");
  while(k < n_flow) {
      (*flow_order)[k] = tmap_nt_char_to_int[(int)sff->gheader->flow->s[k]];
      k++;
  }

  return n_flow;
}
// NB: would require sorting to get the commands sorted void tmap_help_unknown_cmd(const char *cmd) { int32_t i, n, best=INT32_MAX, best_n=0; uint64_t *distances = NULL; tmap_command_t *c = NULL; // get # of commands n = 0; c = commands; while(0 <= c->type) { n++; c++; } distances = tmap_malloc(sizeof(uint64_t)*n, "distances"); for(i=0;0 <= commands[i].type;i++) { if(0 == strcmp(cmd, commands[i].name)) tmap_bug(); if(!tmap_prefixcmp(commands[i].name, cmd)) { distances[i] = ((uint64_t)i << 32); // zero score } else { distances[i] = ((uint64_t)i << 32) | (tmap_levenshtein(cmd, commands[i].name, 0, 2, 1, 4) + 1); // pack } if(__get_distance(distances[i]) < best) { best = __get_distance(distances[i]); best_n = 0; } else if(__get_distance(distances[i]) == best) { best_n++; } //fprintf(stderr, "%s -> %u\n", commands[distances[i]>>32].name, (__get_distance(distances[i]))); } if(0 == best && n == best_n) { // matches everything best = TMAP_HELP_SIMILARITY_FLOOR + 1; } n = i; // output similar matches fprintf(stderr, "%s: '%s' is not a tmap command. See 'tmap --help'.\n", PACKAGE, cmd); if(TMAP_HELP_SIMILAR_ENOUGH(best)) { fprintf(stderr, "\nDid you mean %s?\n", i < 2 ? "this": "one of these"); for (i = 0; i < n; i++) if(best == __get_distance(distances[i])) { fprintf(stderr, "\t%s\n", commands[__get_name_idx(distances[i])].name); } } free(distances); }
/*!
  Makes a deep copy of an SFF read.

  @param  r   the read to copy
  @param  gh  the global SFF header (supplies the flowgram length)
  @param  rh  the read header (supplies the flow index length)
  @return     the newly allocated copy
 */
static tmap_sff_read_t *
tmap_sff_read_clone(tmap_sff_read_t *r, tmap_sff_header_t *gh, tmap_sff_read_header_t *rh)
{
  tmap_sff_read_t *copy = tmap_calloc(1, sizeof(tmap_sff_read_t), "r");
  uint32_t k;

  // flowgram: one value per flow
  copy->flowgram = tmap_malloc(sizeof(uint16_t)*gh->flow_length, "ret->flowgram");
  k = 0;
  while(k < gh->flow_length) {
      copy->flowgram[k] = r->flowgram[k];
      k++;
  }

  // flow index: one value per base
  copy->flow_index = tmap_malloc(sizeof(uint8_t)*rh->n_bases, "ret->flow_index");
  k = 0;
  while(k < rh->n_bases) {
      copy->flow_index[k] = r->flow_index[k];
      k++;
  }

  // strings
  copy->bases = tmap_string_clone(r->bases);
  copy->quality = tmap_string_clone(r->quality);

  return copy;
}
// from bam_md.c in SAMtools void tmap_sam_md1(bam1_t *b, char *ref, int32_t len) { int32_t i, j; char *ref_tmp = NULL; ref_tmp = tmap_malloc(sizeof(char) * (1 + len), "ref_tmp"); for(i=j=0;i<len;i++) { if('-' != ref[i] && 'H' != ref[i]) { ref_tmp[j] = ref[i]; j++; } } ref_tmp[j]='\0'; tmap_sam_md1_core(b, ref_tmp); free(ref_tmp); }
/*
 * tmap_map3_aux_core: only the beginning of this function is visible here;
 * the body is cut off after the seed-length adjustment and is never closed.
 *
 * What the visible part does:
 *  - when opt->hp_diff is positive and flow_order is not NULL, copies the
 *    flow_order_len entries of flow_order into a newly allocated `flow`
 *    array; when flow_order is NULL the local hp_diff is set to 0 (it is
 *    also initialized to 0, and nothing visible assigns it another value);
 *  - creates the result container with tmap_map_sams_init(NULL);
 *  - starts seed_length at opt->seed_length and, when opt->seed_length_set
 *    is 0, adds one for every halving of the read length
 *    (tmap_seq_get_bases_length(seq)) needed to reach zero.
 *
 * NOTE(review): the return value and the use of refseq, bwt, sa and hash
 * cannot be determined from the visible part.
 */
// TODO: memory pools? tmap_map_sams_t * tmap_map3_aux_core(tmap_seq_t *seq, uint8_t *flow_order, int32_t flow_order_len, tmap_refseq_t *refseq, tmap_bwt_t *bwt, tmap_sa_t *sa, tmap_bwt_match_hash_t *hash, tmap_map_opt_t *opt) { int32_t i, j, n, seed_length, hp_diff = 0; int32_t seq_len; tmap_string_t *bases; uint8_t *query; uint8_t *flow=NULL; tmap_map3_aux_seed_t *seeds; int32_t m_seeds, n_seeds; tmap_map_sams_t *sams = NULL; if(0 < opt->hp_diff) { // set up the flow order to be used if(NULL == flow_order) { hp_diff = 0; } else { flow = tmap_malloc(sizeof(uint8_t)*flow_order_len, "flow[0]"); for(i=0;i<flow_order_len;i++) { flow[i] = flow_order[i]; // forward } } } // init sams = tmap_map_sams_init(NULL); // update the seed length based on the read length seed_length = opt->seed_length; if(0 == opt->seed_length_set) { i = tmap_seq_get_bases_length(seq); while(0 < i) { seed_length++; i >>= 1; // divide by two } }
/*!
  Makes a deep copy of a collection of sequences.  The copy's capacity (m)
  is set to the number of sequences, so no spare room is allocated.

  @param  seqs  the collection to copy
  @return       the newly allocated copy; its sequence array is left NULL
                when the source holds no sequences
 */
tmap_seqs_t *
tmap_seqs_clone(tmap_seqs_t *seqs)
{
  tmap_seqs_t *copy = tmap_calloc(1, sizeof(tmap_seqs_t), "ret");
  int32_t k;

  copy->type = seqs->type;
  copy->n = seqs->n;
  copy->m = seqs->n; // do not expand memory

  if(!(0 < seqs->n)) {
      return copy;
  }

  copy->seqs = tmap_malloc(seqs->n * sizeof(tmap_seq_t*), "ret->seqs");
  k = 0;
  while(k < copy->n) {
      copy->seqs[k] = tmap_seq_clone(seqs->seqs[k]);
      k++;
  }

  return copy;
}
void tmap_vsw_bm_core(int32_t seq_len, int32_t tlen, int32_t n_iter, int32_t n_sub_iter, int32_t vsw_type) { int32_t i, j, k; tmap_vsw_t *vsw = NULL; tmap_vsw_opt_t *vsw_opt = NULL; int32_t softclip_start, softclip_end; tmap_sw_param_t ap; int32_t matrix[25]; tmap_map_opt_t *opt = tmap_map_opt_init(TMAP_MAP_ALGO_NONE); uint8_t *seq, *target; tmap_rand_t *rand = tmap_rand_init(0); seq = tmap_malloc(sizeof(uint8_t) * seq_len, "seq"); target = tmap_malloc(sizeof(uint8_t) * tlen, "target"); // random sequence for(i=0;i<seq_len;i++) { seq[i] = (uint8_t)(4*tmap_rand_get(rand)); } softclip_start = 1; softclip_end = 1; // initialize opt if(0 <= vsw_type) { vsw_opt = tmap_vsw_opt_init(opt->score_match, opt->pen_mm, opt->pen_gapo, opt->pen_gape, opt->score_thr); vsw = tmap_vsw_init(seq, seq_len, softclip_start, softclip_end, vsw_type, vsw_opt); } else { ap.matrix = matrix; __map_util_gen_ap(ap, opt); } int32_t front = (tlen - seq_len) / 2; int32_t end = tlen - seq_len - front; while(i<n_iter) { tmap_map_sam_t tmp_sam; int32_t overflow; for(j=k=0;j<front;j++,k++) { target[k] = (uint8_t)(4*tmap_rand_get(rand)); } for(j=0;j<seq_len;j++,k++) { target[k] = seq[j]; } for(j=0;j<end;j++,k++) { target[k] = (uint8_t)(4*tmap_rand_get(rand)); } for(j=0;j<n_sub_iter&&i<n_iter;j++,i++) { if(0 <= vsw_type) { // initialize the bounds tmp_sam.result.query_start = tmp_sam.result.query_end = 0; tmp_sam.result.target_start = tmp_sam.result.target_end = 0; // run the vsw tmap_vsw_process_fwd(vsw, seq, seq_len, target, tlen, &tmp_sam.result, &overflow, opt->score_thr, 0); } else { tmap_sw_clipping_core(seq, seq_len, target, tlen, &ap, softclip_start, softclip_end, NULL, NULL, 0); } } } // free memory free(target); free(seq); if(0 <= vsw_type) { tmap_vsw_opt_destroy(vsw_opt); tmap_vsw_destroy(vsw); } tmap_map_opt_destroy(opt); tmap_rand_destroy(rand); }
/*!
  Builds the output BAM header.

  - @HD: for SAM/BAM input the header of the (single) input file is parsed
    or cloned; otherwise a new header with VN:1.4 is created.
  - @PG.ID: starts as PACKAGE_NAME; while a @PG record with that ID already
    exists, the ID becomes PACKAGE_NAME.1, PACKAGE_NAME.2, ... and the
    previously tried ID is remembered for @PG.PP.
  - @SQ: when `refseq` is given, any existing @SQ records are removed and
    one record per reference contig is added.
  - @RG: taken from the command line (`rg_sam`; not allowed with SAM/BAM
    input), or one dummy record per input file, or - for SAM/BAM input -
    the existing records, completed with dummy SM and PG tags.
  - @PG: ID, VN, the space-joined command line as CL, and PP if applicable.

  @param  refseq      the reference sequence, or NULL to leave @SQ untouched
  @param  io_in       the input sequence streams
  @param  rg_sam      the read group tags from the command line ("XX:value")
  @param  rg_sam_num  the number of entries in rg_sam
  @param  argc        the number of command line arguments
  @param  argv        the command line arguments
  @return             the BAM header produced by sam_header_to_bam_header
 */
bam_header_t *
tmap_seqs_io_to_bam_header(tmap_refseq_t *refseq,
                           tmap_seqs_io_t *io_in,
                           char **rg_sam, int32_t rg_sam_num,
                           int32_t argc, char *argv[])
{
  bam_header_t *bam_header = NULL;
  sam_header_t *header = NULL; // the output header
  sam_header_record_t *record = NULL;
  sam_header_record_t **record_list = NULL;
  char tag[3] = {'\0', '\0', '\0'}; // two-letter tag, kept null-terminated
  char *command_line= NULL;
  char *id = NULL;
  char *id_pp = NULL;
  int32_t i, j;

  // @HD
  if(io_in->type == TMAP_SEQ_TYPE_SAM || io_in->type == TMAP_SEQ_TYPE_BAM) {
      // should be only one input file
      if(1 != io_in->n) {
          tmap_bug();
      }
      // get the current header
      if(NULL == io_in->seqios[0]) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio->fp->header) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio->fp->header->header) {
          header = sam_header_parse2(io_in->seqios[0]->io.samio->fp->header->text);
      }
      else {
          header = io_in->seqios[0]->io.samio->fp->header->header; // wow, that's a lot of pointers
          if(NULL == header) tmap_bug();
          header = sam_header_clone(header); // clone the header
      }
      if(NULL == header) tmap_bug();
  }
  else {
      // empty header
      header = sam_header_init();
      // @HD - header line
      record = sam_header_record_init("HD"); // new header line
      if(0 == sam_header_record_add(record, "VN", "1.4")) tmap_bug(); // version number
      if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the header line
      // nullify
      record = NULL;
  }

  // Get the TMAP program ID
  id = tmap_malloc(sizeof(char) * (1 + strlen(PACKAGE_NAME)), "id");
  strcpy(id, PACKAGE_NAME); // default
  for(i=j=0;NULL != (record_list = sam_header_get_record(header, "PG", "ID", id, &i)) && 0 < i;i=0) { // while the id is found
      char *ptr = NULL;
      // swap id and id_pp
      ptr = id_pp;
      id_pp = id;
      id = ptr;
      // create the new ID: name, '.', the digits of j and the terminator
      j++;
      id = tmap_realloc(id, sizeof(char) * (1 + (int)log10(j) + 1 + strlen(PACKAGE_NAME) + 1), "id");
      if(sprintf(id, "%s.%d", PACKAGE_NAME, j) < 0) tmap_bug();
      free(record_list);
      record_list = NULL;
  }
  // the final lookup may have returned a list with no matches; the loop
  // body shows the list is owned here, so release it (no-op when NULL)
  free(record_list);
  record_list = NULL;

  // @SQ
  if(NULL != refseq) {
      sam_header_records_t *records = NULL;
      // ZZ: the old @SQ records are not checked; they are simply removed
      // because the old way of checking did not work
      records = sam_header_get_records(header, "SQ");
      if(NULL != records) {
          // ZZ: remove the headers if they exist
          sam_header_remove_records(header, "SQ");
          records = NULL;
      }
      // ZZ: now add all the new records
      for(i=0;i<refseq->num_annos;i++) { // for each reference sequence
          char num[32];
          record = sam_header_record_init("SQ"); // new reference sequence record
          if(0 == sam_header_record_add(record, "SN", refseq->annos[i].name->s)) tmap_bug(); // reference sequence name
          if(sprintf(num, "%u", (uint32_t)refseq->annos[i].len) < 0) tmap_bug(); // integer to string
          if(0 == sam_header_record_add(record, "LN", num)) tmap_bug(); // reference sequence length
          if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the reference sequence record
      }
  }

  // @RG - read group
  if(0 < rg_sam_num) { // @RG specified on the command line
      sam_header_records_t *records = NULL;
      // Check for SAM/BAM
      // TODO: this should be possible...
      if(io_in->type == TMAP_SEQ_TYPE_SAM || io_in->type == TMAP_SEQ_TYPE_BAM) {
          tmap_error("Cannot specify the read groups on the command line when using SAM/BAM as input."
                     " Please embed in the SAM/BAM header instead.", Exit, OutOfRange);
      }
      record = NULL;
      // go through the command line arguments
      for(i=0;i<rg_sam_num;i++) {
          if(strlen(rg_sam[i]) < 4) tmap_error("Read group too small", Exit, OutOfRange);
          if(':' != rg_sam[i][2]) tmap_error("Read group improperly formatted (no colon)", Exit, OutOfRange);

          // check for id
          if('I' == rg_sam[i][0] && 'D' == rg_sam[i][1]) { // new read group
              if(NULL != record) { // add the previous record
                  tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
              }
              record = sam_header_record_init("RG"); // new read group
          }
          // add the tag/value to the record
          if(NULL == record) {
              tmap_error("The read group ID must be specified first", Exit, OutOfRange);
          }
          tag[0]=rg_sam[i][0]; tag[1]=rg_sam[i][1]; // setup the tag
          if(0 == sam_header_record_add(record, tag, rg_sam[i]+3)) tmap_bug(); // add the tag/value
      }
      if(NULL != record) { // add the last record
          tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
      }
      // check that the # of read groups added was the same as the # of input files...
      records = sam_header_get_records(header, "RG"); // get the header lines
      // the lookup yields NULL when there are no such records (see @SQ above)
      if(NULL == records || records->n != io_in->n) {
          tmap_error("The number of read groups did not match the number of input files", Exit, OutOfRange);
      }
  }
  else if(io_in->type != TMAP_SEQ_TYPE_SAM && io_in->type != TMAP_SEQ_TYPE_BAM) { // dummy...
      for(i=0;i<io_in->n;i++) { // for each input file
          char buf[32];
          record = sam_header_record_init("RG"); // new read group
          if(1 == io_in->n) strcpy(buf, "NOID");
          else if(sprintf(buf, "NOID.%d", i+1) < 0) tmap_bug();
          if(0 == sam_header_record_add(record, "ID", buf)) tmap_bug(); // dummy ID
          if(0 == sam_header_record_add(record, "SM", "NOSM")) tmap_bug(); // dummy SM, for Picard validation
          if(0 == sam_header_record_add(record, "PG", id)) tmap_bug(); // dummy PG
          tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
      }
  }
  else { // check that SM/PG are present
      sam_header_records_t *records = sam_header_get_records(header, "RG"); // get the header lines
      // an input header without any @RG line gives NULL: nothing to complete
      if(NULL != records) {
          for(i=0;i<records->n;i++) {
              record = records->records[i];
              if(NULL == sam_header_record_get(record, "ID")) tmap_error("Missing @RG.ID in the SAM/BAM Header", Exit, OutOfRange);
              if(NULL == sam_header_record_get(record, "SM")) {
                  if(0 == sam_header_record_add(record, "SM", "NOSM")) tmap_bug(); // dummy SM, for Picard validation
              }
              if(NULL == sam_header_record_get(record, "PG")) {
                  if(0 == sam_header_record_add(record, "PG", id)) tmap_bug(); // dummy PG
              }
          }
      }
  }

  // @PG - program group
  // TODO: check for previous program group ID and set @PG.PP
  record = sam_header_record_init("PG"); // new program group
  if(0 == sam_header_record_add(record, "ID", id)) tmap_bug(); // @PG.ID
  if(0 == sam_header_record_add(record, "VN", PACKAGE_VERSION)) tmap_bug(); // @PG.VN
  // @PG.CL: the arguments joined by single spaces
  command_line = NULL;
  j = 1; // for the EOL
  command_line = tmap_realloc(command_line, sizeof(char) * j, "command_line");
  command_line[j-1] = '\0';
  for(i=0;i<argc;i++) {
      if(0 < i) j++; // room for the separating space
      j += strlen(argv[i]);
      command_line = tmap_realloc(command_line, sizeof(char) * j, "command_line");
      if(0 < i) strcat(command_line, " ");
      strcat(command_line, argv[i]);
      command_line[j-1] = '\0';
  }
  if(0 == sam_header_record_add(record, "CL", command_line)) tmap_bug(); // @PG.CL
  if(NULL != id_pp) { // @PG.PP
      if(0 == sam_header_record_add(record, "PP", id_pp)) tmap_bug(); // @PG.PP
  }
  if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the record
  free(command_line);

  // Check the new SAM Header
  if(0 == sam_header_check(header)) {
      tmap_error("SAM Header was not consistent", Exit, OutOfRange);
  }

  // Create a BAM Header from the SAM Header
  bam_header = bam_header_init(); // empty
  bam_header->header = header; // soft-copy the header
  bam_header = sam_header_to_bam_header(bam_header); // convert

  // free memory
  free(id);
  free(id_pp);

  return bam_header;
}
/*!
  Takes the next free entry from the stack's entry pool, fills it in, and
  appends it to the bin indexed by its score.

  The pool, the array of bins and the bin's entry array are each enlarged
  when they are full.  The stack's best score is lowered to the entry's
  score when the latter is smaller.

  @param  stack          the stack to push onto
  @param  offset         the offset stored in the entry
  @param  match_sa_prev  the match interval copied into the entry
  @param  n_mm           the number of mismatches
  @param  n_gapo         the number of gap opens
  @param  n_gape         the number of gap extensions
  @param  state          the state stored in the entry
  @param  is_diff        1 if this entry's offset becomes its last-difference
                         offset; otherwise that offset is inherited from
                         prev_entry
  @param  prev_entry     the previous entry, or NULL if there is none
  @param  opt            the options passed to aln_score
 */
static inline void
tmap_map1_aux_stack_push(tmap_map1_aux_stack_t *stack,
                         int32_t offset,
                         tmap_bwt_match_occ_t *match_sa_prev,
                         int32_t n_mm, int32_t n_gapo, int32_t n_gape,
                         int32_t state,
                         int32_t is_diff,
                         tmap_map1_aux_stack_entry_t *prev_entry,
                         const tmap_map_opt_t *opt)
{
  int32_t i;
  int32_t n_bins_needed = 0;
  tmap_map1_aux_stack_entry_t *entry = NULL;
  tmap_map1_aux_bin_t *bin = NULL;

  // check to see if we need more memory
  if(stack->entry_pool_length <= stack->entry_pool_i) {
      i = stack->entry_pool_length; // first slot that has no entry yet
      stack->entry_pool_length <<= 2;
      stack->entry_pool = tmap_realloc(stack->entry_pool,
                                       sizeof(tmap_map1_aux_stack_entry_t*)*stack->entry_pool_length,
                                       "stack->entry_pool");
      while(i<stack->entry_pool_length) {
          stack->entry_pool[i] = tmap_malloc(sizeof(tmap_map1_aux_stack_entry_t), "stack->entry_pool[i]");
          i++;
      }
  }

  entry = stack->entry_pool[stack->entry_pool_i];
  entry->score = aln_score(n_mm, n_gapo, n_gape, opt);
  entry->n_mm = n_mm;
  entry->n_gapo = n_gapo;
  entry->n_gape = n_gape;
  entry->state = state;
  entry->match_sa = (*match_sa_prev);
  entry->i = stack->entry_pool_i;
  entry->offset = offset;
  if(NULL == prev_entry) {
      entry->last_diff_offset = offset;
      entry->prev_i = -1;
  }
  else {
      entry->last_diff_offset = (1 == is_diff) ? (offset) : prev_entry->last_diff_offset;
      entry->prev_i = prev_entry->i;
  }

  if(stack->n_bins <= entry->score) {
      // resize the bins if necessary
      n_bins_needed = entry->score + 1;
      tmap_roundup32(n_bins_needed);
      stack->bins = tmap_realloc(stack->bins, sizeof(tmap_map1_aux_bin_t) * n_bins_needed, "stack->bins");
      // initialize the new bins
      for(i=stack->n_bins;i<n_bins_needed;i++) {
          stack->bins[i].n_entries = stack->bins[i].m_entries = 0;
          stack->bins[i].entries = NULL;
      }
      stack->n_bins = n_bins_needed;
  }
  if(stack->n_bins <= entry->score) {
      tmap_bug();
  }
  bin = &stack->bins[entry->score];

  // NB: duplicates (most likely formed by tandem repeats or indels) are not
  // removed: too computationally expensive, and not necessary

  // update best score
  if(stack->best_score > entry->score) stack->best_score = entry->score;

  if(bin->m_entries <= bin->n_entries) {
      bin->m_entries++;
      tmap_roundup32(bin->m_entries);
      // the array holds entry pointers, so size it by its own element type
      // (it used to be sized by the larger bin structure)
      bin->entries = tmap_realloc(bin->entries, sizeof(*bin->entries) * bin->m_entries, "bin->entries");
  }
  bin->entries[bin->n_entries] = entry;
  bin->n_entries++;

  stack->entry_pool_i++;
  stack->n_entries++;
}
// prepare internal structures for clipping and alignment // returns true if realignment was performed bool RealignImp::compute_alignment ( const char* q_seq, unsigned q_len, const char* r_seq, unsigned r_len, int r_pos, bool forward, const uint32_t* cigar, unsigned cigar_sz, uint32_t*& cigar_dest, unsigned& cigar_dest_sz, int& new_pos, bool& already_perfect, bool& clip_failed, bool& alignment_failed, bool& unclip_failed) { already_perfect = false; alignment_failed = false; unclip_failed = false; unsigned oplen; const char* q_seq_clipped = q_seq; const uint32_t* cigar_clipped = cigar; unsigned cigar_sz_clipped = cigar_sz; unsigned sclip_q_len, sclip_r_len, sclip_al_len; assert (cigar_sz); // reset realigner Reset (); // set clipping SetClipping ((int) cliptype_, forward); // clip out the hard and soft clipping zones from 5" and 3" // The 'cut out' of the q_seq is done by switching to downstream pointer. if (bam_cigar_op (*cigar) == BAM_CSOFT_CLIP) { oplen = bam_cigar_oplen (*cigar); ClipStart (oplen); q_seq_clipped += oplen; ++cigar_clipped; --cigar_sz_clipped; } if (cigar_sz > 1 && bam_cigar_op (cigar [cigar_sz - 1]) == BAM_CSOFT_CLIP) { oplen = bam_cigar_oplen (cigar [cigar_sz - 1]); ClipEnd (oplen); --cigar_sz_clipped; } // cigar defines q_seq and t_seq lengths sclip_al_len = seq_lens_from_bin_cigar (cigar_clipped, cigar_sz_clipped, &sclip_q_len, &sclip_r_len); const std::string query (q_seq_clipped, sclip_q_len); const std::string target (r_seq, sclip_r_len); std::string pretty_al; pretty_al.reserve (sclip_al_len); pretty_al_from_bin_cigar (cigar_clipped, cigar_sz_clipped, q_seq_clipped, r_seq, pretty_al); // Realigner requires strings of proper size to be passed to SetSequences SetSequences (query, target, pretty_al, forward); if (!ClipAnchors (clip_failed)) { already_perfect = true; return false; // alignment already good, no imperfect zone to realign found } // TODO avoid automatic vectors to prevent unneeded heap usage vector<MDelement> new_md_vec; 
vector<CigarOp> new_cigar_vec; unsigned int start_pos_shift; if (!computeSWalignment(new_cigar_vec, new_md_vec, start_pos_shift)) { alignment_failed = true; return false; } if (!addClippedBasesToTags(new_cigar_vec, new_md_vec, q_len)) { unclip_failed = true; return false; // error adding back clipped out zones } if (!LeftAnchorClipped () && start_pos_shift != 0) { // build cigar data only if it is needed // TODO avoid automatic vectors to prevent unneeded heap usage std::vector <CigarOp> cigar_vec; cigar_vector_from_bin (cigar, cigar_sz, cigar_vec); new_pos = updateReadPosition (cigar_vec, start_pos_shift, r_pos); } else new_pos = r_pos; // free (cigar_dest); // TODO: switch to better alignment memory management, avoid heap operations cigar_dest = (uint32_t*) tmap_malloc (sizeof (uint32_t) * new_cigar_vec.size (), "cigar_dest"); cigar_dest_sz = new_cigar_vec.size (); cigar_vector_to_bin (new_cigar_vec, cigar_dest); return true; }
/*!
  Writes one "@RG" header line holding every tag of `rg_tags` that is set.

  @param  fp       the file to write to
  @param  rg_tags  TMAP_SAM_RG_NUM tag values, NULL for tags that are absent
 */
static void
tmap_sam_print_header_rg_line(tmap_file_t *fp, char **rg_tags)
{
  int32_t t;

  tmap_file_fprintf(fp, "@RG");
  for(t=0;t<TMAP_SAM_RG_NUM;t++) {
      if(NULL == rg_tags[t]) continue;
      tmap_file_fprintf(fp, "\t%s:%s", TMAP_SAM_RG_TAGS[t], rg_tags[t]);
  }
  tmap_file_fprintf(fp, "\n");
}

/*!
  Writes the SAM header: @HD, one @SQ line per reference contig, the @RG
  line(s) and @PG.

  The read group comes from the command line (`sam_rg`), from the input file
  (via tmap_seq_io_get_rg_header), or from both when they can be merged into
  a single line; tags given in both places are an error.  When neither
  exists, a default line with the default ID and sample name is written.

  @param  fp                  the file to write to
  @param  refseq              the reference sequence, or NULL for no @SQ lines
  @param  seqio               the input sequence stream
  @param  sam_rg              the command line read group, parsed by
                              tmap_sam_parse_rg
  @param  sam_flowspace_tags  1 to still take RG.FO and RG.KS from the input
                              file when ignore_rg_sam_tags is 1
  @param  ignore_rg_sam_tags  1 to not use the input file's read groups
  @param  argc                the number of command line arguments
  @param  argv                the command line arguments
 */
void
tmap_sam_print_header(tmap_file_t *fp, tmap_refseq_t *refseq, tmap_seq_io_t *seqio, char *sam_rg,
                      int32_t sam_flowspace_tags, int32_t ignore_rg_sam_tags,
                      int argc, char *argv[])
{
  int32_t k, header_n = 0;
  char **header_a = NULL; // command line read group
  char ***header_b = NULL; // input file read group(s)

  // @HD and @SQ
  tmap_file_fprintf(fp, "@HD\tVN:%s\tSO:unsorted\n", TMAP_SAM_PRINT_VERSION);
  if(NULL != refseq) {
      for(k=0;k<refseq->num_annos;k++) {
          tmap_file_fprintf(fp, "@SQ\tSN:%s\tLN:%d\n",
                            refseq->annos[k].name->s, (int)refseq->annos[k].len);
      }
  }

  // @RG: parse the command line read group
  header_a = tmap_sam_parse_rg(sam_rg);

  if(1 != ignore_rg_sam_tags) {
      // get the RG header from the input file
      header_b = tmap_seq_io_get_rg_header(seqio, &header_n);
  }
  else if(1 == sam_flowspace_tags) {
      // the input file is ignored, except for its RG.FO and RG.KS
      header_b = tmap_seq_io_get_rg_header(seqio, &header_n);
      if(1 < header_n) {
          // TODO: we could check to see that FO/KS are the same across all
          // input read groups
          tmap_error("Command line read group found with multiple read groups from the input file", Exit, OutOfRange);
      }
      else if(1 == header_n) {
          if(NULL == header_a) {
              header_a = tmap_calloc(TMAP_SAM_RG_NUM, sizeof(char*), "header_a");
              // copy over default RG.ID
              header_a[TMAP_SAM_RG_ID] = tmap_malloc(sizeof(char) * (strlen(tmap_sam_rg_id) + 1), "header_a[TMAP_SAM_RG_ID]");
              strcpy(header_a[TMAP_SAM_RG_ID], tmap_sam_rg_id);
          }
          for(k=0;k<TMAP_SAM_RG_NUM;k++) { // for each RG.TAG
              if(TMAP_SAM_RG_FO != k && TMAP_SAM_RG_KS != k) continue;
              if(NULL != header_a[k] && NULL != header_b[0][k]) {
                  tmap_error("Command line and input read groups share tags", Exit, OutOfRange);
              }
              else if(NULL != header_b[0][k]) { // copy over
                  header_a[k] = tmap_malloc(sizeof(char) * (strlen(header_b[0][k]) + 1), "header_a[i]");
                  strcpy(header_a[k], header_b[0][k]);
              }
          }
      }
      // header_b is no longer in use
      for(k=0;k<header_n;k++) {
          free(header_b[k]);
      }
      free(header_b);
      header_b = NULL;
      header_n = 0;
  }

  // reconcile the RG headers
  if(NULL == header_a) { // no command line read group
      if(NULL == header_b) { // no input read group either: use the defaults
          header_a = tmap_calloc(TMAP_SAM_RG_NUM, sizeof(char*), "header_a");
          // RG.ID
          header_a[TMAP_SAM_RG_ID] = tmap_malloc(sizeof(char) * (strlen(tmap_sam_rg_id) + 1), "header_a[i]");
          strcpy(header_a[TMAP_SAM_RG_ID], tmap_sam_rg_id);
          // RG.SM for Picard
          header_a[TMAP_SAM_RG_SM] = tmap_malloc(sizeof(char) * (strlen(TMAP_SAM_NO_RG_SM) + 1), "header_a[i]");
          strcpy(header_a[TMAP_SAM_RG_SM], TMAP_SAM_NO_RG_SM);
          tmap_sam_rg_id_use = 1;
          tmap_sam_print_header_rg_line(fp, header_a);
      }
      else { // input read group(s) only
          tmap_sam_rg_id_use = 0;
          for(k=0;k<header_n;k++) { // for each RG.ID
              if(NULL == header_b[k][TMAP_SAM_RG_ID]) {
                  if(1 == header_n && TMAP_SEQ_TYPE_SFF == seqio->type) { // make an exception for SFF files
                      header_b[k][TMAP_SAM_RG_ID] = tmap_sam_rg_id;
                      tmap_sam_rg_id_use = 1;
                  }
                  else {
                      tmap_error("missing RG.ID found in the RG SAM Header", Exit, OutOfRange);
                  }
              }
              // RG.SM for picard
              if(NULL == header_b[k][TMAP_SAM_RG_SM]) {
                  header_b[k][TMAP_SAM_RG_SM] = TMAP_SAM_NO_RG_SM;
              }
              tmap_sam_print_header_rg_line(fp, header_b[k]);
          }
      }
  }
  else { // a command line read group exists
      if(NULL != header_b && 1 == header_n) { // exactly one input read group: merge
          // the two must be mutually exclusive
          for(k=0;k<TMAP_SAM_RG_NUM;k++) {
              if(NULL != header_a[k] && NULL != header_b[0][k]) {
                  tmap_file_fprintf(tmap_file_stderr, "\nFound both command line and input file read group information for the same tag: %s.\n", TMAP_SAM_RG_TAGS[k]);
                  tmap_error(NULL, Exit, OutOfRange);
              }
              else if(NULL == header_a[k] && NULL != header_b[0][k]) { // copy over
                  header_a[k] = tmap_calloc(1+strlen(header_b[0][k]), sizeof(char), "header_a[i]");
                  strcpy(header_a[k], header_b[0][k]);
              }
          }
          // free
          free(header_b[0]);
          free(header_b);
          header_b = NULL;
          header_n = 0;
      }
      if(0 != header_n) { // input read groups remain unmerged
          tmap_error("Found both command line and input file read group information", Exit, OutOfRange);
      }
      else {
          if(NULL == header_a[TMAP_SAM_RG_ID]) {
              header_a[TMAP_SAM_RG_ID] = tmap_malloc(sizeof(char) * (strlen(tmap_sam_rg_id) + 1), "header_a[i]");
              strcpy(header_a[TMAP_SAM_RG_ID], tmap_sam_rg_id);
          }
          else {
              strcpy(tmap_sam_rg_id, header_a[TMAP_SAM_RG_ID]);
          }
          if(NULL == header_a[TMAP_SAM_RG_SM]) { // for Picard
              header_a[TMAP_SAM_RG_SM] = tmap_malloc(sizeof(char) * (strlen(TMAP_SAM_NO_RG_SM) + 1), "header_a[i]");
              strcpy(header_a[TMAP_SAM_RG_SM], TMAP_SAM_NO_RG_SM);
          }
          tmap_sam_rg_id_use = 1;
          tmap_sam_print_header_rg_line(fp, header_a);
      }
  }

  // @PG
  tmap_file_fprintf(fp, "@PG\tID:%s\tVN:%s\tCL:", PACKAGE_NAME, PACKAGE_VERSION);
  for(k=0;k<argc;k++) {
      if(0 < k) tmap_file_fprintf(fp, " ");
      tmap_file_fprintf(fp, "%s", argv[k]);
  }
  tmap_file_fprintf(fp, "\n");

  // free
  for(k=0;k<header_n;k++) {
      free(header_b[k]);
  }
  free(header_b);
  if(NULL != header_a) {
      for(k=0;k<TMAP_SAM_RG_NUM;k++) {
          free(header_a[k]);
      }
  }
  free(header_a);
}