/* Handler for a run of <lengthofeop> matches while a splice site score is
   being computed. As long as fewer than SPLICE_SITE_SCORE_WINDOW alignment
   positions have been processed, the weight of one match (multiplied by the
   number of matches that still fit into the window) is added to both the
   actual and the maximal splice site weight. Once the window is exhausted the
   traversal is asked to stop via <breaktraversealignment>.

   <data> must point to a Calcsplicesitescoredata object. */
static void calcsplicesitescoreprocmatch(Traversealignmentstate *state,
                                         void *data,
                                         unsigned long lengthofeop)
{
  Calcsplicesitescoredata *d = (Calcsplicesitescoredata*) data;
  /* NOTE: the next two locals are not referenced explicitly below; they are
     presumably picked up by name inside the ADDOUTPUTWEIGHT* macros, so they
     must keep exactly these names */
  unsigned int gen_alphabet_mapsize = gt_alphabet_size(d->gen_alphabet);
  GthDPOptionsEST *dp_options_est = d->dp_options_est;
  unsigned char gen_char, ref_char;
  unsigned long window_left, matches_in_window;
  GthFlt max_weight_delta = 0.0, weight_delta = 0.0;

  if (d->processedalignmentpositions >= SPLICE_SITE_SCORE_WINDOW) {
    /* window already filled up: nothing more to score */
    d->breaktraversealignment = true;
    return;
  }

  /* never process more matches than still fit into the window */
  window_left = SPLICE_SITE_SCORE_WINDOW - d->processedalignmentpositions;
  matches_in_window = MIN(lengthofeop, window_left);

  gen_char = d->gen_seq_tran[state->genomicptr];
  ref_char = d->ref_seq_tran[state->referenceptr];

  /* weight of a single position ... */
  ADDOUTPUTWEIGHT(weight_delta, gen_char, ref_char);
  ADDOUTPUTWEIGHTIDENTITY(max_weight_delta, gen_char);

  /* ... scaled by the number of positions in this run */
  max_weight_delta *= matches_in_window;
  weight_delta *= matches_in_window;

  d->splicesiteweight += weight_delta;
  d->maxsplicesiteweight += max_weight_delta;
  d->processedalignmentpositions += matches_in_window;
}
/* Create and return a new alphabet with two symbols: one mapped from the
   characters "Hh" and one mapped from "Tt" (presumably heads and tails of the
   coin HMM). */
GtAlphabet* gt_coin_hmm_alphabet(void)
{
  GtAlphabet *coin_alphabet = gt_alphabet_new_empty();

  gt_alphabet_add_mapping(coin_alphabet, "Hh");
  gt_alphabet_add_mapping(coin_alphabet, "Tt");

  /* two mappings were added, so the alphabet has to report two symbols */
  gt_assert(gt_alphabet_size(coin_alphabet) == 2);

  return coin_alphabet;
}
/* Print the score matrix <sm> to <fp> as a table: a header line with the
   decoded alphabet characters, followed by one line per alphabet symbol that
   starts with the decoded character and lists the scores against every
   symbol. Neither <sm> nor <fp> may be NULL. */
void gt_score_matrix_show(const GtScoreMatrix *sm, FILE *fp)
{
  unsigned row, col, size;

  gt_assert(sm && fp);

  /* the alphabet is not modified while printing, so query its size once */
  size = gt_alphabet_size(sm->alphabet);

  /* header: blank corner cell, then one column label per symbol */
  gt_xfputc(' ', fp);
  for (col = 0; col < size; col++) {
    fprintf(fp, " %c", gt_alphabet_decode(sm->alphabet, col));
  }
  gt_xfputc('\n', fp);

  /* body: row label followed by the scores of that row */
  for (row = 0; row < size; row++) {
    gt_xfputc(gt_alphabet_decode(sm->alphabet, row), fp);
    for (col = 0; col < size; col++) {
      fprintf(fp, " %2d", gt_score_matrix_get_score(sm, row, col));
    }
    gt_xfputc('\n', fp);
  }
}
/* Create a sequence decoder for the HCR file belonging to <name> (the file
   name is <name> with HCRFILESUFFIX appended), using alphabet <alpha>.

   The file is opened, the file info and the base/quality distribution are
   read from it, the Huffman decoder is initialised and finally the sampling
   and the file info tree are set up.

   Returns the new decoder, or NULL if opening the file or initialising the
   Huffman decoder set <err>; in that case the partially built decoder has
   already been deleted. */
static GtHcrSeqDecoder *hcr_seq_decoder_new(GtAlphabet *alpha,
                                            const char *name,
                                            GtError *err)
{
  GtHcrSeqDecoder *decoder = gt_malloc(sizeof *decoder);
  GtBaseQualDistr *distr = NULL;
  GtWord sampling_offset = 0;
  FILE *hcr_file = NULL;
  GT_UNUSED size_t read, one = (size_t) 1;

  /* put the decoder into a defined state before anything can fail */
  decoder->alpha = alpha;
  decoder->alphabet_size = gt_alphabet_size(alpha);
  decoder->cur_read = 0;
  decoder->data_iter = NULL;
  decoder->file_info_rbt = NULL;
  decoder->fileinfos = NULL;
  decoder->huff_dec = NULL;
  decoder->huffman = NULL;
  decoder->sampling = NULL;
  decoder->symbols = NULL;
  decoder->filename = gt_str_new_cstr(name);
  gt_str_append_cstr(decoder->filename, HCRFILESUFFIX);

  hcr_file = gt_fa_fopen_with_suffix(name, HCRFILESUFFIX, "rb", err);
  if (!gt_error_is_set(err)) {
    hcr_read_file_info(decoder, hcr_file);

    distr = hcr_base_qual_distr_new_from_file(hcr_file, decoder->alpha);
    decoder->qual_offset = distr->qual_offset;

    /* offset at which the sampling data starts; used for the seek below */
    read = gt_xfread_one(&sampling_offset, hcr_file);
    gt_assert(read == one);

    decoder->start_of_encoding = decoder_calc_start_of_encoded_data(hcr_file);

    seq_decoder_init_huffman(decoder, sampling_offset, distr, err);
  }

  if (gt_error_is_set(err)) {
    /* either opening the file or the Huffman initialisation failed */
    hcr_seq_decoder_delete(decoder);
    decoder = NULL;
  }
  else {
    gt_xfseek(hcr_file, sampling_offset, SEEK_SET);
    decoder->sampling = gt_sampling_read(hcr_file);

    decoder->file_info_rbt = seq_decoder_init_file_info(decoder->fileinfos,
                                                        decoder->num_of_files);
  }

  /* the distribution and the file handle are released on every path */
  hcr_base_qual_distr_delete(distr);
  gt_fa_fclose(hcr_file);
  return decoder;
}
/* Huffman-encode the <len> base/quality pairs given by <seq> and <qual>.

   For every position the base code (wildcards are mapped to the last code of
   the alphabet) and the quality value (clamped to the encoder's quality range
   where a bound is defined, then shifted by the quality offset) are combined
   into one symbol, which is looked up in the encoder's Huffman code.

   If <dry> is false the code words are appended to <bitstream>; if it is true
   nothing is written. In both cases the total number of code bits is
   returned. */
static GtUword hcr_write_seq(GtHcrSeqEncoder *seq_encoder,
                             const GtUchar *seq,
                             const GtUchar *qual,
                             GtUword len,
                             GtBitOutStream *bitstream,
                             bool dry)
{
  GtUword pos, total_bits = 0;

  for (pos = 0; pos < len; pos++) {
    const unsigned alpha_size = gt_alphabet_size(seq_encoder->alpha);
    unsigned base_code = (unsigned) seq[pos],
             quality = (unsigned) qual[pos],
             symbol,
             code_length;
    GtBitsequence code;

    /* wildcards share the last code of the alphabet */
    if (base_code == WILDCARD) {
      base_code = alpha_size - 1;
    }

    /* clamp the quality from below and above, if the bound is defined */
    if (seq_encoder->qrange.start != GT_UNDEF_UINT &&
        quality <= seq_encoder->qrange.start) {
      quality = seq_encoder->qrange.start;
    }
    if (seq_encoder->qrange.end != GT_UNDEF_UINT &&
        quality >= seq_encoder->qrange.end) {
      quality = seq_encoder->qrange.end;
    }
    quality = quality - seq_encoder->qual_offset;

    /* one symbol per (quality, base) combination */
    symbol = alpha_size * quality + base_code;
    gt_huffman_encode(seq_encoder->huffman, (GtUword) symbol, &code,
                      &code_length);
    total_bits += code_length;

    if (!dry) {
      gt_bitoutstream_append(bitstream, code, code_length);
    }
  }
  return total_bits;
}
/* Lua binding: pushes the size of the alphabet given as argument 1 onto the
   Lua stack. Returns 1, the number of pushed results. */
static int alphabet_lua_size(lua_State *L)
{
  GtAlphabet **alphabet = check_alphabet(L, 1);
  unsigned int num_of_symbols = gt_alphabet_size(*alphabet);

  lua_pushinteger(L, num_of_symbols);
  return 1;
}
/* Create a new score matrix over <alphabet> (must not be NULL). The matrix
   takes a new reference to the alphabet and has one row and one column per
   alphabet symbol; the score table is allocated with gt_array2dim_calloc. */
GtScoreMatrix* gt_score_matrix_new(GtAlphabet *alphabet)
{
  GtScoreMatrix *matrix;

  gt_assert(alphabet);

  matrix = gt_malloc(sizeof *matrix);
  matrix->alphabet = gt_alphabet_ref(alphabet);
  matrix->dimension = gt_alphabet_size(alphabet);
  /* square table: dimension x dimension */
  gt_array2dim_calloc(matrix->scores, matrix->dimension, matrix->dimension);

  return matrix;
}
/* Create an empty base/quality distribution for alphabet <alpha>.

   The distribution table has HCR_HIGHESTQUALVALUE + 1 rows (one per quality
   value) and one column per alphabet symbol; the last column index is stored
   as the wildcard index. min_qual/max_qual start at the opposite extremes.
   The bounds of <qrange> are stored with gt_safe_assign.
   The alphabet pointer is stored without taking a new reference. */
static GtBaseQualDistr* hcr_base_qual_distr_new(GtAlphabet *alpha,
                                                GtQualRange qrange)
{
  const unsigned int alpha_size = gt_alphabet_size(alpha);
  GtBaseQualDistr *distribution =
    gt_calloc((size_t) 1, sizeof (GtBaseQualDistr));

  gt_array2dim_calloc(distribution->distr, HCR_HIGHESTQUALVALUE + 1UL,
                      alpha_size);
  distribution->ncols = alpha_size;
  distribution->nrows = HCR_HIGHESTQUALVALUE + 1U;
  distribution->qual_offset = HCR_LOWESTQUALVALUE;
  distribution->wildcard_indx = alpha_size - 1;

  /* start with an "inverted" range */
  distribution->min_qual = HCR_HIGHESTQUALVALUE;
  distribution->max_qual = HCR_LOWESTQUALVALUE;

  gt_safe_assign(distribution->qrange_start, qrange.start);
  gt_safe_assign(distribution->qrange_end, qrange.end);

  distribution->alpha = alpha;
  return distribution;
}
/* Build a score function over alphabet <a> with the given <insertion> and
   <deletion> scores. The underlying score matrix scores <match> on the
   diagonal and <mismatch> everywhere else, except that the last symbol of the
   alphabet (N, according to the original comment) scores <mismatch> even
   against itself. */
static GtScoreFunction* gt_dna_scorefunc_new(GtAlphabet *a, int match,
                                             int mismatch, int insertion,
                                             int deletion)
{
  GtScoreMatrix *matrix = gt_score_matrix_new(a);
  GtScoreFunction *scorefunc = gt_score_function_new(matrix, insertion,
                                                     deletion);
  /* the alphabet is not modified below, so its size is queried only once */
  const unsigned int size = gt_alphabet_size(a);
  unsigned int row, col;

  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      int score = mismatch;
      if (col == row) {
        score = match;
      }
      gt_score_matrix_set_score(matrix, row, col, score);
    }
  }

  /* make N-N a mismatch! */
  gt_score_matrix_set_score(matrix, size - 1, size - 1, mismatch);

  return scorefunc;
}
/* Lua binding: decodes the integer code given as argument 2 with the alphabet
   given as argument 1 and pushes the resulting character as a string of
   length 1. Raises a Lua argument error ("invalid code") if the code is not
   smaller than the alphabet size. Returns 1, the number of pushed results. */
static int alphabet_lua_decode(lua_State *L)
{
  GtAlphabet **alphabet = check_alphabet(L, 1);
  unsigned int symbol_code = luaL_checkinteger(L, 2);
  char decoded;

  /* XXX: too restrictive, does not consider wildcards */
  luaL_argcheck(L, symbol_code < gt_alphabet_size(*alphabet), 2,
                "invalid code");

  decoded = gt_alphabet_decode(*alphabet, symbol_code);
  lua_pushlstring(L, &decoded, 1);
  return 1;
}
/* Compute the scaled score for aligning the translated genomic codon <amino>
   against the reference character <origreferencechar>:

   1.) if either of them is DASH: SCALEFACTOR * INDEL_PENALTY
   2.) stop codon against stop codon: SCALEFACTOR * -2 * INDEL_PENALTY;
       stop codon against anything else (either way round):
       SCALEFACTOR * 2 * INDEL_PENALTY
   3.) otherwise: SCALEFACTOR * the substitution score from <score_matrix>,
       where characters encoded as WILDCARD use the last matrix index
   4.) 0.0 (neutral) if <amino> is a wildcard or one of the characters is not
       valid input for <score_matrix_alphabet> */
static GthFlt get_score(GtScoreMatrix *score_matrix,
                        GtAlphabet *score_matrix_alphabet,
                        unsigned char amino,
                        unsigned char origreferencechar)
{
  GthFlt result = 0.0,
         scale = SCALEFACTOR,
         penalty = INDEL_PENALTY;
  int amino_is_stop, reference_is_stop;

  /* 1.) scaled INDEL_PENALTY for deletions from and insertions into genomic
         DNA of lengths 1, 2, or 3, irrespective of indel size */
  if (amino == DASH || origreferencechar == DASH) {
    result = scale * penalty;
    return result;
  }

  /* 4.) neutral score in case of wild-card characters in the genomic DNA
         (or characters the score matrix alphabet does not know) */
  if (amino == WILDCARD ||
      amino > CHAR_MAX ||
      !gt_alphabet_valid_input(score_matrix_alphabet, amino) ||
      origreferencechar > CHAR_MAX ||
      !gt_alphabet_valid_input(score_matrix_alphabet, origreferencechar)) {
    return result;
  }

  amino_is_stop = (amino == GT_STOP_AMINO);
  reference_is_stop = (origreferencechar == GT_STOP_AMINO);

  if (amino_is_stop && reference_is_stop) {
    /* 2.) stop codon matched with stop codon */
    result = scale * -2 * penalty;
  }
  else if (amino_is_stop || reference_is_stop) {
    /* 2.) exactly one of the two is a stop codon */
    result = scale * 2 * penalty;
  }
  else {
    /* 3.) amino acid substitution score */
    GtUchar amino_code, reference_code;
    int wildcard_index;

    amino_code = gt_alphabet_encode(score_matrix_alphabet, amino);
    reference_code = gt_alphabet_encode(score_matrix_alphabet,
                                        origreferencechar);
    wildcard_index = gt_alphabet_size(score_matrix_alphabet) - 1;
    result = scale *
             gt_score_matrix_get_score(score_matrix,
                                       amino_code == WILDCARD
                                         ? wildcard_index : amino_code,
                                       reference_code == WILDCARD
                                         ? wildcard_index : reference_code);
  }
  return result;
}
/* Callback receiving one Huffman symbol together with its frequency; <pt>
   must point to a WriteNodeInfo. The symbol is split into its base part
   (symbol modulo alphabet size) and its quality part (symbol divided by
   alphabet size, plus the quality offset). The upper-cased base character,
   the quality value and the frequency are written, in this order, to the
   output stored in the WriteNodeInfo. <code> and <code_length> are ignored.
   Always returns 0. */
static int hcr_huffman_write_base_qual_freq(GtUword symbol,
                                            GtUint64 freq,
                                            GT_UNUSED GtBitsequence code,
                                            GT_UNUSED unsigned code_length,
                                            void *pt)
{
  WriteNodeInfo *node_info = (WriteNodeInfo*) pt;
  const unsigned int alpha_size = gt_alphabet_size(node_info->alpha);
  GtUchar base_char, quality;

  gt_safe_assign(base_char, (symbol % alpha_size));
  /* the last code of the alphabet stands for the wildcard */
  if (base_char == (GtUchar) alpha_size - 1) {
    base_char = (GtUchar) WILDCARD;
  }
  /* turn the code into the (upper case) character that is stored */
  gt_safe_assign(base_char,
                 (toupper(gt_alphabet_decode(node_info->alpha, base_char))));
  gt_xfwrite_one(&base_char, node_info->output);

  gt_safe_assign(quality, (symbol / alpha_size + node_info->qual_offset));
  gt_xfwrite_one(&quality, node_info->output);

  gt_xfwrite_one(&freq, node_info->output);
  return 0;
}
/* Read a base/quality distribution from the current position of <fp>.

   The expected layout is: the number of leaves, followed by one record per
   leaf consisting of a base character, a quality value and a frequency. Every
   record is stored in a table with HCR_HIGHESTQUALVALUE + 1 rows (quality
   values) and one column per symbol of <alpha>; characters that <alpha>
   encodes as WILDCARD go to the last column. Afterwards the smallest and
   largest quality value seen are recorded and the table is trimmed with
   hcr_base_qual_distr_trim().

   Returns the new distribution. */
static GtBaseQualDistr* hcr_base_qual_distr_new_from_file(FILE *fp,
                                                          GtAlphabet *alpha)
{
  GtBaseQualDistr *bqd;
  char read_char_code;
  GtUchar cur_char_code;
  unsigned char cur_qual;
  unsigned alpha_size,
           min_qual = HCR_HIGHESTQUALVALUE,
           max_qual = HCR_LOWESTQUALVALUE;
  GtUword numofleaves,
          i;
  GtUint64 cur_freq;
  GT_UNUSED size_t read,
                   one = (size_t) 1;

  alpha_size = gt_alphabet_size(alpha);
  /* zero-initialise, like hcr_base_qual_distr_new() does, so that members
     which are not set below do not hold indeterminate values */
  bqd = gt_calloc((size_t) 1, sizeof (GtBaseQualDistr));
  gt_array2dim_calloc(bqd->distr, HCR_HIGHESTQUALVALUE + 1UL, alpha_size);
  bqd->ncols = alpha_size;
  bqd->nrows = HCR_HIGHESTQUALVALUE + 1U;
  bqd->qual_offset = HCR_LOWESTQUALVALUE;
  bqd->wildcard_indx = alpha_size - 1;

  read = gt_xfread_one(&numofleaves, fp);
  gt_assert(read == one);

  for (i = 0; i < numofleaves; i++) {
    read = gt_xfread_one(&read_char_code, fp);
    gt_assert(read == one);
    read = gt_xfread_one(&cur_qual, fp);
    gt_assert(read == one);
    read = gt_xfread_one(&cur_freq, fp);
    gt_assert(read == one);

    cur_char_code = gt_alphabet_encode(alpha, read_char_code);
    if (cur_char_code == (GtUchar) WILDCARD)
      gt_safe_assign(cur_char_code, bqd->wildcard_indx);

    /* both indices come straight from the file: make sure they address a
       cell inside the table allocated above before writing to it */
    gt_assert((unsigned) cur_qual <= HCR_HIGHESTQUALVALUE);
    gt_assert((unsigned) cur_char_code < alpha_size);
    bqd->distr[cur_qual][cur_char_code] = cur_freq;

    if ((unsigned) cur_qual > max_qual)
      max_qual = cur_qual;
    if ((unsigned) cur_qual < min_qual)
      min_qual = cur_qual;
  }

  bqd->min_qual = min_qual;
  bqd->max_qual = max_qual;
  hcr_base_qual_distr_trim(bqd);
  return bqd;
}
/* Handler for an insertion (the genomic side is a DASH) while borders and
   scores are computed; presumably registered as a callback of the alignment
   traversal -- TODO confirm against the caller.

   <state> is the current traversal state, <data> must point to a
   Computebordersandscoresdata object, and <lengthofeop> has to be 1 (it is
   only read by the assertion, hence GT_UNUSED).

   After giving evalnewexonifpossible() the chance to update the exon/intron
   bookkeeping, the score of the insertion is added to singleexonweight and
   maxsingleexonweight is increased as described in the comments below. */
static void computebordersandscoresprocinsertion(Traversealignmentstate *state,
                                                 void *data,
                                                 GT_UNUSED
                                                 unsigned long lengthofeop)
{
  Computebordersandscoresdata *d = (Computebordersandscoresdata*) data;
  /* NOTE(review): gen_alphabet_mapsize is never referenced explicitly in this
     function; it is presumably used by name inside the ADDOUTPUTWEIGHT*
     macros, so do not rename or remove it without checking them */
  unsigned int gen_alphabet_mapsize = gt_alphabet_size(d->gen_alphabet);
  GthDPOptionsEST *dp_options_est = d->dp_options_est;
  unsigned char genomicchar, referencechar;
  GthFlt score;

  gt_assert(lengthofeop == 1);
  /* we are not processing with 1 base left here */
  gt_assert(!state->processing_intron_with_1_base_left);
  /* we are not processing with 2 bases left here */
  gt_assert(!state->processing_intron_with_2_bases_left);

  evalnewexonifpossible(d->proteineop, &d->newexon, &d->newintron,
                        &d->firstexon, d->introncutout, d->spliced_seq,
                        &d->exon, &d->intron, d->sa, state, d->gen_alphabet,
                        d->dp_param, dp_options_est, d->gen_seq_tran,
                        d->ref_seq_tran, d->gen_dp_start);

  referencechar = d->ref_seq_tran[state->referenceptr];

  if (d->proteineop) {
    /* protein case: look up the score of three DASHes against the reference
       character */
    score = GTHGETSCORE(d->dp_scores_protein, DASH, DASH, DASH, referencechar);
    gt_assert(score < 0.0); /* XXX: maybe remove this */
    d->singleexonweight += score;
    /* we subtract the negative score here to increase the maxsingleexonweight
       this is a somewhat arbitrarily chosen value, since it does not reflect
       the maximum value which is possible with a proper codon */
    d->maxsingleexonweight -= score;
    /* XXX: maybe better add the maximum score which can be achieved by a match
       with referencechar */
  }
  else {
    /* non-protein case: weight of a DASH against the reference character */
    genomicchar = (unsigned char) DASH;
    ADDOUTPUTWEIGHT(d->singleexonweight, genomicchar, referencechar);
    /* SK: replaced
       ADDOUTPUTWEIGHT(d->maxsingleexonweight, genomicchar, genomicchar);
       by the following */
    ADDOUTPUTWEIGHTIDENTITY(d->maxsingleexonweight, genomicchar);
  }
}
/* Handler for a mismatch or deletion while a splice site score is being
   computed. As long as fewer than SPLICE_SITE_SCORE_WINDOW alignment
   positions have been processed, only the maximal splice site weight is
   increased (by the identity weight of the genomic character) and one more
   position is counted; the actual splice site weight is left untouched.
   Once the window is exhausted the traversal is asked to stop via
   <breaktraversealignment>.

   <data> must point to a Calcsplicesitescoredata object; <lengthofeop> is
   not used. */
static void calcsplicesitescoreprocmismatchordeletion(
                                                Traversealignmentstate *state,
                                                void *data,
                                                GT_UNUSED
                                                unsigned long lengthofeop)
{
  Calcsplicesitescoredata *d = (Calcsplicesitescoredata*) data;
  /* NOTE: the next two locals are not referenced explicitly below; they are
     presumably picked up by name inside ADDOUTPUTWEIGHTIDENTITY, so they must
     keep exactly these names */
  unsigned int gen_alphabet_mapsize = gt_alphabet_size(d->gen_alphabet);
  GthDPOptionsEST *dp_options_est = d->dp_options_est;
  unsigned char gen_char;

  if (d->processedalignmentpositions >= SPLICE_SITE_SCORE_WINDOW) {
    /* window already filled up: nothing more to score */
    d->breaktraversealignment = true;
    return;
  }

  gen_char = d->gen_seq_tran[state->genomicptr];
  ADDOUTPUTWEIGHTIDENTITY(d->maxsplicesiteweight, gen_char);
  d->processedalignmentpositions++;
}
/* Create a new wavelet tree over <encseq>.

   The tree takes new references to <encseq> and to its alphabet. A temporary
   bit vector with levels * length bits is allocated, filled by
   gt_wtree_encseq_fill_bits(), converted into a compressed bit sequence and
   then freed again. */
GtWtree* gt_wtree_encseq_new(GtEncseq *encseq)
{
  /* sample rate for compressed bitseq */
  const unsigned int samplerate = 32U;
  /* number of bits that fit into one element of the temporary bit vector */
  const size_t bits_per_word = sizeof (GtBitsequence) * CHAR_BIT;
  GtWtree *wtree = gt_wtree_create(gt_wtree_encseq_class());
  GtWtreeEncseq *we = gt_wtree_encseq_cast(wtree);

  we->encseq = gt_encseq_ref(encseq);
  we->alpha = gt_alphabet_ref(gt_encseq_alphabet(encseq));
  /* encoded chars + WC given by gt_alphabet_size,
     we have to encode UNDEFCHAR and SEPARATOR too */
  we->alpha_size = gt_alphabet_size(we->alpha) + 2;
  wtree->members->num_of_symbols = (GtUword) we->alpha_size;
  /* levels in tree: \lceil log_2(\sigma)\rceil */
  we->levels = gt_determinebitspervalue((GtUword) we->alpha_size);
  we->root_fo = gt_wtree_encseq_fill_offset_new();
  we->current_fo = we->root_fo;
  wtree->members->length = gt_encseq_total_length(encseq);

  /* each level has number of symbols bits */
  we->num_of_bits = we->levels * wtree->members->length;
  /* number of vector elements needed: round up to whole elements */
  we->bits_size = we->num_of_bits / bits_per_word;
  if (we->num_of_bits % bits_per_word != 0) {
    we->bits_size++;
  }

  we->bits = gt_calloc((size_t) we->bits_size, sizeof (GtBitsequence));
  we->node_start = 0;
  gt_wtree_encseq_fill_bits(we);

  we->c_bits = gt_compressed_bitsequence_new(we->bits, samplerate,
                                             we->num_of_bits);
  /* the uncompressed bits are released right after compression */
  gt_free(we->bits);
  we->bits = NULL;

  return wtree;
}
/* Handler for a run of <lengthofeop> matches while borders and scores are
   computed; presumably registered as a callback of the alignment traversal
   -- TODO confirm against the caller.

   <state> is the current traversal state and <data> must point to a
   Computebordersandscoresdata object.

   After giving evalnewexonifpossible() the chance to update the exon/intron
   bookkeeping, the score of a single match is determined, multiplied by
   <lengthofeop> and added to singleexonweight and maxsingleexonweight. */
static void computebordersandscoresprocmatch(Traversealignmentstate *state,
                                             void *data,
                                             unsigned long lengthofeop)
{
  Computebordersandscoresdata *d = (Computebordersandscoresdata*) data;
  /* NOTE(review): gen_alphabet_mapsize and dp_options_est are never
     referenced explicitly in this function; they are presumably used by name
     inside the ADDOUTPUTWEIGHT* macros, so do not rename or remove them
     without checking the macros */
  unsigned int gen_alphabet_mapsize = gt_alphabet_size(d->gen_alphabet);
  GthDPOptionsEST *dp_options_est = d->dp_options_est;
  unsigned char genomicchar1, genomicchar2, genomicchar3, referencechar,
                origreferencechar;
  GthFlt genomicinterimvalue = 0.0, referenceinterimvalue = 0.0;

  evalnewexonifpossible(d->proteineop, &d->newexon, &d->newintron,
                        &d->firstexon, d->introncutout, d->spliced_seq,
                        &d->exon, &d->intron, d->sa, state, d->gen_alphabet,
                        d->dp_param, d->dp_options_est, d->gen_seq_tran,
                        d->ref_seq_tran, d->gen_dp_start);

  if (d->proteineop) {
    /* protein case: collect the three genomic characters of the codon; which
       positions they come from depends on whether a codon was split by an
       intron */
    if (state->processing_intron_with_1_base_left) {
      /* first base was saved before the intron, the other two follow it */
      genomicchar1 = d->gen_seq_tran[state->firstbaseleftptr];
      genomicchar2 = d->gen_seq_tran[state->genomicptr];
      genomicchar3 = d->gen_seq_tran[state->genomicptr + 1];
      origreferencechar = d->ref_seq_orig[state->referenceptr];
    }
    else if (state->processing_intron_with_2_bases_left) {
      /* first two bases were saved before the intron */
      genomicchar1 = d->gen_seq_tran[state->firstbaseleftptr];
      genomicchar2 = d->gen_seq_tran[state->secondbaseleftptr];
      genomicchar3 = d->gen_seq_tran[state->genomicptr];
      origreferencechar = d->ref_seq_orig[state->referenceptr - 1];
      /* note the "- 1" above:
         we are processing a match after an intron with two bases left here.
         therefore, the reference pointer points already to the amino acid
         after this match, because the previous amino acid belongs to the two
         nucleotides before the intron.
         now it should be clear why we have to subtract 1 here. */
    }
    else {
      /* ordinary codon: three consecutive genomic characters */
      genomicchar1 = d->gen_seq_tran[state->genomicptr];
      genomicchar2 = d->gen_seq_tran[state->genomicptr + 1];
      genomicchar3 = d->gen_seq_tran[state->genomicptr + 2];
      origreferencechar = d->ref_seq_orig[state->referenceptr];
    }
    /* genomic codon equals reference character */
    gt_assert(origreferencechar == gthgetcodon(genomicchar1, genomicchar2,
                                               genomicchar3,
                                               d->gen_alphabet_characters,
                                               d->transtable));
    genomicinterimvalue = GTHGETSCORE(d->dp_scores_protein, genomicchar1,
                                      genomicchar2, genomicchar3,
                                      origreferencechar);
    /* score of one match times the number of matches in this run */
    genomicinterimvalue *= lengthofeop;
    /* for a match the achieved and the maximal weight grow by the same
       amount */
    d->singleexonweight += genomicinterimvalue;
    d->maxsingleexonweight += genomicinterimvalue;
  }
  else {
    genomicchar1 = d->gen_seq_tran[state->genomicptr];
    /* referenceptr in valid range */
    gt_assert(state->referenceptr >= 0 &&
              state->referenceptr < (long) d->ref_dp_length);
    referencechar = d->ref_seq_tran[state->referenceptr];
    /* genomic char equals reference char */
    gt_assert(genomicchar1 == referencechar);
    ADDOUTPUTWEIGHT(referenceinterimvalue, genomicchar1, referencechar);
    /* SK: replaced
       ADDOUTPUTWEIGHT(genomicinterimvalue, genomicchar1, genomicchar1);
       by the following */
    ADDOUTPUTWEIGHTIDENTITY(genomicinterimvalue, genomicchar1);
    /* weights of one match times the number of matches in this run */
    genomicinterimvalue *= lengthofeop;
    referenceinterimvalue *= lengthofeop;
    d->singleexonweight += referenceinterimvalue;
    d->maxsingleexonweight += genomicinterimvalue;
  }
}
/* Handler for a mismatch or a deletion while borders and scores are computed;
   presumably called from the mismatch and the deletion callbacks of the
   alignment traversal -- TODO confirm against the callers.

   <state> is the current traversal state, <data> must point to a
   Computebordersandscoresdata object, and <lengthofeop> has to be 1 (it is
   only read by the assertion, hence GT_UNUSED).

   After giving evalnewexonifpossible() the chance to update the exon/intron
   bookkeeping, only maxsingleexonweight is increased: by the score of the
   genomic codon against its own translation (protein case) or by the identity
   weight of the genomic character (otherwise). singleexonweight is not
   modified here. */
static void computescoresprocmismatchordeletion(Traversealignmentstate *state,
                                                void *data,
                                                GT_UNUSED
                                                unsigned long lengthofeop)
{
  Computebordersandscoresdata *d = (Computebordersandscoresdata*) data;
  /* NOTE(review): gen_alphabet_mapsize is never referenced explicitly in this
     function; it is presumably used by name inside ADDOUTPUTWEIGHTIDENTITY,
     so do not rename or remove it without checking the macro */
  unsigned int gen_alphabet_mapsize = gt_alphabet_size(d->gen_alphabet);
  GthDPOptionsEST *dp_options_est = d->dp_options_est;
  /* origreferencechar is only needed by the assertion block further down and
     is therefore declared in debug builds only */
  unsigned char genomicchar1, genomicchar2, genomicchar3,
#ifndef NDEBUG
                origreferencechar,
#endif
                codon;

  gt_assert(lengthofeop == 1);

  evalnewexonifpossible(d->proteineop, &d->newexon, &d->newintron,
                        &d->firstexon, d->introncutout, d->spliced_seq,
                        &d->exon, &d->intron, d->sa, state, d->gen_alphabet,
                        d->dp_param, dp_options_est, d->gen_seq_tran,
                        d->ref_seq_tran, d->gen_dp_start);

  if (d->proteineop) {
    /* protein case: collect the three genomic characters of the codon; which
       positions they come from depends on whether a codon was split by an
       intron */
    if (state->processing_intron_with_1_base_left) {
      genomicchar1 = d->gen_seq_tran[state->firstbaseleftptr];
      genomicchar2 = d->gen_seq_tran[state->genomicptr];
      genomicchar3 = d->gen_seq_tran[state->genomicptr + 1];
    }
    else if (state->processing_intron_with_2_bases_left) {
      genomicchar1 = d->gen_seq_tran[state->firstbaseleftptr];
      genomicchar2 = d->gen_seq_tran[state->secondbaseleftptr];
      genomicchar3 = d->gen_seq_tran[state->genomicptr];
    }
    else {
      genomicchar1 = d->gen_seq_tran[state->genomicptr];
      genomicchar2 = d->gen_seq_tran[state->genomicptr + 1];
      genomicchar3 = d->gen_seq_tran[state->genomicptr + 2];
    }
    codon = gthgetcodon(genomicchar1, genomicchar2, genomicchar3,
                        d->gen_alphabet_characters, d->transtable);
#ifndef NDEBUG
    /* debug-only sanity check, performed only if process_mismatch is set */
    if (d->process_mismatch) {
      /* with two bases left before the intron the reference character
         belonging to this codon is the one before referenceptr */
      if (state->processing_intron_with_2_bases_left)
        origreferencechar = d->ref_seq_orig[state->referenceptr - 1];
      else
        origreferencechar = d->ref_seq_orig[state->referenceptr];
      /* genomic codon does not equal reference character */
      gt_assert(codon != origreferencechar);
    }
#endif
    /* score of the genomic codon against its own translation */
    d->maxsingleexonweight += GTHGETSCORE(d->dp_scores_protein, genomicchar1,
                                          genomicchar2, genomicchar3, codon);
  }
  else {
    genomicchar1 = d->gen_seq_tran[state->genomicptr];
    /* SK: replaced
       ADDOUTPUTWEIGHT(d->maxsingleexonweight, genomicchar1, genomicchar1);
       by the following */
    ADDOUTPUTWEIGHTIDENTITY(d->maxsingleexonweight, genomicchar1);
  }
}