/* Create a new score matrix for <alphabet>.
   The matrix stores the result of gt_alphabet_ref(<alphabet>) and a square
   score table with gt_alphabet_size(<alphabet>) rows and columns, allocated
   through gt_array2dim_calloc(). <alphabet> must not be NULL. */
GtScoreMatrix* gt_score_matrix_new(GtAlphabet *alphabet)
{
  GtScoreMatrix *matrix;

  gt_assert(alphabet);

  matrix = gt_malloc(sizeof *matrix);
  /* the dimension has to be known before the table can be allocated */
  matrix->dimension = gt_alphabet_size(alphabet);
  matrix->alphabet = gt_alphabet_ref(alphabet);
  gt_array2dim_calloc(matrix->scores, matrix->dimension, matrix->dimension);

  return matrix;
}
/* Build a base/quality distribution for alphabet <alpha> from the table
   stored in <fp>.

   The stream is expected to hold a GtUword record count followed by that
   many records, each consisting of a character, a quality value
   (unsigned char) and a frequency (GtUint64). Every record is stored at
   distr[quality][encoded character]; wildcard characters are mapped to the
   last column. After reading, the smallest and largest quality seen are
   recorded and the table is passed to hcr_base_qual_distr_trim().

   This function has no error channel: short reads and out-of-range table
   entries are caught by gt_assert() only. */
static GtBaseQualDistr* hcr_base_qual_distr_new_from_file(FILE *fp,
                                                          GtAlphabet *alpha)
{
  GtBaseQualDistr *bqd;
  char read_char_code;
  GtUchar cur_char_code;
  unsigned char cur_qual;
  unsigned alpha_size,
           min_qual = HCR_HIGHESTQUALVALUE,
           max_qual = HCR_LOWESTQUALVALUE;
  GtUword numofleaves,
          i;
  GtUint64 cur_freq;
  GT_UNUSED size_t read,
                   one = (size_t) 1;

  alpha_size = gt_alphabet_size(alpha);
  /* zero-initialise the struct (as hcr_base_qual_distr_new() does) so that
     members not set below do not hold indeterminate values */
  bqd = gt_calloc((size_t) 1, sizeof (GtBaseQualDistr));
  gt_array2dim_calloc(bqd->distr, HCR_HIGHESTQUALVALUE + 1UL, alpha_size);
  bqd->ncols = alpha_size;
  bqd->nrows = HCR_HIGHESTQUALVALUE + 1U;
  bqd->qual_offset = HCR_LOWESTQUALVALUE;
  bqd->wildcard_indx = alpha_size - 1;

  read = gt_xfread_one(&numofleaves, fp);
  gt_assert(read == one);

  for (i = 0; i < numofleaves; i++) {
    read = gt_xfread_one(&read_char_code, fp);
    gt_assert(read == one);
    read = gt_xfread_one(&cur_qual, fp);
    gt_assert(read == one);
    read = gt_xfread_one(&cur_freq, fp);
    gt_assert(read == one);

    cur_char_code = gt_alphabet_encode(alpha, read_char_code);
    if (cur_char_code == (GtUchar) WILDCARD)
      gt_safe_assign(cur_char_code, bqd->wildcard_indx);

    /* both indices come from the file; the table has
       HCR_HIGHESTQUALVALUE + 1 rows and alpha_size columns */
    gt_assert((unsigned) cur_qual <= HCR_HIGHESTQUALVALUE);
    gt_assert((unsigned) cur_char_code < alpha_size);
    bqd->distr[cur_qual][cur_char_code] = cur_freq;

    if ((unsigned) cur_qual > max_qual)
      max_qual = cur_qual;
    if ((unsigned) cur_qual < min_qual)
      min_qual = cur_qual;
  }

  bqd->min_qual = min_qual;
  bqd->max_qual = max_qual;
  hcr_base_qual_distr_trim(bqd);
  return bqd;
}
/* Create an empty base/quality distribution for alphabet <alpha>.
   The table has HCR_HIGHESTQUALVALUE + 1 rows and one column per alphabet
   symbol; the last column is used as the wildcard index. min_qual and
   max_qual start out "inverted" (highest/lowest possible value), and the
   bounds of <qrange> are copied with gt_safe_assign(). The <alpha> pointer
   is stored as is. */
static GtBaseQualDistr* hcr_base_qual_distr_new(GtAlphabet *alpha,
                                                GtQualRange qrange)
{
  const unsigned int num_symbols = gt_alphabet_size(alpha);
  GtBaseQualDistr *result = gt_calloc((size_t) 1, sizeof (GtBaseQualDistr));

  /* table geometry */
  gt_array2dim_calloc(result->distr, HCR_HIGHESTQUALVALUE + 1UL, num_symbols);
  result->ncols = num_symbols;
  result->nrows = HCR_HIGHESTQUALVALUE + 1U;
  result->wildcard_indx = num_symbols - 1;

  /* quality bookkeeping */
  result->qual_offset = HCR_LOWESTQUALVALUE;
  result->min_qual = HCR_HIGHESTQUALVALUE;
  result->max_qual = HCR_LOWESTQUALVALUE;
  gt_safe_assign(result->qrange_start, qrange.start);
  gt_safe_assign(result->qrange_end, qrange.end);

  result->alpha = alpha;
  return result;
}
/* Drop the unused leading rows of <bqd>'s distribution table.
   If min_qual is non-zero, a new table covering the rows
   min_qual..max_qual is allocated, the counts are copied over, the old
   table is released and nrows/qual_offset are updated accordingly.
   Nothing happens if min_qual is 0 or if no quality value has been
   recorded yet (min_qual > max_qual). */
static void hcr_base_qual_distr_trim(GtBaseQualDistr *bqd)
{
  GtUint64 **distr_trimmed;
  unsigned nrows_new,
           i,
           j;

  if (bqd->min_qual == 0)
    return;
  /* An empty distribution still has min_qual > max_qual; computing
     max_qual - min_qual + 1 in that state does not yield a valid row
     count, so leave the table untouched. */
  if (bqd->min_qual > bqd->max_qual)
    return;

  nrows_new = bqd->max_qual - bqd->min_qual + 1;
  gt_array2dim_calloc(distr_trimmed, (GtUword) nrows_new, bqd->ncols);

  for (i = 0; i < nrows_new; i++) {
    for (j = 0; j < bqd->ncols; j++)
      distr_trimmed[i][j] = bqd->distr[i + bqd->min_qual][j];
  }

  gt_array2dim_delete(bqd->distr);
  bqd->distr = distr_trimmed;
  bqd->nrows = nrows_new;
  bqd->qual_offset = bqd->min_qual;
}
/* Tool runner for genomediff.

   Collects the remaining command line arguments as input file names, then
   - with more than one file name: encodes them as DNA into an encseq named
     arguments->indexname;
   - with one file name: uses it as index name and, if an ESA or packed
     index is requested, scans the project file for a "mirrored=1" entry.
   Afterwards the encseq is loaded (mirrored if requested), unit information
   is set up (optionally read from arguments->unitfile), the shulen sums are
   computed and handed to gt_genomediff_kr_calc().

   Returns 0 on success and a non-zero value on failure; <err> is passed on
   to the helper functions called here. */
static int gt_genomediff_runner(int argc, const char **argv,
                                int parsed_args, void *tool_arguments,
                                GtError *err)
{
  bool mirrored = false;
  int had_err = 0,
      i;
  GtEncseq *encseq = NULL;
  GtGenomediffArguments *arguments = tool_arguments;
  GtLogger *logger;
  GtShuUnitFileInfo *unit_info = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose,
                         GT_LOGGER_DEFLT_PREFIX,
                         stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("start");
    /* check the timer before it is used */
    gt_assert(timer);
    gt_timer_start(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "start shu search", stdout);

  if (gt_str_array_size(arguments->filenames) > 1UL) {
    GtEncseqEncoder *ee = gt_encseq_encoder_new();
    gt_encseq_encoder_set_timer(ee, timer);
    gt_encseq_encoder_set_logger(ee, logger);
    /* kr only makes sense for dna, so we can check this already with ee */
    gt_encseq_encoder_set_input_dna(ee);
    had_err = gt_encseq_encoder_encode(ee, arguments->filenames,
                                       gt_str_get(arguments->indexname), err);
    gt_encseq_encoder_delete(ee);
  }
  else {
    gt_str_append_str(arguments->indexname,
                      gt_str_array_get_str(arguments->filenames, 0));
    if (arguments->with_esa || arguments->with_pck) {
      GtStr *current_line = gt_str_new();
      FILE *prj_fp;
      const char *buffer;
      char **elements = NULL;

      prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname),
                                       GT_PROJECTFILESUFFIX,"rb",err);
      if (prj_fp == NULL)
        had_err = -1;
      /* scan the "key=value" lines of the project file for "mirrored" */
      while (!had_err &&
             gt_str_read_next_line(current_line, prj_fp) != EOF) {
        buffer = gt_str_get(current_line);
        /* release the pieces of the previous line */
        if (elements != NULL) {
          gt_free(elements[0]);
          gt_free(elements[1]);
        }
        gt_free(elements);
        elements = gt_cstr_split(buffer, '=');
        gt_log_log("%s", elements[0]);
        /* a "mirrored" line without '=' has no value part; do not hand a
           NULL pointer to gt_log_log()/strcmp() in that case */
        if (strcmp("mirrored", elements[0]) == 0 && elements[1] != NULL) {
          gt_log_log("%s", elements[1]);
          if (strcmp("1", elements[1]) == 0) {
            mirrored = true;
            gt_log_log("sequences are treated as mirrored");
          }
        }
        gt_str_reset(current_line);
      }
      gt_str_delete(current_line);
      if (elements != NULL) {
        gt_free(elements[0]);
        gt_free(elements[1]);
      }
      gt_free(elements);
      /* only close the project file if it was actually opened */
      if (prj_fp != NULL)
        gt_fa_xfclose(prj_fp);
    }
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    if (mirrored)
      gt_encseq_loader_mirror(el);
    encseq =
      gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (!had_err) {
    uint64_t **shusums = NULL;
    if (arguments->with_esa || arguments->with_pck) {
      shusums = gt_genomediff_shulen_sum(arguments, unit_info,
                                         logger, timer, err);
      if (shusums == NULL)
        had_err = -1;
    }
    else {
      const bool doesa = true;
      GenomediffInfo gd_info;
      Suffixeratoroptions sopts;
      sopts.beverbose = arguments->verbose;
      sopts.indexname = arguments->indexname;
      sopts.db = NULL;
      sopts.encopts = NULL;
      sopts.genomediff = true;
      sopts.inputindex = arguments->indexname;
      sopts.loadopts = arguments->loadopts;
      sopts.showprogress = false;
      sopts.idxopts = arguments->idxopts;

      gt_assert(unit_info != NULL);
      gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                          unit_info->num_of_genomes);
      gd_info.shulensums = shusums;
      gd_info.unit_info = unit_info;
      had_err = gt_runsuffixerator(doesa, &sopts, &gd_info, logger, err);
    }
    if (!had_err && shusums != NULL) {
      had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info,
                                      arguments->with_pck, logger, timer, err);
    }
    /* release the table on the error path as well; previously it was only
       freed when all preceding steps had succeeded */
    if (shusums != NULL) {
      gt_array2dim_delete(shusums);
    }
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);

  return had_err;
}
/* Tool runner for gdiffcalc.

   Loads the encseq named arguments->indexname, sets up the unit information
   (optionally from arguments->unitfile) and reads a table of values from
   the first remaining command line argument. Each token delivered by the
   tokenizer is split at ';'; fields starting with '#' are logged as names,
   all other fields are parsed as doubles and stored row by row in a
   num_of_genomes x num_of_genomes table, which is finally passed to
   gt_genomediff_calculate_div_from_avg().

   Returns 0 on success and a non-zero value on failure. Fields that cannot
   be parsed, or a table with more rows/columns than genomes, set <err>. */
static int gt_gdiffcalc_runner(int argc, const char **argv,
                               int parsed_args, void *tool_arguments,
                               GT_UNUSED GtError *err)
{
  GtGenomediffArguments *arguments = tool_arguments;
  int had_err = 0,
      i;
  GtUword lcounter = 0,
          zcounter = 0;
  double **shusums = NULL;
  GtEncseq *encseq = NULL;
  GtLogger *logger;
  GtShuUnitFileInfo *unit_info = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose,
                         GT_LOGGER_DEFLT_PREFIX,
                         stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("load encseq");
    /* check the timer before it is used */
    gt_assert(timer);
    gt_timer_start(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    encseq =
      gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (timer != NULL)
    gt_timer_show_progress(timer, "load units", stdout);

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "read table", stdout);

  if (!had_err) {
    GtIO *table_file = NULL;
    GtTokenizer *tokenizer = NULL;
    GtStr *line = NULL;

    gt_assert(unit_info != NULL);
    gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                        unit_info->num_of_genomes);

    table_file = gt_io_new(gt_str_array_get(arguments->filenames, 0), "r");
    tokenizer = gt_tokenizer_new(table_file);
    line = gt_tokenizer_get_token(tokenizer);

    while (line != NULL && !had_err) {
      char *cline = gt_str_get(line);
      char *elem = strtok(cline, ";");
      zcounter = 0;
      while (elem != NULL && !had_err) {
        if (*elem != '#') {
          /* the table content is controlled by the input file; never write
             outside of the num_of_genomes x num_of_genomes table */
          if (lcounter >= unit_info->num_of_genomes ||
              zcounter >= unit_info->num_of_genomes) {
            had_err = 1;
            gt_error_set(err, "table has more rows or columns than genomes");
            break;
          }
          if (1 != sscanf(elem, "%lf",
                          &shusums[lcounter][zcounter])) {
            had_err = 1;
            gt_error_set(err, "couldn't scan");
            break;
          }
          gt_logger_log(logger,"wert: %lf", shusums[lcounter][zcounter]);
          zcounter++;
        }
        else {
          gt_logger_log(logger, "name: %s", elem);
        }
        elem = strtok(NULL, ";");
      }
      gt_tokenizer_next_token(tokenizer);
      gt_str_delete(line);
      line = gt_tokenizer_get_token(tokenizer);
      lcounter++;
      /* lcounter is unsigned, so print it with the unsigned format */
      gt_logger_log(logger, "line "GT_WU"", lcounter);
    }
    /* after a parse error the loop ends with a token still held */
    if (line != NULL)
      gt_str_delete(line);
    /* NOTE(review): the tokenizer is expected to own table_file and to
       release it here as well - confirm against the GtTokenizer API */
    gt_tokenizer_delete(tokenizer);
  }

  if (!had_err) {
    GtUword num_of_seq,
            file_idx,
            seq_idx,
            startpos;
    GT_UNUSED GtUword oldpos = 0;

    gt_assert(unit_info != NULL);
    gt_assert(lcounter == zcounter);
    gt_assert(lcounter == unit_info->num_of_genomes);

    /* log, for every sequence, its start position, file and genome, and
       check that start positions do not decrease */
    num_of_seq = gt_encseq_num_of_sequences(unit_info->encseq);
    for (seq_idx = 0; seq_idx < num_of_seq; seq_idx++) {
      startpos = gt_encseq_seqstartpos(unit_info->encseq, seq_idx);
      file_idx = gt_encseq_filenum(unit_info->encseq, startpos);
      gt_log_log("seq: "GT_WU" starts at: "GT_WU"\n"
                 "belonges to file: "GT_WU" which is part of genome: %s",
                 seq_idx, startpos, file_idx,
                 gt_str_array_get(unit_info->genome_names,
                                  unit_info->map_files[file_idx]));
      gt_assert(oldpos <= startpos);
      oldpos = startpos;
    }
  }

  if (!had_err && shusums != NULL) {
    had_err = gt_genomediff_calculate_div_from_avg(shusums, arguments,
                                                   unit_info,
                                                   logger, timer, err);
  }
  /* release the table on the error path as well; previously it was only
     freed when all preceding steps had succeeded */
  if (shusums != NULL) {
    gt_array2dim_delete(shusums);
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);

  return had_err;
}
/* Compute the optimal path through <combinedscore_matrix> column by column
   and hand the resulting path matrix to mg_compute_gene_prediction().

   combinedscore_matrix: scores per matrix row and query position
   hit_information:      passed through to mg_compute_gene_prediction()
   rows:                 number of matrix rows to fill (at most 7)
   contig_len:           number of columns (positions in the query DNA)
   parsestruct_ptr:      passed through to mg_compute_gene_prediction()
   err:                  error object

   Returns the result of mg_compute_gene_prediction(). */
int mg_computepath(CombinedScoreMatrixEntry **combinedscore_matrix,
                   HitInformation *hit_information,
                   unsigned long rows,
                   unsigned long contig_len,
                   ParseStruct *parsestruct_ptr, GtError * err)
{
  int had_err = 0;

  /* number of rows allocated for the path matrix */
  const unsigned long path_matrix_rows = 7UL;

  /* the matrix used for the path computation */
  PathMatrixEntry **path_matrix;

  /* row_index: counter for the matrix rows;
     precursor_index: counter for the precursors;
     precursors_row: matrix row of the precursor frame;
     maxpath_frame: row of the precursor the current maximum comes from */
  unsigned short row_index = 0,
                 precursor_index = 0,
                 precursors_row = 0,
                 maxpath_frame = 0;

  /* position in the query DNA */
  unsigned long column_index = 0;

  /* the current frame, the precursor frame currently looked at (one value
     out of precursors[]) and the list of possible precursor frames */
  short current_frame = 0,
        precursors_frame = 0,
        precursors[NUM_PRECURSORS];

  /* q is the value added when a gene on the forward or reverse strand is
     entered or left */
  double q = ARGUMENTSSTRUCT(leavegene_value),
         max_new,
         max_old;

  gt_error_check(err);
  /* the path matrix below has a fixed number of rows */
  gt_assert(rows <= path_matrix_rows);

  /* allocate the path matrix */
  gt_array2dim_calloc(path_matrix, path_matrix_rows, contig_len);

  /* the first column of the path matrix is taken over from the first
     column of the combined-score matrix */
  for (row_index = 0; row_index < rows; row_index++)
  {
    path_matrix[row_index][0].score =
      combinedscore_matrix[row_index][0].matrix_score;
    path_matrix[row_index][0].path_frame = row_index;
  }

  /* column-wise computation of the optimal path */
  for (column_index = 1; column_index < contig_len; column_index++)
  {
    for (row_index = 0; row_index < rows; row_index++)
    {
      /* convert the row counter into the corresponding reading frame */
      current_frame = get_current_frame(row_index);

      /* determine the possible precursor frames from the current frame and
         the position in the query DNA */
      compute_precursors(current_frame, column_index, precursors);

      /* Start every cell from the lowest possible value. DBL_MIN must not
         be used for this: it is the smallest POSITIVE double, so scores
         that are zero or negative would never be selected and the
         back-pointer would stay at row 0. */
      max_old = -DBL_MAX;
      maxpath_frame = 0;

      /* compute the maximum over the possible precursors */
      for (precursor_index = 0;
           precursor_index < NUM_PRECURSORS
             && (precursors[precursor_index] != UNDEFINED);
           ++precursor_index)
      {
        /* the precursor frame currently looked at */
        precursors_frame = precursors[precursor_index];

        /* convert the precursor frame into the corresponding matrix row */
        precursors_row = get_matrix_row(precursors_frame);

        /* the DP algorithm distinguishes three cases
           case 1: change from the reverse to the forward strand or
                   vice versa */
        if ((current_frame < 0 && precursors_frame > 0) ||
            (current_frame > 0 && precursors_frame < 0))
        {
          max_new = path_matrix[precursors_row][column_index-1].score +
                    combinedscore_matrix[row_index][column_index].matrix_score
                    + 2*q;
        }
        /* case 2: the current frame is not 0 and differs from the
                   precursor frame (cross-strand changes were handled
                   above) */
        else if (current_frame != 0 && precursors_frame != current_frame)
        {
          max_new = path_matrix[precursors_row][column_index-1].score +
                    combinedscore_matrix[row_index][column_index].matrix_score
                    + q;
        }
        /* case 3: all remaining transitions, i.e. the frame is kept or the
                   current frame is 0 */
        else
        {
          max_new = path_matrix[precursors_row][column_index-1].score +
                    combinedscore_matrix[row_index][column_index].matrix_score;
        }

        /* keep the maximum of the candidates and the row it comes from */
        if (gt_double_compare(max_new, max_old) > 0)
        {
          max_old = max_new;
          maxpath_frame = precursors_row;
        }
      }

      /* store the maximum and the row of its precursor */
      path_matrix[row_index][column_index].score = max_old;
      path_matrix[row_index][column_index].path_frame = maxpath_frame;
    }
  }

  /* run the gene prediction on the computed path matrix */
  had_err = mg_compute_gene_prediction(combinedscore_matrix,
                                       path_matrix,
                                       contig_len,
                                       hit_information,
                                       parsestruct_ptr, err);

  gt_array2dim_delete(path_matrix);

  return had_err;
}