示例#1
0
GtScoreMatrix* gt_score_matrix_new(GtAlphabet *alphabet)
{
  GtScoreMatrix *sm;
  gt_assert(alphabet);
  sm = gt_malloc(sizeof (GtScoreMatrix));
  sm->alphabet = gt_alphabet_ref(alphabet);
  sm->dimension = gt_alphabet_size(alphabet);
  gt_array2dim_calloc(sm->scores, sm->dimension, sm->dimension);
  return sm;
}
示例#2
0
文件: hcr.c 项目: mader/genometools
static GtBaseQualDistr* hcr_base_qual_distr_new_from_file(FILE *fp,
        GtAlphabet *alpha)
{
    GtBaseQualDistr *bqd;
    char read_char_code;
    GtUchar cur_char_code;
    unsigned char cur_qual;
    unsigned alpha_size,
             min_qual = HCR_HIGHESTQUALVALUE,
             max_qual = HCR_LOWESTQUALVALUE;
    GtUword numofleaves,
            i;
    GtUint64 cur_freq;
    GT_UNUSED size_t read,
              one = (size_t) 1;

    alpha_size = gt_alphabet_size(alpha);
    bqd = gt_malloc(sizeof (GtBaseQualDistr));
    gt_array2dim_calloc(bqd->distr, HCR_HIGHESTQUALVALUE + 1UL, alpha_size)
    bqd->ncols = alpha_size;
    bqd->nrows = HCR_HIGHESTQUALVALUE + 1U;
    bqd->qual_offset = HCR_LOWESTQUALVALUE;
    bqd->wildcard_indx = alpha_size - 1;

    read = gt_xfread_one(&numofleaves, fp);
    gt_assert(read == one);
    for (i = 0; i < numofleaves; i++) {
        read = gt_xfread_one(&read_char_code, fp);
        gt_assert(read == one);
        read = gt_xfread_one(&cur_qual, fp);
        gt_assert(read == one);
        read = gt_xfread_one(&cur_freq, fp);
        gt_assert(read == one);
        cur_char_code = gt_alphabet_encode(alpha, read_char_code);
        if (cur_char_code == (GtUchar) WILDCARD)
            gt_safe_assign(cur_char_code, bqd->wildcard_indx);
        bqd->distr[cur_qual][cur_char_code] = cur_freq;
        if ((unsigned) cur_qual > max_qual)
            max_qual = cur_qual;
        if ((unsigned) cur_qual < min_qual)
            min_qual = cur_qual;
    }

    bqd->min_qual = min_qual;
    bqd->max_qual = max_qual;
    hcr_base_qual_distr_trim(bqd);
    return bqd;
}
示例#3
0
文件: hcr.c 项目: mader/genometools
static GtBaseQualDistr* hcr_base_qual_distr_new(GtAlphabet *alpha,
        GtQualRange qrange)
{
    GtBaseQualDistr *bqd;
    bqd = gt_calloc((size_t) 1, sizeof (GtBaseQualDistr));
    gt_array2dim_calloc(bqd->distr, HCR_HIGHESTQUALVALUE + 1UL,
                        gt_alphabet_size(alpha));

    bqd->ncols = gt_alphabet_size(alpha);
    bqd->nrows = HCR_HIGHESTQUALVALUE + 1U;
    bqd->qual_offset = HCR_LOWESTQUALVALUE;
    bqd->wildcard_indx = gt_alphabet_size(alpha) - 1;
    bqd->min_qual = HCR_HIGHESTQUALVALUE;
    bqd->max_qual = HCR_LOWESTQUALVALUE;
    gt_safe_assign(bqd->qrange_start, qrange.start);
    gt_safe_assign(bqd->qrange_end, qrange.end);
    bqd->alpha = alpha;
    return bqd;
}
示例#4
0
文件: hcr.c 项目: mader/genometools
static void hcr_base_qual_distr_trim(GtBaseQualDistr *bqd)
{
    if (bqd->min_qual != 0) {
        GtUint64 **distr_trimmed;
        unsigned nrows_new,
                 i,
                 j;

        nrows_new = bqd->max_qual - bqd->min_qual + 1;
        gt_array2dim_calloc(distr_trimmed, (GtUword) nrows_new, bqd->ncols);

        for (i = 0; i < nrows_new; i++)
            for (j = 0; j < bqd->ncols; j++)
                distr_trimmed[i][j] = bqd->distr[i + bqd->min_qual][j];
        gt_array2dim_delete(bqd->distr);
        bqd->distr = distr_trimmed;
        bqd->nrows = nrows_new;
        bqd->qual_offset = bqd->min_qual;
    }
}
示例#5
0
static int gt_genomediff_runner(int argc, const char **argv,
                                int parsed_args, void *tool_arguments,
                                GtError *err)
{
    bool mirrored = false;
    int had_err = 0,
        i;
    GtEncseq              *encseq = NULL;
    GtGenomediffArguments *arguments = tool_arguments;
    GtLogger              *logger;
    GtShuUnitFileInfo     *unit_info = NULL;
    GtTimer               *timer = NULL;

    gt_error_check(err);
    gt_assert(arguments);

    logger = gt_logger_new(arguments->verbose,
                           GT_LOGGER_DEFLT_PREFIX,
                           stdout);
    gt_assert(logger);

    for (i = parsed_args; i < argc; i++) {
        gt_str_array_add_cstr(arguments->filenames, argv[i]);
    }

    if (gt_showtime_enabled()) {
        timer = gt_timer_new_with_progress_description("start");
        gt_timer_start(timer);
        gt_assert(timer);
    }

    if (arguments->with_units) {
        gt_logger_log(logger, "unitfile option set, filename is %s\n",
                      gt_str_get(arguments->unitfile));
    }

    if (timer != NULL)
        gt_timer_show_progress(timer, "start shu search", stdout);

    if (gt_str_array_size(arguments->filenames) > 1UL) {
        GtEncseqEncoder *ee = gt_encseq_encoder_new();
        gt_encseq_encoder_set_timer(ee, timer);
        gt_encseq_encoder_set_logger(ee, logger);
        /* kr only makes sense for dna, so we can check this already with ee */
        gt_encseq_encoder_set_input_dna(ee);
        had_err = gt_encseq_encoder_encode(ee, arguments->filenames,
                                           gt_str_get(arguments->indexname), err);
        gt_encseq_encoder_delete(ee);
    }
    else {
        gt_str_append_str(arguments->indexname,
                          gt_str_array_get_str(arguments->filenames, 0));
        if (arguments->with_esa || arguments->with_pck) {
            GtStr *current_line = gt_str_new();
            FILE *prj_fp;
            const char *buffer;
            char **elements = NULL;

            prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname),
                                             GT_PROJECTFILESUFFIX,"rb",err);
            if (prj_fp == NULL)
                had_err = -1;
            while (!had_err && gt_str_read_next_line(current_line, prj_fp) != EOF) {
                buffer = gt_str_get(current_line);
                if (elements != NULL) {
                    gt_free(elements[0]);
                    gt_free(elements[1]);
                }
                gt_free(elements);
                elements = gt_cstr_split(buffer, '=');
                gt_log_log("%s", elements[0]);
                if (strcmp("mirrored", elements[0]) == 0) {
                    gt_log_log("%s", elements[1]);
                    if (strcmp("1", elements[1]) == 0) {
                        mirrored = true;
                        gt_log_log("sequences are treated as mirrored");
                    }
                }
                gt_str_reset(current_line);
            }
            gt_str_delete(current_line);
            if (elements != NULL) {
                gt_free(elements[0]);
                gt_free(elements[1]);
            }
            gt_free(elements);
            gt_fa_xfclose(prj_fp);
        }
    }

    if (!had_err) {
        GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                             err);
        if (mirrored)
            gt_encseq_loader_mirror(el);
        encseq =
            gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
        gt_encseq_loader_delete(el);
    }
    if (encseq == NULL)
        had_err = -1;
    if (!had_err) {
        unit_info = gt_shu_unit_info_new(encseq);
        if (arguments->with_units)
            had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                                 logger, err);
    }

    if (!had_err) {
        uint64_t **shusums = NULL;
        if (arguments->with_esa || arguments->with_pck) {
            shusums = gt_genomediff_shulen_sum(arguments, unit_info,
                                               logger, timer, err);
            if (shusums == NULL)
                had_err = -1;
        }
        else {
            const bool doesa = true;
            GenomediffInfo gd_info;
            Suffixeratoroptions sopts;
            sopts.beverbose = arguments->verbose;
            sopts.indexname = arguments->indexname;
            sopts.db = NULL;
            sopts.encopts = NULL;
            sopts.genomediff = true;
            sopts.inputindex = arguments->indexname;
            sopts.loadopts = arguments->loadopts;
            sopts.showprogress = false;
            sopts.idxopts = arguments->idxopts;

            gt_assert(unit_info != NULL);
            gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                                unit_info->num_of_genomes);
            gd_info.shulensums = shusums;
            gd_info.unit_info = unit_info;
            had_err = gt_runsuffixerator(doesa, &sopts, &gd_info, logger, err);
        }
        if (!had_err && shusums != NULL) {
            had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info,
                                            arguments->with_pck, logger, timer, err);
            gt_array2dim_delete(shusums);
        }
    }

    if (timer != NULL) {
        gt_timer_show_progress_final(timer, stdout);
        gt_timer_delete(timer);
    }
    gt_logger_delete(logger);
    gt_encseq_delete(encseq);
    gt_shu_unit_info_delete(unit_info);

    return had_err;
}
static int gt_gdiffcalc_runner(int argc, const char **argv, int parsed_args,
                              void *tool_arguments, GT_UNUSED GtError *err)
{
  GtGenomediffArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtUword lcounter = 0, zcounter = 0;
  double **shusums = NULL;
  GtEncseq              *encseq = NULL;
  GtLogger              *logger;
  GtShuUnitFileInfo     *unit_info = NULL;
  GtTimer               *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose,
                         GT_LOGGER_DEFLT_PREFIX,
                         stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("load encseq");
    gt_timer_start(timer);
    gt_assert(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    encseq =
      gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (timer != NULL)
    gt_timer_show_progress(timer, "load units", stdout);

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "read table", stdout);

  if (!had_err) {
    GtIO *table_file = NULL;
    GtTokenizer *tokenizer = NULL;
    GtStr *line = NULL;

    gt_assert(unit_info != NULL);
    gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                        unit_info->num_of_genomes);

    table_file = gt_io_new(gt_str_array_get(arguments->filenames, 0), "r");
    tokenizer = gt_tokenizer_new(table_file);
    line = gt_tokenizer_get_token(tokenizer);
    while (line != NULL && !had_err) {
      char *cline = gt_str_get(line);
      char *elem = strtok(cline, ";");
      zcounter = 0;
      while (elem != NULL && !had_err) {
        if (*elem != '#') {
          if (1 != sscanf(elem, "%lf",
                          &shusums[lcounter][zcounter])) {
            had_err = 1;
            gt_error_set(err, "couldn't scan");
            break;
          }
          gt_logger_log(logger,"wert: %lf", shusums[lcounter][zcounter]);
          zcounter++;
        }
        else {
          gt_logger_log(logger, "name: %s", elem++);
        }
        elem = strtok(NULL, ";");
      }
      gt_tokenizer_next_token(tokenizer);
      gt_str_delete(line);
      line = gt_tokenizer_get_token(tokenizer);
      lcounter++;
      gt_logger_log(logger, "line "GT_WD"", lcounter);
    }
  }
  if (!had_err) {
    GtUword num_of_seq, file_idx, seq_idx, startpos;
    GT_UNUSED GtUword oldpos = 0;

    gt_assert(unit_info != NULL);
    gt_assert(lcounter == zcounter);
    gt_assert(lcounter == unit_info->num_of_genomes);

    num_of_seq = gt_encseq_num_of_sequences(unit_info->encseq);

    for (seq_idx = 0; seq_idx < num_of_seq; seq_idx++) {
      startpos = gt_encseq_seqstartpos(unit_info->encseq, seq_idx);
      file_idx = gt_encseq_filenum(unit_info->encseq, startpos);
      gt_log_log("seq: "GT_WU" starts at: "GT_WU"\n"
                 "belonges to file: "GT_WU" which is part of genome: %s",
                 seq_idx, startpos, file_idx,
                 gt_str_array_get(unit_info->genome_names,
                                  unit_info->map_files[file_idx]));
      gt_assert(oldpos <= startpos);
      oldpos = startpos;
    }
  }
  if (!had_err && shusums != NULL) {
    had_err = gt_genomediff_calculate_div_from_avg(shusums, arguments,
                                                   unit_info,
                                                   logger, timer, err);
    gt_array2dim_delete(shusums);
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);
  return had_err;
}
示例#7
0
int mg_computepath(CombinedScoreMatrixEntry **combinedscore_matrix,
                   HitInformation *hit_information,
                   unsigned long rows,
                   unsigned long contig_len,
                   ParseStruct *parsestruct_ptr, GtError * err)
{
  int had_err = 0;

  /* Initialisieren der Matrix fuer die Pfadberechnung */
  PathMatrixEntry **path_matrix;

  /* i: Zaehlvariable fuer die Matrix-Zeilen; k: Zaehlvariable Precursors
     (von 0 bis max 2) maxpath_frame: Speichern des vorherigen Frames von
     dem der max-Wert berechnet wird */
  unsigned short row_index = 0,
    precursor_index = 0,
    precursors_row = 0,
    maxpath_frame = 0;

  /* Position in der Query-DNA */
  unsigned long column_index = 0;

  /* Variablen fuer den aktuellen Frame, den vorherigen Frame(speichert
     einen Wert aus precursors[], die Zeile des vorherigen Frames, GtArray
     mit den Precursors-Frames */
  short current_frame = 0,
    precursors_frame = 0,
    precursors[NUM_PRECURSORS];

  /* q ist der Wert, der bei Aus- oder Eintreten in ein Gen auf dem
     Forward- bzw. Reverse-Strang berechnet wird */
  double q = ARGUMENTSSTRUCT(leavegene_value),
    max_new = 1,
    max_old = 1;

  /* Speicherreservierung fuer die Path-Matrix - Groesse entsprechend der
     CombinedScore-Matrix */
  gt_array2dim_calloc(path_matrix, 7, contig_len);

  gt_error_check(err);

  /* fuer die erste Spalte der Path-Matrix wird die erste Spalte der
     CombinedScore-Matrix uebernommen */
  for (row_index = 0; row_index < rows; row_index++)
  {
    path_matrix[row_index][0].score =
      combinedscore_matrix[row_index][0].matrix_score;
    path_matrix[row_index][0].path_frame = row_index;
  }

  /* Spaltenweise Berechnung des opt. Pfades */
  for (column_index = 1; column_index < contig_len; column_index++)
  {
    for (row_index = 0; row_index < rows; row_index++)
    {
      /* Zaehlvariable fuer die Zeile wird umgerechnet in den entsprechenden
         Leserahmen */
      current_frame = get_current_frame(row_index);
      /* Aufruf der Methode zum Berechnen der moeglichen Leserahmen anhand von
         aktuellem Leserahmen und der Query-DNA-Sequenz */
      compute_precursors(current_frame,
                         column_index,
                         precursors);

      /* der max-Wert der moeglichen Vorgaenger wird berechnet */
      for (precursor_index = 0;
           precursor_index < NUM_PRECURSORS
             && (precursors[precursor_index] != UNDEFINED);
           ++precursor_index)
      {
        /* aktueller Vorgaengerleserahmen - es gibt max. 3 moegliche
           Vorgaenger */
        precursors_frame = precursors[precursor_index];
        /* Vorgaengerleserahmen wird umgerechnet in die entsprechende
           Matrix-Zeile */
        precursors_row = get_matrix_row(precursors_frame);

        /* der DP-Algo umfasst 3 moegliche Faelle
           1. Fall: Wechsel vom Reversen- auf den Forward-Strang bzw.
           umgekehrt */
        if ((current_frame < 0 && precursors_frame > 0) ||
            (current_frame > 0 && precursors_frame < 0))
        {
            max_new = path_matrix[precursors_row][column_index-1].score +
                      combinedscore_matrix[row_index][column_index].matrix_score
                      + 2*q;
        }
        /* 2. Fall: Einfacher Wechsel des Leserahmens, also von + zu +
           bzw.- zu - */
        else if (current_frame != 0 && precursors_frame != current_frame)
        {
            max_new = path_matrix[precursors_row][column_index-1].score +
                      combinedscore_matrix[row_index][column_index].matrix_score
                      + q;
        }
        /* 3. Fall: Leserahmen wird beibehalten bzw. Wechsel von kodierend zu
           nicht-kodierend oder umgekehrt */
        else
        {
            max_new = path_matrix[precursors_row][column_index-1].score +
                      combinedscore_matrix[row_index][column_index]
                      .matrix_score;
        }

        /* Bestimmen des Max-Wertes der max. 3 Moeglichkeiten und Speichern der
           Zeile, von der der Max-Wert stammt */
        if (gt_double_compare(max_new, max_old) > 0)
        {
            max_old = max_new;
            maxpath_frame = precursors_row;
        }
      }

      /* Speichern des Max-Wertes und der "Vorgaenger"-Zeile;
         zuruecksetzen der Variablen */
      path_matrix[row_index][column_index].score      = max_old;
      path_matrix[row_index][column_index].path_frame = maxpath_frame;

      max_new = DBL_MIN;
      max_old = DBL_MIN;
      maxpath_frame = 0;
    }
  }

  /* Aufruf der Methode zur Genvorhersage */
  had_err = mg_compute_gene_prediction(combinedscore_matrix,
                                       path_matrix,
                                       contig_len,
                                       hit_information,
                                       parsestruct_ptr, err);

  gt_array2dim_delete(path_matrix);

  return had_err;
}