/* Return the scaled score for aligning the character <amino> with the
   reference character <origreferencechar>.

   The cases, in the order they are tested:
   1.) either character is DASH: scaled INDEL_PENALTY;
   2.) either character is WILDCARD-<amino>, above CHAR_MAX, or not a valid
       input of <score_matrix_alphabet>: neutral score 0.0;
   3.) a stop codon is involved: (-)2 * scaled INDEL_PENALTY;
   4.) otherwise: scaled substitution score taken from <score_matrix>. */
static GthFlt get_score(GtScoreMatrix *score_matrix,
                        GtAlphabet *score_matrix_alphabet,
                        unsigned char amino,
                        unsigned char origreferencechar)
{
  const GthFlt scalefactor   = SCALEFACTOR,
               indel_penalty = INDEL_PENALTY;
  GtUchar amino_code, reference_code;
  int wildcard_index;

  /* scaled INDEL_PENALTY for deletions from and insertions into genomic DNA
     of lengths 1, 2, or 3, irrespective of indel size */
  if (amino == DASH || origreferencechar == DASH)
    return scalefactor * indel_penalty;

  /* neutral score in case of wild-card characters in the genomic DNA or
     characters the score matrix alphabet does not know */
  if (amino == WILDCARD || amino > CHAR_MAX ||
      !gt_alphabet_valid_input(score_matrix_alphabet, amino) ||
      origreferencechar > CHAR_MAX ||
      !gt_alphabet_valid_input(score_matrix_alphabet, origreferencechar))
    return 0.0;

  /* (-)2*INDEL_PENALTY for matching/mismatching a stop codon */
  if (amino == GT_STOP_AMINO || origreferencechar == GT_STOP_AMINO) {
    if (amino == GT_STOP_AMINO && origreferencechar == GT_STOP_AMINO)
      return scalefactor * -2 * indel_penalty;
    return scalefactor *  2 * indel_penalty;
  }

  /* amino acid substitution score; encoded wildcards are mapped to the last
     index of the alphabet */
  amino_code = gt_alphabet_encode(score_matrix_alphabet, amino);
  reference_code = gt_alphabet_encode(score_matrix_alphabet,
                                      origreferencechar);
  wildcard_index = gt_alphabet_size(score_matrix_alphabet) - 1;
  return scalefactor *
         gt_score_matrix_get_score(score_matrix,
                                   amino_code == WILDCARD ? wildcard_index
                                                          : amino_code,
                                   reference_code == WILDCARD
                                     ? wildcard_index : reference_code);
}
Beispiel #2
0
/* Add the <len> (base, quality) pairs given by <seq> and <qual> to the
   distribution <bqd>. Quality values are clamped to <bqd->qrange_start> and
   <bqd->qrange_end> whenever the respective bound is defined (i.e. differs
   from GT_UNDEF_UINT). Wildcard symbols are counted in the column
   <bqd->wildcard_indx>. The observed minimum and maximum quality stored in
   <bqd> are updated as well. Always returns 0. */
static int hcr_base_qual_distr_add(GtBaseQualDistr *bqd, const GtUchar *qual,
                                   const GtUchar *seq, GtUword len)
{
    GtUword pos;

    for (pos = 0; pos < len; pos++) {
        unsigned symbol_code,
                 quality;

        symbol_code = (unsigned) gt_alphabet_encode(bqd->alpha,
                      (char) gt_alphabet_pretty_symbol(bqd->alpha,
                              (unsigned) seq[pos]));
        quality = (unsigned) qual[pos];

        /* clamp to the lower bound first, then to the upper bound */
        if (bqd->qrange_start != GT_UNDEF_UINT &&
                quality <= bqd->qrange_start)
            quality = bqd->qrange_start;
        if (bqd->qrange_end != GT_UNDEF_UINT &&
                quality >= bqd->qrange_end)
            quality = bqd->qrange_end;

        if (symbol_code != WILDCARD)
            bqd->distr[quality][symbol_code]++;
        else
            bqd->distr[quality][bqd->wildcard_indx]++;

        /* keep track of the range of qualities seen so far */
        if (quality > bqd->max_qual)
            bqd->max_qual = quality;
        if (quality < bqd->min_qual)
            bqd->min_qual = quality;
    }
    return 0;
}
Beispiel #3
0
/* Translate the BAM base code <base> into its encoding in <alphabet>.
   BAMBASEA/C/G/T map to 'A'/'C'/'G'/'T'; every other code is encoded as the
   wildcard symbol shown by <alphabet>. */
static GtUchar bambase2gtbase(uint8_t base, GtAlphabet *alphabet)
{
  char symbol;

  if (base == BAMBASEA)
    symbol = 'A';
  else if (base == BAMBASEC)
    symbol = 'C';
  else if (base == BAMBASEG)
    symbol = 'G';
  else if (base == BAMBASET)
    symbol = 'T';
  else
    symbol = (char) gt_alphabet_wildcard_show(alphabet);

  return gt_alphabet_encode(alphabet, symbol);
}
Beispiel #4
0
/* Count the G/C, A/T and N characters among the first <len> characters of
   <seq> (encoded via the DNA <alphabet>) and print the three percentages,
   relative to <len>, to <outfp>. Any character whose code is none of these
   triggers an assertion failure. */
void gt_gc_content_show(const char *seq, unsigned long len,
                        GtAlphabet *alphabet, GtFile *outfp)
{
  unsigned long pos,
                num_gc = 0, /* number of G/C bases */
                num_at = 0, /* number of A/T bases */
                num_n  = 0; /* number of N   bases */
  unsigned int code_a, code_c, code_g, code_t, code_n;
  double gc_percent, at_percent, n_percent;
  gt_assert(seq && alphabet);
  gt_assert(gt_alphabet_is_dna(alphabet));
  code_a = gt_alphabet_encode(alphabet, 'A');
  code_c = gt_alphabet_encode(alphabet, 'C');
  code_g = gt_alphabet_encode(alphabet, 'G');
  code_t = gt_alphabet_encode(alphabet, 'T');
  code_n = gt_alphabet_encode(alphabet, 'N');
  for (pos = 0; pos < len; pos++) {
    const unsigned int current = gt_alphabet_encode(alphabet, seq[pos]);
    if (current == code_g || current == code_c) {
      num_gc++;
      continue;
    }
    if (current == code_a || current == code_t) {
      num_at++;
      continue;
    }
    if (current == code_n) {
      num_n++;
      continue;
    }
    gt_assert(0);
  }
  gc_percent = ((double) num_gc / len) * 100.0;
  at_percent = ((double) num_at / len) * 100.0;
  n_percent  = ((double) num_n  / len) * 100.0;
  gt_file_xprintf(outfp, "GC-content: %.2f%% (AT-content: %.2f%%, "
                         "N-content: %.2f%%)\n",
                  gc_percent, at_percent, n_percent);
}
Beispiel #5
0
/* Create a new base/quality distribution by reading it from <fp>.
   The stream is expected to hold the number of entries followed by, for each
   entry, a symbol character, a quality byte and a frequency. Symbols are
   encoded with <alpha>; wildcards are stored in the last column of the
   table. The smallest and largest quality read are recorded in the result,
   which is then passed to hcr_base_qual_distr_trim() before being
   returned. */
static GtBaseQualDistr* hcr_base_qual_distr_new_from_file(FILE *fp,
        GtAlphabet *alpha)
{
    GtBaseQualDistr *bqd;
    char symbol;
    GtUchar symbol_code;
    unsigned char quality;
    unsigned num_symbols,
             lowest_seen = HCR_HIGHESTQUALVALUE,
             highest_seen = HCR_LOWESTQUALVALUE;
    GtUword num_entries,
            entry;
    GtUint64 frequency;
    GT_UNUSED size_t nread,
              expected = (size_t) 1;

    num_symbols = gt_alphabet_size(alpha);
    bqd = gt_malloc(sizeof (GtBaseQualDistr));
    gt_array2dim_calloc(bqd->distr, HCR_HIGHESTQUALVALUE + 1UL, num_symbols)
    bqd->nrows = HCR_HIGHESTQUALVALUE + 1U;
    bqd->ncols = num_symbols;
    bqd->wildcard_indx = num_symbols - 1;
    bqd->qual_offset = HCR_LOWESTQUALVALUE;

    nread = gt_xfread_one(&num_entries, fp);
    gt_assert(nread == expected);
    for (entry = 0; entry < num_entries; entry++) {
        nread = gt_xfread_one(&symbol, fp);
        gt_assert(nread == expected);
        nread = gt_xfread_one(&quality, fp);
        gt_assert(nread == expected);
        nread = gt_xfread_one(&frequency, fp);
        gt_assert(nread == expected);

        symbol_code = gt_alphabet_encode(alpha, symbol);
        if (symbol_code == (GtUchar) WILDCARD)
            gt_safe_assign(symbol_code, bqd->wildcard_indx);
        /* NOTE(review): <quality> comes straight from the file and is used
           as row index without a range check -- confirm it cannot exceed
           HCR_HIGHESTQUALVALUE */
        bqd->distr[quality][symbol_code] = frequency;

        if ((unsigned) quality > highest_seen)
            highest_seen = quality;
        if ((unsigned) quality < lowest_seen)
            lowest_seen = quality;
    }

    bqd->max_qual = highest_seen;
    bqd->min_qual = lowest_seen;
    hcr_base_qual_distr_trim(bqd);
    return bqd;
}
Beispiel #6
0
double gt_pck_getGCcontent(const FMindex *bwtSubject,
                           const GtAlphabet *alphabet)
{
  GtUword  c, length;
  double gc;
  const MRAEnc *FM_alphabet;
  GtUchar c_sym;

  FM_alphabet = BWTSeqGetAlphabet((const BWTSeq *) bwtSubject);

  c_sym = MRAEncMapSymbol(FM_alphabet,
                          gt_alphabet_encode(alphabet, 'c'));
  length = ((const BWTSeq *) bwtSubject)->seqIdx->seqLen;

  c = ((const BWTSeq *) bwtSubject)->count[c_sym+1] -
    ((const BWTSeq *) bwtSubject)->count[c_sym];

  gc = c * 2 / (double) (length - 2);
  return gc;
}
Beispiel #7
0
/* Runner of the seed-extend tool.
   Loads the encoded sequence named by <dbs_indexname> (and, if
   <dbs_queryname> is non-empty, a second one), prepares the optional greedy
   and xdrop extension infos and the output options, and passes everything to
   gt_diagbandseed_run(). <tool_arguments> must point to a
   GtSeedExtendArguments object; with the bias-parameters option its fields
   <se_maxalilendiff> and <se_perc_match_hist> are overwritten.
   Returns 0 if no step failed, -1 if loading or setup failed, and otherwise
   the result of gt_diagbandseed_run(). */
static int gt_seed_extend_runner(GT_UNUSED int argc,
                                 GT_UNUSED const char **argv,
                                 GT_UNUSED int parsed_args,
                                 void *tool_arguments,
                                 GtError *err)
{
  GtSeedExtendArguments *arguments = tool_arguments;
  GtEncseqLoader *encseq_loader = NULL;
  GtEncseq *aencseq = NULL, *bencseq = NULL;
  GtGreedyextendmatchinfo *grextinfo = NULL;
  GtXdropmatchinfo *xdropinfo = NULL;
  GtQuerymatchoutoptions *querymatchoutopt = NULL;
  GtTimer *seedextendtimer = NULL;
  GtExtendCharAccess cam = GT_EXTEND_CHAR_ACCESS_ANY;
  GtUword errorpercentage = 0UL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments != NULL);
  gt_assert(arguments->se_minidentity >= GT_EXTEND_MIN_IDENTITY_PERCENTAGE &&
            arguments->se_minidentity <= 100UL);

  /* Calculate error percentage from minidentity */
  errorpercentage = 100UL - arguments->se_minidentity;

  /* Measure whole running time */
  if (arguments->benchmark || arguments->verbose) {
    gt_showtime_enable();
  }
  if (gt_showtime_enabled())
  {
    seedextendtimer = gt_timer_new();
    gt_timer_start(seedextendtimer);
  }

  /* Load encseq A */
  encseq_loader = gt_encseq_loader_new();
  gt_encseq_loader_enable_autosupport(encseq_loader);
  aencseq = gt_encseq_loader_load(encseq_loader,
                                  gt_str_get(arguments->dbs_indexname),
                                  err);
  if (aencseq == NULL)
    had_err = -1;

  /* If there is a 2nd read set: Load encseq B */
  if (!had_err) {
    if (strcmp(gt_str_get(arguments->dbs_queryname), "") != 0) {
      bencseq = gt_encseq_loader_load(encseq_loader,
                                      gt_str_get(arguments->dbs_queryname),
                                      err);
    } else {
      /* no query given: compare encseq A with itself */
      bencseq = gt_encseq_ref(aencseq);
    }
    if (bencseq == NULL) {
      had_err = -1;
      gt_encseq_delete(aencseq);
    }
  }
  gt_encseq_loader_delete(encseq_loader);

  /* set character access method */
  if (!had_err && (gt_option_is_set(arguments->se_option_greedy) ||
                   gt_option_is_set(arguments->se_option_xdrop) ||
                   arguments->se_alignmentwidth > 0))
  {
    cam = gt_greedy_extend_char_access(gt_str_get
                                       (arguments->se_char_access_mode),
                                       err);
    /* -1 signals a failure; both encseqs are released right here because the
       regular clean up further down only runs if no error occurred */
    if ((int) cam == -1) {
      had_err = -1;
      gt_encseq_delete(aencseq);
      gt_encseq_delete(bencseq);
    }
  }

  /* Use bias dependent parameters, adapted from E. Myers' DALIGNER */
  if (!had_err && arguments->bias_parameters) {
    const GtAlphabet *alpha = gt_encseq_alphabet(aencseq);
    const double bias_factor[10] = {.690, .690, .690, .690, .780,
                                    .850, .900, .933, .966, 1.000};

    if (gt_alphabet_is_dna(alpha)) {
      GtUword at, cg;
      at = gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'a'));
      at += gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 't'));
      cg = gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'c'));
      cg += gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'g'));
      if (at + cg > 0) {
        /* ratio is the share of the rarer base pair class, i.e. at most 0.5,
           which keeps bias_index within the bounds of bias_factor */
        const double ratio = (double)MIN(at, cg) / (at + cg);
        int bias_index = (int)MAX(0.0, (ratio + 0.025) * 20.0 - 1.0);
        gt_assert(bias_index < 10);
        arguments->se_maxalilendiff = 30;
        arguments->se_perc_match_hist = (GtUword)(100.0 - errorpercentage *
                                                  bias_factor[bias_index]);
        if (arguments->verbose) {
          printf("# Base ratio = %4.2lf -> percmathistory = "GT_WU"\n",
                 ratio, arguments->se_perc_match_hist);
        }
      } else {
        had_err = -1;
      }
    } else {
      had_err = -1;
    }
    /* the same message is used for a non-DNA alphabet and for a DNA sequence
       without any a/c/g/t characters */
    if (had_err) {
      gt_error_set(err, "option \"-bias-parameters\" can only be applied to "
                   "the DNA alphabet");
      gt_encseq_delete(aencseq);
      gt_encseq_delete(bencseq);
    }
  }

  /* Prepare options for greedy extension */
  if (!had_err && gt_option_is_set(arguments->se_option_greedy)) {
    grextinfo = gt_greedy_extend_matchinfo_new(errorpercentage,
                                               arguments->se_maxalilendiff,
                                               arguments->se_historysize,
                                               arguments->se_perc_match_hist,
                                               arguments->se_alignlength,
                                               cam,
                                               arguments->se_extendgreedy);
    if (arguments->benchmark) {
      gt_greedy_extend_matchinfo_silent_set(grextinfo);
    }
  }

  /* Prepare options for xdrop extension */
  if (!had_err && gt_option_is_set(arguments->se_option_xdrop)) {
    xdropinfo = gt_xdrop_matchinfo_new(arguments->se_alignlength,
                                       errorpercentage,
                                       arguments->se_xdropbelowscore,
                                       arguments->se_extendxdrop);
    if (arguments->benchmark) {
      gt_xdrop_matchinfo_silent_set(xdropinfo);
    }
  }

  /* Prepare output options */
  if (!had_err && (arguments->se_alignmentwidth > 0 ||
                   gt_option_is_set(arguments->se_option_xdrop)))
  {
    querymatchoutopt
      = gt_querymatchoutoptions_new(arguments->se_alignmentwidth);

    if (gt_option_is_set(arguments->se_option_xdrop) ||
        gt_option_is_set(arguments->se_option_greedy))
    {
      /* without the greedy option a fixed sensitivity of 100 is used */
      const GtUword sensitivity = gt_option_is_set(arguments->se_option_greedy)
                                    ? arguments->se_extendgreedy : 100;

      gt_querymatchoutoptions_extend(querymatchoutopt,
                                     errorpercentage,
                                     arguments->se_maxalilendiff,
                                     arguments->se_historysize,
                                     arguments->se_perc_match_hist,
                                     cam,
                                     sensitivity);
    }
  }

  /* Start algorithm */
  if (!had_err) {
    GtDiagbandseed dbsarguments;
    dbsarguments.errorpercentage = errorpercentage;
    dbsarguments.userdefinedleastlength = arguments->se_alignlength;
    dbsarguments.seedlength = arguments->dbs_seedlength;
    dbsarguments.logdiagbandwidth = arguments->dbs_logdiagbandwidth;
    dbsarguments.mincoverage = arguments->dbs_mincoverage;
    dbsarguments.maxfreq = arguments->dbs_maxfreq;
    dbsarguments.memlimit = arguments->dbs_memlimit;
    dbsarguments.mirror = arguments->mirror;
    dbsarguments.overlappingseeds = arguments->overlappingseeds;
    dbsarguments.verify = arguments->dbs_verify;
    dbsarguments.verbose = arguments->verbose;
    dbsarguments.debug_kmer = arguments->dbs_debug_kmer;
    dbsarguments.debug_seedpair = arguments->dbs_debug_seedpair;
    dbsarguments.seed_display = arguments->seed_display;
    dbsarguments.extendgreedyinfo = grextinfo;
    dbsarguments.extendxdropinfo = xdropinfo;
    dbsarguments.querymatchoutopt = querymatchoutopt;

    had_err = gt_diagbandseed_run(aencseq, bencseq, &dbsarguments, err);

    /* clean up; the conditions mirror those under which the respective
       object was created above */
    gt_encseq_delete(aencseq);
    gt_encseq_delete(bencseq);
    if (gt_option_is_set(arguments->se_option_greedy)) {
      gt_greedy_extend_matchinfo_delete(grextinfo);
    }
    if (gt_option_is_set(arguments->se_option_xdrop)) {
      gt_xdrop_matchinfo_delete(xdropinfo);
    }
    if (arguments->se_alignmentwidth > 0 ||
        gt_option_is_set(arguments->se_option_xdrop)) {
      gt_querymatchoutoptions_delete(querymatchoutopt);
    }
  }

  /* report the overall running time (only on success) and release the timer */
  if (gt_showtime_enabled()) {
    if (!had_err) {
      char *keystring
        = gt_seed_extend_params_keystring(gt_option_is_set(arguments->
                                                           se_option_greedy),
                                          gt_option_is_set(arguments->
                                                           se_option_xdrop),
                                          arguments->dbs_seedlength,
                                          arguments->se_alignlength,
                                          arguments->se_minidentity,
                                          arguments->se_maxalilendiff,
                                          arguments->se_perc_match_hist,
                                          arguments->se_extendgreedy,
                                          arguments->se_extendxdrop,
                                          arguments->se_xdropbelowscore);
      printf("# TIME seedextend-%s", keystring);
      gt_free(keystring);
      gt_timer_show_formatted(seedextendtimer,
                              " overall " GT_WD ".%06ld\n",
                              stdout);
    }
    gt_timer_delete(seedextendtimer);
  }
  return had_err;
}
Beispiel #8
0
/* Parameterize the model of <bssm_param> selected by <termtype> from the
   training data files found below <path>.

   For each of the NUMOFFILES entries of <filenames> the file
   "<path>/<termtype directory>/<filename>" (with ".gz" appended if <gzip> is
   true) is read. Every sequence in it must have length STRINGSIZE and carry
   the dinucleotide matching <termtype> (GT, GC or AG) at the 0-based
   positions 50 and 51; afterwards the file is passed to build_bssm().

   Returns 0 on success. On failure -1 is returned and <err> is set; files
   processed before the failing one have already been applied to the model. */
int gth_bssm_param_parameterize(GthBSSMParam *bssm_param, const char *path,
                                Termtype termtype, bool gzip, GtError *err)
{
    GtAlphabet *alphabet = NULL;
    GtBioseq *bioseq;
    GtStr *file2proc;
    GtUword i, j;
    int had_err = 0;
    gt_error_check(err);

    file2proc = gt_str_new();

    /* set version number */
    bssm_param->version_num = (unsigned char) MYVERSION;

    /* set model to true and set window sizes */
    switch (termtype) {
    case GT_DONOR_TYPE:
        bssm_param->gt_donor_model_set = true;
        set_window_sizes_in_Bssmmodel(&bssm_param->gt_donor_model);
        break;
    case GC_DONOR_TYPE:
        bssm_param->gc_donor_model_set = true;
        set_window_sizes_in_Bssmmodel(&bssm_param->gc_donor_model);
        break;
    case AG_ACCEPTOR_TYPE:
        bssm_param->ag_acceptor_model_set = true;
        set_window_sizes_in_Bssmmodel(&bssm_param->ag_acceptor_model);
        break;
    default:
        gt_assert(0);
    }

    for (i = 0; !had_err && i < NUMOFFILES; i++) {
        /* process datafile */
        gt_str_append_cstr(file2proc, path);
        switch (termtype) {
        case GT_DONOR_TYPE:
            gt_str_append_cstr(file2proc, "/GT_donor/");
            gt_str_append_cstr(file2proc, filenames[i]);
            break;
        case GC_DONOR_TYPE:
            gt_str_append_cstr(file2proc, "/GC_donor/");
            gt_str_append_cstr(file2proc, filenames[i]);
            break;
        case AG_ACCEPTOR_TYPE:
            gt_str_append_cstr(file2proc, "/AG_acceptor/");
            gt_str_append_cstr(file2proc, filenames[i]);
            break;
        default:
            gt_assert(0);
        }

        if (gzip)
            gt_str_append_cstr(file2proc, ".gz");

        if (!(bioseq = gt_bioseq_new(gt_str_get(file2proc), err)))
            had_err = -1;

        if (!had_err)
            alphabet = gt_bioseq_get_alphabet(bioseq);

        /* check here if all sequences have the length 102 and correct bases at
           positions 51 and 52 (i.e., GT, GC, or AG) */
        for (j = 0; !had_err && j < gt_bioseq_number_of_sequences(bioseq); j++) {
            /* check length */
            if (gt_bioseq_get_sequence_length(bioseq, j) != STRINGSIZE) {
                gt_error_set(err,
                             "sequence "GT_WU" in file \"%s\" does not have length %u",
                             j, gt_str_get(file2proc), STRINGSIZE);
                had_err = -1;
            }
            if (!had_err) {
                GtUchar encoded_seq[2];
                /* only access the two positions once the length is known to
                   be correct; a sequence of the wrong length may be too short
                   to contain them */
                encoded_seq[0] = gt_bioseq_get_encoded_char(bioseq, j, 50);
                encoded_seq[1] = gt_bioseq_get_encoded_char(bioseq, j, 51);
                /* check base correctness */
                switch (termtype) {
                case GT_DONOR_TYPE:
                    if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'G') ||
                            encoded_seq[1] != gt_alphabet_encode(alphabet, 'T')) {
                        gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a GT "
                                     "sequence", j, gt_str_get(file2proc));
                        had_err = -1;
                    }
                    break;
                case GC_DONOR_TYPE:
                    if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'G') ||
                            encoded_seq[1] != gt_alphabet_encode(alphabet, 'C')) {
                        gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a GC "
                                     "sequence", j, gt_str_get(file2proc));
                        had_err = -1;
                    }
                    break;
                case AG_ACCEPTOR_TYPE:
                    if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'A') ||
                            encoded_seq[1] != gt_alphabet_encode(alphabet, 'G')) {
                        gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a AG "
                                     "sequence", j, gt_str_get(file2proc));
                        had_err = -1;
                    }
                    break;
                default:
                    gt_assert(0);
                }
            }
        }

        if (!had_err) {
            switch (termtype) {
            case GT_DONOR_TYPE:
                build_bssm(bioseq, &bssm_param->gt_donor_model, i);
                break;
            case GC_DONOR_TYPE:
                build_bssm(bioseq, &bssm_param->gc_donor_model, i);
                break;
            case AG_ACCEPTOR_TYPE:
                build_bssm(bioseq, &bssm_param->ag_acceptor_model, i);
                break;
            default:
                gt_assert(0);
            }
        }

        /* reset the file name buffer for the next iteration */
        gt_str_reset(file2proc);

        /* free space (bioseq is NULL here if the file could not be read) */
        gt_bioseq_delete(bioseq);
    }
    gt_str_delete(file2proc);

    return had_err;
}
Beispiel #9
0
/* Determine the poly-A tail position of the spliced alignment <sa> and store
   it in <sa->polyAtailpos> (both fields are 0 if nothing is found).

   First the encoded reference <ref_seq_tran> is scanned forward for a run of
   'A' characters, starting behind the last exon or CALCPOLYATAILWINDOW
   before the reference end. If no run of at least MINIMUMPOLYATAILLENGTH
   characters is found, the reference is scanned backward for a run of 'T'
   characters, starting in front of the first exon or at
   CALCPOLYATAILWINDOW - 1. In both scans a single non-matching character is
   tolerated once a run has begun. */
void gth_sa_calc_polyAtailpos(GthSA *sa, const unsigned char *ref_seq_tran,
                              GtAlphabet *ref_alphabet)
{
  GtUword run_length = 0, mismatches = 0, right_border, ref_length;
  GtWord pos, left_border;

  sa->polyAtailpos.start = 0;
  sa->polyAtailpos.end = 0;

  right_border = ((Exoninfo*) gt_array_get_last(sa->exons))
                 ->rightreferenceexonborder;
  left_border  = ((Exoninfo*) gt_array_get_first(sa->exons))
                 ->leftreferenceexonborder;
  ref_length = gth_sa_ref_total_length(sa);

  /* choose the start of the forward scan */
  if ((right_border + 1) >= (ref_length - 1 - CALCPOLYATAILWINDOW))
    pos = gt_safe_cast2long(right_border + 1);
  else if (ref_length < 1 + CALCPOLYATAILWINDOW)
    pos = 0;
  else
    pos =  ref_length - 1 - CALCPOLYATAILWINDOW;

  /* forward scan for a run of 'A' */
  for (; pos < gt_safe_cast2long(ref_length); pos++) {
    if (ref_seq_tran[pos] == gt_alphabet_encode(ref_alphabet, 'A'))
      run_length++;
    else if (run_length > 0 && mismatches < 1)
      mismatches++;             /* tolerate one mismatch inside a run */
    else if (run_length >= MINIMUMPOLYATAILLENGTH)
      break;                    /* run is long enough, stop here */
    else
      run_length = mismatches = 0; /* run too short, start over */
  }

  if (run_length >= MINIMUMPOLYATAILLENGTH) {
    sa->polyAtailpos.start
      = gt_safe_cast2ulong(pos - run_length - mismatches);
    sa->polyAtailpos.end = pos - 1;
    return;
  }

  /* nothing found: choose the start of the backward scan */
  run_length = mismatches = 0;
  if ((left_border - 1) <= CALCPOLYATAILWINDOW)
    pos = left_border - 1;
  else
    pos =  CALCPOLYATAILWINDOW - 1;

  /* backward scan for a run of 'T' */
  for (; pos >= 0; pos--) {
    if (ref_seq_tran[pos] == gt_alphabet_encode(ref_alphabet, 'T'))
      run_length++;
    else if (run_length > 0 && mismatches < 1)
      mismatches++;
    else if (run_length >= MINIMUMPOLYATAILLENGTH)
      break;
    else
      run_length = mismatches = 0;
  }

  if (run_length >= MINIMUMPOLYATAILLENGTH) {
    sa->polyAtailpos.start
      = gt_safe_cast2ulong(pos + run_length + mismatches);
    sa->polyAtailpos.end = pos + 1;
  }
}
Beispiel #10
0
/* Parse one score line of a score matrix file from the tokenizer <tz> and
   store its scores in <sm>.

   The current token must be a single character naming the row; it is
   recorded in <parsed_characters> so that duplicated rows are detected. The
   following tokens up to the start of the next line are parsed as integer
   scores; the i-th score is stored for the pair (row character, i-th
   character of <index_to_alpha_char_mapping>). Wildcards are stored at index
   gt_alphabet_num_of_chars(sm->alphabet).

   Returns 0 on success, a non-zero value (with <err> set) otherwise. */
static int parse_score_line(GtScoreMatrix *sm, GtTokenizer *tz,
                            GtArray *index_to_alpha_char_mapping,
                            char *parsed_characters, GtError *err)
{
  unsigned int num_of_chars, i = 0;
  char amino_acid;
  int score, had_err = 0;
  GtStr *token;
  gt_assert(sm && tz && index_to_alpha_char_mapping);
  gt_error_check(err);
  token = gt_tokenizer_get_token(tz);
  gt_assert(token);
  if (gt_str_length(token) != 1) {
    gt_error_set(err, "illegal character token '%s' on line %lu in file '%s'",
                 gt_str_get(token), gt_tokenizer_get_line_number(tz),
                 gt_tokenizer_get_filename(tz));
    had_err = -1;
  }
  amino_acid = gt_str_get(token)[0];
  /* check for character duplications */
  /* NOTE(review): <amino_acid> is a plain char; where char is signed, a byte
     above 127 gives a negative index here -- confirm against the size of
     <parsed_characters> at the caller before changing the cast */
  if (parsed_characters[(int) amino_acid]) {
    gt_error_set(err, "multiple character '%c' entry on line %lu in file '%s'",
                 amino_acid, gt_tokenizer_get_line_number(tz),
                 gt_tokenizer_get_filename(tz));
    had_err = -1;
  }
  parsed_characters[(int) amino_acid] = GT_UNDEF_CHAR;
  gt_str_delete(token);
  if (!had_err) {
    num_of_chars = gt_alphabet_num_of_chars(sm->alphabet);
    gt_tokenizer_next_token(tz);
    while ((token = gt_tokenizer_get_token(tz))) {
      unsigned int idx1, idx2;
      /* the tokenizer can return tokens which are empty except for a newline
         -> skip these */
      if (!strcmp(gt_str_get(token), "\n")) {
        gt_str_delete(token);
        gt_tokenizer_next_token(tz);
        if (gt_tokenizer_line_start(tz))
          break;
        continue;
      }
      /* token is not empty -> parse score */
      had_err = gt_parse_int_line(&score, gt_str_get(token),
                                  gt_tokenizer_get_line_number(tz),
                                  gt_tokenizer_get_filename(tz), err);
      if (had_err) {
        /* release the token on the error path as well, like every other
           path through this loop does */
        gt_str_delete(token);
        break;
      }
      idx1 = gt_alphabet_encode(sm->alphabet, amino_acid);
      idx2 = gt_alphabet_encode(sm->alphabet, *(char*)
                                gt_array_get(index_to_alpha_char_mapping, i));
      gt_score_matrix_set_score(sm,
                                idx1 == WILDCARD ? num_of_chars : idx1,
                                idx2 == WILDCARD ? num_of_chars : idx2,
                                score);
      i++;
      gt_str_delete(token);
      gt_tokenizer_next_token(tz);
      if (gt_tokenizer_line_start(tz))
        break;
    }
  }
  return had_err;
}
/* For every query sequence read from <arguments->queryname>, compute the
   shustring lengths of all its suffixes with respect to the packed index
   named by <arguments->indexname>.

   If <arguments->shulen_only> is set, the sum of these lengths is printed
   per query. Otherwise the average length and the fraction of c/g
   characters of the query are fed into gt_divergence(), and the value
   gt_calculateKr() derives from the result is printed.

   Returns 0 on success and 1 if the index could not be loaded, the alphabet
   is not DNA, or reading a query failed. */
int gt_genomediff_pck_shu_simple(GtLogger *logger,
                                 const GtGenomediffArguments *arguments,
                                 GtError *err)
{
  int had_err = 0;
  int retval;
  GtSeqIterator *queries = NULL;
  const GtUchar *symbolmap, *currentQuery;
  const GtAlphabet *alphabet;
  GtUchar c_sym = 0,
          g_sym = 0;
  uint64_t queryNo;
  char *description = NULL;
  unsigned long queryLength,
                subjectLength = 0,
                currentSuffix;
  double avgShuLength,
         currentShuLength = 0.0,
         /*gc_subject,*/
         gc_query /*, gc*/;
  const FMindex *subjectindex = NULL;
  Genericindex *genericindexSubject;
  const GtEncseq *encseq = NULL;
  double *ln_n_fac;

  /* get the precalculation of ln(n!) for 0<n<max_ln_n_fac */
  ln_n_fac = gt_get_ln_n_fac(arguments->max_ln_n_fac);
  gt_log_log("ln(max_ln_n_fac!) = %f\n",
             ln_n_fac[arguments->max_ln_n_fac]);

  genericindexSubject = genericindex_new(gt_str_get(
                                           arguments->indexname),
                                         arguments->with_esa,
                                         true,
                                         false,
                                         true,
                                         arguments->user_max_depth,
                                         logger,
                                         err);
  if (genericindexSubject == NULL)
  {
    had_err = 1;
  }
  else
  {
    encseq = genericindex_getencseq(genericindexSubject);
  }

  if (!had_err)
  {
    subjectLength = genericindex_get_totallength(genericindexSubject) - 1;
    /*subjectLength /= 2;*/
    /*gt_log_log("subject length: %lu", subjectLength);*/
    subjectindex = genericindex_get_packedindex(genericindexSubject);

    queries = gt_seqiterator_sequence_buffer_new(
                                          arguments->queryname,
                                          err);
    gt_assert(queries);
    alphabet = gt_encseq_alphabet(encseq);
    /* makes assumption that alphabet is dna, it has to calculate the gc! */
    if (!gt_alphabet_is_dna(alphabet))
    {
      /* NOTE(review): this path returns an error without setting <err> --
         confirm whether callers expect gt_error_set() here */
      fprintf(stderr, "error: Sequences need to be dna");
      had_err = 1;
    }
    else
    {
      /* let the iterator deliver encoded queries so that their characters can
         be compared with c_sym/g_sym below */
      symbolmap = gt_alphabet_symbolmap(alphabet);
      gt_seqiterator_set_symbolmap(queries, symbolmap);
      c_sym = gt_alphabet_encode(alphabet, 'c');
      g_sym = gt_alphabet_encode(alphabet, 'g');
    }
  }

  for (queryNo = 0; !had_err; queryNo++)
  {
    retval = gt_seqiterator_next(queries,
                                 &currentQuery,
                                 &queryLength,
                                 &description,
                                 err);
    if ( retval != 1)
    {
      if (retval < 0)
      {
        /* the iterator failed: propagate the error instead of returning
           success; 1 is the error value used throughout this function */
        gt_free(description);
        had_err = 1;
      }
      break;
    }
    gt_logger_log(logger,
                  "found query of length: %lu",
                  queryLength);
    avgShuLength = 0.0;
    gc_query = 0.0;
    /* sum up the shustring lengths of all suffixes and count c/g */
    for (currentSuffix = 0; currentSuffix < queryLength; currentSuffix++)
    {
      currentShuLength = (double) gt_pck_getShuStringLength(
                    subjectindex,
                    &currentQuery[currentSuffix],
                    queryLength - currentSuffix);
      avgShuLength += currentShuLength;
      if (currentQuery[currentSuffix] == c_sym ||
          currentQuery[currentSuffix] == g_sym)
      {
        gc_query++;
      }
    }
    if (arguments->shulen_only)
    {
      printf("# Query %d sum of shulen:\n %.0f\n",
             (int) queryNo, avgShuLength);
    }
    else
    {
      /* turn the sums into averages */
      avgShuLength /= (double) queryLength;
      gc_query /= (double) queryLength;

      gt_logger_log(logger, "Query %d has an average SHUstring length "
                            "of\n# shulength: %f",
                            (int) queryNo, avgShuLength);
      gt_logger_log(logger, "Query description: %s", description);
      gt_log_log("Query (i): %s", description);

  /* XXX add error checks */

      if ( !had_err )
      {
        double div, kr;

        gt_logger_log(logger, "shulen:\n%f", avgShuLength);
        gt_log_log("shu: %f, gc: %f, len: %lu",
            avgShuLength, gc_query, subjectLength);
        div =  gt_divergence(arguments->divergence_rel_err,
                             arguments->divergence_abs_err,
                             arguments->divergence_m,
                             arguments->divergence_threshold,
                             avgShuLength,
                             subjectLength,
                             gc_query,
                             ln_n_fac,
                             arguments->max_ln_n_fac);
        gt_logger_log(logger, "divergence:\n%f", div);

        kr = gt_calculateKr(div);

        printf("# Kr:\n%f\n", kr);
      }
    }
  }
  gt_free(ln_n_fac);
  gt_seqiterator_delete(queries);
  genericindex_delete(genericindexSubject);
  return had_err;
}
Beispiel #12
0
static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv,
                           int parsed_args, void *tool_arguments,
                           GtError *err)
{
  GtEncseqInfoArguments *arguments = tool_arguments;
  int had_err = 0;
  GtAlphabet *alpha;
  const GtUchar *chars;
  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->nomap) {
    GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err);
    if (!emd)
      had_err = -1;

    if (!had_err) {
      if (!arguments->noindexname) {
        gt_file_xprintf(arguments->outfp, "index name: ");
        gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]);
      }

      gt_file_xprintf(arguments->outfp, "file format version: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                          gt_encseq_metadata_version(emd));

      gt_file_xprintf(arguments->outfp, "64-bit file: ");
      gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd)
                                                  ? "yes"
                                                  : "no");

      gt_file_xprintf(arguments->outfp, "total length: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_metadata_total_length(emd));

      gt_file_xprintf(arguments->outfp, "number of sequences: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                      gt_encseq_metadata_num_of_sequences(emd));

      gt_file_xprintf(arguments->outfp, "number of files: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_metadata_num_of_files(emd));

      gt_file_xprintf(arguments->outfp, "length of shortest/longest "
                                        "sequence: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n",
                                        gt_encseq_metadata_min_seq_length(emd),
                                        gt_encseq_metadata_max_seq_length(emd));

      gt_file_xprintf(arguments->outfp, "accesstype: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                 gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd)));

      alpha = gt_encseq_metadata_alphabet(emd);
      chars = gt_alphabet_characters(alpha);
      gt_file_xprintf(arguments->outfp, "alphabet size: ");
      gt_file_xprintf(arguments->outfp, "%u\n",
                                        gt_alphabet_num_of_chars(alpha));
      gt_file_xprintf(arguments->outfp, "alphabet characters: ");
      gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha),
                                        (char*) chars);
      if (gt_alphabet_is_dna(alpha))
        gt_file_xprintf(arguments->outfp, " (DNA)");
      if (gt_alphabet_is_protein(alpha))
        gt_file_xprintf(arguments->outfp, " (Protein)");
      gt_file_xprintf(arguments->outfp, "\n");
      if (arguments->show_alphabet) {
        GtStr *out = gt_str_new();
        gt_alphabet_to_str(alpha, out);
        gt_file_xprintf(arguments->outfp, "alphabet definition:\n");
        gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out));
        gt_str_delete(out);
      }

    }
    gt_encseq_metadata_delete(emd);
  } else {
    GtEncseqLoader *encseq_loader;
    GtEncseq *encseq;

    encseq_loader = gt_encseq_loader_new();
    if (arguments->mirror)
      gt_encseq_loader_mirror(encseq_loader);
    if (!(encseq = gt_encseq_loader_load(encseq_loader,
                                         argv[parsed_args], err)))
      had_err = -1;

    if (!had_err) {
      const GtStrArray *filenames;
      GtUword i;

      if (!arguments->noindexname) {
        gt_file_xprintf(arguments->outfp, "index name: ");
        gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]);
      }

      gt_file_xprintf(arguments->outfp, "file format version: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq));

      gt_file_xprintf(arguments->outfp, "64-bit file: ");
      gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq)
                                                   ? "yes"
                                                   : "no");

      gt_file_xprintf(arguments->outfp, "total length: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_total_length(encseq));

      gt_file_xprintf(arguments->outfp, "compressed size: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n",
                                        gt_encseq_sizeofrep(encseq));

      gt_file_xprintf(arguments->outfp, "number of sequences: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_num_of_sequences(encseq));

      gt_file_xprintf(arguments->outfp, "number of files: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_num_of_files(encseq));

      gt_file_xprintf(arguments->outfp, "length of shortest/longest "
                                        "sequence: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n",
                                      gt_encseq_min_seq_length(encseq),
                                      gt_encseq_max_seq_length(encseq));

      filenames = gt_encseq_filenames(encseq);
      gt_file_xprintf(arguments->outfp, "original filenames:\n");
      for (i = 0; i < gt_str_array_size(filenames); i++) {
        gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n",
                                          gt_str_array_get(filenames, i),
                                          (GtUword)
                                     gt_encseq_effective_filelength(encseq, i));
      }

      alpha = gt_encseq_alphabet(encseq);
      chars = gt_alphabet_characters(alpha);
      gt_file_xprintf(arguments->outfp, "alphabet size: ");
      gt_file_xprintf(arguments->outfp, "%u\n",
                                        gt_alphabet_num_of_chars(alpha));
      gt_file_xprintf(arguments->outfp, "alphabet characters: ");
      gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha),
                                        (char*) chars);
      if (gt_alphabet_is_dna(alpha))
        gt_file_xprintf(arguments->outfp, " (DNA)");
      if (gt_alphabet_is_protein(alpha))
        gt_file_xprintf(arguments->outfp, " (Protein)");
      gt_file_xprintf(arguments->outfp, "\n");
      if (arguments->show_alphabet) {
        GtStr *out = gt_str_new();
        gt_alphabet_to_str(alpha, out);
        gt_file_xprintf(arguments->outfp, "alphabet definition:\n");
        gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out));
        gt_str_delete(out);
      }

      gt_file_xprintf(arguments->outfp, "character distribution:\n");
      for (i = 0; i < gt_alphabet_num_of_chars(alpha); i++) {
        GtUword cc;
        cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i]));
        gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n",
                                          (char) chars[i],
                                          cc,
                             (cc /(double) (gt_encseq_total_length(encseq)
                                  - gt_encseq_num_of_sequences(encseq)+1))*100);
      }

      gt_file_xprintf(arguments->outfp, "number of wildcards: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n",
                                        gt_encseq_wildcards(encseq),
                                        gt_encseq_realwildcardranges(encseq));

      gt_file_xprintf(arguments->outfp, "number of special characters: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n",
                                        gt_encseq_specialcharacters(encseq),
                                        gt_encseq_realspecialranges(encseq));

      gt_file_xprintf(arguments->outfp, "length of longest non-special "
                                        "character stretch: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                   gt_encseq_lengthoflongestnonspecial(encseq));

      gt_file_xprintf(arguments->outfp, "accesstype: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                   gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq)));

      gt_file_xprintf(arguments->outfp, "bits used per character: ");
      gt_file_xprintf(arguments->outfp, "%f\n",
        (double) ((uint64_t) CHAR_BIT *
                  (uint64_t) gt_encseq_sizeofrep(encseq)) /
        (double) gt_encseq_total_length(encseq));

      gt_file_xprintf(arguments->outfp, "has special ranges: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                        gt_encseq_has_specialranges(encseq)
                                          ? "yes"
                                          : "no");

      gt_file_xprintf(arguments->outfp, "has description support: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                       gt_encseq_has_description_support(encseq)
                                          ? "yes"
                                          : "no");

      if (gt_encseq_has_description_support(encseq)) {
        gt_file_xprintf(arguments->outfp, "length of longest description: ");
        gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                          gt_encseq_max_desc_length(encseq));
      }

      gt_file_xprintf(arguments->outfp, "has multiple sequence support: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                        gt_encseq_has_multiseq_support(encseq)
                                          ? "yes"
                                          : "no");
    }
    gt_encseq_delete(encseq);
    gt_encseq_loader_delete(encseq_loader);
  }

  return had_err;
}