Ejemplo n.º 1
0
/* Produce the next fragment of the sequence the shredder is currently working
   on. The fragment length is <minlength> plus a random amount obtained from
   gt_rand_max(maxlength - minlength) (no random call if both are equal),
   truncated at the sequence end. The length is stored in <fragment_length> and
   <desc> is overwritten with the description of the current sequence.
   Afterwards the shredder state is advanced: to the next sequence if the
   fragment reached the sequence end, otherwise forward by the fragment length
   minus <overlap> (but always by at least one position).
   Returns the result of gt_bioseq_get_sequence_range(), or NULL once all
   sequences have been processed.
   NOTE(review): presumably the caller owns the returned buffer -- confirm
   against gt_bioseq_get_sequence_range(). */
static char* generate_fragment(GtShredder *shredder,
                               unsigned long *fragment_length,
                               GtStr *desc)
{
  unsigned long total_len, start, len;
  char *fragment;

  gt_assert(shredder && fragment_length);

  /* nothing left to shred */
  if (shredder->seqnum >= gt_bioseq_number_of_sequences(shredder->bioseq))
    return NULL;

  total_len = gt_bioseq_get_sequence_length(shredder->bioseq,
                                            shredder->seqnum);

  /* draw the fragment length */
  len = shredder->minlength;
  if (shredder->maxlength != shredder->minlength)
    len += gt_rand_max(shredder->maxlength - shredder->minlength);
  gt_assert(len >= shredder->minlength);

  /* do not run past the end of the sequence */
  start = shredder->pos;
  if (start + len > total_len)
    len = total_len - start;
  *fragment_length = len;

  gt_str_reset(desc);
  gt_str_append_cstr(desc, gt_bioseq_get_description(shredder->bioseq,
                                                     shredder->seqnum));

  gt_assert(start + len <= total_len);
  fragment = gt_bioseq_get_sequence_range(shredder->bioseq, shredder->seqnum,
                                          start, start + len - 1);

  if (start + len == total_len) {
    /* that was the last fragment of this sequence */
    shredder->seqnum++;
    shredder->pos = 0;
  }
  else if (len > shredder->overlap) {
    shredder->pos += len - shredder->overlap;
  }
  else {
    /* guarantee progress of at least one base per step */
    shredder->pos++;
  }

  return fragment;
}
Ejemplo n.º 2
0
/* Write every sequence of <bs> whose description matches <pattern> (as decided
   by gt_grep()) to <outfp> via gt_fasta_show_entry(), using line width <width>.
   Stops at the first gt_grep() failure and returns its error code; returns 0
   otherwise. */
static int extractseq_match(GtFile *outfp, GtBioseq *bs,
                            const char *pattern, unsigned long width,
                            GtError *err)
{
  unsigned long seqidx = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(bs && pattern);

  while (!had_err && seqidx < gt_bioseq_number_of_sequences(bs)) {
    bool matched;
    const char *description = gt_bioseq_get_description(bs, seqidx);

    gt_assert(description);
    had_err = gt_grep(&matched, pattern, description, err);
    if (!had_err && matched) {
      gt_fasta_show_entry(description, gt_bioseq_get_sequence(bs, seqidx),
                          gt_bioseq_get_sequence_length(bs, seqidx), width,
                          outfp);
    }
    seqidx++;
  }

  return had_err;
}
Ejemplo n.º 3
0
/* Tool runner: iterate over all sequence files given in
   argv[parsed_args..argc-1] and print (as FASTA) every sequence passing the
   filters configured in <tool_arguments> (a SeqFilterArguments):
     - step:        only every step-th sequence passes (1 = every sequence)
     - sample_prob: sequence passes if gt_rand_0_to_1() <= sample_prob
                    (no random number is drawn when sample_prob is 1.0)
     - minlength / maxlength: length bounds, ignored when GT_UNDEF_UWORD
     - maxseqnum:   stop emitting once this many sequences passed
                    (ignored when GT_UNDEF_UWORD)
   Sequences that are not printed are counted as filtered, including those
   skipped after maxseqnum was reached. A statistics line is written to stderr
   on success. Returns 0 on success, otherwise the error code reported by
   gt_bioseq_iterator_next(). */
static int gt_seqfilter_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GtError *err)
{
  SeqFilterArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bioseq;
  GtUint64 passed = 0, filtered = 0, num_of_sequences = 0, steps = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(tool_arguments);

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  while (!(had_err = gt_bioseq_iterator_next(bsi, &bioseq, err)) &&
         bioseq != NULL) {
    GtUword i;
    GtUint64 current_num = gt_bioseq_number_of_sequences(bioseq);
    for (i = 0;
         i < current_num &&
         (arguments->maxseqnum == GT_UNDEF_UWORD ||
          passed + 1 <= arguments->maxseqnum);
         i++) {
      /* fetched once; used by both length filters and the output call */
      const GtUword seqlen = gt_bioseq_get_sequence_length(bioseq, i);
      char *seq;
      /* the order of the tests matters: a random number is only drawn for
         sequences that already passed the step filter */
      if ((arguments->step == 1 ||
           steps + 1 == arguments->step) &&
          (arguments->sample_prob == 1.0 ||
           gt_rand_0_to_1() <= arguments->sample_prob) &&
          (arguments->minlength == GT_UNDEF_UWORD ||
           seqlen >= arguments->minlength) &&
          (arguments->maxlength == GT_UNDEF_UWORD ||
           seqlen <= arguments->maxlength)) {
        seq = gt_bioseq_get_sequence(bioseq, i);
        gt_fasta_show_entry(gt_bioseq_get_description(bioseq, i),
                            seq,
                            seqlen,
                            arguments->width, arguments->outfp);
        gt_free(seq);
        passed++;
      }
      else {
        filtered++;
      }
      /* cyclic counter in [0, step) */
      steps = (steps + 1 == arguments->step) ? 0 : steps + 1;
    }
    /* sequences not visited because maxseqnum was reached */
    filtered += current_num - i;
    num_of_sequences += current_num;
    gt_bioseq_delete(bioseq);
  }

  /* show statistics */
  if (!had_err) {
    /* avoid 0/0 (NaN) in the report when the input held no sequences */
    const double removed_percent = num_of_sequences > 0
      ? ((double) filtered / num_of_sequences) * 100.0
      : 0.0;
    gt_assert(passed + filtered == num_of_sequences);
    fprintf(stderr, "# " GT_LLU " out of " GT_LLU
            " sequences have been removed (%.3f%%)\n",
            filtered, num_of_sequences, removed_percent);
  }

  gt_bioseq_iterator_delete(bsi);

  return had_err;
}
Ejemplo n.º 4
0
/* Find the sequence in <bsc> whose description matches <seqid> and store its
   file and sequence number in <filenum> and <seqnum>.
   The sequence ID is escaped with gt_grep_escape_extended() before it is used
   as a pattern; if bsc->matchdescstart is set the pattern is anchored to the
   start of the description and must be followed by whitespace or the end.
   Successful lookups are stored in bsc->grep_cache (created on demand) and
   answered from there on later calls.
   Returns 0 on success and a non-zero value with <err> set if gt_grep() fails,
   if no description matches, or if a second matching description is found. */
static int grep_desc(GtBioseqCol *bsc, GtUword *filenum,
                     GtUword *seqnum, GtStr *seqid, GtError *err)
{
  GtUword i, j, num_matches = 0;
  const GtSeqInfo *seq_info_ptr;
  GtSeqInfo seq_info;
  GtStr *pattern, *escaped;
  bool match = false;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(bsc && filenum && seqnum && seqid);
  /* create cache */
  if (!bsc->grep_cache)
    bsc->grep_cache = gt_seq_info_cache_new();
  /* try to read from cache */
  seq_info_ptr = gt_seq_info_cache_get(bsc->grep_cache, gt_str_get(seqid));
  if (seq_info_ptr) {
    *filenum = seq_info_ptr->filenum;
    *seqnum = seq_info_ptr->seqnum;
    return 0;
  }
  /* build the search pattern */
  pattern = gt_str_new();
  escaped = gt_str_new();
  gt_grep_escape_extended(escaped, gt_str_get(seqid), gt_str_length(seqid));
  if (bsc->matchdescstart)
    gt_str_append_cstr(pattern, "^");
  gt_str_append_str(pattern, escaped);
  if (bsc->matchdescstart)
    gt_str_append_cstr(pattern, "([[:space:]]|$)");
  /* scan the descriptions */
  for (i = 0; !had_err && i < bsc->num_of_seqfiles; i++) {
    GtBioseq *bioseq = bsc->bioseqs[i];
    for (j = 0; !had_err && j < gt_bioseq_number_of_sequences(bioseq); j++) {
      const char *desc = gt_bioseq_get_description(bioseq, j);
      had_err = gt_grep(&match, gt_str_get(pattern), desc, err);
      if (!had_err && match) {
        num_matches++;
        if (num_matches > 1) {
          gt_error_set(err, "query seqid '%s' could match more than one "
                            "sequence description", gt_str_get(seqid));
          had_err = -1;
          break;
        }
        *filenum = i;
        *seqnum = j;
      }
    }
    /* NOTE(review): <match> holds the result for the last description tested
       in this file, so the search only stops early if that one matched (or
       after the ambiguity error above); later files are then not checked for
       further matches. Kept as is -- confirm whether this is intended. */
    if (match)
      break;
  }
  gt_str_delete(pattern);
  gt_str_delete(escaped);
  if (!had_err && num_matches == 0) {
    gt_error_set(err, "no description matched sequence ID '%s'",
                 gt_str_get(seqid));
    had_err = -1;
  }
  /* Cache the result only after the scan finished successfully. Caching on
     the first match would leave an entry behind when the query later turns
     out to be ambiguous, and a repeated query would then succeed from the
     cache instead of reporting the error again. */
  if (!had_err) {
    seq_info.filenum = *filenum;
    seq_info.seqnum = *seqnum;
    gt_seq_info_cache_add(bsc->grep_cache, gt_str_get(seqid), &seq_info);
  }
  return had_err;
}
Ejemplo n.º 5
0
/* Unit test for gt_pbs_find().
   A two-entry tRNA library is written to a temporary FASTA file and loaded
   with gt_bioseq_new(). The test sequence <fullseq> carries one expected
   primer binding site next to each LTR boundary (marked in the literal below).
   gt_pbs_find() is run on a 600 character window of <fullseq> and on a copy of
   that window passed through gt_reverse_complement(); the test then checks the
   number of hits and, for both ranked hits, alignment length, edit distance,
   offset, tRNA start, tRNA name, coordinates, strand and the matched substring
   of <fullseq>.
   Returns <had_err>, which starts at 0.
   NOTE(review): ensure() is defined elsewhere; presumably it sets <had_err>
   (and <err>) when its condition fails and is skipped once <had_err> is set --
   confirm against its definition. */
int gt_pbs_unit_test(GtError *err)
{
  int had_err = 0;
  GtLTRElement element;
  GtPBSOptions o;
  GtStr *tmpfilename;
  FILE *tmpfp;
  GtPBSResults *res;
  GtPBSHit *hit;
  double score1, score2;
  GtRange rng;
  char *rev_seq,
       *seq,
       tmp[BUFSIZ];
  /* test sequence; the forward PBS starts at offset 120 and the reverse PBS at
     offset 506 of this string, matching the coordinates checked below */
  const char *fullseq =                           "aaaaaaaaaaaaaaaaaaaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "acatactaggatgctag" /* <- PBS forward */
                                     "aatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatag"
                                   /* PBS reverse -> */ "gatcctaaggctac"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "aaaaaaaaaaaaaaaaaaaa";

  /* notice previous errors */
  gt_error_check(err);

  /* create temporary tRNA library file */
  tmpfilename = gt_str_new();
  tmpfp = gt_xtmpfp(tmpfilename);
  fprintf(tmpfp, ">test1\nccccccccccccccctagcatcctagtatgtccc\n"
                 ">test2\ncccccccccgatcctagggctaccctttc\n");
  gt_fa_xfclose(tmpfp);
  ensure(had_err, gt_file_exists(gt_str_get(tmpfilename)));

  /* setup testing parameters */
  o.radius = 30;
  o.max_edist = 1;
  o.alilen.start = 11;
  o.alilen.end = 30;
  o.offsetlen.start = 0;
  o.offsetlen.end = 5;
  o.trnaoffsetlen.start = 0;
  o.trnaoffsetlen.end =  40;
  o.ali_score_match = 5;
  o.ali_score_mismatch = -10;
  o.ali_score_insertion = o.ali_score_deletion = -20;
  o.trna_lib = gt_bioseq_new(gt_str_get(tmpfilename), err);
  ensure(had_err, gt_bioseq_number_of_sequences(o.trna_lib) == 2);

  /* LTR boundaries of the test element */
  element.leftLTR_5 = 20;
  element.leftLTR_3 = 119;
  element.rightLTR_5 = 520;
  element.rightLTR_3 = 619;

  /* setup sequences */
  /* both buffers receive the 600 characters of fullseq starting at offset 20;
     rev_seq is then handed to gt_reverse_complement() */
  seq     = gt_malloc(600 * sizeof (char));
  rev_seq = gt_malloc(600 * sizeof (char));
  memcpy(seq,     fullseq + 20, 600);
  memcpy(rev_seq, fullseq + 20, 600);
  gt_reverse_complement(rev_seq, 600, err);

  /* try to find PBS in sequences */
  res = gt_pbs_find(seq, rev_seq, &element, &o, err);
  ensure(had_err, res != NULL);
  ensure(had_err, gt_pbs_results_get_number_of_hits(res) == 2);

  /* check first hit on forward strand */
  hit = gt_pbs_results_get_ranked_hit(res, 0);
  ensure(had_err, hit != NULL);
  ensure(had_err, gt_pbs_hit_get_alignment_length(hit) == 17);
  ensure(had_err, gt_pbs_hit_get_edist(hit) == 0);
  ensure(had_err, gt_pbs_hit_get_offset(hit) == 0);
  ensure(had_err, gt_pbs_hit_get_tstart(hit) == 3);
  ensure(had_err, strcmp(gt_pbs_hit_get_trna(hit), "test1") == 0);
  rng = gt_pbs_hit_get_coords(hit);
  ensure(had_err, rng.start == 120);
  ensure(had_err, rng.end == 136);
  score1 = gt_pbs_hit_get_score(hit);
  ensure(had_err, gt_pbs_hit_get_strand(hit) == GT_STRAND_FORWARD);
  /* cut the reported range out of fullseq and compare it with the expected
     PBS; the zero-filled buffer provides the string terminator */
  memset(tmp, 0, BUFSIZ-1);
  memcpy(tmp, fullseq + (rng.start * sizeof (char)),
         (rng.end - rng.start + 1) * sizeof (char));
  ensure(had_err, strcmp(tmp, "acatactaggatgctag" ) == 0);

  /* check second hit on reverse strand */
  hit = gt_pbs_results_get_ranked_hit(res, 1);
  ensure(had_err, hit != NULL);
  ensure(had_err, gt_pbs_hit_get_alignment_length(hit) == 14);
  ensure(had_err, gt_pbs_hit_get_edist(hit) == 1);
  ensure(had_err, gt_pbs_hit_get_offset(hit) == 0);
  ensure(had_err, gt_pbs_hit_get_tstart(hit) == 6);
  ensure(had_err, strcmp(gt_pbs_hit_get_trna(hit), "test2") == 0);
  rng = gt_pbs_hit_get_coords(hit);
  ensure(had_err, rng.start == 506);
  ensure(had_err, rng.end == 519);
  score2 = gt_pbs_hit_get_score(hit);
  /* the first ranked hit must have the higher score */
  ensure(had_err, gt_double_compare(score1, score2) > 0);
  ensure(had_err, gt_pbs_hit_get_strand(hit) == GT_STRAND_REVERSE);
  /* same substring check as for the first hit */
  memset(tmp, 0, BUFSIZ-1);
  memcpy(tmp, fullseq + (rng.start * sizeof (char)),
         (rng.end - rng.start + 1) * sizeof (char));
  ensure(had_err, strcmp(tmp, "gatcctaaggctac" ) == 0);

  /* clean up */
  gt_xremove(gt_str_get(tmpfilename));
  ensure(had_err, !gt_file_exists(gt_str_get(tmpfilename)));
  gt_str_delete(tmpfilename);
  gt_bioseq_delete(o.trna_lib);
  gt_free(rev_seq);
  gt_free(seq);
  gt_pbs_results_delete(res);

  return had_err;
}
Ejemplo n.º 6
0
/* Search for primer binding sites of <element>.
   A window of 2 * o->radius + 1 characters is taken from <seq> (starting at
   the left LTR length minus the radius) and from <rev_seq> (starting at the
   right LTR length minus the radius). Each sequence in o->trna_lib is copied,
   reverse-complemented and aligned with gt_swalign() against both windows,
   using a DNA score function built from the o->ali_score_* values; every
   alignment is handed to gt_pbs_add_hit() with the matching strand.
   Returns a new GtPBSResults object whose hits array has been sorted with
   gt_pbs_hit_compare.
   <err> is only passed to gt_reverse_complement(), whose return value is
   discarded.
   NOTE(review): both windows must lie completely inside the given buffers;
   this is not checked here -- confirm that callers guarantee it. */
GtPBSResults* gt_pbs_find(const char *seq,
                          const char *rev_seq,
                          GtLTRElement *element,
                          GtPBSOptions *o,
                          GtError *err)
{
  GtSeq *seq_forward, *seq_rev;
  GtPBSResults *results;
  unsigned long j;
  GtAlignment *ali;
  GtAlphabet *a;
  GtScoreFunction *sf;

  /* validate the arguments before <o> is dereferenced for the first time */
  gt_assert(seq && rev_seq && element && o && o->trna_lib);

  a = gt_alphabet_new_dna();
  sf = gt_dna_scorefunc_new(a,
                            o->ali_score_match,
                            o->ali_score_mismatch,
                            o->ali_score_insertion,
                            o->ali_score_deletion);
  gt_assert(sf && a);

  results = gt_pbs_results_new(element, o);

  seq_forward = gt_seq_new(seq + (gt_ltrelement_leftltrlen(element))
                               - (o->radius),
                           2*o->radius + 1,
                           a);

  seq_rev     = gt_seq_new(rev_seq + (gt_ltrelement_rightltrlen(element))
                                   - (o->radius),
                           2*o->radius + 1,
                           a);

  for (j = 0; j < gt_bioseq_number_of_sequences(o->trna_lib); j++)
  {
    GtSeq *trna_seq, *trna_from3;
    char *trna_from3_full;
    unsigned long trna_seqlen;

    trna_seq = gt_bioseq_get_seq(o->trna_lib, j);
    trna_seqlen = gt_seq_length(trna_seq);

    /* reverse-complemented private copy of the tRNA; the buffer is handed
       over via gt_seq_new_own() and released through gt_seq_delete() below */
    trna_from3_full = gt_calloc(trna_seqlen, sizeof (char));
    memcpy(trna_from3_full, gt_seq_get_orig(trna_seq),
           sizeof (char)*trna_seqlen);
    (void) gt_reverse_complement(trna_from3_full, trna_seqlen, err);
    trna_from3 = gt_seq_new_own(trna_from3_full, trna_seqlen, a);

    /* forward strand */
    ali = gt_swalign(seq_forward, trna_from3, sf);
    gt_pbs_add_hit(results->hits, ali, o, trna_seqlen,
                   gt_seq_get_description(trna_seq), GT_STRAND_FORWARD,
                   results);
    gt_alignment_delete(ali);

    /* reverse strand */
    ali = gt_swalign(seq_rev, trna_from3, sf);
    gt_pbs_add_hit(results->hits, ali, o, trna_seqlen,
                   gt_seq_get_description(trna_seq), GT_STRAND_REVERSE,
                   results);
    gt_alignment_delete(ali);

    gt_seq_delete(trna_from3);
  }
  gt_seq_delete(seq_forward);
  gt_seq_delete(seq_rev);
  gt_score_function_delete(sf);
  gt_alphabet_delete(a);
  gt_array_sort(results->hits, gt_pbs_hit_compare);
  return results;
}
Ejemplo n.º 7
0
/* Parameterize the model of <bssm_param> selected by <termtype> from training
   data below <path>.
   The version number is set, the selected model is marked as set and its
   window sizes are initialized. Then, for each of the NUMOFFILES entries of
   filenames[], the file <path>/<GT_donor|GC_donor|AG_acceptor>/<filename>
   (with ".gz" appended if <gzip> is true) is loaded. Every sequence in it must
   have length STRINGSIZE and carry the dinucleotide matching <termtype> at
   the 0-based positions 50 and 51; if so, build_bssm() is called for the file
   with the file index as hypothesis number.
   Returns 0 on success and -1 with <err> set on the first file that cannot be
   loaded or the first sequence that fails validation. */
int gth_bssm_param_parameterize(GthBSSMParam *bssm_param, const char *path,
                                Termtype termtype, bool gzip, GtError *err)
{
    GtAlphabet *alphabet = NULL;
    GtBioseq *bioseq;
    GtStr *file2proc;
    GtUword i, j;
    int had_err = 0;
    gt_error_check(err);

    file2proc = gt_str_new();

    /* set version number */
    bssm_param->version_num = (unsigned char) MYVERSION;

    /* set model to true and set window sizes */
    switch (termtype) {
    case GT_DONOR_TYPE:
        bssm_param->gt_donor_model_set = true;
        set_window_sizes_in_Bssmmodel(&bssm_param->gt_donor_model);
        break;
    case GC_DONOR_TYPE:
        bssm_param->gc_donor_model_set = true;
        set_window_sizes_in_Bssmmodel(&bssm_param->gc_donor_model);
        break;
    case AG_ACCEPTOR_TYPE:
        bssm_param->ag_acceptor_model_set = true;
        set_window_sizes_in_Bssmmodel(&bssm_param->ag_acceptor_model);
        break;
    default:
        gt_assert(0);
    }

    for (i = 0; !had_err && i < NUMOFFILES; i++) {
        /* assemble the name of the data file to process */
        gt_str_append_cstr(file2proc, path);
        switch (termtype) {
        case GT_DONOR_TYPE:
            gt_str_append_cstr(file2proc, "/GT_donor/");
            gt_str_append_cstr(file2proc, filenames[i]);
            break;
        case GC_DONOR_TYPE:
            gt_str_append_cstr(file2proc, "/GC_donor/");
            gt_str_append_cstr(file2proc, filenames[i]);
            break;
        case AG_ACCEPTOR_TYPE:
            gt_str_append_cstr(file2proc, "/AG_acceptor/");
            gt_str_append_cstr(file2proc, filenames[i]);
            break;
        default:
            gt_assert(0);
        }

        if (gzip)
            gt_str_append_cstr(file2proc, ".gz");

        if (!(bioseq = gt_bioseq_new(gt_str_get(file2proc), err)))
            had_err = -1;

        if (!had_err)
            alphabet = gt_bioseq_get_alphabet(bioseq);

        /* check here if all sequences have the length 102 and correct bases at
           positions 51 and 52 (i.e., GT, GC, or AG) */
        for (j = 0; !had_err && j < gt_bioseq_number_of_sequences(bioseq); j++) {
            GtUchar encoded_seq[2];
            /* check length */
            if (gt_bioseq_get_sequence_length(bioseq, j) != STRINGSIZE) {
                gt_error_set(err,
                             "sequence "GT_WU" in file \"%s\" does not have length %u",
                             j, gt_str_get(file2proc), STRINGSIZE);
                had_err = -1;
                /* stop before the reads below: the sequence may be too short
                   to have positions 50 and 51 */
                break;
            }
            encoded_seq[0] = gt_bioseq_get_encoded_char(bioseq, j, 50);
            encoded_seq[1] = gt_bioseq_get_encoded_char(bioseq, j, 51);
            /* check base correctness */
            switch (termtype) {
            case GT_DONOR_TYPE:
                if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'G') ||
                        encoded_seq[1] != gt_alphabet_encode(alphabet, 'T')) {
                    gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a GT "
                                 "sequence", j, gt_str_get(file2proc));
                    had_err = -1;
                }
                break;
            case GC_DONOR_TYPE:
                if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'G') ||
                        encoded_seq[1] != gt_alphabet_encode(alphabet, 'C')) {
                    gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a GC "
                                 "sequence", j, gt_str_get(file2proc));
                    had_err = -1;
                }
                break;
            case AG_ACCEPTOR_TYPE:
                if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'A') ||
                        encoded_seq[1] != gt_alphabet_encode(alphabet, 'G')) {
                    gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a AG "
                                 "sequence", j, gt_str_get(file2proc));
                    had_err = -1;
                }
                break;
            default:
                gt_assert(0);
            }
        }

        /* all sequences are valid: update the model for this file */
        if (!had_err) {
            switch (termtype) {
            case GT_DONOR_TYPE:
                build_bssm(bioseq, &bssm_param->gt_donor_model, i);
                break;
            case GC_DONOR_TYPE:
                build_bssm(bioseq, &bssm_param->gc_donor_model, i);
                break;
            case AG_ACCEPTOR_TYPE:
                build_bssm(bioseq, &bssm_param->ag_acceptor_model, i);
                break;
            default:
                gt_assert(0);
            }
        }

        /* reset */
        gt_str_reset(file2proc);

        /* free space; bioseq is NULL here if loading failed */
        gt_bioseq_delete(bioseq);
    }
    gt_str_delete(file2proc);

    return had_err;
}
Ejemplo n.º 8
0
/* Updates the BSSM parameterization: fills
   bssm_model->hypotables.hypo7table[hypothesisnum] from the sequences in
   <bioseq>, which must all have length STRINGSIZE and encode to symbols
   smaller than ALPHSIZE.
   Slot 0 receives the relative mononucleotide frequencies of the first
   position. Slot k (1 <= k < STRINGSIZE) receives, for each symbol pair (i,j),
   the dinucleotide frequency at position k-1 divided by the mononucleotide
   frequency of i at position k-1, or NULLPROB if that mononucleotide frequency
   is zero. Rows containing a NULLPROB entry are then smoothed with PSEUDOPROB.
   NOTE(review): with an empty <bioseq> all frequencies are computed as 0/0;
   presumably callers never pass an empty training file -- confirm. */
static void build_bssm(GtBioseq *bioseq, GthBSSMModel *bssm_model,
                       unsigned int hypothesisnum)
{
    GtUword mono_ct[STRINGSIZE-1][ALPHSIZE],         /* Mononuc freq */
            di_ct[STRINGSIZE-1][ALPHSIZE][ALPHSIZE]; /* Dinuc freq */
    double mono_freq,      /* Mononuc relative freq */
           di_freq;        /* Dinuc relative freq */
    GtUword i, j, k, /* Iterator variables */
            len, curlen = 0,
                 num_entries = gt_bioseq_number_of_sequences(bioseq);
    GtUchar *encoded_seq = NULL;
    int row_has_nullprob;

    /* Inits of local variables */
    for (i = 0; i < (STRINGSIZE-1); i++) {
        for (j = 0; j < ALPHSIZE; j++) {
            mono_ct[i][j] = INITVAL_INT;
            for (k = 0; k < ALPHSIZE; k++)
                di_ct[i][j][k] = INITVAL_INT;
        }
    }

    /* Count mono- and dinucleotides in a single pass, so that every sequence
       is decoded only once */
    for (j = 0; j < num_entries; j++) {
        len = gt_bioseq_get_sequence_length(bioseq, j);
        gt_assert(len == STRINGSIZE);
        if (len > curlen) {
            encoded_seq = gt_realloc(encoded_seq, len);
            curlen = len;
        }
        gt_bioseq_get_encoded_sequence(bioseq, encoded_seq, j);
        for (i = 0; i < (STRINGSIZE-1); i++) {
            /* both symbols are used as array indices below, including the one
               at the last position (i + 1 == STRINGSIZE-1) */
            gt_assert(encoded_seq[i] < ALPHSIZE &&
                      encoded_seq[i + 1] < ALPHSIZE);
            mono_ct[i][encoded_seq[i]]++;
            di_ct[i][encoded_seq[i]][encoded_seq[i + 1]]++;
        }
    }

    gt_free(encoded_seq);

    /* Record equilibrium frequencies (1st ``slot" in transition freqs) */
    for (i = 0; i < ALPHSIZE; i++) {
        for (j = 0; j < ALPHSIZE; j++) {
            bssm_model->hypotables
            .hypo7table[hypothesisnum][0][i][j] = (GthFlt)
                                                  mono_ct[0][i] / num_entries;
        }
    }

    /* Populate the remaining transition frequencies */
    for (k = 1; k < STRINGSIZE; k++) {
        for (i = 0; i < ALPHSIZE; i++) {
            mono_freq = (double) mono_ct[k-1][i] / num_entries;
            for (j = 0; j < ALPHSIZE; j++) {
                di_freq = (double) di_ct[k-1][i][j] / num_entries;
                if (mono_freq == 0.0) {
                    bssm_model->hypotables
                    .hypo7table[hypothesisnum][k][i][j] = (GthFlt) NULLPROB;
                }
                else {
                    bssm_model->hypotables
                    .hypo7table[hypothesisnum][k][i][j] = (GthFlt)
                                                          (di_freq / mono_freq);
                }
            }

            /* Remove zero transition probabilities:
               Briefly, 0.0 entries (dinucleotide absent in training corpus) are
               replaced arbitrarily by PSEUDOPROB, and non-0.0 entries p are replaced
               by p = p * (1 - 4 * PSEUDOPROB) + PSEUDOPROB */

            /* If any entry is NULLPROB, ALL elements in the row need fixing */
            row_has_nullprob = 0;
            for (j = 0; j < ALPHSIZE; j++) {
                if (bssm_model->hypotables
                        .hypo7table[hypothesisnum][k][i][j] == NULLPROB) {
                    row_has_nullprob = 1;
                    break;
                }
            }
            if (!row_has_nullprob)
                continue;

            for (j = 0; j < ALPHSIZE; j++) {
                if (bssm_model->hypotables
                        .hypo7table[hypothesisnum][k][i][j] == NULLPROB) {
                    bssm_model->hypotables
                    .hypo7table[hypothesisnum][k][i][j] = (GthFlt)
                                                          PSEUDOPROB;
                }
                else {
                    /* Adjust non-zero transition prob */
                    bssm_model->hypotables.hypo7table[hypothesisnum][k][i][j] =
                        (GthFlt)
                        (bssm_model->hypotables.hypo7table[hypothesisnum][k][i][j] *
                         (1 - (4 * PSEUDOPROB)) + PSEUDOPROB);
                }
            }
        }
    }
}
Ejemplo n.º 9
0
/* Tool runner: read all sequences from the files in argv[parsed_args..argc-1]
   and print (as FASTA) each sequence that gt_md5set_add_sequence() reports as
   not yet contained in the MD5 set; all other non-error results are counted
   as duplicates. arguments->rev is passed through to the MD5 set.
   Depending on arguments->seqit the input is read either file by file via
   GtBioseq or through a GtSeqIterator (optionally with a progress bar if
   arguments->verbose is set). A statistics line is written to stderr on
   success.
   Returns 0 on success and -1 if a file cannot be opened, the sequence
   iterator reports an error, or the MD5 set reports an error. */
static int gt_sequniq_runner(int argc, const char **argv, int parsed_args,
                             void *tool_arguments, GtError *err)
{
  GtSequniqArguments *arguments = tool_arguments;
  GtUint64 duplicates = 0, num_of_sequences = 0;
  int i, had_err = 0;
  GtMD5Set *md5set;

  gt_error_check(err);
  gt_assert(arguments);
  md5set = gt_md5set_new(arguments->nofseqs);
  if (!arguments->seqit) {
    GtUword j;
    GtBioseq *bs;

    for (i = parsed_args; !had_err && i < argc; i++) {
      if (!(bs = gt_bioseq_new(argv[i], err)))
        had_err = -1;
      if (!had_err) {
        GtMD5SetStatus retval;
        for (j = 0; j < gt_bioseq_number_of_sequences(bs) && !had_err; j++) {
          const GtUword seqlen = gt_bioseq_get_sequence_length(bs, j);
          char *seq = gt_bioseq_get_sequence(bs, j);
          retval = gt_md5set_add_sequence(md5set, seq, seqlen,
                                          arguments->rev, err);
          if (retval == GT_MD5SET_NOT_FOUND)
            gt_fasta_show_entry(gt_bioseq_get_description(bs, j), seq,
                                seqlen,
                                arguments->width, arguments->outfp);
          else if (retval != GT_MD5SET_ERROR)
            duplicates++;
          else
            had_err = -1;
          num_of_sequences++;
          gt_free(seq);
        }
        gt_bioseq_delete(bs);
      }
    }
  }
  else {
    GtSeqIterator *seqit;
    GtStrArray *files;
    off_t totalsize;
    const GtUchar *sequence;
    char *desc;
    GtUword len;

    files = gt_str_array_new();
    for (i = parsed_args; i < argc; i++)
      gt_str_array_add_cstr(files, argv[i]);
    totalsize = gt_files_estimate_total_size(files);
    seqit = gt_seq_iterator_sequence_buffer_new(files, err);
    if (!seqit)
      had_err = -1;
    if (!had_err) {
      if (arguments->verbose) {
        gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                          (GtUint64) totalsize),
                             (GtUint64) totalsize);
      }
      while (!had_err) {
        GtMD5SetStatus retval;
        const int next_rval = gt_seq_iterator_next(seqit, &sequence, &len,
                                                   &desc, err);
        if (next_rval < 0) {
          /* a read/parse error must not be mistaken for the end of input */
          had_err = -1;
          break;
        }
        if (next_rval != 1)
          break; /* input exhausted */

        retval = gt_md5set_add_sequence(md5set, (const char*) sequence, len,
                                        arguments->rev, err);
        if (retval == GT_MD5SET_NOT_FOUND)
          gt_fasta_show_entry(desc, (const char*) sequence, len,
                              arguments->width, arguments->outfp);
        else if (retval != GT_MD5SET_ERROR)
          duplicates++;
        else
          had_err = -1;
        num_of_sequences++;
      }
      if (arguments->verbose)
        gt_progressbar_stop();
      gt_seq_iterator_delete(seqit);
    }
    gt_str_array_delete(files);
  }

  /* show statistics */
  if (!had_err) {
    /* avoid 0/0 (NaN) in the report when the input held no sequences */
    const double removed_percent = num_of_sequences > 0
      ? ((double) duplicates / (double)num_of_sequences) * 100.0
      : 0.0;
    fprintf(stderr,
            "# "GT_WU" out of "GT_WU" sequences have been removed (%.3f%%)\n",
            (GtUword)duplicates, (GtUword)num_of_sequences,
            removed_percent);
  }

  gt_md5set_delete(md5set);
  return had_err;
}