Пример #1
0
SEXP rph_gff_convert_coords(SEXP gffP, SEXP msaP, SEXP toP) {
  GFF_Set *gff;
  MSA *msa=(MSA*)EXTPTR_PTR(msaP);
  int to=INTEGER_VALUE(toP);

  gff = gff_copy_set_no_groups((GFF_Set*)EXTPTR_PTR(gffP));
  msa_map_gff_coords(msa, gff, -1, to, 0);
  return rph_gff_new_extptr(gff);
}
int main(int argc, char* argv[]) {
  FILE* F;
  MSA *msa;
  msa_format_type format = UNKNOWN_FORMAT;
  int src_ref = -1, dest_ref = 0, offset = 0;
  char *msa_fname = NULL, *feat_fname = NULL;
  GFF_Set *gff;
  char c;

  while ((c = (char)getopt(argc, argv, "hm:f:s:d:i:p:n:")) != -1) {
    switch(c) {
    case 'm':
      msa_fname = optarg;
      break;
    case 'f':
      feat_fname = optarg;
      break;
    case 's':
      src_ref = get_arg_int(optarg);
      break;
    case 'd':
      dest_ref = get_arg_int(optarg);
      break;
    case 'i':
      format = msa_str_to_format(optarg);
      if (format == UNKNOWN_FORMAT) die("ERROR: bad alignment format.\n");
      break;
    case 'p':
      offset = get_arg_int(optarg);
      break;
    case 'n':
      offset = -1 * get_arg_int(optarg);
      break;
    case 'h':
      print_usage();
      exit(1);
    case '?':
      print_usage();
      exit(1);
    }
  }

  if (msa_fname == NULL || feat_fname == NULL) {
    print_usage();
    exit(1);
  }

  set_seed(-1);

  F = phast_fopen(feat_fname, "r");
  if ((gff = gff_read_set(F)) == NULL) { 
    die("ERROR: error reading %s.\n", feat_fname);
  }
  phast_fclose(F);

  /* handle case of local alignment specially -- avoid representing
     the alignment explicitly */
  F = phast_fopen(msa_fname, "r");
  if (format == UNKNOWN_FORMAT)
    format = msa_format_for_content(F, 1);
  if (format == LAV) {
    LocalPwAlignment *lpwa = NULL;
/*     int i; */

    fprintf(stderr, "WARNING: in local alignment mode, coordinates may only be mapped from query (reference) sequence to target (aligned) sequence.\n"); 

    lpwa = la_read_lav(F, 0);
    la_gff_transform(lpwa, gff);
/*     for (i = 0; i < lst_size(gff->features); i++) { */
/*       GFF_Feature *feat = lst_get_ptr(gff->features, i); */
/*       feat->start = la_get_target_coord(lpwa, feat->start); */
/*       feat->end = la_get_target_coord(lpwa, feat->end); */
/*     } */
  }

  else {                        /* normal alignment */
    msa = msa_new_from_file_define_format(F, format, NULL);
    phast_fclose(F);

    msa_map_gff_coords(msa, gff, src_ref, dest_ref, offset);
    msa_free(msa);
  }

  gff_print_set(stdout, gff);

  gff_free_set(gff);

  return 0;
}
Пример #3
0
int main(int argc, char* argv[]) {
    FILE* F;
    MSA *msa;
    int *msa_gap_patterns = NULL;
    HMM *hmm = NULL;
    TreeNode *tree = NULL;
    int i, input_format = SS, msa_idx, quiet_mode = FALSE,
           ncats, nmsas, ncats_unspooled, indel_nseqs = -1;
    String *msa_fname, *gff_fname;
    List *gff_fname_list = NULL, *msa_fname_list = NULL,
          *msa_length_list = NULL, *model_indels_str = NULL;
    Matrix *traincounts = NULL;
    Vector *begcounts = NULL, *statecounts = NULL;
    CategoryMap *cm = NULL;
    char c;
    GapPatternMap *gpm = NULL;
    GFF_Set *gff;
    char *reverse_groups_tag = NULL;

    while ((c = getopt(argc, argv, "i:g:c:m:M:R:I:n:t:P:G:qh")) != -1) {
        switch(c) {
        case 'i':
            input_format = msa_str_to_format(optarg);
            if (input_format == -1)
                die("ERROR: bad alignment format.\n");
            break;
        case 'g':
            gff_fname_list = get_arg_list(optarg);
            break;
        case 'c':
            cm = cm_new_string_or_file(optarg);
            break;
        case 'm':
            msa_fname_list = get_arg_list(optarg);
            break;
        case 'M':
            msa_length_list = str_list_as_int(get_arg_list(optarg));
            break;
        case 'R':
            reverse_groups_tag = optarg;
            break;
        case 'I':
            model_indels_str = get_arg_list(optarg);
            break;
        case 'n':
            indel_nseqs = get_arg_int(optarg);
            break;
        case 't':
            if (optarg[0] == '(')     /* in this case, assume topology given
                                   at command line */
                tree = tr_new_from_string(optarg);
            else
                tree = tr_new_from_file(phast_fopen(optarg, "r"));
            break;
        case 'q':
            quiet_mode = TRUE;
            break;
        case 'h':
            print_usage();
            exit(0);
        case '?':
            die("ERROR: unrecognized option.\n\nType 'hmm_train -h' for usage.\n");
        }
    }

    if (msa_fname_list == NULL)
        die("ERROR: -m required.  Type 'hmm_train -h' for usage.\n");
    if (gff_fname_list == NULL)
        die("ERROR: -g required in training mode.  Type 'hmm_train -h' for usage.\n");
    if (msa_length_list != NULL && msa_fname_list != NULL)
        die("ERROR: -m and -M are mutually exclusive.  Type 'hmm_train -h' for usage.\n");
    if (model_indels_str != NULL && tree == NULL)
        die("ERROR: -I requires -t.  Type 'hmm_train -h' for usage.\n");
    if (cm == NULL)
        die("ERROR: category map required.\n");

    set_seed(-1);

    ncats = cm->ncats + 1;
    ncats_unspooled = cm->unspooler != NULL ? cm->unspooler->nstates_unspooled :
                      ncats;
    nmsas = (msa_length_list != NULL ? lst_size(msa_length_list) :
             lst_size(msa_fname_list));

    if (model_indels_str != NULL) {
        if (tree == NULL)
            die("ERROR: tree is NULL\n");  /*FIXME: indel_ncats broken */
        gpm = gp_create_gapcats(cm, model_indels_str, tree, FALSE);
        ncats = cm->ncats + 1;    /* numbers will change */
        ncats_unspooled = cm->unspooler == NULL ? ncats :
                          cm->unspooler->nstates_unspooled;
    }

    /* allocate memory for storage of "training paths" */
    traincounts = mat_new(ncats_unspooled, ncats_unspooled);
    statecounts = vec_new(ncats_unspooled);
    begcounts = vec_new(ncats_unspooled);
    mat_zero(traincounts);
    vec_zero(statecounts);
    vec_zero(begcounts);


    /* create skeleton of new HMM. */
    hmm = hmm_new_nstates(ncats_unspooled, 0, 0);

    /* Main loop: consider each MSA in turn */
    for (msa_idx = 0; msa_idx < nmsas; msa_idx++) {
        if (msa_fname_list != NULL) {
            msa_fname = (String*)lst_get_ptr(msa_fname_list, msa_idx);
            F = phast_fopen(msa_fname->chars, "r");
            if (!quiet_mode)
                fprintf(stderr, "Reading alignment from %s ...\n",
                        F == stdin ? "stdin" : msa_fname->chars);
            msa = msa_new_from_file(F, NULL);
            phast_fclose(F);

        }
        else {                      /* only lengths of alignments specified */
            msa = msa_new(NULL, NULL, 0, lst_get_int(msa_length_list, msa_idx), NULL);
            /* just a shell in this case */
        }

        gff_fname = (String*)lst_get_ptr(gff_fname_list, msa_idx);
        if (!quiet_mode)
            fprintf(stderr, "Reading annotations from %s ...\n", gff_fname->chars);
        gff = gff_read_set(phast_fopen(gff_fname->chars, "r"));

        /* convert GFF to coordinate frame of alignment */
        if (msa_length_list == NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Mapping annotations to alignment ...\n");
            msa_map_gff_coords(msa, gff, 1, 0, 0); /* assume seq 1 is ref */
        }

        if (model_indels_str != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Obtaining gap patterns ...\n");
            msa_gap_patterns = smalloc(msa->length * sizeof(int));
            gp_set_phylo_patterns(gpm, msa_gap_patterns, msa);
        }

        /* at this point, we don't actually need the alignment anymore;
           if using ordered suff stats (likely with large data sets),
           can free them now, to avoid running out of memory */
        if (msa->ss != NULL) {
            ss_free(msa->ss);
            msa->ss = NULL;
        }

        if (reverse_groups_tag != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Reverse complementing features on negative strand (group by '%s') ...\n",
                        reverse_groups_tag);
            /* we don't need to reverse complement the whole alignment --
               just the gff and possibly the gap pattern array (pass a
               NULL msa) */
            gff_group(gff, reverse_groups_tag);
            msa_reverse_compl_feats(NULL, gff, msa_gap_patterns);
        }

        if (!quiet_mode)
            fprintf(stderr, "Labeling sites by category ...\n");
        msa_label_categories(msa, gff, cm);

        gff_free_set(gff);

        if (model_indels_str != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Remapping categories according to gap patterns ...\n");

            if (indel_nseqs > 0 && indel_nseqs != msa->nseqs) {
                /* in this case, we'll simply reassign non-trivial gap
                   patterns randomly.  This will achieve the desired
                   effect with minimal coding, as long as the number of
                   sites is not too small (the indel model is probably
                   useless anyway if the number is small) */
                int pat, newpat;
                int npatterns = 4 * indel_nseqs - 5;
                int complex_allowed[cm->ncats+1];
                List *no_complex_names, *no_complex_nums;

                if (!quiet_mode)
                    fprintf(stderr, "(target number of sequences: %d)\n", indel_nseqs);

                /* set up index indicating by cat no. whether complex gaps
                   are allowed */
                for (i = 0; i < ncats; i++) complex_allowed[i] = 1;
                no_complex_names = lst_new_ptr(10);
                str_split(str_new_charstr(NO_COMPLEX), ",", no_complex_names);
                no_complex_nums = cm_get_category_list(cm, no_complex_names, 1);
                for (i = 0; i < lst_size(no_complex_nums); i++)
                    complex_allowed[lst_get_int(no_complex_nums, i)] = 0;
                lst_free(no_complex_nums);
                lst_free_strings(no_complex_names);
                lst_free(no_complex_names);

                /* now reassign all non-null numbers */
                for (i = 0; i < msa->length; ) {
                    if ((pat = msa_gap_patterns[i]) != 0) {
                        if (complex_allowed[msa->categories[i]])
                            newpat = 1 + ((double)npatterns * unif_rand());
                        /* random number in interval [1, npatterns] */
                        else
                            newpat = 1 + ((double)(npatterns-1) * unif_rand());
                        /* random number in interval [1,npatterns-1]
                           (excludes complex gap pattern) */
                        for (; i < msa->length && msa_gap_patterns[i] == pat; i++)
                            msa_gap_patterns[i] = newpat; /* change for whole sequence */
                    }
                    else i++;
                }
            }

            /* obtain gapped category number for each site */
            for (i = 0; i < msa->length; i++)
                if (gpm->cat_x_pattern_to_gapcat[msa->categories[i]] != NULL)
                    msa->categories[i] = gpm->cat_x_pattern_to_gapcat[msa->categories[i]][msa_gap_patterns[i]];
        }

        if (!quiet_mode)
            fprintf(stderr, "Unspooling categories ...\n");
        cm_spooled_to_unspooled(cm, msa->categories, msa->length);

        if (!quiet_mode)
            fprintf(stderr, "Collecting training data ...\n");
        hmm_train_update_counts(traincounts, statecounts, begcounts,
                                msa->categories, msa->length,
                                ncats_unspooled);

        if (msa_gap_patterns != NULL) sfree(msa_gap_patterns);
        msa_free(msa);
    }

    /* now train HMM, using cumulative data */
    hmm_train_from_counts(hmm, traincounts, NULL, statecounts, NULL,
                          begcounts, NULL);

    /* if modeling indels, adjust begin transitions so probability is
       distributed among different "gap pattern" states that all
       correspond to the same ungapped state (category); this helps
       avoid problems that occur when training on a few large sequences
       (e.g., whole chromosomes) and then testing on many shorter ones */
    if (model_indels_str != NULL) {
        double tprob[gpm->ncats];
        int nst[gpm->ncats];  /* total prob and number of states per
                             spooled, ungapped category */
        for (i = 0; i < gpm->ncats; i++) tprob[i] = nst[i] = 0;
        for (i = 0; i < hmm->nstates; i++) {
            if (vec_get(hmm->begin_transitions, i) > 0)
                /* have to go from unspooled space to spooled space, then to
                   ungapped space (HMM states correspond to unspooled,
                   gapped categories).  Note that states with nonzero begin
                   probs shouldn't be conditioned on other states. */
                tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] +=
                    vec_get(hmm->begin_transitions, i);
            nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]++;
        }
        for (i = 0; i < hmm->nstates; i++)
            if (tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] > 0)
                vec_set(hmm->begin_transitions, i,
                        tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] /
                        nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]);
        /* (uniform prior) */
    }

    /* write trained HMM */
    hmm_print(stdout, hmm);

    if (!quiet_mode) fprintf(stderr, "Done.\n");

    return 0;
}
Пример #4
0
int main(int argc, char *argv[]) {
    char c;
    List *l;
    int i, j, strand, bed_output = 0, backgd_nmods = -1, feat_nmods = -1,
                      winsize = -1, verbose = 0, max_nmods, memblocksize, old_nleaves,
                      refidx = 1, base_by_base = FALSE, windowWig = FALSE;
    TreeModel **backgd_mods = NULL, **feat_mods = NULL;
    HMM *backgd_hmm = NULL, *feat_hmm = NULL;
    msa_format_type inform = UNKNOWN_FORMAT;
    GFF_Set *features = NULL;
    MSA *msa, *msa_compl=NULL;
    double **backgd_emissions, **feat_emissions, **mem, **dummy_emissions,
           *winscore_pos=NULL, *winscore_neg=NULL;
    int *no_alignment=NULL;
    List *pruned_names;
    char *msa_fname;
    FILE *infile;

    int opt_idx;
    struct option long_opts[] = {
        {"background-mods", 1, 0, 'b'},
        {"background-hmm", 1, 0, 'B'},
        {"feature-mods", 1, 0, 'f'},
        {"feature-hmm", 1, 0, 'F'},
        {"features", 1, 0, 'g'},
        {"window", 1, 0, 'w'},
        {"window-wig", 1, 0, 'W'},
        {"base-by-base", 0, 0, 'y'},
        {"msa-format", 1, 0, 'i'},
        {"refidx", 1, 0, 'r'},
        {"output-bed", 0, 0, 'd'},
        {"verbose", 0, 0, 'v'},
        {"help", 0, 0, 'h'},
        {0, 0, 0, 0}
    };

    while ((c = getopt_long(argc, argv, "B:b:F:f:r:g:w:W:i:ydvh", long_opts, &opt_idx)) != -1) {
        switch (c) {
        case 'B':
            backgd_hmm = hmm_new_from_file(phast_fopen(optarg, "r"));
            break;
        case 'b':
            l = get_arg_list(optarg);
            backgd_nmods = lst_size(l);
            backgd_mods = smalloc(backgd_nmods * sizeof(void*));
            for (i = 0; i < backgd_nmods; i++)
                backgd_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1);
            lst_free_strings(l);
            lst_free(l);
            break;
        case 'F':
            feat_hmm = hmm_new_from_file(phast_fopen(optarg, "r"));
            break;
        case 'f':
            l = get_arg_list(optarg);
            feat_nmods = lst_size(l);
            feat_mods = smalloc(feat_nmods * sizeof(void*));
            for (i = 0; i < feat_nmods; i++)
                feat_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1);
            lst_free_strings(l);
            lst_free(l);
            break;
        case 'g':
            features = gff_read_set(phast_fopen(optarg, "r"));
            break;
        case 'w':
            winsize = get_arg_int(optarg);
            if (winsize <= 0) die("ERROR: window size must be positive.\n");
            break;
        case 'W':
            winsize = get_arg_int(optarg);
            if (winsize <= 0) die("ERROR: window size must be positive.\n");
            windowWig = TRUE;
            break;
        case 'y':
            base_by_base = TRUE;
            break;
        case 'i':
            inform = msa_str_to_format(optarg);
            if (inform == UNKNOWN_FORMAT) die("Bad argument to -i.\n");
            break;
        case 'r':
            refidx = get_arg_int_bounds(optarg, 0, INFTY);
            break;
        case 'd':
            bed_output = 1;
            break;
        case 'h':
            printf("%s", HELP);
            exit(0);
        case 'v':
            verbose = 1;
            break;
        case '?':
            die("Bad argument.  Try '%s -h'.\n", argv[0]);
        }
    }

    set_seed(-1);

    if (backgd_mods == NULL || feat_mods == NULL)
        die("ERROR: -b and -f required.  Try '%s -h'.\n", argv[0]);

    if (backgd_nmods == 1 && backgd_hmm == NULL)
        backgd_hmm = hmm_create_trivial();
    else if (backgd_hmm == NULL)
        die("ERROR: -B required.  Try '%s -h'.\n", argv[0]);

    if (feat_nmods == 1 && feat_hmm == NULL)
        feat_hmm = hmm_create_trivial();
    else if (feat_hmm == NULL)
        die("ERROR: -F required.  Try '%s -h'.\n", argv[0]);

    if ((winsize == -1 && features == NULL && !base_by_base) ||
            (winsize != -1 && features != NULL) ||
            (winsize != -1 && base_by_base) ||
            (features != NULL && base_by_base))
        die("ERROR: must specify exactly one of -g, -w, and -y.  Try '%s -h'.\n", argv[0]);

    if (backgd_hmm->nstates != backgd_nmods)
        die("ERROR: number of states must equal number of tree models for background.\n");

    if (feat_hmm->nstates != feat_nmods)
        die("ERROR: number of states must equal number of tree models for features.\n");

    if (features != NULL && lst_size(features->features) == 0)
        die("ERROR: empty features file.\n");

    if (base_by_base && (backgd_nmods > 1 || feat_nmods > 1))
        die("ERROR: only single phylogenetic models (not HMMs) are supported with --base-by-base.\n");

    if (optind != argc - 1)
        die("ERROR: too few arguments.  Try '%s -h'.\n", argv[0]);

    if (verbose) fprintf(stderr, "Reading alignment ...\n");
    msa_fname = argv[optind];
    infile = phast_fopen(msa_fname, "r");
    if (inform == UNKNOWN_FORMAT)
        inform = msa_format_for_content(infile, 1);
    if (inform == MAF)
        msa = maf_read(infile, NULL, 1, NULL, NULL,
                       NULL, -1, TRUE, NULL, NO_STRIP, FALSE);
    else
        msa = msa_new_from_file_define_format(infile, inform, NULL);
    if (msa_alph_has_lowercase(msa)) msa_toupper(msa);
    msa_remove_N_from_alph(msa);

    /* need ordered representation of alignment */
    if (msa->seqs == NULL && (msa->ss == NULL || msa->ss->tuple_idx == NULL) )
        die("ERROR: ordered sufficient statistics are required.\n");

    pruned_names = lst_new_ptr(msa->nseqs);
    for (i = 0; i < backgd_nmods; i++) {
        old_nleaves = (backgd_mods[i]->tree->nnodes + 1) / 2;
        tm_prune(backgd_mods[i], msa, pruned_names);
        if (lst_size(pruned_names) >= old_nleaves)
            die("ERROR: no match for leaves of tree in alignment (background model #%d)\n", i+1);
        else if (lst_size(pruned_names) > 0) {
            fprintf(stderr, "WARNING: pruned away leaves in background model (#%d) with no match in alignment (", i+1);
            for (j = 0; j < lst_size(pruned_names); j++)
                fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars,
                        j < lst_size(pruned_names) - 1 ? ", " : ").\n");
        }
        lst_free_strings(pruned_names);
    }
    for (i = 0; i < feat_nmods; i++) {
        old_nleaves = (feat_mods[i]->tree->nnodes + 1) / 2;
        tm_prune(feat_mods[i], msa, pruned_names);
        if (lst_size(pruned_names) >= old_nleaves)
            die("ERROR: no match for leaves of tree in alignment (features model #%d)\n", i+1);
        else if (lst_size(pruned_names) > 0) {
            fprintf(stderr, "WARNING: pruned away leaves in features model (#%d) with no match in alignment (", i+1);
            for (j = 0; j < lst_size(pruned_names); j++)
                fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars,
                        j < lst_size(pruned_names) - 1 ? ", " : ").\n");
        }
        lst_free_strings(pruned_names);
    }
    lst_free(pruned_names);

    /* first have to subtract offset from features, if necessary */
    if (msa->idx_offset != 0 && features != NULL) {
        for (i = 0; i < lst_size(features->features); i++) {
            GFF_Feature *f = lst_get_ptr(features->features, i);
            f->start -= msa->idx_offset;
            f->end -= msa->idx_offset;
        }
    }

    /* convert to coord frame of alignment */
    if (features != NULL && refidx != 0) {
        if (verbose) fprintf(stderr, "Mapping coordinates ...\n");
        msa_map_gff_coords(msa, features, refidx, 0, 0);
        if (lst_size(features->features) == 0)
            die("ERROR: no features within coordinate range of alignment.\n");
    }

    /* Make a reverse complemented copy of the alignment.  The two
       strands will be processed separately, to avoid problems with
       overlapping features, etc. */
    if (!base_by_base) {          /* skip in base by base case */
        if (verbose) fprintf(stderr, "Creating reverse complemented alignment ...\n");
        msa_compl = msa_create_copy(msa, 0);
        /* temporary workaround: make sure reverse complement not based on
           sufficient stats */
        if (msa_compl->seqs == NULL) ss_to_msa(msa_compl);
        if (msa_compl->ss != NULL) {
            ss_free(msa_compl->ss);
            msa_compl->ss = NULL;
        }
        msa_reverse_compl(msa_compl);
    }

    /* allocate memory for computing scores */
    backgd_emissions = smalloc(backgd_nmods * sizeof(void*));
    for (i = 0; i < backgd_nmods; i++)
        backgd_emissions[i] = smalloc(msa->length * sizeof(double));
    feat_emissions = smalloc(feat_nmods * sizeof(void*));
    for (i = 0; i < feat_nmods; i++)
        feat_emissions[i] = smalloc(msa->length * sizeof(double));
    max_nmods = max(backgd_nmods, feat_nmods);
    dummy_emissions = smalloc(max_nmods * sizeof(void*));
    mem = smalloc(max_nmods * sizeof(void*));
    /* memory for forward algorithm -- each block must be as large as
       the largest feature */
    if (features != NULL) {
        for (i = 0, memblocksize = -1; i < lst_size(features->features); i++) {
            GFF_Feature *f = lst_get_ptr(features->features, i);
            if (f->end - f->start + 1 > memblocksize)
                memblocksize = f->end - f->start + 1;
        }
    }
    else memblocksize = winsize;  /* -1 if base-by-base mode */

    if (memblocksize > 0)
        for (i = 0; i < max_nmods; i++)
            mem[i] = smalloc(memblocksize * sizeof(double));

    if (winsize != -1) {
        winscore_pos = smalloc(msa->length * sizeof(double));
        winscore_neg = smalloc(msa->length * sizeof(double));
        no_alignment = smalloc(msa->length * sizeof(int));

        for (i = 0; i < msa->length; i++) {
            winscore_pos[i] = winscore_neg[i] = NEGINFTY;
            if (refidx == 0)
                no_alignment[i] = FALSE;
            else
                no_alignment[i] = msa_missing_col(msa, refidx, i);
        }
    }

    /* the rest will be repeated for each strand */
    for (strand = 1; strand <= 2; strand++) {
        MSA *thismsa = strand == 1 ? msa : msa_compl;
        double *winscore = strand == 1 ? winscore_pos : winscore_neg;

        if (base_by_base && strand == 2) break; /* don't do second pass in
                                               base_by_base case */

        if (verbose) fprintf(stderr, "Processing %c strand ...\n",
                                 strand == 1 ? '+' : '-');

        /* set up dummy categories array, so that emissions are only
           computed where needed */
        thismsa->categories = smalloc(thismsa->length * sizeof(int));
        thismsa->ncats = 1;
        if (winsize != -1) {
            if (strand == 1)
                for (i = 0; i < thismsa->length; i++)
                    thismsa->categories[i] = no_alignment[i] ? 0 : 1;
            else
                for (i = 0; i < thismsa->length; i++)
                    thismsa->categories[i] = no_alignment[thismsa->length - i - 1] ? 0 : 1;
        }
        else if (features != NULL) {
            for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 0;
            for (i = 0; i < lst_size(features->features); i++) {
                GFF_Feature *f = lst_get_ptr(features->features, i);
                if (f->start <= 0 || f->end <= 0) {
                    fprintf(stderr, "WARNING: feature out of range ('");
                    gff_print_feat(stderr, f);
                    fprintf(stderr, "')\n");
                    continue;
                }

                if (strand == 1 && f->strand != '-')
                    for (j = f->start - 1; j < f->end; j++)
                        thismsa->categories[j] = 1;
                else if (strand == 2 && f->strand == '-')
                    for (j = thismsa->length - f->end;
                            j < thismsa->length - f->start + 1; j++)
                        thismsa->categories[j] = 1;
            }
        }
        else {                      /* base-by-base scores */
            for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 1;
        }
        if (thismsa->ss != NULL) ss_update_categories(thismsa);

        /* compute emissions */
        for (i = 0; i < backgd_nmods; i++) {
            if (verbose)
                fprintf(stderr, "Computing emissions for background model #%d ...\n", i+1);
            tl_compute_log_likelihood(backgd_mods[i], thismsa,
                                      backgd_emissions[i], NULL, 1, NULL);
        }
        for (i = 0; i < feat_nmods; i++) {
            if (verbose)
                fprintf(stderr, "Computing emissions for features model #%d ...\n", i+1);
            tl_compute_log_likelihood(feat_mods[i], thismsa,
                                      feat_emissions[i], NULL, 1, NULL);
        }

        /* now compute scores */
        if (winsize != -1) {        /* windows case */
            int winstart;
            if (verbose) fprintf(stderr, "Computing scores ...\n");

            for (winstart = 0; winstart <= thismsa->length - winsize; winstart++) {
                int centeridx = winstart + winsize/2;

                if (strand == 2) centeridx = thismsa->length - centeridx - 1;

                if (no_alignment[centeridx]) continue;

                for (j = 0; j < feat_nmods; j++)
                    dummy_emissions[j] = &(feat_emissions[j][winstart]);
                winscore[centeridx] = hmm_forward(feat_hmm, dummy_emissions,
                                                  winsize, mem);

                if (winscore[centeridx] <= NEGINFTY) {
                    winscore[centeridx] = NEGINFTY;
                    continue;
                }

                for (j = 0; j < backgd_nmods; j++)
                    dummy_emissions[j] = &(backgd_emissions[j][winstart]);
                winscore[centeridx] -= hmm_forward(backgd_hmm, dummy_emissions,
                                                   winsize, mem);

                if (winscore[centeridx] < NEGINFTY) winscore[centeridx] = NEGINFTY;
            }
        }
        else if (features != NULL) { /* features case */
            if (verbose) fprintf(stderr, "Computing scores ...\n");
            for (i = 0; i < lst_size(features->features); i++) {
                GFF_Feature *f = lst_get_ptr(features->features, i);
                int s, e;

                if ((strand == 1 && f->strand == '-') ||
                        (strand == 2 && f->strand != '-') ||
                        f->start <= 0 || f->end <= 0 || f->end - f->start < 0)
                    continue;

                /* effective coords */
                if (f->strand == '-') {
                    s = thismsa->length - f->end + 1;
                    e = thismsa->length - f->start + 1;
                }
                else {
                    s = f->start;
                    e = f->end;
                }

                f->score_is_null = 0;

                for (j = 0; j < feat_nmods; j++)
                    dummy_emissions[j] = &(feat_emissions[j][s-1]);
                f->score = hmm_forward(feat_hmm, dummy_emissions, e - s + 1, mem);

                if (f->score <= NEGINFTY) {
                    f->score = NEGINFTY;
                    continue;
                }

                for (j = 0; j < backgd_nmods; j++)
                    dummy_emissions[j] = &(backgd_emissions[j][s-1]);
                f->score -= hmm_forward(backgd_hmm, dummy_emissions, e - s + 1, mem);

                if (f->score < NEGINFTY) f->score = NEGINFTY;
            }
        }
    }

    if (verbose) fprintf(stderr, "Generating output ...\n");

    if (winsize != -1 && windowWig == FALSE) { /* standard windows output */
        for (i = 0, j = 0; i < msa->length; i++) {
            if (no_alignment[i] == FALSE)
                printf("%d\t%.3f\t%.3f\n", j + msa->idx_offset + 1, winscore_pos[i],
                       winscore_neg[i]);
            if (ss_get_char_pos(msa, i, 0, 0) != GAP_CHAR) j++;
        }
    }
    else if (windowWig == TRUE) { /* windows with wig output */
        int last = NEGINFTY;
        for (i = 0, j = 0; i < msa->length; i++) {
            if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) {
                if (no_alignment[i] == FALSE && winscore_pos[i] > NEGINFTY) {
                    if (j > last + 1)
                        printf("fixedStep chrom=%s start=%d step=1\n",
                               refidx > 0 ? msa->names[refidx-1] : "alignment",
                               j + msa->idx_offset + 1);
                    printf("%.3f\n", winscore_pos[i]);
                    last = j;
                }
                j++;
            }
        }
    }
    else if (features != NULL) {  /* features output */
        /* return to coord frame of reference seq (also, replace offset) */
        if (refidx != 0)
            msa_map_gff_coords(msa, features, 0, refidx, msa->idx_offset);
        else if (msa->idx_offset != 0) {
            for (i = 0; i < lst_size(features->features); i++) {
                GFF_Feature *f = lst_get_ptr(features->features, i);
                f->start += msa->idx_offset;
                f->end += msa->idx_offset;
            }
        }

        if (bed_output)
            gff_print_bed(stdout, features, FALSE);
        else
            gff_print_set(stdout, features);
    }
    else {           /* base-by-base scores */
        /* in this case, we can just output the difference between the emissions */
        printf("fixedStep chrom=%s start=%d step=1\n",
               refidx > 0 ? msa->names[refidx-1] : "alignment",
               msa->idx_offset + 1);
        for (i = 0, j = 0; i < msa->length; i++) {
            if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) {
                printf("%.3f\n", feat_emissions[0][i] - backgd_emissions[0][i]);
                j++;
            }
        }
    }

    if (verbose) fprintf(stderr, "\nDone.\n");

    return 0;
}
Пример #5
0
int main(int argc, char *argv[]) {
  char c;
  char *msa_fname = NULL;
  int opt_idx, i, old_nnodes;
  MSA *msa;
  List *pruned_names = lst_new_ptr(5), *tmpl;
  BDPhyloHmm *bdphmm;
  GFF_Set *predictions;
  int found = FALSE;
  List *ignore_types = lst_new_ptr(1);

  struct option long_opts[] = {
    {"refseq", 1, 0, 'M'},
    {"msa-format", 1, 0, 'i'},
    {"refidx", 1, 0, 'r'},
    {"rho", 1, 0, 'R'},
    {"phi", 1, 0, 'p'},
    {"transitions", 1, 0, 't'},    
    {"expected-length", 1, 0, 'E'},
    {"target-coverage", 1, 0, 'C'},
    {"seqname", 1, 0, 'N'},
    {"idpref", 1, 0, 'P'},
    {"indel-model", 1, 0, 'I'},
    {"indel-history", 1, 0, 'H'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  /* arguments and defaults for options */
  FILE *refseq_f = NULL, *msa_f = NULL;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  TreeModel *source_mod;
  double rho = DEFAULT_RHO, mu = DEFAULT_MU, nu = DEFAULT_NU, 
    phi = DEFAULT_PHI, gamma = -1, omega = -1, 
    alpha_c = -1, beta_c = -1, tau_c = -1,
    alpha_n = -1, beta_n = -1, tau_n = -1;
  int set_transitions = FALSE, refidx = 1, estim_phi = TRUE, 
    estim_gamma = TRUE, estim_omega = TRUE;
  char *seqname = NULL, *idpref = NULL;
  IndelHistory *ih = NULL;

  while ((c = getopt_long(argc, argv, "R:t:p:E:C:r:M:i:N:P:I:H:h", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'R':
      rho = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 't':
      if (optarg[0] != '~') estim_gamma = estim_omega = FALSE;
      else optarg = &optarg[1];
      set_transitions = TRUE;
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 2) 
        die("ERROR: bad argument to --transitions.\n");
      mu = lst_get_dbl(tmpl, 0);
      nu = lst_get_dbl(tmpl, 1);
      if (mu <= 0 || mu >= 1 || nu <= 0 || nu >= 1)
        die("ERROR: bad argument to --transitions.\n");
      lst_free(tmpl);
      break;
    case 'p':
      if (optarg[0] != '~') estim_phi = FALSE;
      else optarg = &optarg[1];
      phi = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'E':
      if (optarg[0] != '~') estim_omega = FALSE;
      else optarg = &optarg[1];
      omega = get_arg_dbl_bounds(optarg, 1, INFTY);
      mu = 1/omega;
      break;
    case 'C':
      if (optarg[0] != '~') estim_gamma = FALSE;
      else optarg = &optarg[1];
      gamma = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'r':
      refidx = get_arg_int_bounds(optarg, 0, INFTY);
      break;
    case 'M':
      refseq_f = phast_fopen(optarg, "r");
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT)
        die("ERROR: unrecognized alignment format.\n");
      break;
    case 'N':
      seqname = optarg;
      break;
    case 'P':
      idpref = optarg;
      break;
    case 'I':
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 3 && lst_size(tmpl) != 6)
        die("ERROR: bad argument to --indel-model.\n");
      alpha_n = lst_get_dbl(tmpl, 0);
      beta_n = lst_get_dbl(tmpl, 1);
      tau_n = lst_get_dbl(tmpl, 2);
      if (lst_size(tmpl) == 6) {
        alpha_c = lst_get_dbl(tmpl, 3);
        beta_c = lst_get_dbl(tmpl, 4);
        tau_c = lst_get_dbl(tmpl, 5);
      }
      else {
        alpha_c = alpha_n; beta_c = beta_n; tau_c = tau_n;
      }
      if (alpha_c <= 0 || alpha_c >= 1 || beta_c <= 0 || beta_c >= 1 || 
          tau_c <= 0 || tau_c >= 1 || alpha_n <= 0 || alpha_n >= 1 || 
          beta_n <= 0 || beta_n >= 1 || tau_n <= 0 || tau_n >= 1)
        die("ERROR: bad argument to --indel-model.\n");
      break;
    case 'H':
      fprintf(stderr, "Reading indel history from %s...\n", optarg);
      ih = ih_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("Bad argument.  Try 'dless -h'.\n");
    }
  }

  if (optind != argc - 1)
    die("Missing alignment file or model file.  Try 'dless -h'.\n");

  if (set_transitions && (gamma != -1 || omega != -1))
    die("ERROR: --transitions and --target-coverage/--expected-length cannot be used together.\n");

  if ((gamma != -1 && omega == -1) || (gamma == -1 && omega != -1))
    die("ERROR: --target-coverage and --expecteed-length must be used together.\n");

  set_seed(-1);

  if (gamma != -1)
    nu = gamma/(1-gamma) * mu;

  fprintf(stderr, "Reading tree model from %s...\n", argv[optind]);
  source_mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1);

  if (source_mod->nratecats > 1) 
    die("ERROR: rate variation not currently supported.\n");

  if (source_mod->order > 0)
    die("ERROR: only single nucleotide models are currently supported.\n");

  if (!tm_is_reversible(source_mod))
    phast_warning("WARNING: p-value computation assumes reversibility and your model is non-reversible.\n");

  /* read alignment */
  msa_f = phast_fopen(argv[optind], "r");

  fprintf(stderr, "Reading alignment from %s...\n", argv[optind]);
  if (msa_format == UNKNOWN_FORMAT) 
    msa_format = msa_format_for_content(msa_f, 1);

  if (msa_format == MAF) {
    msa = maf_read(msa_f, refseq_f, 1, NULL, NULL, NULL, -1, TRUE, NULL, 
                   NO_STRIP, FALSE); 
  }
  else 
    msa = msa_new_from_file_define_format(msa_f, msa_format, NULL);

  if (msa_alph_has_lowercase(msa)) msa_toupper(msa); 
  msa_remove_N_from_alph(msa);

  if (msa->ss == NULL) {
    fprintf(stderr, "Extracting sufficient statistics...\n");
    ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0);
  }
  else if (msa->ss->tuple_idx == NULL)
    die("ERROR: ordered representation of alignment required unless --suff-stats.\n");

  /* prune tree, if necessary */
  old_nnodes = source_mod->tree->nnodes;
  tm_prune(source_mod, msa, pruned_names);

  if (lst_size(pruned_names) == (old_nnodes + 1) / 2)
    die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n");
  if (lst_size(pruned_names) > 0) {
    fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment (");
    for (i = 0; i < lst_size(pruned_names); i++)
      fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars, 
              i < lst_size(pruned_names) - 1 ? ", " : ").\n");
  }

  /* this has to be done after pruning tree */
  tr_name_ancestors(source_mod->tree);

  /* also make sure match for reference sequence in tree */
  if (refidx > 0) {
    for (i = 0, found = FALSE; !found && i < source_mod->tree->nnodes; i++) {
      TreeNode *n = lst_get_ptr(source_mod->tree->nodes, i);
      if (!strcmp(n->name, msa->names[refidx-1]))
        found = TRUE;
    }
    if (!found) die("ERROR: no match for reference sequence in tree.\n");
  }

  /* checks for indel model */
  if (alpha_c > 0) {
    if (ih == NULL) {
      fprintf(stderr, "Reconstructing indel history by parsimony...\n");
      ih = ih_reconstruct(msa, source_mod->tree);
    }
    else {
      if (ih->ncols != msa->length)
        die("ERROR: indel history doesn't seem to match alignment.\n");
      if (ih->tree->nnodes != source_mod->tree->nnodes)
        die("ERROR: indel history doesn't seem to match tree model.\n");
    }
  }

  bdphmm = bd_new(source_mod, rho, mu, nu, phi, alpha_c, beta_c, tau_c, 
                  alpha_n, beta_n, tau_n, estim_gamma, estim_omega, 
                  estim_phi);

  /* compute emissions */
  phmm_compute_emissions(bdphmm->phmm, msa, FALSE);

  /* add emissions for indel model, if necessary */
  if (alpha_c > 0) {
    fprintf(stderr, "Adjusting emissions for indels...\n");
    bd_add_indel_emissions(bdphmm, ih);
  }

  /* postprocess for missing data (requires special handling) */
  fprintf(stderr, "Adjusting emissions for missing data...\n");
  bd_handle_missing_data(bdphmm, msa);

  if (estim_gamma || estim_omega || estim_phi) {
    fprintf(stderr, "Estimating free parameters...\n");
    bd_estimate_transitions(bdphmm, msa);
  }

  /* set seqname and idpref, if necessary */
  if (seqname == NULL || idpref == NULL) {
    /* derive default from file name root */
    String *tmp = str_new_charstr(msa_fname);
    if (!str_equals_charstr(tmp, "-")) {
      str_remove_path(tmp);
      str_root(tmp, '.');
      if (idpref == NULL) idpref = copy_charstr(tmp->chars);
      str_root(tmp, '.');         /* apply one more time for double suffix */
      if (seqname == NULL) seqname = tmp->chars;    
    }
    else if (seqname == NULL) seqname = "refseq";
  }

  /* obtain predictions */
  fprintf(stderr, "Running Viterbi algorithm...\n");
  predictions = phmm_predict_viterbi(bdphmm->phmm, seqname, NULL, idpref, NULL);
  lst_push_ptr(ignore_types, str_new_charstr("nonconserved"));
  gff_filter_by_type(predictions, ignore_types, TRUE, NULL);

  /* score predictions */
  fprintf(stderr, "Scoring predictions...\n");
  bd_score_predictions(bdphmm, predictions);
  
  /* can free emissions now */
  for (i = 0; i < bdphmm->phmm->hmm->nstates; i++)
    sfree(bdphmm->phmm->emissions[i]);
  sfree(bdphmm->phmm->emissions);
  bdphmm->phmm->emissions = NULL;

  /* convert GFF to coord frame of reference sequence and adjust
     coords by idx_offset, if necessary  */
  if (refidx != 0 || msa->idx_offset != 0)
    msa_map_gff_coords(msa, predictions, 0, refidx, msa->idx_offset);

  if (refidx != 0) 
    gff_flatten(predictions);	
  /* necessary because coord conversion might create overlapping
     features (can happen in deletions in reference sequence) */

  /* now output predictions */
  fprintf(stderr, "Writing GFF to stdout...\n");
  gff_print_set(stdout, predictions);

  fprintf(stderr, "Done.\n");
  
  return 0;
}