示例#1
0
文件: rph_gff.c 项目: cran/rphast
SEXP rph_gff_print(SEXP filename, SEXP gff) {
  FILE *outfile;
  if (filename == R_NilValue)
    outfile = stdout;
  else outfile = phast_fopen(CHARACTER_VALUE(filename), "w");

  gff_print_set(outfile, (GFF_Set*)EXTPTR_PTR(gff));
  if (outfile != stdout) phast_fclose(outfile);
  return R_NilValue;
}
int main(int argc, char* argv[]) {
  FILE* F;
  MSA *msa;
  msa_format_type format = UNKNOWN_FORMAT;
  int src_ref = -1, dest_ref = 0, offset = 0;
  char *msa_fname = NULL, *feat_fname = NULL;
  GFF_Set *gff;
  char c;

  while ((c = (char)getopt(argc, argv, "hm:f:s:d:i:p:n:")) != -1) {
    switch(c) {
    case 'm':
      msa_fname = optarg;
      break;
    case 'f':
      feat_fname = optarg;
      break;
    case 's':
      src_ref = get_arg_int(optarg);
      break;
    case 'd':
      dest_ref = get_arg_int(optarg);
      break;
    case 'i':
      format = msa_str_to_format(optarg);
      if (format == UNKNOWN_FORMAT) die("ERROR: bad alignment format.\n");
      break;
    case 'p':
      offset = get_arg_int(optarg);
      break;
    case 'n':
      offset = -1 * get_arg_int(optarg);
      break;
    case 'h':
      print_usage();
      exit(1);
    case '?':
      print_usage();
      exit(1);
    }
  }

  if (msa_fname == NULL || feat_fname == NULL) {
    print_usage();
    exit(1);
  }

  set_seed(-1);

  F = phast_fopen(feat_fname, "r");
  if ((gff = gff_read_set(F)) == NULL) { 
    die("ERROR: error reading %s.\n", feat_fname);
  }
  phast_fclose(F);

  /* handle case of local alignment specially -- avoid representing
     the alignment explicitly */
  F = phast_fopen(msa_fname, "r");
  if (format == UNKNOWN_FORMAT)
    format = msa_format_for_content(F, 1);
  if (format == LAV) {
    LocalPwAlignment *lpwa = NULL;
/*     int i; */

    fprintf(stderr, "WARNING: in local alignment mode, coordinates may only be mapped from query (reference) sequence to target (aligned) sequence.\n"); 

    lpwa = la_read_lav(F, 0);
    la_gff_transform(lpwa, gff);
/*     for (i = 0; i < lst_size(gff->features); i++) { */
/*       GFF_Feature *feat = lst_get_ptr(gff->features, i); */
/*       feat->start = la_get_target_coord(lpwa, feat->start); */
/*       feat->end = la_get_target_coord(lpwa, feat->end); */
/*     } */
  }

  else {                        /* normal alignment */
    msa = msa_new_from_file_define_format(F, format, NULL);
    phast_fclose(F);

    msa_map_gff_coords(msa, gff, src_ref, dest_ref, offset);
    msa_free(msa);
  }

  gff_print_set(stdout, gff);

  gff_free_set(gff);

  return 0;
}
int main(int argc, char *argv[]) {
  char c;
  int opt_idx;
  GFF_Set *gff;
  List *include = NULL;
  char *groupby = "transcript_id", *exongroup_tag = NULL;
  int unique = FALSE, sort = FALSE, simplebed = FALSE, fix_start_stop = FALSE,
    add_utrs = FALSE, add_introns = FALSE, add_signals = FALSE;
  enum {GFF, BED, GENEPRED, WIG} output_format = GFF;
  FILE *discards_f = NULL, *groups_f = NULL;

  struct option long_opts[] = {
    {"output", 1, 0, 'o'},
    {"include-only", 1, 0, 'i'},
    {"include-groups", 1, 0, 'l'},
    {"groupby", 1, 0, 'g'},
    {"exongroup", 1, 0, 'e'},
    {"add-utrs", 0, 0, 'U'},
    {"add-introns", 0, 0, 'I'},
    {"add-signals", 0, 0, 'S'},
    {"fix-start-stop", 0, 0, 'f'},
    {"unique", 0, 0, 'u'},
    {"sort", 0, 0, 's'},
    {"simplebed", 0, 0, 'b'},
    {"discards", 1, 0, 'd'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = (char)getopt_long(argc, argv, "o:i:l:g:e:d:UISfusbh", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'o':
      if (!strcmp("bed", optarg)) output_format = BED;
      else if (!strcmp("genepred", optarg)) output_format = GENEPRED;
      else if (!strcmp("wig", optarg)) output_format = WIG;
      else if (strcmp("gff", optarg)) die("ERROR: bad output format.\n");
      break;
    case 'i':
      include = get_arg_list(optarg);
      break;
    case 'l':
      groups_f = phast_fopen(optarg, "r");
      break;
    case 'g':
      groupby = optarg;
      break;
    case 'e':
      exongroup_tag = optarg;
      break;
    case 'U':
      add_utrs = TRUE;
      break;
    case 'I':
      add_introns = TRUE;
      break;
    case 'S':
      add_signals = TRUE;
      break;
    case 'f':
      fix_start_stop = TRUE;
      break;
    case 'u':
      unique = TRUE;
      break;
    case 'b':
      simplebed = TRUE;
      output_format = BED;
      break;
    case 'd':
      discards_f = phast_fopen(optarg, "w+");
      break;
    case 's':
      sort = TRUE;
      break;
    case 'h':
      usage(argv[0]);
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  if (optind != argc - 1) 
    die("Input filename required.  Try '%s -h'.\n", argv[0]);

  set_seed(-1);

  gff = gff_read_set(phast_fopen(argv[optind], "r"));

  if (lst_size(gff->features) == 0) exit(0); /* helps avoid unexpected
                                                behavior below */

  /* filter by type */
  if (include != NULL) gff_filter_by_type(gff, include, FALSE, discards_f);

  /* group */
  gff_group(gff, groupby);

  /* utrs, introns, & signals */
  if (add_utrs) gff_create_utrs(gff);
  if (add_introns) gff_create_introns(gff);
  if (add_signals) gff_create_signals(gff);

  /* subgroup */
  if (exongroup_tag != NULL) gff_exon_group(gff, exongroup_tag);

  /* filter by group */
  if (groups_f != NULL) {
    String *s = str_new(STR_LONG_LEN);
    List *groups = lst_new_ptr(10000);
    str_slurp(s, groups_f);
    str_split(s, NULL, groups);
    gff_filter_by_group(gff, groups);
    lst_free_strings(groups); lst_free(groups);
    str_free(s);
  }

  /* sort */
  if (sort) gff_sort(gff);

  /* make unique */
  if (unique) gff_remove_overlaps(gff, discards_f);
  
  if (fix_start_stop) gff_fix_start_stop(gff);

  if (output_format == BED)
    gff_print_bed(stdout, gff, !simplebed);
  else if (output_format == GENEPRED)
    gff_print_genepred(stdout, gff);
  else if (output_format == WIG)
    wig_print(stdout, gff);
  else 
    gff_print_set(stdout, gff);
  gff_free_set(gff);
  
  return 0;
}
int main(int argc, char *argv[]) {

  int check_start = 0, check_stop = 0, check_splice = 0, check_nonsense = 0,
    offset5 = 0, offset3 = 0, opt_idx, i, j, indel_strict = 0, no_output = 0,
    check_alignment = 0, splice_strict = 0;
  int ncons_tested, nkept, nconserved_exons;
  int nce_gap_type[NGAP_TYPES], nconsid[NTYPES], nfail[NTYPES];
  double Nfrac = 0.05;
  char c;
  MSA *msa;
  GFF_Set *gff;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  List *keepers, *problems = lst_new_ptr(10), 
    *ends_adjusted = lst_new_ptr(1), *starts_adjusted = lst_new_ptr(1), 
    *discards=NULL, *intron_splice = lst_new_ptr(10);
  char *rseq_fname = NULL;
  FILE *logf = NULL, *mlogf = NULL, *statsf = NULL, *discardf = NULL;
  cds_gap_type fshift_mode = FSHIFT_BAD;
  char *groupby = "transcript_id";
  msa_coord_map *map;
  int *countNs, *countCDSs;
  FILE *infile;
  char *msa_fname;

  struct option long_opts[] = {
    {"start", 0, 0, 's'},
    {"stop", 0, 0, 't'},
    {"splice", 0, 0, 'l'},
    {"nonsense", 0, 0, 'n'},
    {"fshift", 0, 0, 'f'},
    {"conserved", 0, 0, 'c'},
    {"N-limit", 1, 0, 'N'},
    {"clean-gaps", 0, 0, 'e'},
    {"indel-strict", 0, 0, 'I'},
    {"splice-strict", 0, 0, 'C'},
    {"groupby", 1, 0, 'g'},
    {"msa-format", 1, 0, 'i'},
    {"refseq", 1, 0, 'r'},
    {"offset5", 1, 0, 'o'},
    {"offset3", 1, 0, 'p'},
    {"no-output", 0, 0, 'x'},
    {"discards", 1, 0, 'd'},
    {"log", 1, 0, 'L'},
    {"machine-log", 1, 0, 'M'},
    {"stats", 1, 0, 'S'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = (char)getopt_long(argc, argv, "N:i:r:L:M:S:g:d:stlnfceICxh", 
                          long_opts, &opt_idx)) != -1) {
    switch(c) {
    case 's':
      check_alignment = check_start = 1;
      break;
    case 't':
      check_alignment = check_stop = 1;
      break;
    case 'l':
      check_alignment = check_splice = 1;
      break;
    case 'n':
      check_alignment = check_nonsense = 1;
      break;
    case 'f':
      check_alignment = 1;
      fshift_mode = FSHIFT_OK;
      break;
    case 'c':
      check_alignment = check_start = check_stop = check_splice = check_nonsense = 1;
      if (fshift_mode < FSHIFT_OK) fshift_mode = FSHIFT_OK;
      break;
    case 'N':
      Nfrac = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'e':
      check_alignment = 1;
      if (fshift_mode < CLN_GAPS) fshift_mode = CLN_GAPS;
      break;
    case 'I':
      check_alignment = 1;
      fshift_mode = NOVRLP_CLN_GAPS;
      indel_strict = 1;
      break;
    case 'C':
      check_alignment = check_splice = splice_strict = 1;
      break;
    case 'g':
      groupby = optarg;
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT) die("Bad alignment format.\n");
      break;
    case 'r':
      rseq_fname = optarg;
      break;
    case 'o':
      offset5 = get_arg_int(optarg);
      break;
    case 'p':
      offset3 = get_arg_int(optarg);
      break;
    case 'L':
      logf = phast_fopen(optarg, "w+");
      break;
    case 'M':
      mlogf = phast_fopen(optarg, "w+");
      break;
    case 'S':
      statsf = phast_fopen(optarg, "w+");
      break;
    case 'd':
      discardf = phast_fopen(optarg, "w+");
      break;
    case 'x':
      no_output = 1;
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("ERROR: Bad argument.  Try the --help option.\n");
    }
  }

  if (optind + 1 >= argc ) {
    die("ERROR:  Missing required arguments.  Try the --help option.\n");
  }
  
  set_seed(-1);

  gff = gff_read_set(phast_fopen(argv[optind], "r"));
  msa_fname = argv[optind+1];
  infile = phast_fopen(msa_fname, "r");
  if (msa_format == UNKNOWN_FORMAT)
    msa_format = msa_format_for_content(infile, 1);
  if (msa_format == MAF) {
    msa = maf_read(infile, 
                   rseq_fname == NULL ? NULL : phast_fopen(rseq_fname, "r"), 
                   1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); 
  }
  else {
    msa = msa_new_from_file_define_format(infile,
                            msa_format, NULL); 
    if (msa->ss == NULL) 
      ss_from_msas(msa, 1, 1, NULL, NULL, NULL, -1, 0);
  }
  if (!msa->ss->tuple_idx)
    die("ERROR: need ordered tuples\n");
  msa_remove_N_from_alph(msa);  /* for backward compatibility (old SS files) */

  if (msa->idx_offset != 0) {   /* avoids offset problem */
    for (i = 0; i < lst_size(gff->features); i++) {
      GFF_Feature *f = lst_get_ptr(gff->features, i);
      f->start -= msa->idx_offset;
      f->end -= msa->idx_offset;
    }
  }

  /* set up coordinate map; assume GFF is for sequence 1 */
  map = msa_build_coord_map(msa, 1);

  /* convert all features */
  for (i = 0; i < lst_size(gff->features); i++) {
    GFF_Feature *f = lst_get_ptr(gff->features, i);
    int newstart, newend;
 
    if (f->start < 0 || f->end < f->start)
      die("ERROR: bad feature in GFF (start=%d, end=%d).\n",
          f->start, f->end);

    newstart = msa_map_seq_to_msa(map, f->start);
    newend = msa_map_seq_to_msa(map, f->end);

    if (newstart < 0 || newend < newstart)
      die("ERROR: unable to map coordinates for feature (start=%d, end=%d).\n",
          f->start, f->end);

    f->start = newstart;
    f->end = newend;
  }

  gff_group(gff, groupby);	/* do this after coord conversion, or
                               group coords and feature coords
                               will be out of sync */

  keepers = lst_new_ptr(lst_size(gff->features));
  if (discardf != NULL) discards = lst_new_ptr(lst_size(gff->features));

  ncons_tested = nkept = nconserved_exons = 0;
  for (i = 0; i < NTYPES; i++) nconsid[i] = 0;
  for (i = 0; i < NTYPES; i++) nfail[i] = 0;
  for (i = 0; i < NGAP_TYPES; i++) nce_gap_type[i] = 0;  

  countNs = smalloc(msa->nseqs * sizeof(int));
  countCDSs = smalloc(msa->nseqs * sizeof(int));

  for (i = 0; i < lst_size(gff->groups); i++) {
    GFF_FeatureGroup *group = lst_get_ptr(gff->groups, i);
    List *gfeatures = group->features;
    GFF_Feature *feat;
    status_type status = OKAY;
    cds_gap_type gt = FSHIFT_BAD;
    problems_clear(problems);

    /* make sure have frame info for CDSs */
    for (j = 0; j < lst_size(gfeatures); j++) {
      feat = lst_get_ptr(gfeatures, j);
      if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && 
          feat->frame == GFF_NULL_FRAME)
        die("ERROR: Missing frame info for CDS.\n");
    }

    /* First, exclude stop codons from cds's, if necessary (simplifies
       the detection of nonsense mutations). */
    exclude_stops(group, starts_adjusted, ends_adjusted);

    /* In all cases, discard any group for which the reference sequence
       doesn't have valid splice sites or start/stop codons, or has a
       premature stop codon */
    if (!ref_seq_okay(gfeatures, msa, offset3, indel_strict, splice_strict,
                      problems)) {
      status = BAD_REF;
      nfail[BAD_REF]++;
    }
    else
      /* Everything else counts as a potentially valid group */
      ncons_tested++;

    if (status == OKAY && check_alignment) {      
                                /* only bother with below if
                                   interested in cross-species
                                   conservation */

      /* Check first to make sure there's alignment across species in
         the cds; if not, there's no need to look at individual
         features. */
      for (j = 0; j < lst_size(gfeatures); j++) { 
        feat = lst_get_ptr(gfeatures, j);
        if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) &&
            is_incomplete_alignment(feat, msa)) {
          status = NO_ALN;
          nfail[NO_ALN]++;
          problem_add(problems, feat, NO_ALN, -1, -1);
          break;
        }
      }

      if (status == OKAY) {     /* we have alignment and agreement
                                   with the ref seq; now check feature
                                   by feature  */

        lst_clear(intron_splice);
        for (j = 0; j < msa->nseqs; j++) countNs[j] = countCDSs[j] = 0;

        for (j = 0; j < lst_size(gfeatures); j++) {
          feat = lst_get_ptr(gfeatures, j);

          if (feat->end - 1 >= msa->length) 
            die("ERROR: feature extends beyond alignment (%d >= %d).\n",
                feat->end - 1, msa->length);
        
          if (check_start && str_equals_charstr(feat->feature, GFF_START_TYPE)) {

            nconsid[BAD_START]++;

            if (!is_conserved_start(feat, msa)) {
              status = BAD_START;
              problem_add(problems, feat, BAD_START, -1, -1);
            }
          }

          else if (check_stop && str_equals_charstr(feat->feature, GFF_STOP_TYPE)) {

            nconsid[BAD_STOP]++;

            if (!is_conserved_stop(feat, msa)) {
              status = BAD_STOP;
              problem_add(problems, feat, BAD_STOP, -1, -1);
            }
          }

          else if (check_splice && 
                   str_equals_charstr(feat->feature, SPLICE_5)) {

            nconsid[BAD_5_SPLICE]++;

            if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) {
              status = BAD_5_SPLICE;
              problem_add(problems, feat, BAD_5_SPLICE, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (check_splice && 
                   str_equals_charstr(feat->feature, SPLICE_5_UTR)) {

            nconsid[BAD_5_SPLICE_UTR]++;

            if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) {
              status = BAD_5_SPLICE_UTR;
              problem_add(problems, feat, BAD_5_SPLICE_UTR, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (check_splice && str_equals_charstr(feat->feature, SPLICE_3)) {


            nconsid[BAD_3_SPLICE]++;

            if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) {
              status = BAD_3_SPLICE;
              problem_add(problems, feat, BAD_3_SPLICE, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (check_splice && str_equals_charstr(feat->feature, SPLICE_3)) {

            nconsid[BAD_3_SPLICE_UTR]++;

            if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) {
              status = BAD_3_SPLICE_UTR;
              problem_add(problems, feat, BAD_3_SPLICE_UTR, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) {
 
            if (fshift_mode > FSHIFT_BAD 
		&& (gt = get_cds_gap_type(feat, msa, problems)) < fshift_mode) {
              if (status == OKAY || status == NONSENSE) status = FSHIFT;
            }

            if (check_nonsense && !is_nonsense_clean(feat, msa, problems)) {
              if (status == OKAY) status = NONSENSE;
            }

            if (Nfrac < 1) 
              get_N_counts(countNs, countCDSs, feat, msa);
          }
        } /* end loop through features in group */

        /* still have to make sure splice sites are paired correctly
           (GT-AG, GC-AG, AT-AC) */
        if (status == OKAY && !splice_strict && lst_size(intron_splice) >= 2 &&
            !are_introns_okay(intron_splice, msa, problems, offset5, offset3)) 
          status = BAD_INTRON;

        /* also check fraction of Ns */
        if (Nfrac < 1) {
          enum {MY_OKAY, MY_FAIL, MY_WARN} Nstatus = MY_OKAY;
          for (j = 0; j < msa->nseqs; j++) {
            if ((double)countNs[j] / countCDSs[j] > Nfrac) Nstatus = MY_FAIL;
            if (Nstatus == MY_OKAY && countNs[j] > 0) Nstatus = MY_WARN;
          }
          if (Nstatus == MY_FAIL) {
            problem_add(problems, NULL, TOO_MANY_Ns, -1, -1);
            if (status == OKAY) status = TOO_MANY_Ns;
          }
          else if (Nstatus == MY_WARN) 
            problem_add(problems, NULL, WARN_Ns, -1, -1);
        }

        /* if collecting stats, record counts for failures */
        if (statsf != NULL) {
          if (status != OKAY) {
            for (j = 0; j < lst_size(problems); j++) {
              struct Problem *problem = lst_get_ptr(problems, j);
              status_type ftype = problem->status;
              if ((ftype == FSHIFT || ftype == NONSENSE) && 
                  status != FSHIFT && status != NONSENSE)
                continue;       /* don't count secondary frame shifts
                                   and nonsense mutations */ 

              if (ftype == BAD_INTRON && j % 2 == 0)
                continue;       /* only count one of every pair of these */

              nfail[ftype]++;
            }
          }

          /* also keep track of the total number of "conserved exons", and
             the number having each kind of gap */
          if ((status == OKAY || (status == FSHIFT && gt >= FSHIFT_OK))) {
            nconserved_exons++;
            nce_gap_type[gt]++;     /* number of conserved exons having
                                       given type of gaps */
          }
        }
      } /* end if (status == OKAY) [checks for conserved features] */
    } /* end if (status == OKAY && check_alignment) [all cross-species
         checks] */

    /* now we have looked at the whole group; we just need to do some
       final accounting and logging */

    if (status == OKAY) {
      nkept++;
      if (!no_output) {
        restore_stops(group, starts_adjusted, ends_adjusted);
        for (j = 0; j < lst_size(gfeatures); j++)
          lst_push_ptr(keepers, lst_get_ptr(gfeatures, j));
      }
      if (logf != NULL && lst_size(problems) > 0) /* warnings only */
        write_log(logf, group, status, problems, msa, map);
      if (mlogf != NULL) {
        /* no problem, need to add an okay status to log */
        problem_add(problems, NULL, OKAY, -1, -1);
        write_machine_log(mlogf, group, problems, map); /* may include
                                                           warnings */
      }
    }
    else {
      if (discardf != NULL) {
        restore_stops(group, starts_adjusted, ends_adjusted);
        for (j = 0; j < lst_size(gfeatures); j++) 
          lst_push_ptr(discards, lst_get_ptr(gfeatures, j));
      }
      if (logf != NULL) 
        write_log(logf, group, status, problems, msa, map);
      if (mlogf != NULL)
        write_machine_log(mlogf, group, problems, map);
    }
  } /* end loop over groups */

  /* write main output and discards */
  if (!no_output || discardf != NULL) {
    /* first map features back to coord frame of reference seq. */
    for (i = 0; i < lst_size(gff->features); i++) {
      GFF_Feature *f = lst_get_ptr(gff->features, i);
      f->start = msa_map_msa_to_seq(map, f->start) + msa->idx_offset;
      f->end = msa_map_msa_to_seq(map, f->end) + msa->idx_offset;
    }

    if (!no_output) {
      gff->features = keepers;
      gff_print_set(stdout, gff);
    }

    if (discardf != NULL) {
      gff->features = discards;
      gff_print_set(discardf, gff);
    }
  }


  /* dump counts to stats file */
  if (statsf != NULL) {
    fprintf(statsf, "#%11s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", 
            "total", "nbad_ref", "nconsid", "nkept", "nno_aln", 
            "nbad_starts", "(out of)", "nbad_stops", "(out of)", 
            "nbad_5spl", "(out of)", "nbad_3spl", "(out of)", 
            "nbad_5utr", "(out of)", "nbad_3utr", "(out of)", 
            "nbad_intron", "nnons", "nfshifts", "nNs", "ncons_exons", 
            "nce_ngaps", "nce_nov_cln", "nce_clean", "nce_fshftok");
    fprintf(statsf, "%12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d\n", 
            nfail[BAD_REF]+ncons_tested, nfail[BAD_REF], ncons_tested, nkept, 
            nfail[NO_ALN], nfail[BAD_START], nconsid[BAD_START], 
            nfail[BAD_STOP], nconsid[BAD_STOP], nfail[BAD_5_SPLICE], 
            nconsid[BAD_5_SPLICE], nfail[BAD_3_SPLICE], nconsid[BAD_3_SPLICE],
            nfail[BAD_5_SPLICE_UTR], nconsid[BAD_5_SPLICE_UTR],
            nfail[BAD_3_SPLICE_UTR], nconsid[BAD_3_SPLICE_UTR], 
            nfail[BAD_INTRON], nfail[NONSENSE], nfail[FSHIFT], 
            nfail[TOO_MANY_Ns], nconserved_exons, nce_gap_type[NGAPS], 
            nce_gap_type[NOVRLP_CLN_GAPS], nce_gap_type[CLN_GAPS], 
            nce_gap_type[FSHIFT_OK]);
    fprintf(statsf, "%s", STATS_DESCRIPTION);
  }

  if (logf != NULL) phast_fclose(logf);
  if (mlogf != NULL) phast_fclose(mlogf);
  if (statsf != NULL) phast_fclose(statsf);
  if (discardf != NULL) phast_fclose(discardf);

  return 0;
}
示例#5
0
int main(int argc, char *argv[]) {
    char c;
    List *l;
    int i, j, strand, bed_output = 0, backgd_nmods = -1, feat_nmods = -1,
                      winsize = -1, verbose = 0, max_nmods, memblocksize, old_nleaves,
                      refidx = 1, base_by_base = FALSE, windowWig = FALSE;
    TreeModel **backgd_mods = NULL, **feat_mods = NULL;
    HMM *backgd_hmm = NULL, *feat_hmm = NULL;
    msa_format_type inform = UNKNOWN_FORMAT;
    GFF_Set *features = NULL;
    MSA *msa, *msa_compl=NULL;
    double **backgd_emissions, **feat_emissions, **mem, **dummy_emissions,
           *winscore_pos=NULL, *winscore_neg=NULL;
    int *no_alignment=NULL;
    List *pruned_names;
    char *msa_fname;
    FILE *infile;

    int opt_idx;
    struct option long_opts[] = {
        {"background-mods", 1, 0, 'b'},
        {"background-hmm", 1, 0, 'B'},
        {"feature-mods", 1, 0, 'f'},
        {"feature-hmm", 1, 0, 'F'},
        {"features", 1, 0, 'g'},
        {"window", 1, 0, 'w'},
        {"window-wig", 1, 0, 'W'},
        {"base-by-base", 0, 0, 'y'},
        {"msa-format", 1, 0, 'i'},
        {"refidx", 1, 0, 'r'},
        {"output-bed", 0, 0, 'd'},
        {"verbose", 0, 0, 'v'},
        {"help", 0, 0, 'h'},
        {0, 0, 0, 0}
    };

    while ((c = getopt_long(argc, argv, "B:b:F:f:r:g:w:W:i:ydvh", long_opts, &opt_idx)) != -1) {
        switch (c) {
        case 'B':
            backgd_hmm = hmm_new_from_file(phast_fopen(optarg, "r"));
            break;
        case 'b':
            l = get_arg_list(optarg);
            backgd_nmods = lst_size(l);
            backgd_mods = smalloc(backgd_nmods * sizeof(void*));
            for (i = 0; i < backgd_nmods; i++)
                backgd_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1);
            lst_free_strings(l);
            lst_free(l);
            break;
        case 'F':
            feat_hmm = hmm_new_from_file(phast_fopen(optarg, "r"));
            break;
        case 'f':
            l = get_arg_list(optarg);
            feat_nmods = lst_size(l);
            feat_mods = smalloc(feat_nmods * sizeof(void*));
            for (i = 0; i < feat_nmods; i++)
                feat_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1);
            lst_free_strings(l);
            lst_free(l);
            break;
        case 'g':
            features = gff_read_set(phast_fopen(optarg, "r"));
            break;
        case 'w':
            winsize = get_arg_int(optarg);
            if (winsize <= 0) die("ERROR: window size must be positive.\n");
            break;
        case 'W':
            winsize = get_arg_int(optarg);
            if (winsize <= 0) die("ERROR: window size must be positive.\n");
            windowWig = TRUE;
            break;
        case 'y':
            base_by_base = TRUE;
            break;
        case 'i':
            inform = msa_str_to_format(optarg);
            if (inform == UNKNOWN_FORMAT) die("Bad argument to -i.\n");
            break;
        case 'r':
            refidx = get_arg_int_bounds(optarg, 0, INFTY);
            break;
        case 'd':
            bed_output = 1;
            break;
        case 'h':
            printf("%s", HELP);
            exit(0);
        case 'v':
            verbose = 1;
            break;
        case '?':
            die("Bad argument.  Try '%s -h'.\n", argv[0]);
        }
    }

    set_seed(-1);

    if (backgd_mods == NULL || feat_mods == NULL)
        die("ERROR: -b and -f required.  Try '%s -h'.\n", argv[0]);

    if (backgd_nmods == 1 && backgd_hmm == NULL)
        backgd_hmm = hmm_create_trivial();
    else if (backgd_hmm == NULL)
        die("ERROR: -B required.  Try '%s -h'.\n", argv[0]);

    if (feat_nmods == 1 && feat_hmm == NULL)
        feat_hmm = hmm_create_trivial();
    else if (feat_hmm == NULL)
        die("ERROR: -F required.  Try '%s -h'.\n", argv[0]);

    if ((winsize == -1 && features == NULL && !base_by_base) ||
            (winsize != -1 && features != NULL) ||
            (winsize != -1 && base_by_base) ||
            (features != NULL && base_by_base))
        die("ERROR: must specify exactly one of -g, -w, and -y.  Try '%s -h'.\n", argv[0]);

    if (backgd_hmm->nstates != backgd_nmods)
        die("ERROR: number of states must equal number of tree models for background.\n");

    if (feat_hmm->nstates != feat_nmods)
        die("ERROR: number of states must equal number of tree models for features.\n");

    if (features != NULL && lst_size(features->features) == 0)
        die("ERROR: empty features file.\n");

    if (base_by_base && (backgd_nmods > 1 || feat_nmods > 1))
        die("ERROR: only single phylogenetic models (not HMMs) are supported with --base-by-base.\n");

    if (optind != argc - 1)
        die("ERROR: too few arguments.  Try '%s -h'.\n", argv[0]);

    if (verbose) fprintf(stderr, "Reading alignment ...\n");
    msa_fname = argv[optind];
    infile = phast_fopen(msa_fname, "r");
    if (inform == UNKNOWN_FORMAT)
        inform = msa_format_for_content(infile, 1);
    if (inform == MAF)
        msa = maf_read(infile, NULL, 1, NULL, NULL,
                       NULL, -1, TRUE, NULL, NO_STRIP, FALSE);
    else
        msa = msa_new_from_file_define_format(infile, inform, NULL);
    if (msa_alph_has_lowercase(msa)) msa_toupper(msa);
    msa_remove_N_from_alph(msa);

    /* need ordered representation of alignment */
    if (msa->seqs == NULL && (msa->ss == NULL || msa->ss->tuple_idx == NULL) )
        die("ERROR: ordered sufficient statistics are required.\n");

    pruned_names = lst_new_ptr(msa->nseqs);
    for (i = 0; i < backgd_nmods; i++) {
        old_nleaves = (backgd_mods[i]->tree->nnodes + 1) / 2;
        tm_prune(backgd_mods[i], msa, pruned_names);
        if (lst_size(pruned_names) >= old_nleaves)
            die("ERROR: no match for leaves of tree in alignment (background model #%d)\n", i+1);
        else if (lst_size(pruned_names) > 0) {
            fprintf(stderr, "WARNING: pruned away leaves in background model (#%d) with no match in alignment (", i+1);
            for (j = 0; j < lst_size(pruned_names); j++)
                fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars,
                        j < lst_size(pruned_names) - 1 ? ", " : ").\n");
        }
        lst_free_strings(pruned_names);
    }
    for (i = 0; i < feat_nmods; i++) {
        old_nleaves = (feat_mods[i]->tree->nnodes + 1) / 2;
        tm_prune(feat_mods[i], msa, pruned_names);
        if (lst_size(pruned_names) >= old_nleaves)
            die("ERROR: no match for leaves of tree in alignment (features model #%d)\n", i+1);
        else if (lst_size(pruned_names) > 0) {
            fprintf(stderr, "WARNING: pruned away leaves in features model (#%d) with no match in alignment (", i+1);
            for (j = 0; j < lst_size(pruned_names); j++)
                fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars,
                        j < lst_size(pruned_names) - 1 ? ", " : ").\n");
        }
        lst_free_strings(pruned_names);
    }
    lst_free(pruned_names);

    /* first have to subtract offset from features, if necessary */
    if (msa->idx_offset != 0 && features != NULL) {
        for (i = 0; i < lst_size(features->features); i++) {
            GFF_Feature *f = lst_get_ptr(features->features, i);
            f->start -= msa->idx_offset;
            f->end -= msa->idx_offset;
        }
    }

    /* convert to coord frame of alignment */
    if (features != NULL && refidx != 0) {
        if (verbose) fprintf(stderr, "Mapping coordinates ...\n");
        msa_map_gff_coords(msa, features, refidx, 0, 0);
        if (lst_size(features->features) == 0)
            die("ERROR: no features within coordinate range of alignment.\n");
    }

    /* Make a reverse complemented copy of the alignment.  The two
       strands will be processed separately, to avoid problems with
       overlapping features, etc. */
    if (!base_by_base) {          /* skip in base by base case */
        if (verbose) fprintf(stderr, "Creating reverse complemented alignment ...\n");
        msa_compl = msa_create_copy(msa, 0);
        /* temporary workaround: make sure reverse complement not based on
           sufficient stats */
        if (msa_compl->seqs == NULL) ss_to_msa(msa_compl);
        if (msa_compl->ss != NULL) {
            ss_free(msa_compl->ss);
            msa_compl->ss = NULL;
        }
        msa_reverse_compl(msa_compl);
    }

    /* allocate memory for computing scores */
    backgd_emissions = smalloc(backgd_nmods * sizeof(void*));
    for (i = 0; i < backgd_nmods; i++)
        backgd_emissions[i] = smalloc(msa->length * sizeof(double));
    feat_emissions = smalloc(feat_nmods * sizeof(void*));
    for (i = 0; i < feat_nmods; i++)
        feat_emissions[i] = smalloc(msa->length * sizeof(double));
    max_nmods = max(backgd_nmods, feat_nmods);
    dummy_emissions = smalloc(max_nmods * sizeof(void*));
    mem = smalloc(max_nmods * sizeof(void*));
    /* memory for forward algorithm -- each block must be as large as
       the largest feature */
    if (features != NULL) {
        for (i = 0, memblocksize = -1; i < lst_size(features->features); i++) {
            GFF_Feature *f = lst_get_ptr(features->features, i);
            if (f->end - f->start + 1 > memblocksize)
                memblocksize = f->end - f->start + 1;
        }
    }
    else memblocksize = winsize;  /* -1 if base-by-base mode */

    if (memblocksize > 0)
        for (i = 0; i < max_nmods; i++)
            mem[i] = smalloc(memblocksize * sizeof(double));

    if (winsize != -1) {
        winscore_pos = smalloc(msa->length * sizeof(double));
        winscore_neg = smalloc(msa->length * sizeof(double));
        no_alignment = smalloc(msa->length * sizeof(int));

        for (i = 0; i < msa->length; i++) {
            winscore_pos[i] = winscore_neg[i] = NEGINFTY;
            if (refidx == 0)
                no_alignment[i] = FALSE;
            else
                no_alignment[i] = msa_missing_col(msa, refidx, i);
        }
    }

    /* the rest will be repeated for each strand */
    for (strand = 1; strand <= 2; strand++) {
        MSA *thismsa = strand == 1 ? msa : msa_compl;
        double *winscore = strand == 1 ? winscore_pos : winscore_neg;

        if (base_by_base && strand == 2) break; /* don't do second pass in
                                               base_by_base case */

        if (verbose) fprintf(stderr, "Processing %c strand ...\n",
                                 strand == 1 ? '+' : '-');

        /* set up dummy categories array, so that emissions are only
           computed where needed */
        thismsa->categories = smalloc(thismsa->length * sizeof(int));
        thismsa->ncats = 1;
        if (winsize != -1) {
            if (strand == 1)
                for (i = 0; i < thismsa->length; i++)
                    thismsa->categories[i] = no_alignment[i] ? 0 : 1;
            else
                for (i = 0; i < thismsa->length; i++)
                    thismsa->categories[i] = no_alignment[thismsa->length - i - 1] ? 0 : 1;
        }
        else if (features != NULL) {
            for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 0;
            for (i = 0; i < lst_size(features->features); i++) {
                GFF_Feature *f = lst_get_ptr(features->features, i);
                if (f->start <= 0 || f->end <= 0) {
                    fprintf(stderr, "WARNING: feature out of range ('");
                    gff_print_feat(stderr, f);
                    fprintf(stderr, "')\n");
                    continue;
                }

                if (strand == 1 && f->strand != '-')
                    for (j = f->start - 1; j < f->end; j++)
                        thismsa->categories[j] = 1;
                else if (strand == 2 && f->strand == '-')
                    for (j = thismsa->length - f->end;
                            j < thismsa->length - f->start + 1; j++)
                        thismsa->categories[j] = 1;
            }
        }
        else {                      /* base-by-base scores */
            for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 1;
        }
        if (thismsa->ss != NULL) ss_update_categories(thismsa);

        /* compute emissions */
        for (i = 0; i < backgd_nmods; i++) {
            if (verbose)
                fprintf(stderr, "Computing emissions for background model #%d ...\n", i+1);
            tl_compute_log_likelihood(backgd_mods[i], thismsa,
                                      backgd_emissions[i], NULL, 1, NULL);
        }
        for (i = 0; i < feat_nmods; i++) {
            if (verbose)
                fprintf(stderr, "Computing emissions for features model #%d ...\n", i+1);
            tl_compute_log_likelihood(feat_mods[i], thismsa,
                                      feat_emissions[i], NULL, 1, NULL);
        }

        /* now compute scores */
        if (winsize != -1) {        /* windows case */
            int winstart;
            if (verbose) fprintf(stderr, "Computing scores ...\n");

            for (winstart = 0; winstart <= thismsa->length - winsize; winstart++) {
                int centeridx = winstart + winsize/2;

                if (strand == 2) centeridx = thismsa->length - centeridx - 1;

                if (no_alignment[centeridx]) continue;

                for (j = 0; j < feat_nmods; j++)
                    dummy_emissions[j] = &(feat_emissions[j][winstart]);
                winscore[centeridx] = hmm_forward(feat_hmm, dummy_emissions,
                                                  winsize, mem);

                if (winscore[centeridx] <= NEGINFTY) {
                    winscore[centeridx] = NEGINFTY;
                    continue;
                }

                for (j = 0; j < backgd_nmods; j++)
                    dummy_emissions[j] = &(backgd_emissions[j][winstart]);
                winscore[centeridx] -= hmm_forward(backgd_hmm, dummy_emissions,
                                                   winsize, mem);

                if (winscore[centeridx] < NEGINFTY) winscore[centeridx] = NEGINFTY;
            }
        }
        else if (features != NULL) { /* features case */
            if (verbose) fprintf(stderr, "Computing scores ...\n");
            for (i = 0; i < lst_size(features->features); i++) {
                GFF_Feature *f = lst_get_ptr(features->features, i);
                int s, e;

                if ((strand == 1 && f->strand == '-') ||
                        (strand == 2 && f->strand != '-') ||
                        f->start <= 0 || f->end <= 0 || f->end - f->start < 0)
                    continue;

                /* effective coords */
                if (f->strand == '-') {
                    s = thismsa->length - f->end + 1;
                    e = thismsa->length - f->start + 1;
                }
                else {
                    s = f->start;
                    e = f->end;
                }

                f->score_is_null = 0;

                for (j = 0; j < feat_nmods; j++)
                    dummy_emissions[j] = &(feat_emissions[j][s-1]);
                f->score = hmm_forward(feat_hmm, dummy_emissions, e - s + 1, mem);

                if (f->score <= NEGINFTY) {
                    f->score = NEGINFTY;
                    continue;
                }

                for (j = 0; j < backgd_nmods; j++)
                    dummy_emissions[j] = &(backgd_emissions[j][s-1]);
                f->score -= hmm_forward(backgd_hmm, dummy_emissions, e - s + 1, mem);

                if (f->score < NEGINFTY) f->score = NEGINFTY;
            }
        }
    }

    if (verbose) fprintf(stderr, "Generating output ...\n");

    if (winsize != -1 && windowWig == FALSE) { /* standard windows output */
        for (i = 0, j = 0; i < msa->length; i++) {
            if (no_alignment[i] == FALSE)
                printf("%d\t%.3f\t%.3f\n", j + msa->idx_offset + 1, winscore_pos[i],
                       winscore_neg[i]);
            if (ss_get_char_pos(msa, i, 0, 0) != GAP_CHAR) j++;
        }
    }
    else if (windowWig == TRUE) { /* windows with wig output */
        int last = NEGINFTY;
        for (i = 0, j = 0; i < msa->length; i++) {
            if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) {
                if (no_alignment[i] == FALSE && winscore_pos[i] > NEGINFTY) {
                    if (j > last + 1)
                        printf("fixedStep chrom=%s start=%d step=1\n",
                               refidx > 0 ? msa->names[refidx-1] : "alignment",
                               j + msa->idx_offset + 1);
                    printf("%.3f\n", winscore_pos[i]);
                    last = j;
                }
                j++;
            }
        }
    }
    else if (features != NULL) {  /* features output */
        /* return to coord frame of reference seq (also, replace offset) */
        if (refidx != 0)
            msa_map_gff_coords(msa, features, 0, refidx, msa->idx_offset);
        else if (msa->idx_offset != 0) {
            for (i = 0; i < lst_size(features->features); i++) {
                GFF_Feature *f = lst_get_ptr(features->features, i);
                f->start += msa->idx_offset;
                f->end += msa->idx_offset;
            }
        }

        if (bed_output)
            gff_print_bed(stdout, features, FALSE);
        else
            gff_print_set(stdout, features);
    }
    else {           /* base-by-base scores */
        /* in this case, we can just output the difference between the emissions */
        printf("fixedStep chrom=%s start=%d step=1\n",
               refidx > 0 ? msa->names[refidx-1] : "alignment",
               msa->idx_offset + 1);
        for (i = 0, j = 0; i < msa->length; i++) {
            if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) {
                printf("%.3f\n", feat_emissions[0][i] - backgd_emissions[0][i]);
                j++;
            }
        }
    }

    if (verbose) fprintf(stderr, "\nDone.\n");

    return 0;
}
示例#6
0
int main(int argc, char *argv[]) {
  char c;
  char *msa_fname = NULL;
  int opt_idx, i, old_nnodes;
  MSA *msa;
  List *pruned_names = lst_new_ptr(5), *tmpl;
  BDPhyloHmm *bdphmm;
  GFF_Set *predictions;
  int found = FALSE;
  List *ignore_types = lst_new_ptr(1);

  struct option long_opts[] = {
    {"refseq", 1, 0, 'M'},
    {"msa-format", 1, 0, 'i'},
    {"refidx", 1, 0, 'r'},
    {"rho", 1, 0, 'R'},
    {"phi", 1, 0, 'p'},
    {"transitions", 1, 0, 't'},    
    {"expected-length", 1, 0, 'E'},
    {"target-coverage", 1, 0, 'C'},
    {"seqname", 1, 0, 'N'},
    {"idpref", 1, 0, 'P'},
    {"indel-model", 1, 0, 'I'},
    {"indel-history", 1, 0, 'H'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  /* arguments and defaults for options */
  FILE *refseq_f = NULL, *msa_f = NULL;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  TreeModel *source_mod;
  double rho = DEFAULT_RHO, mu = DEFAULT_MU, nu = DEFAULT_NU, 
    phi = DEFAULT_PHI, gamma = -1, omega = -1, 
    alpha_c = -1, beta_c = -1, tau_c = -1,
    alpha_n = -1, beta_n = -1, tau_n = -1;
  int set_transitions = FALSE, refidx = 1, estim_phi = TRUE, 
    estim_gamma = TRUE, estim_omega = TRUE;
  char *seqname = NULL, *idpref = NULL;
  IndelHistory *ih = NULL;

  while ((c = getopt_long(argc, argv, "R:t:p:E:C:r:M:i:N:P:I:H:h", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'R':
      rho = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 't':
      if (optarg[0] != '~') estim_gamma = estim_omega = FALSE;
      else optarg = &optarg[1];
      set_transitions = TRUE;
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 2) 
        die("ERROR: bad argument to --transitions.\n");
      mu = lst_get_dbl(tmpl, 0);
      nu = lst_get_dbl(tmpl, 1);
      if (mu <= 0 || mu >= 1 || nu <= 0 || nu >= 1)
        die("ERROR: bad argument to --transitions.\n");
      lst_free(tmpl);
      break;
    case 'p':
      if (optarg[0] != '~') estim_phi = FALSE;
      else optarg = &optarg[1];
      phi = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'E':
      if (optarg[0] != '~') estim_omega = FALSE;
      else optarg = &optarg[1];
      omega = get_arg_dbl_bounds(optarg, 1, INFTY);
      mu = 1/omega;
      break;
    case 'C':
      if (optarg[0] != '~') estim_gamma = FALSE;
      else optarg = &optarg[1];
      gamma = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'r':
      refidx = get_arg_int_bounds(optarg, 0, INFTY);
      break;
    case 'M':
      refseq_f = phast_fopen(optarg, "r");
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT)
        die("ERROR: unrecognized alignment format.\n");
      break;
    case 'N':
      seqname = optarg;
      break;
    case 'P':
      idpref = optarg;
      break;
    case 'I':
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 3 && lst_size(tmpl) != 6)
        die("ERROR: bad argument to --indel-model.\n");
      alpha_n = lst_get_dbl(tmpl, 0);
      beta_n = lst_get_dbl(tmpl, 1);
      tau_n = lst_get_dbl(tmpl, 2);
      if (lst_size(tmpl) == 6) {
        alpha_c = lst_get_dbl(tmpl, 3);
        beta_c = lst_get_dbl(tmpl, 4);
        tau_c = lst_get_dbl(tmpl, 5);
      }
      else {
        alpha_c = alpha_n; beta_c = beta_n; tau_c = tau_n;
      }
      if (alpha_c <= 0 || alpha_c >= 1 || beta_c <= 0 || beta_c >= 1 || 
          tau_c <= 0 || tau_c >= 1 || alpha_n <= 0 || alpha_n >= 1 || 
          beta_n <= 0 || beta_n >= 1 || tau_n <= 0 || tau_n >= 1)
        die("ERROR: bad argument to --indel-model.\n");
      break;
    case 'H':
      fprintf(stderr, "Reading indel history from %s...\n", optarg);
      ih = ih_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("Bad argument.  Try 'dless -h'.\n");
    }
  }

  if (optind != argc - 1)
    die("Missing alignment file or model file.  Try 'dless -h'.\n");

  if (set_transitions && (gamma != -1 || omega != -1))
    die("ERROR: --transitions and --target-coverage/--expected-length cannot be used together.\n");

  if ((gamma != -1 && omega == -1) || (gamma == -1 && omega != -1))
    die("ERROR: --target-coverage and --expecteed-length must be used together.\n");

  set_seed(-1);

  if (gamma != -1)
    nu = gamma/(1-gamma) * mu;

  fprintf(stderr, "Reading tree model from %s...\n", argv[optind]);
  source_mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1);

  if (source_mod->nratecats > 1) 
    die("ERROR: rate variation not currently supported.\n");

  if (source_mod->order > 0)
    die("ERROR: only single nucleotide models are currently supported.\n");

  if (!tm_is_reversible(source_mod))
    phast_warning("WARNING: p-value computation assumes reversibility and your model is non-reversible.\n");

  /* read alignment */
  msa_f = phast_fopen(argv[optind], "r");

  fprintf(stderr, "Reading alignment from %s...\n", argv[optind]);
  if (msa_format == UNKNOWN_FORMAT) 
    msa_format = msa_format_for_content(msa_f, 1);

  if (msa_format == MAF) {
    msa = maf_read(msa_f, refseq_f, 1, NULL, NULL, NULL, -1, TRUE, NULL, 
                   NO_STRIP, FALSE); 
  }
  else 
    msa = msa_new_from_file_define_format(msa_f, msa_format, NULL);

  if (msa_alph_has_lowercase(msa)) msa_toupper(msa); 
  msa_remove_N_from_alph(msa);

  if (msa->ss == NULL) {
    fprintf(stderr, "Extracting sufficient statistics...\n");
    ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0);
  }
  else if (msa->ss->tuple_idx == NULL)
    die("ERROR: ordered representation of alignment required unless --suff-stats.\n");

  /* prune tree, if necessary */
  old_nnodes = source_mod->tree->nnodes;
  tm_prune(source_mod, msa, pruned_names);

  if (lst_size(pruned_names) == (old_nnodes + 1) / 2)
    die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n");
  if (lst_size(pruned_names) > 0) {
    fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment (");
    for (i = 0; i < lst_size(pruned_names); i++)
      fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars, 
              i < lst_size(pruned_names) - 1 ? ", " : ").\n");
  }

  /* this has to be done after pruning tree */
  tr_name_ancestors(source_mod->tree);

  /* also make sure match for reference sequence in tree */
  if (refidx > 0) {
    for (i = 0, found = FALSE; !found && i < source_mod->tree->nnodes; i++) {
      TreeNode *n = lst_get_ptr(source_mod->tree->nodes, i);
      if (!strcmp(n->name, msa->names[refidx-1]))
        found = TRUE;
    }
    if (!found) die("ERROR: no match for reference sequence in tree.\n");
  }

  /* checks for indel model */
  if (alpha_c > 0) {
    if (ih == NULL) {
      fprintf(stderr, "Reconstructing indel history by parsimony...\n");
      ih = ih_reconstruct(msa, source_mod->tree);
    }
    else {
      if (ih->ncols != msa->length)
        die("ERROR: indel history doesn't seem to match alignment.\n");
      if (ih->tree->nnodes != source_mod->tree->nnodes)
        die("ERROR: indel history doesn't seem to match tree model.\n");
    }
  }

  bdphmm = bd_new(source_mod, rho, mu, nu, phi, alpha_c, beta_c, tau_c, 
                  alpha_n, beta_n, tau_n, estim_gamma, estim_omega, 
                  estim_phi);

  /* compute emissions */
  phmm_compute_emissions(bdphmm->phmm, msa, FALSE);

  /* add emissions for indel model, if necessary */
  if (alpha_c > 0) {
    fprintf(stderr, "Adjusting emissions for indels...\n");
    bd_add_indel_emissions(bdphmm, ih);
  }

  /* postprocess for missing data (requires special handling) */
  fprintf(stderr, "Adjusting emissions for missing data...\n");
  bd_handle_missing_data(bdphmm, msa);

  if (estim_gamma || estim_omega || estim_phi) {
    fprintf(stderr, "Estimating free parameters...\n");
    bd_estimate_transitions(bdphmm, msa);
  }

  /* set seqname and idpref, if necessary */
  if (seqname == NULL || idpref == NULL) {
    /* derive default from file name root */
    String *tmp = str_new_charstr(msa_fname);
    if (!str_equals_charstr(tmp, "-")) {
      str_remove_path(tmp);
      str_root(tmp, '.');
      if (idpref == NULL) idpref = copy_charstr(tmp->chars);
      str_root(tmp, '.');         /* apply one more time for double suffix */
      if (seqname == NULL) seqname = tmp->chars;    
    }
    else if (seqname == NULL) seqname = "refseq";
  }

  /* obtain predictions */
  fprintf(stderr, "Running Viterbi algorithm...\n");
  predictions = phmm_predict_viterbi(bdphmm->phmm, seqname, NULL, idpref, NULL);
  lst_push_ptr(ignore_types, str_new_charstr("nonconserved"));
  gff_filter_by_type(predictions, ignore_types, TRUE, NULL);

  /* score predictions */
  fprintf(stderr, "Scoring predictions...\n");
  bd_score_predictions(bdphmm, predictions);
  
  /* can free emissions now */
  for (i = 0; i < bdphmm->phmm->hmm->nstates; i++)
    sfree(bdphmm->phmm->emissions[i]);
  sfree(bdphmm->phmm->emissions);
  bdphmm->phmm->emissions = NULL;

  /* convert GFF to coord frame of reference sequence and adjust
     coords by idx_offset, if necessary  */
  if (refidx != 0 || msa->idx_offset != 0)
    msa_map_gff_coords(msa, predictions, 0, refidx, msa->idx_offset);

  if (refidx != 0) 
    gff_flatten(predictions);	
  /* necessary because coord conversion might create overlapping
     features (can happen in deletions in reference sequence) */

  /* now output predictions */
  fprintf(stderr, "Writing GFF to stdout...\n");
  gff_print_set(stdout, predictions);

  fprintf(stderr, "Done.\n");
  
  return 0;
}