示例#1
0
int main(int argc, char *argv[]) {
  char c;
  int opt_idx;
  Vector *newfreqs = vec_new(4);
  TreeModel *mod;

  struct option long_opts[] = {
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = getopt_long(argc, argv, "h", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("Bad argument.  Try 'modFreqs -h'.\n");
    }
  }

  if (optind != argc - 2 && optind != argc - 5) 
    die("ERROR: Wrong number of arguments.  Try 'modFreqs -h'.\n");

  set_seed(-1);

  mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1);

  if (!tm_is_reversible(mod)) 
    die("ERROR: reversible input model required.\n");
  if (mod->order != 0)
    die("ERROR: single nucleotide model required.\n");
  if (strcmp(mod->rate_matrix->states, DEFAULT_ALPHABET) != 0)
    die("ERROR: default nucleotide alphabet required.\n");

  if (optind == argc - 2) {
    double gc = get_arg_dbl_bounds(argv[optind+1], 0, 1);
    vec_set(newfreqs, 0, (1-gc)/2);
    vec_set(newfreqs, 1, gc/2);
    vec_set(newfreqs, 2, gc/2);
    vec_set(newfreqs, 3, (1-gc)/2);
  }
  else {
    vec_set(newfreqs, 0, get_arg_dbl_bounds(argv[optind+1], 0, 1));
    vec_set(newfreqs, 1, get_arg_dbl_bounds(argv[optind+2], 0, 1));
    vec_set(newfreqs, 2, get_arg_dbl_bounds(argv[optind+3], 0, 1));
    vec_set(newfreqs, 3, get_arg_dbl_bounds(argv[optind+4], 0, 1));
    pv_normalize(newfreqs);
  }

  tm_mod_freqs(mod, newfreqs);

  tm_print(stdout, mod);

  return 0;
}
示例#2
0
int main(int argc, char *argv[]) {
  FILE *infile;
  char c;
  int opt_idx;
  msa_format_type msa_format;
  struct bgchmm_struct *b = bgchmm_struct_new(0);

  struct option long_opts[] = {
    {"bgc", 1, 0, 'B'},
    {"estimate-bgc", 1, 0, 'b'},
    {"bgc-exp-length", 1, 0, 'L'},
    {"estimate-bgc-exp-length", 1, 0, 'l'},
    {"bgc-target-coverage", 1, 0, 'C'},
    {"estimate-bgc-target-coverage", 1, 0, 'c'},
    {"rho", 1, 0, 'R'},
    {"cons-exp-length", 1, 0, 'E'},
    {"cons-target-coverage", 1, 0, 'T'},
    {"scale", 1, 0, 'S'},
    {"estimate-scale", 1, 0, 's'},
    {"eqfreqs-from-msa", 1, 0, 'f'},
    {"output-tracts", 1, 0, 'g'},
    {"posteriors", 1, 0, 'p'},
    {"output-mods", 1, 0, 'm'},
    {"help", 0, 0, 'h'},
    {0,0,0,0}};

  while ((c = getopt_long(argc, argv, "B:b:L:l:C:c:R:E:T:S:s:f:g:p:m:Wh", long_opts, &opt_idx))
	 != -1) {
    switch (c) {
    case 'B':
      b->bgc = get_arg_dbl_bounds(optarg, 0, INFTY);
      break;
    case 'b':
      b->estimate_bgc = get_arg_int_bounds(optarg, 0, 1);
      break;
    case 'L':
      b->bgc_expected_length = get_arg_dbl_bounds(optarg, 0, INFTY);
      break;
    case 'l':
      b->estimate_bgc_expected_length = get_arg_int_bounds(optarg, 0, 1);
      break;
    case 'C':
      b->bgc_target_coverage = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'c':
      b->estimate_bgc_target_coverage = get_arg_int_bounds(optarg, 0, 1);
      break;
    case 'R':
      b->rho = get_arg_dbl_bounds(optarg, 0, INFTY);
      if (b->rho > 1.0) phast_warning("Warning: rho is a scale for conserved states and is usually less than 1, got %f", b->rho);
      break;
    case 'E':
      b->cons_expected_length = get_arg_dbl_bounds(optarg, 0, INFTY);
      break;
    case 'T':
      b->cons_target_coverage = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'S':
      b->scale = get_arg_dbl_bounds(optarg, 0, INFTY);
      break;
    case 's':
      b->estimate_scale = get_arg_int_bounds(optarg, 0, 1);
      break;
    case 'f':
      b->eqfreqs_from_msa = get_arg_int_bounds(optarg, 0, 1);
      break;
    case 'g':
      b->tract_fn = optarg;
      break;
    case 'p':
      if (strcasecmp(optarg, "none")==0)
	b->post_probs = NONE;
      else if (strcasecmp(optarg, "wig")==0)
	b->post_probs = WIG;
      else if (strcasecmp(optarg, "full")==0)
	b->post_probs = FULL;
      else die("--posteriors option expects either none, wig, or full, got %s", optarg);
      break;
    case 'm':
      b->mods_fn = optarg;
      break;
    case 'W':
      b->post_probs = NONE;
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
    default:
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }
   
  if (optind != argc - 3)
    die("ERROR: extra or missing arguments.  Try '%s -h'.\n", argv[0]);


  /* read the MSA and make sure there are no sufficient stats (we want to get these
     later depending on whether coding/noncoding, tuple size will be 3 or 1)
   */
  infile = phast_fopen(argv[optind], "r");
  msa_format = msa_format_for_content(infile, 1);
  if (msa_format == MAF) 
    b->msa = maf_read(infile, NULL, 1, NULL, NULL, NULL, 0, 1, NULL, NO_STRIP, 0);
  else b->msa = msa_new_from_file_define_format(infile, msa_format, NULL);
  fclose(infile);

  infile = phast_fopen(argv[optind+1], "r");
  b->mod = tm_new_from_file(infile, 1);
  fclose(infile);

  b->foregd_branch = argv[optind+2];
  
  bgcHmm(b);
  
  return 0;
}
int main(int argc, char *argv[]) {

  int check_start = 0, check_stop = 0, check_splice = 0, check_nonsense = 0,
    offset5 = 0, offset3 = 0, opt_idx, i, j, indel_strict = 0, no_output = 0,
    check_alignment = 0, splice_strict = 0;
  int ncons_tested, nkept, nconserved_exons;
  int nce_gap_type[NGAP_TYPES], nconsid[NTYPES], nfail[NTYPES];
  double Nfrac = 0.05;
  char c;
  MSA *msa;
  GFF_Set *gff;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  List *keepers, *problems = lst_new_ptr(10), 
    *ends_adjusted = lst_new_ptr(1), *starts_adjusted = lst_new_ptr(1), 
    *discards=NULL, *intron_splice = lst_new_ptr(10);
  char *rseq_fname = NULL;
  FILE *logf = NULL, *mlogf = NULL, *statsf = NULL, *discardf = NULL;
  cds_gap_type fshift_mode = FSHIFT_BAD;
  char *groupby = "transcript_id";
  msa_coord_map *map;
  int *countNs, *countCDSs;
  FILE *infile;
  char *msa_fname;

  struct option long_opts[] = {
    {"start", 0, 0, 's'},
    {"stop", 0, 0, 't'},
    {"splice", 0, 0, 'l'},
    {"nonsense", 0, 0, 'n'},
    {"fshift", 0, 0, 'f'},
    {"conserved", 0, 0, 'c'},
    {"N-limit", 1, 0, 'N'},
    {"clean-gaps", 0, 0, 'e'},
    {"indel-strict", 0, 0, 'I'},
    {"splice-strict", 0, 0, 'C'},
    {"groupby", 1, 0, 'g'},
    {"msa-format", 1, 0, 'i'},
    {"refseq", 1, 0, 'r'},
    {"offset5", 1, 0, 'o'},
    {"offset3", 1, 0, 'p'},
    {"no-output", 0, 0, 'x'},
    {"discards", 1, 0, 'd'},
    {"log", 1, 0, 'L'},
    {"machine-log", 1, 0, 'M'},
    {"stats", 1, 0, 'S'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = (char)getopt_long(argc, argv, "N:i:r:L:M:S:g:d:stlnfceICxh", 
                          long_opts, &opt_idx)) != -1) {
    switch(c) {
    case 's':
      check_alignment = check_start = 1;
      break;
    case 't':
      check_alignment = check_stop = 1;
      break;
    case 'l':
      check_alignment = check_splice = 1;
      break;
    case 'n':
      check_alignment = check_nonsense = 1;
      break;
    case 'f':
      check_alignment = 1;
      fshift_mode = FSHIFT_OK;
      break;
    case 'c':
      check_alignment = check_start = check_stop = check_splice = check_nonsense = 1;
      if (fshift_mode < FSHIFT_OK) fshift_mode = FSHIFT_OK;
      break;
    case 'N':
      Nfrac = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'e':
      check_alignment = 1;
      if (fshift_mode < CLN_GAPS) fshift_mode = CLN_GAPS;
      break;
    case 'I':
      check_alignment = 1;
      fshift_mode = NOVRLP_CLN_GAPS;
      indel_strict = 1;
      break;
    case 'C':
      check_alignment = check_splice = splice_strict = 1;
      break;
    case 'g':
      groupby = optarg;
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT) die("Bad alignment format.\n");
      break;
    case 'r':
      rseq_fname = optarg;
      break;
    case 'o':
      offset5 = get_arg_int(optarg);
      break;
    case 'p':
      offset3 = get_arg_int(optarg);
      break;
    case 'L':
      logf = phast_fopen(optarg, "w+");
      break;
    case 'M':
      mlogf = phast_fopen(optarg, "w+");
      break;
    case 'S':
      statsf = phast_fopen(optarg, "w+");
      break;
    case 'd':
      discardf = phast_fopen(optarg, "w+");
      break;
    case 'x':
      no_output = 1;
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("ERROR: Bad argument.  Try the --help option.\n");
    }
  }

  if (optind + 1 >= argc ) {
    die("ERROR:  Missing required arguments.  Try the --help option.\n");
  }
  
  set_seed(-1);

  gff = gff_read_set(phast_fopen(argv[optind], "r"));
  msa_fname = argv[optind+1];
  infile = phast_fopen(msa_fname, "r");
  if (msa_format == UNKNOWN_FORMAT)
    msa_format = msa_format_for_content(infile, 1);
  if (msa_format == MAF) {
    msa = maf_read(infile, 
                   rseq_fname == NULL ? NULL : phast_fopen(rseq_fname, "r"), 
                   1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); 
  }
  else {
    msa = msa_new_from_file_define_format(infile,
                            msa_format, NULL); 
    if (msa->ss == NULL) 
      ss_from_msas(msa, 1, 1, NULL, NULL, NULL, -1, 0);
  }
  if (!msa->ss->tuple_idx)
    die("ERROR: need ordered tuples\n");
  msa_remove_N_from_alph(msa);  /* for backward compatibility (old SS files) */

  if (msa->idx_offset != 0) {   /* avoids offset problem */
    for (i = 0; i < lst_size(gff->features); i++) {
      GFF_Feature *f = lst_get_ptr(gff->features, i);
      f->start -= msa->idx_offset;
      f->end -= msa->idx_offset;
    }
  }

  /* set up coordinate map; assume GFF is for sequence 1 */
  map = msa_build_coord_map(msa, 1);

  /* convert all features */
  for (i = 0; i < lst_size(gff->features); i++) {
    GFF_Feature *f = lst_get_ptr(gff->features, i);
    int newstart, newend;
 
    if (f->start < 0 || f->end < f->start)
      die("ERROR: bad feature in GFF (start=%d, end=%d).\n",
          f->start, f->end);

    newstart = msa_map_seq_to_msa(map, f->start);
    newend = msa_map_seq_to_msa(map, f->end);

    if (newstart < 0 || newend < newstart)
      die("ERROR: unable to map coordinates for feature (start=%d, end=%d).\n",
          f->start, f->end);

    f->start = newstart;
    f->end = newend;
  }

  gff_group(gff, groupby);	/* do this after coord conversion, or
                               group coords and feature coords
                               will be out of sync */

  keepers = lst_new_ptr(lst_size(gff->features));
  if (discardf != NULL) discards = lst_new_ptr(lst_size(gff->features));

  ncons_tested = nkept = nconserved_exons = 0;
  for (i = 0; i < NTYPES; i++) nconsid[i] = 0;
  for (i = 0; i < NTYPES; i++) nfail[i] = 0;
  for (i = 0; i < NGAP_TYPES; i++) nce_gap_type[i] = 0;  

  countNs = smalloc(msa->nseqs * sizeof(int));
  countCDSs = smalloc(msa->nseqs * sizeof(int));

  for (i = 0; i < lst_size(gff->groups); i++) {
    GFF_FeatureGroup *group = lst_get_ptr(gff->groups, i);
    List *gfeatures = group->features;
    GFF_Feature *feat;
    status_type status = OKAY;
    cds_gap_type gt = FSHIFT_BAD;
    problems_clear(problems);

    /* make sure have frame info for CDSs */
    for (j = 0; j < lst_size(gfeatures); j++) {
      feat = lst_get_ptr(gfeatures, j);
      if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && 
          feat->frame == GFF_NULL_FRAME)
        die("ERROR: Missing frame info for CDS.\n");
    }

    /* First, exclude stop codons from cds's, if necessary (simplifies
       the detection of nonsense mutations). */
    exclude_stops(group, starts_adjusted, ends_adjusted);

    /* In all cases, discard any group for which the reference sequence
       doesn't have valid splice sites or start/stop codons, or has a
       premature stop codon */
    if (!ref_seq_okay(gfeatures, msa, offset3, indel_strict, splice_strict,
                      problems)) {
      status = BAD_REF;
      nfail[BAD_REF]++;
    }
    else
      /* Everything else counts as a potentially valid group */
      ncons_tested++;

    if (status == OKAY && check_alignment) {      
                                /* only bother with below if
                                   interested in cross-species
                                   conservation */

      /* Check first to make sure there's alignment across species in
         the cds; if not, there's no need to look at individual
         features. */
      for (j = 0; j < lst_size(gfeatures); j++) { 
        feat = lst_get_ptr(gfeatures, j);
        if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) &&
            is_incomplete_alignment(feat, msa)) {
          status = NO_ALN;
          nfail[NO_ALN]++;
          problem_add(problems, feat, NO_ALN, -1, -1);
          break;
        }
      }

      if (status == OKAY) {     /* we have alignment and agreement
                                   with the ref seq; now check feature
                                   by feature  */

        lst_clear(intron_splice);
        for (j = 0; j < msa->nseqs; j++) countNs[j] = countCDSs[j] = 0;

        for (j = 0; j < lst_size(gfeatures); j++) {
          feat = lst_get_ptr(gfeatures, j);

          if (feat->end - 1 >= msa->length) 
            die("ERROR: feature extends beyond alignment (%d >= %d).\n",
                feat->end - 1, msa->length);
        
          if (check_start && str_equals_charstr(feat->feature, GFF_START_TYPE)) {

            nconsid[BAD_START]++;

            if (!is_conserved_start(feat, msa)) {
              status = BAD_START;
              problem_add(problems, feat, BAD_START, -1, -1);
            }
          }

          else if (check_stop && str_equals_charstr(feat->feature, GFF_STOP_TYPE)) {

            nconsid[BAD_STOP]++;

            if (!is_conserved_stop(feat, msa)) {
              status = BAD_STOP;
              problem_add(problems, feat, BAD_STOP, -1, -1);
            }
          }

          else if (check_splice && 
                   str_equals_charstr(feat->feature, SPLICE_5)) {

            nconsid[BAD_5_SPLICE]++;

            if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) {
              status = BAD_5_SPLICE;
              problem_add(problems, feat, BAD_5_SPLICE, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (check_splice && 
                   str_equals_charstr(feat->feature, SPLICE_5_UTR)) {

            nconsid[BAD_5_SPLICE_UTR]++;

            if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) {
              status = BAD_5_SPLICE_UTR;
              problem_add(problems, feat, BAD_5_SPLICE_UTR, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (check_splice && str_equals_charstr(feat->feature, SPLICE_3)) {


            nconsid[BAD_3_SPLICE]++;

            if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) {
              status = BAD_3_SPLICE;
              problem_add(problems, feat, BAD_3_SPLICE, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (check_splice && str_equals_charstr(feat->feature, SPLICE_3)) {

            nconsid[BAD_3_SPLICE_UTR]++;

            if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) {
              status = BAD_3_SPLICE_UTR;
              problem_add(problems, feat, BAD_3_SPLICE_UTR, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) {
 
            if (fshift_mode > FSHIFT_BAD 
		&& (gt = get_cds_gap_type(feat, msa, problems)) < fshift_mode) {
              if (status == OKAY || status == NONSENSE) status = FSHIFT;
            }

            if (check_nonsense && !is_nonsense_clean(feat, msa, problems)) {
              if (status == OKAY) status = NONSENSE;
            }

            if (Nfrac < 1) 
              get_N_counts(countNs, countCDSs, feat, msa);
          }
        } /* end loop through features in group */

        /* still have to make sure splice sites are paired correctly
           (GT-AG, GC-AG, AT-AC) */
        if (status == OKAY && !splice_strict && lst_size(intron_splice) >= 2 &&
            !are_introns_okay(intron_splice, msa, problems, offset5, offset3)) 
          status = BAD_INTRON;

        /* also check fraction of Ns */
        if (Nfrac < 1) {
          enum {MY_OKAY, MY_FAIL, MY_WARN} Nstatus = MY_OKAY;
          for (j = 0; j < msa->nseqs; j++) {
            if ((double)countNs[j] / countCDSs[j] > Nfrac) Nstatus = MY_FAIL;
            if (Nstatus == MY_OKAY && countNs[j] > 0) Nstatus = MY_WARN;
          }
          if (Nstatus == MY_FAIL) {
            problem_add(problems, NULL, TOO_MANY_Ns, -1, -1);
            if (status == OKAY) status = TOO_MANY_Ns;
          }
          else if (Nstatus == MY_WARN) 
            problem_add(problems, NULL, WARN_Ns, -1, -1);
        }

        /* if collecting stats, record counts for failures */
        if (statsf != NULL) {
          if (status != OKAY) {
            for (j = 0; j < lst_size(problems); j++) {
              struct Problem *problem = lst_get_ptr(problems, j);
              status_type ftype = problem->status;
              if ((ftype == FSHIFT || ftype == NONSENSE) && 
                  status != FSHIFT && status != NONSENSE)
                continue;       /* don't count secondary frame shifts
                                   and nonsense mutations */ 

              if (ftype == BAD_INTRON && j % 2 == 0)
                continue;       /* only count one of every pair of these */

              nfail[ftype]++;
            }
          }

          /* also keep track of the total number of "conserved exons", and
             the number having each kind of gap */
          if ((status == OKAY || (status == FSHIFT && gt >= FSHIFT_OK))) {
            nconserved_exons++;
            nce_gap_type[gt]++;     /* number of conserved exons having
                                       given type of gaps */
          }
        }
      } /* end if (status == OKAY) [checks for conserved features] */
    } /* end if (status == OKAY && check_alignment) [all cross-species
         checks] */

    /* now we have looked at the whole group; we just need to do some
       final accounting and logging */

    if (status == OKAY) {
      nkept++;
      if (!no_output) {
        restore_stops(group, starts_adjusted, ends_adjusted);
        for (j = 0; j < lst_size(gfeatures); j++)
          lst_push_ptr(keepers, lst_get_ptr(gfeatures, j));
      }
      if (logf != NULL && lst_size(problems) > 0) /* warnings only */
        write_log(logf, group, status, problems, msa, map);
      if (mlogf != NULL) {
        /* no problem, need to add an okay status to log */
        problem_add(problems, NULL, OKAY, -1, -1);
        write_machine_log(mlogf, group, problems, map); /* may include
                                                           warnings */
      }
    }
    else {
      if (discardf != NULL) {
        restore_stops(group, starts_adjusted, ends_adjusted);
        for (j = 0; j < lst_size(gfeatures); j++) 
          lst_push_ptr(discards, lst_get_ptr(gfeatures, j));
      }
      if (logf != NULL) 
        write_log(logf, group, status, problems, msa, map);
      if (mlogf != NULL)
        write_machine_log(mlogf, group, problems, map);
    }
  } /* end loop over groups */

  /* write main output and discards */
  if (!no_output || discardf != NULL) {
    /* first map features back to coord frame of reference seq. */
    for (i = 0; i < lst_size(gff->features); i++) {
      GFF_Feature *f = lst_get_ptr(gff->features, i);
      f->start = msa_map_msa_to_seq(map, f->start) + msa->idx_offset;
      f->end = msa_map_msa_to_seq(map, f->end) + msa->idx_offset;
    }

    if (!no_output) {
      gff->features = keepers;
      gff_print_set(stdout, gff);
    }

    if (discardf != NULL) {
      gff->features = discards;
      gff_print_set(discardf, gff);
    }
  }


  /* dump counts to stats file */
  if (statsf != NULL) {
    fprintf(statsf, "#%11s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", 
            "total", "nbad_ref", "nconsid", "nkept", "nno_aln", 
            "nbad_starts", "(out of)", "nbad_stops", "(out of)", 
            "nbad_5spl", "(out of)", "nbad_3spl", "(out of)", 
            "nbad_5utr", "(out of)", "nbad_3utr", "(out of)", 
            "nbad_intron", "nnons", "nfshifts", "nNs", "ncons_exons", 
            "nce_ngaps", "nce_nov_cln", "nce_clean", "nce_fshftok");
    fprintf(statsf, "%12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d\n", 
            nfail[BAD_REF]+ncons_tested, nfail[BAD_REF], ncons_tested, nkept, 
            nfail[NO_ALN], nfail[BAD_START], nconsid[BAD_START], 
            nfail[BAD_STOP], nconsid[BAD_STOP], nfail[BAD_5_SPLICE], 
            nconsid[BAD_5_SPLICE], nfail[BAD_3_SPLICE], nconsid[BAD_3_SPLICE],
            nfail[BAD_5_SPLICE_UTR], nconsid[BAD_5_SPLICE_UTR],
            nfail[BAD_3_SPLICE_UTR], nconsid[BAD_3_SPLICE_UTR], 
            nfail[BAD_INTRON], nfail[NONSENSE], nfail[FSHIFT], 
            nfail[TOO_MANY_Ns], nconserved_exons, nce_gap_type[NGAPS], 
            nce_gap_type[NOVRLP_CLN_GAPS], nce_gap_type[CLN_GAPS], 
            nce_gap_type[FSHIFT_OK]);
    fprintf(statsf, "%s", STATS_DESCRIPTION);
  }

  if (logf != NULL) phast_fclose(logf);
  if (mlogf != NULL) phast_fclose(mlogf);
  if (statsf != NULL) phast_fclose(statsf);
  if (discardf != NULL) phast_fclose(discardf);

  return 0;
}
示例#4
0
int main(int argc, char *argv[]) {
    struct phastCons_struct *p = phastCons_struct_new(0);
    struct option long_opts[] = {
        {"states", 1, 0, 'S'},
        {"hmm", 1, 0, 'H'},
        {"viterbi", 1, 0, 'V'},
        {"most-conserved", 1, 0, 'V'}, /* same as --viterbi */
        {"no-post-probs", 0, 0, 'n'},
        {"msa-format", 1, 0, 'i'},
        {"FC", 0, 0, 'X'},
        {"lambda", 1, 0, 'l'},
        {"target-coverage", 1, 0, 'C'},
        {"transitions", 1, 0, 't'},
        {"expected-length", 1, 0, 'E'},
        {"expected-lengths", 1, 0, 'E'}, /* for backward compatibility */
        {"estimate-trees", 1, 0, 'T'},
        {"estimate-rho", 1, 0, 'O'},
        {"rho", 1, 0, 'R'},
        {"gc", 1, 0, 'G'},
        {"ignore-missing", 0, 0, 'z'},
        {"nrates", 1, 0, 'k'},
        {"log", 1, 0, 'g'},
        {"refidx", 1, 0, 'r'},
        {"suppress-missing", 0, 0, 'x'}, /* for backward compatibility */
        {"reflect-strand", 1, 0, 'U'},
        {"catmap", 1, 0, 'c'},
        {"extrapolate", 1, 0, 'e'},
        {"indels", 0, 0, 'I'},
        {"max-micro-indel", 1, 0, 'Y'},
        {"indel-params", 1, 0, 'D'},
        {"min-informative-types", 1, 0, 'M'}, /* for backward compatibility */
        {"require-informative", 1, 0, 'M'},
        {"not-informative", 1, 0, 'F'},
        {"lnl", 1, 0, 'L'},
        {"seqname", 1, 0, 'N'},
        {"idpref", 1, 0, 'P'},
        {"score", 0, 0, 's'},
        {"coding-potential", 0, 0, 'p'},
        {"indels-only", 0, 0, 'J'},
        {"alias", 1, 0, 'A'},
        {"quiet", 0, 0, 'q'},
        {"help", 0, 0, 'h'},
        {0, 0, 0, 0}
    };

    /* other vars */
    FILE *infile;
    char *msa_fname;
    char c;
    int opt_idx, i, coding_potential=FALSE;
    List *tmpl = NULL;
    String *tmpstr;
    char *mods_fname = NULL;
    List *mod_fname_list;
    msa_format_type msa_format = UNKNOWN_FORMAT;

    while ((c = getopt_long(argc, argv,
                            "S:H:V:ni:k:l:C:G:zt:E:R:T:O:r:xL:sN:P:g:U:c:e:IY:D:JM:F:pA:Xqh",
                            long_opts, &opt_idx)) != -1) {
        switch (c) {
        case 'S':
            p->states = get_arg_list(optarg);
            break;
        case 'H':
            p->hmm = hmm_new_from_file(phast_fopen(optarg, "r"));
            p->two_state = FALSE;
            break;
        case 'V':
            p->viterbi_f = phast_fopen(optarg, "w+");
            tmpstr = str_new_charstr(optarg);
            if (str_ends_with_charstr(tmpstr, ".gff"))
                p->gff = TRUE;
            str_free(tmpstr);
            break;
        case 'n':
            p->post_probs = FALSE;
            break;
        case 'i':
            msa_format = msa_str_to_format(optarg);
            if (msa_format == UNKNOWN_FORMAT)
                die("ERROR: bad argument to --msa-format\n");
            break;
        case 'X':
            p->FC = TRUE;
            p->two_state = FALSE;
            break;
        case 'l':
            if (optarg[0] != '~')
                p->estim_lambda = FALSE;
            else optarg = &optarg[1];
            p->lambda = get_arg_dbl_bounds(optarg, 0, 1);
            break;
        case 'C':
            p->gamma = get_arg_dbl_bounds(optarg, 0, 1);
            break;
        case 'G':
            p->gc = get_arg_dbl_bounds(optarg, 0, 1);
            break;
        case 't':
            p->set_transitions = TRUE;
            if (optarg[0] != '~')
                p->estim_transitions = FALSE;
            else optarg = &optarg[1];
            tmpl = get_arg_list_dbl(optarg);
            if (lst_size(tmpl) != 2)
                die("ERROR: bad argument to --transitions.\n");
            p->mu = lst_get_dbl(tmpl, 0);
            p->nu = lst_get_dbl(tmpl, 1);
            if (p->mu <= 0 || p->mu >= 1 || p->nu <= 0 || p->nu >= 1)
                die("ERROR: bad argument to --transitions.\n");
            lst_free(tmpl);
            break;
        case 'E':
            if (optarg[0] != '~')
                p->estim_transitions = FALSE;
            else optarg = &optarg[1];
            p->omega = get_arg_dbl_bounds(optarg, 1, INFTY);
            p->mu = 1/p->omega;
            break;
        case 'T':
            p->estim_trees = TRUE;
            p->estim_trees_fname_root = optarg;
            break;
        case 'O':
            p->estim_rho = TRUE;
            p->estim_trees_fname_root = optarg;
            break;
        case 'z':
            p->ignore_missing = TRUE;
            break;
        case 'k':
            tmpl = get_arg_list_int(optarg);
            if (lst_size(tmpl) > 2)
                die("ERROR: too many arguments with --nrates.\n");
            p->nrates = lst_get_int(tmpl, 0);
            if (p->nrates <= 0)
                die("ERROR: bad argument to --nrates (%d).\n", p->nrates);
            if (lst_size(tmpl) == 2) {
                p->nrates2 = lst_get_int(tmpl, 1);
                if (p->nrates2 <= 0)
                    die("ERROR: bad argument to --nrates (%d).\n", p->nrates2);
            }
            lst_free(tmpl);
            break;
        case 'R':
            p->rho = get_arg_dbl_bounds(optarg, 0, 1);
            break;
        case 'g':
            if (!strcmp(optarg, "-"))
                p->log_f = stderr;
            else p->log_f = phast_fopen(optarg, "w+");
            break;
        case 'r':
            p->refidx = get_arg_int_bounds(optarg, 0, INFTY);
            break;
        case 'x':
            /* do nothing; left in for backward compatibility */
            break;
        case 'U':
            p->pivot_states = get_arg_list(optarg); /* we want strings not ints
						 for phmm_new */
            break;
        case 'e':
            p->extrapolate_tree_fname = optarg;
            break;
        case 'I':
            p->indels = TRUE;
            break;
        case 'Y':
            p->max_micro_indel = get_arg_int_bounds(optarg, 1, INFTY);
            break;
        case 'D':
            if (optarg[0] != '~')
                p->estim_indels = FALSE;
            else optarg = &optarg[1];
            tmpl = get_arg_list_dbl(optarg);
            if (lst_size(tmpl) != 6) die("ERROR: bad argument to --indel-params.\n");
            p->alpha_0 = lst_get_dbl(tmpl, 0);
            p->beta_0 = lst_get_dbl(tmpl, 1);
            p->tau_0 = lst_get_dbl(tmpl, 2);
            p->alpha_1 = lst_get_dbl(tmpl, 3);
            p->beta_1 = lst_get_dbl(tmpl, 4);
            p->tau_1 = lst_get_dbl(tmpl, 5);
            if (p->alpha_0 < 0 || p->beta_0 < 0 || p->tau_0 < 0 ||
                    p->alpha_1 < 0 || p->beta_1 < 0 || p->tau_1 < 0)
                die("ERROR: bad argument to --indel-params.\n");
            lst_free(tmpl);
            break;
        case 'J':
            p->indels_only = TRUE;
            p->two_state = FALSE;
            p->indels = TRUE;
            p->post_probs = FALSE;
            break;
        case 'M':
            p->inform_reqd = get_arg_list(optarg);
            break;
        case 'F':
            p->not_informative = get_arg_list(optarg);
            break;
        case 'c':
            p->cm = cm_new_string_or_file(optarg);
            break;
        case 'L':
            p->lnl_f = phast_fopen(optarg, "w+");
            break;
        case 'N':
            p->seqname = optarg;
            break;
        case 'P':
            p->idpref = optarg;
            break;
        case 's':
            p->score = TRUE;
            break;
        case 'p':
            coding_potential = TRUE;
            break;
        case 'A':
            p->alias_hash = make_name_hash(optarg);
            break;
        case 'q':
            p->results_f = NULL;
            break;
        case 'h':
            printf("%s", HELP);
            exit(0);
        case '?':
            die("Bad argument.  Try '%s -h'.\n", argv[0]);
        }
    }

    if ((!coding_potential && optind != argc - 2) ||
            (coding_potential && optind != argc - 2 && optind != argc - 1))
        die("ERROR: extra or missing arguments.  Try '%s -h'.\n", argv[0]);

    set_seed(-1);

    if (p->extrapolate_tree_fname != NULL &&
            !strcmp(p->extrapolate_tree_fname, "default")) {
        p->extrapolate_tree_fname = smalloc((strlen(PHAST_HOME)+100)*sizeof(char));
#if defined(__MINGW32__)
        sprintf(p->extrapolate_tree_fname,
                "%s\\data\\exoniphy\\mammals\\cftr25_hybrid.nh", PHAST_HOME);
#else
        sprintf(p->extrapolate_tree_fname,
                "%s/data/exoniphy/mammals/cftr25_hybrid.nh", PHAST_HOME);
#endif
    }
    if (p->extrapolate_tree_fname != NULL)
        p->extrapolate_tree = tr_new_from_file(phast_fopen(p->extrapolate_tree_fname, "r"));

    mods_fname = (optind == argc - 2 ? argv[argc - 1] : NULL);
    /* if there are two args, mods are the second one; otherwise will
       use default mods for coding potential (see below) */

    /* set defaults for coding-potential mode */
    if (coding_potential) {
        char tmp[5000];
        p->two_state = FALSE;
        if (p->cm == NULL)
            p->cm = cm_new_string_or_file("NCATS=4; CNS 1; CDS 2-4");
        if (p->hmm == NULL) {
#if defined(__MINGW32__)
            sprintf(tmp, "%s\\data\\phastCons\\%s", PHAST_HOME,
                    p->indels ? "simple-coding-indels.hmm" : "simple-coding.hmm");
#else
            sprintf(tmp, "%s/data/phastCons/%s", PHAST_HOME,
                    p->indels ? "simple-coding-indels.hmm" : "simple-coding.hmm");
#endif
            if (p->results_f!=NULL)
                fprintf(p->results_f, "Reading HMM from %s...\n", tmp);
            p->hmm = hmm_new_from_file(phast_fopen(tmp, "r"));
        }
        if (mods_fname == NULL) {
#if defined(__MINGW32__)
            sprintf(tmp, "%s\\data\\exoniphy\\mammals\\r3.ncns.mod, %s\\data\\exoniphy\\mammals\\r3.cns.mod, %s\\data\\exoniphy\\mammals\\r3.cds-1.mod, %s\\data\\exoniphy\\mammals\\r3.cds-2.mod, %s\\data\\exoniphy\\mammals\\r3.cds-3.mod",  PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME);
#else
            sprintf(tmp, "\
%s/data/exoniphy/mammals/r3.ncns.mod,\
%s/data/exoniphy/mammals/r3.cns.mod,\
%s/data/exoniphy/mammals/r3.cds-1.mod,\
%s/data/exoniphy/mammals/r3.cds-2.mod,\
%s/data/exoniphy/mammals/r3.cds-3.mod",
                    PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME);
#endif
            mods_fname = tmp;
        }
        if (p->states == NULL)
            p->states = get_arg_list("CDS");
        if (p->pivot_states == NULL)
            p->pivot_states = get_arg_list("background,CNS");
    }
示例#5
0
int main(int argc, char *argv[]) {
  char c;
  char *msa_fname = NULL;
  int opt_idx, i, old_nnodes;
  MSA *msa;
  List *pruned_names = lst_new_ptr(5), *tmpl;
  BDPhyloHmm *bdphmm;
  GFF_Set *predictions;
  int found = FALSE;
  List *ignore_types = lst_new_ptr(1);

  struct option long_opts[] = {
    {"refseq", 1, 0, 'M'},
    {"msa-format", 1, 0, 'i'},
    {"refidx", 1, 0, 'r'},
    {"rho", 1, 0, 'R'},
    {"phi", 1, 0, 'p'},
    {"transitions", 1, 0, 't'},    
    {"expected-length", 1, 0, 'E'},
    {"target-coverage", 1, 0, 'C'},
    {"seqname", 1, 0, 'N'},
    {"idpref", 1, 0, 'P'},
    {"indel-model", 1, 0, 'I'},
    {"indel-history", 1, 0, 'H'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  /* arguments and defaults for options */
  FILE *refseq_f = NULL, *msa_f = NULL;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  TreeModel *source_mod;
  double rho = DEFAULT_RHO, mu = DEFAULT_MU, nu = DEFAULT_NU, 
    phi = DEFAULT_PHI, gamma = -1, omega = -1, 
    alpha_c = -1, beta_c = -1, tau_c = -1,
    alpha_n = -1, beta_n = -1, tau_n = -1;
  int set_transitions = FALSE, refidx = 1, estim_phi = TRUE, 
    estim_gamma = TRUE, estim_omega = TRUE;
  char *seqname = NULL, *idpref = NULL;
  IndelHistory *ih = NULL;

  while ((c = getopt_long(argc, argv, "R:t:p:E:C:r:M:i:N:P:I:H:h", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'R':
      rho = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 't':
      if (optarg[0] != '~') estim_gamma = estim_omega = FALSE;
      else optarg = &optarg[1];
      set_transitions = TRUE;
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 2) 
        die("ERROR: bad argument to --transitions.\n");
      mu = lst_get_dbl(tmpl, 0);
      nu = lst_get_dbl(tmpl, 1);
      if (mu <= 0 || mu >= 1 || nu <= 0 || nu >= 1)
        die("ERROR: bad argument to --transitions.\n");
      lst_free(tmpl);
      break;
    case 'p':
      if (optarg[0] != '~') estim_phi = FALSE;
      else optarg = &optarg[1];
      phi = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'E':
      if (optarg[0] != '~') estim_omega = FALSE;
      else optarg = &optarg[1];
      omega = get_arg_dbl_bounds(optarg, 1, INFTY);
      mu = 1/omega;
      break;
    case 'C':
      if (optarg[0] != '~') estim_gamma = FALSE;
      else optarg = &optarg[1];
      gamma = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'r':
      refidx = get_arg_int_bounds(optarg, 0, INFTY);
      break;
    case 'M':
      refseq_f = phast_fopen(optarg, "r");
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT)
        die("ERROR: unrecognized alignment format.\n");
      break;
    case 'N':
      seqname = optarg;
      break;
    case 'P':
      idpref = optarg;
      break;
    case 'I':
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 3 && lst_size(tmpl) != 6)
        die("ERROR: bad argument to --indel-model.\n");
      alpha_n = lst_get_dbl(tmpl, 0);
      beta_n = lst_get_dbl(tmpl, 1);
      tau_n = lst_get_dbl(tmpl, 2);
      if (lst_size(tmpl) == 6) {
        alpha_c = lst_get_dbl(tmpl, 3);
        beta_c = lst_get_dbl(tmpl, 4);
        tau_c = lst_get_dbl(tmpl, 5);
      }
      else {
        alpha_c = alpha_n; beta_c = beta_n; tau_c = tau_n;
      }
      if (alpha_c <= 0 || alpha_c >= 1 || beta_c <= 0 || beta_c >= 1 || 
          tau_c <= 0 || tau_c >= 1 || alpha_n <= 0 || alpha_n >= 1 || 
          beta_n <= 0 || beta_n >= 1 || tau_n <= 0 || tau_n >= 1)
        die("ERROR: bad argument to --indel-model.\n");
      break;
    case 'H':
      fprintf(stderr, "Reading indel history from %s...\n", optarg);
      ih = ih_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("Bad argument.  Try 'dless -h'.\n");
    }
  }

  if (optind != argc - 1)
    die("Missing alignment file or model file.  Try 'dless -h'.\n");

  if (set_transitions && (gamma != -1 || omega != -1))
    die("ERROR: --transitions and --target-coverage/--expected-length cannot be used together.\n");

  if ((gamma != -1 && omega == -1) || (gamma == -1 && omega != -1))
    die("ERROR: --target-coverage and --expecteed-length must be used together.\n");

  set_seed(-1);

  if (gamma != -1)
    nu = gamma/(1-gamma) * mu;

  fprintf(stderr, "Reading tree model from %s...\n", argv[optind]);
  source_mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1);

  if (source_mod->nratecats > 1) 
    die("ERROR: rate variation not currently supported.\n");

  if (source_mod->order > 0)
    die("ERROR: only single nucleotide models are currently supported.\n");

  if (!tm_is_reversible(source_mod))
    phast_warning("WARNING: p-value computation assumes reversibility and your model is non-reversible.\n");

  /* read alignment */
  msa_f = phast_fopen(argv[optind], "r");

  fprintf(stderr, "Reading alignment from %s...\n", argv[optind]);
  if (msa_format == UNKNOWN_FORMAT) 
    msa_format = msa_format_for_content(msa_f, 1);

  if (msa_format == MAF) {
    msa = maf_read(msa_f, refseq_f, 1, NULL, NULL, NULL, -1, TRUE, NULL, 
                   NO_STRIP, FALSE); 
  }
  else 
    msa = msa_new_from_file_define_format(msa_f, msa_format, NULL);

  if (msa_alph_has_lowercase(msa)) msa_toupper(msa); 
  msa_remove_N_from_alph(msa);

  if (msa->ss == NULL) {
    fprintf(stderr, "Extracting sufficient statistics...\n");
    ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0);
  }
  else if (msa->ss->tuple_idx == NULL)
    die("ERROR: ordered representation of alignment required unless --suff-stats.\n");

  /* prune tree, if necessary */
  old_nnodes = source_mod->tree->nnodes;
  tm_prune(source_mod, msa, pruned_names);

  if (lst_size(pruned_names) == (old_nnodes + 1) / 2)
    die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n");
  if (lst_size(pruned_names) > 0) {
    fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment (");
    for (i = 0; i < lst_size(pruned_names); i++)
      fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars, 
              i < lst_size(pruned_names) - 1 ? ", " : ").\n");
  }

  /* this has to be done after pruning tree */
  tr_name_ancestors(source_mod->tree);

  /* also make sure match for reference sequence in tree */
  if (refidx > 0) {
    for (i = 0, found = FALSE; !found && i < source_mod->tree->nnodes; i++) {
      TreeNode *n = lst_get_ptr(source_mod->tree->nodes, i);
      if (!strcmp(n->name, msa->names[refidx-1]))
        found = TRUE;
    }
    if (!found) die("ERROR: no match for reference sequence in tree.\n");
  }

  /* checks for indel model */
  if (alpha_c > 0) {
    if (ih == NULL) {
      fprintf(stderr, "Reconstructing indel history by parsimony...\n");
      ih = ih_reconstruct(msa, source_mod->tree);
    }
    else {
      if (ih->ncols != msa->length)
        die("ERROR: indel history doesn't seem to match alignment.\n");
      if (ih->tree->nnodes != source_mod->tree->nnodes)
        die("ERROR: indel history doesn't seem to match tree model.\n");
    }
  }

  bdphmm = bd_new(source_mod, rho, mu, nu, phi, alpha_c, beta_c, tau_c, 
                  alpha_n, beta_n, tau_n, estim_gamma, estim_omega, 
                  estim_phi);

  /* compute emissions */
  phmm_compute_emissions(bdphmm->phmm, msa, FALSE);

  /* add emissions for indel model, if necessary */
  if (alpha_c > 0) {
    fprintf(stderr, "Adjusting emissions for indels...\n");
    bd_add_indel_emissions(bdphmm, ih);
  }

  /* postprocess for missing data (requires special handling) */
  fprintf(stderr, "Adjusting emissions for missing data...\n");
  bd_handle_missing_data(bdphmm, msa);

  if (estim_gamma || estim_omega || estim_phi) {
    fprintf(stderr, "Estimating free parameters...\n");
    bd_estimate_transitions(bdphmm, msa);
  }

  /* set seqname and idpref, if necessary */
  if (seqname == NULL || idpref == NULL) {
    /* derive default from file name root */
    String *tmp = str_new_charstr(msa_fname);
    if (!str_equals_charstr(tmp, "-")) {
      str_remove_path(tmp);
      str_root(tmp, '.');
      if (idpref == NULL) idpref = copy_charstr(tmp->chars);
      str_root(tmp, '.');         /* apply one more time for double suffix */
      if (seqname == NULL) seqname = tmp->chars;    
    }
    else if (seqname == NULL) seqname = "refseq";
  }

  /* obtain predictions */
  fprintf(stderr, "Running Viterbi algorithm...\n");
  predictions = phmm_predict_viterbi(bdphmm->phmm, seqname, NULL, idpref, NULL);
  lst_push_ptr(ignore_types, str_new_charstr("nonconserved"));
  gff_filter_by_type(predictions, ignore_types, TRUE, NULL);

  /* score predictions */
  fprintf(stderr, "Scoring predictions...\n");
  bd_score_predictions(bdphmm, predictions);
  
  /* can free emissions now */
  for (i = 0; i < bdphmm->phmm->hmm->nstates; i++)
    sfree(bdphmm->phmm->emissions[i]);
  sfree(bdphmm->phmm->emissions);
  bdphmm->phmm->emissions = NULL;

  /* convert GFF to coord frame of reference sequence and adjust
     coords by idx_offset, if necessary  */
  if (refidx != 0 || msa->idx_offset != 0)
    msa_map_gff_coords(msa, predictions, 0, refidx, msa->idx_offset);

  if (refidx != 0) 
    gff_flatten(predictions);	
  /* necessary because coord conversion might create overlapping
     features (can happen in deletions in reference sequence) */

  /* now output predictions */
  fprintf(stderr, "Writing GFF to stdout...\n");
  gff_print_set(stdout, predictions);

  fprintf(stderr, "Done.\n");
  
  return 0;
}
int main(int argc, char* argv[]) {
  FILE* F;
  TreeModel *model;
  int i, j, k, alph_size, nstates, do_eqfreqs = 0, exch_mode = 0, 
    list_mode = 0, latex_mode = 0, suppress_diag = 0, ti_tv = 0, 
    scientific_mode = 0,
    induced_aa = 0, do_stop_codons = 0, do_zeroes = 0, symmetric = 0, 
    context_ti_tv = 0, all_branches = 0;
  int startcol, endcol, ncols, branch_no = 0, matrix_idx = 0;
/*   int aa_inv[256]; */
  double t = -1, total_ti = 0, total_tv = 0, rho_s = 0, cpg_ti = 0, 
    cpg_tv = 0, non_cpg_ti = 0, non_cpg_tv = 0, cpg_eqfreq = 0;
  char *rate_format_string = "%8.6f";
  MarkovMatrix *M;
  char c;
  char tuple[5], tuple2[5]; /* , aa_alph[50]; */
  char *subst_mat_fname = NULL, *subst_score_fname = NULL, 
    *subst_mat_fname_paml = NULL, *order1_mod_fname = NULL;
  Matrix *subst_mat = NULL;
  List *matrix_list = lst_new_ptr(20), *traversal = NULL;

  while ((c = (char)getopt(argc, argv, "t:fedlLiM:N:A:B:aszSECh")) != -1) {
   switch(c) {
    case 't':
      if (optarg[0] == 'A') all_branches = 1;
      else t = get_arg_dbl_bounds(optarg, 0, INFTY);
      break;
    case 'f':
      do_eqfreqs = 1;
      break;
    case 'e':
      exch_mode = 1;
      break;
    case 'd':
      suppress_diag = 1;
      break;
    case 'l':
      list_mode = 1;
      break;
    case 'L':
      latex_mode = 1;
      break;
    case 'i':
      ti_tv = 1;
      break;
    case 'M':
      subst_mat_fname = optarg;
      induced_aa = 1;
      break;
    case 'N':
      subst_mat_fname_paml = optarg;
      induced_aa = 1;
      break;
    case 'A':
      subst_score_fname = optarg;
      break;
    case 'B':
      order1_mod_fname = optarg;
      break;
    case 'a':
      induced_aa = 1;
      do_zeroes = 1;
      break;
    case 's':
      do_stop_codons = 1;
      break;
    case 'z':
      do_zeroes = 1;
      break;
    case 'S':
      symmetric = 1;
      break;
    case 'E':
      scientific_mode = 1;
      rate_format_string = "%13.6e";
      break;
    case 'C':
      context_ti_tv = 1;
      break;
    case 'h':
      print_usage();
      exit(0);
    case '?':
      die("Unrecognized option.  Try \"display_rate_matrix -h\" for help.\n");
    }
  }

  set_seed(-1);

  if ((t >= 0 && exch_mode) || (latex_mode && list_mode) || 
      ((ti_tv || subst_mat_fname != NULL || subst_score_fname != NULL || 
        subst_mat_fname_paml != NULL || scientific_mode) && !list_mode) || 
      (subst_mat_fname != NULL && subst_score_fname != NULL) || 
      (subst_score_fname != NULL && subst_mat_fname_paml != NULL) || 
      (subst_mat_fname != NULL && subst_mat_fname_paml != NULL) || 
      optind != argc - 1) {
    die("ERROR: missing required arguments or illegal combination of arguments.\nTry \"display_rate_matrix -h\" for help.\n");
  }

  F = phast_fopen(argv[optind], "r");
  model = tm_new_from_file(F, 1);

  if (context_ti_tv) {
    /* this option requires completely different handling from the others */
    if (model->order != 2) { 
      die("ERROR: -C requires a model of order 3.\n");
    }
    do_context_dependent_ti_tv(model);
    exit(0);
  }

  if (induced_aa) {
    TreeModel *aa_model = tm_induced_aa(model);
    char *codon_to_aa = get_codon_mapping(model->rate_matrix->states);

    /* before freeing model, grab the expected rate of synonymous
       subst, rho_s */
    for (i = 0; i < model->rate_matrix->size; i++)
      for (j = 0; j < model->rate_matrix->size; j++)
        if (i != j && codon_to_aa[i] == codon_to_aa[j])
          rho_s += mm_get(model->rate_matrix, i, j) * 
            vec_get(model->backgd_freqs, i);

    sfree(codon_to_aa);

    tm_free(model);
    model = aa_model;
  }

  if (all_branches) {
    traversal = tr_inorder(model->tree);
    for (matrix_idx = 0; matrix_idx < lst_size(traversal); matrix_idx++) {
      TreeNode *n = lst_get_ptr(traversal, matrix_idx);
      if (n->parent == NULL) { lst_push_ptr(matrix_list, NULL); continue; }
      M = mm_new(model->rate_matrix->size, model->rate_matrix->states, DISCRETE);
      mm_exp(M, model->rate_matrix, n->dparent);
      lst_push_ptr(matrix_list, M);      
    }
  }
  else if (t >= 0) {
    M = mm_new(model->rate_matrix->size, model->rate_matrix->states, DISCRETE);
    mm_exp(M, model->rate_matrix, t);
    lst_push_ptr(matrix_list, M);
  }
  else 
    lst_push_ptr(matrix_list, model->rate_matrix);

  alph_size = (int)strlen(model->rate_matrix->states);
  nstates = model->rate_matrix->size;

  if (subst_mat_fname != NULL) {
    if ((F = fopen(subst_mat_fname, "r")) == NULL) {
      die("ERROR: Can't open %s.\n", subst_mat_fname);
    }    
    subst_mat = read_subst_mat(F, AA_ALPHABET); 
  }
  else if (subst_mat_fname_paml != NULL) {
    if ((F = fopen(subst_mat_fname_paml, "r")) == NULL) {
      die("ERROR: Can't open %s.\n", subst_mat_fname_paml);
    }    
    subst_mat = read_paml_matrix(F, AA_ALPHABET); 
  }
  else if (subst_score_fname != NULL) {
    if ((F = fopen(subst_score_fname, "r")) == NULL) {
      die("ERROR: Can't open %s.\n", subst_score_fname);
    }    
    subst_mat = read_subst_scores(model, F);
  }
  else if (order1_mod_fname != NULL) {
    if ((F = fopen(order1_mod_fname, "r")) == NULL) {
      die("ERROR: Can't open %s.\n", order1_mod_fname);
    }    
    subst_mat = unproject_rates(model, tm_new_from_file(F, 1));
  }

  /* loop through matrices to print */
  for (matrix_idx = 0; matrix_idx < lst_size(matrix_list); matrix_idx++) {
    M = lst_get_ptr(matrix_list, matrix_idx);

    if (all_branches) {
      if (M == NULL) continue;  /* root */
      printf("BRANCH %d (t = %.6f)\n", ++branch_no,
             ((TreeNode*)lst_get_ptr(traversal, matrix_idx))->dparent);
    }

  /* print no more than 16 columns at a time (except with -a) */
  ncols = (induced_aa ? nstates : 16);
  for (startcol = 0; startcol < nstates; startcol += ncols) {
    endcol = min(nstates, startcol+ncols);

    /* table header */
    if (! list_mode) {
      if (latex_mode) {
        printf("\\begin{tabular}{|c|");
        for (i = startcol; i < endcol; i++) printf("r");
        printf("|}\n\\hline\n");
      }
      printf("%-5s ", "");
      if (latex_mode) printf("& ");
      for (i = startcol; i < endcol; i++) {
        get_state_tuple(model, tuple, i);
        if (latex_mode) {
          printf("{\\bf %s}", tuple);
          if (i < endcol-1) printf("& ");
        }
        else printf("%8s ", tuple);
    }
      if (latex_mode) printf("\\\\\n\\hline\n");
      else printf("\n");
    }

    /* table or list contents */
    for (i = 0; i < nstates; i++) {
      if (induced_aa && AA_ALPHABET[i] == '$' && !do_stop_codons) continue;
      get_state_tuple(model, tuple, i);

      /* get total eq freq of tuples containing CpG dinucs */
      for (k = 0; k < model->order; k++) {
        if (tuple[k] == 'C' && tuple[k+1] == 'G') {
          cpg_eqfreq += vec_get(model->backgd_freqs, i);
/*           printf("***CPG***"); */
          break;
        }
      }

      if (latex_mode) printf("{\\bf %s}& ", tuple);
      else if (!list_mode) printf("%-5s ", tuple);
      for (j = startcol; j < endcol; j++) {
        if (induced_aa && AA_ALPHABET[j] == '$' && !do_stop_codons) continue;
        if (latex_mode) printf("$");
        if (list_mode) {
          if (symmetric && j <= i) continue;
          else if ((t < 0 && ! all_branches) 
		   && (i == j || (!do_zeroes && mm_get(M, i, j) == 0))) 
            continue;
          get_state_tuple(model, tuple2, j);
          printf("%-5s %-5s ", tuple, tuple2);
        }
        if (i == j && suppress_diag && !list_mode) printf("%-7s", "-");
        else { 
	  /* get rate or probability */
	  double val = exch_mode == 0 ? mm_get(M, i, j) : 
	    safediv(mm_get(M, i, j), vec_get(model->backgd_freqs,j));
	  /* print value in format %8.6f or %13.6e */
	  printf(rate_format_string, val); 
	  printf(" ");
	}
        if (latex_mode) {
          printf("$");
          if (j < endcol-1) printf("& ");
        }
        else if (list_mode) {
          int ti, is_cpg;
          if (ti_tv) {
            ti = -1;
            is_cpg = 0;
            for (k = 0; k <= model->order; k++) {
              int dig_i = (i % int_pow(alph_size, k+1)) / int_pow(alph_size, k);
              int dig_j = (j % int_pow(alph_size, k+1)) / int_pow(alph_size, k);
              char next_char = '\0', prev_char = '\0';
              if (dig_i != dig_j) {
                ti = is_transition(M->states[dig_i], M->states[dig_j]);
                if (k != model->order)
                  prev_char = M->states[(i % int_pow(alph_size, k+2)) / 
                                        int_pow(alph_size, k+1)];
                if (k != 0)
                  next_char = M->states[(i % int_pow(alph_size, k)) / 
                                        int_pow(alph_size, k-1)];
                if ((M->states[dig_i] == 'C' && next_char == 'G') || 
                    (M->states[dig_i] == 'G' && prev_char == 'C')) 
                  is_cpg = 1;
              }
            }
	    if (ti == -1)
	      die("ERROR ti=-1\n");
            printf("%5s ", ti ? "ti" : "tv");
/*             printf("%5s ", is_cpg ? "CPG" : "-"); */
            if (ti) {
              total_ti += mm_get(M, i, j) * 
                vec_get(model->backgd_freqs, i);
              if (is_cpg) 
                cpg_ti += mm_get(M, i, j) * 
                  vec_get(model->backgd_freqs, i);
              else non_cpg_ti += mm_get(M, i, j) * 
                     vec_get(model->backgd_freqs, i);
            }
            else {
              total_tv += mm_get(M, i, j) * 
                vec_get(model->backgd_freqs, i);
              if (is_cpg)
                cpg_tv += mm_get(M, i, j) * 
                  vec_get(model->backgd_freqs, i);
              else non_cpg_tv += mm_get(M, i, j) * 
                     vec_get(model->backgd_freqs, i);
            }
          }
          if (subst_mat != NULL) {
            if (mat_get(subst_mat, i, j) == NEGINFTY) 
              printf("%8s", "-"); 
            else printf("%8.4f", mat_get(subst_mat, i, j)); 
          }
          printf("\n");
        }
      }
      if (latex_mode) printf("\\\\\n");
      else if (!list_mode) printf("\n");
    }
    
    /* equilibrium freqs (table case only) */
    if (do_eqfreqs && ! list_mode) {
      if (latex_mode) 
        printf("\\hline\n$\\boldsymbol{\\mathbf{\\pi}}$&");
      else 
        printf("%-5s ", "pi");
      for (i = startcol; i < endcol; i++) {
        if (latex_mode) 
          printf("$%8.4f$ ", vec_get(model->backgd_freqs, i));      
        else 
          printf("%8.4f ", vec_get(model->backgd_freqs, i));      
        if (latex_mode && i < endcol-1) printf("& ");
      }
      if (latex_mode) printf("\\\\\n");
      else printf("\n");
    }

    if (latex_mode) printf("\\hline\n\\end{tabular}\n\n");
  }

  /* equilibrium freqs (list case only) */
  if (do_eqfreqs &&  list_mode) {
    for (i = 0; i < nstates; i++) {
      get_state_tuple(model, tuple, i);
      printf("%-5s %-5s ", "-", tuple); //!!
      printf(rate_format_string, vec_get(model->backgd_freqs, i)); 
      printf("\n");
    }
  }
  
  if (ti_tv && list_mode) {
    printf("\n#Total ti/tv = %.4f\n", total_ti/total_tv);
    printf("#CpG ti ratio = %.4f, CpG tv ratio = %.4f\n", 
           cpg_ti/non_cpg_ti /* * (1 - cpg_eqfreq) */ / cpg_eqfreq, 
           cpg_tv/non_cpg_tv /* * (1 - cpg_eqfreq) */ / cpg_eqfreq);
  }
  else if (induced_aa) 
    printf("\n#Total rho_s/rho_v = %.4f\n", rho_s/(3-rho_s));

  if (all_branches == 1) printf("\n\n");
  }

  tm_free(model);
  lst_free(matrix_list);

  return 0;
}
示例#7
0
int main(int argc, char *argv[]) {
  /* variables for options, with defaults */
  TreeNode *tree = NULL, *merge_tree = NULL, *extrapolate_tree = NULL;
  Hashtable *rename_hash = NULL;
  double scale_factor = 1;
  List *prune_names = NULL, *label = NULL, *labelType = NULL;
  int prune_all_but = FALSE, tree_only = FALSE, dissect = FALSE,
    name_ancestors = FALSE, with_branch = FALSE, print_branchlen=FALSE,
    inNewick=FALSE, no_branchlen = FALSE, print_distance_to_root = FALSE;
  TreeModel *mod = NULL, *merge_mod = NULL;
  char *reroot_name = NULL, *subtree_name =NULL, *get_subtree_name = NULL,
    *node_distance_name = NULL;
  
  /* other variables */
  String *suffix,  *optstr;
  char c;
  int i, opt_idx;
  TreeNode *n;

  struct option long_opts[] = {
    {"scale", 1, 0, 's'},
    {"extrapolate", 1, 0, 'e'},
    {"prune", 1, 0, 'p'},
    {"prune-all-but", 1, 0, 'P'},
    {"get-subtree", 1, 0, 'g'},
    {"merge", 1, 0, 'm'},
    {"rename", 1, 0, 'r'},
    {"tree-only", 0, 0, 't'},
    {"no-branchlen", 0, 0, 'N'},
    {"dissect", 0, 0, 'd'},
    {"name-ancestors", 0, 0, 'a'},
    {"reroot", 1, 0, 'R'},
    {"with-branch", 1, 0, 'B'},
    {"subtree", 1, 0, 'S'},
    {"branchlen", 0, 0, 'b'},
    {"newick", 0, 0, 'n'},
    {"label-subtree", 1, 0, 'L'},
    {"label-branches", 1, 0, 'l'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = getopt_long(argc, argv, "s:p:P:g:m:r:R:B:S:D:l:L:adtNbnh", 
                          long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 's':
      scale_factor = get_arg_dbl_bounds(optarg, 0, INFTY);
      break;
    case 'e':
      if (!strcmp(optarg, "default")) {
        optarg = smalloc(1000 * sizeof(char));
        #if defined(__MINGW32__)
          sprintf(optarg, "%s\\data\\exoniphy\\mammals\\cftr25_hybrid.nh",
		  PHAST_HOME);
        #else
          sprintf(optarg, "%s/data/exoniphy/mammals/cftr25_hybrid.nh", 
                  PHAST_HOME);
        #endif
      }
      extrapolate_tree = tr_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'p':
      prune_names = get_arg_list(optarg);
      break;
    case 'P':
      prune_names = get_arg_list(optarg);
      prune_all_but = TRUE;
      break;
    case 'g':
      get_subtree_name = optarg;
      break;
    case 'm':
      suffix = str_new_charstr(optarg);
      str_suffix(suffix, '.');
      if (str_equals_charstr(suffix, "nh"))
        merge_tree = tr_new_from_file(phast_fopen(optarg, "r"));
      else {
        merge_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1);
        merge_tree = merge_mod->tree;
      }
      break;
    case 'r':
      rename_hash = make_name_hash(optarg);
      break;
    case 't':
      tree_only = TRUE;
      break;
    case 'N':
      no_branchlen = TRUE;
      tree_only = TRUE;
      break;
    case 'd':
      dissect = TRUE;
      break;
    case 'b':
      print_branchlen = TRUE;
      break;
    case 'D':
      print_distance_to_root = TRUE;
      node_distance_name = optarg;
      break;
    case 'R':
      reroot_name = optarg;
      break;
    case 'B':
      with_branch = TRUE;
      break;
    case 'a':
      name_ancestors = TRUE;
      break;
    case 'S':
      subtree_name = optarg;
      break;
    case 'n':
      inNewick=TRUE;
      break;
    case 'L':  //do the same for --label--subtree and --label-branches
    case 'l':
      if (label == NULL) {
	label = lst_new_ptr(1);
	labelType = lst_new_int(1);
      }
      optstr = str_new_charstr(optarg);
      lst_push_ptr(label, optstr);
      lst_push_int(labelType, (int)c);
      break;
    case 'h':
      usage(argv[0]);
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  if (optind != argc - 1) 
    die("Input filename required.  Try '%s -h'.\n", argv[0]);

  if (merge_tree != NULL && extrapolate_tree != NULL)
    die("ERROR: Can't use --merge and --extrapolate together");

  set_seed(-1);
    
  suffix = str_new_charstr(argv[optind]);
  str_suffix(suffix, '.');
  if (inNewick || str_equals_charstr(suffix, "nh")) {
    tree = tr_new_from_file(phast_fopen(argv[optind], "r"));
    tree_only = TRUE;           /* can't output tree model in this case */
  }
  else {
    mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1);
    tree = mod->tree;
  }

  if (prune_names != NULL) {
    tr_prune(&tree, prune_names, prune_all_but, NULL);
    if (mod != NULL) mod->tree = tree; /* root may have changed */
  }

  if (get_subtree_name != NULL) {
    n = tr_get_node(tree, get_subtree_name);
    if (n == NULL) {
      tr_name_ancestors(tree);
      n = tr_get_node(tree, get_subtree_name);
      if (n == NULL) {
	die("ERROR: no node named '%s'.\n", subtree_name);
      }
    }
    tr_prune_supertree(&tree, n);
    if (mod != NULL) mod->tree = tree;
  }

  if (merge_tree != NULL) {
    tree = tr_hybrid(tree, merge_tree);
    if (mod != NULL) mod->tree = tree;
  }

  else if (extrapolate_tree != NULL) {
    tr_scale_by_subtree(extrapolate_tree, tree);
    tree = extrapolate_tree;
    if (mod != NULL) mod->tree = tree;
  }

  if (scale_factor != 1) {
    if (subtree_name == NULL)
      tr_scale(tree, scale_factor);
    else {
      n = tr_get_node(tree, subtree_name);
      if (n == NULL) die("ERROR: no node named '%s'.\n", subtree_name);
      tr_scale_subtree(tree, n, scale_factor, with_branch);
    }
  }

  if (name_ancestors)
    tr_name_ancestors(tree);

  if (rename_hash != NULL) {
    char *newname;
    for (i = 0; i < tree->nnodes; i++) {
      n = lst_get_ptr(tree->nodes, i);
      if (n->name != NULL && n->name[0] != '\0' && 
          (newname = hsh_get(rename_hash, n->name)) != (char*)-1) {
        strcpy(n->name, newname);
      }
    }
  }

  if (reroot_name != NULL) {
    n = tr_get_node(tree, reroot_name);
    if (n == NULL) die("ERROR: no node named '%s'.\n", reroot_name);
    tr_reroot(tree, n, with_branch);
    if (mod != NULL) mod->tree = with_branch ? n->parent : n;
    tree = with_branch ? n->parent : n;
  }

  if (label != NULL) {
    for (i=0; i < lst_size(label); i++) {
      String *currstr = (String*)lst_get_ptr(label, i), *arg1, *labelVal;
      List *tmplst = lst_new_ptr(10);
      String *nodename;
      int j;
      str_split(currstr, ":", tmplst);
      if (lst_size(tmplst) != 2) 
	die("ERROR: bad argument to --label-branches or --label-subtree.\n");
      arg1 = lst_get_ptr(tmplst, 0);
      labelVal = lst_get_ptr(tmplst, 1);
      lst_clear(tmplst);
      if (lst_get_int(labelType, i) == (int)'l') {
	str_split(arg1, ",", tmplst);
	for (j=0; j < lst_size(tmplst); j++) {
	  nodename = (String*)lst_get_ptr(tmplst, j);
	  tr_label_node(tree, nodename->chars, labelVal->chars);
	}
	lst_free_strings(tmplst);
      } else if (lst_get_int(labelType, i) == (int)'L') {
	int include_leading_branch = FALSE;
	TreeNode *node;
	nodename = arg1;
	node = tr_get_node(tree, nodename->chars);
	if (node == NULL && nodename->chars[nodename->length-1] == '+') {
	  nodename->chars[--nodename->length] = '\0';
	  node = tr_get_node(tree, nodename->chars);
	  include_leading_branch = TRUE;
	}
	tr_label_subtree(tree, nodename->chars, include_leading_branch, 
			 labelVal->chars);
      } else die("ERROR got label_type %c\n", lst_get_int(labelType, (char)i));
      str_free(arg1);
      str_free(labelVal);
      lst_free(tmplst);
      str_free(currstr);
    }
    lst_free(label);
    lst_free(labelType);
  }

  if (dissect) 
    tr_print_nodes(stdout, tree);
  if (print_branchlen) 
    printf("TOTAL_TREE_LEN: %f\n", tr_total_len(tree));
  if (print_distance_to_root) {
    TreeNode *node = tr_get_node(tree, node_distance_name);
    if (node == NULL) 
      die("ERROR: no node named '%s'.\n", node_distance_name);
    printf("length(root-%s): %f\n", node_distance_name, 
	   tr_distance_to_root(node));
  }

  if (dissect==0 && print_branchlen==0 && print_distance_to_root==0) {
    if (tree_only)
      tr_print(stdout, tree, no_branchlen==FALSE);
    else
      tm_print(stdout, mod);
  }
  return 0;
}