/* Exclude stop codons from all CDS in a group, as necessary.  Record
   any features that are changed, so they can be changed back before
   data is output */
void exclude_stops(GFF_FeatureGroup *group, List *starts_adjusted, 
                   List *ends_adjusted) {
  int j, k;
  List *stops = lst_new_ptr(1), *gfeatures = group->features;
  GFF_Feature *feat;
  lst_clear(stops); lst_clear(ends_adjusted); lst_clear(starts_adjusted);
  for (j = 0; j < lst_size(gfeatures); j++) { /* first grab all stops.  We 
                                                 expect at most one, but more 
                                                 are possible */
    feat = lst_get_ptr(gfeatures, j);
    if (str_equals_charstr(feat->feature, GFF_STOP_TYPE)) lst_push_ptr(stops, feat);
  }
  for (j = 0; j < lst_size(gfeatures); j++) { /* now look at CDSs */
    feat = lst_get_ptr(gfeatures, j);
    if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) {
      for (k = 0; k < lst_size(stops); k++) { /* check stops */
        GFF_Feature *stop = lst_get_ptr(stops, k);
        if (feat->strand == '+' && stop->strand == '+' && 
            feat->end == stop->end) {
          feat->end -= 3; 
          lst_push_ptr(ends_adjusted, feat);
        }
        else if (feat->strand == '-' && stop->strand == '-' && 
                 feat->start == stop->start) {
          feat->start += 3; 
          lst_push_ptr(starts_adjusted, feat);
        }
      }
    }
  }
  lst_free(stops);
}
/* Restore cds coords to include stop codons, as necessary */
void restore_stops(GFF_FeatureGroup *group, List *starts_adjusted,
                   List *ends_adjusted) {
  int j;
  if (lst_size(ends_adjusted) == 0 && lst_size(starts_adjusted) == 0)
    return;
  for (j = 0; j < lst_size(group->features); j++) {
    GFF_Feature *feat = lst_get_ptr(group->features, j);
    if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) {
      if (lst_find_ptr(ends_adjusted, feat) != -1) feat->end += 3;
      else if (lst_find_ptr(starts_adjusted, feat) != -1) feat->start -= 3;
    }
  }
}
int main(int argc, char *argv[]) {

  int check_start = 0, check_stop = 0, check_splice = 0, check_nonsense = 0,
    offset5 = 0, offset3 = 0, opt_idx, i, j, indel_strict = 0, no_output = 0,
    check_alignment = 0, splice_strict = 0;
  int ncons_tested, nkept, nconserved_exons;
  int nce_gap_type[NGAP_TYPES], nconsid[NTYPES], nfail[NTYPES];
  double Nfrac = 0.05;
  char c;
  MSA *msa;
  GFF_Set *gff;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  List *keepers, *problems = lst_new_ptr(10), 
    *ends_adjusted = lst_new_ptr(1), *starts_adjusted = lst_new_ptr(1), 
    *discards=NULL, *intron_splice = lst_new_ptr(10);
  char *rseq_fname = NULL;
  FILE *logf = NULL, *mlogf = NULL, *statsf = NULL, *discardf = NULL;
  cds_gap_type fshift_mode = FSHIFT_BAD;
  char *groupby = "transcript_id";
  msa_coord_map *map;
  int *countNs, *countCDSs;
  FILE *infile;
  char *msa_fname;

  struct option long_opts[] = {
    {"start", 0, 0, 's'},
    {"stop", 0, 0, 't'},
    {"splice", 0, 0, 'l'},
    {"nonsense", 0, 0, 'n'},
    {"fshift", 0, 0, 'f'},
    {"conserved", 0, 0, 'c'},
    {"N-limit", 1, 0, 'N'},
    {"clean-gaps", 0, 0, 'e'},
    {"indel-strict", 0, 0, 'I'},
    {"splice-strict", 0, 0, 'C'},
    {"groupby", 1, 0, 'g'},
    {"msa-format", 1, 0, 'i'},
    {"refseq", 1, 0, 'r'},
    {"offset5", 1, 0, 'o'},
    {"offset3", 1, 0, 'p'},
    {"no-output", 0, 0, 'x'},
    {"discards", 1, 0, 'd'},
    {"log", 1, 0, 'L'},
    {"machine-log", 1, 0, 'M'},
    {"stats", 1, 0, 'S'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = (char)getopt_long(argc, argv, "N:i:r:L:M:S:g:d:stlnfceICxh", 
                          long_opts, &opt_idx)) != -1) {
    switch(c) {
    case 's':
      check_alignment = check_start = 1;
      break;
    case 't':
      check_alignment = check_stop = 1;
      break;
    case 'l':
      check_alignment = check_splice = 1;
      break;
    case 'n':
      check_alignment = check_nonsense = 1;
      break;
    case 'f':
      check_alignment = 1;
      fshift_mode = FSHIFT_OK;
      break;
    case 'c':
      check_alignment = check_start = check_stop = check_splice = check_nonsense = 1;
      if (fshift_mode < FSHIFT_OK) fshift_mode = FSHIFT_OK;
      break;
    case 'N':
      Nfrac = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'e':
      check_alignment = 1;
      if (fshift_mode < CLN_GAPS) fshift_mode = CLN_GAPS;
      break;
    case 'I':
      check_alignment = 1;
      fshift_mode = NOVRLP_CLN_GAPS;
      indel_strict = 1;
      break;
    case 'C':
      check_alignment = check_splice = splice_strict = 1;
      break;
    case 'g':
      groupby = optarg;
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT) die("Bad alignment format.\n");
      break;
    case 'r':
      rseq_fname = optarg;
      break;
    case 'o':
      offset5 = get_arg_int(optarg);
      break;
    case 'p':
      offset3 = get_arg_int(optarg);
      break;
    case 'L':
      logf = phast_fopen(optarg, "w+");
      break;
    case 'M':
      mlogf = phast_fopen(optarg, "w+");
      break;
    case 'S':
      statsf = phast_fopen(optarg, "w+");
      break;
    case 'd':
      discardf = phast_fopen(optarg, "w+");
      break;
    case 'x':
      no_output = 1;
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("ERROR: Bad argument.  Try the --help option.\n");
    }
  }

  if (optind + 1 >= argc ) {
    die("ERROR:  Missing required arguments.  Try the --help option.\n");
  }
  
  set_seed(-1);

  gff = gff_read_set(phast_fopen(argv[optind], "r"));
  msa_fname = argv[optind+1];
  infile = phast_fopen(msa_fname, "r");
  if (msa_format == UNKNOWN_FORMAT)
    msa_format = msa_format_for_content(infile, 1);
  if (msa_format == MAF) {
    msa = maf_read(infile, 
                   rseq_fname == NULL ? NULL : phast_fopen(rseq_fname, "r"), 
                   1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); 
  }
  else {
    msa = msa_new_from_file_define_format(infile,
                            msa_format, NULL); 
    if (msa->ss == NULL) 
      ss_from_msas(msa, 1, 1, NULL, NULL, NULL, -1, 0);
  }
  if (!msa->ss->tuple_idx)
    die("ERROR: need ordered tuples\n");
  msa_remove_N_from_alph(msa);  /* for backward compatibility (old SS files) */

  if (msa->idx_offset != 0) {   /* avoids offset problem */
    for (i = 0; i < lst_size(gff->features); i++) {
      GFF_Feature *f = lst_get_ptr(gff->features, i);
      f->start -= msa->idx_offset;
      f->end -= msa->idx_offset;
    }
  }

  /* set up coordinate map; assume GFF is for sequence 1 */
  map = msa_build_coord_map(msa, 1);

  /* convert all features */
  for (i = 0; i < lst_size(gff->features); i++) {
    GFF_Feature *f = lst_get_ptr(gff->features, i);
    int newstart, newend;
 
    if (f->start < 0 || f->end < f->start)
      die("ERROR: bad feature in GFF (start=%d, end=%d).\n",
          f->start, f->end);

    newstart = msa_map_seq_to_msa(map, f->start);
    newend = msa_map_seq_to_msa(map, f->end);

    if (newstart < 0 || newend < newstart)
      die("ERROR: unable to map coordinates for feature (start=%d, end=%d).\n",
          f->start, f->end);

    f->start = newstart;
    f->end = newend;
  }

  gff_group(gff, groupby);	/* do this after coord conversion, or
                               group coords and feature coords
                               will be out of sync */

  keepers = lst_new_ptr(lst_size(gff->features));
  if (discardf != NULL) discards = lst_new_ptr(lst_size(gff->features));

  ncons_tested = nkept = nconserved_exons = 0;
  for (i = 0; i < NTYPES; i++) nconsid[i] = 0;
  for (i = 0; i < NTYPES; i++) nfail[i] = 0;
  for (i = 0; i < NGAP_TYPES; i++) nce_gap_type[i] = 0;  

  countNs = smalloc(msa->nseqs * sizeof(int));
  countCDSs = smalloc(msa->nseqs * sizeof(int));

  for (i = 0; i < lst_size(gff->groups); i++) {
    GFF_FeatureGroup *group = lst_get_ptr(gff->groups, i);
    List *gfeatures = group->features;
    GFF_Feature *feat;
    status_type status = OKAY;
    cds_gap_type gt = FSHIFT_BAD;
    problems_clear(problems);

    /* make sure have frame info for CDSs */
    for (j = 0; j < lst_size(gfeatures); j++) {
      feat = lst_get_ptr(gfeatures, j);
      if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && 
          feat->frame == GFF_NULL_FRAME)
        die("ERROR: Missing frame info for CDS.\n");
    }

    /* First, exclude stop codons from cds's, if necessary (simplifies
       the detection of nonsense mutations). */
    exclude_stops(group, starts_adjusted, ends_adjusted);

    /* In all cases, discard any group for which the reference sequence
       doesn't have valid splice sites or start/stop codons, or has a
       premature stop codon */
    if (!ref_seq_okay(gfeatures, msa, offset3, indel_strict, splice_strict,
                      problems)) {
      status = BAD_REF;
      nfail[BAD_REF]++;
    }
    else
      /* Everything else counts as a potentially valid group */
      ncons_tested++;

    if (status == OKAY && check_alignment) {      
                                /* only bother with below if
                                   interested in cross-species
                                   conservation */

      /* Check first to make sure there's alignment across species in
         the cds; if not, there's no need to look at individual
         features. */
      for (j = 0; j < lst_size(gfeatures); j++) { 
        feat = lst_get_ptr(gfeatures, j);
        if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) &&
            is_incomplete_alignment(feat, msa)) {
          status = NO_ALN;
          nfail[NO_ALN]++;
          problem_add(problems, feat, NO_ALN, -1, -1);
          break;
        }
      }

      if (status == OKAY) {     /* we have alignment and agreement
                                   with the ref seq; now check feature
                                   by feature  */

        lst_clear(intron_splice);
        for (j = 0; j < msa->nseqs; j++) countNs[j] = countCDSs[j] = 0;

        for (j = 0; j < lst_size(gfeatures); j++) {
          feat = lst_get_ptr(gfeatures, j);

          if (feat->end - 1 >= msa->length) 
            die("ERROR: feature extends beyond alignment (%d >= %d).\n",
                feat->end - 1, msa->length);
        
          if (check_start && str_equals_charstr(feat->feature, GFF_START_TYPE)) {

            nconsid[BAD_START]++;

            if (!is_conserved_start(feat, msa)) {
              status = BAD_START;
              problem_add(problems, feat, BAD_START, -1, -1);
            }
          }

          else if (check_stop && str_equals_charstr(feat->feature, GFF_STOP_TYPE)) {

            nconsid[BAD_STOP]++;

            if (!is_conserved_stop(feat, msa)) {
              status = BAD_STOP;
              problem_add(problems, feat, BAD_STOP, -1, -1);
            }
          }

          else if (check_splice && 
                   str_equals_charstr(feat->feature, SPLICE_5)) {

            nconsid[BAD_5_SPLICE]++;

            if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) {
              status = BAD_5_SPLICE;
              problem_add(problems, feat, BAD_5_SPLICE, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (check_splice && 
                   str_equals_charstr(feat->feature, SPLICE_5_UTR)) {

            nconsid[BAD_5_SPLICE_UTR]++;

            if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) {
              status = BAD_5_SPLICE_UTR;
              problem_add(problems, feat, BAD_5_SPLICE_UTR, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (check_splice && str_equals_charstr(feat->feature, SPLICE_3)) {


            nconsid[BAD_3_SPLICE]++;

            if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) {
              status = BAD_3_SPLICE;
              problem_add(problems, feat, BAD_3_SPLICE, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (check_splice && str_equals_charstr(feat->feature, SPLICE_3)) {

            nconsid[BAD_3_SPLICE_UTR]++;

            if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) {
              status = BAD_3_SPLICE_UTR;
              problem_add(problems, feat, BAD_3_SPLICE_UTR, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) {
 
            if (fshift_mode > FSHIFT_BAD 
		&& (gt = get_cds_gap_type(feat, msa, problems)) < fshift_mode) {
              if (status == OKAY || status == NONSENSE) status = FSHIFT;
            }

            if (check_nonsense && !is_nonsense_clean(feat, msa, problems)) {
              if (status == OKAY) status = NONSENSE;
            }

            if (Nfrac < 1) 
              get_N_counts(countNs, countCDSs, feat, msa);
          }
        } /* end loop through features in group */

        /* still have to make sure splice sites are paired correctly
           (GT-AG, GC-AG, AT-AC) */
        if (status == OKAY && !splice_strict && lst_size(intron_splice) >= 2 &&
            !are_introns_okay(intron_splice, msa, problems, offset5, offset3)) 
          status = BAD_INTRON;

        /* also check fraction of Ns */
        if (Nfrac < 1) {
          enum {MY_OKAY, MY_FAIL, MY_WARN} Nstatus = MY_OKAY;
          for (j = 0; j < msa->nseqs; j++) {
            if ((double)countNs[j] / countCDSs[j] > Nfrac) Nstatus = MY_FAIL;
            if (Nstatus == MY_OKAY && countNs[j] > 0) Nstatus = MY_WARN;
          }
          if (Nstatus == MY_FAIL) {
            problem_add(problems, NULL, TOO_MANY_Ns, -1, -1);
            if (status == OKAY) status = TOO_MANY_Ns;
          }
          else if (Nstatus == MY_WARN) 
            problem_add(problems, NULL, WARN_Ns, -1, -1);
        }

        /* if collecting stats, record counts for failures */
        if (statsf != NULL) {
          if (status != OKAY) {
            for (j = 0; j < lst_size(problems); j++) {
              struct Problem *problem = lst_get_ptr(problems, j);
              status_type ftype = problem->status;
              if ((ftype == FSHIFT || ftype == NONSENSE) && 
                  status != FSHIFT && status != NONSENSE)
                continue;       /* don't count secondary frame shifts
                                   and nonsense mutations */ 

              if (ftype == BAD_INTRON && j % 2 == 0)
                continue;       /* only count one of every pair of these */

              nfail[ftype]++;
            }
          }

          /* also keep track of the total number of "conserved exons", and
             the number having each kind of gap */
          if ((status == OKAY || (status == FSHIFT && gt >= FSHIFT_OK))) {
            nconserved_exons++;
            nce_gap_type[gt]++;     /* number of conserved exons having
                                       given type of gaps */
          }
        }
      } /* end if (status == OKAY) [checks for conserved features] */
    } /* end if (status == OKAY && check_alignment) [all cross-species
         checks] */

    /* now we have looked at the whole group; we just need to do some
       final accounting and logging */

    if (status == OKAY) {
      nkept++;
      if (!no_output) {
        restore_stops(group, starts_adjusted, ends_adjusted);
        for (j = 0; j < lst_size(gfeatures); j++)
          lst_push_ptr(keepers, lst_get_ptr(gfeatures, j));
      }
      if (logf != NULL && lst_size(problems) > 0) /* warnings only */
        write_log(logf, group, status, problems, msa, map);
      if (mlogf != NULL) {
        /* no problem, need to add an okay status to log */
        problem_add(problems, NULL, OKAY, -1, -1);
        write_machine_log(mlogf, group, problems, map); /* may include
                                                           warnings */
      }
    }
    else {
      if (discardf != NULL) {
        restore_stops(group, starts_adjusted, ends_adjusted);
        for (j = 0; j < lst_size(gfeatures); j++) 
          lst_push_ptr(discards, lst_get_ptr(gfeatures, j));
      }
      if (logf != NULL) 
        write_log(logf, group, status, problems, msa, map);
      if (mlogf != NULL)
        write_machine_log(mlogf, group, problems, map);
    }
  } /* end loop over groups */

  /* write main output and discards */
  if (!no_output || discardf != NULL) {
    /* first map features back to coord frame of reference seq. */
    for (i = 0; i < lst_size(gff->features); i++) {
      GFF_Feature *f = lst_get_ptr(gff->features, i);
      f->start = msa_map_msa_to_seq(map, f->start) + msa->idx_offset;
      f->end = msa_map_msa_to_seq(map, f->end) + msa->idx_offset;
    }

    if (!no_output) {
      gff->features = keepers;
      gff_print_set(stdout, gff);
    }

    if (discardf != NULL) {
      gff->features = discards;
      gff_print_set(discardf, gff);
    }
  }


  /* dump counts to stats file */
  if (statsf != NULL) {
    fprintf(statsf, "#%11s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", 
            "total", "nbad_ref", "nconsid", "nkept", "nno_aln", 
            "nbad_starts", "(out of)", "nbad_stops", "(out of)", 
            "nbad_5spl", "(out of)", "nbad_3spl", "(out of)", 
            "nbad_5utr", "(out of)", "nbad_3utr", "(out of)", 
            "nbad_intron", "nnons", "nfshifts", "nNs", "ncons_exons", 
            "nce_ngaps", "nce_nov_cln", "nce_clean", "nce_fshftok");
    fprintf(statsf, "%12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d\n", 
            nfail[BAD_REF]+ncons_tested, nfail[BAD_REF], ncons_tested, nkept, 
            nfail[NO_ALN], nfail[BAD_START], nconsid[BAD_START], 
            nfail[BAD_STOP], nconsid[BAD_STOP], nfail[BAD_5_SPLICE], 
            nconsid[BAD_5_SPLICE], nfail[BAD_3_SPLICE], nconsid[BAD_3_SPLICE],
            nfail[BAD_5_SPLICE_UTR], nconsid[BAD_5_SPLICE_UTR],
            nfail[BAD_3_SPLICE_UTR], nconsid[BAD_3_SPLICE_UTR], 
            nfail[BAD_INTRON], nfail[NONSENSE], nfail[FSHIFT], 
            nfail[TOO_MANY_Ns], nconserved_exons, nce_gap_type[NGAPS], 
            nce_gap_type[NOVRLP_CLN_GAPS], nce_gap_type[CLN_GAPS], 
            nce_gap_type[FSHIFT_OK]);
    fprintf(statsf, "%s", STATS_DESCRIPTION);
  }

  if (logf != NULL) phast_fclose(logf);
  if (mlogf != NULL) phast_fclose(mlogf);
  if (statsf != NULL) phast_fclose(statsf);
  if (discardf != NULL) phast_fclose(discardf);

  return 0;
}
Beispiel #4
0
/* Create a GFF_Set from a sequence of category/state numbers, using
   a specified category map and mapping from raw state numbers to
   category numbers.  */
GFF_Set *cm_labeling_as_gff(CategoryMap *cm, int *path, 
                            int length, int *path_to_cat, 
                            int *reverse_compl, char *seqname, 
                            char *source, List *frame_cats, 
                            char *grouptag,  char *idpref
                            ) {
  int beg, end, i, cat, frame, groupno;
  GFF_Set *gff = gff_new_set_init("PHAST", PHAST_VERSION);
  int do_frame[cm->ncats+1];
  char strand;
  char groupstr[STR_SHORT_LEN];
  int ignore_0 = str_equals_charstr(cm_get_feature(cm, 0), BACKGD_CAT_NAME);
                                /* ignore category 0 if background  */

  if (length <= 0) return gff;

  for (i = 0; i <= cm->ncats; i++) do_frame[i] = 0;
  if (frame_cats != NULL)
    for (i = 0; i < lst_size(frame_cats); i++) {
      int cat = cm_get_category(cm, lst_get_ptr(frame_cats, i));
      if (cat != 0)             /* ignore background or unrecognized name */
        do_frame[cat] = 1;
    }

  groupno = 1;
  if (idpref != NULL)
    sprintf(groupstr, "%s \"%s.%d\"", grouptag != NULL ? grouptag : "id", 
            idpref, groupno);
  else
    sprintf(groupstr, "%s \"%d\"", grouptag != NULL ? grouptag : "id", groupno);

  i = 0;
  while (i < length) {
    checkInterruptN(i, 10000);
    cat = cm->ranges[path_to_cat[path[i]]]->start_cat_no;
    strand = reverse_compl[path[i]] ? '-' : '+';
    frame = do_frame[cat] ? path_to_cat[path[i]] - cat : GFF_NULL_FRAME;

    /* scan ahead until enter new category range (or reach end of seq) */
    beg = i + 1;                /* begin of feature (GFF coords) */
    for (i++; i < length && 
           cm->ranges[path_to_cat[path[i]]]->start_cat_no == cat; i++);
    end = i;                    /* end of feature (GFF coords) */

    /* if minus strand, adjust frame to reflect end */
    if (strand == '-' && do_frame[cat]) 
      frame = path_to_cat[path[i-1]] - cat;

    /* if legitimate feature (non-background), then incorp into GFF_Set */
    if (cat != 0 || !ignore_0)  /* create new feature and add */
      lst_push_ptr(gff->features, 
                   gff_new_feature(str_new_charstr(seqname), 
                                   str_new_charstr(source), 
                                   str_dup(cm_get_feature(cm, cat)), 
                                   beg, end, 0, strand, frame, 
                                   str_new_charstr(groupstr), TRUE));

    if (cat == 0 && beg > 1) {
      groupno++;                /* increment group number each time a
                                   sequence of 0s is encountered  */
      if (idpref != NULL)
        sprintf(groupstr, "%s \"%s.%d\"", grouptag != NULL ? grouptag : "id", 
                idpref, groupno);
      else
        sprintf(groupstr, "%s \"%d\"", grouptag != NULL ? grouptag : "id", 
                groupno);
    }
  }

  return gff;
}
/* checks to see if reference sequence looks okay wrt a given
   list of features */
int ref_seq_okay(List *features, MSA *msa, int offset3, 
                 int indel_strict, int splice_strict, List *problems) {
  List *signals = NULL;
  char *seq = NULL;
  int seqalloc = 0;
  int idx, retval = TRUE;
  GFF_Feature *feat, *lastfeat_helper = NULL;

  if (indel_strict) {
    signals = lst_new_ptr(10);
    str_split(str_new_charstr(SIGNALS), ",", signals);
  }

  for (idx = 0; idx < lst_size(features); idx++) {
    int i, j, len, has_gaps = 0; 

    feat = lst_get_ptr(features, idx);

    if (seqalloc <= feat->end - feat->start + 2) {
      seqalloc = (feat->end - feat->start) * 2; 
      seq = srealloc(seq, seqalloc * sizeof(char));
    }

    for (i = feat->start - 1, len = 0; i < feat->end; i++) {
      if (ss_get_char_pos(msa, i, 0, 0) != GAP_CHAR)
        seq[len++] = ss_get_char_pos(msa, i, 0, 0);
      else if (!has_gaps) has_gaps = 1;
    }
    seq[len] = '\0';
    if (feat->strand == '-') msa_reverse_compl_seq(seq, len);

    if (str_equals_charstr(feat->feature, GFF_START_TYPE) && strcmp(seq, "ATG") != 0) {
      problem_add(problems, feat, BAD_REF_START, -1, -1);
      retval = FALSE;
    }
    else if (str_equals_charstr(feat->feature, GFF_STOP_TYPE) && 
             (feat->frame != 0 || !is_stop_codon(seq))) {
      problem_add(problems, feat, BAD_REF_STOP, -1, -1);
      retval = FALSE;
    }
    else if (str_starts_with_charstr(feat->feature, SPLICE_5) && 
             !is_valid_5splice(seq, splice_strict)) {
      problem_add(problems, feat, BAD_REF_5_SPLICE, -1, -1);
      retval = FALSE;
    }
    else if (str_starts_with_charstr(feat->feature, SPLICE_3) &&
             !is_valid_3splice(&seq[offset3], splice_strict)) {
      problem_add(problems, feat, BAD_REF_3_SPLICE, -1, -1);
      retval = FALSE;
    }
    else if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) {
      for (i = (3 - feat->frame) % 3; i <= len - 3; i += 3) {
        if (is_stop_codon(&seq[i])) {
          problem_add(problems, feat, BAD_REF_ORF, -1, -1);
          retval = FALSE;
          break;
        }
      }
    }

    if (indel_strict) {
      int strict_okay = TRUE;
      List *signals = lst_new_ptr(10);
      str_split(str_new_charstr(SIGNALS), ",", signals);

      if (str_in_list(feat->feature, signals)) {
        /* reject any signal feature with gaps in the ref seq, unless they
           appear in a non-critical part of a splice site or in a
           "prestart" feature  */
        if (has_gaps) {          
          if (str_starts_with_charstr(feat->feature, SPLICE_5)) {
            if (ss_get_char_pos(msa, feat->start-1, 0, 0) == GAP_CHAR ||
                ss_get_char_pos(msa, feat->start, 0, 0) == GAP_CHAR)
              strict_okay = FALSE;
          }
          else if (str_starts_with_charstr(feat->feature, SPLICE_3)) {
            if (ss_get_char_pos(msa, feat->end-1, 0, 0) == GAP_CHAR ||
                ss_get_char_pos(msa, feat->end-2, 0, 0) == GAP_CHAR)
              strict_okay = FALSE;
          }
          else if (!str_equals_charstr(feat->feature, "prestart"))
            strict_okay = FALSE;
        }
        /* in addition, if two signals occur consec. with gaps and
           only gaps between them, assume a violation of
           --indel-strict */
        if (lastfeat_helper != NULL && lastfeat_helper->end < feat->start-1) {
          int allgaps = 1;
          for (j = lastfeat_helper->end; allgaps && j < feat->start-1; j++) 
                                /* note indexing: -1+1 for end and -1
                                   for start  */
            if (ss_get_char_pos(msa, j, 0, 0) != GAP_CHAR) allgaps = 0;
          if (allgaps) 
            strict_okay = FALSE;
        }
        lastfeat_helper = feat;
      }
      else lastfeat_helper = NULL;
    
      /* also exclude CDS exons of length less than 6 in indel_strict
         case -- these cause problems in exoniphy training because
         start_codon is adjacent to cds5ss */
      if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && len <= 6)
        strict_okay = FALSE;

      if (!strict_okay) {
        problem_add(problems, feat, BAD_REF_INDEL_STRICT_FAIL, -1, -1);
        retval = FALSE;
      }
      lst_free_strings(signals);
      lst_free(signals);
    }
  }
  if (seq != NULL) sfree(seq);
  return retval;
}
Beispiel #6
0
/* Read a CategoryMap from a file */
CategoryMap *cm_read(FILE *F) {
  String *line, *name;
  List *l;
  int cat, cat2, lineno, i, cm_read_error;
  CategoryMap *cm = NULL;
  CategoryRange *existing_range;
  static Regex *cat_range_re = NULL;
  static Regex *ncats_re = NULL;
  static Regex *fill_re = NULL;
  static Regex *label_re = NULL;
  static Regex *extend_re = NULL;
  int has_dependencies = 0;

  line = str_new(STR_SHORT_LEN);
  l = lst_new_ptr(3);
  if (cat_range_re == NULL) {
    cat_range_re = str_re_new("^[[:space:]]*([^[:space:]]+)[[:space:]]+([[:digit:]]+)(-([[:digit:]]+))?([[:space:]]+([[:digit:]].*))?"); 
    ncats_re = str_re_new("^[[:space:]]*NCATS[[:space:]]*=[[:space:]]*([[:digit:]]+)");
    fill_re = str_re_new("^[[:space:]]*FILL_PRECEDENCE[[:space:]]*=[[:space:]]*(.*)$");
    label_re = str_re_new("^[[:space:]]*LABELLING_PRECEDENCE[[:space:]]*=[[:space:]]*(.*)$");
    extend_re = str_re_new("^[[:space:]]*FEATURE_EXTEND[[:space:]]*:[[:space:]]*(.+)[[:space:]]*\\((.+)\\)$");
  }

  lineno = 0;
  while ((str_readline(line, F)) != EOF) {
    lineno++;
    str_trim(line);
    if (str_equals_charstr(line, ""))
      continue;

    if (str_re_match(line, ncats_re, l, 1) >= 0) { 
                                /* NCATS line */
      int ncats;
      str_as_int(lst_get_ptr(l, 1), &ncats);
      cm = cm_new(ncats);

      /* 0th category is "background" */
      cm->ranges[0] = 
        cm_new_category_range(str_new_charstr(BACKGD_CAT_NAME), 0, 0);
    }

    else if (cm == NULL || cm->ncats == 0) 
      die("ERROR: NCATS line must appear first, and must specify a positive number of categories.\n");

    else if (str_re_match(line, label_re, l, 1) >= 0) {               
                                /* LABELLING_PRECEDENCE line */
      List *tmpl = lst_new_ptr(cm->ncats);
      int tmpi;
      str_split((String*)lst_get_ptr(l, 1), " ,", tmpl);
      for (i = 0; i < lst_size(tmpl); i++) {
        String *s = (String*)lst_get_ptr(tmpl, i);
        if (str_as_int(s, &tmpi) != 0 || tmpi < 0 || tmpi > cm->ncats) 
          die("ERROR: bad integer in LABELLING_PRECEDENCE.\n");
        cm->labelling_precedence[tmpi] = i;
        str_free(s);
      }
      lst_free(tmpl);
    }

    else if (str_re_match(line, fill_re, l, 1) >= 0) {               
                                /* FILL_PRECEDENCE line */
      List *tmpl = lst_new_ptr(cm->ncats);
      int tmpi;
      str_split(lst_get_ptr(l, 1), " ,", tmpl);
      for (i = 0; i < lst_size(tmpl); i++) {
        String *s = lst_get_ptr(tmpl, i);
        if (str_as_int(s, &tmpi) != 0 || tmpi < 0 || tmpi > cm->ncats) 
          die("ERROR: bad integer in FILL_PRECEDENCE.\n");
        cm->fill_precedence[tmpi] = i;
        str_free(s);
      }
      lst_free(tmpl);
    }

    else if (str_re_match(line, extend_re, l, 2) >= 0) {
                                /* FEATURE_EXTEND line */
      String *target = lst_get_ptr(l, 2);
      List *sources = lst_new_ptr(2);
      str_split(lst_get_ptr(l, 1), " ,", sources);

      if (cm == NULL || (cat = cm_get_category(cm, target)) == 0)
        die("ERROR: FEATURE_EXTEND target must be a previously-defined non-background feature type.\n");

      for (i = 0; i < lst_size(sources); i++) {
        if (cm_get_category(cm, lst_get_ptr(sources, i)) == 0) 
          die("ERROR: FEATURE_EXTEND source list must consist of previously-defined non-background feature types.\n");
      }
    }

    else {                      /* 'range' line */
      if (str_re_match(line, cat_range_re, l, 6) < 0) 
        die("ERROR at line %d: '%s'\n", lineno, line->chars);

      name = str_dup((String*)lst_get_ptr(l, 1));
      str_as_int((String*)lst_get_ptr(l, 2), &cat);

      cat2 = cat;
      if (lst_get_ptr(l, 4) != NULL) 
        str_as_int((String*)lst_get_ptr(l, 4), &cat2);

      if (cat < 0 || cat2 < cat || cat2 > cm->ncats)
        die("ERROR: Illegal category range.\n");

      /* check for existing definitions of the specified category
         range.  Either no such definition must exist, or one must
         exist that spans exactly the same category numbers */
      existing_range = NULL;
      cm_read_error = 0;
      for (i = cat; !cm_read_error && i <= cat2; i++) {
        if (cm->ranges[i] != NULL && existing_range == NULL) 
          existing_range = cm->ranges[i];
        else if (cm->ranges[i] != existing_range)
          cm_read_error = 1;
      }
      if (existing_range != NULL && (existing_range->start_cat_no != cat || 
                                     existing_range->end_cat_no != cat2)) 
        cm_read_error = 1;

      if (cm_read_error) 
        die("ERROR: Overlapping category ranges.\n");

      /* either add new category range, or add new type to existing one */
      if (existing_range != NULL) {
        lst_push_ptr(existing_range->feature_types, name);
      }
      else {
        CategoryRange *cr = cm_new_category_range(name, cat, cat2);
        for (i = cat; i <= cat2; i++) 
          cm->ranges[i] = cr;
      }

      /* now address "conditioned_on" dependencies, if they have been
         specified */
      if (lst_get_ptr(l, 6) != NULL) {
        if (existing_range != NULL) 
          fprintf(stderr, "WARNING: ignoring 'conditioned on' list for type '%s'\n", 
                  name->chars);
        else {
          List *tmpl = lst_new_ptr(cm->ncats);
          int tmpi;
	  if (cm->conditioned_on[cat] != NULL)
	    die("ERROR cm_read: cm->conditioned_on[%i] should be NULL\n",
		cat);

          str_split((String*)lst_get_ptr(l, 6), " ,", tmpl);
          cm->conditioned_on[cat] = lst_new_int(lst_size(tmpl));
          for (i = cat + 1; i <= cat2; i++)
            cm->conditioned_on[i] = cm->conditioned_on[cat];
          /* all categories in range point to
             same "conditioned on" list */

          for (i = 0; i < lst_size(tmpl); i++) {
            String *s = (String*)lst_get_ptr(tmpl, i);
            if (str_as_int(s, &tmpi) != 0 || tmpi < 0 || tmpi > cm->ncats) 
              die("ERROR: bad integer in 'conditioned on' list for type '%s'.\n", 
                      name->chars);
            lst_push_int(cm->conditioned_on[cat], tmpi);
            str_free(s);
          }
          lst_free(tmpl);
          
          has_dependencies = 1;
        }
      }
    }

    for (i = 0; i < lst_size(l); i++)
      if (lst_get_ptr(l, i) != NULL)
        str_free((String*)lst_get_ptr(l, i));
  }

  /* make sure every category has been specified */
  for (i = 0; i <= cm->ncats; i++) 
    if (cm->ranges[i] == 0) 
      die("ERROR: category %d has not been specified.\n", i);

  /* build unspooler, if necessary */
  if (has_dependencies)
    cm->unspooler = cm_create_unspooler(cm->ncats + 1, cm->conditioned_on);

  str_free(line);
  lst_free(l);
  return cm;
}
LocalPwAlignment *la_read_lav(FILE *F, int read_seqs) {
  String *line = str_new(STR_MED_LEN);
  int line_no=0;
  LocalPwAlignment *lpwa = la_new();
  List *fields = lst_new_ptr(6);
  Regex *stanza_start_re = str_re_new("^([dshaxm])[[:space:]]*{");
  AlignmentBlock *aln_block = NULL;
  char stanza_type = '\0';
  int i;
  int done_with[256];
  done_with[(int)'d'] = done_with[(int)'s'] = done_with[(int)'h'] = 
    done_with[(int)'x'] = done_with[(int)'m'] = 0;

  while (str_readline(line, F) != EOF) {
    str_trim(line);
    if (line->length == 0) continue;

    checkInterruptN(line_no, 1000);
    line_no++;
    if (line_no == 1) {
      if (!str_equals_charstr(line, "#:lav")) {
        die("ERROR: lav file missing header.\n");
      }
    } 
    else if (str_re_match(line, stanza_start_re, fields, 1) >= 0) {
      String *tmpstr = lst_get_ptr(fields, 1);
      stanza_type = tmpstr->chars[0];
      str_free(tmpstr);
      str_free(lst_get_ptr(fields, 0));

      if (stanza_type != 'a' && done_with[(int)stanza_type]) {
        die("ERROR: multiple '%c' stanzas in lav file.\n", 
                stanza_type);
      }

      if (stanza_type == 'a') {
        aln_block = la_new_alignment_block(-1, -1, -1, -1, -1, NULL);
        lst_push_ptr(lpwa->alignment_blocks, aln_block);
      }
    }

    /* end current stanza */
    else if (str_equals_charstr(line, "}")) {
      if (stanza_type == '\0') {
        die("ERROR: end stanza without matching begin.\n");
      }
      done_with[(int)stanza_type] = 1;
      stanza_type = '\0';
    }

    else if (stanza_type == 'd') {
      ; /* do nothing for now */
    }
    else if (stanza_type == 's') {
      int beg, end;
      String *tmpstr, *fname, *seq=NULL;
      FILE *F2;

      str_double_trim(line);
      str_split(line, NULL, fields);
      if (lst_size(fields) != 3 || 
          str_as_int(lst_get_ptr(fields, 1), &beg) != 0 || 
          str_as_int(lst_get_ptr(fields, 2), &end) != 0) {
        die("ERROR: bad line in 's' stanza in lav file.\n");
      }
      tmpstr = lst_get_ptr(fields, 0);
      fname = str_new(tmpstr->length-2); /* remove quotes */
      str_substring(fname, tmpstr, 1, tmpstr->length-2);
      if (read_seqs) {
        F2 = phast_fopen(fname->chars, "r");
        seq = msa_read_seq_fasta(F2);
	phast_fclose(F2);
      }

      for (i = 0; i < lst_size(fields); i++) 
        str_free(lst_get_ptr(fields, i));
      
      if (beg != 1) {
        die("ERROR: unexpected begin index in 's' stanza of lav file (begin index currently must be 1).\n");
      }
      if (lpwa->query_len == -1) {
        lpwa->query_len = end;
        if (read_seqs) lpwa->query_seq = seq;
      }
      else if (lpwa->target_len == -1) {
        lpwa->target_len = end;
        if (read_seqs) lpwa->target_seq = seq;
      }
      else {
        die("ERROR: too many sequences listed in 's' stanza of lav file.\n");
      }
      str_free(fname);
    }
    else if (stanza_type == 'h') {
      String *name;

      str_double_trim(line);
      name = str_new(line->length-3); /* get rid of quotes and
                                         leading '>' */
      str_substring(name, line, 2, line->length-3);
      if (lpwa->query_name == NULL) lpwa->query_name = name;
      else if (lpwa->target_name == NULL) lpwa->target_name = name;
      else {
        die("ERROR: too many entries in 'h' stanza of lav file.\n");
      }
    }
    else if (stanza_type == 'a') {
      String *type;
      int val[6];
      if (!done_with[(int)'s'] || !done_with[(int)'d'] || 
          !done_with[(int)'h']) {
        die("ERROR: 'a' stanza appears in lav file before 'd', 's', or 'h' stanza.\n");
      }

      str_double_trim(line);
      str_split(line, NULL, fields);
      type = lst_get_ptr(fields, 0);
      if (lst_size(fields) > 6) {
        die("ERROR: illegal line in 'a' stanza.\n");
      }
      for (i = 1; i < lst_size(fields); i++) {
        str_as_int(lst_get_ptr(fields, i), &val[i]);
        str_free(lst_get_ptr(fields, i));
      }

      if (type->chars[0] == 's') 
        aln_block->score = val[1];
      else if (type->chars[0] == 'b') {
        aln_block->query_beg = val[1];
        aln_block->target_beg = val[2];      
      }
      else if (type->chars[0] == 'e') {
        aln_block->query_end = val[1];
        aln_block->target_end = val[2];      
      }
      else if (type->chars[0] == 'l') 
        lst_push_ptr(aln_block->gapless_alns, 
                     la_new_gapless_aln(val[1], val[3], val[2], val[4]));

      str_free(type);
    }     
  }
  str_free(line);
  lst_free(fields);
  str_re_free(stanza_start_re);

  return lpwa;
}
Beispiel #8
0
int main(int argc, char *argv[]) {
  char c;
  char *msa_fname = NULL;
  int opt_idx, i, old_nnodes;
  MSA *msa;
  List *pruned_names = lst_new_ptr(5), *tmpl;
  BDPhyloHmm *bdphmm;
  GFF_Set *predictions;
  int found = FALSE;
  List *ignore_types = lst_new_ptr(1);

  struct option long_opts[] = {
    {"refseq", 1, 0, 'M'},
    {"msa-format", 1, 0, 'i'},
    {"refidx", 1, 0, 'r'},
    {"rho", 1, 0, 'R'},
    {"phi", 1, 0, 'p'},
    {"transitions", 1, 0, 't'},    
    {"expected-length", 1, 0, 'E'},
    {"target-coverage", 1, 0, 'C'},
    {"seqname", 1, 0, 'N'},
    {"idpref", 1, 0, 'P'},
    {"indel-model", 1, 0, 'I'},
    {"indel-history", 1, 0, 'H'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  /* arguments and defaults for options */
  FILE *refseq_f = NULL, *msa_f = NULL;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  TreeModel *source_mod;
  double rho = DEFAULT_RHO, mu = DEFAULT_MU, nu = DEFAULT_NU, 
    phi = DEFAULT_PHI, gamma = -1, omega = -1, 
    alpha_c = -1, beta_c = -1, tau_c = -1,
    alpha_n = -1, beta_n = -1, tau_n = -1;
  int set_transitions = FALSE, refidx = 1, estim_phi = TRUE, 
    estim_gamma = TRUE, estim_omega = TRUE;
  char *seqname = NULL, *idpref = NULL;
  IndelHistory *ih = NULL;

  while ((c = getopt_long(argc, argv, "R:t:p:E:C:r:M:i:N:P:I:H:h", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'R':
      rho = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 't':
      if (optarg[0] != '~') estim_gamma = estim_omega = FALSE;
      else optarg = &optarg[1];
      set_transitions = TRUE;
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 2) 
        die("ERROR: bad argument to --transitions.\n");
      mu = lst_get_dbl(tmpl, 0);
      nu = lst_get_dbl(tmpl, 1);
      if (mu <= 0 || mu >= 1 || nu <= 0 || nu >= 1)
        die("ERROR: bad argument to --transitions.\n");
      lst_free(tmpl);
      break;
    case 'p':
      if (optarg[0] != '~') estim_phi = FALSE;
      else optarg = &optarg[1];
      phi = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'E':
      if (optarg[0] != '~') estim_omega = FALSE;
      else optarg = &optarg[1];
      omega = get_arg_dbl_bounds(optarg, 1, INFTY);
      mu = 1/omega;
      break;
    case 'C':
      if (optarg[0] != '~') estim_gamma = FALSE;
      else optarg = &optarg[1];
      gamma = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'r':
      refidx = get_arg_int_bounds(optarg, 0, INFTY);
      break;
    case 'M':
      refseq_f = phast_fopen(optarg, "r");
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT)
        die("ERROR: unrecognized alignment format.\n");
      break;
    case 'N':
      seqname = optarg;
      break;
    case 'P':
      idpref = optarg;
      break;
    case 'I':
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 3 && lst_size(tmpl) != 6)
        die("ERROR: bad argument to --indel-model.\n");
      alpha_n = lst_get_dbl(tmpl, 0);
      beta_n = lst_get_dbl(tmpl, 1);
      tau_n = lst_get_dbl(tmpl, 2);
      if (lst_size(tmpl) == 6) {
        alpha_c = lst_get_dbl(tmpl, 3);
        beta_c = lst_get_dbl(tmpl, 4);
        tau_c = lst_get_dbl(tmpl, 5);
      }
      else {
        alpha_c = alpha_n; beta_c = beta_n; tau_c = tau_n;
      }
      if (alpha_c <= 0 || alpha_c >= 1 || beta_c <= 0 || beta_c >= 1 || 
          tau_c <= 0 || tau_c >= 1 || alpha_n <= 0 || alpha_n >= 1 || 
          beta_n <= 0 || beta_n >= 1 || tau_n <= 0 || tau_n >= 1)
        die("ERROR: bad argument to --indel-model.\n");
      break;
    case 'H':
      fprintf(stderr, "Reading indel history from %s...\n", optarg);
      ih = ih_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("Bad argument.  Try 'dless -h'.\n");
    }
  }

  if (optind != argc - 1)
    die("Missing alignment file or model file.  Try 'dless -h'.\n");

  if (set_transitions && (gamma != -1 || omega != -1))
    die("ERROR: --transitions and --target-coverage/--expected-length cannot be used together.\n");

  if ((gamma != -1 && omega == -1) || (gamma == -1 && omega != -1))
    die("ERROR: --target-coverage and --expecteed-length must be used together.\n");

  set_seed(-1);

  if (gamma != -1)
    nu = gamma/(1-gamma) * mu;

  fprintf(stderr, "Reading tree model from %s...\n", argv[optind]);
  source_mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1);

  if (source_mod->nratecats > 1) 
    die("ERROR: rate variation not currently supported.\n");

  if (source_mod->order > 0)
    die("ERROR: only single nucleotide models are currently supported.\n");

  if (!tm_is_reversible(source_mod))
    phast_warning("WARNING: p-value computation assumes reversibility and your model is non-reversible.\n");

  /* read alignment */
  msa_f = phast_fopen(argv[optind], "r");

  fprintf(stderr, "Reading alignment from %s...\n", argv[optind]);
  if (msa_format == UNKNOWN_FORMAT) 
    msa_format = msa_format_for_content(msa_f, 1);

  if (msa_format == MAF) {
    msa = maf_read(msa_f, refseq_f, 1, NULL, NULL, NULL, -1, TRUE, NULL, 
                   NO_STRIP, FALSE); 
  }
  else 
    msa = msa_new_from_file_define_format(msa_f, msa_format, NULL);

  if (msa_alph_has_lowercase(msa)) msa_toupper(msa); 
  msa_remove_N_from_alph(msa);

  if (msa->ss == NULL) {
    fprintf(stderr, "Extracting sufficient statistics...\n");
    ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0);
  }
  else if (msa->ss->tuple_idx == NULL)
    die("ERROR: ordered representation of alignment required unless --suff-stats.\n");

  /* prune tree, if necessary */
  old_nnodes = source_mod->tree->nnodes;
  tm_prune(source_mod, msa, pruned_names);

  if (lst_size(pruned_names) == (old_nnodes + 1) / 2)
    die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n");
  if (lst_size(pruned_names) > 0) {
    fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment (");
    for (i = 0; i < lst_size(pruned_names); i++)
      fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars, 
              i < lst_size(pruned_names) - 1 ? ", " : ").\n");
  }

  /* this has to be done after pruning tree */
  tr_name_ancestors(source_mod->tree);

  /* also make sure match for reference sequence in tree */
  if (refidx > 0) {
    for (i = 0, found = FALSE; !found && i < source_mod->tree->nnodes; i++) {
      TreeNode *n = lst_get_ptr(source_mod->tree->nodes, i);
      if (!strcmp(n->name, msa->names[refidx-1]))
        found = TRUE;
    }
    if (!found) die("ERROR: no match for reference sequence in tree.\n");
  }

  /* checks for indel model */
  if (alpha_c > 0) {
    if (ih == NULL) {
      fprintf(stderr, "Reconstructing indel history by parsimony...\n");
      ih = ih_reconstruct(msa, source_mod->tree);
    }
    else {
      if (ih->ncols != msa->length)
        die("ERROR: indel history doesn't seem to match alignment.\n");
      if (ih->tree->nnodes != source_mod->tree->nnodes)
        die("ERROR: indel history doesn't seem to match tree model.\n");
    }
  }

  bdphmm = bd_new(source_mod, rho, mu, nu, phi, alpha_c, beta_c, tau_c, 
                  alpha_n, beta_n, tau_n, estim_gamma, estim_omega, 
                  estim_phi);

  /* compute emissions */
  phmm_compute_emissions(bdphmm->phmm, msa, FALSE);

  /* add emissions for indel model, if necessary */
  if (alpha_c > 0) {
    fprintf(stderr, "Adjusting emissions for indels...\n");
    bd_add_indel_emissions(bdphmm, ih);
  }

  /* postprocess for missing data (requires special handling) */
  fprintf(stderr, "Adjusting emissions for missing data...\n");
  bd_handle_missing_data(bdphmm, msa);

  if (estim_gamma || estim_omega || estim_phi) {
    fprintf(stderr, "Estimating free parameters...\n");
    bd_estimate_transitions(bdphmm, msa);
  }

  /* set seqname and idpref, if necessary */
  if (seqname == NULL || idpref == NULL) {
    /* derive default from file name root */
    String *tmp = str_new_charstr(msa_fname);
    if (!str_equals_charstr(tmp, "-")) {
      str_remove_path(tmp);
      str_root(tmp, '.');
      if (idpref == NULL) idpref = copy_charstr(tmp->chars);
      str_root(tmp, '.');         /* apply one more time for double suffix */
      if (seqname == NULL) seqname = tmp->chars;    
    }
    else if (seqname == NULL) seqname = "refseq";
  }

  /* obtain predictions */
  fprintf(stderr, "Running Viterbi algorithm...\n");
  predictions = phmm_predict_viterbi(bdphmm->phmm, seqname, NULL, idpref, NULL);
  lst_push_ptr(ignore_types, str_new_charstr("nonconserved"));
  gff_filter_by_type(predictions, ignore_types, TRUE, NULL);

  /* score predictions */
  fprintf(stderr, "Scoring predictions...\n");
  bd_score_predictions(bdphmm, predictions);
  
  /* can free emissions now */
  for (i = 0; i < bdphmm->phmm->hmm->nstates; i++)
    sfree(bdphmm->phmm->emissions[i]);
  sfree(bdphmm->phmm->emissions);
  bdphmm->phmm->emissions = NULL;

  /* convert GFF to coord frame of reference sequence and adjust
     coords by idx_offset, if necessary  */
  if (refidx != 0 || msa->idx_offset != 0)
    msa_map_gff_coords(msa, predictions, 0, refidx, msa->idx_offset);

  if (refidx != 0) 
    gff_flatten(predictions);	
  /* necessary because coord conversion might create overlapping
     features (can happen in deletions in reference sequence) */

  /* now output predictions */
  fprintf(stderr, "Writing GFF to stdout...\n");
  gff_print_set(stdout, predictions);

  fprintf(stderr, "Done.\n");
  
  return 0;
}
Beispiel #9
0
int main(int argc, char *argv[]) {
  /* variables for options, with defaults */
  TreeNode *tree = NULL, *merge_tree = NULL, *extrapolate_tree = NULL;
  Hashtable *rename_hash = NULL;
  double scale_factor = 1;
  List *prune_names = NULL, *label = NULL, *labelType = NULL;
  int prune_all_but = FALSE, tree_only = FALSE, dissect = FALSE,
    name_ancestors = FALSE, with_branch = FALSE, print_branchlen=FALSE,
    inNewick=FALSE, no_branchlen = FALSE, print_distance_to_root = FALSE;
  TreeModel *mod = NULL, *merge_mod = NULL;
  char *reroot_name = NULL, *subtree_name =NULL, *get_subtree_name = NULL,
    *node_distance_name = NULL;
  
  /* other variables */
  String *suffix,  *optstr;
  char c;
  int i, opt_idx;
  TreeNode *n;

  struct option long_opts[] = {
    {"scale", 1, 0, 's'},
    {"extrapolate", 1, 0, 'e'},
    {"prune", 1, 0, 'p'},
    {"prune-all-but", 1, 0, 'P'},
    {"get-subtree", 1, 0, 'g'},
    {"merge", 1, 0, 'm'},
    {"rename", 1, 0, 'r'},
    {"tree-only", 0, 0, 't'},
    {"no-branchlen", 0, 0, 'N'},
    {"dissect", 0, 0, 'd'},
    {"name-ancestors", 0, 0, 'a'},
    {"reroot", 1, 0, 'R'},
    {"with-branch", 1, 0, 'B'},
    {"subtree", 1, 0, 'S'},
    {"branchlen", 0, 0, 'b'},
    {"newick", 0, 0, 'n'},
    {"label-subtree", 1, 0, 'L'},
    {"label-branches", 1, 0, 'l'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = getopt_long(argc, argv, "s:p:P:g:m:r:R:B:S:D:l:L:adtNbnh", 
                          long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 's':
      scale_factor = get_arg_dbl_bounds(optarg, 0, INFTY);
      break;
    case 'e':
      if (!strcmp(optarg, "default")) {
        optarg = smalloc(1000 * sizeof(char));
        #if defined(__MINGW32__)
          sprintf(optarg, "%s\\data\\exoniphy\\mammals\\cftr25_hybrid.nh",
		  PHAST_HOME);
        #else
          sprintf(optarg, "%s/data/exoniphy/mammals/cftr25_hybrid.nh", 
                  PHAST_HOME);
        #endif
      }
      extrapolate_tree = tr_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'p':
      prune_names = get_arg_list(optarg);
      break;
    case 'P':
      prune_names = get_arg_list(optarg);
      prune_all_but = TRUE;
      break;
    case 'g':
      get_subtree_name = optarg;
      break;
    case 'm':
      suffix = str_new_charstr(optarg);
      str_suffix(suffix, '.');
      if (str_equals_charstr(suffix, "nh"))
        merge_tree = tr_new_from_file(phast_fopen(optarg, "r"));
      else {
        merge_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1);
        merge_tree = merge_mod->tree;
      }
      break;
    case 'r':
      rename_hash = make_name_hash(optarg);
      break;
    case 't':
      tree_only = TRUE;
      break;
    case 'N':
      no_branchlen = TRUE;
      tree_only = TRUE;
      break;
    case 'd':
      dissect = TRUE;
      break;
    case 'b':
      print_branchlen = TRUE;
      break;
    case 'D':
      print_distance_to_root = TRUE;
      node_distance_name = optarg;
      break;
    case 'R':
      reroot_name = optarg;
      break;
    case 'B':
      with_branch = TRUE;
      break;
    case 'a':
      name_ancestors = TRUE;
      break;
    case 'S':
      subtree_name = optarg;
      break;
    case 'n':
      inNewick=TRUE;
      break;
    case 'L':  //do the same for --label--subtree and --label-branches
    case 'l':
      if (label == NULL) {
	label = lst_new_ptr(1);
	labelType = lst_new_int(1);
      }
      optstr = str_new_charstr(optarg);
      lst_push_ptr(label, optstr);
      lst_push_int(labelType, (int)c);
      break;
    case 'h':
      usage(argv[0]);
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  if (optind != argc - 1) 
    die("Input filename required.  Try '%s -h'.\n", argv[0]);

  if (merge_tree != NULL && extrapolate_tree != NULL)
    die("ERROR: Can't use --merge and --extrapolate together");

  set_seed(-1);
    
  suffix = str_new_charstr(argv[optind]);
  str_suffix(suffix, '.');
  if (inNewick || str_equals_charstr(suffix, "nh")) {
    tree = tr_new_from_file(phast_fopen(argv[optind], "r"));
    tree_only = TRUE;           /* can't output tree model in this case */
  }
  else {
    mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1);
    tree = mod->tree;
  }

  if (prune_names != NULL) {
    tr_prune(&tree, prune_names, prune_all_but, NULL);
    if (mod != NULL) mod->tree = tree; /* root may have changed */
  }

  if (get_subtree_name != NULL) {
    n = tr_get_node(tree, get_subtree_name);
    if (n == NULL) {
      tr_name_ancestors(tree);
      n = tr_get_node(tree, get_subtree_name);
      if (n == NULL) {
	die("ERROR: no node named '%s'.\n", subtree_name);
      }
    }
    tr_prune_supertree(&tree, n);
    if (mod != NULL) mod->tree = tree;
  }

  if (merge_tree != NULL) {
    tree = tr_hybrid(tree, merge_tree);
    if (mod != NULL) mod->tree = tree;
  }

  else if (extrapolate_tree != NULL) {
    tr_scale_by_subtree(extrapolate_tree, tree);
    tree = extrapolate_tree;
    if (mod != NULL) mod->tree = tree;
  }

  if (scale_factor != 1) {
    if (subtree_name == NULL)
      tr_scale(tree, scale_factor);
    else {
      n = tr_get_node(tree, subtree_name);
      if (n == NULL) die("ERROR: no node named '%s'.\n", subtree_name);
      tr_scale_subtree(tree, n, scale_factor, with_branch);
    }
  }

  if (name_ancestors)
    tr_name_ancestors(tree);

  if (rename_hash != NULL) {
    char *newname;
    for (i = 0; i < tree->nnodes; i++) {
      n = lst_get_ptr(tree->nodes, i);
      if (n->name != NULL && n->name[0] != '\0' && 
          (newname = hsh_get(rename_hash, n->name)) != (char*)-1) {
        strcpy(n->name, newname);
      }
    }
  }

  if (reroot_name != NULL) {
    n = tr_get_node(tree, reroot_name);
    if (n == NULL) die("ERROR: no node named '%s'.\n", reroot_name);
    tr_reroot(tree, n, with_branch);
    if (mod != NULL) mod->tree = with_branch ? n->parent : n;
    tree = with_branch ? n->parent : n;
  }

  if (label != NULL) {
    for (i=0; i < lst_size(label); i++) {
      String *currstr = (String*)lst_get_ptr(label, i), *arg1, *labelVal;
      List *tmplst = lst_new_ptr(10);
      String *nodename;
      int j;
      str_split(currstr, ":", tmplst);
      if (lst_size(tmplst) != 2) 
	die("ERROR: bad argument to --label-branches or --label-subtree.\n");
      arg1 = lst_get_ptr(tmplst, 0);
      labelVal = lst_get_ptr(tmplst, 1);
      lst_clear(tmplst);
      if (lst_get_int(labelType, i) == (int)'l') {
	str_split(arg1, ",", tmplst);
	for (j=0; j < lst_size(tmplst); j++) {
	  nodename = (String*)lst_get_ptr(tmplst, j);
	  tr_label_node(tree, nodename->chars, labelVal->chars);
	}
	lst_free_strings(tmplst);
      } else if (lst_get_int(labelType, i) == (int)'L') {
	int include_leading_branch = FALSE;
	TreeNode *node;
	nodename = arg1;
	node = tr_get_node(tree, nodename->chars);
	if (node == NULL && nodename->chars[nodename->length-1] == '+') {
	  nodename->chars[--nodename->length] = '\0';
	  node = tr_get_node(tree, nodename->chars);
	  include_leading_branch = TRUE;
	}
	tr_label_subtree(tree, nodename->chars, include_leading_branch, 
			 labelVal->chars);
      } else die("ERROR got label_type %c\n", lst_get_int(labelType, (char)i));
      str_free(arg1);
      str_free(labelVal);
      lst_free(tmplst);
      str_free(currstr);
    }
    lst_free(label);
    lst_free(labelType);
  }

  if (dissect) 
    tr_print_nodes(stdout, tree);
  if (print_branchlen) 
    printf("TOTAL_TREE_LEN: %f\n", tr_total_len(tree));
  if (print_distance_to_root) {
    TreeNode *node = tr_get_node(tree, node_distance_name);
    if (node == NULL) 
      die("ERROR: no node named '%s'.\n", node_distance_name);
    printf("length(root-%s): %f\n", node_distance_name, 
	   tr_distance_to_root(node));
  }

  if (dissect==0 && print_branchlen==0 && print_distance_to_root==0) {
    if (tree_only)
      tr_print(stdout, tree, no_branchlen==FALSE);
    else
      tm_print(stdout, mod);
  }
  return 0;
}
int main(int argc, char *argv[]) {
  FILE *prob_f;
  char c;
  int opt_idx, i, nlines = 0, ngaps = 0;
  unsigned idx;
  PbsCode *code;
  List *fields = lst_new_ptr(10);
  double error, tot_error = 0, prob;
  Vector *v;
  String *line = str_new(STR_MED_LEN);

  struct option long_opts[] = {
    {"discard-gaps", 0, 0, 'G'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  /* variables for options, with defaults */
  int discard_gaps = FALSE;

  set_seed(-1);

  while ((c = (char)getopt_long(argc, argv, "Gh", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'G':
      discard_gaps = TRUE;
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("Bad argument.  Try 'pbsEncode -h'.\n");
    }
  }

  if (optind != argc - 2) 
    die("Two arguments required.  Try 'pbsEncode -h'.\n");
    
  prob_f = phast_fopen(argv[optind], "r");
  code = pbs_new_from_file(phast_fopen(argv[optind+1], "r"));
  v = vec_new(code->sg->d);
  
  while (str_readline(line, prob_f) != EOF) {

    if (line->length == 0 || line->chars[0] == '#') continue;

    str_split(line, NULL, fields);

    if (lst_size(fields) == 1 && str_equals_charstr(lst_get_ptr(fields, 0), "-")) {
      ngaps++;
      if (!discard_gaps)
	pbs_write_binary(code, code->gap_code, stdout);
    }
    else {			/* ordinary prob vector */
      if (lst_size(fields) != code->sg->d)
	die("ERROR: number of columns must equal dimension of code (%d).\n",
	    code->sg->d);
    
      for (i = 0; i < code->sg->d; i++) {
	if (str_as_dbl(lst_get_ptr(fields, i), &prob) != 0 || prob < 0 || prob > 1)
	  die("ERROR: bad value ('%s')\n", lst_get_ptr(fields, i));

	vec_set(v, i, prob);
      }

      idx = pbs_get_index(code, v, &error);
      tot_error += error;
      pbs_write_binary(code, idx, stdout);
      nlines++;
    }

    lst_free_strings(fields);
  }

  fprintf(stderr, "Dimensions: %d\n\
Rows per dimension: %d\n\
Code size: %d\n\
Bytes per vector: %d\n\
Vectors processed: %d\n\
Gaps: %d%s\n\
Average approximation error: %f bits\n", 
	  code->sg->d, code->sg->nrows, code->code_size, code->nbytes,  
	  nlines, ngaps, discard_gaps ? " (discarded)" : "", tot_error/nlines);

  return 0;
}