Esempio n. 1
0
SEXP rph_gff_featureBits(SEXP gffListP, SEXP orP, SEXP returnGffP) {
  int numGff, i, j, or, returnGff;
  long numbit = 0;
  List *gfflist;
  GFF_Set *gff, *newgff=NULL;
  GFF_Feature *feat, *newfeat;
  SEXP rv;

  numGff = length(gffListP);
  gfflist = lst_new_ptr(numGff);
  //  Rf_PrintValue(gffListP);
  for (i = 0; i < numGff; i++) {
    gff = (GFF_Set*)EXTPTR_PTR(VECTOR_ELT(gffListP, i));
    lst_push_ptr(gfflist, gff);
    gff_register_protect(gff);
  }
  or = LOGICAL_VALUE(orP);
  returnGff = LOGICAL_VALUE(returnGffP);
  if (!or && numGff >= 2) {
    newgff = gff_overlap_gff(lst_get_ptr(gfflist, 0),
			     lst_get_ptr(gfflist, 1),
			     1, -1.0, FALSE, TRUE, NULL);
    numbit = gff_flatten_mergeAll(newgff);
    for (i=2; i < numGff; i++) {
      checkInterrupt();
      gff = gff_overlap_gff(newgff,
			    lst_get_ptr(gfflist, i),
			    1, -1.0, FALSE, TRUE, NULL);
      numbit = gff_flatten_mergeAll(gff);
      gff_free_set(newgff);
      newgff = gff;
    }
  } else {
    newgff = gff_new_set();
    for (i=0; i< numGff; i++) {
      gff = (GFF_Set*)lst_get_ptr(gfflist, i);
      for (j=0; j < lst_size(gff->features); j++) {
	checkInterruptN(j, 1000);
	feat = lst_get_ptr(gff->features, j);
	newfeat = gff_new_feature_copy(feat);
	lst_push_ptr(newgff->features, newfeat);
      }
    }
    numbit = gff_flatten_mergeAll(newgff);
  }
  if (returnGff)
    return rph_gff_new_extptr(newgff);

  if (numbit > INT_MAX) {
    PROTECT(rv = allocVector(REALSXP, 1));
    REAL(rv)[0] = numbit;
  } else {
    PROTECT(rv = allocVector(INTSXP, 1));
    INTEGER(rv)[0] = numbit;
  }
  UNPROTECT(1);
  return rv;
}
Esempio n. 2
0
void mafBlock_mask_region(MafBlock *block, GFF_Set *mask_feats, List *speclist) {
  MafSubBlock *refblock, *maskblock;
  int i, j, spec_idx;
  GFF_Set *feat;
  GFF_Feature *f, *prevf=NULL;
  int next_feat_idx = 1;
  char **maskseq;
  int num_mask_seq=0;
  long coord;
  if (mask_feats == NULL || lst_size(mask_feats->features) == 0L) return;
  maskseq = smalloc(lst_size(speclist)*sizeof(char*));
  for (i=0; i < lst_size(speclist); i++) {
    spec_idx = hsh_get_int(block->specMap, ((String*)lst_get_ptr(speclist, i))->chars);
    if (spec_idx == -1) continue;
    maskblock = lst_get_ptr(block->data, spec_idx);
    if (maskblock->seq == NULL) continue;
    maskseq[num_mask_seq++] = maskblock->seq->chars;
  }
  if (num_mask_seq == 0) {
    sfree(maskseq);
    return;
  }
  feat = gff_copy_set_no_groups(mask_feats);
  gff_flatten_mergeAll(feat);
  f = lst_get_ptr(feat->features, 0);

  refblock = lst_get_ptr(block->data, 0);
  coord = refblock->start;
  for (i=0; i < block->seqlen; i++) {
    if (refblock->seq->chars[i] != '-') coord++;  //this is 1-based coordinate
    if (coord > f->end) {
      if (next_feat_idx == lst_size(feat->features))
	break;
      prevf = f;
      f = lst_get_ptr(feat->features, next_feat_idx++);
      if (f->start <= prevf->end) {
	die("Error: feats not sorted in mafBlock_mask_region");  //shouldn't happen
      }
    }
    if (coord >= f->start && coord <= f->end) {
      for (j=0; j < num_mask_seq; j++)
	if (maskseq[j][i] != '-') maskseq[j][i] = 'N';
    }
  }
  gff_free_set(feat);
  sfree(maskseq);
}
int main(int argc, char* argv[]) {
  FILE* F;
  MSA *msa;
  msa_format_type format = UNKNOWN_FORMAT;
  int src_ref = -1, dest_ref = 0, offset = 0;
  char *msa_fname = NULL, *feat_fname = NULL;
  GFF_Set *gff;
  char c;

  while ((c = (char)getopt(argc, argv, "hm:f:s:d:i:p:n:")) != -1) {
    switch(c) {
    case 'm':
      msa_fname = optarg;
      break;
    case 'f':
      feat_fname = optarg;
      break;
    case 's':
      src_ref = get_arg_int(optarg);
      break;
    case 'd':
      dest_ref = get_arg_int(optarg);
      break;
    case 'i':
      format = msa_str_to_format(optarg);
      if (format == UNKNOWN_FORMAT) die("ERROR: bad alignment format.\n");
      break;
    case 'p':
      offset = get_arg_int(optarg);
      break;
    case 'n':
      offset = -1 * get_arg_int(optarg);
      break;
    case 'h':
      print_usage();
      exit(1);
    case '?':
      print_usage();
      exit(1);
    }
  }

  if (msa_fname == NULL || feat_fname == NULL) {
    print_usage();
    exit(1);
  }

  set_seed(-1);

  F = phast_fopen(feat_fname, "r");
  if ((gff = gff_read_set(F)) == NULL) { 
    die("ERROR: error reading %s.\n", feat_fname);
  }
  phast_fclose(F);

  /* handle case of local alignment specially -- avoid representing
     the alignment explicitly */
  F = phast_fopen(msa_fname, "r");
  if (format == UNKNOWN_FORMAT)
    format = msa_format_for_content(F, 1);
  if (format == LAV) {
    LocalPwAlignment *lpwa = NULL;
/*     int i; */

    fprintf(stderr, "WARNING: in local alignment mode, coordinates may only be mapped from query (reference) sequence to target (aligned) sequence.\n"); 

    lpwa = la_read_lav(F, 0);
    la_gff_transform(lpwa, gff);
/*     for (i = 0; i < lst_size(gff->features); i++) { */
/*       GFF_Feature *feat = lst_get_ptr(gff->features, i); */
/*       feat->start = la_get_target_coord(lpwa, feat->start); */
/*       feat->end = la_get_target_coord(lpwa, feat->end); */
/*     } */
  }

  else {                        /* normal alignment */
    msa = msa_new_from_file_define_format(F, format, NULL);
    phast_fclose(F);

    msa_map_gff_coords(msa, gff, src_ref, dest_ref, offset);
    msa_free(msa);
  }

  gff_print_set(stdout, gff);

  gff_free_set(gff);

  return 0;
}
Esempio n. 4
0
int main(int argc, char* argv[]) {
    FILE* F;
    MSA *msa;
    int *msa_gap_patterns = NULL;
    HMM *hmm = NULL;
    TreeNode *tree = NULL;
    int i, input_format = SS, msa_idx, quiet_mode = FALSE,
           ncats, nmsas, ncats_unspooled, indel_nseqs = -1;
    String *msa_fname, *gff_fname;
    List *gff_fname_list = NULL, *msa_fname_list = NULL,
          *msa_length_list = NULL, *model_indels_str = NULL;
    Matrix *traincounts = NULL;
    Vector *begcounts = NULL, *statecounts = NULL;
    CategoryMap *cm = NULL;
    char c;
    GapPatternMap *gpm = NULL;
    GFF_Set *gff;
    char *reverse_groups_tag = NULL;

    while ((c = getopt(argc, argv, "i:g:c:m:M:R:I:n:t:P:G:qh")) != -1) {
        switch(c) {
        case 'i':
            input_format = msa_str_to_format(optarg);
            if (input_format == -1)
                die("ERROR: bad alignment format.\n");
            break;
        case 'g':
            gff_fname_list = get_arg_list(optarg);
            break;
        case 'c':
            cm = cm_new_string_or_file(optarg);
            break;
        case 'm':
            msa_fname_list = get_arg_list(optarg);
            break;
        case 'M':
            msa_length_list = str_list_as_int(get_arg_list(optarg));
            break;
        case 'R':
            reverse_groups_tag = optarg;
            break;
        case 'I':
            model_indels_str = get_arg_list(optarg);
            break;
        case 'n':
            indel_nseqs = get_arg_int(optarg);
            break;
        case 't':
            if (optarg[0] == '(')     /* in this case, assume topology given
                                   at command line */
                tree = tr_new_from_string(optarg);
            else
                tree = tr_new_from_file(phast_fopen(optarg, "r"));
            break;
        case 'q':
            quiet_mode = TRUE;
            break;
        case 'h':
            print_usage();
            exit(0);
        case '?':
            die("ERROR: unrecognized option.\n\nType 'hmm_train -h' for usage.\n");
        }
    }

    if (msa_fname_list == NULL)
        die("ERROR: -m required.  Type 'hmm_train -h' for usage.\n");
    if (gff_fname_list == NULL)
        die("ERROR: -g required in training mode.  Type 'hmm_train -h' for usage.\n");
    if (msa_length_list != NULL && msa_fname_list != NULL)
        die("ERROR: -m and -M are mutually exclusive.  Type 'hmm_train -h' for usage.\n");
    if (model_indels_str != NULL && tree == NULL)
        die("ERROR: -I requires -t.  Type 'hmm_train -h' for usage.\n");
    if (cm == NULL)
        die("ERROR: category map required.\n");

    set_seed(-1);

    ncats = cm->ncats + 1;
    ncats_unspooled = cm->unspooler != NULL ? cm->unspooler->nstates_unspooled :
                      ncats;
    nmsas = (msa_length_list != NULL ? lst_size(msa_length_list) :
             lst_size(msa_fname_list));

    if (model_indels_str != NULL) {
        if (tree == NULL)
            die("ERROR: tree is NULL\n");  /*FIXME: indel_ncats broken */
        gpm = gp_create_gapcats(cm, model_indels_str, tree, FALSE);
        ncats = cm->ncats + 1;    /* numbers will change */
        ncats_unspooled = cm->unspooler == NULL ? ncats :
                          cm->unspooler->nstates_unspooled;
    }

    /* allocate memory for storage of "training paths" */
    traincounts = mat_new(ncats_unspooled, ncats_unspooled);
    statecounts = vec_new(ncats_unspooled);
    begcounts = vec_new(ncats_unspooled);
    mat_zero(traincounts);
    vec_zero(statecounts);
    vec_zero(begcounts);


    /* create skeleton of new HMM. */
    hmm = hmm_new_nstates(ncats_unspooled, 0, 0);

    /* Main loop: consider each MSA in turn */
    for (msa_idx = 0; msa_idx < nmsas; msa_idx++) {
        if (msa_fname_list != NULL) {
            msa_fname = (String*)lst_get_ptr(msa_fname_list, msa_idx);
            F = phast_fopen(msa_fname->chars, "r");
            if (!quiet_mode)
                fprintf(stderr, "Reading alignment from %s ...\n",
                        F == stdin ? "stdin" : msa_fname->chars);
            msa = msa_new_from_file(F, NULL);
            phast_fclose(F);

        }
        else {                      /* only lengths of alignments specified */
            msa = msa_new(NULL, NULL, 0, lst_get_int(msa_length_list, msa_idx), NULL);
            /* just a shell in this case */
        }

        gff_fname = (String*)lst_get_ptr(gff_fname_list, msa_idx);
        if (!quiet_mode)
            fprintf(stderr, "Reading annotations from %s ...\n", gff_fname->chars);
        gff = gff_read_set(phast_fopen(gff_fname->chars, "r"));

        /* convert GFF to coordinate frame of alignment */
        if (msa_length_list == NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Mapping annotations to alignment ...\n");
            msa_map_gff_coords(msa, gff, 1, 0, 0); /* assume seq 1 is ref */
        }

        if (model_indels_str != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Obtaining gap patterns ...\n");
            msa_gap_patterns = smalloc(msa->length * sizeof(int));
            gp_set_phylo_patterns(gpm, msa_gap_patterns, msa);
        }

        /* at this point, we don't actually need the alignment anymore;
           if using ordered suff stats (likely with large data sets),
           can free them now, to avoid running out of memory */
        if (msa->ss != NULL) {
            ss_free(msa->ss);
            msa->ss = NULL;
        }

        if (reverse_groups_tag != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Reverse complementing features on negative strand (group by '%s') ...\n",
                        reverse_groups_tag);
            /* we don't need to reverse complement the whole alignment --
               just the gff and possibly the gap pattern array (pass a
               NULL msa) */
            gff_group(gff, reverse_groups_tag);
            msa_reverse_compl_feats(NULL, gff, msa_gap_patterns);
        }

        if (!quiet_mode)
            fprintf(stderr, "Labeling sites by category ...\n");
        msa_label_categories(msa, gff, cm);

        gff_free_set(gff);

        if (model_indels_str != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Remapping categories according to gap patterns ...\n");

            if (indel_nseqs > 0 && indel_nseqs != msa->nseqs) {
                /* in this case, we'll simply reassign non-trivial gap
                   patterns randomly.  This will achieve the desired
                   effect with minimal coding, as long as the number of
                   sites is not too small (the indel model is probably
                   useless anyway if the number is small) */
                int pat, newpat;
                int npatterns = 4 * indel_nseqs - 5;
                int complex_allowed[cm->ncats+1];
                List *no_complex_names, *no_complex_nums;

                if (!quiet_mode)
                    fprintf(stderr, "(target number of sequences: %d)\n", indel_nseqs);

                /* set up index indicating by cat no. whether complex gaps
                   are allowed */
                for (i = 0; i < ncats; i++) complex_allowed[i] = 1;
                no_complex_names = lst_new_ptr(10);
                str_split(str_new_charstr(NO_COMPLEX), ",", no_complex_names);
                no_complex_nums = cm_get_category_list(cm, no_complex_names, 1);
                for (i = 0; i < lst_size(no_complex_nums); i++)
                    complex_allowed[lst_get_int(no_complex_nums, i)] = 0;
                lst_free(no_complex_nums);
                lst_free_strings(no_complex_names);
                lst_free(no_complex_names);

                /* now reassign all non-null numbers */
                for (i = 0; i < msa->length; ) {
                    if ((pat = msa_gap_patterns[i]) != 0) {
                        if (complex_allowed[msa->categories[i]])
                            newpat = 1 + ((double)npatterns * unif_rand());
                        /* random number in interval [1, npatterns] */
                        else
                            newpat = 1 + ((double)(npatterns-1) * unif_rand());
                        /* random number in interval [1,npatterns-1]
                           (excludes complex gap pattern) */
                        for (; i < msa->length && msa_gap_patterns[i] == pat; i++)
                            msa_gap_patterns[i] = newpat; /* change for whole sequence */
                    }
                    else i++;
                }
            }

            /* obtain gapped category number for each site */
            for (i = 0; i < msa->length; i++)
                if (gpm->cat_x_pattern_to_gapcat[msa->categories[i]] != NULL)
                    msa->categories[i] = gpm->cat_x_pattern_to_gapcat[msa->categories[i]][msa_gap_patterns[i]];
        }

        if (!quiet_mode)
            fprintf(stderr, "Unspooling categories ...\n");
        cm_spooled_to_unspooled(cm, msa->categories, msa->length);

        if (!quiet_mode)
            fprintf(stderr, "Collecting training data ...\n");
        hmm_train_update_counts(traincounts, statecounts, begcounts,
                                msa->categories, msa->length,
                                ncats_unspooled);

        if (msa_gap_patterns != NULL) sfree(msa_gap_patterns);
        msa_free(msa);
    }

    /* now train HMM, using cumulative data */
    hmm_train_from_counts(hmm, traincounts, NULL, statecounts, NULL,
                          begcounts, NULL);

    /* if modeling indels, adjust begin transitions so probability is
       distributed among different "gap pattern" states that all
       correspond to the same ungapped state (category); this helps
       avoid problems that occur when training on a few large sequences
       (e.g., whole chromosomes) and then testing on many shorter ones */
    if (model_indels_str != NULL) {
        double tprob[gpm->ncats];
        int nst[gpm->ncats];  /* total prob and number of states per
                             spooled, ungapped category */
        for (i = 0; i < gpm->ncats; i++) tprob[i] = nst[i] = 0;
        for (i = 0; i < hmm->nstates; i++) {
            if (vec_get(hmm->begin_transitions, i) > 0)
                /* have to go from unspooled space to spooled space, then to
                   ungapped space (HMM states correspond to unspooled,
                   gapped categories).  Note that states with nonzero begin
                   probs shouldn't be conditioned on other states. */
                tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] +=
                    vec_get(hmm->begin_transitions, i);
            nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]++;
        }
        for (i = 0; i < hmm->nstates; i++)
            if (tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] > 0)
                vec_set(hmm->begin_transitions, i,
                        tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] /
                        nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]);
        /* (uniform prior) */
    }

    /* write trained HMM */
    hmm_print(stdout, hmm);

    if (!quiet_mode) fprintf(stderr, "Done.\n");

    return 0;
}
int main(int argc, char *argv[]) {
  char c;
  int opt_idx;
  GFF_Set *gff;
  List *include = NULL;
  char *groupby = "transcript_id", *exongroup_tag = NULL;
  int unique = FALSE, sort = FALSE, simplebed = FALSE, fix_start_stop = FALSE,
    add_utrs = FALSE, add_introns = FALSE, add_signals = FALSE;
  enum {GFF, BED, GENEPRED, WIG} output_format = GFF;
  FILE *discards_f = NULL, *groups_f = NULL;

  struct option long_opts[] = {
    {"output", 1, 0, 'o'},
    {"include-only", 1, 0, 'i'},
    {"include-groups", 1, 0, 'l'},
    {"groupby", 1, 0, 'g'},
    {"exongroup", 1, 0, 'e'},
    {"add-utrs", 0, 0, 'U'},
    {"add-introns", 0, 0, 'I'},
    {"add-signals", 0, 0, 'S'},
    {"fix-start-stop", 0, 0, 'f'},
    {"unique", 0, 0, 'u'},
    {"sort", 0, 0, 's'},
    {"simplebed", 0, 0, 'b'},
    {"discards", 1, 0, 'd'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = (char)getopt_long(argc, argv, "o:i:l:g:e:d:UISfusbh", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'o':
      if (!strcmp("bed", optarg)) output_format = BED;
      else if (!strcmp("genepred", optarg)) output_format = GENEPRED;
      else if (!strcmp("wig", optarg)) output_format = WIG;
      else if (strcmp("gff", optarg)) die("ERROR: bad output format.\n");
      break;
    case 'i':
      include = get_arg_list(optarg);
      break;
    case 'l':
      groups_f = phast_fopen(optarg, "r");
      break;
    case 'g':
      groupby = optarg;
      break;
    case 'e':
      exongroup_tag = optarg;
      break;
    case 'U':
      add_utrs = TRUE;
      break;
    case 'I':
      add_introns = TRUE;
      break;
    case 'S':
      add_signals = TRUE;
      break;
    case 'f':
      fix_start_stop = TRUE;
      break;
    case 'u':
      unique = TRUE;
      break;
    case 'b':
      simplebed = TRUE;
      output_format = BED;
      break;
    case 'd':
      discards_f = phast_fopen(optarg, "w+");
      break;
    case 's':
      sort = TRUE;
      break;
    case 'h':
      usage(argv[0]);
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  if (optind != argc - 1) 
    die("Input filename required.  Try '%s -h'.\n", argv[0]);

  set_seed(-1);

  gff = gff_read_set(phast_fopen(argv[optind], "r"));

  if (lst_size(gff->features) == 0) exit(0); /* helps avoid unexpected
                                                behavior below */

  /* filter by type */
  if (include != NULL) gff_filter_by_type(gff, include, FALSE, discards_f);

  /* group */
  gff_group(gff, groupby);

  /* utrs, introns, & signals */
  if (add_utrs) gff_create_utrs(gff);
  if (add_introns) gff_create_introns(gff);
  if (add_signals) gff_create_signals(gff);

  /* subgroup */
  if (exongroup_tag != NULL) gff_exon_group(gff, exongroup_tag);

  /* filter by group */
  if (groups_f != NULL) {
    String *s = str_new(STR_LONG_LEN);
    List *groups = lst_new_ptr(10000);
    str_slurp(s, groups_f);
    str_split(s, NULL, groups);
    gff_filter_by_group(gff, groups);
    lst_free_strings(groups); lst_free(groups);
    str_free(s);
  }

  /* sort */
  if (sort) gff_sort(gff);

  /* make unique */
  if (unique) gff_remove_overlaps(gff, discards_f);
  
  if (fix_start_stop) gff_fix_start_stop(gff);

  if (output_format == BED)
    gff_print_bed(stdout, gff, !simplebed);
  else if (output_format == GENEPRED)
    gff_print_genepred(stdout, gff);
  else if (output_format == WIG)
    wig_print(stdout, gff);
  else 
    gff_print_set(stdout, gff);
  gff_free_set(gff);
  
  return 0;
}
int main(int argc, char* argv[]) {
  FILE* F;
  GFF_Set *gff_real=NULL, *gff_pred=NULL;
  char c;
  List *real_fname_list = NULL, *pred_fname_list = NULL, 
    *feat_list = NULL, *seq_len_list = NULL, *l = NULL;
  int nfile, i, j;
  char *prefix = NULL;
  int tot_tp = 0, tot_fp = 0, tot_nreal_pos = 0, tot_npred_pos = 0, 
    tot_seqlen = 0, tot_ncr = 0, tot_npca = 0, tot_nola = 0, tot_nme = 0, 
    tot_npcp = 0, tot_nolp = 0, tot_nwe = 0, tot_nexons_real = 0, 
    tot_nexons_pred = 0, dump_exons = 0, nnc = -1, tot_nnc = -1, 
    nc_threshold = 0;

  while ((c = (char)getopt(argc, argv, "r:p:f:l:d:n:h")) != -1) {
    switch(c) {
    case 'r':
      real_fname_list = get_arg_list(optarg);
      break;
    case 'p':
      pred_fname_list = get_arg_list(optarg);
      break;
    case 'l':
      l = get_arg_list(optarg);
      /* convert to ints */
      seq_len_list = lst_new_int(lst_size(l));
      for (i = 0; i < lst_size(l); i++) {
        int tmp;
        if (str_as_int((String*)lst_get_ptr(l, i), 
                       &tmp) != 0) {
          die("ERROR: Bad integer in <seq_len_list>.\n"); 
        }
        lst_push_int(seq_len_list, tmp);
      }
      break;
    case 'f':
      feat_list = get_arg_list(optarg);
      break;
    case 'd':
      dump_exons = 1;
      prefix = optarg;
      break;
    case 'n':
      nnc = tot_nnc = 0;
      nc_threshold = get_arg_int(optarg);
      break;
    case 'h':
      print_usage();
      exit(0);
    case '?':
      die("Unrecognized option.  Try \"eval_predictions -h\" for help.\n");
    }
  }

  set_seed(-1);

  if (feat_list == NULL) {
    feat_list = lst_new_ptr(1);
    lst_push_ptr(feat_list, str_new_charstr(GFF_CDS_TYPE));
  }
  
  if (real_fname_list == NULL || pred_fname_list == NULL || 
      seq_len_list == NULL) {
    die("ERROR: Must specify -r, -p, and -l.  Try \"eval_predictions -h\" for help.\n");
  }

  if (lst_size(real_fname_list) != lst_size(pred_fname_list)) {
    die("ERROR: Must specify lists of equal length for real and predicted filenames.\n\n.");
  }

  if (lst_size(seq_len_list) == 1 && lst_size(real_fname_list) > 1)
    for (i = 1; i < lst_size(real_fname_list); i++)
      lst_push_int(seq_len_list, lst_get_int(seq_len_list, 0));
  else if (lst_size(seq_len_list) != lst_size(real_fname_list))
    die("ERROR: List of sequence lengths does not match lists of real and predicted filenames.\n");

  /* print header */
  printf("%-25s %-25s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s", "Real_fname", "Pred_fname", "Sn", "Sp", "AC", "CC", "ESn", "ESp", "CRa", "PCa", "OLa", "ME", "CRp", "PCp", "OLp", "WE");
  if (nnc != -1) printf(" %7s %7s %7s %7s", "NCa", "NCp", "CR+NCa", "CR+NCp");
  printf("\n");

  for (nfile = 0; nfile < lst_size(real_fname_list); nfile++) {
    int tp, fp, nexons_real, nexons_pred, nwe, nme, ncr, npca, 
      npcp, nola, nolp, nreal_pos, npred_pos, len_real, len_pred, seqlen,
      already_counted_real;
    String *real_fname, *pred_fname;
    GFF_Feature *feat_real, *feat_pred=NULL;

    real_fname = (String*)lst_get_ptr(real_fname_list, nfile);
    F = phast_fopen(real_fname->chars, "r");
    if ((gff_real = gff_read_set(F)) == NULL) {
      die("ERROR: Unable to read file \"%s\".\n", 
	  real_fname->chars);
    }
    phast_fclose(F);

    pred_fname = (String*)lst_get_ptr(pred_fname_list, nfile);
    F = phast_fopen(pred_fname->chars, "r");
    if ((gff_pred = gff_read_set(F)) == NULL) {
      die("ERROR: Unable to read file \"%s\".\n", 
	  pred_fname->chars);
    }
    phast_fclose(F);

    seqlen = lst_get_int(seq_len_list, nfile);

    /* sort ungrouped -- only cds exons will be considered, and each
       one will be considered individually */
    gff_ungroup(gff_real); 
    gff_ungroup(gff_pred);
    gff_sort(gff_real);
    gff_sort(gff_pred);

    nexons_real = nexons_pred = nwe = nme = ncr = npca = npcp = nola = 
      nolp = tp = fp = nreal_pos = npred_pos = 0;
    if (nnc != -1) nnc = 0;
    i = j = 0;
    already_counted_real = 0;
    while (i < lst_size(gff_real->features)) {
      feat_real = (GFF_Feature*)lst_get_ptr(gff_real->features, i);
      if (!is_exon(feat_real, feat_list)) { i++; continue; }

      len_real = feat_real->end - feat_real->start + 1;

      if (!already_counted_real) {
        nexons_real++;
        nreal_pos += len_real;
      }

      /* look at all predicted exons up to and overlapping this real exon */
      while (j < lst_size(gff_pred->features)) {
        feat_pred = (GFF_Feature*)lst_get_ptr(gff_pred->features, j);
        if (!is_exon(feat_pred, feat_list)) {
          j++;
          continue;
        }
        else if (feat_pred->start > feat_real->end) {
          if (!already_counted_real) {
            nme++;
            if (dump_exons) dump(prefix, feat_real, NULL, ME, -1);
          }
          break;
        }

        /* otherwise we have a predicted exon to count (start of pred
           <= end of real) */
        nexons_pred++;
        len_pred = feat_pred->end - feat_pred->start + 1;
        npred_pos += len_pred;
        j++;                    /* we'll be done with this prediction
                                   one way or another; next time
                                   through look at a new one */

        if (feat_pred->end < feat_real->start) { /* WE */
          nwe++;
          fp += len_pred;
          if (dump_exons) dump(prefix, NULL, feat_pred, WE, 0);
        }
        else if (feat_pred->start == feat_real->start && /* CR */
                 feat_pred->end == feat_real->end) {
          ncr++;
          tp += len_pred;
          if (dump_exons) dump(prefix, feat_real, feat_pred, CR, 1);
          break;
        }
        else if (feat_pred->start == feat_real->start || /* PC */
                 feat_pred->end == feat_real->end) {
          pred_type type;
          npca++;
          npcp++;
          if (nnc != -1 && 
              max(abs(feat_pred->start - feat_real->start), 
                  abs(feat_pred->end - feat_real->end)) <= nc_threshold) {
            nnc++; 
            type = NC;
          }
          else type = PC;
          if (len_pred < len_real) 
            tp += len_pred;
          else {
            tp += len_real;
            fp += (len_pred - len_real);
          }
          if (dump_exons) dump(prefix, feat_real, feat_pred, type, 
                               min(1, (double)len_real/len_pred));
          break;
        }
        else {                  /* OL */
          int overlap_size;
          pred_type type;
          nola++;
          nolp++;
          if (nnc != -1 && 
              max(abs(feat_pred->start - feat_real->start), 
                  abs(feat_pred->end - feat_real->end)) <= nc_threshold) {
            nnc++; 
            type = NC;
          }
          else type = PC;

          overlap_size = min(feat_pred->end, feat_real->end) - 
            max(feat_pred->start, feat_real->start) + 1;
          tp += overlap_size;
          fp += len_pred - overlap_size;
          if (dump_exons) dump(prefix, feat_real, feat_pred, type,
                               (double)overlap_size/len_pred);
          break;
        }
        /* NOTE: I'm ignoring the possibility that a predicted exon
           could be a PC and/or OL with respect to multiple real
           exons.  The effect on the exon-level stats will be fairly
           minor (at worst a predicted exon is scored as an OL when it
           should be scored as an PC, and a real exon is erroneously
           counted as a ME), but the effect on the nucleotide-level Sn
           and Sp could conceivably be significant.  */
      }

      /* if we have counted at least one prediction (and thus failed
         to reach the end of the list), but the last prediction did
         not extend as far as the end of the real exon, then delay
         moving on to the next real exon */
      if (j < lst_size(gff_pred->features) && feat_pred->end < feat_real->end) 
          already_counted_real = 1;
      else {
        /* if we reached the end of the list of predictions, then it
           must not have contained any exons, and the real exon in
           question is a ME (if it hasn't already been counted) */
        if (j == lst_size(gff_pred->features) && !already_counted_real) 
          nme++; 

        i++;
        already_counted_real = 0;
      }
    }
    
    /* any remaining predictions must be wrong */
    for (; j < lst_size(gff_pred->features); j++) {
      if (is_exon((GFF_Feature*)lst_get_ptr(gff_pred->features, j), 
                  feat_list)) {
        nexons_pred++;
        nwe++;
      }
    }

    compute_and_print_stats(stdout, real_fname, pred_fname, 
                            tp, fp, nreal_pos, npred_pos, seqlen, ncr, 
                            npca, nola, nme, npcp, nolp, nwe, 
                            nexons_real, nexons_pred, nnc);

    tot_tp += tp;
    tot_fp += fp;
    tot_nreal_pos += nreal_pos;
    tot_npred_pos += npred_pos;
    tot_seqlen += seqlen;
    tot_ncr += ncr;
    tot_npca += npca;
    tot_nola += nola;
    tot_nme += nme;
    tot_npcp += npcp;
    tot_nolp += nolp;
    tot_nwe += nwe;
    tot_nexons_real += nexons_real;
    tot_nexons_pred += nexons_pred;
    if (nnc != -1) tot_nnc += nnc;

    if (dump_exons && SUMF != NULL)
      fprintf(SUMF, "# Total number of bases in real exons: %d\n", nreal_pos);

    gff_free_set(gff_real);
    gff_free_set(gff_pred);
  }

  if (lst_size(real_fname_list) > 1)
    compute_and_print_stats(stdout, str_new_charstr("TOTAL"), str_new_charstr(""), 
                            tot_tp, tot_fp, tot_nreal_pos, tot_npred_pos, 
                            tot_seqlen, tot_ncr, tot_npca, tot_nola, tot_nme, 
                            tot_npcp, tot_nolp, tot_nwe, tot_nexons_real, 
                            tot_nexons_pred, tot_nnc);

  return 0;
}
Esempio n. 7
0
int main(int argc, char* argv[]) {
  char *maf_fname = NULL, *out_root_fname = "maf_parse", *masked_fn = NULL;
  String *refseq = NULL, *currRefseq;
  int opt_idx, startcol = 1, endcol = -1, include = 1, splitInterval = -1;
  char c, outfilename[1000], splitFormat[100]="%s%.1i.maf", *group_tag = NULL;
  List *order_list = NULL, *seqlist_str = NULL, *cats_to_do_str=NULL, *cats_to_do=NULL;
  MafBlock *block;
  FILE *mfile, *outfile=NULL, *masked_file=NULL;
  int useRefseq=TRUE, currLen=-1, blockIdx=0, currSize, sortWarned=0;
  int lastIdx = 0, currStart=0, by_category = FALSE, i, pretty_print = FALSE;
  int lastStart = -1, gffSearchIdx=0;
  GFF_Set *gff = NULL, *gffSub;
  GFF_Feature *feat;
  CategoryMap *cm = NULL;
  int base_mask_cutoff = -1, stripILines=FALSE, stripELines=FALSE;//, numspec=0;
  List *outfileList=NULL;
  Hashtable *outfileHash=NULL;//, *specNameHash=NULL;
  msa_format_type output_format = MAF;
  MSA *msa = NULL;//, **catMsa;
  char *mask_features_spec_arg=NULL;
  List *mask_features_spec=NULL;
  

  struct option long_opts[] = {
    {"start", 1, 0, 's'},
    {"end", 1, 0, 'e'},
    {"seqs", 1, 0, 'l'},
    {"exclude", 0, 0, 'x'},
    {"order", 1, 0, 'O'},
    {"split", 1, 0, 'S'},
    {"out-root", 1, 0, 'r'},
    {"out-root-digits", 1, 0, 'd'},
    {"no-refseq", 0, 0, 'n'},
    {"features", 1, 0, 'g'},
    {"by-category", 0, 0, 'L'},
    {"do-cats", 1, 0, 'C'},
    {"catmap", 1, 0, 'c'},
    {"by-group", 1, 0, 'P'},
    {"mask-bases", 1, 0, 'b'},
    {"masked-file", 1, 0, 'm'},
    {"strip-i-lines", 0, 0, 'I'},
    {"strip-e-lines", 0, 0, 'E'},
    {"mask-features", 1, 0, 'M'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };


  while ((c = getopt_long(argc, argv, "s:e:l:O:r:S:d:g:c:P:b:o:m:M:pLnxEIh", long_opts, &opt_idx)) != -1) {
    switch(c) {
    case 's':
      startcol = get_arg_int(optarg);
      break;
    case 'e':
      endcol = get_arg_int(optarg);
      break;
    case 'l':
      seqlist_str = get_arg_list(optarg);
      break;
    case 'O':
      order_list = get_arg_list(optarg);
      break;
    case 'x':
      include = FALSE;
      break;
    case 'S':
      splitInterval = atoi(optarg);
      break;
    case 'r':
      out_root_fname = optarg;
      break;
    case 'd':
      sprintf(splitFormat, "%%s%%.%si.%%s", optarg);
      break;
    case 'n':
      useRefseq = FALSE;
      break;
    case 'g':
      gff = gff_read_set(phast_fopen(optarg, "r"));
      gff_sort(gff);
      stripILines=TRUE;
      stripELines=TRUE;
      break;
    case 'c':
      cm = cm_new_string_or_file(optarg);
      break;
    case 'C':
      cats_to_do_str = get_arg_list(optarg);
      break;
    case 'L':
      by_category = TRUE;
      break;
    case 'P':
      group_tag = optarg;
      break;
    case 'b':
      base_mask_cutoff = atoi(optarg);
      break;
    case 'm':
      masked_fn = optarg;
      break;
    case 'M':
      mask_features_spec_arg = optarg;
      break;
    case 'E':
      stripELines=TRUE;
      break;
    case 'I':
      stripILines=TRUE;
      break;
    case 'o':
      output_format = msa_str_to_format(optarg);
      if (output_format == UNKNOWN_FORMAT) 
	die("ERROR: bad output format.  Try \"maf_parse -h\" for help.\n");
      if (output_format != MAF)
	die("Sorry, only MAF format output has been implemented right now.\n");
      break;
    case 'p':
      pretty_print = TRUE;
      break;
    case 'h':
      print_usage();
      exit(0);
    case '?':
      die("Bad argument.  Try 'maf_parse -h' for help.\n");
    }
  }

  if (optind >= argc) 
    die("Missing alignment filename.  Try 'maf_parse -h' for help.\n");
  else if (optind == argc - 1) 
    maf_fname = argv[optind];
  else 
    die("ERROR: Too many arguments.  Try 'maf_parse -h' for help.\n");
  
  set_seed(-1);

  if (startcol < 1 || (endcol != -1 && endcol < startcol))
    die("ERROR: must have 1 <= start <= end <= [msa_length]\n");

  if ((group_tag != NULL || by_category) && gff == NULL)
    die("ERROR: --by-category and --by-group require --features.  Try \"maf_parse -h\""
	" for help.\n");

  if (group_tag != NULL && by_category) 
    die("ERROR: --by-category and --by-group cannot be used together.  Try \"maf_parse -h\""
	" for help.\n");
  
  if (splitInterval != -1 && gff != NULL)
    die("ERROR: can't use --split and --features together.  Try \"maf_parse -h\""
	"for help\n");

  if (group_tag != NULL || by_category) {
    outfileList = lst_new_ptr(10);
    outfileHash = hsh_new(100);
  }

  if (gff != NULL && cm == NULL) 
    cm = cm_new_from_features(gff);

  if (cats_to_do_str != NULL) {
    cats_to_do = cm_get_category_str_list(cm, cats_to_do_str, FALSE);
    if (gff != NULL) 
      gff_filter_by_type(gff, cats_to_do, 0, NULL);
  }

  if (masked_fn != NULL) {
    if (base_mask_cutoff == -1)
      die("ERROR: need to use --mask-bases with --masked-file");
    masked_file = phast_fopen(masked_fn, "w");
  }

  if (mask_features_spec_arg != NULL) {
    if (gff==NULL)
      die("ERROR: need --features with --mask-features");
    mask_features_spec = lst_new_ptr(10);
    str_split(str_new_charstr(mask_features_spec_arg), ",", mask_features_spec);
    for (i=0; i < lst_size(mask_features_spec); i++) {
      fprintf(stderr, "masking species %s within features\n", 
	      ((String*)lst_get_ptr(mask_features_spec, i))->chars);
    }
  }

  /* Check to see if --do-cats names a feature which is length 1. 
     If so, set output_format to SS ? or FASTA ? */
  
  mfile = phast_fopen(maf_fname, "r");
  block = mafBlock_read_next(mfile, NULL, NULL);

  if (splitInterval == -1 && gff==NULL) {
    //TODO: do we want to copy header from original MAF in this case?
    mafBlock_open_outfile(NULL, argc, argv);
  }

  while (block != NULL) {
    if (order_list != NULL)
      mafBlock_reorder(block, order_list);
    if (seqlist_str != NULL)
      mafBlock_subSpec(block, seqlist_str, include);
    if (mafBlock_numSpec(block)==0 || mafBlock_all_gaps(block)) 
      goto get_next_block;
    if (stripILines)
      mafBlock_strip_iLines(block);
    if (stripELines)
      mafBlock_strip_eLines(block);
    if (base_mask_cutoff != -1)
      mafBlock_mask_bases(block, base_mask_cutoff, masked_file);
    //TODO: still need to implement (either here or elsewhere)
    //    if (indel_mask_cutoff != -1) 
    //      mafBlock_mask_indels(block, indel_mask_cutoff, mfile);

    if (useRefseq) {  //get refseq and check that it is consistent in MAF file
      currRefseq = mafBlock_get_refSpec(block);
      if (refseq == NULL) 
	refseq = str_new_charstr(currRefseq->chars);
      else if (str_compare(refseq, currRefseq)!=0)
	die("Error: refseq not consistent in MAF (got %s, %s)\n",
	    refseq->chars, currRefseq->chars);
    }
    
    if (startcol != 1 || endcol != -1) 
      if (0 == mafBlock_trim(block, startcol, endcol, refseq, useRefseq ? 0 : lastIdx))
	goto get_next_block;

    currSize = mafBlock_get_size(block, refseq);
    if (useRefseq) {
      currStart = mafBlock_get_start(block, refseq);
      if (currStart < lastIdx && sortWarned == 0) {
	fprintf(stderr, "Warning: input MAF not sorted with respect to refseq.  Output files may not represent contiguous alignments. (%i, %i)\n", lastIdx, currStart);
	sortWarned = 1;
      }
    }
    else currStart = lastIdx;

    if (currStart < lastStart) gffSearchIdx = 0;
    lastStart = currStart;
    
    lastIdx = currStart + currSize;

    //split by length
    if (splitInterval != -1) {
      if (currLen == -1 || currLen+currSize > splitInterval) {
	sprintf(outfilename, splitFormat, out_root_fname, ++blockIdx,
		msa_suffix_for_format(output_format));
	if (output_format == MAF) {
	  if (outfile != NULL) mafBlock_close_outfile(outfile);
	  outfile = mafBlock_open_outfile(outfilename, argc, argv);
	}
	else if (output_format != MAF && msa != NULL) {
	  //	  msa_print_to_filename(msa, outfilename, output_format, pretty_print);
	  msa_free(msa);
	  msa = NULL;
	}
	currLen = 0;
      }
      currLen += currSize;
    }
    else outfile = stdout;
    if (gff != NULL && mask_features_spec != NULL) {
      gffSub = gff_subset_range_overlap_sorted(gff, currStart+1, lastIdx,
					       &gffSearchIdx);
      if (gffSub != NULL) {
	mafBlock_mask_region(block, gffSub, mask_features_spec);
	gff_free_set(gffSub);
      }
      mafBlock_print(outfile, block, pretty_print);


    } else if (gff != NULL) {
      gffSub = gff_subset_range_overlap_sorted(gff, currStart+1, lastIdx, 
					       &gffSearchIdx);
      if (gffSub != NULL) {
	if (by_category) gff_group_by_feature(gffSub);
	else if (group_tag != NULL) gff_group(gffSub, group_tag);
	gff_sort(gffSub);
	gff_flatten_within_groups(gffSub);
	for (i=0; i<lst_size(gffSub->features); i++) {
	  feat = (GFF_Feature*)lst_get_ptr(gffSub->features, i);
	  MafBlock *subBlock = mafBlock_copy(block);
	  mafBlock_trim(subBlock, feat->start, feat->end, refseq, 0);
	  if (by_category) 
	    outfile = get_outfile(outfileList, outfileHash, feat->feature, out_root_fname,
				  argc, argv);
	  else if (group_tag != NULL) 
	    outfile = get_outfile(outfileList, outfileHash, 
				  gff_group_name(gffSub, feat), out_root_fname,
				  argc, argv);
	  else outfile = stdout;
	  if (output_format == MAF)
	    mafBlock_print(outfile, subBlock, pretty_print);
	  //	  else msa_add_mafBlock(msa);
	  mafBlock_free(subBlock);
	}
	gff_free_set(gffSub);
      }
    }
    else {
      if (output_format == MAF) 
	mafBlock_print(outfile, block, pretty_print);
      //      else msa = msa_add_mafBlock(mafBlock, msa, );
    }
    
  get_next_block:
    mafBlock_free(block);
    block = mafBlock_read_next(mfile, NULL, NULL);
  }

  if (masked_file != NULL) fclose(masked_file);

  if (output_format == MAF) {
    if (by_category || group_tag != NULL)
      close_outfiles(outfileList, outfileHash);
    else if (outfile!=NULL) mafBlock_close_outfile(outfile);
  } else {
    msa_print(stdout, msa, output_format, pretty_print);
    msa_free(msa);
  }
  if (gff != NULL) gff_free_set(gff);
  phast_fclose(mfile);
  return 0;
}
Esempio n. 8
0
void rph_gff_free(SEXP gffPtr) {
  GFF_Set *gff = (GFF_Set*)EXTPTR_PTR(gffPtr);
  phast_unregister_protected(gff);
  gff_free_set(gff);
}