Exemplo n.º 1
0
/* Return the `start` field of the sub-block that belongs to specName.
   A NULL specName selects the sub-block at index 0.  Returns -1 when the
   species-map lookup yields -1 or the index lies past the end of
   block->data. */
long mafBlock_get_start(MafBlock *block, String *specName) {
  MafSubBlock *sub;
  int pos = (specName == NULL) ? 0
    : hsh_get_int(block->specMap, specName->chars);

  if (pos == -1 || pos >= lst_size(block->data))
    return -1;

  sub = (MafSubBlock*)lst_get_ptr(block->data, pos);
  return sub->start;
}
Exemplo n.º 2
0
/* Finish every output file whose name is a key of outfileHash.  The hash
   maps a file name to an index into outfileList, which holds the FILE
   handles.  Handles that are still open (non-NULL) are passed to
   mafBlock_close_outfile directly.  Files whose handle is NULL were
   already closed, so they are reopened in append mode and passed to
   mafBlock_close_outfile as well, which adds the #eof closer and closes
   them again (see the comment at get_outfile).  Both containers are freed
   before returning. */
void close_outfiles(List *outfileList, Hashtable *outfileHash) {
  List *names = hsh_keys(outfileHash);
  int nfiles = lst_size(names), k;
  int *handled = smalloc(nfiles*sizeof(int));
  char *path;
  FILE *fp;

  /* first pass: everything that is still open */
  for (k = 0; k < nfiles; k++) {
    path = (char*)lst_get_ptr(names, k);
    fp = (FILE*)lst_get_ptr(outfileList, hsh_get_int(outfileHash, path));
    handled[k] = (fp != NULL);
    if (handled[k])
      mafBlock_close_outfile(fp);
  }

  /* second pass: reopen the ones that were skipped above */
  for (k = 0; k < nfiles; k++) {
    if (!handled[k]) {
      path = (char*)lst_get_ptr(names, k);
      fp = phast_fopen(path, "a");
      mafBlock_close_outfile(fp);
    }
  }

  sfree(handled);
  lst_free(names);
  lst_free(outfileList);
  hsh_free(outfileHash);
}
Exemplo n.º 3
0
/* Return the size associated with specName in this block.
   - specName == NULL: the block's seqlen.
   - species-map lookup yields -1, or the index is past the end of
     block->data: -1.
   - sub-block whose line type starts with 's': its `size` field.
   - sub-block whose line type starts with 'e': 0.
   Any other line type is reported through die(). */
int mafBlock_get_size(MafBlock *block, String *specName) {
  MafSubBlock *entry;
  int pos;

  if (specName == NULL)
    return block->seqlen;

  pos = hsh_get_int(block->specMap, specName->chars);
  if (pos == -1 || pos >= lst_size(block->data))
    return -1;

  entry = (MafSubBlock*)lst_get_ptr(block->data, pos);
  switch (entry->lineType[0]) {
  case 's':
    return entry->size;
  case 'e':
    break;
  default:
    die("ERROR mafBlock_get_size, expected line type 'e', got %c\n",
        entry->lineType[0]);
  }
  return 0;
}
Exemplo n.º 4
0
//Restrict the block to a subset of its species.
//if include!=0, keeps only the species named in specNameList (all others
//are removed).
//if include==0, removes the species named in specNameList (all others
//are kept).
//Names that are not in the block's species map are ignored.
void mafBlock_subSpec(MafBlock *block, List *specNameList, int include) {
  int nsub = lst_size(block->data);
  int listed = (include != 0);   //keep-flag given to species in the list
  int k, pos;
  int *keep = smalloc(nsub*sizeof(int));
  String *name;

  //every line starts with the opposite flag of the listed species
  for (k = 0; k < nsub; k++)
    keep[k] = !listed;

  for (k = 0; k < lst_size(specNameList); k++) {
    name = (String*)lst_get_ptr(specNameList, k);
    pos = hsh_get_int(block->specMap, name->chars);
    if (pos == -1) continue;
    keep[pos] = listed;
  }

  mafBlock_remove_lines(block, keep);
  sfree(keep);
}
Exemplo n.º 5
0
/* Overwrite with 'N' the non-gap characters of the species in speclist
   wherever the alignment column falls inside a feature of mask_feats.
   Column coordinates are taken from the first sub-block of the block
   (index 0): the coordinate starts at that sub-block's `start` and is
   advanced by one on every non-gap character, so a gap column carries the
   coordinate of the preceding base.  The features are copied and merged
   before use, and are expected to come out sorted and non-overlapping;
   otherwise die() is called.  Species that are not in the block, or whose
   sub-block has no sequence, are skipped.  Does nothing when mask_feats
   is NULL or empty, or when no listed species has a sequence. */
void mafBlock_mask_region(MafBlock *block, GFF_Set *mask_feats, List *speclist) {
  MafSubBlock *ref, *target;
  GFF_Set *merged;
  GFF_Feature *curr, *prev = NULL;
  char **targets;
  int ntargets = 0, nextFeat = 1, nfeats;
  int col, s, specIdx;
  long pos;

  if (mask_feats == NULL || lst_size(mask_feats->features) == 0L) return;

  /* collect the character arrays that will be edited in place */
  targets = smalloc(lst_size(speclist)*sizeof(char*));
  for (s = 0; s < lst_size(speclist); s++) {
    String *name = (String*)lst_get_ptr(speclist, s);
    specIdx = hsh_get_int(block->specMap, name->chars);
    if (specIdx == -1) continue;
    target = lst_get_ptr(block->data, specIdx);
    if (target->seq != NULL)
      targets[ntargets++] = target->seq->chars;
  }
  if (ntargets == 0) {
    sfree(targets);
    return;
  }

  /* work on a merged private copy of the features */
  merged = gff_copy_set_no_groups(mask_feats);
  gff_flatten_mergeAll(merged);
  nfeats = lst_size(merged->features);
  curr = lst_get_ptr(merged->features, 0);

  ref = lst_get_ptr(block->data, 0);
  pos = ref->start;
  for (col = 0; col < block->seqlen; col++) {
    if (ref->seq->chars[col] != '-') pos++;   /* pos is now 1-based */

    if (pos > curr->end) {
      /* past the current feature: step to the next one, if any */
      if (nextFeat == nfeats)
        break;
      prev = curr;
      curr = lst_get_ptr(merged->features, nextFeat++);
      if (curr->start <= prev->end) {
        die("Error: feats not sorted in mafBlock_mask_region");  /* shouldn't happen */
      }
    }

    if (pos < curr->start || pos > curr->end) continue;
    for (s = 0; s < ntargets; s++) {
      if (targets[s][col] != '-') targets[s][col] = 'N';
    }
  }

  gff_free_set(merged);
  sfree(targets);
}
Exemplo n.º 6
0
//Trim the block so that only columns with indices [startcol..endcol] with
//respect to refseq remain.  If refseq is NULL the frame of the entire
//alignment is used (first column has index 1); otherwise the first base of
//refseq has index start+1.  offset is added to the starting index in either
//case.  If endcol is -1, everything with index >= startcol is kept.  A
//column where refseq has a gap takes the index of the preceding base.
//Returns 0 if the block is empty or lies entirely outside the requested
//range (in the latter case the block's data is freed), and 1 otherwise.
int mafBlock_trim(MafBlock *block, int startcol, int endcol, String *refseq,
                  int offset) {
  MafSubBlock *ref = NULL;
  int col, refPos, span, firstKept = -1, lastKept = -1;
  int openEnd = (endcol == -1);
  long lo, hi, cur, colIdx;

  if (block->seqlen == 0) return 0;

  if (refseq != NULL) {
    refPos = hsh_get_int(block->specMap, refseq->chars);
    if (refPos == -1)
      die("Error: mafBlock_trim got specIdx -1\n");
    ref = (MafSubBlock*)lst_get_ptr(block->data, refPos);
    lo = ref->start + 1;
    span = ref->size;
  }
  else {
    lo = 1;
    span = block->seqlen;
  }
  lo += offset;
  hi = lo + span - 1;
  cur = lo;                      //index of the next base of the reference
  //leading gap columns count as one position before the first base
  if (refseq != NULL && ref->seq->chars[0]=='-') lo--;

  if (startcol != 1 && !openEnd && startcol > endcol)
    die("ERROR: startcol > endcol\n");

  //requested range does not touch this block at all
  if (startcol > hi || (!openEnd && endcol < lo)) {
    mafBlock_free_data(block);
    return 0;
  }

  //requested range covers the whole block: nothing to cut
  if (startcol <= lo && (openEnd || endcol >= hi))
    return 1;

  //now we know we have to do some trimming
  for (col = 0; col < block->seqlen; col++) {
    int gap = (refseq != NULL && ref->seq->chars[col]=='-');

    colIdx = gap ? cur - 1 : cur;
    if (colIdx >= startcol && (openEnd || colIdx <= endcol)) {
      if (firstKept == -1) firstKept = col + 1;
      lastKept = col + 1;
    }
    if (!gap) cur++;
  }
  mafBlock_subAlign(block, firstKept, lastKept);
  return 1;
}
Exemplo n.º 7
0
/* Rearrange the sub-blocks of block to follow specNameOrder.  Names in the
   list that are not in the block's species map are skipped; a name that
   resolves to a sub-block already placed triggers die().  Sub-blocks that
   no name in the list refers to are freed.  The block's data list and
   species map are replaced by newly built ones; the new map stores both
   the src and the specName of each kept sub-block under its new index. */
void mafBlock_reorder(MafBlock *block, List *specNameOrder) {
  int nold = lst_size(block->data), nnew = lst_size(specNameOrder);
  int k, pos, slot;
  int *placed = smalloc(nold*sizeof(int));
  List *ordered;
  Hashtable *map;
  MafSubBlock *entry;
  String *name;

  for (k = 0; k < nold; k++)
    placed[k] = 0;

  ordered = lst_new_ptr(nold);
  map = hsh_new(100);

  /* move the requested sub-blocks over in the requested order */
  for (k = 0; k < nnew; k++) {
    name = (String*)lst_get_ptr(specNameOrder, k);
    pos = hsh_get_int(block->specMap, name->chars);
    if (pos == -1) continue;
    if (placed[pos] == 1)
      die("ERROR: species %s appears twice in reorder list\n",
          name->chars);

    entry = (MafSubBlock*)lst_get_ptr(block->data, pos);
    slot = lst_size(ordered);
    hsh_put_int(map, entry->src->chars, slot);
    hsh_put_int(map, entry->specName->chars, slot);
    lst_push_ptr(ordered, (void*)entry);
    placed[pos] = 1;
  }

  /* whatever was not requested is discarded */
  for (k = 0; k < nold; k++) {
    if (placed[k] != 0) continue;
    entry = (MafSubBlock*)lst_get_ptr(block->data, k);
    mafSubBlock_free(entry);
  }

  hsh_free(block->specMap);
  lst_free(block->data);
  block->specMap = map;
  block->data = ordered;
  sfree(placed);
}
Exemplo n.º 8
0
//Read the next block from mfile and return it as a MafBlock object, or
//NULL if the end of the file is reached before a block starts.
//specHash and numSpec are optional.  If specHash is not NULL it should
//already be initialized, numSpec must not be NULL, and every species name
//not yet in the hash is added with value *numSpec, which is then
//incremented.  If specHash is NULL, numSpec is neither used nor modified.
//Blank lines before the block are skipped and a blank line ends the block;
//lines starting with '#' are ignored.  Malformed input is reported through
//die().
MafBlock *mafBlock_read_next(FILE *mfile, Hashtable *specHash, int *numSpec) {
  int i;
  char firstchar;
  String *currLine = str_new(1000);
  MafBlock *block=NULL;
  MafSubBlock *sub=NULL;   //most recently read 's'/'e' line of this block

  if (specHash != NULL && numSpec==NULL)
    die("ERROR: mafBlock_read_next: numSpec cannot be NULL "
        "if specHash is not NULL\n");

  while (EOF != str_readline(currLine, mfile)) {
    str_trim(currLine);
    if (currLine->length==0) {  //if blank line, it is either first or last line
      if (block == NULL) continue;
      else break;
    }
    firstchar = currLine->chars[0];
    if (firstchar == '#') continue;  //ignore comments
    if (block == NULL) {
      if (firstchar != 'a')
        die("ERROR: first line of MAF block should start with 'a'\n");
      block = mafBlock_new();
      block->aLine = str_new_charstr(currLine->chars);
    }
    //if 's' or 'e', then this is first line of data for this species
    else if (firstchar == 's' || firstchar == 'e') {
      sub = mafBlock_get_subBlock(currLine);
      if (hsh_get_int(block->specMap, sub->src->chars) != -1)
        die("ERROR: mafBlock has two alignments with same srcName (%s)\n",
            sub->src->chars);
      //both the full source name and the species name map to this line
      hsh_put_int(block->specMap, sub->src->chars, lst_size(block->data));
      hsh_put_int(block->specMap, sub->specName->chars, lst_size(block->data));
      lst_push_ptr(block->data, (void*)sub);
      if (specHash != NULL) {
        if (-1 == hsh_get_int(specHash, sub->specName->chars)) {
          hsh_put_int(specHash, sub->specName->chars, *numSpec);
          (*numSpec)++;
        }
      }
    }
    else if (firstchar == 'i' || firstchar == 'q') {
      //'i' and 'q' lines annotate the preceding 's'/'e' line; without one
      //there is nothing to attach them to, so reject the input here rather
      //than hand a NULL sub-block to the helpers
      if (sub == NULL)
        die("ERROR: '%c' line in MAF block appears before any 's' or 'e' line\n",
            firstchar);
      if (firstchar == 'i')
        mafBlock_add_iLine(currLine, sub);
      else
        mafBlock_add_qLine(currLine, sub);
    }
    else die("ERROR: found line in MAF block starting with '%c'\n", firstchar);
  }
  str_free(currLine);
  if (block == NULL) return NULL;

  //set seqlen and make sure all seq arrays agree
  for (i=0; i<lst_size(block->data); i++) {
    sub = (MafSubBlock*)lst_get_ptr(block->data, i);
    if (sub->lineType[0]=='e') continue;  //'e' lines are not checked
    if (block->seqlen == -1) block->seqlen = sub->seq->length;
    else if (sub->seq->length != block->seqlen) {
      die("ERROR: lengths of sequences in MAF block do not agree (%i, %i)\n",
          block->seqlen, sub->seq->length);
    }
  }
  return block;
}
Exemplo n.º 9
0
/* Entry point of the motif finder.  Parses the command-line options, reads
   the comma-separated list of alignment files given as the single
   positional argument, prepares a background model and (optionally) a list
   of starting strings, runs mtf_find, and writes the resulting motifs to
   stdout and/or to HTML and BED files named after the output prefix.
   Returns 0 on completion; fatal problems are reported through die(). */
int main(int argc, char *argv[]) {
  TreeNode *tree = NULL;
  TreeModel *backgd_mod = NULL;
  int i, j,
    size = DEFAULT_SIZE, meme_mode = 0, profile_mode = 0, 
    nrestarts = 10, npseudocounts = 5, nsamples = -1, 
    nmostprevalent = -1, tuple_size = -1, nbest = -1, sample_parms = 0,
    nmotifs = DEFAULT_NUMBER, nseqs = -1, do_html = 0, do_bed = 0, 
    suppress_stdout = 0;
  List *msa_name_list = NULL, *pos_examples = NULL, *init_list = NULL, *tmpl;
  List *msas, *motifs;
  SeqSet *seqset = NULL;
  PooledMSA *pmsa = NULL;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  Vector *backgd_mnmod = NULL;
  Hashtable *hash=NULL;
  String *output_prefix = str_new_charstr("phastm.");
  double *has_motif = NULL;
  double prior = PRIOR;
  int c;                        /* must be int: getopt() returns an int and
                                   signals the end of the options with -1,
                                   which an unsigned plain char can never
                                   compare equal to */
  GFF_Set *bedfeats = NULL;

  while ((c = getopt(argc, argv, "t:i:b:sk:md:pn:I:R:P:w:c:SB:o:HDxh")) != -1) {
    switch (c) {
    case 't':
      tree = tr_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT) 
        die("ERROR: bad input format.\n");
      break;
    case 'b':
      backgd_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1);
      break;
    case 's':                   /* accepted, but has no effect here */
      break;
    case 'k':
      size = get_arg_int(optarg);
      break;
    case 'm':
      meme_mode = 1;
      break;
    case 'd':
      pos_examples = get_arg_list(optarg);
      break;
    case 'p':
      profile_mode = 1;
      break;
    case 'n':
      nrestarts = get_arg_int(optarg);
      break;
    case 'I':
      init_list = get_arg_list(optarg);
      break;
    case 'P':                   /* two integers: count and tuple size */
      tmpl = str_list_as_int(get_arg_list(optarg));
      if (lst_size(tmpl) != 2) die("ERROR: bad argument to -P.\n");
      nmostprevalent = lst_get_int(tmpl, 0);
      tuple_size = lst_get_int(tmpl, 1);
      if (!(nmostprevalent > 0 && tuple_size > 0))
        die("ERROR: bad argument nmostprevalent=%i tuple_size=%i\n", 
            nmostprevalent, tuple_size);
      lst_free(tmpl);
      break;
    case 'R':                   /* two integers: count and tuple size */
      tmpl = str_list_as_int(get_arg_list(optarg));
      if (lst_size(tmpl) != 2) die("ERROR: bad argument to -R.\n");
      nsamples = lst_get_int(tmpl, 0);
      tuple_size = lst_get_int(tmpl, 1);
      if (!(nsamples > 0 && tuple_size > 0))
        die("ERROR nsamples=%i tuple_sizse=%i\n", nsamples, tuple_size);
      lst_free(tmpl);
      break;
    case 'c':
      npseudocounts = get_arg_int(optarg);
      break;
    case 'w':
      nbest = get_arg_int(optarg);
      break;
    case 'S':
      sample_parms = 1;
      break;
    case 'B':
      nmotifs = get_arg_int(optarg);
      break;
    case 'o': 
      str_free(output_prefix);
      output_prefix = str_new_charstr(optarg);
      str_append_char(output_prefix, '.'); 
      break;
    case 'H': 
      do_html = 1;
      break;
    case 'D': 
      do_bed = 1;
      break;
    case 'x':
      suppress_stdout = 1;
      break;
    case 'h':
      usage(argv[0]);
      /* NOTE(review): usage() presumably does not return; if it does,
         control falls through to the error below -- confirm */
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  /* exactly one positional argument (the alignment list) is required */
  if (optind != argc - 1) 
    die("ERROR: List of alignment files required.  Try '%s -h'.\n", argv[0]);

  if ((nsamples > 0 && nmostprevalent > 0) || 
      (nsamples > 0 && init_list != NULL) || 
      (nmostprevalent > 0 && init_list != NULL)) 
    die("ERROR: -I, -P, and -R are mutually exclusive.");

  set_seed(-1);
    
  msa_name_list = get_arg_list(argv[optind]);

  /* without -t, take the tree from the background model if one was given */
  if (backgd_mod != NULL && tree == NULL) tree = backgd_mod->tree;

  if (tree == NULL && !meme_mode && !profile_mode) 
    die("ERROR: Must specify -t, -m, or -p.\n");

  /* with explicit starting strings and no parameter sampling, a single
     restart is used */
  if ((init_list != NULL || nsamples > 0 || nmostprevalent > 0) && 
      !sample_parms)
    nrestarts = 1;

  /* names given with -d are stored in a hash; has_motif will record, per
     alignment (or per sequence in meme mode), whether its file name is
     among them */
  if (pos_examples != NULL) {
    hash = hsh_new(lst_size(pos_examples));
    for (i = 0; i < lst_size(pos_examples); i++)
      hsh_put_int(hash, ((String*)lst_get_ptr(pos_examples, i))->chars, 1);
    has_motif = smalloc(lst_size(msa_name_list) * sizeof(double));
  }

  /* open all MSAs */
  msas = lst_new_ptr(lst_size(msa_name_list));
  fprintf(stderr, "Reading alignment(s) ...\n");
  for (i = 0, j = 0; i < lst_size(msa_name_list); i++) {
    String *name = lst_get_ptr(msa_name_list, i);
    FILE *mfile = phast_fopen(name->chars, "r");
    msa_format_type temp_format;
    MSA *msa;
    if (msa_format == UNKNOWN_FORMAT)
      temp_format = msa_format_for_content(mfile, 1);
    else temp_format = msa_format;
    msa = msa_new_from_file_define_format(mfile, temp_format, NULL);
    phast_fclose(mfile);
    if (nseqs == -1) nseqs = msa->nseqs;   /* first alignment sets nseqs */
    /* outside meme mode, skip alignments with fewer than 300 gap-free
       columns or with a different number of sequences than the first */
    if (!meme_mode &&
        (msa->length - msa_num_gapped_cols(msa, STRIP_ANY_GAPS, -1, -1) < 300 ||
        msa->nseqs != nseqs)) {
      fprintf(stderr, "WARNING: ignoring alignment '%s' -- too few informative sites.\n", name->chars);
      msa_free(msa);
      continue;
    }

    if (msa_alph_has_lowercase(msa)) msa_toupper(msa); 
    msa_remove_N_from_alph(msa); /* Ns can be a problem */
    lst_push_ptr(msas, msa);
    if (has_motif != NULL) {
      int k, hm = (hsh_get_int(hash, name->chars) == 1);
      if (meme_mode) {          /* here need to record at individ seq level */
        has_motif = srealloc(has_motif, 
                             (j + msa->nseqs + 1) * sizeof(double)); /* FIXME */
        for (k = 0; k < msa->nseqs; k++) has_motif[j++] = hm;
      }
      else has_motif[j++] = hm;
    }
  }
  if (!meme_mode) {
    fprintf(stderr, "Extracting and pooling sufficient statistics ...\n");
    pmsa = ss_pooled_from_msas(msas, 1, size, NULL, 0);
    msa_remove_N_from_alph(pmsa->pooled_msa);
  }

  /* obtain individual sequences, if necessary */
  if (nmostprevalent > 0 || nsamples > 0 || meme_mode) {
    if (meme_mode) fprintf(stderr, "Converting to individual sequences ...\n");
    else fprintf(stderr, "Obtaining reference sequences for pre-processing ...\n");
    seqset = mtf_get_seqset(msas, meme_mode ? -1 : 1, 10 * size);
                                /* for now, assume 1st seq is reference */
    msa_remove_N_from_alph(seqset->set); 
  }

  /* build the list of starting strings requested with -P or -R */
  if (nmostprevalent > 0) {
    fprintf(stderr, "Obtaining %d most prevalent %d-tuples ...\n", 
            nmostprevalent, tuple_size);
    init_list = lst_new_ptr(nmostprevalent);
    mtf_get_common_ntuples(seqset, init_list, tuple_size, nmostprevalent);
  }
  else if (nsamples > 0) {
    fprintf(stderr, "Sampling %d %d-tuples ...\n", nsamples, tuple_size);
    init_list = lst_new_ptr(nsamples);
    mtf_sample_ntuples(seqset, init_list, tuple_size, nsamples);
  }

  /* in meme_mode, backgd model can be specified as eq freqs in a .mod file */
  if (meme_mode && backgd_mod != NULL && has_motif == NULL)
    backgd_mnmod = backgd_mod->backgd_freqs;

  /* estimate background model, if necessary */
  else if (backgd_mod == NULL && (!meme_mode || has_motif == NULL)) {
    fprintf(stderr, "Fitting background model%s ...\n", 
            has_motif == NULL ? "" : " (for use in initialization)");
                                /* if discriminative, be clear
                                   backgd isn't really part of the
                                   estimation procedure */
    if (meme_mode) {
      backgd_mnmod = vec_new(strlen(seqset->set->alphabet));
      mtf_estim_backgd_mn(seqset, backgd_mnmod);
    }
    else {
      backgd_mod = tm_new(tr_create_copy(tree), NULL, NULL, F81, 
                          pmsa->pooled_msa->alphabet, 1, 0, NULL, -1);
      tm_fit(backgd_mod, pmsa->pooled_msa, 
             tm_params_init(backgd_mod, .1, 5, 0), 
             -1, OPT_MED_PREC, NULL, 0, NULL);
    }
  }

  /* select subset of init strings, if necessary */
  if (nbest > 0 && init_list != NULL) {
    fprintf(stderr, "Winnowing candidate start strings ...\n");
    tmpl = lst_new_ptr(nbest);
    mtf_winnow_starts(meme_mode ? (void*)seqset : (void*)pmsa,
                      init_list, nbest, tmpl, !meme_mode, size, tree,
                      meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, 
                      has_motif);
    lst_free(init_list);
    init_list = tmpl;
  }

  /* Now find motifs */
  motifs = mtf_find(meme_mode ? (void*)seqset : (void*)pmsa, 
                    !meme_mode, size, nmotifs, tree,
                    meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, 
                    has_motif, prior, nrestarts, init_list, sample_parms, 
                    npseudocounts);
     
  fprintf(stderr, "\n\n");
  if (do_bed)
    bedfeats = gff_new_set_init("phast_motif", "0.1b");

  /* generate output */
  for (i = 0; i < lst_size(motifs); i++) {
    Motif *m = lst_get_ptr(motifs, i);

    if (!suppress_stdout) {
      if (lst_size(motifs) > 1) 
        printf("\n**********\nMOTIF #%d\n**********\n\n", i+1);

      mtf_print(stdout, m);
    }

    if (do_html) {              /* one file per motif: <prefix><i+1>.html */
      String *fname = str_dup(output_prefix);
      str_append_int(fname, i+1);
      str_append_charstr(fname, ".html");
      mtf_print_html(phast_fopen(fname->chars, "w+"), m);
      str_free(fname);
    }

    if (do_bed) 
      mtf_add_features(m, bedfeats);
  }
  if (do_html) {                /* summary page: <prefix>index.html */
    String *fname = str_dup(output_prefix);
    str_append_charstr(fname, "index.html");
    mtf_print_summary_html(phast_fopen(fname->chars, "w+"), 
                           motifs, output_prefix);
    str_free(fname);
  }
  if (do_bed) {                 /* all motif features: <prefix>bed */
    String *fname = str_dup(output_prefix);
    str_append_charstr(fname, "bed");
    gff_print_bed(phast_fopen(fname->chars, "w+"),
                  bedfeats, FALSE);
    str_free(fname);
  }

  return 0;
}