/* Rename nodes in every tree of a character vector of tree strings.
 *
 * treeVec    character vector; each element is parsed with rph_tree_new()
 * oldNamesP  character vector of node names to look for
 * newNamesP  character vector of replacements; element i replaces
 *            oldNamesP[i]
 *
 * Returns a character vector with one element per input tree, holding the
 * tree re-serialized by tr_to_string() after renaming.  Nodes with an empty
 * name, or whose name is not among oldNamesP, are left untouched.
 *
 * NOTE(review): newNamesP is indexed up to LENGTH(oldNamesP) and the new
 * name is strcpy'd into n->name without a bound -- presumably the caller
 * guarantees equal lengths and names that fit; confirm.
 */
SEXP rph_tree_rename(SEXP treeVec, SEXP oldNamesP, SEXP newNamesP) {
  int numtree = LENGTH(treeVec);
  int treeIdx, i;
  TreeNode *tr, *n;
  SEXP result;
  Hashtable *hash = hsh_new(20);
  char *str;

  /* Build the old-name -> new-name lookup; each value is a private copy. */
  for (i = 0; i < LENGTH(oldNamesP); i++) {
    str = smalloc((strlen(CHAR(STRING_ELT(newNamesP, i))) + 1) * sizeof(char));
    strcpy(str, CHAR(STRING_ELT(newNamesP, i)));
    hsh_put(hash, CHAR(STRING_ELT(oldNamesP, i)), str);
  }

  PROTECT(result = NEW_CHARACTER(numtree));

  for (treeIdx = 0; treeIdx < numtree; treeIdx++) {
    tr = rph_tree_new(STRING_ELT(treeVec, treeIdx));

    for (i = 0; i < tr->nnodes; i++) {
      n = lst_get_ptr(tr->nodes, i);

      /* unnamed nodes are never renamed */
      if (n->name[0] == '\0')
        continue;

      /* a lookup result of -1 means "no entry for this name" */
      str = hsh_get(hash, n->name);
      if (str != (char*)-1)
        strcpy(n->name, str);
    }

    str = tr_to_string(tr, 1);
    SET_STRING_ELT(result, treeIdx, mkChar(str));
  }

  UNPROTECT(1);
  return result;
}
/* Build a category map holding one category per distinct feature type found
   in a GFF_Set.  Types are numbered 1..n in the order in which they are
   first encountered; range 0 is given the name BACKGD_CAT_NAME. */
CategoryMap* cm_new_from_features(GFF_Set *feats) {
  int i;
  CategoryMap *retval;
  Hashtable *seen;      /* feature types already recorded */
  List *types;          /* distinct types, in order of first appearance */

  /* Pass 1: collect the distinct feature types. */
  seen = hsh_new(10);
  types = lst_new_ptr(10);
  for (i = 0; i < lst_size(feats->features); i++) {
    GFF_Feature *f = lst_get_ptr(feats->features, i);
    checkInterruptN(i, 10000);

    if (hsh_get(seen, f->feature->chars) != (void*)-1)
      continue;                 /* this type has been seen before */

    lst_push_ptr(types, f->feature);
    hsh_put_int(seen, f->feature->chars, 1);
  }
  hsh_free(seen);

  /* Pass 2: one single-number category range per type, plus range 0. */
  retval = cm_new(lst_size(types));
  for (i = 0; i <= retval->ncats; i++) {
    String *type;

    if (i == 0)
      type = str_new_charstr(BACKGD_CAT_NAME);
    else
      type = str_dup(lst_get_ptr(types, i-1));

    retval->ranges[i] = cm_new_category_range(type, i, i);
  }

  lst_free(types);
  return retval;
}
MafBlock *mafBlock_new() { MafBlock *block = smalloc(sizeof(MafBlock)); block->aLine = NULL; block->specMap = hsh_new(100); block->seqlen = -1; block->data = lst_new_ptr(20); block->prev = block->next = NULL; return block; }
/* Reorder the sub-blocks of a MAF block to follow specNameOrder.
 *
 * block          block to modify in place
 * specNameOrder  list of String* names, looked up in block->specMap
 *
 * Names with no entry in block->specMap are skipped.  Sub-blocks that are
 * not selected by any name in specNameOrder are freed and dropped from the
 * block.  A name that selects an already-selected sub-block is a fatal
 * error.  block->data and block->specMap are replaced by newly built ones.
 */
void mafBlock_reorder(MafBlock *block, List *specNameOrder) {
  int oldSize = lst_size(block->data);
  int newSize = lst_size(specNameOrder);
  int *found;             /* found[k] == 1 once old sub-block k is kept */
  List *newData;
  Hashtable *newSpecMap;
  int i;

  found = smalloc(oldSize*sizeof(int));
  for (i = 0; i < oldSize; i++)
    found[i] = 0;

  newData = lst_new_ptr(oldSize);
  newSpecMap = hsh_new(100);

  /* Walk the requested order, moving each matching sub-block across. */
  for (i = 0; i < newSize; i++) {
    String *name = (String*)lst_get_ptr(specNameOrder, i);
    int idx = hsh_get_int(block->specMap, name->chars);
    MafSubBlock *sub;

    if (idx == -1)
      continue;           /* name not present in this block */

    if (found[idx] == 1)
      die("ERROR: species %s appears twice in reorder list\n", name->chars);

    sub = (MafSubBlock*)lst_get_ptr(block->data, idx);

    /* both the src string and the species name map to the new position */
    hsh_put_int(newSpecMap, sub->src->chars, lst_size(newData));
    hsh_put_int(newSpecMap, sub->specName->chars, lst_size(newData));
    lst_push_ptr(newData, (void*)sub);
    found[idx] = 1;
  }

  /* Anything not carried over is released. */
  for (i = 0; i < oldSize; i++) {
    if (found[i] != 0)
      continue;
    mafSubBlock_free((MafSubBlock*)lst_get_ptr(block->data, i));
  }

  hsh_free(block->specMap);
  lst_free(block->data);
  block->specMap = newSpecMap;
  block->data = newData;
  sfree(found);
}
int main(int argc, char* argv[]) { char *maf_fname = NULL, *out_root_fname = "maf_parse", *masked_fn = NULL; String *refseq = NULL, *currRefseq; int opt_idx, startcol = 1, endcol = -1, include = 1, splitInterval = -1; char c, outfilename[1000], splitFormat[100]="%s%.1i.maf", *group_tag = NULL; List *order_list = NULL, *seqlist_str = NULL, *cats_to_do_str=NULL, *cats_to_do=NULL; MafBlock *block; FILE *mfile, *outfile=NULL, *masked_file=NULL; int useRefseq=TRUE, currLen=-1, blockIdx=0, currSize, sortWarned=0; int lastIdx = 0, currStart=0, by_category = FALSE, i, pretty_print = FALSE; int lastStart = -1, gffSearchIdx=0; GFF_Set *gff = NULL, *gffSub; GFF_Feature *feat; CategoryMap *cm = NULL; int base_mask_cutoff = -1, stripILines=FALSE, stripELines=FALSE;//, numspec=0; List *outfileList=NULL; Hashtable *outfileHash=NULL;//, *specNameHash=NULL; msa_format_type output_format = MAF; MSA *msa = NULL;//, **catMsa; char *mask_features_spec_arg=NULL; List *mask_features_spec=NULL; struct option long_opts[] = { {"start", 1, 0, 's'}, {"end", 1, 0, 'e'}, {"seqs", 1, 0, 'l'}, {"exclude", 0, 0, 'x'}, {"order", 1, 0, 'O'}, {"split", 1, 0, 'S'}, {"out-root", 1, 0, 'r'}, {"out-root-digits", 1, 0, 'd'}, {"no-refseq", 0, 0, 'n'}, {"features", 1, 0, 'g'}, {"by-category", 0, 0, 'L'}, {"do-cats", 1, 0, 'C'}, {"catmap", 1, 0, 'c'}, {"by-group", 1, 0, 'P'}, {"mask-bases", 1, 0, 'b'}, {"masked-file", 1, 0, 'm'}, {"strip-i-lines", 0, 0, 'I'}, {"strip-e-lines", 0, 0, 'E'}, {"mask-features", 1, 0, 'M'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "s:e:l:O:r:S:d:g:c:P:b:o:m:M:pLnxEIh", long_opts, &opt_idx)) != -1) { switch(c) { case 's': startcol = get_arg_int(optarg); break; case 'e': endcol = get_arg_int(optarg); break; case 'l': seqlist_str = get_arg_list(optarg); break; case 'O': order_list = get_arg_list(optarg); break; case 'x': include = FALSE; break; case 'S': splitInterval = atoi(optarg); break; case 'r': out_root_fname = optarg; break; case 'd': 
sprintf(splitFormat, "%%s%%.%si.%%s", optarg); break; case 'n': useRefseq = FALSE; break; case 'g': gff = gff_read_set(phast_fopen(optarg, "r")); gff_sort(gff); stripILines=TRUE; stripELines=TRUE; break; case 'c': cm = cm_new_string_or_file(optarg); break; case 'C': cats_to_do_str = get_arg_list(optarg); break; case 'L': by_category = TRUE; break; case 'P': group_tag = optarg; break; case 'b': base_mask_cutoff = atoi(optarg); break; case 'm': masked_fn = optarg; break; case 'M': mask_features_spec_arg = optarg; break; case 'E': stripELines=TRUE; break; case 'I': stripILines=TRUE; break; case 'o': output_format = msa_str_to_format(optarg); if (output_format == UNKNOWN_FORMAT) die("ERROR: bad output format. Try \"maf_parse -h\" for help.\n"); if (output_format != MAF) die("Sorry, only MAF format output has been implemented right now.\n"); break; case 'p': pretty_print = TRUE; break; case 'h': print_usage(); exit(0); case '?': die("Bad argument. Try 'maf_parse -h' for help.\n"); } } if (optind >= argc) die("Missing alignment filename. Try 'maf_parse -h' for help.\n"); else if (optind == argc - 1) maf_fname = argv[optind]; else die("ERROR: Too many arguments. Try 'maf_parse -h' for help.\n"); set_seed(-1); if (startcol < 1 || (endcol != -1 && endcol < startcol)) die("ERROR: must have 1 <= start <= end <= [msa_length]\n"); if ((group_tag != NULL || by_category) && gff == NULL) die("ERROR: --by-category and --by-group require --features. Try \"maf_parse -h\"" " for help.\n"); if (group_tag != NULL && by_category) die("ERROR: --by-category and --by-group cannot be used together. Try \"maf_parse -h\"" " for help.\n"); if (splitInterval != -1 && gff != NULL) die("ERROR: can't use --split and --features together. 
Try \"maf_parse -h\"" "for help\n"); if (group_tag != NULL || by_category) { outfileList = lst_new_ptr(10); outfileHash = hsh_new(100); } if (gff != NULL && cm == NULL) cm = cm_new_from_features(gff); if (cats_to_do_str != NULL) { cats_to_do = cm_get_category_str_list(cm, cats_to_do_str, FALSE); if (gff != NULL) gff_filter_by_type(gff, cats_to_do, 0, NULL); } if (masked_fn != NULL) { if (base_mask_cutoff == -1) die("ERROR: need to use --mask-bases with --masked-file"); masked_file = phast_fopen(masked_fn, "w"); } if (mask_features_spec_arg != NULL) { if (gff==NULL) die("ERROR: need --features with --mask-features"); mask_features_spec = lst_new_ptr(10); str_split(str_new_charstr(mask_features_spec_arg), ",", mask_features_spec); for (i=0; i < lst_size(mask_features_spec); i++) { fprintf(stderr, "masking species %s within features\n", ((String*)lst_get_ptr(mask_features_spec, i))->chars); } } /* Check to see if --do-cats names a feature which is length 1. If so, set output_format to SS ? or FASTA ? */ mfile = phast_fopen(maf_fname, "r"); block = mafBlock_read_next(mfile, NULL, NULL); if (splitInterval == -1 && gff==NULL) { //TODO: do we want to copy header from original MAF in this case? 
mafBlock_open_outfile(NULL, argc, argv); } while (block != NULL) { if (order_list != NULL) mafBlock_reorder(block, order_list); if (seqlist_str != NULL) mafBlock_subSpec(block, seqlist_str, include); if (mafBlock_numSpec(block)==0 || mafBlock_all_gaps(block)) goto get_next_block; if (stripILines) mafBlock_strip_iLines(block); if (stripELines) mafBlock_strip_eLines(block); if (base_mask_cutoff != -1) mafBlock_mask_bases(block, base_mask_cutoff, masked_file); //TODO: still need to implement (either here or elsewhere) // if (indel_mask_cutoff != -1) // mafBlock_mask_indels(block, indel_mask_cutoff, mfile); if (useRefseq) { //get refseq and check that it is consistent in MAF file currRefseq = mafBlock_get_refSpec(block); if (refseq == NULL) refseq = str_new_charstr(currRefseq->chars); else if (str_compare(refseq, currRefseq)!=0) die("Error: refseq not consistent in MAF (got %s, %s)\n", refseq->chars, currRefseq->chars); } if (startcol != 1 || endcol != -1) if (0 == mafBlock_trim(block, startcol, endcol, refseq, useRefseq ? 0 : lastIdx)) goto get_next_block; currSize = mafBlock_get_size(block, refseq); if (useRefseq) { currStart = mafBlock_get_start(block, refseq); if (currStart < lastIdx && sortWarned == 0) { fprintf(stderr, "Warning: input MAF not sorted with respect to refseq. Output files may not represent contiguous alignments. 
(%i, %i)\n", lastIdx, currStart); sortWarned = 1; } } else currStart = lastIdx; if (currStart < lastStart) gffSearchIdx = 0; lastStart = currStart; lastIdx = currStart + currSize; //split by length if (splitInterval != -1) { if (currLen == -1 || currLen+currSize > splitInterval) { sprintf(outfilename, splitFormat, out_root_fname, ++blockIdx, msa_suffix_for_format(output_format)); if (output_format == MAF) { if (outfile != NULL) mafBlock_close_outfile(outfile); outfile = mafBlock_open_outfile(outfilename, argc, argv); } else if (output_format != MAF && msa != NULL) { // msa_print_to_filename(msa, outfilename, output_format, pretty_print); msa_free(msa); msa = NULL; } currLen = 0; } currLen += currSize; } else outfile = stdout; if (gff != NULL && mask_features_spec != NULL) { gffSub = gff_subset_range_overlap_sorted(gff, currStart+1, lastIdx, &gffSearchIdx); if (gffSub != NULL) { mafBlock_mask_region(block, gffSub, mask_features_spec); gff_free_set(gffSub); } mafBlock_print(outfile, block, pretty_print); } else if (gff != NULL) { gffSub = gff_subset_range_overlap_sorted(gff, currStart+1, lastIdx, &gffSearchIdx); if (gffSub != NULL) { if (by_category) gff_group_by_feature(gffSub); else if (group_tag != NULL) gff_group(gffSub, group_tag); gff_sort(gffSub); gff_flatten_within_groups(gffSub); for (i=0; i<lst_size(gffSub->features); i++) { feat = (GFF_Feature*)lst_get_ptr(gffSub->features, i); MafBlock *subBlock = mafBlock_copy(block); mafBlock_trim(subBlock, feat->start, feat->end, refseq, 0); if (by_category) outfile = get_outfile(outfileList, outfileHash, feat->feature, out_root_fname, argc, argv); else if (group_tag != NULL) outfile = get_outfile(outfileList, outfileHash, gff_group_name(gffSub, feat), out_root_fname, argc, argv); else outfile = stdout; if (output_format == MAF) mafBlock_print(outfile, subBlock, pretty_print); // else msa_add_mafBlock(msa); mafBlock_free(subBlock); } gff_free_set(gffSub); } } else { if (output_format == MAF) mafBlock_print(outfile, 
block, pretty_print); // else msa = msa_add_mafBlock(mafBlock, msa, ); } get_next_block: mafBlock_free(block); block = mafBlock_read_next(mfile, NULL, NULL); } if (masked_file != NULL) fclose(masked_file); if (output_format == MAF) { if (by_category || group_tag != NULL) close_outfiles(outfileList, outfileHash); else if (outfile!=NULL) mafBlock_close_outfile(outfile); } else { msa_print(stdout, msa, output_format, pretty_print); msa_free(msa); } if (gff != NULL) gff_free_set(gff); phast_fclose(mfile); return 0; }
int main(int argc, char *argv[]) { TreeNode *tree = NULL; TreeModel *backgd_mod = NULL; int i, j, size = DEFAULT_SIZE, meme_mode = 0, profile_mode = 0, nrestarts = 10, npseudocounts = 5, nsamples = -1, nmostprevalent = -1, tuple_size = -1, nbest = -1, sample_parms = 0, nmotifs = DEFAULT_NUMBER, nseqs = -1, do_html = 0, do_bed = 0, suppress_stdout = 0; List *msa_name_list = NULL, *pos_examples = NULL, *init_list = NULL, *tmpl; List *msas, *motifs; SeqSet *seqset = NULL; PooledMSA *pmsa = NULL; msa_format_type msa_format = UNKNOWN_FORMAT; Vector *backgd_mnmod = NULL; Hashtable *hash=NULL; String *output_prefix = str_new_charstr("phastm."); double *has_motif = NULL; double prior = PRIOR; char c; GFF_Set *bedfeats = NULL; while ((c = getopt(argc, argv, "t:i:b:sk:md:pn:I:R:P:w:c:SB:o:HDxh")) != -1) { switch (c) { case 't': tree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("ERROR: bad input format.\n"); break; case 'b': backgd_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1); break; case 's': break; case 'k': size = get_arg_int(optarg); break; case 'm': meme_mode = 1; break; case 'd': pos_examples = get_arg_list(optarg); break; case 'p': profile_mode = 1; break; case 'n': nrestarts = get_arg_int(optarg); break; case 'I': init_list = get_arg_list(optarg); break; case 'P': tmpl = str_list_as_int(get_arg_list(optarg)); if (lst_size(tmpl) != 2) die("ERROR: bad argument to -P.\n"); nmostprevalent = lst_get_int(tmpl, 0); tuple_size = lst_get_int(tmpl, 1); if (!(nmostprevalent > 0 && tuple_size > 0)) die("ERROR: bad argument nmostprevalent=%i tuple_size=%i\n", nmostprevalent, tuple_size); lst_free(tmpl); break; case 'R': tmpl = str_list_as_int(get_arg_list(optarg)); if (lst_size(tmpl) != 2) die("ERROR: bad argument to -R.\n"); nsamples = lst_get_int(tmpl, 0); tuple_size = lst_get_int(tmpl, 1); if (!(nsamples > 0 && tuple_size > 0)) die("ERROR nsamples=%i tuple_sizse=%i\n", 
nsamples, tuple_size); lst_free(tmpl); break; case 'c': npseudocounts = get_arg_int(optarg); break; case 'w': nbest = get_arg_int(optarg); break; case 'S': sample_parms = 1; break; case 'B': nmotifs = get_arg_int(optarg); break; case 'o': str_free(output_prefix); output_prefix = str_new_charstr(optarg); str_append_char(output_prefix, '.'); break; case 'H': do_html = 1; break; case 'D': do_bed = 1; break; case 'x': suppress_stdout = 1; break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind != argc - 1) die("ERROR: List of alignment files required. Try '%s -h'.\n", argv[0]); if ((nsamples > 0 && nmostprevalent > 0) || (nsamples > 0 && init_list != NULL) || (nmostprevalent > 0 && init_list != NULL)) die("ERROR: -I, -P, and -R are mutually exclusive."); set_seed(-1); msa_name_list = get_arg_list(argv[optind]); if (backgd_mod != NULL && tree == NULL) tree = backgd_mod->tree; if (tree == NULL && !meme_mode && !profile_mode) die("ERROR: Must specify -t, -m, or -p.\n"); if ((init_list != NULL || nsamples > 0 || nmostprevalent > 0) && !sample_parms) nrestarts = 1; if (pos_examples != NULL) { hash = hsh_new(lst_size(pos_examples)); for (i = 0; i < lst_size(pos_examples); i++) hsh_put_int(hash, ((String*)lst_get_ptr(pos_examples, i))->chars, 1); has_motif = smalloc(lst_size(msa_name_list) * sizeof(double)); } /* open all MSAs */ msas = lst_new_ptr(lst_size(msa_name_list)); fprintf(stderr, "Reading alignment(s) ...\n"); for (i = 0, j = 0; i < lst_size(msa_name_list); i++) { String *name = lst_get_ptr(msa_name_list, i); FILE *mfile = phast_fopen(name->chars, "r"); msa_format_type temp_format; MSA *msa; if (msa_format == UNKNOWN_FORMAT) temp_format = msa_format_for_content(mfile, 1); else temp_format = msa_format; msa = msa_new_from_file_define_format(mfile, temp_format, NULL); phast_fclose(mfile); if (nseqs == -1) nseqs = msa->nseqs; if (!meme_mode && (msa->length - msa_num_gapped_cols(msa, STRIP_ANY_GAPS, -1, -1) < 300 || 
msa->nseqs != nseqs)) { fprintf(stderr, "WARNING: ignoring alignment '%s' -- too few informative sites.\n", name->chars); msa_free(msa); continue; } if (msa_alph_has_lowercase(msa)) msa_toupper(msa); msa_remove_N_from_alph(msa); /* Ns can be a problem */ lst_push_ptr(msas, msa); if (has_motif != NULL) { int k, hm = (hsh_get_int(hash, name->chars) == 1); if (meme_mode) { /* here need to record at individ seq level */ has_motif = srealloc(has_motif, (j + msa->nseqs + 1) * sizeof(double)); /* FIXME */ for (k = 0; k < msa->nseqs; k++) has_motif[j++] = hm; } else has_motif[j++] = hm; } } if (!meme_mode) { fprintf(stderr, "Extracting and pooling sufficient statistics ...\n"); pmsa = ss_pooled_from_msas(msas, 1, size, NULL, 0); msa_remove_N_from_alph(pmsa->pooled_msa); } /* obtain individual sequences, if necessary */ if (nmostprevalent > 0 || nsamples > 0 || meme_mode) { if (meme_mode) fprintf(stderr, "Converting to individual sequences ...\n"); else fprintf(stderr, "Obtaining reference sequences for pre-processing ...\n"); seqset = mtf_get_seqset(msas, meme_mode ? -1 : 1, 10 * size); /* for now, assume 1st seq is reference */ msa_remove_N_from_alph(seqset->set); } if (nmostprevalent > 0) { fprintf(stderr, "Obtaining %d most prevalent %d-tuples ...\n", nmostprevalent, tuple_size); init_list = lst_new_ptr(nmostprevalent); mtf_get_common_ntuples(seqset, init_list, tuple_size, nmostprevalent); } else if (nsamples > 0) { fprintf(stderr, "Sampling %d %d-tuples ...\n", nsamples, tuple_size); init_list = lst_new_ptr(nsamples); mtf_sample_ntuples(seqset, init_list, tuple_size, nsamples); } /* in meme_mode, backgd model can be specified as eq freqs in a .mod file */ if (meme_mode && backgd_mod != NULL && has_motif == NULL) backgd_mnmod = backgd_mod->backgd_freqs; /* estimate background model, if necessary */ else if (backgd_mod == NULL && (!meme_mode || has_motif == NULL)) { fprintf(stderr, "Fitting background model%s ...\n", has_motif == NULL ? 
"" : " (for use in initialization)"); /* if discriminative, be clear backgd isn't really part of the estimation procedure */ if (meme_mode) { backgd_mnmod = vec_new(strlen(seqset->set->alphabet)); mtf_estim_backgd_mn(seqset, backgd_mnmod); } else { backgd_mod = tm_new(tr_create_copy(tree), NULL, NULL, F81, pmsa->pooled_msa->alphabet, 1, 0, NULL, -1); tm_fit(backgd_mod, pmsa->pooled_msa, tm_params_init(backgd_mod, .1, 5, 0), -1, OPT_MED_PREC, NULL, 0, NULL); } } /* select subset of init strings, if necessary */ if (nbest > 0 && init_list != NULL) { fprintf(stderr, "Winnowing candidate start strings ...\n"); tmpl = lst_new_ptr(nbest); mtf_winnow_starts(meme_mode ? (void*)seqset : (void*)pmsa, init_list, nbest, tmpl, !meme_mode, size, tree, meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, has_motif); lst_free(init_list); init_list = tmpl; } /* Now find motifs */ motifs = mtf_find(meme_mode ? (void*)seqset : (void*)pmsa, !meme_mode, size, nmotifs, tree, meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, has_motif, prior, nrestarts, init_list, sample_parms, npseudocounts); fprintf(stderr, "\n\n"); if (do_bed) bedfeats = gff_new_set_init("phast_motif", "0.1b"); /* generate output */ for (i = 0; i < lst_size(motifs); i++) { Motif *m = lst_get_ptr(motifs, i); if (!suppress_stdout) { if (lst_size(motifs) > 1) printf("\n**********\nMOTIF #%d\n**********\n\n", i+1); mtf_print(stdout, m); } if (do_html) { String *fname = str_dup(output_prefix); str_append_int(fname, i+1); str_append_charstr(fname, ".html"); mtf_print_html(phast_fopen(fname->chars, "w+"), m); str_free(fname); } if (do_bed) mtf_add_features(m, bedfeats); } if (do_html) { String *fname = str_dup(output_prefix); str_append_charstr(fname, "index.html"); mtf_print_summary_html(phast_fopen(fname->chars, "w+"), motifs, output_prefix); str_free(fname); } if (do_bed) { String *fname = str_dup(output_prefix); str_append_charstr(fname, "bed"); gff_print_bed(phast_fopen(fname->chars, "w+"), bedfeats, FALSE); 
str_free(fname); } return 0; }