/* Return the `start` field of one row (sub-block) of a MAF block.
 *
 * block     block to query
 * specName  name looked up in block->specMap to pick the row; if NULL,
 *           row 0 is used
 *
 * Returns -1 if the name is not in the map or the resulting row index is
 * not smaller than the number of rows in block->data.
 */
long mafBlock_get_start(MafBlock *block, String *specName) {
  MafSubBlock *row;
  int rowIdx;

  if (specName == NULL)
    rowIdx = 0;
  else
    rowIdx = hsh_get_int(block->specMap, specName->chars);

  /* -1 is the "not found" value; the upper bound also covers an empty block */
  if (rowIdx == -1 || rowIdx >= lst_size(block->data))
    return -1;

  row = (MafSubBlock*)lst_get_ptr(block->data, rowIdx);
  return row->start;
}
/* Finalize every output file named in outfileHash, then free the
 * bookkeeping structures.
 *
 * outfileHash maps a file name to an index into outfileList, and
 * outfileList holds the FILE* for that name (NULL when the handle is not
 * currently open; see the comment at get_outfile).
 *
 * Pass 1 hands every handle that is still open to mafBlock_close_outfile.
 * Pass 2 reopens each remaining file in append mode, one at a time, and
 * hands it to mafBlock_close_outfile as well, so that (per the original
 * note) the #eof closer is added to those files too.  The passes are kept
 * separate so that no file is reopened while pass-1 handles are still open.
 *
 * The key list, outfileList and outfileHash are all freed before returning.
 */
void close_outfiles(List *outfileList, Hashtable *outfileHash) {
  List *names = hsh_keys(outfileHash);
  int nfiles = lst_size(names);
  int *closed, slot, k;
  char *path;
  FILE *fp;

  closed = smalloc(nfiles*sizeof(int));

  /* pass 1: files whose handle is still open */
  for (k = 0; k < nfiles; k++) {
    path = (char*)lst_get_ptr(names, k);
    slot = hsh_get_int(outfileHash, path);
    fp = (FILE*)lst_get_ptr(outfileList, slot);
    closed[k] = (fp != NULL);
    if (closed[k])
      mafBlock_close_outfile(fp);
  }

  /* pass 2: files that had no open handle -- reopen for append and close */
  for (k = 0; k < nfiles; k++) {
    if (!closed[k]) {
      path = (char*)lst_get_ptr(names, k);
      fp = phast_fopen(path, "a");
      mafBlock_close_outfile(fp);
    }
  }

  sfree(closed);
  lst_free(names);
  lst_free(outfileList);
  hsh_free(outfileHash);
}
/* Return the size associated with a MAF block or with one of its rows.
 *
 * block     block to query
 * specName  name looked up in block->specMap; if NULL, block->seqlen is
 *           returned without looking at any row
 *
 * For a named row: returns the row's `size` field when its line type
 * starts with 's', and 0 when it starts with 'e'.  Any other line type is
 * reported through die().  Returns -1 if the name is not in the map or the
 * row index is not smaller than the number of rows.
 */
int mafBlock_get_size(MafBlock *block, String *specName) {
  MafSubBlock *row;
  int rowIdx;

  if (specName == NULL)
    return block->seqlen;

  rowIdx = hsh_get_int(block->specMap, specName->chars);
  if (rowIdx == -1 || rowIdx >= lst_size(block->data))
    return -1;

  row = (MafSubBlock*)lst_get_ptr(block->data, rowIdx);
  switch (row->lineType[0]) {
  case 's':
    return row->size;
  case 'e':
    break;
  default:
    die("ERROR mafBlock_get_size, expected line type 'e', got %c\n",
        row->lineType[0]);
  }
  return 0;
}
//if exclude==0, removes all species not in list. //if exclude==1, removes all species in list void mafBlock_subSpec(MafBlock *block, List *specNameList, int include) { String *str; int i, idx, *keep, oldSize = lst_size(block->data); keep = smalloc(oldSize*sizeof(int)); for (i=0; i<oldSize; i++) keep[i]=(include==0); for (i=0; i<lst_size(specNameList); i++) { str = (String*)lst_get_ptr(specNameList, i); idx = hsh_get_int(block->specMap, str->chars); if (idx != -1) keep[idx] = !(include==0); } mafBlock_remove_lines(block, keep); sfree(keep); return; }
/* Overwrite with 'N' the bases of selected species that fall inside a set
 * of features, using the first row of the block as the coordinate frame.
 *
 * block       block whose sequences are edited in place
 * mask_feats  features giving the regions to mask; not modified (a copy is
 *             made, passed through gff_flatten_mergeAll, and freed again)
 * speclist    list of String* species names to mask; names that are not in
 *             the block, or whose row has no sequence, are skipped
 *
 * Columns are walked left to right while tracking the 1-based coordinate
 * of the first row (block->data index 0): the counter starts at that row's
 * `start` and is incremented on every non-gap character.  In each column
 * whose coordinate lies within [f->start, f->end] of the current feature,
 * every selected sequence that does not have '-' there is set to 'N'.
 *
 * Does nothing if mask_feats is NULL/empty or no listed species has
 * sequence in this block.  Assumes the first row carries sequence data
 * (refblock->seq is dereferenced without a NULL check).
 */
void mafBlock_mask_region(MafBlock *block, GFF_Set *mask_feats, List *speclist) {
  MafSubBlock *refblock, *maskblock;
  int i, j, spec_idx;
  GFF_Set *feat;
  GFF_Feature *f, *prevf=NULL;
  int next_feat_idx = 1;
  char **maskseq;
  int num_mask_seq=0;
  long coord;

  if (mask_feats == NULL || lst_size(mask_feats->features) == 0L) return;

  /* collect the raw character buffers of the rows that will be masked */
  maskseq = smalloc(lst_size(speclist)*sizeof(char*));
  for (i=0; i < lst_size(speclist); i++) {
    spec_idx = hsh_get_int(block->specMap, ((String*)lst_get_ptr(speclist, i))->chars);
    if (spec_idx == -1) continue;
    maskblock = lst_get_ptr(block->data, spec_idx);
    if (maskblock->seq == NULL) continue;
    maskseq[num_mask_seq++] = maskblock->seq->chars;
  }
  if (num_mask_seq == 0) {
    sfree(maskseq);
    return;
  }

  /* work on a private, flattened copy so the caller's set is untouched */
  feat = gff_copy_set_no_groups(mask_feats);
  gff_flatten_mergeAll(feat);
  f = lst_get_ptr(feat->features, 0);

  refblock = lst_get_ptr(block->data, 0);
  coord = refblock->start;
  for (i=0; i < block->seqlen; i++) {
    if (refblock->seq->chars[i] != '-') coord++;  //this is 1-based coordinate

    /* Move the cursor to the first feature that does not end before coord.
       This has to be a loop: coord may already lie beyond several features
       (e.g. features located before the start of this block), and stepping
       only one feature per column would compare this column against a
       stale feature and leave it unmasked. */
    while (coord > f->end) {
      if (next_feat_idx == lst_size(feat->features)) {
        f = NULL;   /* no features left: nothing more can be masked */
        break;
      }
      prevf = f;
      f = lst_get_ptr(feat->features, next_feat_idx++);
      if (f->start <= prevf->end) {
        die("Error: feats not sorted in mafBlock_mask_region");  //shouldn't happen
      }
    }
    if (f == NULL) break;

    if (coord >= f->start && coord <= f->end) {
      for (j=0; j < num_mask_seq; j++)
        if (maskseq[j][i] != '-') maskseq[j][i] = 'N';
    }
  }
  gff_free_set(feat);
  sfree(maskseq);
}
//trim mafblock to only keep columns with indcies[startcol..endcol] wrt //refseq. If refseq is null use frame of entire alignment. If endcol is -1 //then keep everything with index >= startcol. int mafBlock_trim(MafBlock *block, int startcol, int endcol, String *refseq, int offset) { MafSubBlock *sub=NULL; int i, specIdx, first=-1, last=-1, keep, length; long startIdx, lastIdx, idx; if (block->seqlen == 0) return 0; if (refseq == NULL) { startIdx = 1; length = block->seqlen; } else { specIdx = hsh_get_int(block->specMap, refseq->chars); if (specIdx == -1) die("Error: mafBlock_trim got specIdx -1\n"); sub = (MafSubBlock*)lst_get_ptr(block->data, specIdx); startIdx = sub->start + 1; length = sub->size; } startIdx += offset; lastIdx = startIdx + length - 1; idx = startIdx; if (refseq != NULL && sub->seq->chars[0]=='-') startIdx--; if (startcol != 1 && endcol != -1 && startcol > endcol) die("ERROR: startcol > endcol\n"); if (startcol > lastIdx || (endcol != -1 && endcol < startIdx)) { mafBlock_free_data(block); return 0; } if (startcol <= startIdx && (endcol == -1 || endcol >= lastIdx)) return 1; //now we know we have to do some trimming for (i=0; i<block->seqlen; i++) { if (refseq != NULL && sub->seq->chars[i]=='-') idx--; keep = (idx >= startcol && (idx <= endcol || endcol == -1)); if (first == -1 && keep) first = i+1; if (keep) last=i+1; idx++; } mafBlock_subAlign(block, first, last); return 1; }
/* Rearrange the rows of a MAF block to follow a given species order.
 *
 * block          block to edit in place
 * specNameOrder  list of String* names giving the desired row order
 *
 * Rows are appended to a new list in the order their names appear in
 * specNameOrder; names that are not in block->specMap are skipped.  A new
 * species map is built that points both the row's src and its specName at
 * the row's new position.  Rows that were never selected are released with
 * mafSubBlock_free().  The old list and map are freed and replaced.
 *
 * Calls die() if two entries of specNameOrder resolve to the same row.
 */
void mafBlock_reorder(MafBlock *block, List *specNameOrder) {
  String *name;
  MafSubBlock *row;
  List *reordered;
  Hashtable *reorderedMap;
  int i, oldIdx, newIdx, *taken;
  int nOld = lst_size(block->data), nWanted = lst_size(specNameOrder);

  taken = smalloc(nOld*sizeof(int));
  for (i = 0; i < nOld; i++)
    taken[i] = 0;

  reordered = lst_new_ptr(nOld);
  reorderedMap = hsh_new(100);

  for (i = 0; i < nWanted; i++) {
    name = (String*)lst_get_ptr(specNameOrder, i);
    oldIdx = hsh_get_int(block->specMap, name->chars);
    if (oldIdx == -1)
      continue;   /* species not present in this block */

    if (taken[oldIdx] == 1)
      die("ERROR: species %s appears twice in reorder list\n", name->chars);

    row = (MafSubBlock*)lst_get_ptr(block->data, oldIdx);
    newIdx = lst_size(reordered);
    hsh_put_int(reorderedMap, row->src->chars, newIdx);
    hsh_put_int(reorderedMap, row->specName->chars, newIdx);
    lst_push_ptr(reordered, (void*)row);
    taken[oldIdx] = 1;
  }

  /* rows that did not make it into the new list are no longer referenced */
  for (i = 0; i < nOld; i++) {
    if (taken[i] != 0)
      continue;
    row = (MafSubBlock*)lst_get_ptr(block->data, i);
    mafSubBlock_free(row);
  }

  hsh_free(block->specMap);
  lst_free(block->data);
  block->specMap = reorderedMap;
  block->data = reordered;
  sfree(taken);
}
//read next block in mfile and return MafBlock object or NULL if EOF. //specHash and numSpec are not used, but if specHash is not NULL, //it should be initialized, and any new species encountered will be added //to the hash, with numSpec increased accordingly. If specHash is NULL, //numSpec will not be used or modified. MafBlock *mafBlock_read_next(FILE *mfile, Hashtable *specHash, int *numSpec) { int i; char firstchar; String *currLine = str_new(1000); MafBlock *block=NULL; MafSubBlock *sub=NULL; if (specHash != NULL && numSpec==NULL) die("ERROR: mafBlock_read_next: numSpec cannot be NULL " "if specHash is not NULL\n"); while (EOF != str_readline(currLine, mfile)) { str_trim(currLine); if (currLine->length==0) { //if blank line, it is either first or last line if (block == NULL) continue; else break; } firstchar = currLine->chars[0]; if (firstchar == '#') continue; //ignore comments if (block == NULL) { if (firstchar != 'a') die("ERROR: first line of MAF block should start with 'a'\n"); block = mafBlock_new(); block->aLine = str_new_charstr(currLine->chars); } //if 's' or 'e', then this is first line of data for this species else if (firstchar == 's' || firstchar == 'e') { sub = mafBlock_get_subBlock(currLine); if (hsh_get_int(block->specMap, sub->src->chars) != -1) die("ERROR: mafBlock has two alignments with same srcName (%s)\n", sub->src->chars); hsh_put_int(block->specMap, sub->src->chars, lst_size(block->data)); hsh_put_int(block->specMap, sub->specName->chars, lst_size(block->data)); lst_push_ptr(block->data, (void*)sub); if (specHash != NULL) { if (-1 == hsh_get_int(specHash, sub->specName->chars)) { hsh_put_int(specHash, sub->specName->chars, *numSpec); (*numSpec)++; } } } else { if (firstchar == 'i') mafBlock_add_iLine(currLine, sub); else if (firstchar == 'q') mafBlock_add_qLine(currLine, sub); else die("ERROR: found line in MAF block starting with '%c'\n", firstchar); } } str_free(currLine); if (block == NULL) return NULL; //set seqlen and make sure all 
seq arrays agree for (i=0; i<lst_size(block->data); i++) { sub = (MafSubBlock*)lst_get_ptr(block->data, i); if (sub->lineType[0]=='e') continue; if (block->seqlen == -1) block->seqlen = sub->seq->length; else if (sub->seq->length != block->seqlen) { die("ERROR: lengths of sequences in MAF block do not agree (%i, %i)\n", block->seqlen, sub->seq->length); } } return block; }
int main(int argc, char *argv[]) { TreeNode *tree = NULL; TreeModel *backgd_mod = NULL; int i, j, size = DEFAULT_SIZE, meme_mode = 0, profile_mode = 0, nrestarts = 10, npseudocounts = 5, nsamples = -1, nmostprevalent = -1, tuple_size = -1, nbest = -1, sample_parms = 0, nmotifs = DEFAULT_NUMBER, nseqs = -1, do_html = 0, do_bed = 0, suppress_stdout = 0; List *msa_name_list = NULL, *pos_examples = NULL, *init_list = NULL, *tmpl; List *msas, *motifs; SeqSet *seqset = NULL; PooledMSA *pmsa = NULL; msa_format_type msa_format = UNKNOWN_FORMAT; Vector *backgd_mnmod = NULL; Hashtable *hash=NULL; String *output_prefix = str_new_charstr("phastm."); double *has_motif = NULL; double prior = PRIOR; char c; GFF_Set *bedfeats = NULL; while ((c = getopt(argc, argv, "t:i:b:sk:md:pn:I:R:P:w:c:SB:o:HDxh")) != -1) { switch (c) { case 't': tree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("ERROR: bad input format.\n"); break; case 'b': backgd_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1); break; case 's': break; case 'k': size = get_arg_int(optarg); break; case 'm': meme_mode = 1; break; case 'd': pos_examples = get_arg_list(optarg); break; case 'p': profile_mode = 1; break; case 'n': nrestarts = get_arg_int(optarg); break; case 'I': init_list = get_arg_list(optarg); break; case 'P': tmpl = str_list_as_int(get_arg_list(optarg)); if (lst_size(tmpl) != 2) die("ERROR: bad argument to -P.\n"); nmostprevalent = lst_get_int(tmpl, 0); tuple_size = lst_get_int(tmpl, 1); if (!(nmostprevalent > 0 && tuple_size > 0)) die("ERROR: bad argument nmostprevalent=%i tuple_size=%i\n", nmostprevalent, tuple_size); lst_free(tmpl); break; case 'R': tmpl = str_list_as_int(get_arg_list(optarg)); if (lst_size(tmpl) != 2) die("ERROR: bad argument to -R.\n"); nsamples = lst_get_int(tmpl, 0); tuple_size = lst_get_int(tmpl, 1); if (!(nsamples > 0 && tuple_size > 0)) die("ERROR nsamples=%i tuple_sizse=%i\n", 
nsamples, tuple_size); lst_free(tmpl); break; case 'c': npseudocounts = get_arg_int(optarg); break; case 'w': nbest = get_arg_int(optarg); break; case 'S': sample_parms = 1; break; case 'B': nmotifs = get_arg_int(optarg); break; case 'o': str_free(output_prefix); output_prefix = str_new_charstr(optarg); str_append_char(output_prefix, '.'); break; case 'H': do_html = 1; break; case 'D': do_bed = 1; break; case 'x': suppress_stdout = 1; break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind != argc - 1) die("ERROR: List of alignment files required. Try '%s -h'.\n", argv[0]); if ((nsamples > 0 && nmostprevalent > 0) || (nsamples > 0 && init_list != NULL) || (nmostprevalent > 0 && init_list != NULL)) die("ERROR: -I, -P, and -R are mutually exclusive."); set_seed(-1); msa_name_list = get_arg_list(argv[optind]); if (backgd_mod != NULL && tree == NULL) tree = backgd_mod->tree; if (tree == NULL && !meme_mode && !profile_mode) die("ERROR: Must specify -t, -m, or -p.\n"); if ((init_list != NULL || nsamples > 0 || nmostprevalent > 0) && !sample_parms) nrestarts = 1; if (pos_examples != NULL) { hash = hsh_new(lst_size(pos_examples)); for (i = 0; i < lst_size(pos_examples); i++) hsh_put_int(hash, ((String*)lst_get_ptr(pos_examples, i))->chars, 1); has_motif = smalloc(lst_size(msa_name_list) * sizeof(double)); } /* open all MSAs */ msas = lst_new_ptr(lst_size(msa_name_list)); fprintf(stderr, "Reading alignment(s) ...\n"); for (i = 0, j = 0; i < lst_size(msa_name_list); i++) { String *name = lst_get_ptr(msa_name_list, i); FILE *mfile = phast_fopen(name->chars, "r"); msa_format_type temp_format; MSA *msa; if (msa_format == UNKNOWN_FORMAT) temp_format = msa_format_for_content(mfile, 1); else temp_format = msa_format; msa = msa_new_from_file_define_format(mfile, temp_format, NULL); phast_fclose(mfile); if (nseqs == -1) nseqs = msa->nseqs; if (!meme_mode && (msa->length - msa_num_gapped_cols(msa, STRIP_ANY_GAPS, -1, -1) < 300 || 
msa->nseqs != nseqs)) { fprintf(stderr, "WARNING: ignoring alignment '%s' -- too few informative sites.\n", name->chars); msa_free(msa); continue; } if (msa_alph_has_lowercase(msa)) msa_toupper(msa); msa_remove_N_from_alph(msa); /* Ns can be a problem */ lst_push_ptr(msas, msa); if (has_motif != NULL) { int k, hm = (hsh_get_int(hash, name->chars) == 1); if (meme_mode) { /* here need to record at individ seq level */ has_motif = srealloc(has_motif, (j + msa->nseqs + 1) * sizeof(double)); /* FIXME */ for (k = 0; k < msa->nseqs; k++) has_motif[j++] = hm; } else has_motif[j++] = hm; } } if (!meme_mode) { fprintf(stderr, "Extracting and pooling sufficient statistics ...\n"); pmsa = ss_pooled_from_msas(msas, 1, size, NULL, 0); msa_remove_N_from_alph(pmsa->pooled_msa); } /* obtain individual sequences, if necessary */ if (nmostprevalent > 0 || nsamples > 0 || meme_mode) { if (meme_mode) fprintf(stderr, "Converting to individual sequences ...\n"); else fprintf(stderr, "Obtaining reference sequences for pre-processing ...\n"); seqset = mtf_get_seqset(msas, meme_mode ? -1 : 1, 10 * size); /* for now, assume 1st seq is reference */ msa_remove_N_from_alph(seqset->set); } if (nmostprevalent > 0) { fprintf(stderr, "Obtaining %d most prevalent %d-tuples ...\n", nmostprevalent, tuple_size); init_list = lst_new_ptr(nmostprevalent); mtf_get_common_ntuples(seqset, init_list, tuple_size, nmostprevalent); } else if (nsamples > 0) { fprintf(stderr, "Sampling %d %d-tuples ...\n", nsamples, tuple_size); init_list = lst_new_ptr(nsamples); mtf_sample_ntuples(seqset, init_list, tuple_size, nsamples); } /* in meme_mode, backgd model can be specified as eq freqs in a .mod file */ if (meme_mode && backgd_mod != NULL && has_motif == NULL) backgd_mnmod = backgd_mod->backgd_freqs; /* estimate background model, if necessary */ else if (backgd_mod == NULL && (!meme_mode || has_motif == NULL)) { fprintf(stderr, "Fitting background model%s ...\n", has_motif == NULL ? 
"" : " (for use in initialization)"); /* if discriminative, be clear backgd isn't really part of the estimation procedure */ if (meme_mode) { backgd_mnmod = vec_new(strlen(seqset->set->alphabet)); mtf_estim_backgd_mn(seqset, backgd_mnmod); } else { backgd_mod = tm_new(tr_create_copy(tree), NULL, NULL, F81, pmsa->pooled_msa->alphabet, 1, 0, NULL, -1); tm_fit(backgd_mod, pmsa->pooled_msa, tm_params_init(backgd_mod, .1, 5, 0), -1, OPT_MED_PREC, NULL, 0, NULL); } } /* select subset of init strings, if necessary */ if (nbest > 0 && init_list != NULL) { fprintf(stderr, "Winnowing candidate start strings ...\n"); tmpl = lst_new_ptr(nbest); mtf_winnow_starts(meme_mode ? (void*)seqset : (void*)pmsa, init_list, nbest, tmpl, !meme_mode, size, tree, meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, has_motif); lst_free(init_list); init_list = tmpl; } /* Now find motifs */ motifs = mtf_find(meme_mode ? (void*)seqset : (void*)pmsa, !meme_mode, size, nmotifs, tree, meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, has_motif, prior, nrestarts, init_list, sample_parms, npseudocounts); fprintf(stderr, "\n\n"); if (do_bed) bedfeats = gff_new_set_init("phast_motif", "0.1b"); /* generate output */ for (i = 0; i < lst_size(motifs); i++) { Motif *m = lst_get_ptr(motifs, i); if (!suppress_stdout) { if (lst_size(motifs) > 1) printf("\n**********\nMOTIF #%d\n**********\n\n", i+1); mtf_print(stdout, m); } if (do_html) { String *fname = str_dup(output_prefix); str_append_int(fname, i+1); str_append_charstr(fname, ".html"); mtf_print_html(phast_fopen(fname->chars, "w+"), m); str_free(fname); } if (do_bed) mtf_add_features(m, bedfeats); } if (do_html) { String *fname = str_dup(output_prefix); str_append_charstr(fname, "index.html"); mtf_print_summary_html(phast_fopen(fname->chars, "w+"), motifs, output_prefix); str_free(fname); } if (do_bed) { String *fname = str_dup(output_prefix); str_append_charstr(fname, "bed"); gff_print_bed(phast_fopen(fname->chars, "w+"), bedfeats, FALSE); 
str_free(fname); } return 0; }