int main(int argc, char** argv) { // beginning of the verbose OpenCL allocation cl_platform_id platform_id = NULL; cl_uint ret_num_platforms = 0; cl_uint ret_num_devices = 0; cl_int ret = 0; // the output from opencl kernel float *c_inputs = malloc(ARRAY_SIZE*sizeof(float)); float *c_outputs = malloc(ARRAY_SIZE*sizeof(float)); cl_float *cl_inputs = malloc(ARRAY_SIZE*sizeof(cl_float)); cl_float *cl_outputs = malloc(ARRAY_SIZE*sizeof(cl_float)); // get random numbers via Rmath set_seed(atoi(argv[1]), 197414); float tmp_in = 0.0; #pragma omp parallel for for (long i = 0; i < ARRAY_SIZE; i++) { tmp_in = rnorm(0, 1); c_inputs[i] = tmp_in; cl_inputs[i] = (cl_float) tmp_in; } // measure time elapse clock_t start = clock(); #pragma omp parallel for for (long i = 0; i < ARRAY_SIZE; i++) { c_outputs[i] = expf(c_inputs[i]); } printf("CPU time for %d exp operation: %d\n", ARRAY_SIZE, (int) (clock() - start)); // read kernel source FILE *fp; char filename[] = "./hello_log.cl"; char *source_str; size_t source_size; fp = fopen(filename, "r"); source_str = (char*) malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); // get platform and device info ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); cl_device_id device_ids[2]; ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 2, device_ids, &ret_num_devices); printf("Number of devices: %5d\n", ret_num_devices); // print device name char bdname[100]; clGetDeviceInfo(device_ids[1], CL_DEVICE_NAME, 100, bdname, NULL); printf("Used device: %s\n", bdname); // use second GPU cl_device_id device_id = device_ids[1]; // create opencl context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // create command queue cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_id, 0, &ret); // create memory buffer for input cl_mem memobj_in = clCreateBuffer(context, CL_MEM_READ_WRITE, ARRAY_SIZE*sizeof(cl_float), NULL, &ret); // create 
memory buffer for output cl_mem memobj_out = clCreateBuffer(context, CL_MEM_READ_WRITE, ARRAY_SIZE*sizeof(cl_float), NULL, &ret); // create kernel program cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); // build program ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); printf("build program successfully\n"); // create opencl kernel cl_kernel kernel = clCreateKernel(program, "hello_exp", &ret); // set opencl parameters for inputs ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj_in); // set opencl parameters for inputs ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj_out); // execute opencl kernel size_t global_item_size = ARRAY_SIZE/32; size_t local_item_size = 32; // measure time start = clock(); ret = clEnqueueWriteBuffer(command_queue, memobj_in, CL_TRUE, 0, ARRAY_SIZE*sizeof(cl_float), cl_inputs, 0, NULL, NULL); // run it ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); // copy results from the memory buffer ret = clEnqueueReadBuffer(command_queue, memobj_out, CL_TRUE, 0, ARRAY_SIZE*sizeof(cl_float), cl_outputs, 0, NULL, NULL); printf("GPU time (with PCI-E overhead): %d\n", (int) (clock() - start)); printf("inputs: %3.7f %3.7f\n", c_inputs[150000], cl_inputs[150000]); printf("outputs: %3.7f %3.7f\n", c_outputs[150000], (float) cl_outputs[150000]); // finalization ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(memobj_in); ret = clReleaseMemObject(memobj_out); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(source_str); return 0; }
int main(int argc, char *argv[]) { List *pruned_names = lst_new_ptr(5); TreeModel *source_mod; MSA *msa = NULL, *out_msa; IndelHistory *ih; char *read_hist_fname = NULL; char c; int opt_idx, old_nnodes, i; msa_format_type msa_format = UNKNOWN_FORMAT; int output_alignment = FALSE, ia_names = FALSE; struct option long_opts[] = { {"msa-format", 1, 0, 'i'}, {"output-alignment", 0, 0, 'A'}, {"read-history", 1, 0, 'H'}, {"ia-names", 0, 0, 'I'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "i:H:AIh", long_opts, &opt_idx)) != -1) { switch (c) { case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == -1) die("ERROR: unrecognized alignment format.\n"); break; case 'A': output_alignment = TRUE; break; case 'H': read_hist_fname = optarg; break; case 'I': ia_names = TRUE; break; case 'h': printf("%s", HELP); exit(0); case '?': die("Bad argument. Try 'indelHistory -h'.\n"); } } set_seed(-1); if (read_hist_fname != NULL) { fprintf(stderr, "Reading indel history from %s...\n", read_hist_fname); ih = ih_new_from_file(phast_fopen(read_hist_fname, "r")); } else { FILE *mfile; if (optind != argc - 2) die("Two arguments required. 
Try 'indelHistory -h'.\n"); fprintf(stderr, "Reading alignment from %s...\n", argv[optind]); mfile = phast_fopen(argv[optind], "r"); if (msa_format == UNKNOWN_FORMAT) msa_format = msa_format_for_content(mfile, 1); msa = msa_new_from_file_define_format(mfile, msa_format, "ACGTNB^.-"); phast_fclose(mfile); if (msa->seqs == NULL && (msa->ss == NULL || msa->ss->tuple_idx == NULL)) die("ERROR: ordered representation of alignment required.\n"); fprintf(stderr, "Reading tree from %s...\n", argv[optind+1]); source_mod = tm_new_from_file(phast_fopen(argv[optind+1], "r"), 1); /* prune tree, if necessary */ old_nnodes = source_mod->tree->nnodes; tm_prune(source_mod, msa, pruned_names); if (lst_size(pruned_names) == (old_nnodes + 1) / 2) die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n"); if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment ("); for (i = 0; i < lst_size(pruned_names); i++) fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars, i < lst_size(pruned_names) - 1 ? ", " : ").\n"); } lst_free(pruned_names); tr_name_ancestors(source_mod->tree); if (msa->nseqs > (source_mod->tree->nnodes + 1) / 2) { /* assume ancestral seqs specified in this case */ if (ia_names) { fprintf(stderr, "Converting sequence names...\n"); ih_convert_ia_names(msa, source_mod->tree); } fprintf(stderr, "Extracting indel history from alignment...\n"); ih = ih_extract_from_alignment(msa, source_mod->tree); } else { /* infer by parsimony */ if (msa->ss == NULL) { fprintf(stderr, "Extracting sufficient statistics...\n"); ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0); } fprintf(stderr, "Inferring indel history by parsimony...\n"); ih = ih_reconstruct(msa, source_mod->tree); } } if (output_alignment) { out_msa = ih_as_alignment(ih, msa); msa_print(stdout, out_msa, FASTA, FALSE); } else ih_print(ih, stdout, read_hist_fname != NULL ? 
read_hist_fname : argv[optind], "indelHistory"); fprintf(stderr, "Done.\n"); return 0; }
int main(int argc, char *argv[]) { char c; char *msa_fname = NULL; int opt_idx, i, old_nnodes; MSA *msa; List *pruned_names = lst_new_ptr(5), *tmpl; BDPhyloHmm *bdphmm; GFF_Set *predictions; int found = FALSE; List *ignore_types = lst_new_ptr(1); struct option long_opts[] = { {"refseq", 1, 0, 'M'}, {"msa-format", 1, 0, 'i'}, {"refidx", 1, 0, 'r'}, {"rho", 1, 0, 'R'}, {"phi", 1, 0, 'p'}, {"transitions", 1, 0, 't'}, {"expected-length", 1, 0, 'E'}, {"target-coverage", 1, 0, 'C'}, {"seqname", 1, 0, 'N'}, {"idpref", 1, 0, 'P'}, {"indel-model", 1, 0, 'I'}, {"indel-history", 1, 0, 'H'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; /* arguments and defaults for options */ FILE *refseq_f = NULL, *msa_f = NULL; msa_format_type msa_format = UNKNOWN_FORMAT; TreeModel *source_mod; double rho = DEFAULT_RHO, mu = DEFAULT_MU, nu = DEFAULT_NU, phi = DEFAULT_PHI, gamma = -1, omega = -1, alpha_c = -1, beta_c = -1, tau_c = -1, alpha_n = -1, beta_n = -1, tau_n = -1; int set_transitions = FALSE, refidx = 1, estim_phi = TRUE, estim_gamma = TRUE, estim_omega = TRUE; char *seqname = NULL, *idpref = NULL; IndelHistory *ih = NULL; while ((c = getopt_long(argc, argv, "R:t:p:E:C:r:M:i:N:P:I:H:h", long_opts, &opt_idx)) != -1) { switch (c) { case 'R': rho = get_arg_dbl_bounds(optarg, 0, 1); break; case 't': if (optarg[0] != '~') estim_gamma = estim_omega = FALSE; else optarg = &optarg[1]; set_transitions = TRUE; tmpl = get_arg_list_dbl(optarg); if (lst_size(tmpl) != 2) die("ERROR: bad argument to --transitions.\n"); mu = lst_get_dbl(tmpl, 0); nu = lst_get_dbl(tmpl, 1); if (mu <= 0 || mu >= 1 || nu <= 0 || nu >= 1) die("ERROR: bad argument to --transitions.\n"); lst_free(tmpl); break; case 'p': if (optarg[0] != '~') estim_phi = FALSE; else optarg = &optarg[1]; phi = get_arg_dbl_bounds(optarg, 0, 1); break; case 'E': if (optarg[0] != '~') estim_omega = FALSE; else optarg = &optarg[1]; omega = get_arg_dbl_bounds(optarg, 1, INFTY); mu = 1/omega; break; case 'C': if (optarg[0] != '~') estim_gamma = FALSE; 
else optarg = &optarg[1]; gamma = get_arg_dbl_bounds(optarg, 0, 1); break; case 'r': refidx = get_arg_int_bounds(optarg, 0, INFTY); break; case 'M': refseq_f = phast_fopen(optarg, "r"); break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("ERROR: unrecognized alignment format.\n"); break; case 'N': seqname = optarg; break; case 'P': idpref = optarg; break; case 'I': tmpl = get_arg_list_dbl(optarg); if (lst_size(tmpl) != 3 && lst_size(tmpl) != 6) die("ERROR: bad argument to --indel-model.\n"); alpha_n = lst_get_dbl(tmpl, 0); beta_n = lst_get_dbl(tmpl, 1); tau_n = lst_get_dbl(tmpl, 2); if (lst_size(tmpl) == 6) { alpha_c = lst_get_dbl(tmpl, 3); beta_c = lst_get_dbl(tmpl, 4); tau_c = lst_get_dbl(tmpl, 5); } else { alpha_c = alpha_n; beta_c = beta_n; tau_c = tau_n; } if (alpha_c <= 0 || alpha_c >= 1 || beta_c <= 0 || beta_c >= 1 || tau_c <= 0 || tau_c >= 1 || alpha_n <= 0 || alpha_n >= 1 || beta_n <= 0 || beta_n >= 1 || tau_n <= 0 || tau_n >= 1) die("ERROR: bad argument to --indel-model.\n"); break; case 'H': fprintf(stderr, "Reading indel history from %s...\n", optarg); ih = ih_new_from_file(phast_fopen(optarg, "r")); break; case 'h': printf("%s", HELP); exit(0); case '?': die("Bad argument. Try 'dless -h'.\n"); } } if (optind != argc - 1) die("Missing alignment file or model file. 
Try 'dless -h'.\n"); if (set_transitions && (gamma != -1 || omega != -1)) die("ERROR: --transitions and --target-coverage/--expected-length cannot be used together.\n"); if ((gamma != -1 && omega == -1) || (gamma == -1 && omega != -1)) die("ERROR: --target-coverage and --expecteed-length must be used together.\n"); set_seed(-1); if (gamma != -1) nu = gamma/(1-gamma) * mu; fprintf(stderr, "Reading tree model from %s...\n", argv[optind]); source_mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1); if (source_mod->nratecats > 1) die("ERROR: rate variation not currently supported.\n"); if (source_mod->order > 0) die("ERROR: only single nucleotide models are currently supported.\n"); if (!tm_is_reversible(source_mod)) phast_warning("WARNING: p-value computation assumes reversibility and your model is non-reversible.\n"); /* read alignment */ msa_f = phast_fopen(argv[optind], "r"); fprintf(stderr, "Reading alignment from %s...\n", argv[optind]); if (msa_format == UNKNOWN_FORMAT) msa_format = msa_format_for_content(msa_f, 1); if (msa_format == MAF) { msa = maf_read(msa_f, refseq_f, 1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); } else msa = msa_new_from_file_define_format(msa_f, msa_format, NULL); if (msa_alph_has_lowercase(msa)) msa_toupper(msa); msa_remove_N_from_alph(msa); if (msa->ss == NULL) { fprintf(stderr, "Extracting sufficient statistics...\n"); ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0); } else if (msa->ss->tuple_idx == NULL) die("ERROR: ordered representation of alignment required unless --suff-stats.\n"); /* prune tree, if necessary */ old_nnodes = source_mod->tree->nnodes; tm_prune(source_mod, msa, pruned_names); if (lst_size(pruned_names) == (old_nnodes + 1) / 2) die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n"); if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment ("); for (i = 0; i < lst_size(pruned_names); i++) 
fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars, i < lst_size(pruned_names) - 1 ? ", " : ").\n"); } /* this has to be done after pruning tree */ tr_name_ancestors(source_mod->tree); /* also make sure match for reference sequence in tree */ if (refidx > 0) { for (i = 0, found = FALSE; !found && i < source_mod->tree->nnodes; i++) { TreeNode *n = lst_get_ptr(source_mod->tree->nodes, i); if (!strcmp(n->name, msa->names[refidx-1])) found = TRUE; } if (!found) die("ERROR: no match for reference sequence in tree.\n"); } /* checks for indel model */ if (alpha_c > 0) { if (ih == NULL) { fprintf(stderr, "Reconstructing indel history by parsimony...\n"); ih = ih_reconstruct(msa, source_mod->tree); } else { if (ih->ncols != msa->length) die("ERROR: indel history doesn't seem to match alignment.\n"); if (ih->tree->nnodes != source_mod->tree->nnodes) die("ERROR: indel history doesn't seem to match tree model.\n"); } } bdphmm = bd_new(source_mod, rho, mu, nu, phi, alpha_c, beta_c, tau_c, alpha_n, beta_n, tau_n, estim_gamma, estim_omega, estim_phi); /* compute emissions */ phmm_compute_emissions(bdphmm->phmm, msa, FALSE); /* add emissions for indel model, if necessary */ if (alpha_c > 0) { fprintf(stderr, "Adjusting emissions for indels...\n"); bd_add_indel_emissions(bdphmm, ih); } /* postprocess for missing data (requires special handling) */ fprintf(stderr, "Adjusting emissions for missing data...\n"); bd_handle_missing_data(bdphmm, msa); if (estim_gamma || estim_omega || estim_phi) { fprintf(stderr, "Estimating free parameters...\n"); bd_estimate_transitions(bdphmm, msa); } /* set seqname and idpref, if necessary */ if (seqname == NULL || idpref == NULL) { /* derive default from file name root */ String *tmp = str_new_charstr(msa_fname); if (!str_equals_charstr(tmp, "-")) { str_remove_path(tmp); str_root(tmp, '.'); if (idpref == NULL) idpref = copy_charstr(tmp->chars); str_root(tmp, '.'); /* apply one more time for double suffix */ if (seqname == NULL) 
seqname = tmp->chars; } else if (seqname == NULL) seqname = "refseq"; } /* obtain predictions */ fprintf(stderr, "Running Viterbi algorithm...\n"); predictions = phmm_predict_viterbi(bdphmm->phmm, seqname, NULL, idpref, NULL); lst_push_ptr(ignore_types, str_new_charstr("nonconserved")); gff_filter_by_type(predictions, ignore_types, TRUE, NULL); /* score predictions */ fprintf(stderr, "Scoring predictions...\n"); bd_score_predictions(bdphmm, predictions); /* can free emissions now */ for (i = 0; i < bdphmm->phmm->hmm->nstates; i++) sfree(bdphmm->phmm->emissions[i]); sfree(bdphmm->phmm->emissions); bdphmm->phmm->emissions = NULL; /* convert GFF to coord frame of reference sequence and adjust coords by idx_offset, if necessary */ if (refidx != 0 || msa->idx_offset != 0) msa_map_gff_coords(msa, predictions, 0, refidx, msa->idx_offset); if (refidx != 0) gff_flatten(predictions); /* necessary because coord conversion might create overlapping features (can happen in deletions in reference sequence) */ /* now output predictions */ fprintf(stderr, "Writing GFF to stdout...\n"); gff_print_set(stdout, predictions); fprintf(stderr, "Done.\n"); return 0; }
int main(int argc, char *argv[]) { char c; List *l; int i, j, strand, bed_output = 0, backgd_nmods = -1, feat_nmods = -1, winsize = -1, verbose = 0, max_nmods, memblocksize, old_nleaves, refidx = 1, base_by_base = FALSE, windowWig = FALSE; TreeModel **backgd_mods = NULL, **feat_mods = NULL; HMM *backgd_hmm = NULL, *feat_hmm = NULL; msa_format_type inform = UNKNOWN_FORMAT; GFF_Set *features = NULL; MSA *msa, *msa_compl=NULL; double **backgd_emissions, **feat_emissions, **mem, **dummy_emissions, *winscore_pos=NULL, *winscore_neg=NULL; int *no_alignment=NULL; List *pruned_names; char *msa_fname; FILE *infile; int opt_idx; struct option long_opts[] = { {"background-mods", 1, 0, 'b'}, {"background-hmm", 1, 0, 'B'}, {"feature-mods", 1, 0, 'f'}, {"feature-hmm", 1, 0, 'F'}, {"features", 1, 0, 'g'}, {"window", 1, 0, 'w'}, {"window-wig", 1, 0, 'W'}, {"base-by-base", 0, 0, 'y'}, {"msa-format", 1, 0, 'i'}, {"refidx", 1, 0, 'r'}, {"output-bed", 0, 0, 'd'}, {"verbose", 0, 0, 'v'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "B:b:F:f:r:g:w:W:i:ydvh", long_opts, &opt_idx)) != -1) { switch (c) { case 'B': backgd_hmm = hmm_new_from_file(phast_fopen(optarg, "r")); break; case 'b': l = get_arg_list(optarg); backgd_nmods = lst_size(l); backgd_mods = smalloc(backgd_nmods * sizeof(void*)); for (i = 0; i < backgd_nmods; i++) backgd_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1); lst_free_strings(l); lst_free(l); break; case 'F': feat_hmm = hmm_new_from_file(phast_fopen(optarg, "r")); break; case 'f': l = get_arg_list(optarg); feat_nmods = lst_size(l); feat_mods = smalloc(feat_nmods * sizeof(void*)); for (i = 0; i < feat_nmods; i++) feat_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1); lst_free_strings(l); lst_free(l); break; case 'g': features = gff_read_set(phast_fopen(optarg, "r")); break; case 'w': winsize = get_arg_int(optarg); if (winsize <= 0) die("ERROR: window size must be 
positive.\n"); break; case 'W': winsize = get_arg_int(optarg); if (winsize <= 0) die("ERROR: window size must be positive.\n"); windowWig = TRUE; break; case 'y': base_by_base = TRUE; break; case 'i': inform = msa_str_to_format(optarg); if (inform == UNKNOWN_FORMAT) die("Bad argument to -i.\n"); break; case 'r': refidx = get_arg_int_bounds(optarg, 0, INFTY); break; case 'd': bed_output = 1; break; case 'h': printf("%s", HELP); exit(0); case 'v': verbose = 1; break; case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } set_seed(-1); if (backgd_mods == NULL || feat_mods == NULL) die("ERROR: -b and -f required. Try '%s -h'.\n", argv[0]); if (backgd_nmods == 1 && backgd_hmm == NULL) backgd_hmm = hmm_create_trivial(); else if (backgd_hmm == NULL) die("ERROR: -B required. Try '%s -h'.\n", argv[0]); if (feat_nmods == 1 && feat_hmm == NULL) feat_hmm = hmm_create_trivial(); else if (feat_hmm == NULL) die("ERROR: -F required. Try '%s -h'.\n", argv[0]); if ((winsize == -1 && features == NULL && !base_by_base) || (winsize != -1 && features != NULL) || (winsize != -1 && base_by_base) || (features != NULL && base_by_base)) die("ERROR: must specify exactly one of -g, -w, and -y. Try '%s -h'.\n", argv[0]); if (backgd_hmm->nstates != backgd_nmods) die("ERROR: number of states must equal number of tree models for background.\n"); if (feat_hmm->nstates != feat_nmods) die("ERROR: number of states must equal number of tree models for features.\n"); if (features != NULL && lst_size(features->features) == 0) die("ERROR: empty features file.\n"); if (base_by_base && (backgd_nmods > 1 || feat_nmods > 1)) die("ERROR: only single phylogenetic models (not HMMs) are supported with --base-by-base.\n"); if (optind != argc - 1) die("ERROR: too few arguments. 
Try '%s -h'.\n", argv[0]); if (verbose) fprintf(stderr, "Reading alignment ...\n"); msa_fname = argv[optind]; infile = phast_fopen(msa_fname, "r"); if (inform == UNKNOWN_FORMAT) inform = msa_format_for_content(infile, 1); if (inform == MAF) msa = maf_read(infile, NULL, 1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); else msa = msa_new_from_file_define_format(infile, inform, NULL); if (msa_alph_has_lowercase(msa)) msa_toupper(msa); msa_remove_N_from_alph(msa); /* need ordered representation of alignment */ if (msa->seqs == NULL && (msa->ss == NULL || msa->ss->tuple_idx == NULL) ) die("ERROR: ordered sufficient statistics are required.\n"); pruned_names = lst_new_ptr(msa->nseqs); for (i = 0; i < backgd_nmods; i++) { old_nleaves = (backgd_mods[i]->tree->nnodes + 1) / 2; tm_prune(backgd_mods[i], msa, pruned_names); if (lst_size(pruned_names) >= old_nleaves) die("ERROR: no match for leaves of tree in alignment (background model #%d)\n", i+1); else if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves in background model (#%d) with no match in alignment (", i+1); for (j = 0; j < lst_size(pruned_names); j++) fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars, j < lst_size(pruned_names) - 1 ? ", " : ").\n"); } lst_free_strings(pruned_names); } for (i = 0; i < feat_nmods; i++) { old_nleaves = (feat_mods[i]->tree->nnodes + 1) / 2; tm_prune(feat_mods[i], msa, pruned_names); if (lst_size(pruned_names) >= old_nleaves) die("ERROR: no match for leaves of tree in alignment (features model #%d)\n", i+1); else if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves in features model (#%d) with no match in alignment (", i+1); for (j = 0; j < lst_size(pruned_names); j++) fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars, j < lst_size(pruned_names) - 1 ? 
", " : ").\n"); } lst_free_strings(pruned_names); } lst_free(pruned_names); /* first have to subtract offset from features, if necessary */ if (msa->idx_offset != 0 && features != NULL) { for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); f->start -= msa->idx_offset; f->end -= msa->idx_offset; } } /* convert to coord frame of alignment */ if (features != NULL && refidx != 0) { if (verbose) fprintf(stderr, "Mapping coordinates ...\n"); msa_map_gff_coords(msa, features, refidx, 0, 0); if (lst_size(features->features) == 0) die("ERROR: no features within coordinate range of alignment.\n"); } /* Make a reverse complemented copy of the alignment. The two strands will be processed separately, to avoid problems with overlapping features, etc. */ if (!base_by_base) { /* skip in base by base case */ if (verbose) fprintf(stderr, "Creating reverse complemented alignment ...\n"); msa_compl = msa_create_copy(msa, 0); /* temporary workaround: make sure reverse complement not based on sufficient stats */ if (msa_compl->seqs == NULL) ss_to_msa(msa_compl); if (msa_compl->ss != NULL) { ss_free(msa_compl->ss); msa_compl->ss = NULL; } msa_reverse_compl(msa_compl); } /* allocate memory for computing scores */ backgd_emissions = smalloc(backgd_nmods * sizeof(void*)); for (i = 0; i < backgd_nmods; i++) backgd_emissions[i] = smalloc(msa->length * sizeof(double)); feat_emissions = smalloc(feat_nmods * sizeof(void*)); for (i = 0; i < feat_nmods; i++) feat_emissions[i] = smalloc(msa->length * sizeof(double)); max_nmods = max(backgd_nmods, feat_nmods); dummy_emissions = smalloc(max_nmods * sizeof(void*)); mem = smalloc(max_nmods * sizeof(void*)); /* memory for forward algorithm -- each block must be as large as the largest feature */ if (features != NULL) { for (i = 0, memblocksize = -1; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); if (f->end - f->start + 1 > memblocksize) memblocksize = 
f->end - f->start + 1; } } else memblocksize = winsize; /* -1 if base-by-base mode */ if (memblocksize > 0) for (i = 0; i < max_nmods; i++) mem[i] = smalloc(memblocksize * sizeof(double)); if (winsize != -1) { winscore_pos = smalloc(msa->length * sizeof(double)); winscore_neg = smalloc(msa->length * sizeof(double)); no_alignment = smalloc(msa->length * sizeof(int)); for (i = 0; i < msa->length; i++) { winscore_pos[i] = winscore_neg[i] = NEGINFTY; if (refidx == 0) no_alignment[i] = FALSE; else no_alignment[i] = msa_missing_col(msa, refidx, i); } } /* the rest will be repeated for each strand */ for (strand = 1; strand <= 2; strand++) { MSA *thismsa = strand == 1 ? msa : msa_compl; double *winscore = strand == 1 ? winscore_pos : winscore_neg; if (base_by_base && strand == 2) break; /* don't do second pass in base_by_base case */ if (verbose) fprintf(stderr, "Processing %c strand ...\n", strand == 1 ? '+' : '-'); /* set up dummy categories array, so that emissions are only computed where needed */ thismsa->categories = smalloc(thismsa->length * sizeof(int)); thismsa->ncats = 1; if (winsize != -1) { if (strand == 1) for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = no_alignment[i] ? 0 : 1; else for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = no_alignment[thismsa->length - i - 1] ? 
0 : 1; } else if (features != NULL) { for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 0; for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); if (f->start <= 0 || f->end <= 0) { fprintf(stderr, "WARNING: feature out of range ('"); gff_print_feat(stderr, f); fprintf(stderr, "')\n"); continue; } if (strand == 1 && f->strand != '-') for (j = f->start - 1; j < f->end; j++) thismsa->categories[j] = 1; else if (strand == 2 && f->strand == '-') for (j = thismsa->length - f->end; j < thismsa->length - f->start + 1; j++) thismsa->categories[j] = 1; } } else { /* base-by-base scores */ for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 1; } if (thismsa->ss != NULL) ss_update_categories(thismsa); /* compute emissions */ for (i = 0; i < backgd_nmods; i++) { if (verbose) fprintf(stderr, "Computing emissions for background model #%d ...\n", i+1); tl_compute_log_likelihood(backgd_mods[i], thismsa, backgd_emissions[i], NULL, 1, NULL); } for (i = 0; i < feat_nmods; i++) { if (verbose) fprintf(stderr, "Computing emissions for features model #%d ...\n", i+1); tl_compute_log_likelihood(feat_mods[i], thismsa, feat_emissions[i], NULL, 1, NULL); } /* now compute scores */ if (winsize != -1) { /* windows case */ int winstart; if (verbose) fprintf(stderr, "Computing scores ...\n"); for (winstart = 0; winstart <= thismsa->length - winsize; winstart++) { int centeridx = winstart + winsize/2; if (strand == 2) centeridx = thismsa->length - centeridx - 1; if (no_alignment[centeridx]) continue; for (j = 0; j < feat_nmods; j++) dummy_emissions[j] = &(feat_emissions[j][winstart]); winscore[centeridx] = hmm_forward(feat_hmm, dummy_emissions, winsize, mem); if (winscore[centeridx] <= NEGINFTY) { winscore[centeridx] = NEGINFTY; continue; } for (j = 0; j < backgd_nmods; j++) dummy_emissions[j] = &(backgd_emissions[j][winstart]); winscore[centeridx] -= hmm_forward(backgd_hmm, dummy_emissions, winsize, mem); if 
(winscore[centeridx] < NEGINFTY) winscore[centeridx] = NEGINFTY; } } else if (features != NULL) { /* features case */ if (verbose) fprintf(stderr, "Computing scores ...\n"); for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); int s, e; if ((strand == 1 && f->strand == '-') || (strand == 2 && f->strand != '-') || f->start <= 0 || f->end <= 0 || f->end - f->start < 0) continue; /* effective coords */ if (f->strand == '-') { s = thismsa->length - f->end + 1; e = thismsa->length - f->start + 1; } else { s = f->start; e = f->end; } f->score_is_null = 0; for (j = 0; j < feat_nmods; j++) dummy_emissions[j] = &(feat_emissions[j][s-1]); f->score = hmm_forward(feat_hmm, dummy_emissions, e - s + 1, mem); if (f->score <= NEGINFTY) { f->score = NEGINFTY; continue; } for (j = 0; j < backgd_nmods; j++) dummy_emissions[j] = &(backgd_emissions[j][s-1]); f->score -= hmm_forward(backgd_hmm, dummy_emissions, e - s + 1, mem); if (f->score < NEGINFTY) f->score = NEGINFTY; } } } if (verbose) fprintf(stderr, "Generating output ...\n"); if (winsize != -1 && windowWig == FALSE) { /* standard windows output */ for (i = 0, j = 0; i < msa->length; i++) { if (no_alignment[i] == FALSE) printf("%d\t%.3f\t%.3f\n", j + msa->idx_offset + 1, winscore_pos[i], winscore_neg[i]); if (ss_get_char_pos(msa, i, 0, 0) != GAP_CHAR) j++; } } else if (windowWig == TRUE) { /* windows with wig output */ int last = NEGINFTY; for (i = 0, j = 0; i < msa->length; i++) { if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) { if (no_alignment[i] == FALSE && winscore_pos[i] > NEGINFTY) { if (j > last + 1) printf("fixedStep chrom=%s start=%d step=1\n", refidx > 0 ? 
msa->names[refidx-1] : "alignment", j + msa->idx_offset + 1); printf("%.3f\n", winscore_pos[i]); last = j; } j++; } } } else if (features != NULL) { /* features output */ /* return to coord frame of reference seq (also, replace offset) */ if (refidx != 0) msa_map_gff_coords(msa, features, 0, refidx, msa->idx_offset); else if (msa->idx_offset != 0) { for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); f->start += msa->idx_offset; f->end += msa->idx_offset; } } if (bed_output) gff_print_bed(stdout, features, FALSE); else gff_print_set(stdout, features); } else { /* base-by-base scores */ /* in this case, we can just output the difference between the emissions */ printf("fixedStep chrom=%s start=%d step=1\n", refidx > 0 ? msa->names[refidx-1] : "alignment", msa->idx_offset + 1); for (i = 0, j = 0; i < msa->length; i++) { if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) { printf("%.3f\n", feat_emissions[0][i] - backgd_emissions[0][i]); j++; } } } if (verbose) fprintf(stderr, "\nDone.\n"); return 0; }
void magic_init(unsigned long random_seed) { /* { Initialize all potions, wands, staves, scrolls, etc... }*/ integer i1,tmpv; vtype tmps; set_seed(random_seed); randes(); for (i1 = 1; i1 <= MAX_OBJECTS; i1++) { /* * The arrays of the object materals all start at 0. * Object subvals start at 1. When doing the lookup * subtract one from subval! */ tmpv = (0xFF & object_list[i1].subval); switch (object_list[i1].tval) { case potion1: case potion2: if (tmpv <= MAX_COLORS) { insert_str(object_list[i1].name,"%C",colors[tmpv-1]); } break; case scroll1: case scroll2: rantitle(tmps); insert_str(object_list[i1].name,"%T",tmps); break; case ring: if (tmpv <= MAX_ROCKS) { insert_str(object_list[i1].name,"%R",rocks[tmpv-1]); } break; case valuable_gems: if (tmpv <= MAX_ROCKS) { insert_str(object_list[i1].name,"%R",rocks[tmpv-1]); } break; case valuable_gems_wear: if (tmpv <= MAX_ROCKS) { insert_str(object_list[i1].name,"%R",rocks[tmpv-1]); } break; case amulet: if (tmpv <= MAX_AMULETS) { insert_str(object_list[i1].name,"%A",amulets[tmpv-1]); } break; case wand: if (tmpv <= MAX_METALS) { insert_str(object_list[i1].name,"%M",metals[tmpv-1]); } break; case chime: if (tmpv <= MAX_METALS) { insert_str(object_list[i1].name,"%M",metals[tmpv-1]); } break; case horn: if (tmpv <= MAX_HORNS) { insert_str(object_list[i1].name,"%H",horns[tmpv-1]); } break; case staff: if (tmpv <= MAX_WOODS) { insert_str(object_list[i1].name,"%W",woods[tmpv-1]); } break; case Food: if (tmpv <= MAX_MUSH) { insert_str(object_list[i1].name,"%M",mushrooms[tmpv-1]); } break; case rod : /* what happened to the rods? 
*/ /* if (tmpv <= MAX_RODS) { insert_str(object_list[i1].name,"%D",rods[tmpv-1]); } */ break; case bag_or_sack: if (tmpv <= MAX_CLOTHS) { insert_str(object_list[i1].name,"%N",cloths[tmpv-1]); } break; case misc_usable: if (tmpv <= MAX_ROCKS) { insert_str(object_list[i1].name,"%R",rocks[tmpv-1]); } if (tmpv <= MAX_WOODS) { insert_str(object_list[i1].name,"%W",woods[tmpv-1]); } if (tmpv <= MAX_METALS) { insert_str(object_list[i1].name,"%M",metals[tmpv-1]); } if (tmpv <= MAX_AMULETS) { insert_str(object_list[i1].name,"%A",amulets[tmpv-1]); } break; default: break; } /* end switch */ } /* end for */ #if DO_DEBUG && 0 for (i1 = 1; i1 <= MAX_OBJECTS; i1++) { fprintf(debug_file,": object_list[%ld] = %s\n", i1, object_list[i1].name); } fflush(debug_file); #endif };
int main(int argc, char *argv[]) { FILE *STATSF; char c; int opt_idx, i, max_nrows; String *line = str_new(STR_MED_LEN), *args = str_new(STR_MED_LEN); List *fields = lst_new_ptr(5), *vectors = lst_new_ptr(1000), *counts = lst_new_int(1000); int dim = -1; double error = -1; PbsCode *code; char comment[1000]; time_t t; int have_data = TRUE; /* argument variables and defaults */ int nrows = -1, nbytes = 1; training_mode mode = FULL; FILE *logf = NULL; struct option long_opts[] = { {"nrows", 1, 0, 'n'}, {"nbytes", 1, 0, 'b'}, {"no-greedy", 0, 0, 'G'}, {"no-train", 1, 0, 'x'}, {"log", 1, 0, 'l'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; set_seed(-1); /* first capture arg list for comment in output */ for (i = 1; i < argc; i++) { str_append_charstr(args, argv[i]); if (i < argc - 1) str_append_char(args, ' '); } while ((c = (char)getopt_long(argc, argv, "n:b:l:Gxh", long_opts, &opt_idx)) != -1) { switch (c) { case 'n': nrows = get_arg_int_bounds(optarg, 1, INFTY); break; case 'b': nbytes = get_arg_int_bounds(optarg, 1, MAX_NBYTES); break; case 'G': mode = NO_GREEDY; break; case 'x': mode = NO_TRAIN; dim = get_arg_int_bounds(optarg, 1, INFTY); break; case 'l': logf = phast_fopen(optarg, "w+"); break; case 'h': printf("%s", HELP); exit(0); case '?': die("Bad argument. Try 'pbsTrain -h'.\n"); } } if (mode == NO_TRAIN && optind == argc) have_data = FALSE; /* data optional */ if (have_data) { if (optind != argc - 1) die("ERROR: Bad arguments. 
Try 'pbsTrain -h'.\n"); STATSF = phast_fopen(argv[optind], "r"); /* read stats */ while (str_readline(line, STATSF) != EOF) { int count; double prob, norm_const; Vector *v; str_trim(line); if (line->length == 0 || line->chars[0] == '#') continue; str_split(line, NULL, fields); if (str_as_int(lst_get_ptr(fields, 0), &count) != 0) die("ERROR: Bad count in stats file ('%s')\n", lst_get_ptr(fields, 0)); lst_push_int(counts, count); if (dim == -1) dim = lst_size(fields) - 1; else if (dim != lst_size(fields) - 1) die("ERROR: Each probability vector must have the same dimension\n"); v = vec_new(dim); for (i = 0; i < dim; i++) { if (str_as_dbl(lst_get_ptr(fields, i+1), &prob) != 0 || prob < 0 || prob > 1) die("ERROR: Bad probability in stats file ('%s')\n", lst_get_ptr(fields, i+1)); vec_set(v, i, prob); } /* normalize to avoid problems from rounding errors */ norm_const = normalize_probs(v->data, dim); if (fabs(1-norm_const) > 1e-2) die("ERROR: Probabilities in stats file don't sum to one.\nOffending line: '%s'\n", line->chars); lst_push_ptr(vectors, v); lst_free_strings(fields); } } max_nrows = sxg_max_nrows(dim, ~(~0 << (8*nbytes))); if (nrows == -1) nrows = max_nrows; else if (nrows > max_nrows) die("ERROR: nrows exceeds maximum of %d for nbytes = %d and dimension = %d\n", max_nrows, nbytes, dim); code = pbs_new(dim, nrows, nbytes); if (mode != NO_TRAIN) error = pbs_estimate_from_data(code, vectors, counts, logf, mode); else if (have_data) { /* not training but need error */ int tot_count = 0; double this_error; error = 0; for (i = 0; i < lst_size(vectors); i++) { pbs_get_index(code, lst_get_ptr(vectors, i), &this_error); error += this_error * lst_get_int(counts, i); tot_count += lst_get_int(counts, i); } error /= tot_count; } /* generate comment */ t = time(NULL); sprintf(comment, "# Code generated by pbsTrain, with argument(s) \"%s\"\n\ # %s\n\ # Average training error = %f bits\n", args->chars, ctime(&t), error); pbs_write(code, stdout, comment); return 0; }
int main(int argc, char *argv[]) { /* variables for options, with defaults */ TreeNode *tree = NULL, *merge_tree = NULL, *extrapolate_tree = NULL; Hashtable *rename_hash = NULL; double scale_factor = 1; List *prune_names = NULL, *label = NULL, *labelType = NULL; int prune_all_but = FALSE, tree_only = FALSE, dissect = FALSE, name_ancestors = FALSE, with_branch = FALSE, print_branchlen=FALSE, inNewick=FALSE, no_branchlen = FALSE, print_distance_to_root = FALSE; TreeModel *mod = NULL, *merge_mod = NULL; char *reroot_name = NULL, *subtree_name =NULL, *get_subtree_name = NULL, *node_distance_name = NULL; /* other variables */ String *suffix, *optstr; char c; int i, opt_idx; TreeNode *n; struct option long_opts[] = { {"scale", 1, 0, 's'}, {"extrapolate", 1, 0, 'e'}, {"prune", 1, 0, 'p'}, {"prune-all-but", 1, 0, 'P'}, {"get-subtree", 1, 0, 'g'}, {"merge", 1, 0, 'm'}, {"rename", 1, 0, 'r'}, {"tree-only", 0, 0, 't'}, {"no-branchlen", 0, 0, 'N'}, {"dissect", 0, 0, 'd'}, {"name-ancestors", 0, 0, 'a'}, {"reroot", 1, 0, 'R'}, {"with-branch", 1, 0, 'B'}, {"subtree", 1, 0, 'S'}, {"branchlen", 0, 0, 'b'}, {"newick", 0, 0, 'n'}, {"label-subtree", 1, 0, 'L'}, {"label-branches", 1, 0, 'l'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "s:p:P:g:m:r:R:B:S:D:l:L:adtNbnh", long_opts, &opt_idx)) != -1) { switch (c) { case 's': scale_factor = get_arg_dbl_bounds(optarg, 0, INFTY); break; case 'e': if (!strcmp(optarg, "default")) { optarg = smalloc(1000 * sizeof(char)); #if defined(__MINGW32__) sprintf(optarg, "%s\\data\\exoniphy\\mammals\\cftr25_hybrid.nh", PHAST_HOME); #else sprintf(optarg, "%s/data/exoniphy/mammals/cftr25_hybrid.nh", PHAST_HOME); #endif } extrapolate_tree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'p': prune_names = get_arg_list(optarg); break; case 'P': prune_names = get_arg_list(optarg); prune_all_but = TRUE; break; case 'g': get_subtree_name = optarg; break; case 'm': suffix = str_new_charstr(optarg); str_suffix(suffix, '.'); 
if (str_equals_charstr(suffix, "nh")) merge_tree = tr_new_from_file(phast_fopen(optarg, "r")); else { merge_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1); merge_tree = merge_mod->tree; } break; case 'r': rename_hash = make_name_hash(optarg); break; case 't': tree_only = TRUE; break; case 'N': no_branchlen = TRUE; tree_only = TRUE; break; case 'd': dissect = TRUE; break; case 'b': print_branchlen = TRUE; break; case 'D': print_distance_to_root = TRUE; node_distance_name = optarg; break; case 'R': reroot_name = optarg; break; case 'B': with_branch = TRUE; break; case 'a': name_ancestors = TRUE; break; case 'S': subtree_name = optarg; break; case 'n': inNewick=TRUE; break; case 'L': //do the same for --label--subtree and --label-branches case 'l': if (label == NULL) { label = lst_new_ptr(1); labelType = lst_new_int(1); } optstr = str_new_charstr(optarg); lst_push_ptr(label, optstr); lst_push_int(labelType, (int)c); break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind != argc - 1) die("Input filename required. 
Try '%s -h'.\n", argv[0]); if (merge_tree != NULL && extrapolate_tree != NULL) die("ERROR: Can't use --merge and --extrapolate together"); set_seed(-1); suffix = str_new_charstr(argv[optind]); str_suffix(suffix, '.'); if (inNewick || str_equals_charstr(suffix, "nh")) { tree = tr_new_from_file(phast_fopen(argv[optind], "r")); tree_only = TRUE; /* can't output tree model in this case */ } else { mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1); tree = mod->tree; } if (prune_names != NULL) { tr_prune(&tree, prune_names, prune_all_but, NULL); if (mod != NULL) mod->tree = tree; /* root may have changed */ } if (get_subtree_name != NULL) { n = tr_get_node(tree, get_subtree_name); if (n == NULL) { tr_name_ancestors(tree); n = tr_get_node(tree, get_subtree_name); if (n == NULL) { die("ERROR: no node named '%s'.\n", subtree_name); } } tr_prune_supertree(&tree, n); if (mod != NULL) mod->tree = tree; } if (merge_tree != NULL) { tree = tr_hybrid(tree, merge_tree); if (mod != NULL) mod->tree = tree; } else if (extrapolate_tree != NULL) { tr_scale_by_subtree(extrapolate_tree, tree); tree = extrapolate_tree; if (mod != NULL) mod->tree = tree; } if (scale_factor != 1) { if (subtree_name == NULL) tr_scale(tree, scale_factor); else { n = tr_get_node(tree, subtree_name); if (n == NULL) die("ERROR: no node named '%s'.\n", subtree_name); tr_scale_subtree(tree, n, scale_factor, with_branch); } } if (name_ancestors) tr_name_ancestors(tree); if (rename_hash != NULL) { char *newname; for (i = 0; i < tree->nnodes; i++) { n = lst_get_ptr(tree->nodes, i); if (n->name != NULL && n->name[0] != '\0' && (newname = hsh_get(rename_hash, n->name)) != (char*)-1) { strcpy(n->name, newname); } } } if (reroot_name != NULL) { n = tr_get_node(tree, reroot_name); if (n == NULL) die("ERROR: no node named '%s'.\n", reroot_name); tr_reroot(tree, n, with_branch); if (mod != NULL) mod->tree = with_branch ? n->parent : n; tree = with_branch ? 
n->parent : n; } if (label != NULL) { for (i=0; i < lst_size(label); i++) { String *currstr = (String*)lst_get_ptr(label, i), *arg1, *labelVal; List *tmplst = lst_new_ptr(10); String *nodename; int j; str_split(currstr, ":", tmplst); if (lst_size(tmplst) != 2) die("ERROR: bad argument to --label-branches or --label-subtree.\n"); arg1 = lst_get_ptr(tmplst, 0); labelVal = lst_get_ptr(tmplst, 1); lst_clear(tmplst); if (lst_get_int(labelType, i) == (int)'l') { str_split(arg1, ",", tmplst); for (j=0; j < lst_size(tmplst); j++) { nodename = (String*)lst_get_ptr(tmplst, j); tr_label_node(tree, nodename->chars, labelVal->chars); } lst_free_strings(tmplst); } else if (lst_get_int(labelType, i) == (int)'L') { int include_leading_branch = FALSE; TreeNode *node; nodename = arg1; node = tr_get_node(tree, nodename->chars); if (node == NULL && nodename->chars[nodename->length-1] == '+') { nodename->chars[--nodename->length] = '\0'; node = tr_get_node(tree, nodename->chars); include_leading_branch = TRUE; } tr_label_subtree(tree, nodename->chars, include_leading_branch, labelVal->chars); } else die("ERROR got label_type %c\n", lst_get_int(labelType, (char)i)); str_free(arg1); str_free(labelVal); lst_free(tmplst); str_free(currstr); } lst_free(label); lst_free(labelType); } if (dissect) tr_print_nodes(stdout, tree); if (print_branchlen) printf("TOTAL_TREE_LEN: %f\n", tr_total_len(tree)); if (print_distance_to_root) { TreeNode *node = tr_get_node(tree, node_distance_name); if (node == NULL) die("ERROR: no node named '%s'.\n", node_distance_name); printf("length(root-%s): %f\n", node_distance_name, tr_distance_to_root(node)); } if (dissect==0 && print_branchlen==0 && print_distance_to_root==0) { if (tree_only) tr_print(stdout, tree, no_branchlen==FALSE); else tm_print(stdout, mod); } return 0; }
int main(int argc, char* argv[]) { FILE* F; TreeModel *model; int i, j, k, alph_size, nstates, do_eqfreqs = 0, exch_mode = 0, list_mode = 0, latex_mode = 0, suppress_diag = 0, ti_tv = 0, scientific_mode = 0, induced_aa = 0, do_stop_codons = 0, do_zeroes = 0, symmetric = 0, context_ti_tv = 0, all_branches = 0; int startcol, endcol, ncols, branch_no = 0, matrix_idx = 0; /* int aa_inv[256]; */ double t = -1, total_ti = 0, total_tv = 0, rho_s = 0, cpg_ti = 0, cpg_tv = 0, non_cpg_ti = 0, non_cpg_tv = 0, cpg_eqfreq = 0; char *rate_format_string = "%8.6f"; MarkovMatrix *M; char c; char tuple[5], tuple2[5]; /* , aa_alph[50]; */ char *subst_mat_fname = NULL, *subst_score_fname = NULL, *subst_mat_fname_paml = NULL, *order1_mod_fname = NULL; Matrix *subst_mat = NULL; List *matrix_list = lst_new_ptr(20), *traversal = NULL; while ((c = (char)getopt(argc, argv, "t:fedlLiM:N:A:B:aszSECh")) != -1) { switch(c) { case 't': if (optarg[0] == 'A') all_branches = 1; else t = get_arg_dbl_bounds(optarg, 0, INFTY); break; case 'f': do_eqfreqs = 1; break; case 'e': exch_mode = 1; break; case 'd': suppress_diag = 1; break; case 'l': list_mode = 1; break; case 'L': latex_mode = 1; break; case 'i': ti_tv = 1; break; case 'M': subst_mat_fname = optarg; induced_aa = 1; break; case 'N': subst_mat_fname_paml = optarg; induced_aa = 1; break; case 'A': subst_score_fname = optarg; break; case 'B': order1_mod_fname = optarg; break; case 'a': induced_aa = 1; do_zeroes = 1; break; case 's': do_stop_codons = 1; break; case 'z': do_zeroes = 1; break; case 'S': symmetric = 1; break; case 'E': scientific_mode = 1; rate_format_string = "%13.6e"; break; case 'C': context_ti_tv = 1; break; case 'h': print_usage(); exit(0); case '?': die("Unrecognized option. 
Try \"display_rate_matrix -h\" for help.\n"); } } set_seed(-1); if ((t >= 0 && exch_mode) || (latex_mode && list_mode) || ((ti_tv || subst_mat_fname != NULL || subst_score_fname != NULL || subst_mat_fname_paml != NULL || scientific_mode) && !list_mode) || (subst_mat_fname != NULL && subst_score_fname != NULL) || (subst_score_fname != NULL && subst_mat_fname_paml != NULL) || (subst_mat_fname != NULL && subst_mat_fname_paml != NULL) || optind != argc - 1) { die("ERROR: missing required arguments or illegal combination of arguments.\nTry \"display_rate_matrix -h\" for help.\n"); } F = phast_fopen(argv[optind], "r"); model = tm_new_from_file(F, 1); if (context_ti_tv) { /* this option requires completely different handling from the others */ if (model->order != 2) { die("ERROR: -C requires a model of order 3.\n"); } do_context_dependent_ti_tv(model); exit(0); } if (induced_aa) { TreeModel *aa_model = tm_induced_aa(model); char *codon_to_aa = get_codon_mapping(model->rate_matrix->states); /* before freeing model, grab the expected rate of synonymous subst, rho_s */ for (i = 0; i < model->rate_matrix->size; i++) for (j = 0; j < model->rate_matrix->size; j++) if (i != j && codon_to_aa[i] == codon_to_aa[j]) rho_s += mm_get(model->rate_matrix, i, j) * vec_get(model->backgd_freqs, i); sfree(codon_to_aa); tm_free(model); model = aa_model; } if (all_branches) { traversal = tr_inorder(model->tree); for (matrix_idx = 0; matrix_idx < lst_size(traversal); matrix_idx++) { TreeNode *n = lst_get_ptr(traversal, matrix_idx); if (n->parent == NULL) { lst_push_ptr(matrix_list, NULL); continue; } M = mm_new(model->rate_matrix->size, model->rate_matrix->states, DISCRETE); mm_exp(M, model->rate_matrix, n->dparent); lst_push_ptr(matrix_list, M); } } else if (t >= 0) { M = mm_new(model->rate_matrix->size, model->rate_matrix->states, DISCRETE); mm_exp(M, model->rate_matrix, t); lst_push_ptr(matrix_list, M); } else lst_push_ptr(matrix_list, model->rate_matrix); alph_size = 
(int)strlen(model->rate_matrix->states); nstates = model->rate_matrix->size; if (subst_mat_fname != NULL) { if ((F = fopen(subst_mat_fname, "r")) == NULL) { die("ERROR: Can't open %s.\n", subst_mat_fname); } subst_mat = read_subst_mat(F, AA_ALPHABET); } else if (subst_mat_fname_paml != NULL) { if ((F = fopen(subst_mat_fname_paml, "r")) == NULL) { die("ERROR: Can't open %s.\n", subst_mat_fname_paml); } subst_mat = read_paml_matrix(F, AA_ALPHABET); } else if (subst_score_fname != NULL) { if ((F = fopen(subst_score_fname, "r")) == NULL) { die("ERROR: Can't open %s.\n", subst_score_fname); } subst_mat = read_subst_scores(model, F); } else if (order1_mod_fname != NULL) { if ((F = fopen(order1_mod_fname, "r")) == NULL) { die("ERROR: Can't open %s.\n", order1_mod_fname); } subst_mat = unproject_rates(model, tm_new_from_file(F, 1)); } /* loop through matrices to print */ for (matrix_idx = 0; matrix_idx < lst_size(matrix_list); matrix_idx++) { M = lst_get_ptr(matrix_list, matrix_idx); if (all_branches) { if (M == NULL) continue; /* root */ printf("BRANCH %d (t = %.6f)\n", ++branch_no, ((TreeNode*)lst_get_ptr(traversal, matrix_idx))->dparent); } /* print no more than 16 columns at a time (except with -a) */ ncols = (induced_aa ? nstates : 16); for (startcol = 0; startcol < nstates; startcol += ncols) { endcol = min(nstates, startcol+ncols); /* table header */ if (! 
list_mode) { if (latex_mode) { printf("\\begin{tabular}{|c|"); for (i = startcol; i < endcol; i++) printf("r"); printf("|}\n\\hline\n"); } printf("%-5s ", ""); if (latex_mode) printf("& "); for (i = startcol; i < endcol; i++) { get_state_tuple(model, tuple, i); if (latex_mode) { printf("{\\bf %s}", tuple); if (i < endcol-1) printf("& "); } else printf("%8s ", tuple); } if (latex_mode) printf("\\\\\n\\hline\n"); else printf("\n"); } /* table or list contents */ for (i = 0; i < nstates; i++) { if (induced_aa && AA_ALPHABET[i] == '$' && !do_stop_codons) continue; get_state_tuple(model, tuple, i); /* get total eq freq of tuples containing CpG dinucs */ for (k = 0; k < model->order; k++) { if (tuple[k] == 'C' && tuple[k+1] == 'G') { cpg_eqfreq += vec_get(model->backgd_freqs, i); /* printf("***CPG***"); */ break; } } if (latex_mode) printf("{\\bf %s}& ", tuple); else if (!list_mode) printf("%-5s ", tuple); for (j = startcol; j < endcol; j++) { if (induced_aa && AA_ALPHABET[j] == '$' && !do_stop_codons) continue; if (latex_mode) printf("$"); if (list_mode) { if (symmetric && j <= i) continue; else if ((t < 0 && ! all_branches) && (i == j || (!do_zeroes && mm_get(M, i, j) == 0))) continue; get_state_tuple(model, tuple2, j); printf("%-5s %-5s ", tuple, tuple2); } if (i == j && suppress_diag && !list_mode) printf("%-7s", "-"); else { /* get rate or probability */ double val = exch_mode == 0 ? 
mm_get(M, i, j) : safediv(mm_get(M, i, j), vec_get(model->backgd_freqs,j)); /* print value in format %8.6f or %13.6e */ printf(rate_format_string, val); printf(" "); } if (latex_mode) { printf("$"); if (j < endcol-1) printf("& "); } else if (list_mode) { int ti, is_cpg; if (ti_tv) { ti = -1; is_cpg = 0; for (k = 0; k <= model->order; k++) { int dig_i = (i % int_pow(alph_size, k+1)) / int_pow(alph_size, k); int dig_j = (j % int_pow(alph_size, k+1)) / int_pow(alph_size, k); char next_char = '\0', prev_char = '\0'; if (dig_i != dig_j) { ti = is_transition(M->states[dig_i], M->states[dig_j]); if (k != model->order) prev_char = M->states[(i % int_pow(alph_size, k+2)) / int_pow(alph_size, k+1)]; if (k != 0) next_char = M->states[(i % int_pow(alph_size, k)) / int_pow(alph_size, k-1)]; if ((M->states[dig_i] == 'C' && next_char == 'G') || (M->states[dig_i] == 'G' && prev_char == 'C')) is_cpg = 1; } } if (ti == -1) die("ERROR ti=-1\n"); printf("%5s ", ti ? "ti" : "tv"); /* printf("%5s ", is_cpg ? "CPG" : "-"); */ if (ti) { total_ti += mm_get(M, i, j) * vec_get(model->backgd_freqs, i); if (is_cpg) cpg_ti += mm_get(M, i, j) * vec_get(model->backgd_freqs, i); else non_cpg_ti += mm_get(M, i, j) * vec_get(model->backgd_freqs, i); } else { total_tv += mm_get(M, i, j) * vec_get(model->backgd_freqs, i); if (is_cpg) cpg_tv += mm_get(M, i, j) * vec_get(model->backgd_freqs, i); else non_cpg_tv += mm_get(M, i, j) * vec_get(model->backgd_freqs, i); } } if (subst_mat != NULL) { if (mat_get(subst_mat, i, j) == NEGINFTY) printf("%8s", "-"); else printf("%8.4f", mat_get(subst_mat, i, j)); } printf("\n"); } } if (latex_mode) printf("\\\\\n"); else if (!list_mode) printf("\n"); } /* equilibrium freqs (table case only) */ if (do_eqfreqs && ! 
list_mode) { if (latex_mode) printf("\\hline\n$\\boldsymbol{\\mathbf{\\pi}}$&"); else printf("%-5s ", "pi"); for (i = startcol; i < endcol; i++) { if (latex_mode) printf("$%8.4f$ ", vec_get(model->backgd_freqs, i)); else printf("%8.4f ", vec_get(model->backgd_freqs, i)); if (latex_mode && i < endcol-1) printf("& "); } if (latex_mode) printf("\\\\\n"); else printf("\n"); } if (latex_mode) printf("\\hline\n\\end{tabular}\n\n"); } /* equilibrium freqs (list case only) */ if (do_eqfreqs && list_mode) { for (i = 0; i < nstates; i++) { get_state_tuple(model, tuple, i); printf("%-5s %-5s ", "-", tuple); //!! printf(rate_format_string, vec_get(model->backgd_freqs, i)); printf("\n"); } } if (ti_tv && list_mode) { printf("\n#Total ti/tv = %.4f\n", total_ti/total_tv); printf("#CpG ti ratio = %.4f, CpG tv ratio = %.4f\n", cpg_ti/non_cpg_ti /* * (1 - cpg_eqfreq) */ / cpg_eqfreq, cpg_tv/non_cpg_tv /* * (1 - cpg_eqfreq) */ / cpg_eqfreq); } else if (induced_aa) printf("\n#Total rho_s/rho_v = %.4f\n", rho_s/(3-rho_s)); if (all_branches == 1) printf("\n\n"); } tm_free(model); lst_free(matrix_list); return 0; }
/**
 * Construct a hasher from an explicit seed.
 *
 * Calls setup_table() first and set_seed(s) second; the order is kept
 * because seeding may depend on the table already being set up.
 *
 * @param s seed value handed to set_seed()
 */
TabulationHashing( uint64_t s )
{
    setup_table();
    set_seed( s );
}
/**
 * Default constructor: calls setup_table() and then set_seed(0), i.e. the
 * same sequence as the seeded constructor with a seed of zero.
 */
TabulationHashing()
{
    setup_table();
    set_seed( 0 );
}
int main(int argc, char *argv[]) { FILE *prob_f; char c; int opt_idx, i, nlines = 0, ngaps = 0; unsigned idx; PbsCode *code; List *fields = lst_new_ptr(10); double error, tot_error = 0, prob; Vector *v; String *line = str_new(STR_MED_LEN); struct option long_opts[] = { {"discard-gaps", 0, 0, 'G'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; /* variables for options, with defaults */ int discard_gaps = FALSE; set_seed(-1); while ((c = (char)getopt_long(argc, argv, "Gh", long_opts, &opt_idx)) != -1) { switch (c) { case 'G': discard_gaps = TRUE; break; case 'h': printf("%s", HELP); exit(0); case '?': die("Bad argument. Try 'pbsEncode -h'.\n"); } } if (optind != argc - 2) die("Two arguments required. Try 'pbsEncode -h'.\n"); prob_f = phast_fopen(argv[optind], "r"); code = pbs_new_from_file(phast_fopen(argv[optind+1], "r")); v = vec_new(code->sg->d); while (str_readline(line, prob_f) != EOF) { if (line->length == 0 || line->chars[0] == '#') continue; str_split(line, NULL, fields); if (lst_size(fields) == 1 && str_equals_charstr(lst_get_ptr(fields, 0), "-")) { ngaps++; if (!discard_gaps) pbs_write_binary(code, code->gap_code, stdout); } else { /* ordinary prob vector */ if (lst_size(fields) != code->sg->d) die("ERROR: number of columns must equal dimension of code (%d).\n", code->sg->d); for (i = 0; i < code->sg->d; i++) { if (str_as_dbl(lst_get_ptr(fields, i), &prob) != 0 || prob < 0 || prob > 1) die("ERROR: bad value ('%s')\n", lst_get_ptr(fields, i)); vec_set(v, i, prob); } idx = pbs_get_index(code, v, &error); tot_error += error; pbs_write_binary(code, idx, stdout); nlines++; } lst_free_strings(fields); } fprintf(stderr, "Dimensions: %d\n\ Rows per dimension: %d\n\ Code size: %d\n\ Bytes per vector: %d\n\ Vectors processed: %d\n\ Gaps: %d%s\n\ Average approximation error: %f bits\n", code->sg->d, code->sg->nrows, code->code_size, code->nbytes, nlines, ngaps, discard_gaps ? " (discarded)" : "", tot_error/nlines); return 0; }
/* ******************************************************** * * Create the sampling structure and initialize all values * * ******************************************************** */ void create_sampling(int n, int *L, int B, int generator_flag, int initial_count) { int *ordern, *permun, *myL; int i; // Set the local size of permutations local_perm_size = B; local_perm_count = 1; // Allocate and initialize local L to original L local_L = (int *)R_alloc(n, sizeof(int)); // To check random or complete if( generator_flag == 1 ) { // Initialize the permutation array (struct) // Sets the B in the struct equal to 0 to be // able to identify the generator later on init_permu_array(&local_pa, L, n, 0); // * ================================================================= * // * Forwarding logic * // * ---------------- * // * For parallel executions all processes, appart from the one with * // * rank == 0, should forward their random generators in order to * // * be able to reproduce the exact same random permutations as the * // * serial version of the code (or a version with only one thread) * // * * // * The code below uses the initial_count variable to "burn" the * // * cycles from the random generator * // * ================================================================= * // All processes apart from process with rank == 0 (translates to // initial_count == 0) will perform this forward if ( initial_count != 0 ) { // Initialize label init_label(local_pa.n, local_pa.k, local_pa.nk, local_L); // Burn the cycles // We *must* use L in order to initialize it in the proper permutation // L is given as the starting position for the next permutation for(i=0; i < initial_count; i++) { next_label(local_pa.n, local_pa.k, local_pa.nk, local_L); } } // * ================================================================= * // * ================================================================= * } else { // Intiailize the permu_array (struct) init_permu_array(&local_pa, L, n, B); 
permun = (int *)R_alloc(local_pa.n, sizeof(int)); ordern = (int*)R_alloc(local_pa.n, sizeof(int)); myL = (int *)R_alloc(local_pa.n, sizeof(int)); // * ================================================================= * // * Forwarding logic * // * ---------------- * // * For parallel executions all processes, appart from the one with * // * rank == 0, should forward their random generators in order to * // * be able to reproduce the exact same random permutations as the * // * serial version of the code (or a version with only one thread) * // * * // * The code below uses the initial_count variable to "burn" the * // * cycles from the random generator * // * ================================================================= * // Set initial seed set_seed(g_random_seed); // Burn the cycles // ("permun" is a safe scratch space for this) for(i=0; i < initial_count; i++) { sample(permun, n); } // * ================================================================= * // * ================================================================= * for(i=0; i<n; i++){ ordern[i]=i; } // Allocate and assign the values for l_first_sample set_permu(&local_pa, 0, L); for(i=1; i<B; i++) { memcpy(permun, ordern, sizeof(int)*n); sample(permun, n); // Change to labbeling sample2label(n, local_pa.k, local_pa.nk, permun, myL); set_permu(&local_pa, i, myL); } } }
int main(int argc, char *argv[]) { TreeNode *tree = NULL; TreeModel *backgd_mod = NULL; int i, j, size = DEFAULT_SIZE, meme_mode = 0, profile_mode = 0, nrestarts = 10, npseudocounts = 5, nsamples = -1, nmostprevalent = -1, tuple_size = -1, nbest = -1, sample_parms = 0, nmotifs = DEFAULT_NUMBER, nseqs = -1, do_html = 0, do_bed = 0, suppress_stdout = 0; List *msa_name_list = NULL, *pos_examples = NULL, *init_list = NULL, *tmpl; List *msas, *motifs; SeqSet *seqset = NULL; PooledMSA *pmsa = NULL; msa_format_type msa_format = UNKNOWN_FORMAT; Vector *backgd_mnmod = NULL; Hashtable *hash=NULL; String *output_prefix = str_new_charstr("phastm."); double *has_motif = NULL; double prior = PRIOR; char c; GFF_Set *bedfeats = NULL; while ((c = getopt(argc, argv, "t:i:b:sk:md:pn:I:R:P:w:c:SB:o:HDxh")) != -1) { switch (c) { case 't': tree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("ERROR: bad input format.\n"); break; case 'b': backgd_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1); break; case 's': break; case 'k': size = get_arg_int(optarg); break; case 'm': meme_mode = 1; break; case 'd': pos_examples = get_arg_list(optarg); break; case 'p': profile_mode = 1; break; case 'n': nrestarts = get_arg_int(optarg); break; case 'I': init_list = get_arg_list(optarg); break; case 'P': tmpl = str_list_as_int(get_arg_list(optarg)); if (lst_size(tmpl) != 2) die("ERROR: bad argument to -P.\n"); nmostprevalent = lst_get_int(tmpl, 0); tuple_size = lst_get_int(tmpl, 1); if (!(nmostprevalent > 0 && tuple_size > 0)) die("ERROR: bad argument nmostprevalent=%i tuple_size=%i\n", nmostprevalent, tuple_size); lst_free(tmpl); break; case 'R': tmpl = str_list_as_int(get_arg_list(optarg)); if (lst_size(tmpl) != 2) die("ERROR: bad argument to -R.\n"); nsamples = lst_get_int(tmpl, 0); tuple_size = lst_get_int(tmpl, 1); if (!(nsamples > 0 && tuple_size > 0)) die("ERROR nsamples=%i tuple_sizse=%i\n", 
nsamples, tuple_size); lst_free(tmpl); break; case 'c': npseudocounts = get_arg_int(optarg); break; case 'w': nbest = get_arg_int(optarg); break; case 'S': sample_parms = 1; break; case 'B': nmotifs = get_arg_int(optarg); break; case 'o': str_free(output_prefix); output_prefix = str_new_charstr(optarg); str_append_char(output_prefix, '.'); break; case 'H': do_html = 1; break; case 'D': do_bed = 1; break; case 'x': suppress_stdout = 1; break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind != argc - 1) die("ERROR: List of alignment files required. Try '%s -h'.\n", argv[0]); if ((nsamples > 0 && nmostprevalent > 0) || (nsamples > 0 && init_list != NULL) || (nmostprevalent > 0 && init_list != NULL)) die("ERROR: -I, -P, and -R are mutually exclusive."); set_seed(-1); msa_name_list = get_arg_list(argv[optind]); if (backgd_mod != NULL && tree == NULL) tree = backgd_mod->tree; if (tree == NULL && !meme_mode && !profile_mode) die("ERROR: Must specify -t, -m, or -p.\n"); if ((init_list != NULL || nsamples > 0 || nmostprevalent > 0) && !sample_parms) nrestarts = 1; if (pos_examples != NULL) { hash = hsh_new(lst_size(pos_examples)); for (i = 0; i < lst_size(pos_examples); i++) hsh_put_int(hash, ((String*)lst_get_ptr(pos_examples, i))->chars, 1); has_motif = smalloc(lst_size(msa_name_list) * sizeof(double)); } /* open all MSAs */ msas = lst_new_ptr(lst_size(msa_name_list)); fprintf(stderr, "Reading alignment(s) ...\n"); for (i = 0, j = 0; i < lst_size(msa_name_list); i++) { String *name = lst_get_ptr(msa_name_list, i); FILE *mfile = phast_fopen(name->chars, "r"); msa_format_type temp_format; MSA *msa; if (msa_format == UNKNOWN_FORMAT) temp_format = msa_format_for_content(mfile, 1); else temp_format = msa_format; msa = msa_new_from_file_define_format(mfile, temp_format, NULL); phast_fclose(mfile); if (nseqs == -1) nseqs = msa->nseqs; if (!meme_mode && (msa->length - msa_num_gapped_cols(msa, STRIP_ANY_GAPS, -1, -1) < 300 || 
msa->nseqs != nseqs)) { fprintf(stderr, "WARNING: ignoring alignment '%s' -- too few informative sites.\n", name->chars); msa_free(msa); continue; } if (msa_alph_has_lowercase(msa)) msa_toupper(msa); msa_remove_N_from_alph(msa); /* Ns can be a problem */ lst_push_ptr(msas, msa); if (has_motif != NULL) { int k, hm = (hsh_get_int(hash, name->chars) == 1); if (meme_mode) { /* here need to record at individ seq level */ has_motif = srealloc(has_motif, (j + msa->nseqs + 1) * sizeof(double)); /* FIXME */ for (k = 0; k < msa->nseqs; k++) has_motif[j++] = hm; } else has_motif[j++] = hm; } } if (!meme_mode) { fprintf(stderr, "Extracting and pooling sufficient statistics ...\n"); pmsa = ss_pooled_from_msas(msas, 1, size, NULL, 0); msa_remove_N_from_alph(pmsa->pooled_msa); } /* obtain individual sequences, if necessary */ if (nmostprevalent > 0 || nsamples > 0 || meme_mode) { if (meme_mode) fprintf(stderr, "Converting to individual sequences ...\n"); else fprintf(stderr, "Obtaining reference sequences for pre-processing ...\n"); seqset = mtf_get_seqset(msas, meme_mode ? -1 : 1, 10 * size); /* for now, assume 1st seq is reference */ msa_remove_N_from_alph(seqset->set); } if (nmostprevalent > 0) { fprintf(stderr, "Obtaining %d most prevalent %d-tuples ...\n", nmostprevalent, tuple_size); init_list = lst_new_ptr(nmostprevalent); mtf_get_common_ntuples(seqset, init_list, tuple_size, nmostprevalent); } else if (nsamples > 0) { fprintf(stderr, "Sampling %d %d-tuples ...\n", nsamples, tuple_size); init_list = lst_new_ptr(nsamples); mtf_sample_ntuples(seqset, init_list, tuple_size, nsamples); } /* in meme_mode, backgd model can be specified as eq freqs in a .mod file */ if (meme_mode && backgd_mod != NULL && has_motif == NULL) backgd_mnmod = backgd_mod->backgd_freqs; /* estimate background model, if necessary */ else if (backgd_mod == NULL && (!meme_mode || has_motif == NULL)) { fprintf(stderr, "Fitting background model%s ...\n", has_motif == NULL ? 
"" : " (for use in initialization)"); /* if discriminative, be clear backgd isn't really part of the estimation procedure */ if (meme_mode) { backgd_mnmod = vec_new(strlen(seqset->set->alphabet)); mtf_estim_backgd_mn(seqset, backgd_mnmod); } else { backgd_mod = tm_new(tr_create_copy(tree), NULL, NULL, F81, pmsa->pooled_msa->alphabet, 1, 0, NULL, -1); tm_fit(backgd_mod, pmsa->pooled_msa, tm_params_init(backgd_mod, .1, 5, 0), -1, OPT_MED_PREC, NULL, 0, NULL); } } /* select subset of init strings, if necessary */ if (nbest > 0 && init_list != NULL) { fprintf(stderr, "Winnowing candidate start strings ...\n"); tmpl = lst_new_ptr(nbest); mtf_winnow_starts(meme_mode ? (void*)seqset : (void*)pmsa, init_list, nbest, tmpl, !meme_mode, size, tree, meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, has_motif); lst_free(init_list); init_list = tmpl; } /* Now find motifs */ motifs = mtf_find(meme_mode ? (void*)seqset : (void*)pmsa, !meme_mode, size, nmotifs, tree, meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, has_motif, prior, nrestarts, init_list, sample_parms, npseudocounts); fprintf(stderr, "\n\n"); if (do_bed) bedfeats = gff_new_set_init("phast_motif", "0.1b"); /* generate output */ for (i = 0; i < lst_size(motifs); i++) { Motif *m = lst_get_ptr(motifs, i); if (!suppress_stdout) { if (lst_size(motifs) > 1) printf("\n**********\nMOTIF #%d\n**********\n\n", i+1); mtf_print(stdout, m); } if (do_html) { String *fname = str_dup(output_prefix); str_append_int(fname, i+1); str_append_charstr(fname, ".html"); mtf_print_html(phast_fopen(fname->chars, "w+"), m); str_free(fname); } if (do_bed) mtf_add_features(m, bedfeats); } if (do_html) { String *fname = str_dup(output_prefix); str_append_charstr(fname, "index.html"); mtf_print_summary_html(phast_fopen(fname->chars, "w+"), motifs, output_prefix); str_free(fname); } if (do_bed) { String *fname = str_dup(output_prefix); str_append_charstr(fname, "bed"); gff_print_bed(phast_fopen(fname->chars, "w+"), bedfeats, FALSE); 
str_free(fname); } return 0; }