Esempio n. 1
0
int main(int argc, char** argv) {
    // beginning of the verbose OpenCL allocation
    cl_platform_id platform_id = NULL;
    cl_uint ret_num_platforms = 0;
    cl_uint ret_num_devices = 0;
    cl_int ret = 0;

    // the output from opencl kernel
    float *c_inputs = malloc(ARRAY_SIZE*sizeof(float));
    float *c_outputs = malloc(ARRAY_SIZE*sizeof(float));
    cl_float *cl_inputs = malloc(ARRAY_SIZE*sizeof(cl_float));
    cl_float *cl_outputs = malloc(ARRAY_SIZE*sizeof(cl_float));

    // get random numbers via Rmath
    set_seed(atoi(argv[1]), 197414);
    float tmp_in = 0.0;

    #pragma omp parallel for
    for (long i = 0; i < ARRAY_SIZE; i++) {
        tmp_in = rnorm(0, 1);
        c_inputs[i] = tmp_in;
        cl_inputs[i] = (cl_float) tmp_in;
    }

    // measure time elapse
    clock_t start = clock();
    #pragma omp parallel for
    for (long i = 0; i < ARRAY_SIZE; i++) {
        c_outputs[i] = expf(c_inputs[i]);
    }
    printf("CPU time for %d exp operation: %d\n", ARRAY_SIZE, (int) (clock() - start));

    // read kernel source
    FILE *fp;
    char filename[] = "./hello_log.cl";
    char *source_str;
    size_t source_size;
    fp = fopen(filename, "r");
    source_str = (char*) malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str,
                        1,
                        MAX_SOURCE_SIZE,
                        fp);
    fclose(fp);

    // get platform and device info
    ret = clGetPlatformIDs(1,
                           &platform_id,
                           &ret_num_platforms);
    cl_device_id device_ids[2];
    ret = clGetDeviceIDs(platform_id,
                         CL_DEVICE_TYPE_GPU,
                         2,
                         device_ids,
                         &ret_num_devices);
    printf("Number of devices: %5d\n", ret_num_devices);

    // print device name
    char bdname[100];
    clGetDeviceInfo(device_ids[1], CL_DEVICE_NAME, 100, bdname, NULL);
    printf("Used device: %s\n", bdname);

    // use second GPU
    cl_device_id device_id = device_ids[1];

    // create opencl context
    cl_context context = clCreateContext(NULL,
                                         1,
                                         &device_id,
                                         NULL,
                                         NULL,
                                         &ret);


    // create command queue
    cl_command_queue command_queue = clCreateCommandQueueWithProperties(context,
                                     device_id,
                                     0,
                                     &ret);

    // create memory buffer for input
    cl_mem memobj_in = clCreateBuffer(context,
                                      CL_MEM_READ_WRITE,
                                      ARRAY_SIZE*sizeof(cl_float),
                                      NULL,
                                      &ret);

    // create memory buffer for output
    cl_mem memobj_out = clCreateBuffer(context,
                                       CL_MEM_READ_WRITE,
                                       ARRAY_SIZE*sizeof(cl_float),
                                       NULL,
                                       &ret);

    // create kernel program
    cl_program program = clCreateProgramWithSource(context,
                         1,
                         (const char **)&source_str,
                         (const size_t *)&source_size,
                         &ret);

    // build program
    ret = clBuildProgram(program,
                         1,
                         &device_id,
                         NULL,
                         NULL,
                         NULL);
    printf("build program successfully\n");

    // create opencl kernel
    cl_kernel kernel = clCreateKernel(program,
                                      "hello_exp",
                                      &ret);

    // set opencl parameters for inputs
    ret = clSetKernelArg(kernel,
                         0,
                         sizeof(cl_mem),
                         (void *)&memobj_in);

    // set opencl parameters for inputs
    ret = clSetKernelArg(kernel,
                         1,
                         sizeof(cl_mem),
                         (void *)&memobj_out);

    // execute opencl kernel
    size_t global_item_size = ARRAY_SIZE/32;
    size_t local_item_size = 32;

    // measure time
    start = clock();
    ret = clEnqueueWriteBuffer(command_queue,
                               memobj_in,
                               CL_TRUE,
                               0,
                               ARRAY_SIZE*sizeof(cl_float),
                               cl_inputs,
                               0,
                               NULL,
                               NULL);
    // run it
    ret = clEnqueueNDRangeKernel(command_queue,
                                 kernel,
                                 1,
                                 NULL,
                                 &global_item_size,
                                 &local_item_size,
                                 0,
                                 NULL,
                                 NULL);

    // copy results from the memory buffer
    ret = clEnqueueReadBuffer(command_queue,
                              memobj_out,
                              CL_TRUE,
                              0,
                              ARRAY_SIZE*sizeof(cl_float),
                              cl_outputs,
                              0,
                              NULL,
                              NULL);
    printf("GPU time (with PCI-E overhead): %d\n", (int) (clock() - start));
    printf("inputs: %3.7f  %3.7f\n", c_inputs[150000], cl_inputs[150000]);
    printf("outputs: %3.7f  %3.7f\n", c_outputs[150000], (float) cl_outputs[150000]);

    // finalization
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(memobj_in);
    ret = clReleaseMemObject(memobj_out);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free(source_str);

    return 0;
}
Esempio n. 2
0
int main(int argc, char *argv[]) {
  List *pruned_names = lst_new_ptr(5);
  TreeModel *source_mod;
  MSA *msa = NULL, *out_msa;
  IndelHistory *ih;
  char *read_hist_fname = NULL;
  char c;
  int opt_idx, old_nnodes, i;

  msa_format_type msa_format = UNKNOWN_FORMAT;
  int output_alignment = FALSE, ia_names = FALSE;
  
  struct option long_opts[] = {
    {"msa-format", 1, 0, 'i'},
    {"output-alignment", 0, 0, 'A'},
    {"read-history", 1, 0, 'H'},
    {"ia-names", 0, 0, 'I'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = getopt_long(argc, argv, "i:H:AIh", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == -1)
        die("ERROR: unrecognized alignment format.\n");
      break;
    case 'A':
      output_alignment = TRUE;
      break;
    case 'H':
      read_hist_fname = optarg;
      break;
    case 'I':
      ia_names = TRUE;
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("Bad argument.  Try 'indelHistory -h'.\n");
    }
  }

  set_seed(-1);

  if (read_hist_fname != NULL) {
    fprintf(stderr, "Reading indel history from %s...\n", read_hist_fname);
    ih = ih_new_from_file(phast_fopen(read_hist_fname, "r"));
  }

  else {
    FILE *mfile;
    if (optind != argc - 2) 
      die("Two arguments required.  Try 'indelHistory -h'.\n");

    fprintf(stderr, "Reading alignment from %s...\n", argv[optind]);
    mfile = phast_fopen(argv[optind], "r");
    if (msa_format == UNKNOWN_FORMAT) 
      msa_format = msa_format_for_content(mfile, 1);
    msa = msa_new_from_file_define_format(mfile, msa_format, "ACGTNB^.-");
    phast_fclose(mfile);

    if (msa->seqs == NULL && (msa->ss == NULL || msa->ss->tuple_idx == NULL))
      die("ERROR: ordered representation of alignment required.\n");

    fprintf(stderr, "Reading tree from %s...\n", argv[optind+1]);
    source_mod = tm_new_from_file(phast_fopen(argv[optind+1], "r"), 1);
    
    /* prune tree, if necessary */
    old_nnodes = source_mod->tree->nnodes;
    tm_prune(source_mod, msa, pruned_names);
    
    if (lst_size(pruned_names) == (old_nnodes + 1) / 2)
      die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n");
    if (lst_size(pruned_names) > 0) {
      fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment (");
      for (i = 0; i < lst_size(pruned_names); i++)
	fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars,
		i < lst_size(pruned_names) - 1 ? ", " : ").\n");
    }
    lst_free(pruned_names);
    
    tr_name_ancestors(source_mod->tree);

    if (msa->nseqs > (source_mod->tree->nnodes + 1) / 2) { /* assume ancestral 
							      seqs specified 
							      in this case */
      if (ia_names) {
        fprintf(stderr, "Converting sequence names...\n");
        ih_convert_ia_names(msa, source_mod->tree);
      }

      fprintf(stderr, "Extracting indel history from alignment...\n");
      ih = ih_extract_from_alignment(msa, source_mod->tree);
    }
   
    else {                        /* infer by parsimony */
      if (msa->ss == NULL) {
        fprintf(stderr, "Extracting sufficient statistics...\n");
        ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0);
      }
      
      fprintf(stderr, "Inferring indel history by parsimony...\n");
      ih = ih_reconstruct(msa, source_mod->tree);
    }
  }

  if (output_alignment) {
    out_msa = ih_as_alignment(ih, msa);
    msa_print(stdout, out_msa, FASTA, FALSE);
  }
  else
    ih_print(ih, stdout, 
             read_hist_fname != NULL ? read_hist_fname : argv[optind], 
             "indelHistory");

  fprintf(stderr, "Done.\n");
  return 0;
}
Esempio n. 3
0
int main(int argc, char *argv[]) {
  char c;
  char *msa_fname = NULL;
  int opt_idx, i, old_nnodes;
  MSA *msa;
  List *pruned_names = lst_new_ptr(5), *tmpl;
  BDPhyloHmm *bdphmm;
  GFF_Set *predictions;
  int found = FALSE;
  List *ignore_types = lst_new_ptr(1);

  struct option long_opts[] = {
    {"refseq", 1, 0, 'M'},
    {"msa-format", 1, 0, 'i'},
    {"refidx", 1, 0, 'r'},
    {"rho", 1, 0, 'R'},
    {"phi", 1, 0, 'p'},
    {"transitions", 1, 0, 't'},    
    {"expected-length", 1, 0, 'E'},
    {"target-coverage", 1, 0, 'C'},
    {"seqname", 1, 0, 'N'},
    {"idpref", 1, 0, 'P'},
    {"indel-model", 1, 0, 'I'},
    {"indel-history", 1, 0, 'H'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  /* arguments and defaults for options */
  FILE *refseq_f = NULL, *msa_f = NULL;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  TreeModel *source_mod;
  double rho = DEFAULT_RHO, mu = DEFAULT_MU, nu = DEFAULT_NU, 
    phi = DEFAULT_PHI, gamma = -1, omega = -1, 
    alpha_c = -1, beta_c = -1, tau_c = -1,
    alpha_n = -1, beta_n = -1, tau_n = -1;
  int set_transitions = FALSE, refidx = 1, estim_phi = TRUE, 
    estim_gamma = TRUE, estim_omega = TRUE;
  char *seqname = NULL, *idpref = NULL;
  IndelHistory *ih = NULL;

  while ((c = getopt_long(argc, argv, "R:t:p:E:C:r:M:i:N:P:I:H:h", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'R':
      rho = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 't':
      if (optarg[0] != '~') estim_gamma = estim_omega = FALSE;
      else optarg = &optarg[1];
      set_transitions = TRUE;
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 2) 
        die("ERROR: bad argument to --transitions.\n");
      mu = lst_get_dbl(tmpl, 0);
      nu = lst_get_dbl(tmpl, 1);
      if (mu <= 0 || mu >= 1 || nu <= 0 || nu >= 1)
        die("ERROR: bad argument to --transitions.\n");
      lst_free(tmpl);
      break;
    case 'p':
      if (optarg[0] != '~') estim_phi = FALSE;
      else optarg = &optarg[1];
      phi = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'E':
      if (optarg[0] != '~') estim_omega = FALSE;
      else optarg = &optarg[1];
      omega = get_arg_dbl_bounds(optarg, 1, INFTY);
      mu = 1/omega;
      break;
    case 'C':
      if (optarg[0] != '~') estim_gamma = FALSE;
      else optarg = &optarg[1];
      gamma = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'r':
      refidx = get_arg_int_bounds(optarg, 0, INFTY);
      break;
    case 'M':
      refseq_f = phast_fopen(optarg, "r");
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT)
        die("ERROR: unrecognized alignment format.\n");
      break;
    case 'N':
      seqname = optarg;
      break;
    case 'P':
      idpref = optarg;
      break;
    case 'I':
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 3 && lst_size(tmpl) != 6)
        die("ERROR: bad argument to --indel-model.\n");
      alpha_n = lst_get_dbl(tmpl, 0);
      beta_n = lst_get_dbl(tmpl, 1);
      tau_n = lst_get_dbl(tmpl, 2);
      if (lst_size(tmpl) == 6) {
        alpha_c = lst_get_dbl(tmpl, 3);
        beta_c = lst_get_dbl(tmpl, 4);
        tau_c = lst_get_dbl(tmpl, 5);
      }
      else {
        alpha_c = alpha_n; beta_c = beta_n; tau_c = tau_n;
      }
      if (alpha_c <= 0 || alpha_c >= 1 || beta_c <= 0 || beta_c >= 1 || 
          tau_c <= 0 || tau_c >= 1 || alpha_n <= 0 || alpha_n >= 1 || 
          beta_n <= 0 || beta_n >= 1 || tau_n <= 0 || tau_n >= 1)
        die("ERROR: bad argument to --indel-model.\n");
      break;
    case 'H':
      fprintf(stderr, "Reading indel history from %s...\n", optarg);
      ih = ih_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("Bad argument.  Try 'dless -h'.\n");
    }
  }

  if (optind != argc - 1)
    die("Missing alignment file or model file.  Try 'dless -h'.\n");

  if (set_transitions && (gamma != -1 || omega != -1))
    die("ERROR: --transitions and --target-coverage/--expected-length cannot be used together.\n");

  if ((gamma != -1 && omega == -1) || (gamma == -1 && omega != -1))
    die("ERROR: --target-coverage and --expecteed-length must be used together.\n");

  set_seed(-1);

  if (gamma != -1)
    nu = gamma/(1-gamma) * mu;

  fprintf(stderr, "Reading tree model from %s...\n", argv[optind]);
  source_mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1);

  if (source_mod->nratecats > 1) 
    die("ERROR: rate variation not currently supported.\n");

  if (source_mod->order > 0)
    die("ERROR: only single nucleotide models are currently supported.\n");

  if (!tm_is_reversible(source_mod))
    phast_warning("WARNING: p-value computation assumes reversibility and your model is non-reversible.\n");

  /* read alignment */
  msa_f = phast_fopen(argv[optind], "r");

  fprintf(stderr, "Reading alignment from %s...\n", argv[optind]);
  if (msa_format == UNKNOWN_FORMAT) 
    msa_format = msa_format_for_content(msa_f, 1);

  if (msa_format == MAF) {
    msa = maf_read(msa_f, refseq_f, 1, NULL, NULL, NULL, -1, TRUE, NULL, 
                   NO_STRIP, FALSE); 
  }
  else 
    msa = msa_new_from_file_define_format(msa_f, msa_format, NULL);

  if (msa_alph_has_lowercase(msa)) msa_toupper(msa); 
  msa_remove_N_from_alph(msa);

  if (msa->ss == NULL) {
    fprintf(stderr, "Extracting sufficient statistics...\n");
    ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0);
  }
  else if (msa->ss->tuple_idx == NULL)
    die("ERROR: ordered representation of alignment required unless --suff-stats.\n");

  /* prune tree, if necessary */
  old_nnodes = source_mod->tree->nnodes;
  tm_prune(source_mod, msa, pruned_names);

  if (lst_size(pruned_names) == (old_nnodes + 1) / 2)
    die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n");
  if (lst_size(pruned_names) > 0) {
    fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment (");
    for (i = 0; i < lst_size(pruned_names); i++)
      fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars, 
              i < lst_size(pruned_names) - 1 ? ", " : ").\n");
  }

  /* this has to be done after pruning tree */
  tr_name_ancestors(source_mod->tree);

  /* also make sure match for reference sequence in tree */
  if (refidx > 0) {
    for (i = 0, found = FALSE; !found && i < source_mod->tree->nnodes; i++) {
      TreeNode *n = lst_get_ptr(source_mod->tree->nodes, i);
      if (!strcmp(n->name, msa->names[refidx-1]))
        found = TRUE;
    }
    if (!found) die("ERROR: no match for reference sequence in tree.\n");
  }

  /* checks for indel model */
  if (alpha_c > 0) {
    if (ih == NULL) {
      fprintf(stderr, "Reconstructing indel history by parsimony...\n");
      ih = ih_reconstruct(msa, source_mod->tree);
    }
    else {
      if (ih->ncols != msa->length)
        die("ERROR: indel history doesn't seem to match alignment.\n");
      if (ih->tree->nnodes != source_mod->tree->nnodes)
        die("ERROR: indel history doesn't seem to match tree model.\n");
    }
  }

  bdphmm = bd_new(source_mod, rho, mu, nu, phi, alpha_c, beta_c, tau_c, 
                  alpha_n, beta_n, tau_n, estim_gamma, estim_omega, 
                  estim_phi);

  /* compute emissions */
  phmm_compute_emissions(bdphmm->phmm, msa, FALSE);

  /* add emissions for indel model, if necessary */
  if (alpha_c > 0) {
    fprintf(stderr, "Adjusting emissions for indels...\n");
    bd_add_indel_emissions(bdphmm, ih);
  }

  /* postprocess for missing data (requires special handling) */
  fprintf(stderr, "Adjusting emissions for missing data...\n");
  bd_handle_missing_data(bdphmm, msa);

  if (estim_gamma || estim_omega || estim_phi) {
    fprintf(stderr, "Estimating free parameters...\n");
    bd_estimate_transitions(bdphmm, msa);
  }

  /* set seqname and idpref, if necessary */
  if (seqname == NULL || idpref == NULL) {
    /* derive default from file name root */
    String *tmp = str_new_charstr(msa_fname);
    if (!str_equals_charstr(tmp, "-")) {
      str_remove_path(tmp);
      str_root(tmp, '.');
      if (idpref == NULL) idpref = copy_charstr(tmp->chars);
      str_root(tmp, '.');         /* apply one more time for double suffix */
      if (seqname == NULL) seqname = tmp->chars;    
    }
    else if (seqname == NULL) seqname = "refseq";
  }

  /* obtain predictions */
  fprintf(stderr, "Running Viterbi algorithm...\n");
  predictions = phmm_predict_viterbi(bdphmm->phmm, seqname, NULL, idpref, NULL);
  lst_push_ptr(ignore_types, str_new_charstr("nonconserved"));
  gff_filter_by_type(predictions, ignore_types, TRUE, NULL);

  /* score predictions */
  fprintf(stderr, "Scoring predictions...\n");
  bd_score_predictions(bdphmm, predictions);
  
  /* can free emissions now */
  for (i = 0; i < bdphmm->phmm->hmm->nstates; i++)
    sfree(bdphmm->phmm->emissions[i]);
  sfree(bdphmm->phmm->emissions);
  bdphmm->phmm->emissions = NULL;

  /* convert GFF to coord frame of reference sequence and adjust
     coords by idx_offset, if necessary  */
  if (refidx != 0 || msa->idx_offset != 0)
    msa_map_gff_coords(msa, predictions, 0, refidx, msa->idx_offset);

  if (refidx != 0) 
    gff_flatten(predictions);	
  /* necessary because coord conversion might create overlapping
     features (can happen in deletions in reference sequence) */

  /* now output predictions */
  fprintf(stderr, "Writing GFF to stdout...\n");
  gff_print_set(stdout, predictions);

  fprintf(stderr, "Done.\n");
  
  return 0;
}
Esempio n. 4
0
int main(int argc, char *argv[]) {
    char c;
    List *l;
    int i, j, strand, bed_output = 0, backgd_nmods = -1, feat_nmods = -1,
                      winsize = -1, verbose = 0, max_nmods, memblocksize, old_nleaves,
                      refidx = 1, base_by_base = FALSE, windowWig = FALSE;
    TreeModel **backgd_mods = NULL, **feat_mods = NULL;
    HMM *backgd_hmm = NULL, *feat_hmm = NULL;
    msa_format_type inform = UNKNOWN_FORMAT;
    GFF_Set *features = NULL;
    MSA *msa, *msa_compl=NULL;
    double **backgd_emissions, **feat_emissions, **mem, **dummy_emissions,
           *winscore_pos=NULL, *winscore_neg=NULL;
    int *no_alignment=NULL;
    List *pruned_names;
    char *msa_fname;
    FILE *infile;

    int opt_idx;
    struct option long_opts[] = {
        {"background-mods", 1, 0, 'b'},
        {"background-hmm", 1, 0, 'B'},
        {"feature-mods", 1, 0, 'f'},
        {"feature-hmm", 1, 0, 'F'},
        {"features", 1, 0, 'g'},
        {"window", 1, 0, 'w'},
        {"window-wig", 1, 0, 'W'},
        {"base-by-base", 0, 0, 'y'},
        {"msa-format", 1, 0, 'i'},
        {"refidx", 1, 0, 'r'},
        {"output-bed", 0, 0, 'd'},
        {"verbose", 0, 0, 'v'},
        {"help", 0, 0, 'h'},
        {0, 0, 0, 0}
    };

    while ((c = getopt_long(argc, argv, "B:b:F:f:r:g:w:W:i:ydvh", long_opts, &opt_idx)) != -1) {
        switch (c) {
        case 'B':
            backgd_hmm = hmm_new_from_file(phast_fopen(optarg, "r"));
            break;
        case 'b':
            l = get_arg_list(optarg);
            backgd_nmods = lst_size(l);
            backgd_mods = smalloc(backgd_nmods * sizeof(void*));
            for (i = 0; i < backgd_nmods; i++)
                backgd_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1);
            lst_free_strings(l);
            lst_free(l);
            break;
        case 'F':
            feat_hmm = hmm_new_from_file(phast_fopen(optarg, "r"));
            break;
        case 'f':
            l = get_arg_list(optarg);
            feat_nmods = lst_size(l);
            feat_mods = smalloc(feat_nmods * sizeof(void*));
            for (i = 0; i < feat_nmods; i++)
                feat_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1);
            lst_free_strings(l);
            lst_free(l);
            break;
        case 'g':
            features = gff_read_set(phast_fopen(optarg, "r"));
            break;
        case 'w':
            winsize = get_arg_int(optarg);
            if (winsize <= 0) die("ERROR: window size must be positive.\n");
            break;
        case 'W':
            winsize = get_arg_int(optarg);
            if (winsize <= 0) die("ERROR: window size must be positive.\n");
            windowWig = TRUE;
            break;
        case 'y':
            base_by_base = TRUE;
            break;
        case 'i':
            inform = msa_str_to_format(optarg);
            if (inform == UNKNOWN_FORMAT) die("Bad argument to -i.\n");
            break;
        case 'r':
            refidx = get_arg_int_bounds(optarg, 0, INFTY);
            break;
        case 'd':
            bed_output = 1;
            break;
        case 'h':
            printf("%s", HELP);
            exit(0);
        case 'v':
            verbose = 1;
            break;
        case '?':
            die("Bad argument.  Try '%s -h'.\n", argv[0]);
        }
    }

    set_seed(-1);

    if (backgd_mods == NULL || feat_mods == NULL)
        die("ERROR: -b and -f required.  Try '%s -h'.\n", argv[0]);

    if (backgd_nmods == 1 && backgd_hmm == NULL)
        backgd_hmm = hmm_create_trivial();
    else if (backgd_hmm == NULL)
        die("ERROR: -B required.  Try '%s -h'.\n", argv[0]);

    if (feat_nmods == 1 && feat_hmm == NULL)
        feat_hmm = hmm_create_trivial();
    else if (feat_hmm == NULL)
        die("ERROR: -F required.  Try '%s -h'.\n", argv[0]);

    if ((winsize == -1 && features == NULL && !base_by_base) ||
            (winsize != -1 && features != NULL) ||
            (winsize != -1 && base_by_base) ||
            (features != NULL && base_by_base))
        die("ERROR: must specify exactly one of -g, -w, and -y.  Try '%s -h'.\n", argv[0]);

    if (backgd_hmm->nstates != backgd_nmods)
        die("ERROR: number of states must equal number of tree models for background.\n");

    if (feat_hmm->nstates != feat_nmods)
        die("ERROR: number of states must equal number of tree models for features.\n");

    if (features != NULL && lst_size(features->features) == 0)
        die("ERROR: empty features file.\n");

    if (base_by_base && (backgd_nmods > 1 || feat_nmods > 1))
        die("ERROR: only single phylogenetic models (not HMMs) are supported with --base-by-base.\n");

    if (optind != argc - 1)
        die("ERROR: too few arguments.  Try '%s -h'.\n", argv[0]);

    if (verbose) fprintf(stderr, "Reading alignment ...\n");
    msa_fname = argv[optind];
    infile = phast_fopen(msa_fname, "r");
    if (inform == UNKNOWN_FORMAT)
        inform = msa_format_for_content(infile, 1);
    if (inform == MAF)
        msa = maf_read(infile, NULL, 1, NULL, NULL,
                       NULL, -1, TRUE, NULL, NO_STRIP, FALSE);
    else
        msa = msa_new_from_file_define_format(infile, inform, NULL);
    if (msa_alph_has_lowercase(msa)) msa_toupper(msa);
    msa_remove_N_from_alph(msa);

    /* need ordered representation of alignment */
    if (msa->seqs == NULL && (msa->ss == NULL || msa->ss->tuple_idx == NULL) )
        die("ERROR: ordered sufficient statistics are required.\n");

    pruned_names = lst_new_ptr(msa->nseqs);
    for (i = 0; i < backgd_nmods; i++) {
        old_nleaves = (backgd_mods[i]->tree->nnodes + 1) / 2;
        tm_prune(backgd_mods[i], msa, pruned_names);
        if (lst_size(pruned_names) >= old_nleaves)
            die("ERROR: no match for leaves of tree in alignment (background model #%d)\n", i+1);
        else if (lst_size(pruned_names) > 0) {
            fprintf(stderr, "WARNING: pruned away leaves in background model (#%d) with no match in alignment (", i+1);
            for (j = 0; j < lst_size(pruned_names); j++)
                fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars,
                        j < lst_size(pruned_names) - 1 ? ", " : ").\n");
        }
        lst_free_strings(pruned_names);
    }
    for (i = 0; i < feat_nmods; i++) {
        old_nleaves = (feat_mods[i]->tree->nnodes + 1) / 2;
        tm_prune(feat_mods[i], msa, pruned_names);
        if (lst_size(pruned_names) >= old_nleaves)
            die("ERROR: no match for leaves of tree in alignment (features model #%d)\n", i+1);
        else if (lst_size(pruned_names) > 0) {
            fprintf(stderr, "WARNING: pruned away leaves in features model (#%d) with no match in alignment (", i+1);
            for (j = 0; j < lst_size(pruned_names); j++)
                fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars,
                        j < lst_size(pruned_names) - 1 ? ", " : ").\n");
        }
        lst_free_strings(pruned_names);
    }
    lst_free(pruned_names);

    /* first have to subtract offset from features, if necessary */
    if (msa->idx_offset != 0 && features != NULL) {
        for (i = 0; i < lst_size(features->features); i++) {
            GFF_Feature *f = lst_get_ptr(features->features, i);
            f->start -= msa->idx_offset;
            f->end -= msa->idx_offset;
        }
    }

    /* convert to coord frame of alignment */
    if (features != NULL && refidx != 0) {
        if (verbose) fprintf(stderr, "Mapping coordinates ...\n");
        msa_map_gff_coords(msa, features, refidx, 0, 0);
        if (lst_size(features->features) == 0)
            die("ERROR: no features within coordinate range of alignment.\n");
    }

    /* Make a reverse complemented copy of the alignment.  The two
       strands will be processed separately, to avoid problems with
       overlapping features, etc. */
    if (!base_by_base) {          /* skip in base by base case */
        if (verbose) fprintf(stderr, "Creating reverse complemented alignment ...\n");
        msa_compl = msa_create_copy(msa, 0);
        /* temporary workaround: make sure reverse complement not based on
           sufficient stats */
        if (msa_compl->seqs == NULL) ss_to_msa(msa_compl);
        if (msa_compl->ss != NULL) {
            ss_free(msa_compl->ss);
            msa_compl->ss = NULL;
        }
        msa_reverse_compl(msa_compl);
    }

    /* allocate memory for computing scores */
    backgd_emissions = smalloc(backgd_nmods * sizeof(void*));
    for (i = 0; i < backgd_nmods; i++)
        backgd_emissions[i] = smalloc(msa->length * sizeof(double));
    feat_emissions = smalloc(feat_nmods * sizeof(void*));
    for (i = 0; i < feat_nmods; i++)
        feat_emissions[i] = smalloc(msa->length * sizeof(double));
    max_nmods = max(backgd_nmods, feat_nmods);
    dummy_emissions = smalloc(max_nmods * sizeof(void*));
    mem = smalloc(max_nmods * sizeof(void*));
    /* memory for forward algorithm -- each block must be as large as
       the largest feature */
    if (features != NULL) {
        for (i = 0, memblocksize = -1; i < lst_size(features->features); i++) {
            GFF_Feature *f = lst_get_ptr(features->features, i);
            if (f->end - f->start + 1 > memblocksize)
                memblocksize = f->end - f->start + 1;
        }
    }
    else memblocksize = winsize;  /* -1 if base-by-base mode */

    if (memblocksize > 0)
        for (i = 0; i < max_nmods; i++)
            mem[i] = smalloc(memblocksize * sizeof(double));

    if (winsize != -1) {
        winscore_pos = smalloc(msa->length * sizeof(double));
        winscore_neg = smalloc(msa->length * sizeof(double));
        no_alignment = smalloc(msa->length * sizeof(int));

        for (i = 0; i < msa->length; i++) {
            winscore_pos[i] = winscore_neg[i] = NEGINFTY;
            if (refidx == 0)
                no_alignment[i] = FALSE;
            else
                no_alignment[i] = msa_missing_col(msa, refidx, i);
        }
    }

    /* the rest will be repeated for each strand */
    for (strand = 1; strand <= 2; strand++) {
        MSA *thismsa = strand == 1 ? msa : msa_compl;
        double *winscore = strand == 1 ? winscore_pos : winscore_neg;

        if (base_by_base && strand == 2) break; /* don't do second pass in
                                               base_by_base case */

        if (verbose) fprintf(stderr, "Processing %c strand ...\n",
                                 strand == 1 ? '+' : '-');

        /* set up dummy categories array, so that emissions are only
           computed where needed */
        thismsa->categories = smalloc(thismsa->length * sizeof(int));
        thismsa->ncats = 1;
        if (winsize != -1) {
            if (strand == 1)
                for (i = 0; i < thismsa->length; i++)
                    thismsa->categories[i] = no_alignment[i] ? 0 : 1;
            else
                for (i = 0; i < thismsa->length; i++)
                    thismsa->categories[i] = no_alignment[thismsa->length - i - 1] ? 0 : 1;
        }
        else if (features != NULL) {
            for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 0;
            for (i = 0; i < lst_size(features->features); i++) {
                GFF_Feature *f = lst_get_ptr(features->features, i);
                if (f->start <= 0 || f->end <= 0) {
                    fprintf(stderr, "WARNING: feature out of range ('");
                    gff_print_feat(stderr, f);
                    fprintf(stderr, "')\n");
                    continue;
                }

                if (strand == 1 && f->strand != '-')
                    for (j = f->start - 1; j < f->end; j++)
                        thismsa->categories[j] = 1;
                else if (strand == 2 && f->strand == '-')
                    for (j = thismsa->length - f->end;
                            j < thismsa->length - f->start + 1; j++)
                        thismsa->categories[j] = 1;
            }
        }
        else {                      /* base-by-base scores */
            for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 1;
        }
        if (thismsa->ss != NULL) ss_update_categories(thismsa);

        /* compute emissions */
        for (i = 0; i < backgd_nmods; i++) {
            if (verbose)
                fprintf(stderr, "Computing emissions for background model #%d ...\n", i+1);
            tl_compute_log_likelihood(backgd_mods[i], thismsa,
                                      backgd_emissions[i], NULL, 1, NULL);
        }
        for (i = 0; i < feat_nmods; i++) {
            if (verbose)
                fprintf(stderr, "Computing emissions for features model #%d ...\n", i+1);
            tl_compute_log_likelihood(feat_mods[i], thismsa,
                                      feat_emissions[i], NULL, 1, NULL);
        }

        /* now compute scores */
        if (winsize != -1) {        /* windows case */
            int winstart;
            if (verbose) fprintf(stderr, "Computing scores ...\n");

            for (winstart = 0; winstart <= thismsa->length - winsize; winstart++) {
                int centeridx = winstart + winsize/2;

                if (strand == 2) centeridx = thismsa->length - centeridx - 1;

                if (no_alignment[centeridx]) continue;

                for (j = 0; j < feat_nmods; j++)
                    dummy_emissions[j] = &(feat_emissions[j][winstart]);
                winscore[centeridx] = hmm_forward(feat_hmm, dummy_emissions,
                                                  winsize, mem);

                if (winscore[centeridx] <= NEGINFTY) {
                    winscore[centeridx] = NEGINFTY;
                    continue;
                }

                for (j = 0; j < backgd_nmods; j++)
                    dummy_emissions[j] = &(backgd_emissions[j][winstart]);
                winscore[centeridx] -= hmm_forward(backgd_hmm, dummy_emissions,
                                                   winsize, mem);

                if (winscore[centeridx] < NEGINFTY) winscore[centeridx] = NEGINFTY;
            }
        }
        else if (features != NULL) { /* features case */
            if (verbose) fprintf(stderr, "Computing scores ...\n");
            for (i = 0; i < lst_size(features->features); i++) {
                GFF_Feature *f = lst_get_ptr(features->features, i);
                int s, e;

                if ((strand == 1 && f->strand == '-') ||
                        (strand == 2 && f->strand != '-') ||
                        f->start <= 0 || f->end <= 0 || f->end - f->start < 0)
                    continue;

                /* effective coords */
                if (f->strand == '-') {
                    s = thismsa->length - f->end + 1;
                    e = thismsa->length - f->start + 1;
                }
                else {
                    s = f->start;
                    e = f->end;
                }

                f->score_is_null = 0;

                for (j = 0; j < feat_nmods; j++)
                    dummy_emissions[j] = &(feat_emissions[j][s-1]);
                f->score = hmm_forward(feat_hmm, dummy_emissions, e - s + 1, mem);

                if (f->score <= NEGINFTY) {
                    f->score = NEGINFTY;
                    continue;
                }

                for (j = 0; j < backgd_nmods; j++)
                    dummy_emissions[j] = &(backgd_emissions[j][s-1]);
                f->score -= hmm_forward(backgd_hmm, dummy_emissions, e - s + 1, mem);

                if (f->score < NEGINFTY) f->score = NEGINFTY;
            }
        }
    }

    if (verbose) fprintf(stderr, "Generating output ...\n");

    if (winsize != -1 && windowWig == FALSE) { /* standard windows output */
        for (i = 0, j = 0; i < msa->length; i++) {
            if (no_alignment[i] == FALSE)
                printf("%d\t%.3f\t%.3f\n", j + msa->idx_offset + 1, winscore_pos[i],
                       winscore_neg[i]);
            if (ss_get_char_pos(msa, i, 0, 0) != GAP_CHAR) j++;
        }
    }
    else if (windowWig == TRUE) { /* windows with wig output */
        int last = NEGINFTY;
        for (i = 0, j = 0; i < msa->length; i++) {
            if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) {
                if (no_alignment[i] == FALSE && winscore_pos[i] > NEGINFTY) {
                    if (j > last + 1)
                        printf("fixedStep chrom=%s start=%d step=1\n",
                               refidx > 0 ? msa->names[refidx-1] : "alignment",
                               j + msa->idx_offset + 1);
                    printf("%.3f\n", winscore_pos[i]);
                    last = j;
                }
                j++;
            }
        }
    }
    else if (features != NULL) {  /* features output */
        /* return to coord frame of reference seq (also, replace offset) */
        if (refidx != 0)
            msa_map_gff_coords(msa, features, 0, refidx, msa->idx_offset);
        else if (msa->idx_offset != 0) {
            for (i = 0; i < lst_size(features->features); i++) {
                GFF_Feature *f = lst_get_ptr(features->features, i);
                f->start += msa->idx_offset;
                f->end += msa->idx_offset;
            }
        }

        if (bed_output)
            gff_print_bed(stdout, features, FALSE);
        else
            gff_print_set(stdout, features);
    }
    else {           /* base-by-base scores */
        /* in this case, we can just output the difference between the emissions */
        printf("fixedStep chrom=%s start=%d step=1\n",
               refidx > 0 ? msa->names[refidx-1] : "alignment",
               msa->idx_offset + 1);
        for (i = 0, j = 0; i < msa->length; i++) {
            if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) {
                printf("%.3f\n", feat_emissions[0][i] - backgd_emissions[0][i]);
                j++;
            }
        }
    }

    if (verbose) fprintf(stderr, "\nDone.\n");

    return 0;
}
Esempio n. 5
0
File: desc.c Progetto: lollek/imoria
void magic_init(unsigned long random_seed)
{
  /*	{ Initialize all potions, wands, staves, scrolls, etc...	}*/

  integer i1,tmpv;
  vtype   tmps;

  set_seed(random_seed);
  randes();

  for (i1 = 1; i1 <= MAX_OBJECTS; i1++) {

    /*
     * The arrays of the object materals all start at 0.
     * Object subvals start at 1.  When doing the lookup
     * subtract one from subval!
     */
    tmpv = (0xFF & object_list[i1].subval);

    switch (object_list[i1].tval) {
      
    case potion1: case potion2: 
      if (tmpv <= MAX_COLORS) {
	insert_str(object_list[i1].name,"%C",colors[tmpv-1]);
      }
      break;
      
    case scroll1: case scroll2:
      rantitle(tmps);
      insert_str(object_list[i1].name,"%T",tmps);
      break;
      
    case ring: 
      if (tmpv <= MAX_ROCKS) {
	insert_str(object_list[i1].name,"%R",rocks[tmpv-1]);
      }
      break;
      
    case valuable_gems:
      if (tmpv <= MAX_ROCKS) {
	insert_str(object_list[i1].name,"%R",rocks[tmpv-1]);
      }
      break;
      
    case valuable_gems_wear:
      if (tmpv <= MAX_ROCKS) {
	insert_str(object_list[i1].name,"%R",rocks[tmpv-1]);
      }
      break;
      
    case amulet:
      if (tmpv <= MAX_AMULETS) {
	insert_str(object_list[i1].name,"%A",amulets[tmpv-1]);
      }
      break;
      
    case wand:
      if (tmpv <= MAX_METALS) {
	insert_str(object_list[i1].name,"%M",metals[tmpv-1]);
      }
      break;
      
    case chime:
      if (tmpv <= MAX_METALS) {
	insert_str(object_list[i1].name,"%M",metals[tmpv-1]);
      }
      break;
      
    case horn:
      if (tmpv <= MAX_HORNS) {
	insert_str(object_list[i1].name,"%H",horns[tmpv-1]);
      }
      break;
      
    case staff:
      if (tmpv <= MAX_WOODS) {
	insert_str(object_list[i1].name,"%W",woods[tmpv-1]);
      }
      break;
      
    case Food:
      if (tmpv <= MAX_MUSH) {
	insert_str(object_list[i1].name,"%M",mushrooms[tmpv-1]);
      }
      break;
      
    case rod :
      /* what happened to the rods? */
      /*
      if (tmpv <= MAX_RODS) {
	insert_str(object_list[i1].name,"%D",rods[tmpv-1]);
      }
      */
      break;
      
    case bag_or_sack:
      if (tmpv <= MAX_CLOTHS) {
	insert_str(object_list[i1].name,"%N",cloths[tmpv-1]);
      }
      break;
      
    case misc_usable:
      if (tmpv <= MAX_ROCKS) {
	insert_str(object_list[i1].name,"%R",rocks[tmpv-1]);
      }
      if (tmpv <= MAX_WOODS) {
	insert_str(object_list[i1].name,"%W",woods[tmpv-1]);
      }
      if (tmpv <= MAX_METALS) {
	insert_str(object_list[i1].name,"%M",metals[tmpv-1]);
      }
      if (tmpv <= MAX_AMULETS) { 
	insert_str(object_list[i1].name,"%A",amulets[tmpv-1]);
      }
      break;
    default:
      break;
    } /* end switch */
  } /* end for */

#if DO_DEBUG && 0
  for (i1 = 1; i1 <= MAX_OBJECTS; i1++) {
    fprintf(debug_file,": object_list[%ld] = %s\n", 
	    i1, object_list[i1].name);
  }
  fflush(debug_file);
#endif
};
int main(int argc, char *argv[]) {
  FILE *STATSF;
  char c;
  int opt_idx, i, max_nrows;
  String *line = str_new(STR_MED_LEN), *args = str_new(STR_MED_LEN);
  List *fields = lst_new_ptr(5), *vectors = lst_new_ptr(1000), 
    *counts = lst_new_int(1000);
  int dim = -1;
  double error = -1;
  PbsCode *code;
  char comment[1000];
  time_t t;
  int have_data = TRUE;

  /* argument variables and defaults */
  int nrows = -1, nbytes = 1;
  training_mode mode = FULL;
  FILE *logf = NULL;

  struct option long_opts[] = {
    {"nrows", 1, 0, 'n'},
    {"nbytes", 1, 0, 'b'},
    {"no-greedy", 0, 0, 'G'},
    {"no-train", 1, 0, 'x'},
    {"log", 1, 0, 'l'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  set_seed(-1);

  /* first capture arg list for comment in output */
  for (i = 1; i < argc; i++) {
    str_append_charstr(args, argv[i]);
    if (i < argc - 1) str_append_char(args, ' ');
  }

  while ((c = (char)getopt_long(argc, argv, "n:b:l:Gxh", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'n':
      nrows = get_arg_int_bounds(optarg, 1, INFTY);
      break;
    case 'b':
      nbytes = get_arg_int_bounds(optarg, 1, MAX_NBYTES);
      break;
    case 'G':
      mode = NO_GREEDY;
      break;
    case 'x':
      mode = NO_TRAIN;
      dim = get_arg_int_bounds(optarg, 1, INFTY);
      break;
    case 'l':
      logf = phast_fopen(optarg, "w+");
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("Bad argument.  Try 'pbsTrain -h'.\n");
    }
  }

  if (mode == NO_TRAIN && optind == argc) 
    have_data = FALSE;		/* data optional */

  if (have_data) {
    if (optind != argc - 1) 
      die("ERROR: Bad arguments.  Try 'pbsTrain -h'.\n");

    STATSF = phast_fopen(argv[optind], "r");

    /* read stats */
    while (str_readline(line, STATSF) != EOF) {
      int count;
      double prob, norm_const;
      Vector *v;

      str_trim(line);
      if (line->length == 0 || line->chars[0] == '#') continue;

      str_split(line, NULL, fields);

      if (str_as_int(lst_get_ptr(fields, 0), &count) != 0)
	die("ERROR: Bad count in stats file ('%s')\n", lst_get_ptr(fields, 0));
      lst_push_int(counts, count);

      if (dim == -1) dim = lst_size(fields) - 1;
      else if (dim != lst_size(fields) - 1)
	die("ERROR: Each probability vector must have the same dimension\n");

      v = vec_new(dim);
      for (i = 0; i < dim; i++) {
	if (str_as_dbl(lst_get_ptr(fields, i+1), &prob) != 0 || 
	    prob < 0 || prob > 1)
	  die("ERROR: Bad probability in stats file ('%s')\n", 
	      lst_get_ptr(fields, i+1));

	vec_set(v, i, prob);
      }
    
      /* normalize to avoid problems from rounding errors */
      norm_const = normalize_probs(v->data, dim);
      if (fabs(1-norm_const) > 1e-2)
	die("ERROR: Probabilities in stats file don't sum to one.\nOffending line: '%s'\n",
	    line->chars);

      lst_push_ptr(vectors, v);
      lst_free_strings(fields);
    }
  }

  max_nrows = sxg_max_nrows(dim, ~(~0 << (8*nbytes)));
  if (nrows == -1) nrows = max_nrows;
  else if (nrows > max_nrows)
    die("ERROR: nrows exceeds maximum of %d for nbytes = %d and dimension = %d\n", 
	max_nrows, nbytes, dim);

  code = pbs_new(dim, nrows, nbytes);
  
  if (mode != NO_TRAIN)
    error = pbs_estimate_from_data(code, vectors, counts, logf, mode);

  else if (have_data) {		/* not training but need error */
    int tot_count = 0;
    double this_error;
    error = 0;
    for (i = 0; i < lst_size(vectors); i++) {
      pbs_get_index(code, lst_get_ptr(vectors, i), &this_error);
      error += this_error * lst_get_int(counts, i);
      tot_count += lst_get_int(counts, i);
    }    
    error /= tot_count;
  }

  /* generate comment */
  t = time(NULL);
  sprintf(comment, "# Code generated by pbsTrain, with argument(s) \"%s\"\n\
#  %s\n\
# Average training error = %f bits\n", 
	  args->chars, ctime(&t), error);

  pbs_write(code, stdout, comment);

  return 0;
}
Esempio n. 7
0
int main(int argc, char *argv[]) {
  /* variables for options, with defaults */
  TreeNode *tree = NULL, *merge_tree = NULL, *extrapolate_tree = NULL;
  Hashtable *rename_hash = NULL;
  double scale_factor = 1;
  List *prune_names = NULL, *label = NULL, *labelType = NULL;
  int prune_all_but = FALSE, tree_only = FALSE, dissect = FALSE,
    name_ancestors = FALSE, with_branch = FALSE, print_branchlen=FALSE,
    inNewick=FALSE, no_branchlen = FALSE, print_distance_to_root = FALSE;
  TreeModel *mod = NULL, *merge_mod = NULL;
  char *reroot_name = NULL, *subtree_name =NULL, *get_subtree_name = NULL,
    *node_distance_name = NULL;
  
  /* other variables */
  String *suffix,  *optstr;
  char c;
  int i, opt_idx;
  TreeNode *n;

  struct option long_opts[] = {
    {"scale", 1, 0, 's'},
    {"extrapolate", 1, 0, 'e'},
    {"prune", 1, 0, 'p'},
    {"prune-all-but", 1, 0, 'P'},
    {"get-subtree", 1, 0, 'g'},
    {"merge", 1, 0, 'm'},
    {"rename", 1, 0, 'r'},
    {"tree-only", 0, 0, 't'},
    {"no-branchlen", 0, 0, 'N'},
    {"dissect", 0, 0, 'd'},
    {"name-ancestors", 0, 0, 'a'},
    {"reroot", 1, 0, 'R'},
    {"with-branch", 1, 0, 'B'},
    {"subtree", 1, 0, 'S'},
    {"branchlen", 0, 0, 'b'},
    {"newick", 0, 0, 'n'},
    {"label-subtree", 1, 0, 'L'},
    {"label-branches", 1, 0, 'l'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = getopt_long(argc, argv, "s:p:P:g:m:r:R:B:S:D:l:L:adtNbnh", 
                          long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 's':
      scale_factor = get_arg_dbl_bounds(optarg, 0, INFTY);
      break;
    case 'e':
      if (!strcmp(optarg, "default")) {
        optarg = smalloc(1000 * sizeof(char));
        #if defined(__MINGW32__)
          sprintf(optarg, "%s\\data\\exoniphy\\mammals\\cftr25_hybrid.nh",
		  PHAST_HOME);
        #else
          sprintf(optarg, "%s/data/exoniphy/mammals/cftr25_hybrid.nh", 
                  PHAST_HOME);
        #endif
      }
      extrapolate_tree = tr_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'p':
      prune_names = get_arg_list(optarg);
      break;
    case 'P':
      prune_names = get_arg_list(optarg);
      prune_all_but = TRUE;
      break;
    case 'g':
      get_subtree_name = optarg;
      break;
    case 'm':
      suffix = str_new_charstr(optarg);
      str_suffix(suffix, '.');
      if (str_equals_charstr(suffix, "nh"))
        merge_tree = tr_new_from_file(phast_fopen(optarg, "r"));
      else {
        merge_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1);
        merge_tree = merge_mod->tree;
      }
      break;
    case 'r':
      rename_hash = make_name_hash(optarg);
      break;
    case 't':
      tree_only = TRUE;
      break;
    case 'N':
      no_branchlen = TRUE;
      tree_only = TRUE;
      break;
    case 'd':
      dissect = TRUE;
      break;
    case 'b':
      print_branchlen = TRUE;
      break;
    case 'D':
      print_distance_to_root = TRUE;
      node_distance_name = optarg;
      break;
    case 'R':
      reroot_name = optarg;
      break;
    case 'B':
      with_branch = TRUE;
      break;
    case 'a':
      name_ancestors = TRUE;
      break;
    case 'S':
      subtree_name = optarg;
      break;
    case 'n':
      inNewick=TRUE;
      break;
    case 'L':  //do the same for --label--subtree and --label-branches
    case 'l':
      if (label == NULL) {
	label = lst_new_ptr(1);
	labelType = lst_new_int(1);
      }
      optstr = str_new_charstr(optarg);
      lst_push_ptr(label, optstr);
      lst_push_int(labelType, (int)c);
      break;
    case 'h':
      usage(argv[0]);
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  if (optind != argc - 1) 
    die("Input filename required.  Try '%s -h'.\n", argv[0]);

  if (merge_tree != NULL && extrapolate_tree != NULL)
    die("ERROR: Can't use --merge and --extrapolate together");

  set_seed(-1);
    
  suffix = str_new_charstr(argv[optind]);
  str_suffix(suffix, '.');
  if (inNewick || str_equals_charstr(suffix, "nh")) {
    tree = tr_new_from_file(phast_fopen(argv[optind], "r"));
    tree_only = TRUE;           /* can't output tree model in this case */
  }
  else {
    mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1);
    tree = mod->tree;
  }

  if (prune_names != NULL) {
    tr_prune(&tree, prune_names, prune_all_but, NULL);
    if (mod != NULL) mod->tree = tree; /* root may have changed */
  }

  if (get_subtree_name != NULL) {
    n = tr_get_node(tree, get_subtree_name);
    if (n == NULL) {
      tr_name_ancestors(tree);
      n = tr_get_node(tree, get_subtree_name);
      if (n == NULL) {
	die("ERROR: no node named '%s'.\n", subtree_name);
      }
    }
    tr_prune_supertree(&tree, n);
    if (mod != NULL) mod->tree = tree;
  }

  if (merge_tree != NULL) {
    tree = tr_hybrid(tree, merge_tree);
    if (mod != NULL) mod->tree = tree;
  }

  else if (extrapolate_tree != NULL) {
    tr_scale_by_subtree(extrapolate_tree, tree);
    tree = extrapolate_tree;
    if (mod != NULL) mod->tree = tree;
  }

  if (scale_factor != 1) {
    if (subtree_name == NULL)
      tr_scale(tree, scale_factor);
    else {
      n = tr_get_node(tree, subtree_name);
      if (n == NULL) die("ERROR: no node named '%s'.\n", subtree_name);
      tr_scale_subtree(tree, n, scale_factor, with_branch);
    }
  }

  if (name_ancestors)
    tr_name_ancestors(tree);

  if (rename_hash != NULL) {
    char *newname;
    for (i = 0; i < tree->nnodes; i++) {
      n = lst_get_ptr(tree->nodes, i);
      if (n->name != NULL && n->name[0] != '\0' && 
          (newname = hsh_get(rename_hash, n->name)) != (char*)-1) {
        strcpy(n->name, newname);
      }
    }
  }

  if (reroot_name != NULL) {
    n = tr_get_node(tree, reroot_name);
    if (n == NULL) die("ERROR: no node named '%s'.\n", reroot_name);
    tr_reroot(tree, n, with_branch);
    if (mod != NULL) mod->tree = with_branch ? n->parent : n;
    tree = with_branch ? n->parent : n;
  }

  if (label != NULL) {
    for (i=0; i < lst_size(label); i++) {
      String *currstr = (String*)lst_get_ptr(label, i), *arg1, *labelVal;
      List *tmplst = lst_new_ptr(10);
      String *nodename;
      int j;
      str_split(currstr, ":", tmplst);
      if (lst_size(tmplst) != 2) 
	die("ERROR: bad argument to --label-branches or --label-subtree.\n");
      arg1 = lst_get_ptr(tmplst, 0);
      labelVal = lst_get_ptr(tmplst, 1);
      lst_clear(tmplst);
      if (lst_get_int(labelType, i) == (int)'l') {
	str_split(arg1, ",", tmplst);
	for (j=0; j < lst_size(tmplst); j++) {
	  nodename = (String*)lst_get_ptr(tmplst, j);
	  tr_label_node(tree, nodename->chars, labelVal->chars);
	}
	lst_free_strings(tmplst);
      } else if (lst_get_int(labelType, i) == (int)'L') {
	int include_leading_branch = FALSE;
	TreeNode *node;
	nodename = arg1;
	node = tr_get_node(tree, nodename->chars);
	if (node == NULL && nodename->chars[nodename->length-1] == '+') {
	  nodename->chars[--nodename->length] = '\0';
	  node = tr_get_node(tree, nodename->chars);
	  include_leading_branch = TRUE;
	}
	tr_label_subtree(tree, nodename->chars, include_leading_branch, 
			 labelVal->chars);
      } else die("ERROR got label_type %c\n", lst_get_int(labelType, (char)i));
      str_free(arg1);
      str_free(labelVal);
      lst_free(tmplst);
      str_free(currstr);
    }
    lst_free(label);
    lst_free(labelType);
  }

  if (dissect) 
    tr_print_nodes(stdout, tree);
  if (print_branchlen) 
    printf("TOTAL_TREE_LEN: %f\n", tr_total_len(tree));
  if (print_distance_to_root) {
    TreeNode *node = tr_get_node(tree, node_distance_name);
    if (node == NULL) 
      die("ERROR: no node named '%s'.\n", node_distance_name);
    printf("length(root-%s): %f\n", node_distance_name, 
	   tr_distance_to_root(node));
  }

  if (dissect==0 && print_branchlen==0 && print_distance_to_root==0) {
    if (tree_only)
      tr_print(stdout, tree, no_branchlen==FALSE);
    else
      tm_print(stdout, mod);
  }
  return 0;
}
int main(int argc, char* argv[]) {
  FILE* F;
  TreeModel *model;
  int i, j, k, alph_size, nstates, do_eqfreqs = 0, exch_mode = 0, 
    list_mode = 0, latex_mode = 0, suppress_diag = 0, ti_tv = 0, 
    scientific_mode = 0,
    induced_aa = 0, do_stop_codons = 0, do_zeroes = 0, symmetric = 0, 
    context_ti_tv = 0, all_branches = 0;
  int startcol, endcol, ncols, branch_no = 0, matrix_idx = 0;
/*   int aa_inv[256]; */
  double t = -1, total_ti = 0, total_tv = 0, rho_s = 0, cpg_ti = 0, 
    cpg_tv = 0, non_cpg_ti = 0, non_cpg_tv = 0, cpg_eqfreq = 0;
  char *rate_format_string = "%8.6f";
  MarkovMatrix *M;
  char c;
  char tuple[5], tuple2[5]; /* , aa_alph[50]; */
  char *subst_mat_fname = NULL, *subst_score_fname = NULL, 
    *subst_mat_fname_paml = NULL, *order1_mod_fname = NULL;
  Matrix *subst_mat = NULL;
  List *matrix_list = lst_new_ptr(20), *traversal = NULL;

  while ((c = (char)getopt(argc, argv, "t:fedlLiM:N:A:B:aszSECh")) != -1) {
   switch(c) {
    case 't':
      if (optarg[0] == 'A') all_branches = 1;
      else t = get_arg_dbl_bounds(optarg, 0, INFTY);
      break;
    case 'f':
      do_eqfreqs = 1;
      break;
    case 'e':
      exch_mode = 1;
      break;
    case 'd':
      suppress_diag = 1;
      break;
    case 'l':
      list_mode = 1;
      break;
    case 'L':
      latex_mode = 1;
      break;
    case 'i':
      ti_tv = 1;
      break;
    case 'M':
      subst_mat_fname = optarg;
      induced_aa = 1;
      break;
    case 'N':
      subst_mat_fname_paml = optarg;
      induced_aa = 1;
      break;
    case 'A':
      subst_score_fname = optarg;
      break;
    case 'B':
      order1_mod_fname = optarg;
      break;
    case 'a':
      induced_aa = 1;
      do_zeroes = 1;
      break;
    case 's':
      do_stop_codons = 1;
      break;
    case 'z':
      do_zeroes = 1;
      break;
    case 'S':
      symmetric = 1;
      break;
    case 'E':
      scientific_mode = 1;
      rate_format_string = "%13.6e";
      break;
    case 'C':
      context_ti_tv = 1;
      break;
    case 'h':
      print_usage();
      exit(0);
    case '?':
      die("Unrecognized option.  Try \"display_rate_matrix -h\" for help.\n");
    }
  }

  set_seed(-1);

  if ((t >= 0 && exch_mode) || (latex_mode && list_mode) || 
      ((ti_tv || subst_mat_fname != NULL || subst_score_fname != NULL || 
        subst_mat_fname_paml != NULL || scientific_mode) && !list_mode) || 
      (subst_mat_fname != NULL && subst_score_fname != NULL) || 
      (subst_score_fname != NULL && subst_mat_fname_paml != NULL) || 
      (subst_mat_fname != NULL && subst_mat_fname_paml != NULL) || 
      optind != argc - 1) {
    die("ERROR: missing required arguments or illegal combination of arguments.\nTry \"display_rate_matrix -h\" for help.\n");
  }

  F = phast_fopen(argv[optind], "r");
  model = tm_new_from_file(F, 1);

  if (context_ti_tv) {
    /* this option requires completely different handling from the others */
    if (model->order != 2) { 
      die("ERROR: -C requires a model of order 3.\n");
    }
    do_context_dependent_ti_tv(model);
    exit(0);
  }

  if (induced_aa) {
    TreeModel *aa_model = tm_induced_aa(model);
    char *codon_to_aa = get_codon_mapping(model->rate_matrix->states);

    /* before freeing model, grab the expected rate of synonymous
       subst, rho_s */
    for (i = 0; i < model->rate_matrix->size; i++)
      for (j = 0; j < model->rate_matrix->size; j++)
        if (i != j && codon_to_aa[i] == codon_to_aa[j])
          rho_s += mm_get(model->rate_matrix, i, j) * 
            vec_get(model->backgd_freqs, i);

    sfree(codon_to_aa);

    tm_free(model);
    model = aa_model;
  }

  if (all_branches) {
    traversal = tr_inorder(model->tree);
    for (matrix_idx = 0; matrix_idx < lst_size(traversal); matrix_idx++) {
      TreeNode *n = lst_get_ptr(traversal, matrix_idx);
      if (n->parent == NULL) { lst_push_ptr(matrix_list, NULL); continue; }
      M = mm_new(model->rate_matrix->size, model->rate_matrix->states, DISCRETE);
      mm_exp(M, model->rate_matrix, n->dparent);
      lst_push_ptr(matrix_list, M);      
    }
  }
  else if (t >= 0) {
    M = mm_new(model->rate_matrix->size, model->rate_matrix->states, DISCRETE);
    mm_exp(M, model->rate_matrix, t);
    lst_push_ptr(matrix_list, M);
  }
  else 
    lst_push_ptr(matrix_list, model->rate_matrix);

  alph_size = (int)strlen(model->rate_matrix->states);
  nstates = model->rate_matrix->size;

  if (subst_mat_fname != NULL) {
    if ((F = fopen(subst_mat_fname, "r")) == NULL) {
      die("ERROR: Can't open %s.\n", subst_mat_fname);
    }    
    subst_mat = read_subst_mat(F, AA_ALPHABET); 
  }
  else if (subst_mat_fname_paml != NULL) {
    if ((F = fopen(subst_mat_fname_paml, "r")) == NULL) {
      die("ERROR: Can't open %s.\n", subst_mat_fname_paml);
    }    
    subst_mat = read_paml_matrix(F, AA_ALPHABET); 
  }
  else if (subst_score_fname != NULL) {
    if ((F = fopen(subst_score_fname, "r")) == NULL) {
      die("ERROR: Can't open %s.\n", subst_score_fname);
    }    
    subst_mat = read_subst_scores(model, F);
  }
  else if (order1_mod_fname != NULL) {
    if ((F = fopen(order1_mod_fname, "r")) == NULL) {
      die("ERROR: Can't open %s.\n", order1_mod_fname);
    }    
    subst_mat = unproject_rates(model, tm_new_from_file(F, 1));
  }

  /* loop through matrices to print */
  for (matrix_idx = 0; matrix_idx < lst_size(matrix_list); matrix_idx++) {
    M = lst_get_ptr(matrix_list, matrix_idx);

    if (all_branches) {
      if (M == NULL) continue;  /* root */
      printf("BRANCH %d (t = %.6f)\n", ++branch_no,
             ((TreeNode*)lst_get_ptr(traversal, matrix_idx))->dparent);
    }

  /* print no more than 16 columns at a time (except with -a) */
  ncols = (induced_aa ? nstates : 16);
  for (startcol = 0; startcol < nstates; startcol += ncols) {
    endcol = min(nstates, startcol+ncols);

    /* table header */
    if (! list_mode) {
      if (latex_mode) {
        printf("\\begin{tabular}{|c|");
        for (i = startcol; i < endcol; i++) printf("r");
        printf("|}\n\\hline\n");
      }
      printf("%-5s ", "");
      if (latex_mode) printf("& ");
      for (i = startcol; i < endcol; i++) {
        get_state_tuple(model, tuple, i);
        if (latex_mode) {
          printf("{\\bf %s}", tuple);
          if (i < endcol-1) printf("& ");
        }
        else printf("%8s ", tuple);
    }
      if (latex_mode) printf("\\\\\n\\hline\n");
      else printf("\n");
    }

    /* table or list contents */
    for (i = 0; i < nstates; i++) {
      if (induced_aa && AA_ALPHABET[i] == '$' && !do_stop_codons) continue;
      get_state_tuple(model, tuple, i);

      /* get total eq freq of tuples containing CpG dinucs */
      for (k = 0; k < model->order; k++) {
        if (tuple[k] == 'C' && tuple[k+1] == 'G') {
          cpg_eqfreq += vec_get(model->backgd_freqs, i);
/*           printf("***CPG***"); */
          break;
        }
      }

      if (latex_mode) printf("{\\bf %s}& ", tuple);
      else if (!list_mode) printf("%-5s ", tuple);
      for (j = startcol; j < endcol; j++) {
        if (induced_aa && AA_ALPHABET[j] == '$' && !do_stop_codons) continue;
        if (latex_mode) printf("$");
        if (list_mode) {
          if (symmetric && j <= i) continue;
          else if ((t < 0 && ! all_branches) 
		   && (i == j || (!do_zeroes && mm_get(M, i, j) == 0))) 
            continue;
          get_state_tuple(model, tuple2, j);
          printf("%-5s %-5s ", tuple, tuple2);
        }
        if (i == j && suppress_diag && !list_mode) printf("%-7s", "-");
        else { 
	  /* get rate or probability */
	  double val = exch_mode == 0 ? mm_get(M, i, j) : 
	    safediv(mm_get(M, i, j), vec_get(model->backgd_freqs,j));
	  /* print value in format %8.6f or %13.6e */
	  printf(rate_format_string, val); 
	  printf(" ");
	}
        if (latex_mode) {
          printf("$");
          if (j < endcol-1) printf("& ");
        }
        else if (list_mode) {
          int ti, is_cpg;
          if (ti_tv) {
            ti = -1;
            is_cpg = 0;
            for (k = 0; k <= model->order; k++) {
              int dig_i = (i % int_pow(alph_size, k+1)) / int_pow(alph_size, k);
              int dig_j = (j % int_pow(alph_size, k+1)) / int_pow(alph_size, k);
              char next_char = '\0', prev_char = '\0';
              if (dig_i != dig_j) {
                ti = is_transition(M->states[dig_i], M->states[dig_j]);
                if (k != model->order)
                  prev_char = M->states[(i % int_pow(alph_size, k+2)) / 
                                        int_pow(alph_size, k+1)];
                if (k != 0)
                  next_char = M->states[(i % int_pow(alph_size, k)) / 
                                        int_pow(alph_size, k-1)];
                if ((M->states[dig_i] == 'C' && next_char == 'G') || 
                    (M->states[dig_i] == 'G' && prev_char == 'C')) 
                  is_cpg = 1;
              }
            }
	    if (ti == -1)
	      die("ERROR ti=-1\n");
            printf("%5s ", ti ? "ti" : "tv");
/*             printf("%5s ", is_cpg ? "CPG" : "-"); */
            if (ti) {
              total_ti += mm_get(M, i, j) * 
                vec_get(model->backgd_freqs, i);
              if (is_cpg) 
                cpg_ti += mm_get(M, i, j) * 
                  vec_get(model->backgd_freqs, i);
              else non_cpg_ti += mm_get(M, i, j) * 
                     vec_get(model->backgd_freqs, i);
            }
            else {
              total_tv += mm_get(M, i, j) * 
                vec_get(model->backgd_freqs, i);
              if (is_cpg)
                cpg_tv += mm_get(M, i, j) * 
                  vec_get(model->backgd_freqs, i);
              else non_cpg_tv += mm_get(M, i, j) * 
                     vec_get(model->backgd_freqs, i);
            }
          }
          if (subst_mat != NULL) {
            if (mat_get(subst_mat, i, j) == NEGINFTY) 
              printf("%8s", "-"); 
            else printf("%8.4f", mat_get(subst_mat, i, j)); 
          }
          printf("\n");
        }
      }
      if (latex_mode) printf("\\\\\n");
      else if (!list_mode) printf("\n");
    }
    
    /* equilibrium freqs (table case only) */
    if (do_eqfreqs && ! list_mode) {
      if (latex_mode) 
        printf("\\hline\n$\\boldsymbol{\\mathbf{\\pi}}$&");
      else 
        printf("%-5s ", "pi");
      for (i = startcol; i < endcol; i++) {
        if (latex_mode) 
          printf("$%8.4f$ ", vec_get(model->backgd_freqs, i));      
        else 
          printf("%8.4f ", vec_get(model->backgd_freqs, i));      
        if (latex_mode && i < endcol-1) printf("& ");
      }
      if (latex_mode) printf("\\\\\n");
      else printf("\n");
    }

    if (latex_mode) printf("\\hline\n\\end{tabular}\n\n");
  }

  /* equilibrium freqs (list case only) */
  if (do_eqfreqs &&  list_mode) {
    for (i = 0; i < nstates; i++) {
      get_state_tuple(model, tuple, i);
      printf("%-5s %-5s ", "-", tuple); //!!
      printf(rate_format_string, vec_get(model->backgd_freqs, i)); 
      printf("\n");
    }
  }
  
  if (ti_tv && list_mode) {
    printf("\n#Total ti/tv = %.4f\n", total_ti/total_tv);
    printf("#CpG ti ratio = %.4f, CpG tv ratio = %.4f\n", 
           cpg_ti/non_cpg_ti /* * (1 - cpg_eqfreq) */ / cpg_eqfreq, 
           cpg_tv/non_cpg_tv /* * (1 - cpg_eqfreq) */ / cpg_eqfreq);
  }
  else if (induced_aa) 
    printf("\n#Total rho_s/rho_v = %.4f\n", rho_s/(3-rho_s));

  if (all_branches == 1) printf("\n\n");
  }

  tm_free(model);
  lst_free(matrix_list);

  return 0;
}
 TabulationHashing( uint64_t s ) {
   setup_table();
   set_seed(s);
 }
 TabulationHashing() {
   setup_table();
   set_seed(0);
 }
int main(int argc, char *argv[]) {
  FILE *prob_f;
  char c;
  int opt_idx, i, nlines = 0, ngaps = 0;
  unsigned idx;
  PbsCode *code;
  List *fields = lst_new_ptr(10);
  double error, tot_error = 0, prob;
  Vector *v;
  String *line = str_new(STR_MED_LEN);

  struct option long_opts[] = {
    {"discard-gaps", 0, 0, 'G'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  /* variables for options, with defaults */
  int discard_gaps = FALSE;

  set_seed(-1);

  while ((c = (char)getopt_long(argc, argv, "Gh", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'G':
      discard_gaps = TRUE;
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("Bad argument.  Try 'pbsEncode -h'.\n");
    }
  }

  if (optind != argc - 2) 
    die("Two arguments required.  Try 'pbsEncode -h'.\n");
    
  prob_f = phast_fopen(argv[optind], "r");
  code = pbs_new_from_file(phast_fopen(argv[optind+1], "r"));
  v = vec_new(code->sg->d);
  
  while (str_readline(line, prob_f) != EOF) {

    if (line->length == 0 || line->chars[0] == '#') continue;

    str_split(line, NULL, fields);

    if (lst_size(fields) == 1 && str_equals_charstr(lst_get_ptr(fields, 0), "-")) {
      ngaps++;
      if (!discard_gaps)
	pbs_write_binary(code, code->gap_code, stdout);
    }
    else {			/* ordinary prob vector */
      if (lst_size(fields) != code->sg->d)
	die("ERROR: number of columns must equal dimension of code (%d).\n",
	    code->sg->d);
    
      for (i = 0; i < code->sg->d; i++) {
	if (str_as_dbl(lst_get_ptr(fields, i), &prob) != 0 || prob < 0 || prob > 1)
	  die("ERROR: bad value ('%s')\n", lst_get_ptr(fields, i));

	vec_set(v, i, prob);
      }

      idx = pbs_get_index(code, v, &error);
      tot_error += error;
      pbs_write_binary(code, idx, stdout);
      nlines++;
    }

    lst_free_strings(fields);
  }

  fprintf(stderr, "Dimensions: %d\n\
Rows per dimension: %d\n\
Code size: %d\n\
Bytes per vector: %d\n\
Vectors processed: %d\n\
Gaps: %d%s\n\
Average approximation error: %f bits\n", 
	  code->sg->d, code->sg->nrows, code->code_size, code->nbytes,  
	  nlines, ngaps, discard_gaps ? " (discarded)" : "", tot_error/nlines);

  return 0;
}
/* ******************************************************** *
 *  Create the sampling structure and initialize all values *
 * ******************************************************** */
void create_sampling(int n, int *L, int B, int generator_flag, int initial_count)
{  
    int *ordern, *permun, *myL;
    int i;

    // Set the local size of permutations
    local_perm_size = B;
    local_perm_count = 1;

    // Allocate and initialize local L to original L
    local_L = (int *)R_alloc(n, sizeof(int));

    // To check random or complete
    if( generator_flag == 1 ) {

        // Initialize the permutation array (struct)
        // Sets the B in the struct equal to 0 to be
        // able to identify the generator later on
        init_permu_array(&local_pa, L, n, 0);

        // * ================================================================= *
        // *                     Forwarding logic                              *
        // *                     ----------------                              *
        // *  For parallel executions all processes, appart from the one with  *
        // *  rank == 0, should forward their random generators in order to    *
        // *  be able to reproduce the exact same random permutations as the   *
        // *  serial version of the code (or a version with only one thread)   *
        // *                                                                   *
        // *  The code below uses the initial_count variable to "burn" the     *
        // *  cycles from the random generator                                 *
        // * ================================================================= *

        // All processes apart from process with rank == 0 (translates to
        // initial_count == 0) will perform this forward
        if ( initial_count != 0 ) {

            // Initialize label
            init_label(local_pa.n, local_pa.k, local_pa.nk, local_L);

            // Burn the cycles
            // We *must* use L in order to initialize it in the proper permutation
            // L is given as the starting position for the next permutation
            for(i=0; i < initial_count; i++) {
                next_label(local_pa.n, local_pa.k, local_pa.nk, local_L);
            }
        }

        // * ================================================================= *
        // * ================================================================= *

    } else {

        // Intiailize the permu_array (struct)
        init_permu_array(&local_pa, L, n, B);

        permun = (int *)R_alloc(local_pa.n, sizeof(int));
        ordern = (int*)R_alloc(local_pa.n, sizeof(int));
        myL = (int *)R_alloc(local_pa.n, sizeof(int));

        // * ================================================================= *
        // *                     Forwarding logic                              *
        // *                     ----------------                              *
        // *  For parallel executions all processes, appart from the one with  *
        // *  rank == 0, should forward their random generators in order to    *
        // *  be able to reproduce the exact same random permutations as the   *
        // *  serial version of the code (or a version with only one thread)   *
        // *                                                                   *
        // *  The code below uses the initial_count variable to "burn" the     *
        // *  cycles from the random generator                                 *
        // * ================================================================= *

        // Set initial seed
        set_seed(g_random_seed);

        // Burn the cycles
        // ("permun" is a safe scratch space for this)
        for(i=0; i < initial_count; i++) {
            sample(permun, n);
        }

        // * ================================================================= *
        // * ================================================================= *

        for(i=0; i<n; i++){
            ordern[i]=i;
        }

        // Allocate and assign the values for l_first_sample
        set_permu(&local_pa, 0, L);

        for(i=1; i<B; i++) {
            memcpy(permun, ordern, sizeof(int)*n);
            sample(permun, n);

            // Change to labbeling
            sample2label(n, local_pa.k, local_pa.nk, permun, myL);
            set_permu(&local_pa, i, myL);
        }


    }
}
Esempio n. 13
0
int main(int argc, char *argv[]) {
  TreeNode *tree = NULL;
  TreeModel *backgd_mod = NULL;
  int i, j,
    size = DEFAULT_SIZE, meme_mode = 0, profile_mode = 0, 
    nrestarts = 10, npseudocounts = 5, nsamples = -1, 
    nmostprevalent = -1, tuple_size = -1, nbest = -1, sample_parms = 0,
    nmotifs = DEFAULT_NUMBER, nseqs = -1, do_html = 0, do_bed = 0, 
    suppress_stdout = 0;
  List *msa_name_list = NULL, *pos_examples = NULL, *init_list = NULL, *tmpl;
  List *msas, *motifs;
  SeqSet *seqset = NULL;
  PooledMSA *pmsa = NULL;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  Vector *backgd_mnmod = NULL;
  Hashtable *hash=NULL;
  String *output_prefix = str_new_charstr("phastm.");
  double *has_motif = NULL;
  double prior = PRIOR;
  char c;
  GFF_Set *bedfeats = NULL;

  while ((c = getopt(argc, argv, "t:i:b:sk:md:pn:I:R:P:w:c:SB:o:HDxh")) != -1) {
    switch (c) {
    case 't':
      tree = tr_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT) 
	die("ERROR: bad input format.\n");
      break;
    case 'b':
      backgd_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1);
      break;
    case 's':
      break;
    case 'k':
      size = get_arg_int(optarg);
      break;
    case 'm':
      meme_mode = 1;
      break;
    case 'd':
      pos_examples = get_arg_list(optarg);
      break;
    case 'p':
      profile_mode = 1;
      break;
    case 'n':
      nrestarts = get_arg_int(optarg);
      break;
    case 'I':
      init_list = get_arg_list(optarg);
      break;
    case 'P':
      tmpl = str_list_as_int(get_arg_list(optarg));
      if (lst_size(tmpl) != 2) die("ERROR: bad argument to -P.\n");
      nmostprevalent = lst_get_int(tmpl, 0);
      tuple_size = lst_get_int(tmpl, 1);
      if (!(nmostprevalent > 0 && tuple_size > 0))
	die("ERROR: bad argument nmostprevalent=%i tuple_size=%i\n", 
	    nmostprevalent, tuple_size);
      lst_free(tmpl);
      break;
    case 'R':
      tmpl = str_list_as_int(get_arg_list(optarg));
      if (lst_size(tmpl) != 2) die("ERROR: bad argument to -R.\n");
      nsamples = lst_get_int(tmpl, 0);
      tuple_size = lst_get_int(tmpl, 1);
      if (!(nsamples > 0 && tuple_size > 0))
	die("ERROR nsamples=%i tuple_sizse=%i\n", nsamples, tuple_size);
      lst_free(tmpl);
      break;
    case 'c':
      npseudocounts = get_arg_int(optarg);
      break;
    case 'w':
      nbest = get_arg_int(optarg);
      break;
    case 'S':
      sample_parms = 1;
      break;
    case 'B':
      nmotifs = get_arg_int(optarg);
      break;
    case 'o': 
      str_free(output_prefix);
      output_prefix = str_new_charstr(optarg);
      str_append_char(output_prefix, '.'); 
      break;
    case 'H': 
      do_html = 1;
      break;
    case 'D': 
      do_bed = 1;
      break;
    case 'x':
      suppress_stdout = 1;
      break;
    case 'h':
      usage(argv[0]);
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  if (optind != argc - 1) 
    die("ERROR: List of alignment files required.  Try '%s -h'.\n", argv[0]);

  if ((nsamples > 0 && nmostprevalent > 0) || 
      (nsamples > 0 && init_list != NULL) || 
      (nmostprevalent > 0 && init_list != NULL)) 
    die("ERROR: -I, -P, and -R are mutually exclusive.");

  set_seed(-1);
    
  msa_name_list = get_arg_list(argv[optind]);

  if (backgd_mod != NULL && tree == NULL) tree = backgd_mod->tree;

  if (tree == NULL && !meme_mode && !profile_mode) 
    die("ERROR: Must specify -t, -m, or -p.\n");

  if ((init_list != NULL || nsamples > 0 || nmostprevalent > 0) && 
      !sample_parms)
    nrestarts = 1;

  if (pos_examples != NULL) {
    hash = hsh_new(lst_size(pos_examples));
    for (i = 0; i < lst_size(pos_examples); i++)
      hsh_put_int(hash, ((String*)lst_get_ptr(pos_examples, i))->chars, 1);
    has_motif = smalloc(lst_size(msa_name_list) * sizeof(double));
  }

  /* open all MSAs */
  msas = lst_new_ptr(lst_size(msa_name_list));
  fprintf(stderr, "Reading alignment(s) ...\n");
  for (i = 0, j = 0; i < lst_size(msa_name_list); i++) {
    String *name = lst_get_ptr(msa_name_list, i);
    FILE *mfile = phast_fopen(name->chars, "r");
    msa_format_type temp_format;
    MSA *msa;
    if (msa_format == UNKNOWN_FORMAT)
      temp_format = msa_format_for_content(mfile, 1);
    else temp_format = msa_format;
    msa = msa_new_from_file_define_format(mfile, temp_format, NULL);
    phast_fclose(mfile);
    if (nseqs == -1) nseqs = msa->nseqs;
    if (!meme_mode &&
        (msa->length - msa_num_gapped_cols(msa, STRIP_ANY_GAPS, -1, -1) < 300 ||
        msa->nseqs != nseqs)) {
      fprintf(stderr, "WARNING: ignoring alignment '%s' -- too few informative sites.\n", name->chars);
      msa_free(msa);
      continue;
    }

    if (msa_alph_has_lowercase(msa)) msa_toupper(msa); 
    msa_remove_N_from_alph(msa); /* Ns can be a problem */
    lst_push_ptr(msas, msa);
    if (has_motif != NULL) {
      int k, hm = (hsh_get_int(hash, name->chars) == 1);
      if (meme_mode) {          /* here need to record at individ seq level */
        has_motif = srealloc(has_motif, 
                             (j + msa->nseqs + 1) * sizeof(double)); /* FIXME */
        for (k = 0; k < msa->nseqs; k++) has_motif[j++] = hm;
      }
      else has_motif[j++] = hm;
    }
  }
  if (!meme_mode) {
    fprintf(stderr, "Extracting and pooling sufficient statistics ...\n");
    pmsa = ss_pooled_from_msas(msas, 1, size, NULL, 0);
    msa_remove_N_from_alph(pmsa->pooled_msa);
  }

  /* obtain individual sequences, if necessary */
  if (nmostprevalent > 0 || nsamples > 0 || meme_mode) {
    if (meme_mode) fprintf(stderr, "Converting to individual sequences ...\n");
    else fprintf(stderr, "Obtaining reference sequences for pre-processing ...\n");
    seqset = mtf_get_seqset(msas, meme_mode ? -1 : 1, 10 * size);
                                /* for now, assume 1st seq is reference */
    msa_remove_N_from_alph(seqset->set); 
  }

  if (nmostprevalent > 0) {
    fprintf(stderr, "Obtaining %d most prevalent %d-tuples ...\n", 
            nmostprevalent, tuple_size);
    init_list = lst_new_ptr(nmostprevalent);
    mtf_get_common_ntuples(seqset, init_list, tuple_size, nmostprevalent);
  }
  else if (nsamples > 0) {
    fprintf(stderr, "Sampling %d %d-tuples ...\n", nsamples, tuple_size);
    init_list = lst_new_ptr(nsamples);
    mtf_sample_ntuples(seqset, init_list, tuple_size, nsamples);
  }

  /* in meme_mode, backgd model can be specified as eq freqs in a .mod file */
  if (meme_mode && backgd_mod != NULL && has_motif == NULL)
    backgd_mnmod = backgd_mod->backgd_freqs;

  /* estimate background model, if necessary */
  else if (backgd_mod == NULL && (!meme_mode || has_motif == NULL)) {
    fprintf(stderr, "Fitting background model%s ...\n", 
            has_motif == NULL ? "" : " (for use in initialization)");
                                /* if discriminative, be clear
                                   backgd isn't really part of the
                                   estimation procedure */
    if (meme_mode) {
      backgd_mnmod = vec_new(strlen(seqset->set->alphabet));
      mtf_estim_backgd_mn(seqset, backgd_mnmod);
    }
    else {
      backgd_mod = tm_new(tr_create_copy(tree), NULL, NULL, F81, 
                          pmsa->pooled_msa->alphabet, 1, 0, NULL, -1);
      tm_fit(backgd_mod, pmsa->pooled_msa, 
             tm_params_init(backgd_mod, .1, 5, 0), 
             -1, OPT_MED_PREC, NULL, 0, NULL);
    }
  }

  /* select subset of init strings, if necessary */
  if (nbest > 0 && init_list != NULL) {
    fprintf(stderr, "Winnowing candidate start strings ...\n");
    tmpl = lst_new_ptr(nbest);
    mtf_winnow_starts(meme_mode ? (void*)seqset : (void*)pmsa,
                      init_list, nbest, tmpl, !meme_mode, size, tree,
                      meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, 
                      has_motif);
    lst_free(init_list);
    init_list = tmpl;
  }

  /* Now find motifs */
  motifs = mtf_find(meme_mode ? (void*)seqset : (void*)pmsa, 
                    !meme_mode, size, nmotifs, tree,
                    meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, 
                    has_motif, prior, nrestarts, init_list, sample_parms, 
                    npseudocounts);
     
  fprintf(stderr, "\n\n");
  if (do_bed)
    bedfeats = gff_new_set_init("phast_motif", "0.1b");

  /* generate output */
  for (i = 0; i < lst_size(motifs); i++) {
    Motif *m = lst_get_ptr(motifs, i);

    if (!suppress_stdout) {
      if (lst_size(motifs) > 1) 
        printf("\n**********\nMOTIF #%d\n**********\n\n", i+1);

      mtf_print(stdout, m);
    }

    if (do_html) {
      String *fname = str_dup(output_prefix);
      str_append_int(fname, i+1);
      str_append_charstr(fname, ".html");
      mtf_print_html(phast_fopen(fname->chars, "w+"), m);
      str_free(fname);
    }

    if (do_bed) 
      mtf_add_features(m, bedfeats);
  }
  if (do_html) {
    String *fname = str_dup(output_prefix);
    str_append_charstr(fname, "index.html");
    mtf_print_summary_html(phast_fopen(fname->chars, "w+"), 
                           motifs, output_prefix);
    str_free(fname);
  }
  if (do_bed) {
    String *fname = str_dup(output_prefix);
    str_append_charstr(fname, "bed");
    gff_print_bed(phast_fopen(fname->chars, "w+"),
                  bedfeats, FALSE);
    str_free(fname);
  }

  return 0;
}