Example #1
0
auto apply_pointer_tuple(F&& f,Tuple&& t) 
{
  using index_sequence=
    typename make_index_sequence<
      std::tuple_size<typename std::decay<Tuple>::type>::value>::type;
  return apply_pointer_tuple_impl(
    std::forward<F>(f),std::forward<Tuple>(t),index_sequence());
}
Example #2
0
/*************************************************************************
 * Entry point for ama
 *************************************************************************/
int main(int argc, char **argv) {
  AMA_OPTIONS_T options;
  ARRAYLST_T *motifs;
  clock_t c0, c1; // measuring cpu_time
  MOTIF_AND_PSSM_T *combo;
  CISML_T *cisml;
  PATTERN_T** patterns;
  PATTERN_T *pattern;
  FILE *fasta_file, *text_output, *cisml_output;
  int i, seq_loading_num, seq_counter, unique_seqs, seq_len, scan_len, x1, x2, y1, y2;
  char *seq_name, *path;
  bool need_postprocessing, created;
  SEQ_T *sequence;
  RBTREE_T *seq_ids;
  RBNODE_T *seq_node;
  double *logcumback;
  ALPH_T *alph;

  // process the command
  process_command_line(argc, argv, &options);

  // load DNA motifs
  motifs = load_motifs(&options);

  // get the alphabet
  if (arraylst_size(motifs) > 0) {
    combo = (MOTIF_AND_PSSM_T*)arraylst_get(0, motifs);
    alph = alph_hold(get_motif_alph(combo->motif));
  } else {
    alph = alph_dna();
  }

  // pick columns for GC operations
  x1 = -1; x2 = -1; y1 = -1; y2 = -1;
  if (alph_size_core(alph) == 4 && alph_size_pairs(alph) == 2) {
    x1 = 0; // A
    x2 = alph_complement(alph, x1); // T
    y1 = (x2 == 1 ? 2 : 1); // C
    y2 = alph_complement(alph, y1); // G
    assert(x1 != x2 && y1 != y2 && x1 != y1 && x2 != y2 && x1 != y2 && x2 != y1);
  }

  // record starting time
  c0 = clock();

  // Create cisml data structure for recording results
  cisml = allocate_cisml(PROGRAM_NAME, options.command_line, options.motif_filename, options.fasta_filename);
  set_cisml_background_file(cisml, options.bg_filename);

  // make a CISML pattern to hold scores for each motif
  for (i = 0; i < arraylst_size(motifs); i++) {
    combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs);
    add_cisml_pattern(cisml, allocate_pattern(get_motif_id(combo->motif), ""));
  }

  // Open the FASTA file for reading.
  fasta_file = NULL;
  if (!open_file(options.fasta_filename, "r", false, "FASTA", "sequences", &fasta_file)) {
    die("Couldn't open the file %s.\n", options.fasta_filename);
  }
  if (verbosity >= NORMAL_VERBOSE) {
    if (options.last == 0) {
      fprintf(stderr, "Using entire sequence\n");
    } else {
      fprintf(stderr, "Limiting sequence to last %d positions.\n", options.last);
    }
  }

  //
  // Read in all sequences and score with all motifs
  //
  seq_loading_num = 0;  // keeps track on the number of sequences read in total
  seq_counter = 0;      // holds the index to the seq in the pattern
  unique_seqs = 0;      // keeps track on the number of unique sequences
  need_postprocessing = false;
  sequence = NULL;
  logcumback = NULL;
  seq_ids = rbtree_create(rbtree_strcasecmp,rbtree_strcpy,free,rbtree_intcpy,free);
  while (read_one_fasta(alph, fasta_file, options.max_seq_length, &sequence)) {
    ++seq_loading_num;
    seq_name = get_seq_name(sequence);
    seq_len = get_seq_length(sequence);
    scan_len = (options.last != 0 ? options.last : seq_len);
    // red-black trees are only required if duplicates should be combined
    if (options.combine_duplicates){
      //lookup seq id and create new entry if required, return sequence index
      seq_node = rbtree_lookup(seq_ids, get_seq_name(sequence), true, &created);
      if (created) { // assign it a loading number
        rbtree_set(seq_ids, seq_node, &unique_seqs);
        seq_counter = unique_seqs;
        ++unique_seqs;
      } else {
        seq_counter = *((int*)rbnode_get(seq_node));
      }
    }
          
    //
    // Set up sequence-dependent background model and compute
    // log cumulative probability of sequence.
    // This needs the sequence in raw format.
    //
    if (options.sdbg_order >= 0)
      logcumback = log_cumulative_background(alph, options.sdbg_order, sequence);

    // Index the sequence, throwing away the raw format and ambiguous characters
    index_sequence(sequence, alph, SEQ_NOAMBIG);

    // Get the GC content of the sequence if binning p-values by GC
    // and store it in the sequence object.
    if (options.num_gc_bins > 1) {
      ARRAY_T *freqs = get_sequence_freqs(sequence, alph);
      set_total_gc_sequence(sequence, get_array_item(y1, freqs) + get_array_item(y2, freqs)); // f(C) + f(G)
      free_array(freqs);                        // clean up
    } else {
      set_total_gc_sequence(sequence, -1);      // flag ignore
    }

    // Scan with motifs.
    for (i = 0; i < arraylst_size(motifs); i++) {
      pattern = get_cisml_patterns(cisml)[i];
      combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs);
      if (verbosity >= HIGHER_VERBOSE) {
        fprintf(stderr, "Scanning %s sequence with length %d "
            "abbreviated to %d with motif %s with length %d.\n",
            seq_name, seq_len, scan_len, 
            get_motif_id(combo->motif), get_motif_length(combo->motif));
      }
      SCANNED_SEQUENCE_T* scanned_seq = NULL;
      if (!options.combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter) {
        // Create a scanned_sequence record and save it in the pattern.
        scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern);
        set_scanned_sequence_length(scanned_seq, scan_len);
      } else {
        // get existing sequence record
        scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter];
        set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq)));
      }
      
      // check if scanned component of sequence has sufficient length for the motif
      if (scan_len < get_motif_length(combo->motif)) {
        // set score to zero and p-value to 1 if not set yet
        if(!has_scanned_sequence_score(scanned_seq)){
          set_scanned_sequence_score(scanned_seq, 0.0);
        }
        if(options.pvalues && !has_scanned_sequence_pvalue(scanned_seq)){
          set_scanned_sequence_pvalue(scanned_seq, 1.0);
        } 
        add_scanned_sequence_scanned_position(scanned_seq); 
        if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) {
          need_postprocessing = true;
        }
        if (verbosity >= HIGH_VERBOSE) {
          fprintf(stderr, "%s too short for motif %s. Score set to 0.\n",
              seq_name, get_motif_id(combo->motif));
        }
      } else {
        // scan the sequence using average/maximum motif affinity
        ama_sequence_scan(alph, sequence, logcumback, combo->pssm_pair,
            options.scoring, options.pvalues, options.last, scanned_seq,
            &need_postprocessing);
      }
    } // All motifs scanned

    free_seq(sequence);
    if (options.sdbg_order >= 0) myfree(logcumback);

  } // read sequences

  fclose(fasta_file);
  if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "(%d) sequences read in.\n", seq_loading_num);
  if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Finished          \n");

        
  // if any sequence identifier was multiple times in the sequence set  then
  // postprocess of the data is required
  if (need_postprocessing || options.normalize_scores) {
    post_process(cisml, motifs, options.normalize_scores);
  }
        
  // output results
  if (options.output_format == DIRECTORY_FORMAT) {
    if (create_output_directory(options.out_dir, options.clobber, verbosity > QUIET_VERBOSE)) {
      // only warn in higher verbose modes
      fprintf(stderr, "failed to create output directory `%s' or already exists\n", options.out_dir);
      exit(1);
    }
    path = make_path_to_file(options.out_dir, text_filename);
    //FIXME check for errors: MEME doesn't either and we at least know we have a good directory
    text_output = fopen(path, "w");
    free(path);
    path = make_path_to_file(options.out_dir, cisml_filename);
    //FIXME check for errors
    cisml_output = fopen(path, "w");
    free(path);
    print_cisml(cisml_output, cisml, true, NULL, false);
    print_score(cisml, text_output);
    fclose(cisml_output);
    fclose(text_output);
  } else if (options.output_format == GFF_FORMAT) {
    print_score(cisml, stdout);
  } else if (options.output_format == CISML_FORMAT) {
    print_cisml(stdout, cisml, true, NULL, false);
  } else {
    die("Output format invalid!\n");
  }

  //
  // Clean up.
  //
  rbtree_destroy(seq_ids);
  arraylst_destroy(motif_and_pssm_destroy, motifs);
  free_cisml(cisml);
  rbtree_destroy(options.selected_motifs);
  alph_release(alph);
        
  // measure time
  if (verbosity >= NORMAL_VERBOSE) { // starting time
    c1 = clock();
    fprintf(stderr, "cycles (CPU);            %ld cycles\n", (long) c1);
    fprintf(stderr, "elapsed CPU time:        %f seconds\n", (float) (c1-c0) / CLOCKS_PER_SEC);
  }
  return 0;
}