예제 #1
0
파일: seq.c 프로젝트: CPFL/gmeme
/**********************************************************************
  shuffle_sequence()

  shuffle a given sequences based on their content
**********************************************************************/
void shuffle_sequence(
  SEQ_T* seq,		/* original sequence IN */
  unsigned int seed,	/* seed IN */
  SEQ_T** target	/* target sequence OUT */
){
	my_srand(seed);
	assert(*target==NULL);
	// reset target if not null
	if (*target != NULL){
		free_seq(*target);
	}

	*target = allocate_seq(get_seq_name(seq),"shuffled",get_seq_offset(seq),get_raw_sequence(seq));
	char *raw = get_raw_sequence(*target);

	/* copy original in temp string */
	char* tmp = (char*)mm_calloc(get_seq_length(seq)+1,sizeof(char));
	strcpy(tmp,get_raw_sequence(seq));
	tmp[get_seq_length(seq)]='\0';

	int i,j;
	char *ss;
	char *dd;
	for(j=0,i=get_seq_length(seq);i>0;i--){
		// Pick a random number in the range:
		int pick = rand() % i;
		raw[j++] = tmp[pick];
		// "shift" routine here eliminates the "picked" base from the _src string:
		// dd starts at the picked position: ss is one beyond that:
		for( dd = tmp+pick , ss = dd + 1 ; *dd ; *dd++=*ss++ );
	}
	myfree(tmp);
}
예제 #2
0
void PlaySeq(const char* seqFile, const char* bnkFile, const char* war1, const char* war2, const char* war3, const char* war4)
{
	StopSeq();
	free_seq();
	curr_seq.msg = SNDSYS_PLAYSEQ;

	LoadFile(&curr_seq.seq, seqFile);
	LoadFile(&curr_seq.bnk, bnkFile);
	LoadFile(curr_seq.war + 0, war1);
	LoadFile(curr_seq.war + 1, war2);
	LoadFile(curr_seq.war + 2, war3);
	LoadFile(curr_seq.war + 3, war4);

	fifoSendDatamsg(FIFO_SNDSYS, sizeof(curr_seq), (u8*) &curr_seq);
}
예제 #3
0
/****************************************************************************
 * Extract a small alignment out of the middle of a larger alignment.
 ****************************************************************************/
ALIGNMENT_T* extract_subalignment
  (int start,
   int width,
   ALIGNMENT_T* alignment)
{
  int num_sequences = get_num_aligned_sequences(alignment);
  SEQ_T** sequences = get_alignment_sequences(alignment);
  SEQ_T** subsequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));

  // Extract the specified columns into a new list of sequences.
  int i_seq = 0;
  char* subsequence = mm_malloc((width + 1) * sizeof(char));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_seq = sequences[i_seq];
    char* raw_seq = get_raw_sequence(this_seq);
    strncpy(subsequence, raw_seq + start, width);
    subsequence[width] = '\0';
    subsequences[i_seq] = 
      allocate_seq(get_seq_name(this_seq),
		   get_seq_description(this_seq),
		   get_seq_offset(this_seq), 
		   subsequence);
  }

  // Extract the consensus string in the specified columns.
  char* consensus = get_consensus_string(alignment);
  char* subconsensus = mm_malloc(sizeof(char) * (width + 1));
  strncpy(subconsensus, consensus + start, width);
  subconsensus[width] = '\0';

  // Allocate and return the new alignment.
  ALIGNMENT_T* subalignment 
    = allocate_alignment(get_alignment_name(alignment),
			 get_alignment_description(alignment),
			 num_sequences,
			 subsequences,
			 subconsensus);

  // Free local dynamic memory.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    free_seq(subsequences[i_seq]);
  }
  myfree(subsequences);
  myfree(subsequence);
  return(subalignment);
}
예제 #4
0
/****************************************************************************
 * Free one alignment object.
 ****************************************************************************/
void free_alignment(ALIGNMENT_T* alignment) {

  if (alignment == NULL) {
    return;
  }
  else {
    if (alignment->consensus_string != NULL) {
      myfree(alignment->consensus_string);
    }
    if (alignment->sequences != NULL) {
      int i; 
      for(i = 0; i < alignment->num_sequences; i++) {
        assert(alignment->sequences[i] != NULL);
        free_seq(alignment->sequences[i]);
      }
      myfree(alignment->sequences);
    }
    myfree(alignment);
  }
}
예제 #5
0
파일: list.c 프로젝트: dank101/net-2
dsEnqError list_start()
{
  struct ds_search_arg search_arg;
  struct ds_search_result result;
  struct DSError          error;
  dsEnqError return_error;

  return_error = Okay;

  if (get_default_service (&search_arg.sra_common) != 0) {
    return localdsaerror;
  }

  search_arg.sra_common.ca_servicecontrol.svc_options = SVC_OPT_PREFERCHAIN;

  search_arg.sra_baseobject = (*base_path != 'T'?
                               str2dn (base_path):
                               NULLDN);

  search_arg.sra_eis.eis_allattributes = FALSE;
  search_arg.sra_eis.eis_infotypes = EIS_ATTRIBUTETYPESONLY;
  search_arg.sra_eis.eis_select = 0;

  search_arg.sra_searchaliases = TRUE;
  search_arg.sra_subset = SRA_ONELEVEL;

  search_arg.sra_filter = filter_alloc();
  search_arg.sra_filter->flt_type = FILTER_NOT;
  search_arg.sra_filter->flt_next = NULLFILTER;
  search_arg.sra_filter->flt_un.flt_un_filter = filter_alloc();
  search_arg.sra_filter->flt_un.flt_un_filter->flt_type = FILTER_ITEM;
  search_arg.sra_filter->flt_un.flt_un_filter->flt_next = NULLFILTER;
  search_arg.sra_filter->flt_un.flt_un_filter->flt_un.flt_un_item.fi_type
    = FILTERITEM_EQUALITY;
  search_arg.sra_filter->flt_un.flt_un_filter->flt_un.flt_un_item.fi_un.
    fi_un_ava.ava_type = AttrT_new("2.5.4.0");

  search_arg.sra_filter->flt_un.flt_un_filter->flt_un.flt_un_item.fi_un.
    fi_un_ava.ava_value =
      str2AttrV("dsa", search_arg.sra_filter->flt_un.flt_un_filter->
                flt_un.flt_un_item.fi_un.fi_un_ava.ava_type->
                oa_syntax);

#ifndef NO_STATS
  LLOG (log_stat,LLOG_NOTICE,("search +%s,extent %d, val objectClass != dsa",
			      base_path,search_arg.sra_subset));
#endif

  if (search_arg.sra_filter->flt_un.flt_un_filter->flt_un.flt_un_item.
      fi_un.fi_un_ava.ava_value == NULLAttrV) {
    return_error = localdsaerror;
  } else if (ds_search (&search_arg, &error, &result) != DS_OK) {
    free_seq(dnseq);
    dnseq = NULLDS;
    dn_number = 0;
    log_ds_error(&error);
    ds_error_free(&error);
    switch (error.dse_type) {
    case DSE_LOCALERROR:
      return_error = duaerror;
      break;
    case DSE_REMOTEERROR:
      return_error = localdsaerror;
      break;
    case DSE_ATTRIBUTEERROR:
      return_error = attributerror;
      break;
    case DSE_REFERRAL:
    case DSE_DSAREFERRAL:
      return_error = remotedsaerror;
      break;
    case DSE_SECURITYERROR:
      return_error = security;
      break;
    case DSE_NAMEERROR:
      return_error = namerror;
      break;
    case DSE_SERVICEERROR:
      return_error = serviceerror;
      break;
    default:
      return_error = localdsaerror;
      break;
    }
  } else {
    dn_number = 0;
    if (result.CSR_entries != NULLENTRYINFO) {
      register EntryInfo *ptr;
      
      free_seq(dnseq);
      dnseq = NULLDS;
      dn_number = 0;

      for (ptr = result.CSR_entries; ptr != NULLENTRYINFO;
           ptr = ptr->ent_next) {
        dn_number++;
        dn2buf ((caddr_t)ptr->ent_dn, goto_path);
        add_seq (&dnseq, goto_path);
      }

      if (dn_number) dnseq = SortList(dnseq);
    } else if (result.CSR_limitproblem == LSR_NOLIMITPROBLEM) {
      free_seq(dnseq);
      dnseq = NULLDS;
      dn_number = 0;
      return_error = nothingfound;
    }

    if (result.CSR_limitproblem != LSR_NOLIMITPROBLEM) {
      switch (result.CSR_limitproblem) {
      case LSR_TIMELIMITEXCEEDED:
	if (dn_number > 0) return_error = timelimit_w_partial;
	else {
	  free_seq(dnseq);
	  dnseq = NULLDS;
	  return_error = timelimit;
	}
	break;
      case LSR_SIZELIMITEXCEEDED:
	return_error = listsizelimit;
	break;
      case LSR_ADMINSIZEEXCEEDED:
	if (dn_number > 0) return_error = adminlimit_w_partial;
	else {
	  free_seq(dnseq);
	  dnseq = NULLDS;
	  return_error = adminlimit;
	}
	break;
      }
    }
    if (result.CSR_entries) entryinfo_free(result.CSR_entries, 0);
  }
  entry_number = dn_number;
  filter_free(search_arg.sra_filter);
  dn_free(search_arg.sra_baseobject);
  ds_error_free(&error);
  return return_error;
}
예제 #6
0
/*************************************************************************
 * Entry point for ama
 *************************************************************************/
int main(int argc, char **argv) {
  AMA_OPTIONS_T options;
  ARRAYLST_T *motifs;
  clock_t c0, c1; // measuring cpu_time
  MOTIF_AND_PSSM_T *combo;
  CISML_T *cisml;
  PATTERN_T** patterns;
  PATTERN_T *pattern;
  FILE *fasta_file, *text_output, *cisml_output;
  int i, seq_loading_num, seq_counter, unique_seqs, seq_len, scan_len, x1, x2, y1, y2;
  char *seq_name, *path;
  bool need_postprocessing, created;
  SEQ_T *sequence;
  RBTREE_T *seq_ids;
  RBNODE_T *seq_node;
  double *logcumback;
  ALPH_T *alph;

  // process the command
  process_command_line(argc, argv, &options);

  // load DNA motifs
  motifs = load_motifs(&options);

  // get the alphabet
  if (arraylst_size(motifs) > 0) {
    combo = (MOTIF_AND_PSSM_T*)arraylst_get(0, motifs);
    alph = alph_hold(get_motif_alph(combo->motif));
  } else {
    alph = alph_dna();
  }

  // pick columns for GC operations
  x1 = -1; x2 = -1; y1 = -1; y2 = -1;
  if (alph_size_core(alph) == 4 && alph_size_pairs(alph) == 2) {
    x1 = 0; // A
    x2 = alph_complement(alph, x1); // T
    y1 = (x2 == 1 ? 2 : 1); // C
    y2 = alph_complement(alph, y1); // G
    assert(x1 != x2 && y1 != y2 && x1 != y1 && x2 != y2 && x1 != y2 && x2 != y1);
  }

  // record starting time
  c0 = clock();

  // Create cisml data structure for recording results
  cisml = allocate_cisml(PROGRAM_NAME, options.command_line, options.motif_filename, options.fasta_filename);
  set_cisml_background_file(cisml, options.bg_filename);

  // make a CISML pattern to hold scores for each motif
  for (i = 0; i < arraylst_size(motifs); i++) {
    combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs);
    add_cisml_pattern(cisml, allocate_pattern(get_motif_id(combo->motif), ""));
  }

  // Open the FASTA file for reading.
  fasta_file = NULL;
  if (!open_file(options.fasta_filename, "r", false, "FASTA", "sequences", &fasta_file)) {
    die("Couldn't open the file %s.\n", options.fasta_filename);
  }
  if (verbosity >= NORMAL_VERBOSE) {
    if (options.last == 0) {
      fprintf(stderr, "Using entire sequence\n");
    } else {
      fprintf(stderr, "Limiting sequence to last %d positions.\n", options.last);
    }
  }

  //
  // Read in all sequences and score with all motifs
  //
  seq_loading_num = 0;  // keeps track on the number of sequences read in total
  seq_counter = 0;      // holds the index to the seq in the pattern
  unique_seqs = 0;      // keeps track on the number of unique sequences
  need_postprocessing = false;
  sequence = NULL;
  logcumback = NULL;
  seq_ids = rbtree_create(rbtree_strcasecmp,rbtree_strcpy,free,rbtree_intcpy,free);
  while (read_one_fasta(alph, fasta_file, options.max_seq_length, &sequence)) {
    ++seq_loading_num;
    seq_name = get_seq_name(sequence);
    seq_len = get_seq_length(sequence);
    scan_len = (options.last != 0 ? options.last : seq_len);
    // red-black trees are only required if duplicates should be combined
    if (options.combine_duplicates){
      //lookup seq id and create new entry if required, return sequence index
      seq_node = rbtree_lookup(seq_ids, get_seq_name(sequence), true, &created);
      if (created) { // assign it a loading number
        rbtree_set(seq_ids, seq_node, &unique_seqs);
        seq_counter = unique_seqs;
        ++unique_seqs;
      } else {
        seq_counter = *((int*)rbnode_get(seq_node));
      }
    }
          
    //
    // Set up sequence-dependent background model and compute
    // log cumulative probability of sequence.
    // This needs the sequence in raw format.
    //
    if (options.sdbg_order >= 0)
      logcumback = log_cumulative_background(alph, options.sdbg_order, sequence);

    // Index the sequence, throwing away the raw format and ambiguous characters
    index_sequence(sequence, alph, SEQ_NOAMBIG);

    // Get the GC content of the sequence if binning p-values by GC
    // and store it in the sequence object.
    if (options.num_gc_bins > 1) {
      ARRAY_T *freqs = get_sequence_freqs(sequence, alph);
      set_total_gc_sequence(sequence, get_array_item(y1, freqs) + get_array_item(y2, freqs)); // f(C) + f(G)
      free_array(freqs);                        // clean up
    } else {
      set_total_gc_sequence(sequence, -1);      // flag ignore
    }

    // Scan with motifs.
    for (i = 0; i < arraylst_size(motifs); i++) {
      pattern = get_cisml_patterns(cisml)[i];
      combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs);
      if (verbosity >= HIGHER_VERBOSE) {
        fprintf(stderr, "Scanning %s sequence with length %d "
            "abbreviated to %d with motif %s with length %d.\n",
            seq_name, seq_len, scan_len, 
            get_motif_id(combo->motif), get_motif_length(combo->motif));
      }
      SCANNED_SEQUENCE_T* scanned_seq = NULL;
      if (!options.combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter) {
        // Create a scanned_sequence record and save it in the pattern.
        scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern);
        set_scanned_sequence_length(scanned_seq, scan_len);
      } else {
        // get existing sequence record
        scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter];
        set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq)));
      }
      
      // check if scanned component of sequence has sufficient length for the motif
      if (scan_len < get_motif_length(combo->motif)) {
        // set score to zero and p-value to 1 if not set yet
        if(!has_scanned_sequence_score(scanned_seq)){
          set_scanned_sequence_score(scanned_seq, 0.0);
        }
        if(options.pvalues && !has_scanned_sequence_pvalue(scanned_seq)){
          set_scanned_sequence_pvalue(scanned_seq, 1.0);
        } 
        add_scanned_sequence_scanned_position(scanned_seq); 
        if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) {
          need_postprocessing = true;
        }
        if (verbosity >= HIGH_VERBOSE) {
          fprintf(stderr, "%s too short for motif %s. Score set to 0.\n",
              seq_name, get_motif_id(combo->motif));
        }
      } else {
        // scan the sequence using average/maximum motif affinity
        ama_sequence_scan(alph, sequence, logcumback, combo->pssm_pair,
            options.scoring, options.pvalues, options.last, scanned_seq,
            &need_postprocessing);
      }
    } // All motifs scanned

    free_seq(sequence);
    if (options.sdbg_order >= 0) myfree(logcumback);

  } // read sequences

  fclose(fasta_file);
  if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "(%d) sequences read in.\n", seq_loading_num);
  if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Finished          \n");

        
  // if any sequence identifier was multiple times in the sequence set  then
  // postprocess of the data is required
  if (need_postprocessing || options.normalize_scores) {
    post_process(cisml, motifs, options.normalize_scores);
  }
        
  // output results
  if (options.output_format == DIRECTORY_FORMAT) {
    if (create_output_directory(options.out_dir, options.clobber, verbosity > QUIET_VERBOSE)) {
      // only warn in higher verbose modes
      fprintf(stderr, "failed to create output directory `%s' or already exists\n", options.out_dir);
      exit(1);
    }
    path = make_path_to_file(options.out_dir, text_filename);
    //FIXME check for errors: MEME doesn't either and we at least know we have a good directory
    text_output = fopen(path, "w");
    free(path);
    path = make_path_to_file(options.out_dir, cisml_filename);
    //FIXME check for errors
    cisml_output = fopen(path, "w");
    free(path);
    print_cisml(cisml_output, cisml, true, NULL, false);
    print_score(cisml, text_output);
    fclose(cisml_output);
    fclose(text_output);
  } else if (options.output_format == GFF_FORMAT) {
    print_score(cisml, stdout);
  } else if (options.output_format == CISML_FORMAT) {
    print_cisml(stdout, cisml, true, NULL, false);
  } else {
    die("Output format invalid!\n");
  }

  //
  // Clean up.
  //
  rbtree_destroy(seq_ids);
  arraylst_destroy(motif_and_pssm_destroy, motifs);
  free_cisml(cisml);
  rbtree_destroy(options.selected_motifs);
  alph_release(alph);
        
  // measure time
  if (verbosity >= NORMAL_VERBOSE) { // starting time
    c1 = clock();
    fprintf(stderr, "cycles (CPU);            %ld cycles\n", (long) c1);
    fprintf(stderr, "elapsed CPU time:        %f seconds\n", (float) (c1-c0) / CLOCKS_PER_SEC);
  }
  return 0;
}
예제 #7
0
int
main(int argc, char *argv[])
{
  int count;
  seq_t seq1, seq2;
  hash_env_t he;
  collec_t res, rev_res;
#if defined(DEBUG) && (DEBUG > 1)
  mcheck(NULL);
  mtrace();
#endif
  argv0 = argv[0];
  if (setlocale(LC_ALL, "POSIX") == NULL)
    fprintf(stderr, "%s: Warning: could not set locale to POSIX\n", argv[0]);
  signal(SIGSEGV, bug_handler);
#ifndef __MINGW32__  
  signal(SIGBUS, bug_handler);
#endif  
  /* Default options.  */
  options.C = DEFAULT_C;
  options.cutoff = DIST_CUTOFF;
  options.gapPct = DEFAULT_GAPPCT;
  options.intron_window = 6;
  options.K = DEFAULT_K;
  options.splice_type_list = "GTAG,GCAG,GTAC,ATAC";
  options.nbSplice = 4;
  options.scoreSplice_window = 10;
  options.mismatchScore = MISMATCH;
  options.reverse = 2;
  options.matchScore = MATCH;
  options.W = DEFAULT_W;
  options.X = DEFAULT_X;
  options.filterPct = DEFAULT_FILTER;
  options.minScore_cutoff = MATCH_CUTOFF;
  while (1) {
    int c = getopt(argc, argv, "A:C:c:E:f:g:I:K:L:M:o:q:R:r:W:X:");
    if (c == -1)
      break;
    switch (c) {
    case 'A':
      options.ali_flag = atoi(optarg);
      if (options.ali_flag < 0 || options.ali_flag > 4)
	fatal("A must be one of 0, 1, 2, 3, or 4.\n");
      break;
    case 'C': {
      int val = atoi(optarg);
      if (val < 0)
	fatal("Value for option C must be non-negative.\n");
      options.C = val;
      break;
    }
    case 'c': {
      int val = atoi(optarg);
      if (val < 0)
	fatal("Value for option c must be non-negative.\n");
      options.minScore_cutoff = val;
      break;
    }
    case 'E':
      options.cutoff = atoi(optarg);
      if (options.cutoff < 3 || options.cutoff > 10)
	fatal("Cutoff (E) must be within [3,10].\n");
      break;
    case 'f':
      options.filterPct = atoi(optarg);
      if (options.filterPct > 100)
	fatal("Filter in percent (f) must be within [0,100].\n");
      break;
    case 'g':
      options.gapPct = atoi(optarg);
      break;
    case 'I':
      options.intron_window = atoi(optarg);
      break;
    case 'K': {
      int val = atoi(optarg);
      if (val < 0)
	fatal("Value for option K must be non-negative.\n");
      options.K = val;
      break;
    }
    case 'L': {
      size_t i;
      size_t len = strlen(optarg);
      options.splice_type_list = optarg;
      options.nbSplice = 1;
      if (len % 5 != 4)
	fatal("Splice types list has illegal length (%zu)\n", len);
      for (i = 0; i < len; i++)
	if (i % 5 == 4) {
	  if (options.splice_type_list[i] != ',')
	    fatal("Comma expected instead of %c at position %zu"
		  "in splice types list.\n",
		  options.splice_type_list[i], i);
	  options.nbSplice += 1;
	} else {
	  if (options.splice_type_list[i] != 'A'
	      && options.splice_type_list[i] != 'C'
	      && options.splice_type_list[i] != 'G'
	      && options.splice_type_list[i] != 'T')
	    fatal("Expected 'A', 'C', 'G' or 'T' instead of '%c' at"
		  "position %zu in splice types list.\n",
		  options.splice_type_list[i], i);
	}
      break;
    }
    case 'M': {
      int val = atoi(optarg);
      if (val < 0)
	fatal("Value for option M must be non-negative.\n");
      options.scoreSplice_window = val;
      break;
    }
    case 'o':
      options.dnaOffset = atoi(optarg);
      break;
    case 'q':
      options.mismatchScore = atoi(optarg);
      break;
    case 'R':
      options.reverse = atoi(optarg);
      if (options.reverse < 0 || options.reverse > 2)
	fatal("R must be one of 0, 1, or 2.\n");
      break;
    case 'r':
      options.matchScore = atoi(optarg);
      break;
    case 'W':
      options.W = atoi(optarg);
      if (options.W < 1 || options.W > 15)
	fatal("W must be within [1,15].\n");
      break;
    case 'X':
      options.X = atoi(optarg);
      if (options.X < 1)
	fatal("X must be positive.\n");
      break;
    case '?':
      break;
    default:
      fprintf(stderr, "?? getopt returned character code 0%o ??\n", c);
    }
  }
  if (optind + 2 != argc) {
    fprintf(stderr, Usage, argv[0], options.ali_flag, options.C,
	    options.minScore_cutoff, options.cutoff,
	    options.filterPct, options.gapPct, options.intron_window,
	    options.K, options.splice_type_list, options.scoreSplice_window,
	    options.dnaOffset, options.mismatchScore, options.reverse,
	    options.matchScore, options.W, options.X);
    return 1;
  }

  /* read seq1 */
  init_seq(argv[optind], &seq1);
  if (get_next_seq(&seq1, options.dnaOffset, 1) != 0)
    fatal("Cannot read sequence from %s.\n", argv[optind]);
  strncpy(dna_seq_head, seq1.header, 256);

  /* read seq2 */
  init_seq(argv[optind + 1], &seq2);
  if (get_next_seq(&seq2, 0, 0) != 0)
    fatal("Cannot read sequence from %s.\n", argv[optind + 1]);

  init_encoding();
  init_hash_env(&he, options.W, seq1.seq, seq1.len);
  init_col(&res, 1);
  init_col(&rev_res, 1);
  bld_table(&he);
  init_splice_junctions();

  count = 0;
  while (!count || get_next_seq(&seq2, 0, 0) == 0) {
    unsigned int curRes;
    strncpy(rna_seq_head, seq2.header, 256);
    ++count;

    switch (options.reverse) {
    case  0:
      SIM4(&he, &seq2, &res);
      break;
    case  2:
      SIM4(&he, &seq2, &res);
    case  1:
      seq_revcomp_inplace(&seq2);
      SIM4(&he, &seq2, &rev_res);
      break;
    default:
      fatal ("Unrecognized request for EST orientation.\n");
    }
    /* Keep only the best matches, according to filterPct.  */
    if (options.filterPct > 0) {
      unsigned int max_nmatches = 0;
      for (curRes = 0; curRes < rev_res.nb; curRes++) {
	result_p_t r = rev_res.e.result[curRes];
	if (r->st.nmatches > max_nmatches)
	  max_nmatches = r->st.nmatches;
      }
      for (curRes = 0; curRes < res.nb; curRes++) {
	result_p_t r = res.e.result[curRes];
	if (r->st.nmatches > max_nmatches)
	  max_nmatches = r->st.nmatches;
      }
      max_nmatches = (max_nmatches * options.filterPct) / 100;
      for (curRes = 0; curRes < rev_res.nb; curRes++) {
	result_p_t r = rev_res.e.result[curRes];
	if (r->st.nmatches < max_nmatches)
	  r->st.nmatches = 0;
      }
      for (curRes = 0; curRes < res.nb; curRes++) {
	result_p_t r = res.e.result[curRes];
	if (r->st.nmatches < max_nmatches)
	  r->st.nmatches = 0;
      }
    }
    /* Now, print results.  */
    for (curRes = 0; curRes < rev_res.nb; curRes++)
      print_res(rev_res.e.result[curRes], 1, &seq1, &seq2);
    rev_res.nb = 0;
    if (options.reverse && options.ali_flag)
      /* reverse-complement back seq2 for alignment */
      seq_revcomp_inplace(&seq2);
    for (curRes = 0; curRes < res.nb; curRes++)
      print_res(res.e.result[curRes], 0, &seq1, &seq2);
    res.nb = 0;
  }
#ifdef DEBUG
  fprintf(stderr, "DEBUG mode: freeing all memory...\n");
  fflush(stdout);
  fflush(stderr);
  free_hash_env(&he);
  free_seq(&seq1);
  free_seq(&seq2);
  free(options.splice);
  free(res.e.elt);
  free(rev_res.e.elt);
#endif
  return 0;
}
예제 #8
0
파일: ama.c 프로젝트: a1aks/Haystack
/*************************************************************************
 * Entry point for ama
 *************************************************************************/
int main(int argc, char *argv[]) {
  int max_seq_length = MAX_SEQ;
  STRING_LIST_T* selected_motifs = NULL;
  double pseudocount = 0.01;
  int output_format = CISML_FORMAT;
  program_name = "ama";
  int scoring = AVG_ODDS;
  BOOLEAN_T pvalues = FALSE;
  BOOLEAN_T normalize_scores = FALSE;
  BOOLEAN_T combine_duplicates = FALSE;
  int num_gc_bins = 1;
  int sdbg_order = -1;				// don't use sequence background
  BOOLEAN_T scan_both_strands = TRUE;
  ARRAY_T* pos_bg_freqs = NULL;
  ARRAY_T* rev_bg_freqs = NULL;
  clock_t c0, c1; /* measuring cpu_time */
  CISML_T *cisml;
  char * out_dir = NULL;
  BOOLEAN_T clobber = FALSE;
  int i;
  int last = 0;
  ALPH_T alph = INVALID_ALPH;

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/

  const int num_options = 16;
  cmdoption const motif_scan_options[] = {
    { "max-seq-length", REQUIRED_VALUE },
    { "motif", REQUIRED_VALUE },
    { "motif-pseudo", REQUIRED_VALUE },
    { "rma", NO_VALUE },
    { "pvalues", NO_VALUE },
    { "sdbg", REQUIRED_VALUE },
    { "norc", NO_VALUE },
    { "cs", NO_VALUE },
    { "o-format", REQUIRED_VALUE },
    { "o", REQUIRED_VALUE },
    { "oc", REQUIRED_VALUE },
    { "scoring", REQUIRED_VALUE },
    { "verbosity", REQUIRED_VALUE },
    { "gcbins", REQUIRED_VALUE },
    { "last", REQUIRED_VALUE },
    { "version", NO_VALUE }
  };

  int option_index = 0;

  // Define the usage message.
  char usage[] = "USAGE: ama [options] <motif file> <sequence file> [<background file>]\n"
    "\n"
    "   Options:\n"
    "     --sdbg <order>\t\t\tUse Markov background model of\n"
    "       \t\t\t\t\torder <order> derived from the sequence\n"
    "       \t\t\t\t\tto compute its likelihood ratios.\n"
    "       \t\t\t\t\tOverrides --pvalues, --gcbins and --rma;\n"
    "       \t\t\t\t\t<background file> is required unless\n"
    "       \t\t\t\t\t--sdbg is given.\n"
    "     --motif <id>\t\t\tUse only the motif identified by <id>.\n"
    "       \t\t\t\t\tThis option may be repeated.\n"
    "     --motif-pseudo <float>\t\tThe value <float> times the background\n"
    "       \t\t\t\t\tfrequency is added to the count of each\n"
    "       \t\t\t\t\tletter when creating the likelihood \n"
    "       \t\t\t\t\tratio matrix (default: %g).\n"
    "     --norc\t\t\t\tDisables the scanning of the reverse\n"
    "       \t\t\t\t\tcomplement strand.\n"
    "     --scoring [avg-odds|max-odds]\tIndicates whether the average or \n"
    "       \t\t\t\t\tthe maximum odds should be calculated\n"
    "       \t\t\t\t\t(default: avg-odds)\n"
    "     --rma\t\t\t\tScale motif scores to the range 0-1.\n"
    "       \t\t\t\t\t(Relative Motif Affinity).\n"
    "       \t\t\t\t\tMotif scores are scaled by the maximum\n"
    "       \t\t\t\t\tscore achievable by that PWM. (default:\n"
    "       \t\t\t\t\tmotif scores are not normalized)\n"
    "     --pvalues\t\t\t\tPrint p-value of avg-odds score in cisml\n"
    "       \t\t\t\t\toutput. Ignored for max-odds scoring.\n"
    "       \t\t\t\t\t(default: p-values are not printed)\n"
    "     --gcbins <bins>\t\t\tCompensate p-values for GC content of\n"
    "       \t\t\t\t\teach sequence using given number of \n"
    "       \t\t\t\t\tGC range bins. Recommended bins: 41.\n"
    "       \t\t\t\t\t(default: p-values are based on\n"
    "       \t\t\t\t\tfrequencies in background file)\n"
    "     --cs\t\t\t\tEnable combining sequences with same\n"
    "       \t\t\t\t\tidentifier by taking the average score\n"
    "       \t\t\t\t\tand the Sidac corrected p-value.\n"
    "     --o-format [gff|cisml]\t\tOutput file format (default: cisml)\n"
    "       \t\t\t\t\tignored if --o or --oc option used\n"
    "     --o <directory>\t\t\tOutput all available formats to\n"
    "       \t\t\t\t\t<directory>; give up if <directory>\n"
    "       \t\t\t\t\texists\n"
    "     --oc <directory>\t\t\tOutput all available formats to\n"
    "       \t\t\t\t\t<directory>; if <directory> exists\n"
    "       \t\t\t\t\toverwrite contents\n"
    "     --verbosity [1|2|3|4]\t\tControls amount of screen output\n"
    "       \t\t\t\t\t(default: %d)\n"
    "     --max-seq-length <int>\t\tSet the maximum length allowed for \n"
    "       \t\t\t\t\tinput sequences. (default: %d)\n"
    "     --last <int>\t\t\tUse only scores of (up to) last <n>\n"
    "       \t\t\t\t\tsequence positions to compute AMA.\n"
    "     --version   \t\t\tPrint version and exit.\n"
    "\n";

  // Parse the command line.
  if (simple_setopt(argc, argv, num_options, motif_scan_options) != NO_ERROR) {
    die("Error processing command line options: option name too long.\n");
  }
    
    BOOLEAN_T setoutputformat = FALSE;
    BOOLEAN_T setoutputdirectory = FALSE;

  while (TRUE) {
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
      (void) simple_getopterror(&message);
      die("Error processing command line options (%s).\n", message);
    } else if (strcmp(option_name, "max-seq-length") == 0) {
	max_seq_length = atoi(option_value);
    } else if (strcmp(option_name, "norc") == 0) {
	scan_both_strands = FALSE;
    } else if (strcmp(option_name, "cs") == 0) {
		combine_duplicates = TRUE;
    } else if (strcmp(option_name, "motif") == 0) {
	if (selected_motifs == NULL) {
	  selected_motifs = new_string_list();
	}
	add_string(option_value, selected_motifs);
    } else if (strcmp(option_name, "motif-pseudo") == 0) {
	pseudocount = atof(option_value);
    } else if (strcmp(option_name, "o-format") == 0) {
        if (setoutputdirectory) {
            if (verbosity >= NORMAL_VERBOSE)
                fprintf(stderr, "output directory specified, ignoring --o-format\n");
        } else {
            setoutputformat = TRUE;
            if (strcmp(option_value, "gff") == 0)
                output_format = GFF_FORMAT;
            else if (strcmp(option_value, "cisml") == 0)
                output_format = CISML_FORMAT;
            else {
                if (verbosity >= NORMAL_VERBOSE)
                  fprintf(stderr, "Output format not known. Using standard instead (cisML).\n");
                  output_format = CISML_FORMAT;
            }
        }
    } else if (strcmp(option_name, "o") == 0 || strcmp(option_name, "oc") == 0) {
        setoutputdirectory = TRUE;
        if (setoutputformat) {
            if (verbosity >= NORMAL_VERBOSE)
                fprintf(stderr, "output directory specified, ignoring --o-format\n");
        }
        clobber = strcmp(option_name, "oc") == 0;
        out_dir = (char*) malloc (sizeof(char)*(strlen(option_value)+1));
        strcpy(out_dir, option_value);
        output_format = DIRECTORY_FORMAT;
    } else if (strcmp(option_name, "verbosity") == 0) {
	verbosity = atoi(option_value);
    } else if (strcmp(option_name, "scoring") == 0) {
      if (strcmp(option_value, "max-odds") == 0)
	scoring = MAX_ODDS;
      else if (strcmp(option_value, "avg-odds") == 0)
	scoring = AVG_ODDS;
      else if (strcmp(option_value, "sum-odds") == 0)
	scoring = SUM_ODDS;
	  else
	die("Specified scoring scheme not known.\n", message);
    } else if (strcmp(option_name, "pvalues") == 0) {
      pvalues = TRUE;
    } else if (strcmp(option_name, "rma") == 0) {
      normalize_scores = TRUE;
      fprintf(stderr, "Normalizing motif scores using RMA method.\n");
    } else if (strcmp(option_name, "gcbins") == 0) {
      num_gc_bins = atoi(option_value);
      pvalues = TRUE;
      if (num_gc_bins <= 1) die("Number of bins in --gcbins must be greater than 1.\n", message);
    } else if (strcmp(option_name, "sdbg") == 0) {
      sdbg_order = atoi(option_value);			// >=0 means use sequence bkg
    }
    else if (strcmp(option_name, "last") == 0) {
      int i = 0;
      if (option_value[0] == '-') ++i;
      while (option_value[i] != '\0') {
        if (!isdigit(option_value[i])) {
          die("Specified parameter 'last' contains non-numeric characters.\n");
        }
        ++i;
      }
      last = atoi(option_value);
      if (errno != 0) {
        die("Specified parameter 'last' could not be parsed as a number as:\n%s\n",strerror(errno));
      }
      if (last < 0) {
        die("Specified parameter 'last' had negative value (%d) when only postive or zero values are allowed \n", last);
      }
    }
    else if (strcmp(option_name, "version") == 0) {
      fprintf(stdout, VERSION "\n");
      exit(EXIT_SUCCESS);
    }
  }

  // --sdbg overrides --pvalues and --gcbins and --rma
  int req_args = 3;
  if (sdbg_order >= 0) {
    pvalues = FALSE;
    normalize_scores = FALSE;
    num_gc_bins = 1;
    req_args = 2;
  }

  // Check all required arguments given
  if (sdbg_order >= 0 && argc > option_index + req_args) {
    die("<background file> cannot be given together with --sdbg.\n");
  } else if (argc != option_index + req_args) {
    fprintf(stderr, usage, pseudocount, verbosity, max_seq_length);
    exit(EXIT_FAILURE);
  }

  // Get required arguments. 
  char* motif_filename = argv[option_index];
  option_index++;
  char* fasta_filename = argv[option_index];
  option_index++;
  char* bg_filename;
  if (req_args == 3) {			// required unless --sdbg given
    bg_filename = argv[option_index];
    option_index++;
  } else {
    bg_filename = "--uniform--";	// So PSSMs will use uniform background;
					// we can multiply them out later.
  }

  // measure time
  c0 = clock();

  // Set up hash tables for computing reverse complement if doing --sdbg
  if (sdbg_order >= 0) setup_hash_alph(DNAB);

  // Create cisml data structure for recording results
  cisml = allocate_cisml(program_name, motif_filename, fasta_filename);
  set_cisml_background_file(cisml, bg_filename);

  /**********************************************
   * Read the motifs and background model.
   **********************************************/
  int num_motifs = 0;
  MREAD_T *mread;
  ARRAYLST_T *motifs;
  PSSM_PAIR_T** pssm_pairs;	// note pssm_pairs is an array of pointers

  //this reads any meme file, xml, txt and html
  mread = mread_create(motif_filename, OPEN_MFILE);
  mread_set_bg_source(mread, bg_filename);
  mread_set_pseudocount(mread, pseudocount);

  motifs = mread_load(mread, NULL);
  alph = mread_get_alphabet(mread);
  pos_bg_freqs = mread_get_background(mread);

  mread_destroy(mread);

  num_motifs = arraylst_size(motifs);

  // allocate memory for PSSM pairs
  pssm_pairs = (PSSM_PAIR_T**)mm_malloc(sizeof(PSSM_PAIR_T*) * num_motifs);

  if (verbosity >= NORMAL_VERBOSE) 
    fprintf(stderr, "Number of motifs in file %d.\n", num_motifs);

  // make a CISML pattern to hold scores for each motif
  PATTERN_T** patterns = NULL;
  Resize(patterns, num_motifs, PATTERN_T*);
  int motif_index;
  for (motif_index = 0; motif_index < num_motifs; motif_index++) {
    MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs);
    patterns[motif_index] = allocate_pattern(get_motif_id(motif), "");
    add_cisml_pattern(cisml, patterns[motif_index]);
  }

  // make reverse complement motifs and background frequencies.
  if (scan_both_strands == TRUE) {
    add_reverse_complements(motifs);
    assert(arraylst_size(motifs) == (2 * num_motifs));
    rev_bg_freqs = allocate_array(get_array_length(pos_bg_freqs));
    complement_dna_freqs(pos_bg_freqs, rev_bg_freqs);
  }

  /**************************************************************
   * Convert motif matrices into log-odds matrices.
   * Scale them.
   * Compute the lookup tables for the PDF of scaled log-odds scores.
   **************************************************************/
  int ns = scan_both_strands ? 2 : 1;	// number of strands
  for (motif_index = 0; motif_index < num_motifs; motif_index++) {
    MOTIF_T *motif, *motif_rc;
    motif = (MOTIF_T*)arraylst_get(motif_index*ns, motifs);
    if (scan_both_strands)
      motif_rc = (MOTIF_T*)arraylst_get(motif_index*ns + 1, motifs);
    else
      motif_rc = NULL;
    /*
     *  Note: If scanning both strands, we complement the motif frequencies
     *  but not the background frequencies so the motif looks the same.
     *  However, the given frequencies are used in computing the p-values
     *  since they represent the frequencies on the negative strands.
     *  (If we instead were to complement the input sequence, keeping the
     *  the motif fixed, we would need to use the complemented frequencies
     *  in computing the p-values.  Is that any clearer?)
    */
    double range = 300;		// 100 is not very good; 1000 is great but too slow
    PSSM_T* pos_pssm =
      build_motif_pssm(
        motif, 
        pos_bg_freqs, 
        pos_bg_freqs, 
        NULL, // Priors not used
        0.0L, // alpha not used
        range, 
        num_gc_bins, 
        TRUE
      );
    PSSM_T* neg_pssm = (scan_both_strands ?
      build_motif_pssm(
        motif_rc, 
        rev_bg_freqs, 
        pos_bg_freqs, 
        NULL, // Priors not used
        0.0L, // alpha not used
        range, 
        num_gc_bins, 
        TRUE
      )
      : NULL
    );
    pssm_pairs[motif_index] = create_pssm_pair(pos_pssm, neg_pssm);
  }

  // Open the FASTA file for reading.
  FILE* fasta_file = NULL;
  if (open_file(fasta_filename, "r", FALSE, "FASTA", "sequences", &fasta_file) == 0) {
    die("Couldn't open the file %s.\n", fasta_filename);
  }
  if (verbosity >= NORMAL_VERBOSE) {
    if (last == 0) {
      fprintf(stderr, "Using entire sequence\n");
    } else {
      fprintf(stderr, "Limiting sequence to last %d positions.\n", last);
    }
  }

  /**************************************************************
   * Read in all sequences and score with all motifs
   **************************************************************/
  int seq_loading_num = 0;  // keeps track on the number of sequences read in total
  int seq_counter = 0;		// holds the index to the seq in the pattern
  int unique_seqs = 0;      // keeps track on the number of unique sequences
  BOOLEAN_T need_postprocessing = FALSE;
  SEQ_T* sequence = NULL;
  RBTREE_T* seq_ids = rbtree_create(rbtree_strcasecmp,NULL,free,rbtree_intcpy,free);
  RBNODE_T* seq_node;
  BOOLEAN_T created;
  while (read_one_fasta(alph, fasta_file, max_seq_length, &sequence)) {
    ++seq_loading_num;
	created = FALSE;
    char* seq_name = get_seq_name(sequence);
    int seq_len = get_seq_length(sequence);
    int scan_len;
    if (last != 0) {
      scan_len = last;
    } else {
      scan_len = seq_len;
    }
	  
	// red-black trees are only required if duplicates should be combined
	if (combine_duplicates){
		//lookup seq id and create new entry if required, return sequence index
		char *tmp_id = mm_malloc(strlen(seq_name)+1); // required copy for rb-tree
		strncpy(tmp_id,seq_name,strlen(seq_name)+1);
		seq_node = rbtree_lookup(seq_ids, tmp_id, TRUE, &created);
		if (created) {// assign it a loading number
			rbtree_set(seq_ids, seq_node, &unique_seqs);
			seq_counter = unique_seqs;
			++unique_seqs;
		} else {
			seq_counter = *((int*)rbnode_get(seq_node));
		}
	}
	  
    //
    // Set up sequence-dependent background model and compute
    // log cumulative probability of sequence.
    //
    double *logcumback = NULL;                    // array of log cumulative probs.
    if (sdbg_order >= 0) {
      Resize(logcumback, seq_len+1, double);
      char* raw_seq = get_raw_sequence(sequence);
      BOOLEAN rc = FALSE;
      double *a_cp = get_markov_from_sequence(raw_seq, alph_string(alph), rc, sdbg_order, 0);
      log_cum_back(raw_seq, a_cp, sdbg_order, logcumback);
      myfree(a_cp);
    }

    // Get the GC content of the sequence if binning p-values by GC
    // and store it in the sequence object.
    if (num_gc_bins > 1) {
      ARRAY_T *freqs = get_sequence_freqs(sequence, alph);
      set_total_gc_sequence(sequence,
        get_array_item(1,freqs) + get_array_item(2,freqs));	// f(C) + f(G)
      free_array(freqs);			// clean up
    } else {
      set_total_gc_sequence(sequence, -1);	// flag ignore
    }

    /**************************************************************
     * Process all motifs.
     **************************************************************/
    int ns = scan_both_strands ? 2 : 1;
    for (motif_index = 0; motif_index < num_motifs; motif_index++) {
      PATTERN_T *pattern = patterns[motif_index];
      MOTIF_T* motif = (MOTIF_T*)arraylst_get(ns*motif_index, motifs);
      char* motif_id = (scan_both_strands ? get_motif_st_id(motif) : get_motif_id(motif));
      if (verbosity >= HIGH_VERBOSE) {
        fprintf(stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif));
      }
      if ((selected_motifs == NULL) || (have_string(get_motif_id(motif), selected_motifs) == TRUE)) {
        if (verbosity >= HIGHER_VERBOSE) {
          fprintf(stderr, "Scanning %s sequence with length %d "
              "abbreviated to %d with motif %s with length %d.\n",
              seq_name, seq_len, scan_len, motif_id, get_motif_length(motif));
        }
		SCANNED_SEQUENCE_T* scanned_seq = NULL;

		
		if (!combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter){
			// Create a scanned_sequence record and save it in the pattern.
			scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern);
			set_scanned_sequence_length(scanned_seq, scan_len);
		} else {
			// get existing sequence record
			scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter];
			set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq)));
		}
		
		// check if scanned component of sequence has sufficient length for the motif
		if (scan_len < get_motif_length(motif)) {
			// set score to zero and p-value to 1 if not set yet
			if(!has_scanned_sequence_score(scanned_seq)){
				set_scanned_sequence_score(scanned_seq, 0.0);
			}
			if(pvalues && !has_scanned_sequence_pvalue(scanned_seq)){
				set_scanned_sequence_pvalue(scanned_seq, 1.0);
			} 
			add_scanned_sequence_scanned_position(scanned_seq); 
			if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) need_postprocessing = TRUE;
			if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "%s too short for motif %s. Score set to 0!\n", seq_name, motif_id);
		} else {  
			// scan the sequence using average/maximum motif affinity
			ama_sequence_scan(alph, sequence, logcumback, pssm_pairs[motif_index], scoring, 
							  pvalues, last, scanned_seq, &need_postprocessing);
		}

      } else {
        if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "Skipping motif %s.\n", motif_id);
      }
    } // All motifs parsed

    free_seq(sequence);
    if (sdbg_order >= 0) myfree(logcumback);

  } // read sequences
예제 #9
0
파일: centrimo.c 프로젝트: CPFL/gmeme
/*************************************************************************
 * Entry point for centrimo
 *************************************************************************/
int main(int argc, char *argv[]) {
  CENTRIMO_OPTIONS_T options;
  SEQ_SITES_T seq_sites;
  SITE_COUNTS_T counts;
  int seqN, motifN, seqlen, db_i, motif_i, i;
  double log_pvalue_thresh;
  SEQ_T** sequences = NULL;
  ARRAY_T* bg_freqs = NULL;
  ARRAYLST_T *stats_list;
  MOTIF_DB_T **dbs, *db;
  MREAD_T *mread;
  MOTIF_STATS_T *stats;
  MOTIF_T *motif, *rev_motif;
  PSSM_T *pos_pssm, *rev_pssm;
  char *sites_path, *desc;
  FILE *sites_file;
  HTMLWR_T *html;
  JSONWR_T *json;

  // COMMAND LINE PROCESSING
  process_command_line(argc, argv, &options);

  // load the sequences
  read_sequences(options.alphabet, options.seq_source, &sequences, &seqN);
  seqlen = (seqN ? get_seq_length(sequences[0]) : 0);
  // calculate a sequence background (unless other background is given)
  if (!options.bg_source) {
    bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences);
  }

  // load the motifs
  motifN = 0;
  dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources));
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    char* db_source;
    db_source = (char*)arraylst_get(i, options.motif_sources);
    dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, 
        options.pseudocount, options.selected_motifs, options.alphabet);
    motifN += arraylst_size(dbs[i]->motifs);
  }
  log_pvalue_thresh = log(options.evalue_thresh) - log(motifN);
  // Setup some things for double strand scanning
  if (options.scan_both_strands == TRUE) {
    // Set up hash tables for computing reverse complement
    setup_hash_alph(DNAB);
    setalph(0);
    // Correct background by averaging on freq. for both strands.
    average_freq_with_complement(options.alphabet, bg_freqs);
    normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs);
    calc_ambigs(options.alphabet, FALSE, bg_freqs);
  }
  // Create output directory
  if (create_output_directory(options.output_dirname, options.allow_clobber, 
        (verbosity >= NORMAL_VERBOSE))) {
    die("Couldn't create output directory %s.\n", options.output_dirname);
  }
  // open output files
  sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME);
  sites_file = fopen(sites_path, "w");
  free(sites_path);
  // setup html monolith writer
  json = NULL;
  if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) {
    htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME);
    htmlwr_replace(html, "centrimo_data.js", "data");
    json = htmlwr_output(html);
    if (json == NULL) die("Template does not contain data section.\n");
  } else {
    DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n");
  }
  if (json) {
    // output some top level variables
    jsonwr_str_prop(json, "version", VERSION);
    jsonwr_str_prop(json, "revision", REVISION);
    jsonwr_str_prop(json, "release", ARCHIVE_DATE);
    jsonwr_str_array_prop(json, "cmd", argv, argc);
    jsonwr_property(json, "options");
    jsonwr_start_object_value(json);
    jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount);
    jsonwr_dbl_prop(json, "score", options.score_thresh);
    jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh);
    jsonwr_lng_prop(json, "maxbin", options.max_window+1);
    jsonwr_bool_prop(json, "norc", !options.scan_both_strands);
    jsonwr_bool_prop(json, "noflip", options.no_flip);
    jsonwr_end_object_value(json);
    // output the description
    desc = prepare_description(&options);
    if (desc) {
      jsonwr_str_prop(json, "job_description", desc);
      free(desc);
    }
    // output size metrics
    jsonwr_lng_prop(json, "seqlen", seqlen);
    jsonwr_lng_prop(json, "tested", motifN);
    // output the fasta db
    jsonwr_property(json, "sequence_db");
    jsonwr_start_object_value(json);
    jsonwr_str_prop(json, "source", options.seq_source);
    jsonwr_lng_prop(json, "count", seqN);
    jsonwr_end_object_value(json);
    // output the motif dbs
    jsonwr_property(json, "motif_dbs");
    jsonwr_start_array_value(json);
    for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
      db = dbs[db_i];
      jsonwr_start_object_value(json);
      jsonwr_str_prop(json, "source", db->source);
      jsonwr_lng_prop(json, "count", arraylst_size(db->motifs));
      jsonwr_end_object_value(json);
    }
    jsonwr_end_array_value(json);
    // start the motif array
    jsonwr_property(json, "motifs");
    jsonwr_start_array_value(json);
  }
  /**************************************************************
   * Tally the positions of the best sites for each of the 
   * selected motifs.
   **************************************************************/
  // prepare the sequence sites
  memset(&seq_sites, 0, sizeof(SEQ_SITES_T));
  // prepare the site counts
  counts.allocated = ((2 * seqlen) - 1);
  counts.sites = mm_malloc(sizeof(double) * counts.allocated);
  // prepare the motifs stats list
  stats_list = arraylst_create();
  // prepare the other vars
  motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL;
  for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
    db = dbs[db_i];
    for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) {
      motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs);
      DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n",  
          get_motif_id(motif), get_motif_length(motif));
      // reset the counts
      for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0;
      counts.total_sites = 0;
      // create the pssm 
      pos_pssm = make_pssm(bg_freqs, motif);
      // If required, do the same for the reverse complement motif.
      if (options.scan_both_strands) {
        rev_motif = dup_rc_motif(motif);
        rev_pssm = make_pssm(bg_freqs, rev_motif);
      }
      // scan the sequences
      for (i = 0; i < seqN; i++)
        score_sequence(&options, sequences[i], pos_pssm, rev_pssm, 
            &seq_sites, &counts);
      // DEBUG check that the sum of the sites is close to the site count
      double sum_check = 0, sum_diff;
      for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i];
      sum_diff = counts.total_sites - sum_check;
      if (sum_diff < 0) sum_diff = -sum_diff;
      if (sum_diff > 0.1) {
        fprintf(stderr, "Warning: site counts don't sum to accurate value! "
            "%g != %ld", sum_check, counts.total_sites);
      }
      // output the plain text site counts
      output_site_counts(sites_file, seqlen, db, motif, &counts);
      // compute the best central window
      stats = compute_stats(options.max_window, seqlen, db, motif, &counts);
      // check if it passes the threshold
      if (json && stats->log_adj_pvalue <= log_pvalue_thresh) {
        output_motif_json(json, stats, &counts);
        arraylst_add(stats, stats_list);
      } else {
        free(stats);
      }
      // Free memory associated with this motif.
      free_pssm(pos_pssm);
      free_pssm(rev_pssm);
      destroy_motif(rev_motif);
    }
  }
  if (json) jsonwr_end_array_value(json);
  // finish writing sites
  fclose(sites_file);
  // finish writing html file
  if (html) {
    if (htmlwr_output(html) != NULL) {
      die("Found another JSON replacement!\n");
    }
    htmlwr_destroy(html);
  }
  // write text file
  output_centrimo_text(&options, motifN, stats_list);
  // Clean up.
  for (i = 0; i < seqN; ++i) {
    free_seq(sequences[i]); 
  }
  free(sequences);
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    free_db(dbs[i]);
  }
  free(dbs);
  free_array(bg_freqs);
  free(counts.sites);
  free(seq_sites.sites);
  arraylst_destroy(free, stats_list);
  cleanup_options(&options);
  return 0;

}
예제 #10
0
파일: search.c 프로젝트: dank101/net-2
dsEnqError srch_start()
{
  struct ds_search_arg search_arg;
  struct ds_search_result result;
  struct DSError          error;
  dsEnqError return_error;
  extern Filter make_filter();
  DN curr_rdn;

  if (*mvalue == '\0') {
    return list_start();
  }

  if (get_default_service (&search_arg.sra_common) != 0) {
    return nothingfound;
  }

  search_arg.sra_common.ca_servicecontrol.svc_options = SVC_OPT_PREFERCHAIN;

  curr_rdn = search_arg.sra_baseobject = (*base_path != 'T'?
					  str2dn (base_path):
					  NULLDN);

  search_arg.sra_eis.eis_allattributes = FALSE;
  search_arg.sra_eis.eis_infotypes = EIS_ATTRIBUTETYPESONLY;
  search_arg.sra_eis.eis_select = 0;
  search_arg.sra_searchaliases = TRUE;

  search_arg.sra_subset = SRA_ONELEVEL;
  while (curr_rdn != NULLDN) {
    if (!strcmp(curr_rdn->dn_rdn->rdn_at->oa_ot.ot_stroid, 
		"2.5.4.10")) {
      search_arg.sra_subset = SRA_WHOLESUBTREE;
      break;
    }
    curr_rdn = curr_rdn->dn_parent;
  }

  if ((search_arg.sra_filter = make_filter(filt_arr[typeindx])) == NULLFILTER)
    return duaerror;

#ifndef NO_STATS
  LLOG (log_stat, LLOG_NOTICE, ("search +%s, extent %d, val %s",
                                base_path,search_arg.sra_subset, mvalue));
#endif

  if(ds_search (&search_arg, &error, &result) != DS_OK) {
    /* deal with error */
    free_seq(dnseq);
    dnseq = NULLDS;
    dn_number = 0;
    log_ds_error(&error);
    ds_error_free(&error);
    switch (error.dse_type) {
    case DSE_LOCALERROR:
      return_error = duaerror;
      break;
    case DSE_REMOTEERROR:
      return_error = localdsaerror;
      break;
    case DSE_ATTRIBUTEERROR:
      return_error = attributerror;
      break;
    case DSE_REFERRAL:
    case DSE_DSAREFERRAL:
      return_error = remotedsaerror;
      break;
    case DSE_SECURITYERROR:
      return_error = security;
      break;
    case DSE_NAMEERROR:
      return_error = namerror;
      break;
    case DSE_SERVICEERROR:
      return_error = serviceerror;
      break;
    default:
      return_error = localdsaerror;
      break;
    }
  } else {
    correlate_search_results (&result);
    dn_number = 0;

    if (result.CSR_entries != NULLENTRYINFO) {
      register EntryInfo *ptr;

      return_error = Okay;
      free_seq(dnseq);
      dnseq = NULLDS;
      dn_number = 0;

      for (ptr = result.CSR_entries;
           ptr != NULLENTRYINFO; 
	   ptr = ptr->ent_next){
        dn_number++;
        dn2buf((caddr_t) ptr->ent_dn, goto_path);
        add_seq(&dnseq, goto_path);
      }

      if (dn_number) dnseq = SortList(dnseq);
    } else if (result.CSR_limitproblem == LSR_NOLIMITPROBLEM) {
      free_seq(dnseq);
      dnseq = NULLDS;
      dn_number = 0;
      return_error = nothingfound;
    }

    if(result.CSR_limitproblem != LSR_NOLIMITPROBLEM) {
      switch (result.CSR_limitproblem) {
      case LSR_TIMELIMITEXCEEDED:
	if (dn_number > 0) return_error = timelimit_w_partial;
	else {
	  free_seq(dnseq);
	  dnseq = NULLDS;
	  return_error = timelimit;
	}
	break;
      case LSR_SIZELIMITEXCEEDED:
	return_error = listsizelimit;
	break;
      case LSR_ADMINSIZEEXCEEDED:
	if (dn_number > 0) return_error = adminlimit_w_partial;
	else {
	  free_seq(dnseq);
	  dnseq = NULLDS;
	  return_error = adminlimit;
	}
	break;
      }
      entryinfo_free(result.CSR_entries, 0);
    }
  }
  entry_number = dn_number;
  filter_free(search_arg.sra_filter);
  dn_free(search_arg.sra_baseobject);
  ds_error_free(&error);
  return return_error;
}
예제 #11
0
/****************************************************************************
 * Remove from the alignment all columns that contain gaps for the
 * specified species.
 ****************************************************************************/
ALIGNMENT_T* remove_alignment_gaps
  (char*        species,
   ALIGNMENT_T* alignment)
{
  // Locate this species in the alignment.
  int species_index = get_index_in_string_list(species, 
					       get_species_names(alignment));
  if (species_index == -1) {
    die("Can't find %s in alignment.\n", species);
  }
  SEQ_T* this_seq = get_alignment_sequence(species_index, alignment);

  // Get the dimensions of the original matrix.
  int num_sequences = get_num_aligned_sequences(alignment);
  int alignment_length = get_alignment_length(alignment);

  // Allocate memory for raw sequences that will constitute the new alignment.
  char** raw_sequences = (char**)mm_malloc(sizeof(char*) * num_sequences);
  int i_seq = 0;
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    raw_sequences[i_seq] 
      = (char*)mm_calloc(alignment_length + 1, sizeof(char*));
  }
  char* consensus = get_consensus_string(alignment);
  char* new_consensus 
    = (char*)mm_calloc(alignment_length + 1, sizeof(char*));

  // Iterate over all columns.
  int i_column;
  int i_raw = 0;
  for (i_column = 0; i_column < alignment_length; i_column++) {

    // Is there a gap?
    char this_char = get_seq_char(i_column, this_seq);
    if ((this_char != '-') && (this_char != '.')) {

      // If no gap, then copy over this column.
      for (i_seq = 0; i_seq < num_sequences; i_seq++) {
	SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
	char this_char = get_seq_char(i_column, this_sequence);
				      
	raw_sequences[i_seq][i_raw] = this_char;
      }
      new_consensus[i_raw] = consensus[i_column];
      i_raw++;
    }
  }

  // Create new sequence objects.
  SEQ_T** new_sequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
    new_sequences[i_seq] = allocate_seq(get_seq_name(this_sequence),
					get_seq_description(this_sequence),
					get_seq_offset(this_sequence),
					raw_sequences[i_seq]);
  }

  // Allocate and return the new alignment.
  ALIGNMENT_T* new_alignment
    = allocate_alignment(get_alignment_name(alignment),
			 get_alignment_description(alignment),
			 num_sequences,
			 new_sequences,
			 new_consensus);
  
  // Free local dynamic memory.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    myfree(raw_sequences[i_seq]);
    free_seq(new_sequences[i_seq]);
  }
  myfree(raw_sequences);
  myfree(new_sequences);
  myfree(new_consensus);

  return(new_alignment);
}