コード例 #1
0
ファイル: alignment.c プロジェクト: brsaran/FuzzyApp
/****************************************************************************
 * Create a new alignment with any sequence that contains nothing but 
 * gap ('-') characters removed. Returns the new alignment.  Does not 
 * change the old alignment.
 * If there are no all-gap sequences, the returned alignment is the
 * same object as the original alignment.
 ****************************************************************************/
static ALIGNMENT_T* remove_allgap_sequences(ALIGNMENT_T* alignment)
{
  ALIGNMENT_T* new_alignment;
  int i_aln;
  int l_aln = get_num_aligned_sequences(alignment);
  STRING_LIST_T* keeper_seqs = new_string_list();

  // Identify the all-gap sequences.
  for (i_aln=0; i_aln<l_aln; i_aln++) {
    SEQ_T* sequence = get_alignment_sequence(i_aln, alignment);
    int i_seq;
    int l_seq = get_seq_length(sequence);
    // Add sequence to keepers if it contains a non-gap.
    for (i_seq=0; i_seq<l_seq; i_seq++) {
      if (get_seq_char(i_seq, sequence) != '-') {           // not gap?
	add_string(get_seq_name(sequence), keeper_seqs);    // non-gap: keeper
	break;
      }
    }
  }

  // Remove any sequences not in keeper list.
  if (get_num_strings(keeper_seqs) < l_aln) {
    new_alignment = remove_alignment_seqs(keeper_seqs, alignment);
    free_string_list(keeper_seqs);
  } else {
    new_alignment = alignment;
  }

  return(new_alignment);
} // remove_allgap_sequences
コード例 #2
0
ファイル: strlist.c プロジェクト: UIKit0/fontforge-1
struct string_list *prepend_string_list(struct string_list* list, const char *str) {
    struct string_list *newlist;
    newlist = new_string_list(str);
    if ( newlist==NULL )
	return list;
    newlist->next = list;
    return newlist;
}
コード例 #3
0
ファイル: string_list.c プロジェクト: senechal/shell
StringList copy_list(StringList list){
    StringList copy = new_string_list();
    String string;
    while((string = list_next(&list))){
        list_append(&copy, string);
    }
    return copy;
}
コード例 #4
0
ファイル: jcache.c プロジェクト: MaddTheSane/haiku-buildtools
/*!	\brief Reads in a text file and returns its contents as a string_list.

	The function strips leading white spaces from each line and omits empty
	or comment lines.

	If a string_list is supplied via \a list the file's content is appended
	to this list, otherwise a new string_list is allocated.

	\param filename The name of the file to be read in.
	\param list Pointer to a pre-allocated string_list. May be 0.
	\return A pointer to the string_list containing the contents of the file,
			or 0, if an error occured.
*/
static
string_list*
read_file(const char *filename, string_list* list)
{
	int result = 0;
	FILE *file = 0;
	string_list *allocatedList = 0;
	// open file
	if ((file = fopen(filename, "r")) != 0
		&& (list || (list = allocatedList = new_string_list(100)) != 0)) {
		// read the file
		char buffer[513];
		result = !0;
		while (result && fgets(buffer, sizeof(buffer) - 1, file)) {
			char* line = buffer;
			int len = 0;
			char *string = 0;
			// skip leading white spaces
			while (*line == ' ' || *line == '\t' || *line == '\n')
				line++;
			// make empty and comment lines simple new-line lines
			if (!*line || *line == '#') {
				line[0] = '\n';
				line[1] = '\0';
			}
			len = strlen(line);
			// make sure, the line ends in a LF
			if (line[len - 1] != '\n') {
				line[len] = '\n';
				len++;
				line[len] = '\0';
			}
			if ((size_t)len + 1 == sizeof(buffer)) {
				fprintf(stderr, "error: %s:%d: line too long!\n", filename,
					list->count + 1);
				exit(1);
			}
			// copy it
			string = (char*)malloc(len + 1);
			if (string) {
				strcpy(string, line);
				result = push_string(list, string);
			} else
				result = 0;
		}
		// close the file
		fclose(file);
	} else
		perror(filename);
	// cleanup on error
	if (!result) {
		delete_string_list(allocatedList);
		list = 0;
	}
	return list;
}
コード例 #5
0
ファイル: string_list.c プロジェクト: senechal/shell
// last array element must be NULL
StringList from_array(char * array[]){
    int i = 0;
    StringList list = new_string_list();
    String string;
    while(array[i] != NULL){
        string = copy_string(array[i]);
        list_append(&list, string);
        i++;
    }
    return list;
}
コード例 #6
0
ファイル: strlist.c プロジェクト: UIKit0/fontforge-1
struct string_list *append_string_list(struct string_list* list, const char *str) {
    struct string_list *newlist;
    struct string_list *last;
    newlist = new_string_list(str);
    if ( newlist==NULL )
	return list;
    if ( list == NULL )
	return newlist;
    for ( last=list; last->next!=NULL; last=last->next );
    last->next = newlist;
    return list;
}
コード例 #7
0
BOOLEAN read_csv(
  FILE *data_file,		/* file containing sequences */
  char comment,			/* the character that define a comment line */
  char separator,		/* the character used for splitting a line */
  STRING_LIST_T ***content,	/* retrieved content of the csv file as list of string lists */
  int *length			/* retrieve length of the file (number of elements in content) */
)
{
  int i;
//  long j;
  char c;
  c = ' ';
  char* entry = NULL;
  
  if (data_file == NULL ){            	/* Could not open file */
	  die("Error with file! \n");
  }
  
  STRING_LIST_T* line = new_string_list();
  /* get the sample name */
  /* read to first blank/tab/ or end of line/file */
  for (i=0,*length=0; (c=fgetc(data_file))!=EOF; ) {
    if (c==comment) {
    	Skip_eol(c, data_file);	/* comment ends line */
    } else if (c== '\n' || c== '\r') {
    	Resize(entry, i+1, char);
    	entry[i] = '\0';
    	add_string(entry,line); /* return ends entry & line */
    	myfree(entry);
    	entry = NULL;
    	i=0;
    	/* resize array of pointers if necessary */
    	if ((*length % SCHUNK) == 0) {
    		Resize(*content, *length+SCHUNK, STRING_LIST_T*);
    	}
    	(*content)[(*length)++] = line;
		line = new_string_list();
    	
    } else if (c==separator) {
コード例 #8
0
ファイル: string_list.c プロジェクト: senechal/shell
StringList from_array_size(char * array[], int size){
    int i;
    StringList list = new_string_list();
    String string;
    for(i = 0; i < size; i++){
        if(array[i] != NULL){
            string = copy_string(array[i]);
            list_append(&list, string);
            i++;
        }
    }
    return list;
}
コード例 #9
0
ファイル: jcache.c プロジェクト: MaddTheSane/haiku-buildtools
/*!	\brief Creates a new jamfile_cache.
	\return A pointer to the newly allocated jamfile_cache, or 0, if out of
			memory.
*/
static
jamfile_cache*
new_jamfile_cache(void)
{
	jamfile_cache *cache = (jamfile_cache*)malloc(sizeof(jamfile_cache));
	if (cache) {
		cache->entries = hashinit(sizeof(jcache_entry), "jcache");
		cache->filenames = new_string_list(100);
		cache->cache_file = 0;
		if (!cache->entries || !cache->filenames) {
			delete_jamfile_cache(cache);
			cache = 0;
		}
	}
	return cache;
}
コード例 #10
0
ファイル: alignment.c プロジェクト: brsaran/FuzzyApp
/****************************************************************************
 * Get a list of the names of the species in the alignment.
 ****************************************************************************/
STRING_LIST_T* get_species_names(ALIGNMENT_T* an_alignment) {
  STRING_LIST_T* return_value;
  int i_seq;
  int num_seqs;

  // Allocate a new string list.
  return_value = new_string_list();

  // Extract all the sequence names and add them to the list.
  num_seqs = get_num_aligned_sequences(an_alignment);
  for (i_seq = 0; i_seq < num_seqs; i_seq++) {
    add_string(get_seq_name(get_alignment_sequence(i_seq, an_alignment)),
	       return_value);
  }

  return(return_value);
}
コード例 #11
0
ファイル: clparse.c プロジェクト: hmatyschok/MeshBSD
void
parse_string_list(FILE *cfile, struct string_list **lp, int multiple)
{
	int			 token;
	char			*val;
	size_t			 valsize;
	struct string_list	*cur, *tmp;

	/* Find the last medium in the media list. */
	if (*lp)
		for (cur = *lp; cur->next; cur = cur->next)
			;	/* nothing */
	else
		cur = NULL;

	do {
		token = next_token(&val, cfile);
		if (token != STRING) {
			parse_warn("Expecting media options.");
			skip_to_semi(cfile);
			return;
		}

		valsize = strlen(val) + 1;
		tmp = new_string_list(valsize);
		if (tmp == NULL)
			error("no memory for string list entry.");
		memcpy(tmp->string, val, valsize);
		tmp->next = NULL;

		/* Store this medium at the end of the media list. */
		if (cur)
			cur->next = tmp;
		else
			*lp = tmp;
		cur = tmp;

		token = next_token(&val, cfile);
	} while (multiple && token == COMMA);

	if (token != SEMI) {
		parse_warn("expecting semicolon.");
		skip_to_semi(cfile);
	}
}
コード例 #12
0
ファイル: string_list.c プロジェクト: senechal/shell
//Quebra uma String e cria um StringList, separando os comandos, do parser
StringList explode(String string, char separator){
    StringList list = new_string_list();
    int i;
    int size = size_of(string);
    String buffer = new_string();
    // loop in string
    for(i = 0; i <= size; i++){
        if(string[i] != separator && i < size)
            append(&buffer, string[i]);
        else{
            list_append(&list, buffer);
            dispose_string(&buffer);
            buffer = new_string();
        }
    }
    dispose_string(&buffer);
    return list;
}
コード例 #13
0
ファイル: files.c プロジェクト: h3rb/gc
/*
 * Returns a list of lines
 */
STRING_LIST *fread_file( char *filename ) {
     FILE *fp=fopen( filename, "r" );
     STRING_LIST *slist=NULL;
     STRING_LIST *last=NULL;

     if (fp == NULL) { OUTPUT("File not found: %s\n", filename); return NULL; }
     
     while( !feof(fp) && !ferror(fp) )
     {
        STRING_LIST *snode = new_string_list( );
        snode->str=fread_string_eol(fp);
        if ( snode->str == NULL ) { free_string_list(snode); break; }
        if ( last != NULL ) last->next = snode;
        else slist=snode;
        last=snode;
     }
     
     fclose(fp);
     return slist;
}
コード例 #14
0
ファイル: jcache.c プロジェクト: MaddTheSane/haiku-buildtools
/*!	\brief Initializes a pre-allocated jcache_entry.
	\param entry The jcache_entry to be initialized.
	\param filename The name of the include file to be associated with the
		   entry.
	\param time The time stamp of the include file to be associated with the
		   entry.
	\param used Whether or not the entry shall be marked used.
	\return \c !0, if everything went fine, 0, if an error occured (out of
			memory).
*/
static
int
init_jcache_entry(jcache_entry* entry, char *filename, time_t time, int used)
{
	int result = 0;
	if (entry) {
		result = !0;
		entry->filename = (char*)malloc(strlen(filename) + 1);
		if (entry->filename)
			strcpy(entry->filename, filename);
		entry->time = time;
		entry->strings = new_string_list(100);
		entry->used = used;
		// cleanup on error
		if (!entry->filename || !entry->strings) {
			cleanup_jcache_entry(entry);
			result = 0;
		}
	}
	return result;
}
コード例 #15
0
ファイル: pmp_bf.c プロジェクト: brsaran/FuzzyApp
/*************************************************************************
 * Entry point for pmp_bf
 *************************************************************************/
int main(int argc, char *argv[]) {

  char* bg_filename = NULL;
  char* motif_name = "motif"; // Use this motif name in the output.
  STRING_LIST_T* selected_motifs = NULL;
  double fg_rate = 1.0;
  double bg_rate = 1.0;
  double purine_pyrimidine = 1.0; // r
  double transition_transversion = 0.5; // R
  double pseudocount = 0.1;
  GAP_SUPPORT_T gap_support = SKIP_GAPS;
  MODEL_TYPE_T model_type = F81_MODEL;
  BOOLEAN_T use_halpern_bruno = FALSE;
  char* ustar_label = NULL;	// TLB; create uniform star tree
  int i;

  program_name = "pmp_bf";

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/

  // Define command line options. (FIXME: Repeated code)
  // FIXME: Note that if you add or remove options you
  // must change n_options.
  int n_options = 12;
  cmdoption const pmp_options[] = {
    {"hb", NO_VALUE},
    {"ustar", REQUIRED_VALUE},
    {"model", REQUIRED_VALUE},
    {"pur-pyr", REQUIRED_VALUE},
    {"transition-transversion", REQUIRED_VALUE},
    {"bg", REQUIRED_VALUE},
    {"fg", REQUIRED_VALUE},
    {"motif", REQUIRED_VALUE},
    {"motif-name", REQUIRED_VALUE},
    {"bgfile", REQUIRED_VALUE},
    {"pseudocount", REQUIRED_VALUE},
    {"verbosity", REQUIRED_VALUE}
  };

  int option_index = 0;

  // Define the usage message.
  char      usage[1000] = "";
  strcat(usage, "USAGE: pmp [options] <tree file> <MEME file>\n");
  strcat(usage, "\n");
  strcat(usage, "   Options:\n");

  // Evolutionary model parameters.
  strcat(usage, "     --hb\n");
  strcat(usage, "     --model single|average|jc|k2|f81|f84|hky|tn");
  strcat(usage, " (default=f81)\n");
  strcat(usage, "     --pur-pyr <float> (default=1.0)\n");
  strcat(usage, "     --transition-transversion <float> (default=0.5)\n");
  strcat(usage, "     --bg <float> (default=1.0)\n");
  strcat(usage, "     --fg <float> (default=1.0)\n");

  // Motif parameters.
  strcat(usage, "     --motif <id> (default=all)\n");
  strcat(usage, "     --motif-name <string> (default from motif file)\n");

  // Miscellaneous parameters
  strcat(usage, "     --bgfile <background> (default from motif file)\n");
  strcat(usage, "     --pseudocount <float> (default=0.1)\n");
  strcat(usage, "     --ustar <label>\n");	// TLB; create uniform star tree
  strcat(usage, "     --verbosity [1|2|3|4] (default 2)\n");
  strcat(usage, "\n    Prints the FP and FN rate at each of 10000 score values.\n");
  strcat(usage, "\n    Output format: [<motif_id> score <score> FPR <fpr> TPR <tpr>]+\n");

  // Parse the command line.
  if (simple_setopt(argc, argv, n_options, pmp_options) != NO_ERROR) {
    die("Error processing command line options: option name too long.\n");
  }

  while (TRUE) { 
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
      (void) simple_getopterror(&message);
      die("Error processing command line options (%s)\n", message);
    }
    
    if (strcmp(option_name, "model") == 0) {
      if (strcmp(option_value, "jc") == 0) {
        model_type = JC_MODEL;
      } else if (strcmp(option_value, "k2") == 0) {
        model_type = K2_MODEL;
      } else if (strcmp(option_value, "f81") == 0) {
        model_type = F81_MODEL;
      } else if (strcmp(option_value, "f84") == 0) {
        model_type = F84_MODEL;
      } else if (strcmp(option_value, "hky") == 0) {
        model_type = HKY_MODEL;
      } else if (strcmp(option_value, "tn") == 0) {
        model_type = TAMURA_NEI_MODEL;
      } else if (strcmp(option_value, "single") == 0) {
        model_type = SINGLE_MODEL;
      } else if (strcmp(option_value, "average") == 0) {
        model_type = AVERAGE_MODEL;
      } else {
        die("Unknown model: %s\n", option_value);
      }
    } else if (strcmp(option_name, "hb") == 0){
        use_halpern_bruno = TRUE;
    } else if (strcmp(option_name, "ustar") == 0){	// TLB; create uniform star tree
        ustar_label = option_value;
    } else if (strcmp(option_name, "pur-pyr") == 0){
        purine_pyrimidine = atof(option_value);
    } else if (strcmp(option_name, "transition-transversion") == 0){
        transition_transversion = atof(option_value);
    } else if (strcmp(option_name, "bg") == 0){
      bg_rate = atof(option_value);
    } else if (strcmp(option_name, "fg") == 0){
      fg_rate = atof(option_value);
    } else if (strcmp(option_name, "motif") == 0){
        if (selected_motifs == NULL) {
          selected_motifs = new_string_list();
        }
       add_string(option_value, selected_motifs);
    } else if (strcmp(option_name, "motif-name") == 0){
        motif_name = option_value;
    } else if (strcmp(option_name, "bgfile") == 0){
      bg_filename = option_value;
    } else if (strcmp(option_name, "pseudocount") == 0){
        pseudocount = atof(option_value);
    } else if (strcmp(option_name, "verbosity") == 0){
        verbosity = atoi(option_value);
    }
  }

  // Must have tree and motif file names
  if (argc != option_index + 2) {
    fprintf(stderr, "%s", usage);
    exit(EXIT_FAILURE);
  } 

  /**********************************************
   * Read the phylogenetic tree.
   **********************************************/
  char* tree_filename = NULL;
  TREE_T* tree = NULL;
  tree_filename = argv[option_index];
  option_index++;
  tree = read_tree_from_file(tree_filename);

  // get the species names
  STRING_LIST_T* alignment_species = make_leaf_list(tree);
  char *root_label = get_label(tree);	// in case target in center
  if (strlen(root_label)>0) add_string(root_label, alignment_species);
  //write_string_list(" ", alignment_species, stderr);

  // TLB; Convert the tree to a uniform star tree with
  // the target sequence at its center.
  if (ustar_label != NULL) {
    tree = convert_to_uniform_star_tree(tree, ustar_label);
    if (tree == NULL) 
      die("Tree or alignment missing target %s\n", ustar_label);
    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr, 
	"Target %s placed at center of uniform (d=%.3f) star tree:\n", 
          ustar_label, get_total_length(tree) / get_num_children(tree) 
      );
      write_tree(tree, stderr);
    }
  }

  /**********************************************
   * Read the motifs.
   **********************************************/
  char* meme_filename = argv[option_index];
  option_index++;
  int num_motifs = 0; 

  MREAD_T *mread;
  ALPH_T alph;
  ARRAYLST_T *motifs;
  ARRAY_T *bg_freqs;

  mread = mread_create(meme_filename, OPEN_MFILE);
  mread_set_bg_source(mread, bg_filename);
  mread_set_pseudocount(mread, pseudocount);
  // read motifs
  motifs = mread_load(mread, NULL);
  alph = mread_get_alphabet(mread);
  bg_freqs = mread_get_background(mread);
  // check
  if (arraylst_size(motifs) == 0) die("No motifs in %s.", meme_filename);

  

  // TLB; need to resize bg_freqs array to ALPH_SIZE items
  // or copy array breaks in HB mode.  This throws away
  // the freqs for the ambiguous characters;
  int asize = alph_size(alph, ALPH_SIZE);
  resize_array(bg_freqs, asize);

  /**************************************************************
  * Compute probability distributions for each of the selected motifs.
  **************************************************************/
  int motif_index;
  for (motif_index = 0; motif_index < arraylst_size(motifs); motif_index++) {

    MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs);
    char* motif_id = get_motif_id(motif);
    char* bare_motif_id = motif_id;

    // We may have specified on the command line that
    // only certain motifs were to be used.
    if (selected_motifs != NULL) {
      if (*bare_motif_id == '+' || *bare_motif_id == '-') {
        // The selected  motif id won't included a strand indicator.
        bare_motif_id++;
      }
      if (have_string(bare_motif_id, selected_motifs) == FALSE) {
        continue;
      }
    }

    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(
        stderr, 
        "Using motif %s of width %d.\n",
        motif_id, get_motif_length(motif)
      );
    }

    // Build an array of evolutionary models for each position in the motif.
    EVOMODEL_T** models = make_motif_models(
      motif, 
      bg_freqs,
      model_type,
      fg_rate, 
      bg_rate, 
      purine_pyrimidine, 
      transition_transversion, 
      use_halpern_bruno
    );

    // Get the frequencies under the background model (row 0) 
    // and position-dependent scores (rows 1..w)
    // for each possible alignment column.
    MATRIX_T* pssm_matrix = build_alignment_pssm_matrix(
      alph,
      alignment_species,
      get_motif_length(motif) + 1, 
      models, 
      tree, 
      gap_support
    );
    ARRAY_T* alignment_col_freqs = allocate_array(get_num_cols(pssm_matrix)); 
    copy_array(get_matrix_row(0, pssm_matrix), alignment_col_freqs);
    remove_matrix_row(0, pssm_matrix);		// throw away first row
    //print_col_frequencies(alph, alignment_col_freqs);

    //
    // Get the position-dependent null model alignment column frequencies
    //
    int w = get_motif_length(motif);
    int ncols = get_num_cols(pssm_matrix); 
    MATRIX_T* pos_dep_bkg = allocate_matrix(w, ncols);
    for (i=0; i<w; i++) {
      // get the evo model corresponding to this column of the motif
      // and store it as the first evolutionary model.
      myfree(models[0]);
      // Use motif PSFM for equilibrium freqs. for model.
      ARRAY_T* site_specific_freqs = allocate_array(asize);
      int j = 0;
      for(j = 0; j < asize; j++) {
	double value = get_matrix_cell(i, j, get_motif_freqs(motif));
	set_array_item(j, value, site_specific_freqs);
      }
      if (use_halpern_bruno == FALSE) {
	models[0] = make_model(
	  model_type,
	  fg_rate,
	  transition_transversion,
	  purine_pyrimidine,
	  site_specific_freqs,
          NULL
	);
      } else {
        models[0] = make_model(
	  model_type,
	  fg_rate,
	  transition_transversion,
	  purine_pyrimidine,
	  bg_freqs,
	  site_specific_freqs
	);
      }
      // get the alignment column frequencies using this model
      MATRIX_T* tmp_pssm_matrix = build_alignment_pssm_matrix(
        alph,
	alignment_species,
	2,				// only interested in freqs under bkg
	models, 
	tree, 
	gap_support
      );
      // assemble the position-dependent background alignment column freqs.
      set_matrix_row(i, get_matrix_row(0, tmp_pssm_matrix), pos_dep_bkg);
      // chuck the pssm (not his real name)
      free_matrix(tmp_pssm_matrix);
    }

    //
    // Compute and print the score distribution under the background model
    // and under the (position-dependent) motif model.
    //
    int range = 10000;	// 10^4 gives same result as 10^5, but 10^3 differs

    // under background model
    PSSM_T* pssm = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range);

    // under position-dependent background (motif) model
    PSSM_T* pssm_pos_dep = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range);
    get_pv_lookup_pos_dep(
      pssm_pos_dep, 
      pos_dep_bkg, 
      NULL // no priors used
    );

    // print FP and FN distributions
    int num_items = get_pssm_pv_length(pssm_pos_dep);
    for (i=0; i<num_items; i++) {
      double pvf = get_pssm_pv(i, pssm);
      double pvt = get_pssm_pv(i, pssm_pos_dep);
      double fpr = pvf;
      double fnr = 1 - pvt;
      if (fpr >= 0.99999 || fnr == 0) continue;
      printf("%s score %d FPR %.3g FNR %.3g\n", motif_id, i, fpr, fnr);
    }

    // free stuff
    free_pssm(pssm);
    free_pssm(pssm_pos_dep);
    if (models != NULL) {
      int model_index;
      int num_models = get_motif_length(motif) + 1;
      for (model_index = 0; model_index < num_models; model_index++) {
        free_model(models[model_index]);
      }
      myfree(models);
    }

  } // motif

  arraylst_destroy(destroy_motif, motifs);

  /**********************************************
   * Clean up.
   **********************************************/
  // TLB may have encountered a memory corruption bug here
  // CEG has not been able to reproduce it. valgrind says all is well.
  free_array(bg_freqs);
  free_tree(TRUE, tree);
  free_string_list(selected_motifs);

  return(0);
} // main
コード例 #16
0
ファイル: mhmm.c プロジェクト: PanosFirmpas/gimmemotifs
/*************************************************************************
 * int main
 *************************************************************************/
int main(int argc, char *argv[])
{
  /* Data structures. */
  int       num_motifs;         /* The number of motifs in the model. */
  MOTIF_T   motifs[2 * MAX_MOTIFS]; /* The motifs. */
  STRING_LIST_T* motif_occurrences = NULL; /* Strings describing occurrences of
                                              motifs */
  BOOLEAN_T has_reverse_strand = FALSE;    /* MEME file contained both strands */
  ARRAY_T*  background;         /* Background probs for alphabet. */
  ORDER_T*  order_spacing;      /* Linear HMM order and spacing. */
  MATRIX_T* transp_freq = NULL; /* Matrix of inter-motif transitions freqs. */
  MATRIX_T* spacer_ave = NULL;  /* Matrix of average spacer lengths. */
  MHMM_T *  the_hmm = NULL;     /* The HMM being constructed. */

  /* Command line parameters. */
  char *    meme_filename;      /* Input file containg motifs. */
  char *    hmm_type_str;       /* HMM type. */
  HMM_T     hmm_type;
  STRING_LIST_T* requested_motifs; /* Indices of requested motifs. */
  int       request_n;          /* The user asked for the first n motifs. */
  double    e_threshold;        /* E-value threshold for motif inclusion. */
  double    complexity_threshold; // For eliminating low-complexity motifs.
  double    p_threshold;        /* p-value threshold for motif occurences. */
  char*     order_string;       /* Motif order and spacing. */
  int       spacer_states;      /* Number of states in each spacer. */
  BOOLEAN_T fim;                /* Represent spacers as free insertion
				   modules? */
  BOOLEAN_T keep_unused;        // Drop unused inter-motif transitions?
  double    trans_pseudo;       /* Transition pseudocount. */
  double    spacer_pseudo;      // Spacer (self-loop) pseudocount. */
  char*     description;        // Descriptive text to be stored in model.
  BOOLEAN_T print_header;       /* Print file header? */
  BOOLEAN_T print_params;       /* Print parameter summary? */
  BOOLEAN_T print_time;         /* Print timing data (dummy: always false). */

  /* Local variables. */
  int       i_motif;

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/
  // Define command line options.
  cmdoption const options[] = {
    {"type", OPTIONAL_VALUE},
    {"description", REQUIRED_VALUE},
    {"motif", REQUIRED_VALUE},
    {"nmotifs", REQUIRED_VALUE},
    {"ethresh", REQUIRED_VALUE},
    {"lowcomp", REQUIRED_VALUE},
    {"pthresh", REQUIRED_VALUE},
    {"order", REQUIRED_VALUE},
    {"nspacer", REQUIRED_VALUE},
    {"fim", NO_VALUE},
    {"keep-unused", NO_VALUE},
    {"transpseudo", REQUIRED_VALUE},
    {"spacerpseudo", REQUIRED_VALUE},
    {"verbosity", REQUIRED_VALUE},
    {"noheader", NO_VALUE},
    {"noparams", NO_VALUE},
    {"notime", NO_VALUE},
    {"quiet", NO_VALUE},
  };
  int option_count = 18;
  int option_index = 0;

  // Define the usage message.
  char      usage[1000] = "";
  strcat(usage, "USAGE: mhmm [options] <MEME file>\n");
  strcat(usage, "\n");
  strcat(usage, "   Options:\n");
  strcat(usage, "     --type [linear|complete|star] (default=linear)\n");
  strcat(usage, "     --description <string> (may be repeated)\n");
  strcat(usage, "     --motif <motif #> (may be repeated)\n");
  strcat(usage, "     --nmotifs <#>\n");
  strcat(usage, "     --ethresh <E-value>\n");
  strcat(usage, "     --lowcomp <value>\n");
  strcat(usage, "     --pthresh <p-value>\n");
  strcat(usage, "     --order <string>\n");
  strcat(usage, "     --nspacer <spacer length> (default=1)\n");
  strcat(usage, "     --fim\n");
  strcat(usage, "     --keep-unused\n");
  strcat(usage, "     --transpseudo <pseudocount>\n");
  strcat(usage, "     --spacerpseudo <pseudocount>\n");
  strcat(usage, "     --verbosity 1|2|3|4|5 (default=2)\n");
  strcat(usage, "     --noheader\n");
  strcat(usage, "     --noparams\n");
  strcat(usage, "     --notime\n");
  strcat(usage, "     --quiet\n");
  strcat(usage, "\n");

  /* Make sure various options are set to NULL or defaults. */
  meme_filename = NULL;
  hmm_type_str = NULL;
  hmm_type = INVALID_HMM;
  requested_motifs = new_string_list();
  request_n = 0;
  e_threshold = 0.0;
  complexity_threshold = 0.0;
  p_threshold = 0.0;
  order_string = NULL;
  spacer_states = DEFAULT_SPACER_STATES,
  fim = FALSE;
  keep_unused = FALSE;
  trans_pseudo = DEFAULT_TRANS_PSEUDO;
  spacer_pseudo = DEFAULT_SPACER_PSEUDO;
  description = NULL;
  print_header = TRUE;
  print_params = TRUE;
  print_time = FALSE;

	simple_setopt(argc, argv, option_count, options);

  // Parse the command line.
  while (1) { 
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;


    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
    	simple_getopterror(&message);
      die("Error processing command line options (%s)\n", message);
    }

    if (strcmp(option_name, "type") == 0) {
			if (option_value != NULL) {
      	hmm_type_str = option_value;
			}
    } else if (strcmp(option_name, "description") == 0) {
      description = option_value;
    } else if (strcmp(option_name, "motif") == 0) {
      add_string(option_value, requested_motifs);
    } else if (strcmp(option_name, "nmotifs") == 0) {
      request_n = atoi(option_value);
    } else if (strcmp(option_name, "ethresh") == 0) {
      e_threshold = atof(option_value);
    } else if (strcmp(option_name, "lowcomp") == 0) {
      complexity_threshold = atof(option_value);
    } else if (strcmp(option_name, "pthresh") == 0) {
      p_threshold = atof(option_value);
    } else if (strcmp(option_name, "order") == 0) {
      order_string = option_value;
    } else if (strcmp(option_name, "nspacer") == 0) {
      spacer_states = atoi(option_value);
    } else if (strcmp(option_name, "fim") == 0) {
      fim = TRUE;
    } else if (strcmp(option_name, "keep-unused") == 0) {
      keep_unused = TRUE;
    } else if (strcmp(option_name, "transpseudo") == 0) {
      trans_pseudo = atof(option_value);
    } else if (strcmp(option_name, "spacerpseudo") == 0) {
      spacer_pseudo = atof(option_value);
    } else if (strcmp(option_name, "verbosity") == 0) {
      verbosity = (VERBOSE_T)atoi(option_value);
    } else if (strcmp(option_name, "noheader") == 0) {
      print_header = FALSE;
    } else if (strcmp(option_name, "noparams") == 0) {
      print_params = FALSE;
    } else if (strcmp(option_name, "notime") == 0) {
      print_time = FALSE;
    } else if (strcmp(option_name, "quiet") == 0) {
      print_header = print_params = print_time = FALSE;
      verbosity = QUIET_VERBOSE;
    }
  }

  // Read the single required argument.
  if (option_index + 1 != argc) {
    fprintf(stderr, "%s", usage);
    exit(1);
  }
  meme_filename = argv[option_index];

  // Set up motif requests. 
  if (request_n != 0) {
    if (get_num_strings(requested_motifs) != 0) {
      die("Can't combine the -motif and -nmotifs options.\n");
    } else {
      for (i_motif = 0; i_motif < request_n; i_motif++) {
        char motif_id[MAX_MOTIF_ID_LENGTH + 1];
        sprintf(motif_id, "%d", i_motif + 1);
        add_string(motif_id, requested_motifs);
      }
    }
  }

  /* Set the model type. */
  hmm_type = convert_enum_type_str(hmm_type_str, LINEAR_HMM, HMM_STRS, 
				   NUM_HMM_T);

  /* Gotta have positive spacer length. */
  if (spacer_states <= 0) {
    die("Negative spacer length (%d).\n", spacer_states);
  }

  /* Make sure motifs weren't selected redundantly. */
  // FIXME: Add tests for complexity threshold.
  if ((get_num_strings(requested_motifs) != 0) && (e_threshold != 0.0)) {
    die("Can't use -motif or -nmotifs with -ethresh.");
  }
  if ((get_num_strings(requested_motifs) != 0) && (order_string != NULL)) {
    die("Can't use -motif or -nmotifs with -order.");
  }
  if ((order_string != NULL) && (e_threshold != 0.0)) {
    die("Can't use -ethresh and -order.");
  }

  /* Prevent trying to build a complete or star model with ordering. */
  if (order_string != NULL) {
    if (hmm_type == COMPLETE_HMM) 
      die("Can't specify motif order with a completely connected model.");
    else if (hmm_type == STAR_HMM)
      die("Can't specify motif order with a star model.");
  } 

  // Parse the order string. 
  order_spacing = create_order(order_string);

  /**********************************************
   * READING THE MOTIFS
   **********************************************/

  BOOLEAN_T read_file = FALSE;
  double pseudocount = 0;

  read_meme_file(
		 meme_filename,
		 "motif-file", // Take bg freq. from motif file.
		 pseudocount,
     REQUIRE_PSPM,
		 &num_motifs,
		 motifs,
		 &motif_occurrences,
		 &has_reverse_strand,
		 &background
		 );

  process_raw_motifs_for_model(
       &num_motifs,
       motifs,
       motif_occurrences,
       requested_motifs,
       has_reverse_strand,
       keep_unused,
       p_threshold,
       e_threshold, 
       complexity_threshold, 
       &order_spacing,
       &transp_freq,
       &spacer_ave,
       trans_pseudo,
       spacer_pseudo
  );

  /**********************************************
   * BUILDING THE HMM
   **********************************************/

  /* Build the motif-based HMM. */
  if (hmm_type == LINEAR_HMM) {

    if (order_spacing != NULL) {
      reorder_motifs(order_spacing, &num_motifs, motifs);
    }
    else {
      die("No order specified for the motifs.\n"
          "For the linear model the motif file must contain motif occurence\n" 
          "data or the motif order must be specified using "
          "the --order option.");
    }

    build_linear_hmm(
      background,
		  order_spacing,
		  spacer_states,
		  motifs,
		  num_motifs, 
		  fim,
		  &the_hmm
    );

  } else if (hmm_type == COMPLETE_HMM) {

    build_complete_hmm(
      background,
		  spacer_states,
		  motifs,
		  num_motifs,
		  transp_freq,
		  spacer_ave,
		  fim,
		  &the_hmm
    );

  } else if (hmm_type == STAR_HMM) {

    build_star_hmm(
      background,
		  spacer_states,
		  motifs,
		  num_motifs,
		  fim,
		  &the_hmm
    );

  }

  // Add some global information.
  copy_string(&(the_hmm->motif_file), meme_filename);

  /**********************************************
   * WRITING THE HMM
   **********************************************/

  /* Print the header. */
  if (print_header)
    write_header(
     program, 
     "",
		 description,
		 meme_filename,
		 NULL,
		 NULL, 
		 stdout
    );

  /* Write the HMM. */
  write_mhmm(verbosity, the_hmm, stdout);

  /* Print the program parameters. */
  if (print_params) {
    printf("Program parameters for mhmm\n");
    printf("  MEME file: %s\n", meme_filename);
    printf("  Motifs:");
    write_string_list(" ", requested_motifs, stdout);
    printf("\n");
    printf("  Model topology: %s\n",
	   convert_enum_type(hmm_type, HMM_STRS, NUM_HMM_T));
    printf("  States per spacer: %d\n", spacer_states);
    printf("  Spacers are free-insertion modules: %s\n",
	   boolean_to_string(fim));
    printf("\n");
  }

  free_array(background);
  free_string_list(requested_motifs);
  free_order(order_spacing);
  free_matrix(transp_freq);
  free_matrix(spacer_ave);
  for (i_motif = 0; i_motif < num_motifs; i_motif++)
    free_motif(&(motifs[i_motif]));
  free_mhmm(the_hmm);
  return(0);
}
コード例 #17
0
ファイル: string_list-c.cpp プロジェクト: Distrotech/aspell
extern "C" StringList * new_aspell_string_list()
{
  return new_string_list();
}
コード例 #18
0
ファイル: transfac.c プロジェクト: rreja/CRM_discovery
/***********************************************************************
 * Read TRANSFAC motifs from a TRANSFAC file.
 * Returns an arraylist of pointers to TRANSFAC_MOTIF_T
 ***********************************************************************/
ARRAYLST_T *read_motifs_from_transfac_file (
    const char* transfac_filename  // Name of TRANSFAC file or '-' for stdin IN
) {

    // Create dynamic storage for motifs
    ARRAYLST_T *motif_list = arraylst_create();

    // Open the TRANFAC file for reading.
    FILE *transfac_file = NULL;
    if (open_file(
                transfac_filename,
                "r",
                TRUE, // Allow '-' for stdin
                "transfac file",
                "",
                &transfac_file
            ) == FALSE) {
        exit(1);
    }

    // Read and parse the TRANFAC file.
    int num_bases = 4;
    char *line = NULL;
    while ((line = getline2(transfac_file)) != NULL) {

        // Split the line into an initial tag and everything else.
        char *this_accession = split(line, ' ');
        char *tag = line;

        // Have we reached a new matrix?
        if (strcmp(tag, "AC") == 0) {

            trim(this_accession);

            char *this_id = NULL;
            char *this_name = NULL;
            char *this_descr = NULL;
            char *this_species = NULL;
            char this_consensus[MAX_CONSENSUS_LENGTH];
            STRING_LIST_T *species_list = new_string_list();

            // Old versions of TRANSFAC use pee-zero; new use pee-oh.
            while (strcmp(tag, "PO") != 0 && strcmp(tag, "P0") != 0) {

                line = getline2(transfac_file);
                if (line == NULL) {
                    die ("Can't find PO line for TRANSFAC matrix %s.\n", this_accession);
                }
                char *data = split(line, ' ');
                if (data != NULL) {
                    trim(data);
                }
                tag = line;

                // Store the id line.
                if (strcmp(tag, "ID") == 0) {
                    this_id = strdup(data);
                }
                // Store the species line.
                else if (strcmp(tag, "BF") == 0) {
                    add_string(data, species_list);
                }
                // Store the name line.
                else if (strcmp(tag, "NA") == 0) {
                    this_name = strdup(data);
                }
                // Store the description line.
                else if (strcmp(tag, "DE") == 0) {
                    this_descr = strdup(data);
                }
            }

            // Check how many positions in the motif
            // Mark current position in file
            fpos_t file_position;
            errno = 0;
            int status = fgetpos(transfac_file, &file_position);
            if (status) {
                die("Error reading file %s: %s", transfac_filename, strerror(errno));
            }

            int num_motif_positions = 0;
            while (TRUE) {

                // Read till we reach the end of the counts or the end of the motif
                line = getline2(transfac_file);
                if (line == NULL) {
                    break;
                }

                char *data = split(line, ' ');
                if (data != NULL) {
                    trim(data);
                }
                tag = line;

                // Read till we reach the end of the counts or the end of the motif
                if ((strcmp(tag, "XX\n") == 0) || (strcmp(tag, "//\n") == 0)) {
                    break;
                }

                ++num_motif_positions;
            }
            // Rewind file
            errno = 0;
            status = fsetpos(transfac_file, &file_position);
            if (status) {
                die("Error reading file %s: %s", transfac_filename, strerror(errno));
            }

            // Read the motif counts.
            int num_seqs = 0;
            this_consensus[0] = 0;
            MATRIX_T *motif_counts = allocate_matrix(num_motif_positions, 4);
            int position = 0;
            while (TRUE) {

                line = getline2(transfac_file);
                if (line == NULL) {
                    break;
                }

                char *data = split(line, ' ');
                if (data != NULL) {
                    trim(data);
                }
                tag = line;

                // Look for the end of the motif.
                if ((strcmp(tag, "XX\n") == 0) || (strcmp(tag, "//\n") == 0)) {
                    break;
                }

                position = atoi(tag);
                if (position > num_motif_positions) {
                    die(
                        "Error reading motif counts at position %d of motif %s in file %s",
                        position,  this_accession, transfac_filename
                    );
                }

                // Store the contents of this row.
                int count[4];
                char consensus;
                sscanf(
                    data,
                    "%d %d %d %d %c",
                    &(count[0]),
                    &(count[1]),
                    &(count[2]),
                    &(count[3]),
                    &consensus
                );
                int i_base;
                for (i_base = 0; i_base < num_bases; i_base++) {
                    set_matrix_cell(position - 1, i_base, count[i_base], motif_counts);
                }
                this_consensus[position - 1] = consensus;

            }

            this_consensus[position] = 0;
            TRANSFAC_MOTIF_T *motif = new_transfac_motif(
                                          this_accession,
                                          this_id,
                                          this_name,
                                          this_descr,
                                          this_consensus,
                                          species_list,
                                          motif_counts
                                      );
            arraylst_add(motif, motif_list);

        }
    }

    fclose(transfac_file);
    return motif_list;

}
コード例 #19
0
ファイル: ramen.c プロジェクト: CPFL/gmeme
void ramen_scan_sequences() {
		FILE* seq_file = NULL;
		MOTIF_T* motif = NULL;
		MOTIF_T* rev_motif = NULL;
		SEQ_T* sequence = NULL;
		SCANNED_SEQUENCE_T* scanned_seq = NULL;
		PATTERN_T* pattern;
		int i;
		int j;
		SEQ_T** seq_list;
		int num_seqs;
		int seq_len;
		//For the bdb_bg mode:
		ARRAY_T* seq_bg_freqs;
		double atcontent;
		double roundatcontent;
		double avg_seq_length = 0;

		//Open the file.
		if (open_file(args.sequence_filename, "r", FALSE, "FASTA", "sequences", &seq_file) == 0) {
				fprintf(stderr, "Couldn't open the file %s.\n", args.sequence_filename);
				ramen_terminate(1);
		}

		//Start reading in the sequences
		read_many_fastas(ramen_alph, seq_file, MAX_SEQ_LENGTH, &num_seqs, &seq_list);


		seq_ids = new_string_list();
		seq_fscores = allocate_array(num_seqs);

		//Allocate the required space for results
		results = malloc(sizeof(double*) * motifs.num);
		for (i=0;i<motifs.num;i++) {
				results[i] = malloc(sizeof(double)*num_seqs);
		}

		for (j=0;j<num_seqs;j++) {

				fprintf(stderr, "\rScanning %i of %i sequences...", j+1, num_seqs);

				//copy the pointer into our current object for clarity
				sequence = seq_list[j];

				//Read the fluorescence data from the description field.
				add_string(get_seq_name(sequence),seq_ids);
				seq_len = get_seq_length(sequence);
				set_array_item(j,atof(get_seq_description(sequence)),seq_fscores);

				//Scan with each motif.
				for (i=0;i<motifs.num;i++) {
						int motifindex = i*2;

						results[i][j] = ramen_sequence_scan(sequence, motif_at(motifs.motifs, motifindex), 
											      motif_at(motifs.motifs, motifindex+1),
											      NULL, NULL, //No need to pass PSSM.
										              AVG_ODDS, 0, TRUE, 0, motifs.bg_freqs);

						if (TRUE == args.linreg_normalise) {
								int k;
								double maxscore = 1;
								motif = motif_at(motifs.motifs,motifindex); 
								for (k=0;k<get_motif_length(motif);k++) {
										double maxprob = 0;
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif));
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif));
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif));
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif));
										maxscore *= maxprob;
								}
								results[i][j] /= maxscore;
						}
				}
		}

}
コード例 #20
0
ファイル: ama.c プロジェクト: a1aks/Haystack
/*************************************************************************
 * Entry point for ama
 *************************************************************************/
int main(int argc, char *argv[]) {
  int max_seq_length = MAX_SEQ;
  STRING_LIST_T* selected_motifs = NULL;
  double pseudocount = 0.01;
  int output_format = CISML_FORMAT;
  program_name = "ama";
  int scoring = AVG_ODDS;
  BOOLEAN_T pvalues = FALSE;
  BOOLEAN_T normalize_scores = FALSE;
  BOOLEAN_T combine_duplicates = FALSE;
  int num_gc_bins = 1;
  int sdbg_order = -1;				// don't use sequence background
  BOOLEAN_T scan_both_strands = TRUE;
  ARRAY_T* pos_bg_freqs = NULL;
  ARRAY_T* rev_bg_freqs = NULL;
  clock_t c0, c1; /* measuring cpu_time */
  CISML_T *cisml;
  char * out_dir = NULL;
  BOOLEAN_T clobber = FALSE;
  int i;
  int last = 0;
  ALPH_T alph = INVALID_ALPH;

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/

  const int num_options = 16;
  cmdoption const motif_scan_options[] = {
    { "max-seq-length", REQUIRED_VALUE },
    { "motif", REQUIRED_VALUE },
    { "motif-pseudo", REQUIRED_VALUE },
    { "rma", NO_VALUE },
    { "pvalues", NO_VALUE },
    { "sdbg", REQUIRED_VALUE },
    { "norc", NO_VALUE },
    { "cs", NO_VALUE },
    { "o-format", REQUIRED_VALUE },
    { "o", REQUIRED_VALUE },
    { "oc", REQUIRED_VALUE },
    { "scoring", REQUIRED_VALUE },
    { "verbosity", REQUIRED_VALUE },
    { "gcbins", REQUIRED_VALUE },
    { "last", REQUIRED_VALUE },
    { "version", NO_VALUE }
  };

  int option_index = 0;

  // Define the usage message.
  char usage[] = "USAGE: ama [options] <motif file> <sequence file> [<background file>]\n"
    "\n"
    "   Options:\n"
    "     --sdbg <order>\t\t\tUse Markov background model of\n"
    "       \t\t\t\t\torder <order> derived from the sequence\n"
    "       \t\t\t\t\tto compute its likelihood ratios.\n"
    "       \t\t\t\t\tOverrides --pvalues, --gcbins and --rma;\n"
    "       \t\t\t\t\t<background file> is required unless\n"
    "       \t\t\t\t\t--sdbg is given.\n"
    "     --motif <id>\t\t\tUse only the motif identified by <id>.\n"
    "       \t\t\t\t\tThis option may be repeated.\n"
    "     --motif-pseudo <float>\t\tThe value <float> times the background\n"
    "       \t\t\t\t\tfrequency is added to the count of each\n"
    "       \t\t\t\t\tletter when creating the likelihood \n"
    "       \t\t\t\t\tratio matrix (default: %g).\n"
    "     --norc\t\t\t\tDisables the scanning of the reverse\n"
    "       \t\t\t\t\tcomplement strand.\n"
    "     --scoring [avg-odds|max-odds]\tIndicates whether the average or \n"
    "       \t\t\t\t\tthe maximum odds should be calculated\n"
    "       \t\t\t\t\t(default: avg-odds)\n"
    "     --rma\t\t\t\tScale motif scores to the range 0-1.\n"
    "       \t\t\t\t\t(Relative Motif Affinity).\n"
    "       \t\t\t\t\tMotif scores are scaled by the maximum\n"
    "       \t\t\t\t\tscore achievable by that PWM. (default:\n"
    "       \t\t\t\t\tmotif scores are not normalized)\n"
    "     --pvalues\t\t\t\tPrint p-value of avg-odds score in cisml\n"
    "       \t\t\t\t\toutput. Ignored for max-odds scoring.\n"
    "       \t\t\t\t\t(default: p-values are not printed)\n"
    "     --gcbins <bins>\t\t\tCompensate p-values for GC content of\n"
    "       \t\t\t\t\teach sequence using given number of \n"
    "       \t\t\t\t\tGC range bins. Recommended bins: 41.\n"
    "       \t\t\t\t\t(default: p-values are based on\n"
    "       \t\t\t\t\tfrequencies in background file)\n"
    "     --cs\t\t\t\tEnable combining sequences with same\n"
    "       \t\t\t\t\tidentifier by taking the average score\n"
    "       \t\t\t\t\tand the Sidac corrected p-value.\n"
    "     --o-format [gff|cisml]\t\tOutput file format (default: cisml)\n"
    "       \t\t\t\t\tignored if --o or --oc option used\n"
    "     --o <directory>\t\t\tOutput all available formats to\n"
    "       \t\t\t\t\t<directory>; give up if <directory>\n"
    "       \t\t\t\t\texists\n"
    "     --oc <directory>\t\t\tOutput all available formats to\n"
    "       \t\t\t\t\t<directory>; if <directory> exists\n"
    "       \t\t\t\t\toverwrite contents\n"
    "     --verbosity [1|2|3|4]\t\tControls amount of screen output\n"
    "       \t\t\t\t\t(default: %d)\n"
    "     --max-seq-length <int>\t\tSet the maximum length allowed for \n"
    "       \t\t\t\t\tinput sequences. (default: %d)\n"
    "     --last <int>\t\t\tUse only scores of (up to) last <n>\n"
    "       \t\t\t\t\tsequence positions to compute AMA.\n"
    "     --version   \t\t\tPrint version and exit.\n"
    "\n";

  // Parse the command line.
  if (simple_setopt(argc, argv, num_options, motif_scan_options) != NO_ERROR) {
    die("Error processing command line options: option name too long.\n");
  }
    
    BOOLEAN_T setoutputformat = FALSE;
    BOOLEAN_T setoutputdirectory = FALSE;

  while (TRUE) {
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
      (void) simple_getopterror(&message);
      die("Error processing command line options (%s).\n", message);
    } else if (strcmp(option_name, "max-seq-length") == 0) {
	max_seq_length = atoi(option_value);
    } else if (strcmp(option_name, "norc") == 0) {
	scan_both_strands = FALSE;
    } else if (strcmp(option_name, "cs") == 0) {
		combine_duplicates = TRUE;
    } else if (strcmp(option_name, "motif") == 0) {
	if (selected_motifs == NULL) {
	  selected_motifs = new_string_list();
	}
	add_string(option_value, selected_motifs);
    } else if (strcmp(option_name, "motif-pseudo") == 0) {
	pseudocount = atof(option_value);
    } else if (strcmp(option_name, "o-format") == 0) {
        if (setoutputdirectory) {
            if (verbosity >= NORMAL_VERBOSE)
                fprintf(stderr, "output directory specified, ignoring --o-format\n");
        } else {
            setoutputformat = TRUE;
            if (strcmp(option_value, "gff") == 0)
                output_format = GFF_FORMAT;
            else if (strcmp(option_value, "cisml") == 0)
                output_format = CISML_FORMAT;
            else {
                if (verbosity >= NORMAL_VERBOSE)
                  fprintf(stderr, "Output format not known. Using standard instead (cisML).\n");
                  output_format = CISML_FORMAT;
            }
        }
    } else if (strcmp(option_name, "o") == 0 || strcmp(option_name, "oc") == 0) {
        setoutputdirectory = TRUE;
        if (setoutputformat) {
            if (verbosity >= NORMAL_VERBOSE)
                fprintf(stderr, "output directory specified, ignoring --o-format\n");
        }
        clobber = strcmp(option_name, "oc") == 0;
        out_dir = (char*) malloc (sizeof(char)*(strlen(option_value)+1));
        strcpy(out_dir, option_value);
        output_format = DIRECTORY_FORMAT;
    } else if (strcmp(option_name, "verbosity") == 0) {
	verbosity = atoi(option_value);
    } else if (strcmp(option_name, "scoring") == 0) {
      if (strcmp(option_value, "max-odds") == 0)
	scoring = MAX_ODDS;
      else if (strcmp(option_value, "avg-odds") == 0)
	scoring = AVG_ODDS;
      else if (strcmp(option_value, "sum-odds") == 0)
	scoring = SUM_ODDS;
	  else
	die("Specified scoring scheme not known.\n", message);
    } else if (strcmp(option_name, "pvalues") == 0) {
      pvalues = TRUE;
    } else if (strcmp(option_name, "rma") == 0) {
      normalize_scores = TRUE;
      fprintf(stderr, "Normalizing motif scores using RMA method.\n");
    } else if (strcmp(option_name, "gcbins") == 0) {
      num_gc_bins = atoi(option_value);
      pvalues = TRUE;
      if (num_gc_bins <= 1) die("Number of bins in --gcbins must be greater than 1.\n", message);
    } else if (strcmp(option_name, "sdbg") == 0) {
      sdbg_order = atoi(option_value);			// >=0 means use sequence bkg
    }
    else if (strcmp(option_name, "last") == 0) {
      int i = 0;
      if (option_value[0] == '-') ++i;
      while (option_value[i] != '\0') {
        if (!isdigit(option_value[i])) {
          die("Specified parameter 'last' contains non-numeric characters.\n");
        }
        ++i;
      }
      last = atoi(option_value);
      if (errno != 0) {
        die("Specified parameter 'last' could not be parsed as a number as:\n%s\n",strerror(errno));
      }
      if (last < 0) {
        die("Specified parameter 'last' had negative value (%d) when only postive or zero values are allowed \n", last);
      }
    }
    else if (strcmp(option_name, "version") == 0) {
      fprintf(stdout, VERSION "\n");
      exit(EXIT_SUCCESS);
    }
  }

  // --sdbg overrides --pvalues and --gcbins and --rma
  int req_args = 3;
  if (sdbg_order >= 0) {
    pvalues = FALSE;
    normalize_scores = FALSE;
    num_gc_bins = 1;
    req_args = 2;
  }

  // Check all required arguments given
  if (sdbg_order >= 0 && argc > option_index + req_args) {
    die("<background file> cannot be given together with --sdbg.\n");
  } else if (argc != option_index + req_args) {
    fprintf(stderr, usage, pseudocount, verbosity, max_seq_length);
    exit(EXIT_FAILURE);
  }

  // Get required arguments. 
  char* motif_filename = argv[option_index];
  option_index++;
  char* fasta_filename = argv[option_index];
  option_index++;
  char* bg_filename;
  if (req_args == 3) {			// required unless --sdbg given
    bg_filename = argv[option_index];
    option_index++;
  } else {
    bg_filename = "--uniform--";	// So PSSMs will use uniform background;
					// we can multiply them out later.
  }

  // measure time
  c0 = clock();

  // Set up hash tables for computing reverse complement if doing --sdbg
  if (sdbg_order >= 0) setup_hash_alph(DNAB);

  // Create cisml data structure for recording results
  cisml = allocate_cisml(program_name, motif_filename, fasta_filename);
  set_cisml_background_file(cisml, bg_filename);

  /**********************************************
   * Read the motifs and background model.
   **********************************************/
  int num_motifs = 0;
  MREAD_T *mread;
  ARRAYLST_T *motifs;
  PSSM_PAIR_T** pssm_pairs;	// note pssm_pairs is an array of pointers

  //this reads any meme file, xml, txt and html
  mread = mread_create(motif_filename, OPEN_MFILE);
  mread_set_bg_source(mread, bg_filename);
  mread_set_pseudocount(mread, pseudocount);

  motifs = mread_load(mread, NULL);
  alph = mread_get_alphabet(mread);
  pos_bg_freqs = mread_get_background(mread);

  mread_destroy(mread);

  num_motifs = arraylst_size(motifs);

  // allocate memory for PSSM pairs
  pssm_pairs = (PSSM_PAIR_T**)mm_malloc(sizeof(PSSM_PAIR_T*) * num_motifs);

  if (verbosity >= NORMAL_VERBOSE) 
    fprintf(stderr, "Number of motifs in file %d.\n", num_motifs);

  // make a CISML pattern to hold scores for each motif
  PATTERN_T** patterns = NULL;
  Resize(patterns, num_motifs, PATTERN_T*);
  int motif_index;
  for (motif_index = 0; motif_index < num_motifs; motif_index++) {
    MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs);
    patterns[motif_index] = allocate_pattern(get_motif_id(motif), "");
    add_cisml_pattern(cisml, patterns[motif_index]);
  }

  // make reverse complement motifs and background frequencies.
  if (scan_both_strands == TRUE) {
    add_reverse_complements(motifs);
    assert(arraylst_size(motifs) == (2 * num_motifs));
    rev_bg_freqs = allocate_array(get_array_length(pos_bg_freqs));
    complement_dna_freqs(pos_bg_freqs, rev_bg_freqs);
  }

  /**************************************************************
   * Convert motif matrices into log-odds matrices.
   * Scale them.
   * Compute the lookup tables for the PDF of scaled log-odds scores.
   **************************************************************/
  int ns = scan_both_strands ? 2 : 1;	// number of strands
  for (motif_index = 0; motif_index < num_motifs; motif_index++) {
    MOTIF_T *motif, *motif_rc;
    motif = (MOTIF_T*)arraylst_get(motif_index*ns, motifs);
    if (scan_both_strands)
      motif_rc = (MOTIF_T*)arraylst_get(motif_index*ns + 1, motifs);
    else
      motif_rc = NULL;
    /*
     *  Note: If scanning both strands, we complement the motif frequencies
     *  but not the background frequencies so the motif looks the same.
     *  However, the given frequencies are used in computing the p-values
     *  since they represent the frequencies on the negative strands.
     *  (If we instead were to complement the input sequence, keeping the
     *  the motif fixed, we would need to use the complemented frequencies
     *  in computing the p-values.  Is that any clearer?)
    */
    double range = 300;		// 100 is not very good; 1000 is great but too slow
    PSSM_T* pos_pssm =
      build_motif_pssm(
        motif, 
        pos_bg_freqs, 
        pos_bg_freqs, 
        NULL, // Priors not used
        0.0L, // alpha not used
        range, 
        num_gc_bins, 
        TRUE
      );
    PSSM_T* neg_pssm = (scan_both_strands ?
      build_motif_pssm(
        motif_rc, 
        rev_bg_freqs, 
        pos_bg_freqs, 
        NULL, // Priors not used
        0.0L, // alpha not used
        range, 
        num_gc_bins, 
        TRUE
      )
      : NULL
    );
    pssm_pairs[motif_index] = create_pssm_pair(pos_pssm, neg_pssm);
  }

  // Open the FASTA file for reading.
  FILE* fasta_file = NULL;
  if (open_file(fasta_filename, "r", FALSE, "FASTA", "sequences", &fasta_file) == 0) {
    die("Couldn't open the file %s.\n", fasta_filename);
  }
  if (verbosity >= NORMAL_VERBOSE) {
    if (last == 0) {
      fprintf(stderr, "Using entire sequence\n");
    } else {
      fprintf(stderr, "Limiting sequence to last %d positions.\n", last);
    }
  }

  /**************************************************************
   * Read in all sequences and score with all motifs
   **************************************************************/
  int seq_loading_num = 0;  // keeps track on the number of sequences read in total
  int seq_counter = 0;		// holds the index to the seq in the pattern
  int unique_seqs = 0;      // keeps track on the number of unique sequences
  BOOLEAN_T need_postprocessing = FALSE;
  SEQ_T* sequence = NULL;
  RBTREE_T* seq_ids = rbtree_create(rbtree_strcasecmp,NULL,free,rbtree_intcpy,free);
  RBNODE_T* seq_node;
  BOOLEAN_T created;
  while (read_one_fasta(alph, fasta_file, max_seq_length, &sequence)) {
    ++seq_loading_num;
	created = FALSE;
    char* seq_name = get_seq_name(sequence);
    int seq_len = get_seq_length(sequence);
    int scan_len;
    if (last != 0) {
      scan_len = last;
    } else {
      scan_len = seq_len;
    }
	  
	// red-black trees are only required if duplicates should be combined
	if (combine_duplicates){
		//lookup seq id and create new entry if required, return sequence index
		char *tmp_id = mm_malloc(strlen(seq_name)+1); // required copy for rb-tree
		strncpy(tmp_id,seq_name,strlen(seq_name)+1);
		seq_node = rbtree_lookup(seq_ids, tmp_id, TRUE, &created);
		if (created) {// assign it a loading number
			rbtree_set(seq_ids, seq_node, &unique_seqs);
			seq_counter = unique_seqs;
			++unique_seqs;
		} else {
			seq_counter = *((int*)rbnode_get(seq_node));
		}
	}
	  
    //
    // Set up sequence-dependent background model and compute
    // log cumulative probability of sequence.
    //
    double *logcumback = NULL;                    // array of log cumulative probs.
    if (sdbg_order >= 0) {
      Resize(logcumback, seq_len+1, double);
      char* raw_seq = get_raw_sequence(sequence);
      BOOLEAN rc = FALSE;
      double *a_cp = get_markov_from_sequence(raw_seq, alph_string(alph), rc, sdbg_order, 0);
      log_cum_back(raw_seq, a_cp, sdbg_order, logcumback);
      myfree(a_cp);
    }

    // Get the GC content of the sequence if binning p-values by GC
    // and store it in the sequence object.
    if (num_gc_bins > 1) {
      ARRAY_T *freqs = get_sequence_freqs(sequence, alph);
      set_total_gc_sequence(sequence,
        get_array_item(1,freqs) + get_array_item(2,freqs));	// f(C) + f(G)
      free_array(freqs);			// clean up
    } else {
      set_total_gc_sequence(sequence, -1);	// flag ignore
    }

    /**************************************************************
     * Process all motifs.
     **************************************************************/
    int ns = scan_both_strands ? 2 : 1;
    for (motif_index = 0; motif_index < num_motifs; motif_index++) {
      PATTERN_T *pattern = patterns[motif_index];
      MOTIF_T* motif = (MOTIF_T*)arraylst_get(ns*motif_index, motifs);
      char* motif_id = (scan_both_strands ? get_motif_st_id(motif) : get_motif_id(motif));
      if (verbosity >= HIGH_VERBOSE) {
        fprintf(stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif));
      }
      if ((selected_motifs == NULL) || (have_string(get_motif_id(motif), selected_motifs) == TRUE)) {
        if (verbosity >= HIGHER_VERBOSE) {
          fprintf(stderr, "Scanning %s sequence with length %d "
              "abbreviated to %d with motif %s with length %d.\n",
              seq_name, seq_len, scan_len, motif_id, get_motif_length(motif));
        }
		SCANNED_SEQUENCE_T* scanned_seq = NULL;

		
		if (!combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter){
			// Create a scanned_sequence record and save it in the pattern.
			scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern);
			set_scanned_sequence_length(scanned_seq, scan_len);
		} else {
			// get existing sequence record
			scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter];
			set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq)));
		}
		
		// check if scanned component of sequence has sufficient length for the motif
		if (scan_len < get_motif_length(motif)) {
			// set score to zero and p-value to 1 if not set yet
			if(!has_scanned_sequence_score(scanned_seq)){
				set_scanned_sequence_score(scanned_seq, 0.0);
			}
			if(pvalues && !has_scanned_sequence_pvalue(scanned_seq)){
				set_scanned_sequence_pvalue(scanned_seq, 1.0);
			} 
			add_scanned_sequence_scanned_position(scanned_seq); 
			if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) need_postprocessing = TRUE;
			if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "%s too short for motif %s. Score set to 0!\n", seq_name, motif_id);
		} else {  
			// scan the sequence using average/maximum motif affinity
			ama_sequence_scan(alph, sequence, logcumback, pssm_pairs[motif_index], scoring, 
							  pvalues, last, scanned_seq, &need_postprocessing);
		}

      } else {
        if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "Skipping motif %s.\n", motif_id);
      }
    } // All motifs parsed

    free_seq(sequence);
    if (sdbg_order >= 0) myfree(logcumback);

  } // read sequences