/**********************************************************************
  shuffle_sequence()

  Build a shuffled copy of a sequence. The copy is created with the
  name and offset of the original and the description "shuffled"; its
  characters are the characters of the original drawn one at a time,
  without replacement, in random order.

  NOTE(review): the generator is seeded through my_srand() but the
  draws come from rand(); confirm that both refer to the same
  generator, otherwise the seed has no effect on the result.
**********************************************************************/
void shuffle_sequence(
  SEQ_T* seq,         /* original sequence IN */
  unsigned int seed,  /* seed IN */
  SEQ_T** target      /* target sequence OUT */
){
  my_srand(seed);

  /* The output slot is expected to be empty. When assertions are
     compiled out, a sequence that is already there is released. */
  assert(*target==NULL);
  if (*target != NULL){
    free_seq(*target);
  }

  /* The new sequence starts out as a plain copy; its characters are
     overwritten in place below. */
  *target = allocate_seq(get_seq_name(seq), "shuffled", get_seq_offset(seq),
                         get_raw_sequence(seq));
  char *shuffled = get_raw_sequence(*target);

  /* Scratch pool holding the characters that have not been drawn yet. */
  const int length = get_seq_length(seq);
  char *pool = (char*)mm_calloc(length + 1, sizeof(char));
  strcpy(pool, get_raw_sequence(seq));
  pool[length] = '\0';

  int out = 0;
  int remaining;
  for (remaining = length; remaining > 0; remaining--) {
    /* Draw one of the characters still in the pool. */
    const int pick = rand() % remaining;
    shuffled[out++] = pool[pick];

    /* Close the gap left by the drawn character: everything behind it,
       terminator included, moves down by one position. */
    memmove(pool + pick, pool + pick + 1, (size_t)(remaining - pick));
  }

  myfree(pool);
}
/*
 * PlaySeq()
 *
 * Calls StopSeq() and free_seq(), then fills in curr_seq: the message
 * type is set to SNDSYS_PLAYSEQ and the sequence file, the bank file and
 * the four "war" files are loaded into their slots with LoadFile().
 * The complete curr_seq structure is finally sent as a data message on
 * the FIFO_SNDSYS channel.
 */
void PlaySeq(const char* seqFile, const char* bnkFile, const char* war1, const char* war2, const char* war3, const char* war4)
{
	const char* warFiles[4];
	int slot;

	warFiles[0] = war1;
	warFiles[1] = war2;
	warFiles[2] = war3;
	warFiles[3] = war4;

	StopSeq();
	free_seq();

	curr_seq.msg = SNDSYS_PLAYSEQ;

	LoadFile(&curr_seq.seq, seqFile);
	LoadFile(&curr_seq.bnk, bnkFile);

	/* One LoadFile() call per "war" slot, in slot order. */
	for (slot = 0; slot < 4; slot++) {
		LoadFile(curr_seq.war + slot, warFiles[slot]);
	}

	fifoSendDatamsg(FIFO_SNDSYS, sizeof(curr_seq), (u8*) &curr_seq);
}
/****************************************************************************
 * Cut a window of columns out of an alignment.
 *
 * start      index of the first column to keep
 * width      number of columns to keep
 * alignment  alignment to cut from
 *
 * Returns a new alignment, created with allocate_alignment(), that has the
 * same name and description as the input and contains, for every sequence
 * and for the consensus string, the `width` characters starting at `start`.
 * No range check is done on `start` or `width`.
 ****************************************************************************/
ALIGNMENT_T* extract_subalignment
  (int start,
   int width,
   ALIGNMENT_T* alignment)
{
  const int n_seqs = get_num_aligned_sequences(alignment);
  SEQ_T** full_seqs = get_alignment_sequences(alignment);
  SEQ_T** cut_seqs = (SEQ_T**)mm_malloc(n_seqs * sizeof(SEQ_T*));

  // One scratch buffer is reused for every sequence; allocate_seq() is
  // handed the buffer again on each pass.
  char* window = mm_malloc((width + 1) * sizeof(char));
  int idx;

  for (idx = 0; idx < n_seqs; idx++) {
    SEQ_T* source = full_seqs[idx];
    char* source_chars = get_raw_sequence(source);

    strncpy(window, source_chars + start, width);
    window[width] = '\0';

    cut_seqs[idx] = allocate_seq(get_seq_name(source),
                                 get_seq_description(source),
                                 get_seq_offset(source),
                                 window);
  }

  // Same window, applied to the consensus string.
  // NOTE(review): this buffer is handed to allocate_alignment() and never
  // released here -- confirm whether allocate_alignment() keeps the pointer
  // or makes its own copy.
  char* full_consensus = get_consensus_string(alignment);
  char* cut_consensus = mm_malloc(sizeof(char) * (width + 1));
  strncpy(cut_consensus, full_consensus + start, width);
  cut_consensus[width] = '\0';

  ALIGNMENT_T* result = allocate_alignment(get_alignment_name(alignment),
                                           get_alignment_description(alignment),
                                           n_seqs,
                                           cut_seqs,
                                           cut_consensus);

  // Release the temporary sequences and the scratch buffer.
  idx = 0;
  while (idx < n_seqs) {
    free_seq(cut_seqs[idx]);
    idx++;
  }
  myfree(cut_seqs);
  myfree(window);

  return(result);
}
/****************************************************************************
 * Release an alignment object together with its consensus string and all
 * of its sequences. Passing NULL is allowed and does nothing.
 ****************************************************************************/
void free_alignment(ALIGNMENT_T* alignment)
{
  int idx;

  if (alignment == NULL) {
    return;
  }

  if (alignment->consensus_string != NULL) {
    myfree(alignment->consensus_string);
  }

  if (alignment->sequences != NULL) {
    // Every slot of the sequence array is expected to be populated.
    idx = 0;
    while (idx < alignment->num_sequences) {
      assert(alignment->sequences[idx] != NULL);
      free_seq(alignment->sequences[idx]);
      idx++;
    }
    myfree(alignment->sequences);
  }

  myfree(alignment);
}
dsEnqError list_start() { struct ds_search_arg search_arg; struct ds_search_result result; struct DSError error; dsEnqError return_error; return_error = Okay; if (get_default_service (&search_arg.sra_common) != 0) { return localdsaerror; } search_arg.sra_common.ca_servicecontrol.svc_options = SVC_OPT_PREFERCHAIN; search_arg.sra_baseobject = (*base_path != 'T'? str2dn (base_path): NULLDN); search_arg.sra_eis.eis_allattributes = FALSE; search_arg.sra_eis.eis_infotypes = EIS_ATTRIBUTETYPESONLY; search_arg.sra_eis.eis_select = 0; search_arg.sra_searchaliases = TRUE; search_arg.sra_subset = SRA_ONELEVEL; search_arg.sra_filter = filter_alloc(); search_arg.sra_filter->flt_type = FILTER_NOT; search_arg.sra_filter->flt_next = NULLFILTER; search_arg.sra_filter->flt_un.flt_un_filter = filter_alloc(); search_arg.sra_filter->flt_un.flt_un_filter->flt_type = FILTER_ITEM; search_arg.sra_filter->flt_un.flt_un_filter->flt_next = NULLFILTER; search_arg.sra_filter->flt_un.flt_un_filter->flt_un.flt_un_item.fi_type = FILTERITEM_EQUALITY; search_arg.sra_filter->flt_un.flt_un_filter->flt_un.flt_un_item.fi_un. fi_un_ava.ava_type = AttrT_new("2.5.4.0"); search_arg.sra_filter->flt_un.flt_un_filter->flt_un.flt_un_item.fi_un. fi_un_ava.ava_value = str2AttrV("dsa", search_arg.sra_filter->flt_un.flt_un_filter-> flt_un.flt_un_item.fi_un.fi_un_ava.ava_type-> oa_syntax); #ifndef NO_STATS LLOG (log_stat,LLOG_NOTICE,("search +%s,extent %d, val objectClass != dsa", base_path,search_arg.sra_subset)); #endif if (search_arg.sra_filter->flt_un.flt_un_filter->flt_un.flt_un_item. 
fi_un.fi_un_ava.ava_value == NULLAttrV) { return_error = localdsaerror; } else if (ds_search (&search_arg, &error, &result) != DS_OK) { free_seq(dnseq); dnseq = NULLDS; dn_number = 0; log_ds_error(&error); ds_error_free(&error); switch (error.dse_type) { case DSE_LOCALERROR: return_error = duaerror; break; case DSE_REMOTEERROR: return_error = localdsaerror; break; case DSE_ATTRIBUTEERROR: return_error = attributerror; break; case DSE_REFERRAL: case DSE_DSAREFERRAL: return_error = remotedsaerror; break; case DSE_SECURITYERROR: return_error = security; break; case DSE_NAMEERROR: return_error = namerror; break; case DSE_SERVICEERROR: return_error = serviceerror; break; default: return_error = localdsaerror; break; } } else { dn_number = 0; if (result.CSR_entries != NULLENTRYINFO) { register EntryInfo *ptr; free_seq(dnseq); dnseq = NULLDS; dn_number = 0; for (ptr = result.CSR_entries; ptr != NULLENTRYINFO; ptr = ptr->ent_next) { dn_number++; dn2buf ((caddr_t)ptr->ent_dn, goto_path); add_seq (&dnseq, goto_path); } if (dn_number) dnseq = SortList(dnseq); } else if (result.CSR_limitproblem == LSR_NOLIMITPROBLEM) { free_seq(dnseq); dnseq = NULLDS; dn_number = 0; return_error = nothingfound; } if (result.CSR_limitproblem != LSR_NOLIMITPROBLEM) { switch (result.CSR_limitproblem) { case LSR_TIMELIMITEXCEEDED: if (dn_number > 0) return_error = timelimit_w_partial; else { free_seq(dnseq); dnseq = NULLDS; return_error = timelimit; } break; case LSR_SIZELIMITEXCEEDED: return_error = listsizelimit; break; case LSR_ADMINSIZEEXCEEDED: if (dn_number > 0) return_error = adminlimit_w_partial; else { free_seq(dnseq); dnseq = NULLDS; return_error = adminlimit; } break; } } if (result.CSR_entries) entryinfo_free(result.CSR_entries, 0); } entry_number = dn_number; filter_free(search_arg.sra_filter); dn_free(search_arg.sra_baseobject); ds_error_free(&error); return return_error; }
/************************************************************************* * Entry point for ama *************************************************************************/ int main(int argc, char **argv) { AMA_OPTIONS_T options; ARRAYLST_T *motifs; clock_t c0, c1; // measuring cpu_time MOTIF_AND_PSSM_T *combo; CISML_T *cisml; PATTERN_T** patterns; PATTERN_T *pattern; FILE *fasta_file, *text_output, *cisml_output; int i, seq_loading_num, seq_counter, unique_seqs, seq_len, scan_len, x1, x2, y1, y2; char *seq_name, *path; bool need_postprocessing, created; SEQ_T *sequence; RBTREE_T *seq_ids; RBNODE_T *seq_node; double *logcumback; ALPH_T *alph; // process the command process_command_line(argc, argv, &options); // load DNA motifs motifs = load_motifs(&options); // get the alphabet if (arraylst_size(motifs) > 0) { combo = (MOTIF_AND_PSSM_T*)arraylst_get(0, motifs); alph = alph_hold(get_motif_alph(combo->motif)); } else { alph = alph_dna(); } // pick columns for GC operations x1 = -1; x2 = -1; y1 = -1; y2 = -1; if (alph_size_core(alph) == 4 && alph_size_pairs(alph) == 2) { x1 = 0; // A x2 = alph_complement(alph, x1); // T y1 = (x2 == 1 ? 2 : 1); // C y2 = alph_complement(alph, y1); // G assert(x1 != x2 && y1 != y2 && x1 != y1 && x2 != y2 && x1 != y2 && x2 != y1); } // record starting time c0 = clock(); // Create cisml data structure for recording results cisml = allocate_cisml(PROGRAM_NAME, options.command_line, options.motif_filename, options.fasta_filename); set_cisml_background_file(cisml, options.bg_filename); // make a CISML pattern to hold scores for each motif for (i = 0; i < arraylst_size(motifs); i++) { combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs); add_cisml_pattern(cisml, allocate_pattern(get_motif_id(combo->motif), "")); } // Open the FASTA file for reading. 
fasta_file = NULL; if (!open_file(options.fasta_filename, "r", false, "FASTA", "sequences", &fasta_file)) { die("Couldn't open the file %s.\n", options.fasta_filename); } if (verbosity >= NORMAL_VERBOSE) { if (options.last == 0) { fprintf(stderr, "Using entire sequence\n"); } else { fprintf(stderr, "Limiting sequence to last %d positions.\n", options.last); } } // // Read in all sequences and score with all motifs // seq_loading_num = 0; // keeps track on the number of sequences read in total seq_counter = 0; // holds the index to the seq in the pattern unique_seqs = 0; // keeps track on the number of unique sequences need_postprocessing = false; sequence = NULL; logcumback = NULL; seq_ids = rbtree_create(rbtree_strcasecmp,rbtree_strcpy,free,rbtree_intcpy,free); while (read_one_fasta(alph, fasta_file, options.max_seq_length, &sequence)) { ++seq_loading_num; seq_name = get_seq_name(sequence); seq_len = get_seq_length(sequence); scan_len = (options.last != 0 ? options.last : seq_len); // red-black trees are only required if duplicates should be combined if (options.combine_duplicates){ //lookup seq id and create new entry if required, return sequence index seq_node = rbtree_lookup(seq_ids, get_seq_name(sequence), true, &created); if (created) { // assign it a loading number rbtree_set(seq_ids, seq_node, &unique_seqs); seq_counter = unique_seqs; ++unique_seqs; } else { seq_counter = *((int*)rbnode_get(seq_node)); } } // // Set up sequence-dependent background model and compute // log cumulative probability of sequence. // This needs the sequence in raw format. // if (options.sdbg_order >= 0) logcumback = log_cumulative_background(alph, options.sdbg_order, sequence); // Index the sequence, throwing away the raw format and ambiguous characters index_sequence(sequence, alph, SEQ_NOAMBIG); // Get the GC content of the sequence if binning p-values by GC // and store it in the sequence object. 
if (options.num_gc_bins > 1) { ARRAY_T *freqs = get_sequence_freqs(sequence, alph); set_total_gc_sequence(sequence, get_array_item(y1, freqs) + get_array_item(y2, freqs)); // f(C) + f(G) free_array(freqs); // clean up } else { set_total_gc_sequence(sequence, -1); // flag ignore } // Scan with motifs. for (i = 0; i < arraylst_size(motifs); i++) { pattern = get_cisml_patterns(cisml)[i]; combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs); if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Scanning %s sequence with length %d " "abbreviated to %d with motif %s with length %d.\n", seq_name, seq_len, scan_len, get_motif_id(combo->motif), get_motif_length(combo->motif)); } SCANNED_SEQUENCE_T* scanned_seq = NULL; if (!options.combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter) { // Create a scanned_sequence record and save it in the pattern. scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern); set_scanned_sequence_length(scanned_seq, scan_len); } else { // get existing sequence record scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter]; set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq))); } // check if scanned component of sequence has sufficient length for the motif if (scan_len < get_motif_length(combo->motif)) { // set score to zero and p-value to 1 if not set yet if(!has_scanned_sequence_score(scanned_seq)){ set_scanned_sequence_score(scanned_seq, 0.0); } if(options.pvalues && !has_scanned_sequence_pvalue(scanned_seq)){ set_scanned_sequence_pvalue(scanned_seq, 1.0); } add_scanned_sequence_scanned_position(scanned_seq); if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) { need_postprocessing = true; } if (verbosity >= HIGH_VERBOSE) { fprintf(stderr, "%s too short for motif %s. 
Score set to 0.\n", seq_name, get_motif_id(combo->motif)); } } else { // scan the sequence using average/maximum motif affinity ama_sequence_scan(alph, sequence, logcumback, combo->pssm_pair, options.scoring, options.pvalues, options.last, scanned_seq, &need_postprocessing); } } // All motifs scanned free_seq(sequence); if (options.sdbg_order >= 0) myfree(logcumback); } // read sequences fclose(fasta_file); if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "(%d) sequences read in.\n", seq_loading_num); if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Finished \n"); // if any sequence identifier was multiple times in the sequence set then // postprocess of the data is required if (need_postprocessing || options.normalize_scores) { post_process(cisml, motifs, options.normalize_scores); } // output results if (options.output_format == DIRECTORY_FORMAT) { if (create_output_directory(options.out_dir, options.clobber, verbosity > QUIET_VERBOSE)) { // only warn in higher verbose modes fprintf(stderr, "failed to create output directory `%s' or already exists\n", options.out_dir); exit(1); } path = make_path_to_file(options.out_dir, text_filename); //FIXME check for errors: MEME doesn't either and we at least know we have a good directory text_output = fopen(path, "w"); free(path); path = make_path_to_file(options.out_dir, cisml_filename); //FIXME check for errors cisml_output = fopen(path, "w"); free(path); print_cisml(cisml_output, cisml, true, NULL, false); print_score(cisml, text_output); fclose(cisml_output); fclose(text_output); } else if (options.output_format == GFF_FORMAT) { print_score(cisml, stdout); } else if (options.output_format == CISML_FORMAT) { print_cisml(stdout, cisml, true, NULL, false); } else { die("Output format invalid!\n"); } // // Clean up. 
// rbtree_destroy(seq_ids); arraylst_destroy(motif_and_pssm_destroy, motifs); free_cisml(cisml); rbtree_destroy(options.selected_motifs); alph_release(alph); // measure time if (verbosity >= NORMAL_VERBOSE) { // starting time c1 = clock(); fprintf(stderr, "cycles (CPU); %ld cycles\n", (long) c1); fprintf(stderr, "elapsed CPU time: %f seconds\n", (float) (c1-c0) / CLOCKS_PER_SEC); } return 0; }
int main(int argc, char *argv[]) { int count; seq_t seq1, seq2; hash_env_t he; collec_t res, rev_res; #if defined(DEBUG) && (DEBUG > 1) mcheck(NULL); mtrace(); #endif argv0 = argv[0]; if (setlocale(LC_ALL, "POSIX") == NULL) fprintf(stderr, "%s: Warning: could not set locale to POSIX\n", argv[0]); signal(SIGSEGV, bug_handler); #ifndef __MINGW32__ signal(SIGBUS, bug_handler); #endif /* Default options. */ options.C = DEFAULT_C; options.cutoff = DIST_CUTOFF; options.gapPct = DEFAULT_GAPPCT; options.intron_window = 6; options.K = DEFAULT_K; options.splice_type_list = "GTAG,GCAG,GTAC,ATAC"; options.nbSplice = 4; options.scoreSplice_window = 10; options.mismatchScore = MISMATCH; options.reverse = 2; options.matchScore = MATCH; options.W = DEFAULT_W; options.X = DEFAULT_X; options.filterPct = DEFAULT_FILTER; options.minScore_cutoff = MATCH_CUTOFF; while (1) { int c = getopt(argc, argv, "A:C:c:E:f:g:I:K:L:M:o:q:R:r:W:X:"); if (c == -1) break; switch (c) { case 'A': options.ali_flag = atoi(optarg); if (options.ali_flag < 0 || options.ali_flag > 4) fatal("A must be one of 0, 1, 2, 3, or 4.\n"); break; case 'C': { int val = atoi(optarg); if (val < 0) fatal("Value for option C must be non-negative.\n"); options.C = val; break; } case 'c': { int val = atoi(optarg); if (val < 0) fatal("Value for option c must be non-negative.\n"); options.minScore_cutoff = val; break; } case 'E': options.cutoff = atoi(optarg); if (options.cutoff < 3 || options.cutoff > 10) fatal("Cutoff (E) must be within [3,10].\n"); break; case 'f': options.filterPct = atoi(optarg); if (options.filterPct > 100) fatal("Filter in percent (f) must be within [0,100].\n"); break; case 'g': options.gapPct = atoi(optarg); break; case 'I': options.intron_window = atoi(optarg); break; case 'K': { int val = atoi(optarg); if (val < 0) fatal("Value for option K must be non-negative.\n"); options.K = val; break; } case 'L': { size_t i; size_t len = strlen(optarg); options.splice_type_list = optarg; options.nbSplice = 1; if 
(len % 5 != 4) fatal("Splice types list has illegal length (%zu)\n", len); for (i = 0; i < len; i++) if (i % 5 == 4) { if (options.splice_type_list[i] != ',') fatal("Comma expected instead of %c at position %zu" "in splice types list.\n", options.splice_type_list[i], i); options.nbSplice += 1; } else { if (options.splice_type_list[i] != 'A' && options.splice_type_list[i] != 'C' && options.splice_type_list[i] != 'G' && options.splice_type_list[i] != 'T') fatal("Expected 'A', 'C', 'G' or 'T' instead of '%c' at" "position %zu in splice types list.\n", options.splice_type_list[i], i); } break; } case 'M': { int val = atoi(optarg); if (val < 0) fatal("Value for option M must be non-negative.\n"); options.scoreSplice_window = val; break; } case 'o': options.dnaOffset = atoi(optarg); break; case 'q': options.mismatchScore = atoi(optarg); break; case 'R': options.reverse = atoi(optarg); if (options.reverse < 0 || options.reverse > 2) fatal("R must be one of 0, 1, or 2.\n"); break; case 'r': options.matchScore = atoi(optarg); break; case 'W': options.W = atoi(optarg); if (options.W < 1 || options.W > 15) fatal("W must be within [1,15].\n"); break; case 'X': options.X = atoi(optarg); if (options.X < 1) fatal("X must be positive.\n"); break; case '?': break; default: fprintf(stderr, "?? 
getopt returned character code 0%o ??\n", c); } } if (optind + 2 != argc) { fprintf(stderr, Usage, argv[0], options.ali_flag, options.C, options.minScore_cutoff, options.cutoff, options.filterPct, options.gapPct, options.intron_window, options.K, options.splice_type_list, options.scoreSplice_window, options.dnaOffset, options.mismatchScore, options.reverse, options.matchScore, options.W, options.X); return 1; } /* read seq1 */ init_seq(argv[optind], &seq1); if (get_next_seq(&seq1, options.dnaOffset, 1) != 0) fatal("Cannot read sequence from %s.\n", argv[optind]); strncpy(dna_seq_head, seq1.header, 256); /* read seq2 */ init_seq(argv[optind + 1], &seq2); if (get_next_seq(&seq2, 0, 0) != 0) fatal("Cannot read sequence from %s.\n", argv[optind + 1]); init_encoding(); init_hash_env(&he, options.W, seq1.seq, seq1.len); init_col(&res, 1); init_col(&rev_res, 1); bld_table(&he); init_splice_junctions(); count = 0; while (!count || get_next_seq(&seq2, 0, 0) == 0) { unsigned int curRes; strncpy(rna_seq_head, seq2.header, 256); ++count; switch (options.reverse) { case 0: SIM4(&he, &seq2, &res); break; case 2: SIM4(&he, &seq2, &res); case 1: seq_revcomp_inplace(&seq2); SIM4(&he, &seq2, &rev_res); break; default: fatal ("Unrecognized request for EST orientation.\n"); } /* Keep only the best matches, according to filterPct. 
*/ if (options.filterPct > 0) { unsigned int max_nmatches = 0; for (curRes = 0; curRes < rev_res.nb; curRes++) { result_p_t r = rev_res.e.result[curRes]; if (r->st.nmatches > max_nmatches) max_nmatches = r->st.nmatches; } for (curRes = 0; curRes < res.nb; curRes++) { result_p_t r = res.e.result[curRes]; if (r->st.nmatches > max_nmatches) max_nmatches = r->st.nmatches; } max_nmatches = (max_nmatches * options.filterPct) / 100; for (curRes = 0; curRes < rev_res.nb; curRes++) { result_p_t r = rev_res.e.result[curRes]; if (r->st.nmatches < max_nmatches) r->st.nmatches = 0; } for (curRes = 0; curRes < res.nb; curRes++) { result_p_t r = res.e.result[curRes]; if (r->st.nmatches < max_nmatches) r->st.nmatches = 0; } } /* Now, print results. */ for (curRes = 0; curRes < rev_res.nb; curRes++) print_res(rev_res.e.result[curRes], 1, &seq1, &seq2); rev_res.nb = 0; if (options.reverse && options.ali_flag) /* reverse-complement back seq2 for alignment */ seq_revcomp_inplace(&seq2); for (curRes = 0; curRes < res.nb; curRes++) print_res(res.e.result[curRes], 0, &seq1, &seq2); res.nb = 0; } #ifdef DEBUG fprintf(stderr, "DEBUG mode: freeing all memory...\n"); fflush(stdout); fflush(stderr); free_hash_env(&he); free_seq(&seq1); free_seq(&seq2); free(options.splice); free(res.e.elt); free(rev_res.e.elt); #endif return 0; }
/************************************************************************* * Entry point for ama *************************************************************************/ int main(int argc, char *argv[]) { int max_seq_length = MAX_SEQ; STRING_LIST_T* selected_motifs = NULL; double pseudocount = 0.01; int output_format = CISML_FORMAT; program_name = "ama"; int scoring = AVG_ODDS; BOOLEAN_T pvalues = FALSE; BOOLEAN_T normalize_scores = FALSE; BOOLEAN_T combine_duplicates = FALSE; int num_gc_bins = 1; int sdbg_order = -1; // don't use sequence background BOOLEAN_T scan_both_strands = TRUE; ARRAY_T* pos_bg_freqs = NULL; ARRAY_T* rev_bg_freqs = NULL; clock_t c0, c1; /* measuring cpu_time */ CISML_T *cisml; char * out_dir = NULL; BOOLEAN_T clobber = FALSE; int i; int last = 0; ALPH_T alph = INVALID_ALPH; /********************************************** * COMMAND LINE PROCESSING **********************************************/ const int num_options = 16; cmdoption const motif_scan_options[] = { { "max-seq-length", REQUIRED_VALUE }, { "motif", REQUIRED_VALUE }, { "motif-pseudo", REQUIRED_VALUE }, { "rma", NO_VALUE }, { "pvalues", NO_VALUE }, { "sdbg", REQUIRED_VALUE }, { "norc", NO_VALUE }, { "cs", NO_VALUE }, { "o-format", REQUIRED_VALUE }, { "o", REQUIRED_VALUE }, { "oc", REQUIRED_VALUE }, { "scoring", REQUIRED_VALUE }, { "verbosity", REQUIRED_VALUE }, { "gcbins", REQUIRED_VALUE }, { "last", REQUIRED_VALUE }, { "version", NO_VALUE } }; int option_index = 0; // Define the usage message. 
char usage[] = "USAGE: ama [options] <motif file> <sequence file> [<background file>]\n" "\n" " Options:\n" " --sdbg <order>\t\t\tUse Markov background model of\n" " \t\t\t\t\torder <order> derived from the sequence\n" " \t\t\t\t\tto compute its likelihood ratios.\n" " \t\t\t\t\tOverrides --pvalues, --gcbins and --rma;\n" " \t\t\t\t\t<background file> is required unless\n" " \t\t\t\t\t--sdbg is given.\n" " --motif <id>\t\t\tUse only the motif identified by <id>.\n" " \t\t\t\t\tThis option may be repeated.\n" " --motif-pseudo <float>\t\tThe value <float> times the background\n" " \t\t\t\t\tfrequency is added to the count of each\n" " \t\t\t\t\tletter when creating the likelihood \n" " \t\t\t\t\tratio matrix (default: %g).\n" " --norc\t\t\t\tDisables the scanning of the reverse\n" " \t\t\t\t\tcomplement strand.\n" " --scoring [avg-odds|max-odds]\tIndicates whether the average or \n" " \t\t\t\t\tthe maximum odds should be calculated\n" " \t\t\t\t\t(default: avg-odds)\n" " --rma\t\t\t\tScale motif scores to the range 0-1.\n" " \t\t\t\t\t(Relative Motif Affinity).\n" " \t\t\t\t\tMotif scores are scaled by the maximum\n" " \t\t\t\t\tscore achievable by that PWM. (default:\n" " \t\t\t\t\tmotif scores are not normalized)\n" " --pvalues\t\t\t\tPrint p-value of avg-odds score in cisml\n" " \t\t\t\t\toutput. Ignored for max-odds scoring.\n" " \t\t\t\t\t(default: p-values are not printed)\n" " --gcbins <bins>\t\t\tCompensate p-values for GC content of\n" " \t\t\t\t\teach sequence using given number of \n" " \t\t\t\t\tGC range bins. 
Recommended bins: 41.\n" " \t\t\t\t\t(default: p-values are based on\n" " \t\t\t\t\tfrequencies in background file)\n" " --cs\t\t\t\tEnable combining sequences with same\n" " \t\t\t\t\tidentifier by taking the average score\n" " \t\t\t\t\tand the Sidac corrected p-value.\n" " --o-format [gff|cisml]\t\tOutput file format (default: cisml)\n" " \t\t\t\t\tignored if --o or --oc option used\n" " --o <directory>\t\t\tOutput all available formats to\n" " \t\t\t\t\t<directory>; give up if <directory>\n" " \t\t\t\t\texists\n" " --oc <directory>\t\t\tOutput all available formats to\n" " \t\t\t\t\t<directory>; if <directory> exists\n" " \t\t\t\t\toverwrite contents\n" " --verbosity [1|2|3|4]\t\tControls amount of screen output\n" " \t\t\t\t\t(default: %d)\n" " --max-seq-length <int>\t\tSet the maximum length allowed for \n" " \t\t\t\t\tinput sequences. (default: %d)\n" " --last <int>\t\t\tUse only scores of (up to) last <n>\n" " \t\t\t\t\tsequence positions to compute AMA.\n" " --version \t\t\tPrint version and exit.\n" "\n"; // Parse the command line. if (simple_setopt(argc, argv, num_options, motif_scan_options) != NO_ERROR) { die("Error processing command line options: option name too long.\n"); } BOOLEAN_T setoutputformat = FALSE; BOOLEAN_T setoutputdirectory = FALSE; while (TRUE) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. 
c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { (void) simple_getopterror(&message); die("Error processing command line options (%s).\n", message); } else if (strcmp(option_name, "max-seq-length") == 0) { max_seq_length = atoi(option_value); } else if (strcmp(option_name, "norc") == 0) { scan_both_strands = FALSE; } else if (strcmp(option_name, "cs") == 0) { combine_duplicates = TRUE; } else if (strcmp(option_name, "motif") == 0) { if (selected_motifs == NULL) { selected_motifs = new_string_list(); } add_string(option_value, selected_motifs); } else if (strcmp(option_name, "motif-pseudo") == 0) { pseudocount = atof(option_value); } else if (strcmp(option_name, "o-format") == 0) { if (setoutputdirectory) { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "output directory specified, ignoring --o-format\n"); } else { setoutputformat = TRUE; if (strcmp(option_value, "gff") == 0) output_format = GFF_FORMAT; else if (strcmp(option_value, "cisml") == 0) output_format = CISML_FORMAT; else { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Output format not known. 
Using standard instead (cisML).\n"); output_format = CISML_FORMAT; } } } else if (strcmp(option_name, "o") == 0 || strcmp(option_name, "oc") == 0) { setoutputdirectory = TRUE; if (setoutputformat) { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "output directory specified, ignoring --o-format\n"); } clobber = strcmp(option_name, "oc") == 0; out_dir = (char*) malloc (sizeof(char)*(strlen(option_value)+1)); strcpy(out_dir, option_value); output_format = DIRECTORY_FORMAT; } else if (strcmp(option_name, "verbosity") == 0) { verbosity = atoi(option_value); } else if (strcmp(option_name, "scoring") == 0) { if (strcmp(option_value, "max-odds") == 0) scoring = MAX_ODDS; else if (strcmp(option_value, "avg-odds") == 0) scoring = AVG_ODDS; else if (strcmp(option_value, "sum-odds") == 0) scoring = SUM_ODDS; else die("Specified scoring scheme not known.\n", message); } else if (strcmp(option_name, "pvalues") == 0) { pvalues = TRUE; } else if (strcmp(option_name, "rma") == 0) { normalize_scores = TRUE; fprintf(stderr, "Normalizing motif scores using RMA method.\n"); } else if (strcmp(option_name, "gcbins") == 0) { num_gc_bins = atoi(option_value); pvalues = TRUE; if (num_gc_bins <= 1) die("Number of bins in --gcbins must be greater than 1.\n", message); } else if (strcmp(option_name, "sdbg") == 0) { sdbg_order = atoi(option_value); // >=0 means use sequence bkg } else if (strcmp(option_name, "last") == 0) { int i = 0; if (option_value[0] == '-') ++i; while (option_value[i] != '\0') { if (!isdigit(option_value[i])) { die("Specified parameter 'last' contains non-numeric characters.\n"); } ++i; } last = atoi(option_value); if (errno != 0) { die("Specified parameter 'last' could not be parsed as a number as:\n%s\n",strerror(errno)); } if (last < 0) { die("Specified parameter 'last' had negative value (%d) when only postive or zero values are allowed \n", last); } } else if (strcmp(option_name, "version") == 0) { fprintf(stdout, VERSION "\n"); exit(EXIT_SUCCESS); } } // --sdbg 
overrides --pvalues and --gcbins and --rma int req_args = 3; if (sdbg_order >= 0) { pvalues = FALSE; normalize_scores = FALSE; num_gc_bins = 1; req_args = 2; } // Check all required arguments given if (sdbg_order >= 0 && argc > option_index + req_args) { die("<background file> cannot be given together with --sdbg.\n"); } else if (argc != option_index + req_args) { fprintf(stderr, usage, pseudocount, verbosity, max_seq_length); exit(EXIT_FAILURE); } // Get required arguments. char* motif_filename = argv[option_index]; option_index++; char* fasta_filename = argv[option_index]; option_index++; char* bg_filename; if (req_args == 3) { // required unless --sdbg given bg_filename = argv[option_index]; option_index++; } else { bg_filename = "--uniform--"; // So PSSMs will use uniform background; // we can multiply them out later. } // measure time c0 = clock(); // Set up hash tables for computing reverse complement if doing --sdbg if (sdbg_order >= 0) setup_hash_alph(DNAB); // Create cisml data structure for recording results cisml = allocate_cisml(program_name, motif_filename, fasta_filename); set_cisml_background_file(cisml, bg_filename); /********************************************** * Read the motifs and background model. 
**********************************************/ int num_motifs = 0; MREAD_T *mread; ARRAYLST_T *motifs; PSSM_PAIR_T** pssm_pairs; // note pssm_pairs is an array of pointers //this reads any meme file, xml, txt and html mread = mread_create(motif_filename, OPEN_MFILE); mread_set_bg_source(mread, bg_filename); mread_set_pseudocount(mread, pseudocount); motifs = mread_load(mread, NULL); alph = mread_get_alphabet(mread); pos_bg_freqs = mread_get_background(mread); mread_destroy(mread); num_motifs = arraylst_size(motifs); // allocate memory for PSSM pairs pssm_pairs = (PSSM_PAIR_T**)mm_malloc(sizeof(PSSM_PAIR_T*) * num_motifs); if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Number of motifs in file %d.\n", num_motifs); // make a CISML pattern to hold scores for each motif PATTERN_T** patterns = NULL; Resize(patterns, num_motifs, PATTERN_T*); int motif_index; for (motif_index = 0; motif_index < num_motifs; motif_index++) { MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs); patterns[motif_index] = allocate_pattern(get_motif_id(motif), ""); add_cisml_pattern(cisml, patterns[motif_index]); } // make reverse complement motifs and background frequencies. if (scan_both_strands == TRUE) { add_reverse_complements(motifs); assert(arraylst_size(motifs) == (2 * num_motifs)); rev_bg_freqs = allocate_array(get_array_length(pos_bg_freqs)); complement_dna_freqs(pos_bg_freqs, rev_bg_freqs); } /************************************************************** * Convert motif matrices into log-odds matrices. * Scale them. * Compute the lookup tables for the PDF of scaled log-odds scores. **************************************************************/ int ns = scan_both_strands ? 
2 : 1; // number of strands for (motif_index = 0; motif_index < num_motifs; motif_index++) { MOTIF_T *motif, *motif_rc; motif = (MOTIF_T*)arraylst_get(motif_index*ns, motifs); if (scan_both_strands) motif_rc = (MOTIF_T*)arraylst_get(motif_index*ns + 1, motifs); else motif_rc = NULL; /* * Note: If scanning both strands, we complement the motif frequencies * but not the background frequencies so the motif looks the same. * However, the given frequencies are used in computing the p-values * since they represent the frequencies on the negative strands. * (If we instead were to complement the input sequence, keeping the * the motif fixed, we would need to use the complemented frequencies * in computing the p-values. Is that any clearer?) */ double range = 300; // 100 is not very good; 1000 is great but too slow PSSM_T* pos_pssm = build_motif_pssm( motif, pos_bg_freqs, pos_bg_freqs, NULL, // Priors not used 0.0L, // alpha not used range, num_gc_bins, TRUE ); PSSM_T* neg_pssm = (scan_both_strands ? build_motif_pssm( motif_rc, rev_bg_freqs, pos_bg_freqs, NULL, // Priors not used 0.0L, // alpha not used range, num_gc_bins, TRUE ) : NULL ); pssm_pairs[motif_index] = create_pssm_pair(pos_pssm, neg_pssm); } // Open the FASTA file for reading. 
FILE* fasta_file = NULL; if (open_file(fasta_filename, "r", FALSE, "FASTA", "sequences", &fasta_file) == 0) { die("Couldn't open the file %s.\n", fasta_filename); } if (verbosity >= NORMAL_VERBOSE) { if (last == 0) { fprintf(stderr, "Using entire sequence\n"); } else { fprintf(stderr, "Limiting sequence to last %d positions.\n", last); } } /************************************************************** * Read in all sequences and score with all motifs **************************************************************/ int seq_loading_num = 0; // keeps track on the number of sequences read in total int seq_counter = 0; // holds the index to the seq in the pattern int unique_seqs = 0; // keeps track on the number of unique sequences BOOLEAN_T need_postprocessing = FALSE; SEQ_T* sequence = NULL; RBTREE_T* seq_ids = rbtree_create(rbtree_strcasecmp,NULL,free,rbtree_intcpy,free); RBNODE_T* seq_node; BOOLEAN_T created; while (read_one_fasta(alph, fasta_file, max_seq_length, &sequence)) { ++seq_loading_num; created = FALSE; char* seq_name = get_seq_name(sequence); int seq_len = get_seq_length(sequence); int scan_len; if (last != 0) { scan_len = last; } else { scan_len = seq_len; } // red-black trees are only required if duplicates should be combined if (combine_duplicates){ //lookup seq id and create new entry if required, return sequence index char *tmp_id = mm_malloc(strlen(seq_name)+1); // required copy for rb-tree strncpy(tmp_id,seq_name,strlen(seq_name)+1); seq_node = rbtree_lookup(seq_ids, tmp_id, TRUE, &created); if (created) {// assign it a loading number rbtree_set(seq_ids, seq_node, &unique_seqs); seq_counter = unique_seqs; ++unique_seqs; } else { seq_counter = *((int*)rbnode_get(seq_node)); } } // // Set up sequence-dependent background model and compute // log cumulative probability of sequence. // double *logcumback = NULL; // array of log cumulative probs. 
if (sdbg_order >= 0) { Resize(logcumback, seq_len+1, double); char* raw_seq = get_raw_sequence(sequence); BOOLEAN rc = FALSE; double *a_cp = get_markov_from_sequence(raw_seq, alph_string(alph), rc, sdbg_order, 0); log_cum_back(raw_seq, a_cp, sdbg_order, logcumback); myfree(a_cp); } // Get the GC content of the sequence if binning p-values by GC // and store it in the sequence object. if (num_gc_bins > 1) { ARRAY_T *freqs = get_sequence_freqs(sequence, alph); set_total_gc_sequence(sequence, get_array_item(1,freqs) + get_array_item(2,freqs)); // f(C) + f(G) free_array(freqs); // clean up } else { set_total_gc_sequence(sequence, -1); // flag ignore } /************************************************************** * Process all motifs. **************************************************************/ int ns = scan_both_strands ? 2 : 1; for (motif_index = 0; motif_index < num_motifs; motif_index++) { PATTERN_T *pattern = patterns[motif_index]; MOTIF_T* motif = (MOTIF_T*)arraylst_get(ns*motif_index, motifs); char* motif_id = (scan_both_strands ? get_motif_st_id(motif) : get_motif_id(motif)); if (verbosity >= HIGH_VERBOSE) { fprintf(stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif)); } if ((selected_motifs == NULL) || (have_string(get_motif_id(motif), selected_motifs) == TRUE)) { if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Scanning %s sequence with length %d " "abbreviated to %d with motif %s with length %d.\n", seq_name, seq_len, scan_len, motif_id, get_motif_length(motif)); } SCANNED_SEQUENCE_T* scanned_seq = NULL; if (!combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter){ // Create a scanned_sequence record and save it in the pattern. 
scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern); set_scanned_sequence_length(scanned_seq, scan_len); } else { // get existing sequence record scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter]; set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq))); } // check if scanned component of sequence has sufficient length for the motif if (scan_len < get_motif_length(motif)) { // set score to zero and p-value to 1 if not set yet if(!has_scanned_sequence_score(scanned_seq)){ set_scanned_sequence_score(scanned_seq, 0.0); } if(pvalues && !has_scanned_sequence_pvalue(scanned_seq)){ set_scanned_sequence_pvalue(scanned_seq, 1.0); } add_scanned_sequence_scanned_position(scanned_seq); if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) need_postprocessing = TRUE; if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "%s too short for motif %s. Score set to 0!\n", seq_name, motif_id); } else { // scan the sequence using average/maximum motif affinity ama_sequence_scan(alph, sequence, logcumback, pssm_pairs[motif_index], scoring, pvalues, last, scanned_seq, &need_postprocessing); } } else { if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "Skipping motif %s.\n", motif_id); } } // All motifs parsed free_seq(sequence); if (sdbg_order >= 0) myfree(logcumback); } // read sequences
/************************************************************************* * Entry point for centrimo *************************************************************************/ int main(int argc, char *argv[]) { CENTRIMO_OPTIONS_T options; SEQ_SITES_T seq_sites; SITE_COUNTS_T counts; int seqN, motifN, seqlen, db_i, motif_i, i; double log_pvalue_thresh; SEQ_T** sequences = NULL; ARRAY_T* bg_freqs = NULL; ARRAYLST_T *stats_list; MOTIF_DB_T **dbs, *db; MREAD_T *mread; MOTIF_STATS_T *stats; MOTIF_T *motif, *rev_motif; PSSM_T *pos_pssm, *rev_pssm; char *sites_path, *desc; FILE *sites_file; HTMLWR_T *html; JSONWR_T *json; // COMMAND LINE PROCESSING process_command_line(argc, argv, &options); // load the sequences read_sequences(options.alphabet, options.seq_source, &sequences, &seqN); seqlen = (seqN ? get_seq_length(sequences[0]) : 0); // calculate a sequence background (unless other background is given) if (!options.bg_source) { bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences); } // load the motifs motifN = 0; dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources)); for (i = 0; i < arraylst_size(options.motif_sources); i++) { char* db_source; db_source = (char*)arraylst_get(i, options.motif_sources); dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, options.pseudocount, options.selected_motifs, options.alphabet); motifN += arraylst_size(dbs[i]->motifs); } log_pvalue_thresh = log(options.evalue_thresh) - log(motifN); // Setup some things for double strand scanning if (options.scan_both_strands == TRUE) { // Set up hash tables for computing reverse complement setup_hash_alph(DNAB); setalph(0); // Correct background by averaging on freq. for both strands. 
average_freq_with_complement(options.alphabet, bg_freqs); normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs); calc_ambigs(options.alphabet, FALSE, bg_freqs); } // Create output directory if (create_output_directory(options.output_dirname, options.allow_clobber, (verbosity >= NORMAL_VERBOSE))) { die("Couldn't create output directory %s.\n", options.output_dirname); } // open output files sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME); sites_file = fopen(sites_path, "w"); free(sites_path); // setup html monolith writer json = NULL; if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) { htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME); htmlwr_replace(html, "centrimo_data.js", "data"); json = htmlwr_output(html); if (json == NULL) die("Template does not contain data section.\n"); } else { DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n"); } if (json) { // output some top level variables jsonwr_str_prop(json, "version", VERSION); jsonwr_str_prop(json, "revision", REVISION); jsonwr_str_prop(json, "release", ARCHIVE_DATE); jsonwr_str_array_prop(json, "cmd", argv, argc); jsonwr_property(json, "options"); jsonwr_start_object_value(json); jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount); jsonwr_dbl_prop(json, "score", options.score_thresh); jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh); jsonwr_lng_prop(json, "maxbin", options.max_window+1); jsonwr_bool_prop(json, "norc", !options.scan_both_strands); jsonwr_bool_prop(json, "noflip", options.no_flip); jsonwr_end_object_value(json); // output the description desc = prepare_description(&options); if (desc) { jsonwr_str_prop(json, "job_description", desc); free(desc); } // output size metrics jsonwr_lng_prop(json, "seqlen", seqlen); jsonwr_lng_prop(json, "tested", motifN); // output the fasta db jsonwr_property(json, "sequence_db"); jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", 
options.seq_source); jsonwr_lng_prop(json, "count", seqN); jsonwr_end_object_value(json); // output the motif dbs jsonwr_property(json, "motif_dbs"); jsonwr_start_array_value(json); for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", db->source); jsonwr_lng_prop(json, "count", arraylst_size(db->motifs)); jsonwr_end_object_value(json); } jsonwr_end_array_value(json); // start the motif array jsonwr_property(json, "motifs"); jsonwr_start_array_value(json); } /************************************************************** * Tally the positions of the best sites for each of the * selected motifs. **************************************************************/ // prepare the sequence sites memset(&seq_sites, 0, sizeof(SEQ_SITES_T)); // prepare the site counts counts.allocated = ((2 * seqlen) - 1); counts.sites = mm_malloc(sizeof(double) * counts.allocated); // prepare the motifs stats list stats_list = arraylst_create(); // prepare the other vars motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL; for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) { motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs); DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n", get_motif_id(motif), get_motif_length(motif)); // reset the counts for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0; counts.total_sites = 0; // create the pssm pos_pssm = make_pssm(bg_freqs, motif); // If required, do the same for the reverse complement motif. 
if (options.scan_both_strands) { rev_motif = dup_rc_motif(motif); rev_pssm = make_pssm(bg_freqs, rev_motif); } // scan the sequences for (i = 0; i < seqN; i++) score_sequence(&options, sequences[i], pos_pssm, rev_pssm, &seq_sites, &counts); // DEBUG check that the sum of the sites is close to the site count double sum_check = 0, sum_diff; for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i]; sum_diff = counts.total_sites - sum_check; if (sum_diff < 0) sum_diff = -sum_diff; if (sum_diff > 0.1) { fprintf(stderr, "Warning: site counts don't sum to accurate value! " "%g != %ld", sum_check, counts.total_sites); } // output the plain text site counts output_site_counts(sites_file, seqlen, db, motif, &counts); // compute the best central window stats = compute_stats(options.max_window, seqlen, db, motif, &counts); // check if it passes the threshold if (json && stats->log_adj_pvalue <= log_pvalue_thresh) { output_motif_json(json, stats, &counts); arraylst_add(stats, stats_list); } else { free(stats); } // Free memory associated with this motif. free_pssm(pos_pssm); free_pssm(rev_pssm); destroy_motif(rev_motif); } } if (json) jsonwr_end_array_value(json); // finish writing sites fclose(sites_file); // finish writing html file if (html) { if (htmlwr_output(html) != NULL) { die("Found another JSON replacement!\n"); } htmlwr_destroy(html); } // write text file output_centrimo_text(&options, motifN, stats_list); // Clean up. for (i = 0; i < seqN; ++i) { free_seq(sequences[i]); } free(sequences); for (i = 0; i < arraylst_size(options.motif_sources); i++) { free_db(dbs[i]); } free(dbs); free_array(bg_freqs); free(counts.sites); free(seq_sites.sites); arraylst_destroy(free, stats_list); cleanup_options(&options); return 0; }
dsEnqError srch_start() { struct ds_search_arg search_arg; struct ds_search_result result; struct DSError error; dsEnqError return_error; extern Filter make_filter(); DN curr_rdn; if (*mvalue == '\0') { return list_start(); } if (get_default_service (&search_arg.sra_common) != 0) { return nothingfound; } search_arg.sra_common.ca_servicecontrol.svc_options = SVC_OPT_PREFERCHAIN; curr_rdn = search_arg.sra_baseobject = (*base_path != 'T'? str2dn (base_path): NULLDN); search_arg.sra_eis.eis_allattributes = FALSE; search_arg.sra_eis.eis_infotypes = EIS_ATTRIBUTETYPESONLY; search_arg.sra_eis.eis_select = 0; search_arg.sra_searchaliases = TRUE; search_arg.sra_subset = SRA_ONELEVEL; while (curr_rdn != NULLDN) { if (!strcmp(curr_rdn->dn_rdn->rdn_at->oa_ot.ot_stroid, "2.5.4.10")) { search_arg.sra_subset = SRA_WHOLESUBTREE; break; } curr_rdn = curr_rdn->dn_parent; } if ((search_arg.sra_filter = make_filter(filt_arr[typeindx])) == NULLFILTER) return duaerror; #ifndef NO_STATS LLOG (log_stat, LLOG_NOTICE, ("search +%s, extent %d, val %s", base_path,search_arg.sra_subset, mvalue)); #endif if(ds_search (&search_arg, &error, &result) != DS_OK) { /* deal with error */ free_seq(dnseq); dnseq = NULLDS; dn_number = 0; log_ds_error(&error); ds_error_free(&error); switch (error.dse_type) { case DSE_LOCALERROR: return_error = duaerror; break; case DSE_REMOTEERROR: return_error = localdsaerror; break; case DSE_ATTRIBUTEERROR: return_error = attributerror; break; case DSE_REFERRAL: case DSE_DSAREFERRAL: return_error = remotedsaerror; break; case DSE_SECURITYERROR: return_error = security; break; case DSE_NAMEERROR: return_error = namerror; break; case DSE_SERVICEERROR: return_error = serviceerror; break; default: return_error = localdsaerror; break; } } else { correlate_search_results (&result); dn_number = 0; if (result.CSR_entries != NULLENTRYINFO) { register EntryInfo *ptr; return_error = Okay; free_seq(dnseq); dnseq = NULLDS; dn_number = 0; for (ptr = result.CSR_entries; ptr != 
NULLENTRYINFO; ptr = ptr->ent_next){ dn_number++; dn2buf((caddr_t) ptr->ent_dn, goto_path); add_seq(&dnseq, goto_path); } if (dn_number) dnseq = SortList(dnseq); } else if (result.CSR_limitproblem == LSR_NOLIMITPROBLEM) { free_seq(dnseq); dnseq = NULLDS; dn_number = 0; return_error = nothingfound; } if(result.CSR_limitproblem != LSR_NOLIMITPROBLEM) { switch (result.CSR_limitproblem) { case LSR_TIMELIMITEXCEEDED: if (dn_number > 0) return_error = timelimit_w_partial; else { free_seq(dnseq); dnseq = NULLDS; return_error = timelimit; } break; case LSR_SIZELIMITEXCEEDED: return_error = listsizelimit; break; case LSR_ADMINSIZEEXCEEDED: if (dn_number > 0) return_error = adminlimit_w_partial; else { free_seq(dnseq); dnseq = NULLDS; return_error = adminlimit; } break; } entryinfo_free(result.CSR_entries, 0); } } entry_number = dn_number; filter_free(search_arg.sra_filter); dn_free(search_arg.sra_baseobject); ds_error_free(&error); return return_error; }
/****************************************************************************
 * Remove from the alignment all columns that contain gaps for the
 * specified species.
 *
 * A column counts as a gap when the named species has '-' or '.' in it.
 * All other columns are copied, for every sequence and for the consensus
 * string, into a newly allocated alignment.
 *
 * Parameters:
 *   species   - name of the species whose gap columns are dropped; the
 *               program dies if it is not found in the alignment.
 *   alignment - the alignment to filter; it is only read.
 *
 * Returns the newly allocated alignment.
 ****************************************************************************/
ALIGNMENT_T* remove_alignment_gaps
  (char*        species,
   ALIGNMENT_T* alignment)
{
  // Locate this species in the alignment.
  int species_index = get_index_in_string_list(species,
                                               get_species_names(alignment));
  if (species_index == -1) {
    die("Can't find %s in alignment.\n", species);
  }
  SEQ_T* this_seq = get_alignment_sequence(species_index, alignment);

  // Get the dimensions of the original matrix.
  int num_sequences = get_num_aligned_sequences(alignment);
  int alignment_length = get_alignment_length(alignment);

  // Allocate memory for raw sequences that will constitute the new alignment.
  // The buffers hold characters, so the element size is sizeof(char); they are
  // requested from mm_calloc and sized for the full length plus a terminator.
  char** raw_sequences = (char**)mm_malloc(sizeof(char*) * num_sequences);
  int i_seq = 0;
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    raw_sequences[i_seq] = (char*)mm_calloc(alignment_length + 1, sizeof(char));
  }
  char* consensus = get_consensus_string(alignment);
  char* new_consensus = (char*)mm_calloc(alignment_length + 1, sizeof(char));

  // Iterate over all columns.
  int i_column;
  int i_raw = 0;  // next free column in the gap-free output
  for (i_column = 0; i_column < alignment_length; i_column++) {

    // Is there a gap?
    char species_char = get_seq_char(i_column, this_seq);
    if ((species_char == '-') || (species_char == '.')) {
      continue;
    }

    // If no gap, then copy over this column.
    for (i_seq = 0; i_seq < num_sequences; i_seq++) {
      SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
      raw_sequences[i_seq][i_raw] = get_seq_char(i_column, this_sequence);
    }
    new_consensus[i_raw] = consensus[i_column];
    i_raw++;
  }

  // Create new sequence objects.
  SEQ_T** new_sequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
    new_sequences[i_seq] = allocate_seq(get_seq_name(this_sequence),
                                        get_seq_description(this_sequence),
                                        get_seq_offset(this_sequence),
                                        raw_sequences[i_seq]);
  }

  // Allocate and return the new alignment.
  ALIGNMENT_T* new_alignment
    = allocate_alignment(get_alignment_name(alignment),
                         get_alignment_description(alignment),
                         num_sequences,
                         new_sequences,
                         new_consensus);

  // Free local dynamic memory.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    myfree(raw_sequences[i_seq]);
    free_seq(new_sequences[i_seq]);
  }
  myfree(raw_sequences);
  myfree(new_sequences);
  myfree(new_consensus);

  return(new_alignment);
}