// Legacy driver for the suffix-array sorter.
// Opens the file named by argv[1] and reads it into myString/stringLength:
//   - when argv[1] is not "test.dat", the read is timed and read_fasta() is used
//     if the strstr(argv[1], ".fas") match starts with '.', otherwise read_input();
//   - when argv[1] is "test.dat", read_input() is used without timing.
// It then times LinearSuffixSort() and, for "test.dat" only, compares every entry
// of the returned suffixArray with the integers read from "result.test.dat",
// printing a success or failure banner.
// NOTE(review): strstr() yields a null pointer when ".fas" does not occur in
// argv[1]; the [0] subscript is applied without a null check - confirm and guard.
// NOTE(review): argc is never checked before argv[1] is used.
// NOTE(review): the opening brace of this function is not closed before the next
// definition of main begins, so this looks like disabled legacy code whose
// comment markers were lost - confirm before building.
int main(int argc, char* argv[]) { char* myString; int* suffixArray; int stringLength; int i; ifstream inFile; inFile.open(argv[1]); Timing timehere; if (strcmp(argv[1], "test.dat") != 0) { timehere.markbeg(); if (strstr(argv[1], ".fas")[0] == '.') { read_fasta(inFile, myString, stringLength); } else { read_input(inFile, myString, stringLength); } timehere.markend(); inFile.close(); cout << "finish read " << stringLength << " characters."<< endl; timehere.outtime(); } else { read_input(inFile, myString, stringLength); inFile.close(); cout << "finish read " << stringLength << " characters."<< endl; } timehere.markbeg(); suffixArray = LinearSuffixSort(myString, stringLength); timehere.markend(); timehere.outtime("finish suffix sort,"); if (strcmp(argv[1], "test.dat") == 0) { int result; bool pass = true; ifstream resultF; resultF.open("result.test.dat"); cout << "Testing the Suffix Array" << endl; for (i = 0; i < stringLength; i++) { resultF >> result; if (result != suffixArray[i]) { pass = false; } } if (pass == false) { cout << endl; cout << "***************" << endl; cout << "test has failed" << endl; cout << "***************" << endl; } else { cout << endl; cout << "******************" << endl; cout << "test is successful" << endl; cout << "******************" << endl; } }
int main (int argc, char *argv[]) { char *myString; int stringLength; int i; string namePattern = ""; char *version = "1.06"; bool display_usage = false; char c; CPSSCAN ps_scan; PBS pbs; PPT ppt(PPT_window, 4); char *tRNA_file = NULL; char *ps_dir = NULL; bool ishtml = false; struct re_pattern_buffer id_filter; id_filter.allocated = 0; id_filter.buffer = 0; id_filter.fastmap = 0; id_filter.translate = 0; Timing timehere; Timing timeall; timeall.markbeg(); while ((c = getopt (argc, argv, "ho:t:e:m:u:D:d:L:l:p:s:cw:S:a:P:g:F:B:b:J:j:O:r:M:f:xEiCG:T:")) > 0) { switch (c) { case 'h': display_usage = true; break; case 'o': gap_open = atoi (optarg); break; case 't': gap_ext = atoi (optarg); break; case 'e': gap_end = atoi (optarg); break; case 'm': score_match = atoi (optarg); break; case 'u': score_mismatch = atoi (optarg); break; case 'D': Dmax = atoi (optarg); break; case 'd': Dmin = atoi (optarg); break; case 'L': Lmax = atoi (optarg); break; case 'l': Lmin = atoi (optarg); break; case 'p': Lex = atoi (optarg); break; case 'c': CHECK_PAIRS = 1; break; case 'x': ishtml = true; break; case 'E': edge_signal = true; break; case 'i': showPairNum = true; break; case 'w': wrought = atoi(optarg); break; case 'P': namePattern = optarg; break; case 'F': Filter = optarg; break; case 'g': MaxGap = atoi(optarg); break; case 'O': outAlignLen = atoi(optarg); break; case 'G': max_sub_rt_gap = atoi(optarg); break; case 'T': min_sub_rt_count = atoi(optarg); break; case 'r': PBS_minLen = atoi(optarg); break; case 'j': JoinThreshold = atof(optarg); break; case 'J': SplitThreshold = atof(optarg); break; case 'S': minOutScore = atof(optarg); break; case 'B': HigherSharpness = atof(optarg); break; case 'b': LowerSharpness = atof(optarg); break; case 'M': minMatchSim = atof(optarg); break; case 's': tRNA_file = optarg; break; case 'a': ps_dir = optarg; break; case 'f': fig_file = optarg; break; case 'C': checkCentriole = true; break; default: display_usage = true; // case 'm': 
show_score_matrix(); break; } } if (optind >= argc || display_usage) { fprintf (stderr, "ltr_finder v%s\n", version); fprintf (stderr, "Usage : [options] <INPUT_FASTA_FILE>\n"); // fprintf (stderr, "Options: -b NUM bandwidth, default is %d\n", // band_width); fprintf (stderr, " -o NUM gap open penalty, default is %d\n", gap_open); fprintf (stderr, " -t NUM gap extension penalty, default is %d\n", gap_ext); fprintf (stderr, " -e NUM gap end penalty, default is %d\n", gap_end); fprintf (stderr, " -m NUM match score, default is %d\n", score_match); fprintf (stderr, " -u NUM unmatch score, default is %d\n", score_mismatch); fprintf (stderr, " -D NUM Max distance between 5'&3'LTR, default is %d\n", Dmax); fprintf (stderr, " -d NUM Min distance between 5'&3'LTR, default is %d\n", Dmin); fprintf (stderr, " -L NUM Max length of 5'&3'LTR, default is %d\n", Lmax); fprintf (stderr, " -l NUM Min length of 5'&3'LTR, default is %d\n", Lmin); fprintf (stderr, " -p NUM min length of exact match pair, default is %d\n", Lex); fprintf (stderr, " -g NUM Max gap between joined pairs, default is %d\n", MaxGap); fprintf (stderr, " -G NUM Max gap between RT sub-domains, default is %d\n", max_sub_rt_gap); fprintf (stderr, " -T NUM Min sub-domains found in a RT domain, default is %d\n", min_sub_rt_count); fprintf (stderr, " -j NUM Threshold for join new sequence in existed alignment\n"); fprintf (stderr, " new alignment similarity higher than this will be joined,\n"); fprintf (stderr, " default is %0.2f\n", JoinThreshold); fprintf (stderr, " -J NUM Threshold for split existed alignment to two part\n"); fprintf (stderr, " new alignment similarity lower than this will be split,\n"); fprintf (stderr, " set this threshold lower than -j, means turn it off,\n"); fprintf (stderr, " default is %0.2f\n", SplitThreshold); fprintf (stderr, " -S NUM output Score limit, default is %0.2f, [0,10]\n", minOutScore); fprintf (stderr, " -M NUM min LTR similarity threshold, default is %0.2f, [0,1]\n", 
minMatchSim); fprintf (stderr, " -B NUM Boundary alignment sharpness threshold, higher one.\n"); fprintf (stderr, " one of the two edge's sharpness must higher than\n"); fprintf (stderr, " this threshold, default is %0.3f, [0,1]\n", HigherSharpness); fprintf (stderr, " -b NUM Boundary alignment sharpness threshold, lower one.\n"); fprintf (stderr, " both of the two edge's sharpness must higher than\n"); fprintf (stderr, " this threshold, default is %0.3f, [0,1]\n", LowerSharpness); fprintf (stderr, " -r NUM PBS detecting threshold, min tRNA match length: %d, [1,18]\n", PBS_minLen); fprintf (stderr, " -w NUM output format: [0]-full, 1-summary, 2-table.\n"); fprintf (stderr, " -O NUM output alignment length(only affect -w0), default is %d\n", outAlignLen); fprintf (stderr, " -P STR SeqIDs, will only calculate matched SeqID\n"); fprintf (stderr, " POSIX style regular express is supported.\n"); fprintf (stderr, " -s filename tRNA sequence file(FASTA format)\n"); fprintf (stderr, " -f filename data file used to draw figure\n"); fprintf (stderr, " -a ps_scan_dir Use ps_scan to predict protein domain\n"); fprintf (stderr, " -x Output in html format\n"); fprintf (stderr, " -E LTR must have edge signal\n"); fprintf (stderr, " (at least two of PBS,PPT,TSR)\n"); fprintf (stderr, " -C detect Centriole, delete highly repeat regions\n"); fprintf (stderr, " -F 01string Filter to choose desired result,default is 0\n"); fprintf (stderr, " 10000000000 5'-LTR must have TG\n"); fprintf (stderr, " 01000000000 5'-LTR must have CA\n"); fprintf (stderr, " 00100000000 3'-LTR must have TG\n"); fprintf (stderr, " 00010000000 3'-LTR must have CA\n"); fprintf (stderr, " 00001000000 TSR must be found\n"); fprintf (stderr, " 00000100000 PBS must be found\n"); fprintf (stderr, " 00000010000 PPT must be found\n"); fprintf (stderr, " 00000001000 RT domain muse be found\n"); fprintf (stderr, " 00000000100 Integrase core must be found\n"); fprintf (stderr, " 00000000010 Integrase c-term must be 
found\n"); fprintf (stderr, " 00000000001 RNase H must be found\n"); // fprintf(stderr, " -m show score matrix\n"); fprintf (stderr, " -h help\n"); exit (1); } // if(argc<2) // { // cerr<<"ltr_finder INPUT_FASTA_FILE"<<endl; // exit(1); // } //Program : BGF //Version : 2.1.2 //Time : Wed Nov 22 20:53:23 2006 if (ishtml) printf("<html>\n<head>\n<title>LTR_FINDER Result</title>\n</head>\n<body>\n<pre>\n"); printf("Program : LTR_FINDER\n"); printf("Version : %s\n\n", version); const char *id_filter_stat; re_syntax_options = RE_SYNTAX_POSIX_EGREP | RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL; id_filter_stat = re_compile_pattern(namePattern.c_str(), namePattern.length(), &id_filter); if (id_filter_stat != NULL) { printf("not a vaild POSIX regex after -P, code = %s\n", id_filter_stat); } if (tRNA_file != NULL) { timehere.markbeg(); string tmp_out = "Load tRNA db ["; tmp_out += tRNA_file; tmp_out += "] "; pbs.LoadSeq(tRNA_file); timehere.markend (); timehere.outtime(tmp_out.c_str()); } ps_scan.init(ps_dir); set_score_matrix(gap_open, gap_ext, gap_end, score_match, score_mismatch); FILE * inFASTA = fopen(argv[optind], "r"); if (inFASTA == NULL) { cerr << "open " << argv[optind] << " error!" 
<< endl; exit(1); } //init transDNA for(int i=0;i<128;++i) transDNA[i]='N'; transDNA['a']='A'; transDNA['A']='A'; transDNA['c']='C'; transDNA['C']='C'; transDNA['g']='G'; transDNA['G']='G'; transDNA['t']='T'; transDNA['T']='T'; transDNA['u']='T'; transDNA['U']='T'; seq_t sequence; sequence.s = NULL; sequence.m = 0; char name[1024]; ofstream fig; if (fig_file != NULL) fig.open(fig_file); int total_sequence = 0; int total_img = 0; while ( -1 != read_fasta(inFASTA, &sequence, name, 0)) { total_sequence++; if (namePattern.length() > 0 && id_filter_stat == NULL && re_search( &id_filter, name, strlen(name), 0, strlen(name), 0) < 0 ) //name Pattern not found //namePattern.find(name)==string::npos) continue; for (int i = 0;i < sequence.l;++i) sequence.s[i] = transDNA[sequence.s[i]]; char* myString = (char*)sequence.s; int stringLength = sequence.l; if (showPairNum) cout << "Sequence:" << name << " Len:" << stringLength << " "; //cout<<"begin GetPairs\n"; vector < candidate > Pair; GetPairs (myString, stringLength, Lex, Lmax, Dmin, Dmax, Pair); if (showPairNum) continue; //output pairs for checking for (int i = 0; i < Pair.size () && CHECK_PAIRS; ++i) { cout << setw (10); cout << Pair[i].pos1 << "-" << Pair[i].pos1 + Pair[i].len - 1 << " * " << Pair[i].pos2 << "-" << Pair[i].pos2 + Pair[i].len - 1 << endl; } //constrcut sticks //cout<<used[2]<<endl; vector < stick > sticks; JoinPairs(myString, stringLength, Pair, sticks); //cerr<<"pairs after join:"<<sticks.size()<<endl; int count_do = 0; ps_scan.reset(); for (int i = 0; i < sticks.size (); ++i) { sticks[i].score = 0; //cout<<"begin extend pairs\n"; if (!ExtendPairs(myString, stringLength, sticks[i])) continue; count_do++; //cout<<"begin findsignal\n"; FindSignal(myString, stringLength, pbs, ppt, sticks[i]); ps_scan.AddRegion(sticks[i].end1 + 1, sticks[i].pos2 - 1); } timehere.markbeg(); ps_scan.Predict(myString, stringLength); timehere.markend (); timehere.outtime ("Predict protein Domains"); for (int i = 0;i < sticks.size 
(); ++i) { if (sticks[i].score < 0) continue; //cout<<"begin finddomain\n"; ps_scan.Find(sticks[i].end1 + 1, sticks[i].pos2 - 1, sticks[i].motif); //cout<<"count score\n"; CountScore(sticks[i]); //cerr<<"len:"<<sticks[i].match_len<<" score:"<<sticks[i].match_score<<endl; sticks[i].match_score = sticks[i].match_score / sticks[i].match_len; //score/average_len if (sticks[i].match_score > 1) { //cerr << "similarity bigger than1:" << sticks[i].match_score << endl; sticks[i].match_score=1;//why??? } } //cerr<<"after minSharpness filter:"<<count_do<<endl; //may be this function is needn't //EraseOverlap(sticks); stable_sort (sticks.begin (), sticks.end ()); //cout<<"out result\n"; int result_count = OutPutResult(name, myString, sequence.l, sticks, fig); if (CHECK_PAIRS) ps_scan.PrintNoUsed(); cout << endl; if (result_count && ishtml && fig.good()) { total_img++; cout << "</pre>\n"; cout << "<img src=\"" << total_img << ".png\">\n"; cout << "<pre>\n"; } } if (total_sequence == 0) { cout << "No sequence found, please input FASTA format sequence and try again\n"; } fig.close(); fclose(inFASTA); free (sequence.s); timeall.markend (); timeall.outtime("Total consume"); if (ishtml) cout << "</pre>\n</body>\n</html>\n"; return (0); }