示例#1
0
int main(int argc, char* argv[])
{
  char* myString;
  int* suffixArray;
  int stringLength;
  int i;
  ifstream inFile;
  inFile.open(argv[1]);
  Timing timehere;

  if (strcmp(argv[1], "test.dat") != 0) 
  {
    timehere.markbeg();
    if (strstr(argv[1], ".fas")[0] == '.')
    {
      read_fasta(inFile, myString, stringLength);
    }
    else
    {
      read_input(inFile, myString, stringLength);
    }
    timehere.markend();
    inFile.close();
    cout << "finish read "
	 << stringLength << " characters."<< endl;
    timehere.outtime();
  }
  else
  {
    read_input(inFile, myString, stringLength);
    inFile.close();
    cout << "finish read " 
	 << stringLength << " characters."<< endl;
  }

  timehere.markbeg();
  suffixArray = LinearSuffixSort(myString, stringLength);
  timehere.markend();
  timehere.outtime("finish suffix sort,");

  if (strcmp(argv[1], "test.dat") == 0) 
  {
    int result;
    bool pass = true;
    ifstream resultF;
    resultF.open("result.test.dat");

    cout << "Testing the Suffix Array" << endl;

    for (i = 0; i < stringLength; i++)
    {
      resultF >> result;
      if (result != suffixArray[i])
      {
	pass = false;
      }
    }
    if (pass == false)
    {
      cout << endl;
      cout << "***************" << endl;
      cout << "test has failed" << endl;
      cout << "***************" << endl;
    }
    else
    {
      cout << endl;
      cout << "******************" << endl;
      cout << "test is successful" << endl;
      cout << "******************" << endl;
    }
  }
示例#2
0
// Entry point of LTR_FINDER.
//
// Parses the command-line options with getopt(), storing most values in
// variables declared outside this function (gap_open, Dmax, Lmin, ...).
// Prints the usage text to stderr and exits with status 1 when -h is given,
// an unknown option is seen, or no input file follows the options.
//
// Otherwise it optionally loads a tRNA file (-s), initialises the domain
// scanner and the score matrix, and then, for every record read from the
// input FASTA file (optionally restricted by the -P regular expression):
//   1. maps the sequence through transDNA,
//   2. collects candidate pairs (GetPairs) and joins them (JoinPairs),
//   3. extends each joined candidate and looks for signals
//      (ExtendPairs / FindSignal),
//   4. runs the domain prediction, scores and sorts the candidates,
//   5. prints the result through OutPutResult().
//
// Returns 0 after all records have been processed.
int main (int argc, char *argv[])
{
    string namePattern = "";
    const char *version = "1.06";

    bool display_usage = false;
    // getopt() returns an int and signals the end of the options with -1.
    // Storing it in a plain char breaks the loop test on platforms where
    // char is unsigned (-1 becomes 255 and never compares as "done").
    int c;
    CPSSCAN ps_scan;
    PBS pbs;
    PPT ppt(PPT_window, 4);
    char *tRNA_file = NULL;
    char *ps_dir = NULL;
    bool ishtml = false;

    struct re_pattern_buffer id_filter;
    id_filter.allocated = 0;
    id_filter.buffer = 0;
    id_filter.fastmap = 0;
    id_filter.translate = 0;

    Timing timehere;
    Timing timeall;
    timeall.markbeg();

    while ((c = getopt (argc, argv, "ho:t:e:m:u:D:d:L:l:p:s:cw:S:a:P:g:F:B:b:J:j:O:r:M:f:xEiCG:T:")) != -1)
    {
        switch (c)
        {

        case 'h':
            display_usage = true;
            break;

        case 'o':
            gap_open = atoi (optarg);
            break;

        case 't':
            gap_ext = atoi (optarg);
            break;

        case 'e':
            gap_end = atoi (optarg);
            break;

        case 'm':
            score_match = atoi (optarg);
            break;

        case 'u':
            score_mismatch = atoi (optarg);
            break;

        case 'D':
            Dmax = atoi (optarg);
            break;

        case 'd':
            Dmin = atoi (optarg);
            break;

        case 'L':
            Lmax = atoi (optarg);
            break;

        case 'l':
            Lmin = atoi (optarg);
            break;

        case 'p':
            Lex = atoi (optarg);
            break;

        case 'c':
            CHECK_PAIRS = 1;
            break;

        case 'x':
            ishtml = true;
            break;

        case 'E':
            edge_signal = true;
            break;

        case 'i':
            showPairNum = true;
            break;

        case 'w':
            wrought = atoi(optarg);
            break;

        case 'P':
            namePattern = optarg;
            break;

        case 'F':
            Filter = optarg;
            break;

        case 'g':
            MaxGap = atoi(optarg);
            break;

        case 'O':
            outAlignLen = atoi(optarg);
            break;

        case 'G':
            max_sub_rt_gap = atoi(optarg);
            break;

        case 'T':
            min_sub_rt_count = atoi(optarg);
            break;

        case 'r':
            PBS_minLen = atoi(optarg);
            break;

        case 'j':
            JoinThreshold = atof(optarg);
            break;

        case 'J':
            SplitThreshold = atof(optarg);
            break;

        case 'S':
            minOutScore = atof(optarg);
            break;

        case 'B':
            HigherSharpness = atof(optarg);
            break;

        case 'b':
            LowerSharpness = atof(optarg);
            break;

        case 'M':
            minMatchSim = atof(optarg);
            break;

        case 's':
            tRNA_file = optarg;
            break;

        case 'a':
            ps_dir = optarg;
            break;

        case 'f':
            fig_file = optarg;
            break;

        case 'C':
            checkCentriole = true;
            break;

        default:
            // Unknown option or missing argument: fall back to the usage text.
            display_usage = true;
            break;
        }

    }

    // No input file after the options, or help explicitly requested.
    if (optind >= argc || display_usage)
    {
        fprintf (stderr, "ltr_finder v%s\n", version);
        fprintf (stderr, "Usage  : [options] <INPUT_FASTA_FILE>\n");
        fprintf (stderr,
                 "         -o NUM     gap open penalty, default is %d\n",
                 gap_open);
        fprintf (stderr,
                 "         -t NUM     gap extension penalty, default is %d\n",
                 gap_ext);
        fprintf (stderr,
                 "         -e NUM     gap end penalty, default is %d\n",
                 gap_end);
        fprintf (stderr, "         -m NUM     match score, default is %d\n",
                 score_match);
        fprintf (stderr, "         -u NUM     unmatch score, default is %d\n",
                 score_mismatch);
        fprintf (stderr,
                 "         -D NUM     Max distance between 5'&3'LTR, default is %d\n",
                 Dmax);
        fprintf (stderr,
                 "         -d NUM     Min distance between 5'&3'LTR, default is %d\n",
                 Dmin);
        fprintf (stderr,
                 "         -L NUM     Max length of 5'&3'LTR, default is %d\n",
                 Lmax);
        fprintf (stderr,
                 "         -l NUM     Min length of 5'&3'LTR, default is %d\n",
                 Lmin);
        fprintf (stderr,
                 "         -p NUM     min length of exact match pair, default is %d\n",
                 Lex);
        fprintf (stderr,
                 "         -g NUM     Max gap between joined pairs, default is %d\n",
                 MaxGap);
        fprintf (stderr,
                 "         -G NUM     Max gap between RT sub-domains, default is %d\n",
                 max_sub_rt_gap);
        fprintf (stderr,
                 "         -T NUM     Min sub-domains found in a RT domain, default is %d\n",
                 min_sub_rt_count);
        fprintf (stderr, "         -j NUM     Threshold for join new sequence in existed alignment\n");
        fprintf (stderr, "                    new alignment similarity higher than this will be joined,\n");
        fprintf (stderr, "                    default is %0.2f\n",
                 JoinThreshold);
        fprintf (stderr, "         -J NUM     Threshold for split existed alignment to two part\n");
        fprintf (stderr, "                    new alignment similarity lower than this will be split,\n");
        fprintf (stderr, "                    set this threshold lower than -j, means turn it off,\n");
        fprintf (stderr, "                    default is %0.2f\n",
                 SplitThreshold);

        fprintf (stderr,
                 "         -S NUM     output Score limit, default is %0.2f, [0,10]\n",
                 minOutScore);
        fprintf (stderr,
                 "         -M NUM     min LTR similarity threshold, default is %0.2f, [0,1]\n",
                 minMatchSim);
        fprintf (stderr, "         -B NUM     Boundary alignment sharpness threshold, higher one.\n");
        fprintf (stderr, "                     one of the two edge's sharpness must higher than\n");
        fprintf (stderr, "                     this threshold, default is %0.3f, [0,1]\n",
                 HigherSharpness);
        fprintf (stderr, "         -b NUM     Boundary alignment sharpness threshold, lower one.\n");
        fprintf (stderr, "                     both of the two edge's sharpness must higher than\n");
        fprintf (stderr, "                     this threshold, default is %0.3f, [0,1]\n",
                 LowerSharpness);

        fprintf (stderr,
                 "         -r NUM     PBS detecting threshold, min tRNA match length: %d, [1,18]\n",
                 PBS_minLen);
        fprintf (stderr,
                 "         -w NUM     output format: [0]-full, 1-summary, 2-table.\n");
        fprintf (stderr,
                 "         -O NUM     output alignment length(only affect -w0), default is %d\n",
                 outAlignLen);
        fprintf (stderr, "         -P STR     SeqIDs, will only calculate matched SeqID\n");
        fprintf (stderr, "                      POSIX style regular express is supported.\n");
        fprintf (stderr,
                 "         -s filename      tRNA sequence file(FASTA format)\n");
        fprintf (stderr,
                 "         -f filename      data file used to draw figure\n");
        fprintf (stderr,
                 "         -a ps_scan_dir   Use ps_scan to predict protein domain\n");
        fprintf (stderr, "         -x         Output in html format\n");
        fprintf (stderr, "         -E         LTR must have edge signal\n");
        fprintf (stderr, "                    (at least two of PBS,PPT,TSR)\n");
        fprintf (stderr, "         -C         detect Centriole, delete highly repeat regions\n");
        fprintf (stderr,
                 "         -F 01string      Filter to choose desired result,default is 0\n");
        fprintf (stderr, "                     10000000000 5'-LTR must have TG\n");
        fprintf (stderr, "                     01000000000 5'-LTR must have CA\n");
        fprintf (stderr, "                     00100000000 3'-LTR must have TG\n");
        fprintf (stderr, "                     00010000000 3'-LTR must have CA\n");
        fprintf (stderr, "                     00001000000 TSR must be found\n");
        fprintf (stderr, "                     00000100000 PBS must be found\n");
        fprintf (stderr, "                     00000010000 PPT must be found\n");
        fprintf (stderr, "                     00000001000 RT domain muse be found\n");
        fprintf (stderr, "                     00000000100 Integrase core must be found\n");
        fprintf (stderr, "                     00000000010 Integrase c-term must be found\n");
        fprintf (stderr, "                     00000000001 RNase H must be found\n");

        fprintf (stderr, "         -h         help\n");
        exit (1);
    }

    // Report banner; in HTML mode the whole report is wrapped in <pre>.
    if (ishtml)
        printf("<html>\n<head>\n<title>LTR_FINDER Result</title>\n</head>\n<body>\n<pre>\n");

    printf("Program    : LTR_FINDER\n");

    printf("Version    : %s\n\n", version);

    // Compile the -P pattern. re_compile_pattern() returns NULL on success
    // and an error string otherwise; the result is consulted again in the
    // record loop so that a pattern that failed to compile is never searched.
    const char *id_filter_stat;

    re_syntax_options = RE_SYNTAX_POSIX_EGREP |
                        RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL;

    id_filter_stat = re_compile_pattern(namePattern.c_str(),
                                        namePattern.length(), &id_filter);

    if (id_filter_stat != NULL)
    {
        printf("not a vaild POSIX regex after -P, code = %s\n", id_filter_stat);
    }

    // Optional tRNA file (-s); loading is timed and reported.
    if (tRNA_file != NULL)
    {
        timehere.markbeg();
        string tmp_out = "Load tRNA db [";
        tmp_out += tRNA_file;
        tmp_out += "] ";
        pbs.LoadSeq(tRNA_file);
        timehere.markend ();
        timehere.outtime(tmp_out.c_str());
    }

    // ps_dir stays NULL when -a was not given.
    ps_scan.init(ps_dir);
    set_score_matrix(gap_open, gap_ext, gap_end, score_match, score_mismatch);

    FILE * inFASTA = fopen(argv[optind], "r");

    if (inFASTA == NULL)
    {
        cerr << "open " << argv[optind] << " error!" << endl;
        exit(1);
    }

    // Initialise the translation table: A/C/G/T in either case map to upper
    // case, U/u map to T, every other code in 0..127 maps to 'N'.
    for (int i = 0; i < 128; ++i)
        transDNA[i] = 'N';
    transDNA['a']='A';
    transDNA['A']='A';
    transDNA['c']='C';
    transDNA['C']='C';
    transDNA['g']='G';
    transDNA['G']='G';
    transDNA['t']='T';
    transDNA['T']='T';
    transDNA['u']='T';
    transDNA['U']='T';

    seq_t sequence;
    sequence.s = NULL;
    sequence.m = 0;
    char name[1024];

    ofstream fig;

    if (fig_file != NULL)
        fig.open(fig_file);

    int total_sequence = 0;

    int total_img = 0;

    // read_fasta() returns -1 once no further record can be read.
    while ( -1 != read_fasta(inFASTA, &sequence, name, 0))
    {
        total_sequence++;

        // Skip records whose name does not match the -P pattern. The filter
        // applies only when a pattern was given and it compiled successfully.
        if (namePattern.length() > 0 && id_filter_stat == NULL &&
                re_search( &id_filter, name, strlen(name),
                           0, strlen(name), 0) < 0 )
            continue;

        // Normalise the sequence in place. Only entries 0..127 of transDNA
        // are initialised above, so any byte outside that range is mapped to
        // 'N' directly instead of being used as a table index.
        for (int i = 0; i < sequence.l; ++i)
        {
            const unsigned char code = (unsigned char) sequence.s[i];
            sequence.s[i] = (code < 128) ? transDNA[code] : 'N';
        }

        char* myString = (char*)sequence.s;

        int stringLength = sequence.l;

        if (showPairNum)
            cout << "Sequence:" << name << " Len:" << stringLength << " ";

        vector < candidate > Pair;

        GetPairs (myString, stringLength, Lex, Lmax, Dmin, Dmax, Pair);

        // With -i nothing further is done for this record.
        if (showPairNum)
            continue;

        // With -c, dump the raw pairs for checking.
        if (CHECK_PAIRS)
        {
            for (size_t k = 0; k < Pair.size (); ++k)
            {
                cout << setw (10);
                cout << Pair[k].pos1 << "-" << Pair[k].pos1 + Pair[k].len - 1 << " * "
                << Pair[k].pos2 << "-" << Pair[k].pos2 + Pair[k].len - 1 << endl;
            }
        }

        // Join the pairs into candidate "sticks".
        vector < stick > sticks;

        JoinPairs(myString, stringLength, Pair, sticks);

        int count_do = 0;

        ps_scan.reset();

        for (size_t k = 0; k < sticks.size (); ++k)
        {
            sticks[k].score = 0;

            // Candidates that cannot be extended get no signal search and
            // no scan region.
            if (!ExtendPairs(myString, stringLength, sticks[k]))
                continue;

            count_do++;

            FindSignal(myString, stringLength, pbs, ppt, sticks[k]);

            // Register the region between end1 and pos2 for domain scanning.
            ps_scan.AddRegion(sticks[k].end1 + 1, sticks[k].pos2 - 1);
        }

        timehere.markbeg();
        ps_scan.Predict(myString, stringLength);
        timehere.markend ();
        timehere.outtime ("Predict protein Domains");

        for (size_t k = 0; k < sticks.size (); ++k)
        {
            // Candidates carrying a negative score are not scored further.
            if (sticks[k].score < 0)
                continue;

            ps_scan.Find(sticks[k].end1 + 1, sticks[k].pos2 - 1, sticks[k].motif);

            CountScore(sticks[k]);

            // Normalise the match score by the match length.
            // NOTE(review): assumes match_len is non-zero here - confirm.
            sticks[k].match_score = sticks[k].match_score / sticks[k].match_len;

            // Clamp to 1 (the original author was unsure why values above 1
            // occur).
            if (sticks[k].match_score > 1)
            {
                sticks[k].match_score = 1;
            }

        }

        stable_sort (sticks.begin (), sticks.end ());

        int result_count = OutPutResult(name, myString, sequence.l, sticks, fig);

        if (CHECK_PAIRS)
            ps_scan.PrintNoUsed();

        cout << endl;

        // In HTML mode, reference one numbered image per record with results.
        // NOTE(review): fig.good() is tested even when -f was not given and
        // the stream was never opened - confirm this is intended.
        if (result_count && ishtml && fig.good())
        {
            total_img++;
            cout << "</pre>\n";
            cout << "<img src=\"" << total_img << ".png\">\n";
            cout << "<pre>\n";
        }

    }

    if (total_sequence == 0)
    {
        cout << "No sequence found, please input FASTA format sequence and try again\n";
    }

    fig.close();
    fclose(inFASTA);
    free (sequence.s);
    timeall.markend ();
    timeall.outtime("Total consume");

    if (ishtml)
        cout << "</pre>\n</body>\n</html>\n";

    return (0);
}