Beispiel #1
0
int main (int argc, const char * argv[]){

    struct timeval start, end;
    gettimeofday(&start, NULL);

    // general parameters
    size_t maxSeqLen = 50000;
    int seqType = Sequence::AMINO_ACIDS;

    // parameter for the prefiltering
    int kmerSize = 6;
    int alphabetSize = 21;
    size_t maxResListLen = 100;
    int split = 0;
    int skip = 0;
    bool aaBiasCorrection = true;
    float zscoreThr = 50.0f;
    float sensitivity = 4.0;

    // parameters for the alignment
    double evalThr = 0.001;
    double covThr = 0.8;
    int maxAlnNum = 10;

    std::string lastSeqDB = "";
    std::string currentSeqDB = "";
    std::string cluDB = ""; 
    std::string outDB = "";
    std::string tmpDir = "";

    // get the path of the scoring matrix
    char* mmdir = getenv ("MMDIR");
    if (mmdir == 0){
        std::cerr << "Please set the environment variable $MMDIR to your MMSEQS installation directory.\n";
        exit(1);
    }
    std::string scoringMatrixFile(mmdir);
    scoringMatrixFile = scoringMatrixFile + "/data/blosum62.out";

    parseArgs(argc, argv, &lastSeqDB, &currentSeqDB, &cluDB, &outDB, &tmpDir, &scoringMatrixFile, &maxSeqLen);

    std::string lastSeqDBIndex = lastSeqDB + ".index";
    std::string currentSeqDBIndex = currentSeqDB + ".index";
    std::string cluDBIndex = cluDB + ".index";
    std::string outDBIndex = outDB + ".index";

    std::list<std::string>* tmpFiles = new std::list<std::string>();
    std::string AIndex = tmpDir + "/A.index";
    std::string BIndex = tmpDir + "/B.index";
    tmpFiles->push_back(AIndex);
    tmpFiles->push_back(BIndex);

    std::string Brest_indexFile = tmpDir + "/Brest.index";
    tmpFiles->push_back(Brest_indexFile);
    
    std::string BB_clu = tmpDir + "/BB_clu";
    std::string BB_clu_index = BB_clu + ".index";
    tmpFiles->push_back(BB_clu);
    tmpFiles->push_back(BB_clu_index);
    
    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    std::cout << "///////                   Init                             /////////////\n";
    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    // extract three indexes:
    // - A: last database version without deleted sequences
    // - B: sequences which are new in the database
    writeIndexes(AIndex, BIndex, lastSeqDBIndex, currentSeqDBIndex);


    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    std::cout << "///////            Calculating B->A scores                 /////////////\n";
    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    // calculate score for the updating
    // B->A scores
    std::string BA_base = runScoresCalculation(currentSeqDB, BIndex,
            currentSeqDB, AIndex,
            tmpDir,
            scoringMatrixFile, maxSeqLen, seqType,
            kmerSize, alphabetSize, maxResListLen, split, skip, aaBiasCorrection, zscoreThr, sensitivity,
            evalThr, covThr, maxAlnNum, "BA", tmpFiles);

    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    std::cout << "///////      Adding sequences to existing clusters         /////////////\n";
    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    // update the clustering
    DBReader* currSeqDbr = new DBReader(currentSeqDB.c_str(), currentSeqDBIndex.c_str());
    currSeqDbr->open(DBReader::NOSORT);

    // data structures for the clustering
    int seqDBSize = currSeqDbr->getSize();
    unsigned int* id2rep = new unsigned int[seqDBSize];
    char** rep2cluName = new char*[seqDBSize];
    for (int i = 0; i < seqDBSize; i++)
        rep2cluName[i] = new char[FFINDEX_MAX_ENTRY_NAME_LENTH];
    cluster_t* clusters = new cluster_t[seqDBSize];
    for (int i = 0; i < seqDBSize; i++){
        clusters[i].clu_size = 0;
        clusters[i].first = 0;
        clusters[i].last = 0;
    }

    std::cout << "Read the existing clustering...\n";
    // Read the existing clustering
    readClustering(currSeqDbr, cluDB, id2rep, rep2cluName, clusters);

    std::cout << "Append new sequences to the existing clustering...\n";
    // append sequences from the new database to the existing clustering based on the B->A alignment scores
    // write sequences without a match to a separate index (they will be clustered separately)
    appendToClustering(currSeqDbr, BIndex, BA_base, id2rep, clusters, Brest_indexFile);

    if (seqsWithoutMatches > 0){
        std::cout << "////////////////////////////////////////////////////////////////////////\n";
        std::cout << "///////            Calculating B->B scores                 /////////////\n";
        std::cout << "////////////////////////////////////////////////////////////////////////\n";
        // B->B scores
        std::string BB_base = runScoresCalculation(currentSeqDB, Brest_indexFile, 
                currentSeqDB, Brest_indexFile,
                tmpDir,
                scoringMatrixFile, maxSeqLen, seqType,
                kmerSize, alphabetSize, maxResListLen, split, skip, aaBiasCorrection, zscoreThr, sensitivity,
                evalThr, covThr, maxAlnNum, "BB", tmpFiles);

        std::cout << "////////////////////////////////////////////////////////////////////////\n";
        std::cout << "///////             Appending new clusters                 /////////////\n";
        std::cout << "////////////////////////////////////////////////////////////////////////\n";
        std::cout << "Cluster new sequences without a match to the existing clusters...\n";
        // cluster sequences without a match to the existing clusters separately
        // use the index generated in the previous step
        Clustering* clu = new Clustering(currentSeqDB, currentSeqDBIndex,
                BB_base, BB_base + ".index",
                BB_clu, BB_clu_index,
                0.0, 0, maxResListLen);
        clu->run(Clustering::SET_COVER); 

        std::cout << "Append generated clusters to the complete clustering...\n";
        // append B->B clusters to the clustering
        newClus = readClustering(currSeqDbr, BB_clu, id2rep, rep2cluName, clusters);
    }

    // write new clustering
    std::cout << "Write clustering results...\n";
    writeResults(clusters, rep2cluName, currSeqDbr, seqDBSize, outDB);
    std::cout << "done.\n";

    currSeqDbr->close();

    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    std::cout << "///////                   Statistics                            ////////\n";
    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    std::cout << "\nPrevios database version: " << oldDBSize << " entries.\n";
    std::cout << "New database vesion     : " << newDBSize << " entries.\n";
    std::cout << deletedSeqs << " entries were deleted,\n";
    std::cout << newSeqs << " entries are new,\n";
    std::cout << sharedSeqs << " entries are shared.\n\n";

    std::cout << seqsWithMatches << " new sequences had matches to the previous database version.\n";
    std::cout << "Remaining " << seqsWithoutMatches << " were grouped into " << newClus << " new clusters.\n";
 
    gettimeofday(&end, NULL);
    int sec = end.tv_sec - start.tv_sec;
    std::cout << "\nTime for updating: " << (sec / 3600) << " h " << (sec % 3600 / 60) << " m " << (sec % 60) << "s\n\n";

    deleteTmpFiles(tmpFiles);
    delete tmpFiles;

}