Ejemplo n.º 1
0
int genSSAMain(int argc, char** argv)
{
    Timer t("sga gen-ssa");
    parseGenSSAOptions(argc, argv);
    
    BWT* pBWT = new BWT(opt::prefix + BWT_EXT);
    ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pBWT->getNumStrings(), RIO_NUMERICID);
    pBWT->printInfo();

    SampledSuffixArray* pSSA = new SampledSuffixArray();
    pSSA->build(pBWT, pRIT, opt::sampleRate);
    pSSA->printInfo();
    pSSA->writeSSA(opt::prefix + SSA_EXT);

    if(opt::validate)
        pSSA->validate(opt::readsFile, pBWT);

    delete pBWT;
    delete pRIT;
    delete pSSA;

    return 0;
}
Ejemplo n.º 2
0
// Cluster the reads and write the clusters, labelled with read names, to opt::outFile.
//
// The work is done in two phases:
//  1. The reads in opt::readsFile (or the existing clusters in opt::extendFile,
//     when one is given) are processed serially or with opt::numThreads
//     workers. The post-processor is handed a writer for the temporary file
//     "<opt::outFile>.preclusters".
//  2. The preclusters file is read back line by line. Each record holds five
//     whitespace-separated fields (cluster name, cluster size, read sequence,
//     low index, high index) and is expanded into one output line per index in
//     [low, high], with the read name looked up through the suffix array index
//     and the read-info table. Records with low > high are written once with a
//     synthetic "seed-N" name. The temporary file is removed afterwards.
//
// Malformed precluster lines are skipped with a warning on stderr.
void cluster()
{
    BWT* pBWT = new BWT(opt::prefix + BWT_EXT);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT);
    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,opt::errorRate, opt::seedLength, opt::seedStride, true);

    // Switch the overlapper to exact mode when the error rate is effectively zero
    pOverlapper->setExactModeOverlap(opt::errorRate < 0.001f);
    pOverlapper->setExactModeIrreducible(opt::errorRate < 0.001f);

    // Bit vector sized to the number of strings in the BWT; shared by the
    // processors (through the parameters) and the post-processor
    BitVector markedReads(pBWT->getNumStrings());

    std::string preclustersFile = opt::outFile + ".preclusters";
    std::ostream* pPreWriter = createWriter(preclustersFile);
    ClusterPostProcess postProcessor(pPreWriter, opt::minSize, &markedReads);
    
    // Set the cluster parameters
    ClusterParameters parameters;
    parameters.pOverlapper = pOverlapper;
    parameters.minOverlap = opt::minOverlap;
    parameters.maxClusterSize = opt::maxSize;
    parameters.maxIterations = opt::maxIterations;
    parameters.pMarkedReads = &markedReads;

    // Read the limit kmer sequences, if provided
    std::set<std::string>* pLimitKmers = NULL;

    if(!opt::limitFile.empty())
    {
        // Read in the limit sequences
        pLimitKmers = new std::set<std::string>;
        readLimitKmers(pLimitKmers);
        parameters.pLimitKmers = pLimitKmers;
        parameters.limitK = opt::limitKmer;
    }
    else
    {
        parameters.pLimitKmers = NULL;
        parameters.limitK = 0;
    }

    // Make pre-clusters from the reads
    if(opt::numThreads <= 1)
    {
        printf("[%s] starting serial-mode read clustering\n", PROGRAM_IDENT);
        ClusterProcess processor(parameters);
        
        // If the extend file is empty, build new clusters
        if(opt::extendFile.empty())
        {
            PROCESS_CLUSTER_SERIAL(opt::readsFile, &processor, &postProcessor);
        }
        else
        {
            // Process a set of preexisting clusters
            ClusterReader clusterReader(opt::extendFile);
            PROCESS_EXTEND_SERIAL(clusterReader, &processor, &postProcessor);
        }
    }
    else
    {
        printf("[%s] starting parallel-mode read clustering computation with %d threads\n", PROGRAM_IDENT, opt::numThreads);
        
        // One processor per thread, all built from the same parameters
        std::vector<ClusterProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            ClusterProcess* pProcessor = new ClusterProcess(parameters);
            processorVector.push_back(pProcessor);
        }
        
        if(opt::extendFile.empty())
        {
            PROCESS_CLUSTER_PARALLEL(opt::readsFile, processorVector, &postProcessor);
        }
        else
        {
            ClusterReader clusterReader(opt::extendFile);
            PROCESS_EXTEND_PARALLEL(clusterReader, processorVector, &postProcessor);
        }
        
        for(size_t i = 0; i < processorVector.size(); ++i)
        {
            delete processorVector[i];
            processorVector[i] = NULL;
        }
    }

    // The preclusters writer is destroyed here, before the file is read back below
    delete pPreWriter;
    delete pBWT;
    delete pRBWT;
    delete pOverlapper;

    // Deallocate limit kmers (deleting a null pointer is a no-op)
    delete pLimitKmers;

    // Open the preclusters file and convert them to read names
    SuffixArray* pFwdSAI = new SuffixArray(opt::prefix + SAI_EXT);
    ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pFwdSAI->getNumStrings());

    size_t seedIdx = 0;
    std::istream* pPreReader = createReader(preclustersFile);
    std::ostream* pClusterWriter = createWriter(opt::outFile);
    std::string line;
    while(getline(*pPreReader,line))
    {
        std::stringstream parser(line);
        std::string clusterName;
        std::string readSequence;
        size_t clusterSize = 0;
        int64_t lowIdx = 0;
        int64_t highIdx = 0;

        // A failed extraction would leave the remaining fields untouched, so
        // the record must not be used unless every field was parsed
        if(!(parser >> clusterName >> clusterSize >> readSequence >> lowIdx >> highIdx))
        {
            fprintf(stderr, "[%s] warning: skipping malformed precluster record: %s\n", PROGRAM_IDENT, line.c_str());
            continue;
        }

        if(lowIdx > highIdx)
        {
            // This is an extra read that is not present in the FM-index
            // Output a record with a fake read ID
            *pClusterWriter << clusterName << "\t" << clusterSize << "\tseed-" << seedIdx++ << "\t" << readSequence << "\n";
        }
        else
        {
            // One output line for every index in the inclusive range [lowIdx, highIdx]
            for(int64_t i = lowIdx; i <= highIdx; ++i)
            {
                const ReadInfo& targetInfo = pRIT->getReadInfo(pFwdSAI->get(i).getID());
                std::string readName = targetInfo.id;
                *pClusterWriter << clusterName << "\t" << clusterSize << "\t" << readName << "\t" << readSequence << "\n";
            }
        }
    }

    // Remove the temporary preclusters file; the result of unlink is not checked
    unlink(preclustersFile.c_str());

    delete pFwdSAI;
    delete pRIT;
    delete pPreReader;
    delete pClusterWriter;
}
Ejemplo n.º 3
0
//
// Main
//
int filterMain(int argc, char** argv)
{
    parseFilterOptions(argc, argv);
    Timer* pTimer = new Timer(PROGRAM_IDENT);


    BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate);
    //pBWT->printInfo();

    std::ostream* pWriter = createWriter(opt::outFile);
    std::ostream* pDiscardWriter = createWriter(opt::discardFile);
    QCPostProcess* pPostProcessor = new QCPostProcess(pWriter, pDiscardWriter);

    // If performing duplicate check, create a bitvector to record
    // which reads are duplicates
    BitVector* pSharedBV = NULL;
    if(opt::dupCheck)
        pSharedBV = new BitVector(pBWT->getNumStrings());

    // Set up QC parameters
    QCParameters params;
    params.pBWT = pBWT;
    params.pRevBWT = pRBWT;
    params.pSharedBV = pSharedBV;

    params.checkDuplicates = opt::dupCheck;
    params.substringOnly = opt::substringOnly;
    params.checkKmer = opt::kmerCheck;
    params.checkHPRuns = opt::hpCheck;
    params.checkDegenerate = opt::lowComplexityCheck;

    params.verbose = opt::verbose;

    params.kmerLength = opt::kmerLength;
    params.kmerThreshold = opt::kmerThreshold;

    params.hpKmerLength = 51;
    params.hpHardAcceptCount = 10;
    params.hpMinProportion = 0.1f;
    params.hpMinLength = 6;

    if(opt::numThreads <= 1)
    {
        // Serial mode
        QCProcess processor(params);
        PROCESS_FILTER_SERIAL(opt::readsFile, &processor, pPostProcessor);
    }
    else
    {
        // Parallel mode
        std::vector<QCProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            QCProcess* pProcessor = new QCProcess(params);
            processorVector.push_back(pProcessor);
        }

        PROCESS_FILTER_PARALLEL(opt::readsFile, processorVector, pPostProcessor);

        for(int i = 0; i < opt::numThreads; ++i)
            delete processorVector[i];
    }

    delete pPostProcessor;
    delete pWriter;
    delete pDiscardWriter;

    delete pBWT;
    delete pRBWT;

    if(pSharedBV != NULL)
        delete pSharedBV;

    std::cout << "RE-building index for " << opt::outFile << " in memory using ropebwt2\n";
    std::string prefix=stripFilename(opt::outFile);
        //BWT *pBWT, *pRBWT;
		#pragma omp parallel
		{
			#pragma omp single nowait
			{	
			    std::string bwt_filename = prefix + BWT_EXT;
				BWTCA::runRopebwt2(opt::outFile, bwt_filename, opt::numThreads, false);
				std::cout << "\t done bwt construction, generating .sai file\n";
				pBWT = new BWT(bwt_filename);
			}
			#pragma omp single nowait
			{	
				std::string rbwt_filename = prefix + RBWT_EXT;
				BWTCA::runRopebwt2(opt::outFile, rbwt_filename, opt::numThreads, true);
				std::cout << "\t done rbwt construction, generating .rsai file\n";
				pRBWT = new BWT(rbwt_filename);
			}
		}
        std::string sai_filename = prefix + SAI_EXT;
		SampledSuffixArray ssa;
        ssa.buildLexicoIndex(pBWT, opt::numThreads);
        ssa.writeLexicoIndex(sai_filename);
        delete pBWT;

        std::string rsai_filename = prefix + RSAI_EXT;
        SampledSuffixArray rssa;
        rssa.buildLexicoIndex(pRBWT, opt::numThreads);
        rssa.writeLexicoIndex(rsai_filename);
        delete pRBWT;

    // Cleanup
    delete pTimer;

    return 0;
}
Ejemplo n.º 4
0
//
// Main
//
int filterMain(int argc, char** argv)
{
    parseFilterOptions(argc, argv);
    Timer* pTimer = new Timer(PROGRAM_IDENT);


    BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate);
    pBWT->printInfo();

    std::ostream* pWriter = createWriter(opt::outFile);
    std::ostream* pDiscardWriter = createWriter(opt::discardFile);
    QCPostProcess* pPostProcessor = new QCPostProcess(pWriter, pDiscardWriter);

    // If performing duplicate check, create a bitvector to record
    // which reads are duplicates
    BitVector* pSharedBV = NULL;
    if(opt::dupCheck)
        pSharedBV = new BitVector(pBWT->getNumStrings());

    // Set up QC parameters
    QCParameters params;
    params.pBWT = pBWT;
    params.pRevBWT = pRBWT;
    params.pSharedBV = pSharedBV;

    params.checkDuplicates = opt::dupCheck;
    params.substringOnly = opt::substringOnly;
    params.checkKmer = opt::kmerCheck;
    params.kmerBothStrand = opt::kmerBothStrand;
    params.checkHPRuns = opt::hpCheck;
    params.checkDegenerate = opt::lowComplexityCheck;

    params.verbose = opt::verbose;

    params.kmerLength = opt::kmerLength;
    params.kmerThreshold = opt::kmerThreshold;

    params.hpKmerLength = 51;
    params.hpHardAcceptCount = 10;
    params.hpMinProportion = 0.1f;
    params.hpMinLength = 6;

    if(opt::numThreads <= 1)
    {
        // Serial mode
        QCProcess processor(params);
        PROCESS_FILTER_SERIAL(opt::readsFile, &processor, pPostProcessor);
    }
    else
    {
        // Parallel mode
        std::vector<QCProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            QCProcess* pProcessor = new QCProcess(params);
            processorVector.push_back(pProcessor);
        }

        PROCESS_FILTER_PARALLEL(opt::readsFile, processorVector, pPostProcessor);

        for(int i = 0; i < opt::numThreads; ++i)
            delete processorVector[i];
    }

    delete pPostProcessor;
    delete pWriter;
    delete pDiscardWriter;

    delete pBWT;
    delete pRBWT;

    if(pSharedBV != NULL)
        delete pSharedBV;

    // Rebuild the FM-index without the discarded reads
    std::string out_prefix = stripFilename(opt::outFile);
    removeReadsFromIndices(opt::prefix, opt::discardFile, out_prefix, BWT_EXT, SAI_EXT, false, opt::numThreads);
    removeReadsFromIndices(opt::prefix, opt::discardFile, out_prefix, RBWT_EXT, RSAI_EXT, true, opt::numThreads);

    // Cleanup
    delete pTimer;
    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
Ejemplo n.º 5
0
//
// Main
//
int FMMergeMain(int argc, char** argv)
{
    parseFMMergeOptions(argc, argv);

    BWT* pBWT = new BWT(opt::prefix + BWT_EXT);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT);
    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,0.0f, 0,0,true); 
    pOverlapper->setExactModeOverlap(true);
    pOverlapper->setExactModeIrreducible(true);
    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Construct a bitvector indicating what reads have been used
    // All the processes read from this vector and only the post processor
    // writes to it.
    BitVector markedReads(pBWT->getNumStrings());

    std::ostream* pWriter = createWriter(opt::outFile);
    FMMergePostProcess postProcessor(pWriter, &markedReads);

    if(opt::numThreads <= 1)
    {
        printf("[%s] starting serial-mode read merging\n", PROGRAM_IDENT);
        FMMergeProcess processor(pOverlapper, opt::minOverlap, &markedReads);
        SequenceProcessFramework::processSequencesSerial<SequenceWorkItem,
                                                         FMMergeResult, 
                                                         FMMergeProcess, 
                                                         FMMergePostProcess>(opt::readsFile, &processor, &postProcessor);
    }
    else
    {
        printf("[%s] starting parallel-mode read merging computation with %d threads\n", PROGRAM_IDENT, opt::numThreads);
        
        std::vector<FMMergeProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            FMMergeProcess* pProcessor = new FMMergeProcess(pOverlapper, opt::minOverlap, &markedReads);
            processorVector.push_back(pProcessor);
        }

        SequenceProcessFramework::processSequencesParallel<SequenceWorkItem,
                                                         FMMergeResult, 
                                                         FMMergeProcess, 
                                                         FMMergePostProcess>(opt::readsFile, processorVector, &postProcessor);
        
        for(size_t i = 0; i < processorVector.size(); ++i)
        {
            delete processorVector[i];
            processorVector[i] = NULL;
        }
    }

    // Check that every bit was set in the bit vector
    size_t numSet = 0;
    size_t numTotal = pBWT->getNumStrings();
    for(size_t i = 0; i < numTotal; ++i)
    {
        if(markedReads.test(i))
            ++numSet;
    }

    // Get the number of strings in the BWT, this is used to pre-allocated the read table
    delete pOverlapper;
    delete pBWT; 
    delete pRBWT;
    delete pWriter;

    // Cleanup
    delete pTimer;
    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}