Ejemplo n.º 1
0
//
// Main
//
int overlapMain(int argc, char** argv)
{
    parseOverlapOptions(argc, argv);

    // Prepare the output ASQG file
    assert(opt::outputType == OT_ASQG);

    // Open output file
    std::ostream* pASQGWriter = createWriter(opt::outFile);

    // Build and write the ASQG header
    ASQG::HeaderRecord headerRecord;
    headerRecord.setOverlapTag(opt::minOverlap);
    headerRecord.setErrorRateTag(opt::errorRate);
    headerRecord.setInputFileTag(opt::readsFile);
    headerRecord.setContainmentTag(true); // containments are always present
    headerRecord.setTransitiveTag(!opt::bIrreducibleOnly);
    headerRecord.write(*pASQGWriter);

    // Compute the overlap hits
    StringVector hitsFilenames;

    // Determine which index files to use. If a target file was provided,
    // use the index of the target reads
    std::string indexPrefix;
    if(!opt::prefix.empty())
      indexPrefix = opt::prefix;
    else
    {
      if(!opt::targetFile.empty())
        indexPrefix = stripFilename(opt::targetFile);
      else
        indexPrefix = stripFilename(opt::readsFile);
    }
    BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate);
    BWT* pRBWT = new BWT(indexPrefix + RBWT_EXT, opt::sampleRate);
    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT, 
                                                         opt::errorRate, opt::seedLength, 
                                                         opt::seedStride, opt::bIrreducibleOnly);

    pOverlapper->setExactModeOverlap(opt::errorRate <= 0.0001);
    pOverlapper->setExactModeIrreducible(opt::errorRate <= 0.0001);

    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Make a prefix for the temporary hits files
    std::string outPrefix;
    outPrefix = stripFilename(opt::readsFile);
    if(!opt::targetFile.empty())
    {
        outPrefix.append(1, '.');
        outPrefix.append(stripFilename(opt::targetFile));
    }

    if(opt::numThreads <= 1)
    {
        printf("[%s] starting serial-mode overlap computation\n", PROGRAM_IDENT);
        computeHitsSerial(outPrefix, opt::readsFile, pOverlapper, opt::minOverlap, hitsFilenames, pASQGWriter);
    }
    else
    {
        printf("[%s] starting parallel-mode overlap computation with %d threads\n", PROGRAM_IDENT, opt::numThreads);
        computeHitsParallel(opt::numThreads, outPrefix, opt::readsFile, pOverlapper, opt::minOverlap, hitsFilenames, pASQGWriter);
    }

    // Get the number of strings in the BWT, this is used to pre-allocated the read table
    delete pOverlapper;
    delete pBWT; 
    delete pRBWT;

    // Parse the hits files and write the overlaps to the ASQG file
    convertHitsToASQG(indexPrefix, hitsFilenames, pASQGWriter);

    // Cleanup
    delete pASQGWriter;
    delete pTimer;
    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
Ejemplo n.º 2
0
void cluster()
{
    BWT* pBWT = new BWT(opt::prefix + BWT_EXT);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT);
    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,opt::errorRate, opt::seedLength, opt::seedStride, true);

    pOverlapper->setExactModeOverlap(opt::errorRate < 0.001f);
    pOverlapper->setExactModeIrreducible(opt::errorRate < 0.001f);

    BitVector markedReads(pBWT->getNumStrings());

    std::string preclustersFile = opt::outFile + ".preclusters";
    std::ostream* pPreWriter = createWriter(preclustersFile);
    ClusterPostProcess postProcessor(pPreWriter, opt::minSize, &markedReads);
    
    // Set the cluster parameters
    ClusterParameters parameters;
    parameters.pOverlapper = pOverlapper;
    parameters.minOverlap = opt::minOverlap;
    parameters.maxClusterSize = opt::maxSize;
    parameters.maxIterations = opt::maxIterations;
    parameters.pMarkedReads = &markedReads;

    // Read the limit kmer sequences, if provided
    std::set<std::string>* pLimitKmers = NULL;

    if(!opt::limitFile.empty())
    {
        // Read in the limit sequences
        pLimitKmers = new std::set<std::string>;
        readLimitKmers(pLimitKmers);
        parameters.pLimitKmers = pLimitKmers;
        parameters.limitK = opt::limitKmer;
    }
    else
    {
        parameters.pLimitKmers = NULL;
        parameters.limitK = 0;
    }

    // Make pre-clusters from the reads
    if(opt::numThreads <= 1)
    {
        printf("[%s] starting serial-mode read clustering\n", PROGRAM_IDENT);
        ClusterProcess processor(parameters);
        
        // If the extend file is empty, build new clusters
        if(opt::extendFile.empty())
        {
            PROCESS_CLUSTER_SERIAL(opt::readsFile, &processor, &postProcessor);
        }
        else
        {
            // Process a set of preexisting clusters
            ClusterReader clusterReader(opt::extendFile);
            PROCESS_EXTEND_SERIAL(clusterReader, &processor, &postProcessor);
        }
    }
    else
    {
        printf("[%s] starting parallel-mode read clustering computation with %d threads\n", PROGRAM_IDENT, opt::numThreads);
        
        std::vector<ClusterProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            ClusterProcess* pProcessor = new ClusterProcess(parameters);
            processorVector.push_back(pProcessor);
        }
        
        if(opt::extendFile.empty())
        {
            PROCESS_CLUSTER_PARALLEL(opt::readsFile, processorVector, &postProcessor);
        }
        else
        {
            ClusterReader clusterReader(opt::extendFile);
            PROCESS_EXTEND_PARALLEL(clusterReader, processorVector, &postProcessor);
        }
        
        for(size_t i = 0; i < processorVector.size(); ++i)
        {
            delete processorVector[i];
            processorVector[i] = NULL;
        }
    }
    delete pPreWriter;
    delete pBWT;
    delete pRBWT;
    delete pOverlapper;

    // Deallocate limit kmers
    if(pLimitKmers != NULL)
        delete pLimitKmers;

    // Open the preclusters file and convert them to read names
    SuffixArray* pFwdSAI = new SuffixArray(opt::prefix + SAI_EXT);
    ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pFwdSAI->getNumStrings());

    size_t seedIdx = 0;
    std::istream* pPreReader = createReader(preclustersFile);
    std::ostream* pClusterWriter = createWriter(opt::outFile);
    std::string line;
    while(getline(*pPreReader,line))
    {
        std::stringstream parser(line);
        std::string clusterName;
        std::string readSequence;
        size_t clusterSize;
        int64_t lowIdx;
        int64_t highIdx;
        parser >> clusterName >> clusterSize >> readSequence >> lowIdx >> highIdx;

        if(lowIdx > highIdx)
        {
            // This is an extra read that is not present in the FM-index
            // Output a record with a fake read ID
            *pClusterWriter << clusterName << "\t" << clusterSize << "\tseed-" << seedIdx++ << "\t" << readSequence << "\n";
        }
        else
        {
            for(int64_t i = lowIdx; i <= highIdx; ++i)
            {
                const ReadInfo& targetInfo = pRIT->getReadInfo(pFwdSAI->get(i).getID());
                std::string readName = targetInfo.id;
                *pClusterWriter << clusterName << "\t" << clusterSize << "\t" << readName << "\t" << readSequence << "\n";
            }
        }
    }
    unlink(preclustersFile.c_str());

    delete pFwdSAI;
    delete pRIT;
    delete pPreReader;
    delete pClusterWriter;
}
Ejemplo n.º 3
0
//
// Main
//
int FMMergeMain(int argc, char** argv)
{
    parseFMMergeOptions(argc, argv);

    BWT* pBWT = new BWT(opt::prefix + BWT_EXT);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT);
    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,0.0f, 0,0,true); 
    pOverlapper->setExactModeOverlap(true);
    pOverlapper->setExactModeIrreducible(true);
    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Construct a bitvector indicating what reads have been used
    // All the processes read from this vector and only the post processor
    // writes to it.
    BitVector markedReads(pBWT->getNumStrings());

    std::ostream* pWriter = createWriter(opt::outFile);
    FMMergePostProcess postProcessor(pWriter, &markedReads);

    if(opt::numThreads <= 1)
    {
        printf("[%s] starting serial-mode read merging\n", PROGRAM_IDENT);
        FMMergeProcess processor(pOverlapper, opt::minOverlap, &markedReads);
        SequenceProcessFramework::processSequencesSerial<SequenceWorkItem,
                                                         FMMergeResult, 
                                                         FMMergeProcess, 
                                                         FMMergePostProcess>(opt::readsFile, &processor, &postProcessor);
    }
    else
    {
        printf("[%s] starting parallel-mode read merging computation with %d threads\n", PROGRAM_IDENT, opt::numThreads);
        
        std::vector<FMMergeProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            FMMergeProcess* pProcessor = new FMMergeProcess(pOverlapper, opt::minOverlap, &markedReads);
            processorVector.push_back(pProcessor);
        }

        SequenceProcessFramework::processSequencesParallel<SequenceWorkItem,
                                                         FMMergeResult, 
                                                         FMMergeProcess, 
                                                         FMMergePostProcess>(opt::readsFile, processorVector, &postProcessor);
        
        for(size_t i = 0; i < processorVector.size(); ++i)
        {
            delete processorVector[i];
            processorVector[i] = NULL;
        }
    }

    // Check that every bit was set in the bit vector
    size_t numSet = 0;
    size_t numTotal = pBWT->getNumStrings();
    for(size_t i = 0; i < numTotal; ++i)
    {
        if(markedReads.test(i))
            ++numSet;
    }

    // Get the number of strings in the BWT, this is used to pre-allocated the read table
    delete pOverlapper;
    delete pBWT; 
    delete pRBWT;
    delete pWriter;

    // Cleanup
    delete pTimer;
    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}