Exemplo n.º 1
0
// Perform duplicate check
// Look up the intervals of the read and of its reverse complement in the BWT.
// A read with any non-$ extension to the left or right (on either strand) is
// reported as a substring of another read. Otherwise the lower of the two
// strands' lexicographic indices is taken as the canonical index of the
// sequence, and the bit at that index in the shared bit vector decides
// whether the read is unique or a full-length duplicate.
DuplicateCheckResult QCProcess::performDuplicateCheck(const SequenceWorkItem& workItem)
{
    assert(m_params.pSharedBV != NULL);

    std::string fwdSeq = workItem.read.seq.toString();
    std::string rcSeq = reverseComplement(fwdSeq);

    // Intervals for both strands of the read
    BWTIntervalPair fwdPair = BWTAlgorithms::findIntervalPair(m_params.pBWT, m_params.pRevBWT, fwdSeq);
    BWTIntervalPair rcPair = BWTAlgorithms::findIntervalPair(m_params.pBWT, m_params.pRevBWT, rcSeq);

    // Extension counts in the left (forward BWT) and right (reverse BWT)
    // direction for each strand
    AlphaCount64 fwdLeftExt = BWTAlgorithms::getExtCount(fwdPair.interval[0], m_params.pBWT);
    AlphaCount64 fwdRightExt = BWTAlgorithms::getExtCount(fwdPair.interval[1], m_params.pRevBWT);
    AlphaCount64 rcLeftExt = BWTAlgorithms::getExtCount(rcPair.interval[0], m_params.pBWT);
    AlphaCount64 rcRightExt = BWTAlgorithms::getExtCount(rcPair.interval[1], m_params.pRevBWT);

    // Any DNA extension means the read is contained in a longer read.
    // Substring reads are always removed, so the bit vector is not touched.
    bool isSubstring = fwdLeftExt.hasDNAChar() || fwdRightExt.hasDNAChar() ||
                       rcLeftExt.hasDNAChar() || rcRightExt.hasDNAChar();
    if(isSubstring)
        return DCR_SUBSTRING;

    // Extend both strands with '$' to obtain their lexicographic intervals
    BWTAlgorithms::updateBothL(fwdPair, '$', m_params.pBWT);
    BWTAlgorithms::updateBothL(rcPair, '$', m_params.pBWT);

    // A strand with an invalid interval contributes the largest possible
    // value so that it never wins the minimum below
    int64_t fwdIdx = std::numeric_limits<int64_t>::max();
    if(fwdPair.interval[0].isValid())
        fwdIdx = fwdPair.interval[0].lower;

    int64_t rcIdx = std::numeric_limits<int64_t>::max();
    if(rcPair.interval[0].isValid())
        rcIdx = rcPair.interval[0].lower;

    // Canonical index: the lowest of the two lexicographic indices
    int64_t canonicalIdx = (rcIdx < fwdIdx) ? rcIdx : fwdIdx;

    // Bit already set: the sequence was seen before
    if(m_params.pSharedBV->test(canonicalIdx))
        return DCR_FULL_LENGTH_DUPLICATE;

    // Bit appeared unset: try to flip it from false to true. If the
    // compare-and-swap succeeds this read claims the sequence.
    if(m_params.pSharedBV->updateCAS(canonicalIdx, false, true))
        return DCR_UNIQUE;

    // The update failed, so the bit was set by someone else between the
    // test and the update; treat this read as the duplicate
    return DCR_FULL_LENGTH_DUPLICATE;
}
Exemplo n.º 2
0
std::string get_valid_dbg_neighbors_coverage_and_ratio(const std::string& kmer,
                                                       const BWTIndexSet& index_set,
                                                       size_t min_coverage,
                                                       double min_ratio,
                                                       EdgeDir dir)
{
    std::string out;
    AlphaCount64 counts = 
        BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(kmer, 
                                                              index_set.pBWT,
                                                              dir,
                                                              index_set.pCache);
    
    if(!counts.hasDNAChar())
        return out; // no extensions

    char max_b = counts.getMaxDNABase();
    size_t max_c = counts.get(max_b);

    for(size_t j = 0; j < 4; ++j)
    {
        char b = "ACGT"[j];
        size_t c = counts.get(b);
        if(c >= min_coverage && (double)c / max_c >= min_ratio)
            out.push_back(b);
    }
    return out;    
}
Exemplo n.º 3
0
// Calculate the successors of this node in the implicit deBruijn graph.
//
// The last (k-1) bases of the node and their reverse complement are looked up
// in the index. Extension counts for the forward strand are read from the
// reverse BWT; extension counts for the reverse-complement strand are read
// from the forward BWT and complemented before the two are summed.
//
// Returns one single-character string for every DNA base whose combined
// count is greater than zero. The vector is empty when there is no extension.
StringVector StringThreader::getDeBruijnExtensions(StringThreaderNode* pNode)
{
    WARN_ONCE("TODO: Refactor StringThreader to use new deBruijn code");

    // Get the last k-1 bases of the node
    std::string pmer = pNode->getSuffix(m_kmer - 1);
    std::string rc_pmer = reverseComplement(pmer);

    // Get an interval for the p-mer and its reverse complement
    BWTIntervalPair ip = BWTAlgorithms::findIntervalPair(m_pBWT, m_pRevBWT, pmer);
    BWTIntervalPair rc_ip = BWTAlgorithms::findIntervalPair(m_pBWT, m_pRevBWT, rc_pmer);

    // Get the extension bases
    AlphaCount64 extensions;
    AlphaCount64 rc_extensions;
    if(ip.interval[1].isValid())
        extensions += BWTAlgorithms::getExtCount(ip.interval[1], m_pRevBWT);

    // Validate the same interval that is handed to getExtCount, so an
    // invalid interval is never queried
    if(rc_ip.interval[0].isValid())
        rc_extensions = BWTAlgorithms::getExtCount(rc_ip.interval[0], m_pBWT);

    // Counts from the reverse-complement strand are complemented before
    // being merged into the forward-strand counts
    rc_extensions.complement();
    extensions += rc_extensions;

    StringVector out;
    if(!extensions.hasDNAChar())
        return out; // no successor at all

    // Emit one candidate extension per DNA symbol that was observed
    for(int i = 0; i < DNA_ALPHABET::size; ++i)
    {
        char b = DNA_ALPHABET::getBase(i);
        if(extensions.get(b) > 0)
            out.push_back(std::string(1, b));
    }

    // The caller decides whether to extend or branch from the number of
    // entries returned
    return out;
}