示例#1
0
void suffixArray(int n, const T *str) {
    int m = mapCharToInt(++n, str);
    sais(n, m, s, t, p);
    for (int i = 0; i < n; i++) rk[sa[i]] = i;
    for (int i = 0, h = ht[0] = 0; i < n-1; i++) {
        int j = sa[rk[i]-1];
        while (i+h < n && j+h < n && s[i+h] == s[j+h]) h++;
        if (ht[rk[i]] = h) h--;
    }
}
示例#2
0
// Construct an extended suffix array for some string of length n.s
void constructESA(const char *s, int n, ESA *esa, ESA_FLAGS flags)
{
   esa->t     = s;
   esa->n     = n;
   esa->flags = flags;
         
   // TODO: Change these to malloc's later.
   esa->SA  = calloc( (n+2), sizeof(int) );
   esa->LCP = calloc( (n+2), sizeof(int) );
   
   
   //printf("Constructing SA/LCP\n");
   // Construct the SA and LCP in linear time.
   sais((unsigned char*)s, esa->SA, esa->LCP, n);

   // This is needed for the child table values to be computed
   // correctly.
   esa->LCP[n] = 0;
   
   if (! (flags & NO_CHILD_TAB) )   
   {
      //printf("Constructing Child table\n");
      // Child table, we attempt standard construction first,
      // then optimise it to occupy just one field.
      esa->up      = calloc( (n+2), sizeof(int) );
      esa->down    = calloc( (n+2), sizeof(int) );
      esa->accross = calloc( (n+2), sizeof(int) ); 

      // Create the child table.
      constructChildValues( esa );
   }
   
   // Construct the inverse suffix array.
   if (! (flags & NO_INV)  )
   {
      //printf("Constructing SAi\n");
      esa->SAi = calloc( (n+2), sizeof(int) );
   
      for (int i=0; i<n; i++)
         esa->SAi[esa->SA[i]] = i;
   }
         
   // Initialise the RMQ structure.           
   if (! (flags & NO_RMQ) )
   {
      //printf("initialising RMQ\n");
      RMQ_succinct(esa->LCP, n);  
   }
   
   //printf("Done ESA\n");
}
示例#3
0
void sais(int n, int m, int *s, int *t, int *p) {
    int n1 = t[n-1] = 0, ch = rk[0] = -1, *s1 = s+n;
    for (int i = n-2; ~i; i--) t[i] = s[i] == s[i+1] ? t[i+1] : s[i] > s[i+1];
    for (int i = 1; i < n; i++) rk[i] = t[i-1] && !t[i] ? (p[n1] = i, n1++) : -1;
    inducedSort(p);
    for (int i = 0, x, y; i < n; i++) if (~(x = rk[sa[i]])) {
        if (ch < 1 || p[x+1] - p[x] != p[y+1] - p[y]) ch++;
        else for (int j = p[x], k = p[y]; j <= p[x+1]; j++, k++)
            if ((s[j]<<1|t[j]) != (s[k]<<1|t[k])) {ch++; break;}
        s1[y = x] = ch;
    }
    if (ch+1 < n1) sais(n1, ch+1, s1, t+n, p+n1);
    else for (int i = 0; i < n1; i++) sa[s1[i]] = i;
    for (int i = 0; i < n1; i++) s1[i] = p[sa[i]];
    inducedSort(s1);
}
示例#4
0
void indexTranscriptsSA(ParserT* parser,
            		std::string& outputDir,
                        std::mutex& iomutex) {
    // Seed with a real random value, if available
    std::random_device rd;

    // Create a random uniform distribution
    std::default_random_engine eng(rd());

    std::uniform_int_distribution<> dis(0, 3);

    uint32_t n{0};
    uint32_t k = rapmap::utils::my_mer::k();
    std::vector<std::string> transcriptNames;
    std::vector<uint32_t> transcriptStarts;
    //std::vector<uint32_t> positionIDs;
    constexpr char bases[] = {'A', 'C', 'G', 'T'};
    uint32_t polyAClipLength{10};
    uint32_t numPolyAsClipped{0};
    std::string polyA(polyAClipLength, 'A');

    using TranscriptList = std::vector<uint32_t>;
    using eager_iterator = MerMapT::array::eager_iterator;
    using KmerBinT = uint64_t;

    size_t numDistinctKmers{0};
    size_t numKmers{0};
    size_t currIndex{0};
    std::cerr << "\n[Step 1 of 4] : counting k-mers\n";

    //rsdic::RSDicBuilder rsdb;
    std::vector<uint32_t> onePos; // Positions in the bit array where we should write a '1'
    fmt::MemoryWriter txpSeqStream;
    {
        ScopedTimer timer;
        while(true) {
            typename ParserT::job j(*parser);
            if(j.is_empty()) break;
            for(size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
                std::string& readStr = j->data[i].seq;
                readStr.erase(std::remove_if(readStr.begin(), readStr.end(),
                               [](const char a) -> bool {
                                    return !(isprint(a));
                                }), readStr.end());
                // Do Kallisto-esque clipping of polyA tails
                if (readStr.size() > polyAClipLength and
                        readStr.substr(readStr.length() - polyAClipLength) == polyA) {

                    auto newEndPos = readStr.find_last_not_of("Aa");
                    // If it was all As
                    if (newEndPos == std::string::npos) {
                        readStr.resize(0);
                    } else {
                        readStr.resize(newEndPos + 1);
                    }
                    ++numPolyAsClipped;
                }

                uint32_t readLen  = readStr.size();
                uint32_t txpIndex = n++;

		// The name of the current transcript
                transcriptNames.push_back(j->data[i].header);
		// The position at which this transcript starts
                transcriptStarts.push_back(currIndex);


                bool firstBase{true};
                rapmap::utils::my_mer mer;
                mer.polyT();
                for (size_t b = 0; b < readLen; ++b) {
                    readStr[b] = ::toupper(readStr[b]);
                    int c = jellyfish::mer_dna::code(readStr[b]);
                    // Replace non-ACGT bases with pseudo-random bases
                    if (jellyfish::mer_dna::not_dna(c)) {
                        char rbase = bases[dis(eng)];
                        c = jellyfish::mer_dna::code(rbase);
                        readStr[b] = rbase;
                    }
                    //positionIDs.push_back(txpIndex);
		    //rsdb.PushBack(0);
                }
                txpSeqStream << readStr;
                txpSeqStream << '$';
                //positionIDs.push_back(txpIndex);
		//rsdb.PushBack(1);
                currIndex += readLen + 1;
		onePos.push_back(currIndex - 1);
            }
            if (n % 10000 == 0) {
                std::cerr << "\r\rcounted k-mers for " << n << " transcripts";
            }
        }
    }
    std::cerr << "\n";

    std::cerr << "Clipped poly-A tails from " << numPolyAsClipped << " transcripts\n";

    // Put the concatenated text in a string
    std::string concatText = txpSeqStream.str();
    // And clear the stream
    txpSeqStream.clear();


    // Make our dense bit arrray
    BIT_ARRAY* bitArray = bit_array_create(concatText.length());
    for (auto p : onePos) {
	    bit_array_set_bit(bitArray, p);
    }

    /** SANITY CHECKS RELATED TO THE RANK structure **/
    /*
    uint64_t nextSetBit{0};
    uint64_t offset{0};
    auto numBits = bit_array_length(bitArray);
    while (offset < numBits and bit_array_find_next_set_bit(bitArray, offset, &nextSetBit)) {
	if (concatText[nextSetBit] != '$') {
		std::cerr << "Bit # " << nextSetBit << " is set to 1, but the "
			  << "corresponding character in the text is " << concatText[nextSetBit] << "\n";
	}
	offset = nextSetBit + 1;
    }

    if (bit_array_num_bits_set(bitArray) != onePos.size()) {
	std::cerr << "ERROR: Bit array has " << bit_array_num_bits_set(bitArray)
		  << " bits set, but this should be " << onePos.size() << "!\n";
	std::exit(1);
    }

    rank9b bitmap(bitArray->words, bitArray->num_of_bits);
    for (size_t i = 0; i < onePos.size() - 1; ++i) {
	    auto pos = onePos[i];
	    auto r = bitmap.rank(pos+1);

	    if (r != i+1) {
		std::cerr << "rank should be " << i+1 << " but it's " << r << "\n";
		std::cerr << "text is " << concatText[pos] < "\n\n";
		std::cerr << "bit vector says " << (bit_array_get_bit(bitArray, pos) ? '1' : '0') << "\n";
	    }
    }

    std::ofstream rsStream(outputDir + "rsdSafe.bin", std::ios::binary);
    {
        ScopedTimer timer;
        rsdic::RSDic rsd;
        rsdb.Build(rsd);
        rsd.Save(rsStream);
        std::cerr << "done\n";
    }
    rsStream.close();
    */
    /** END OF SANITY CHECK **/
    onePos.clear();
    onePos.shrink_to_fit();

    std::string rsFileName = outputDir + "rsd.bin";
    FILE* rsFile = fopen(rsFileName.c_str(), "w");
    {
        ScopedTimer timer;
        std::cerr << "Building rank-select dictionary and saving to disk ";
	bit_array_save(bitArray, rsFile);
        std::cerr << "done\n";
    }
    fclose(rsFile);
    bit_array_free(bitArray);


    std::ofstream seqStream(outputDir + "txpInfo.bin", std::ios::binary);
    {
        ScopedTimer timer;
        std::cerr << "Writing sequence data to file . . . ";
        cereal::BinaryOutputArchive seqArchive(seqStream);
        seqArchive(transcriptNames);
        seqArchive(transcriptStarts);
        //seqArchive(positionIDs);
        seqArchive(concatText);
        std::cerr << "done\n";
    }
    seqStream.close();

    // clear stuff we no longer need
    //positionIDs.clear();
    //positionIDs.shrink_to_fit();
    transcriptStarts.clear();
    transcriptStarts.shrink_to_fit();
    transcriptNames.clear();
    transcriptNames.shrink_to_fit();
    // done clearing


    // Build the suffix array
    size_t tlen = concatText.length();
    std::vector<int> SA(tlen, 0);
    std::ofstream saStream(outputDir + "sa.bin", std::ios::binary);
    {
        ScopedTimer timer;
        std::cerr << "Building suffix array . . . ";
        auto ret = sais(reinterpret_cast<unsigned char*>(
                        const_cast<char*>(concatText.c_str())),
                        SA.data(), tlen + 1);
        if (ret == 0) {
            std::cerr << "success\n";
            {
                ScopedTimer timer2;
                std::cerr << "saving to disk . . . ";
                cereal::BinaryOutputArchive saArchive(saStream);
                saArchive(SA);
		// don't actually need the LCP right now
                // saArchive(LCP);
                std::cerr << "done\n";
            }
        } else {
            std::cerr << "FAILURE: return code from sais() was " << ret << "\n";
	    std::exit(1);
        }
        std::cerr << "done\n";
    }
    saStream.close();

    // clear things we don't need
    //LCP.clear();
    // LCP.shrink_to_fit();
    // done clearing

    // Now, build the k-mer lookup table
    /*
    std::unordered_map<uint64_t,
                       rapmap::utils::SAInterval,
                       rapmap::utils::KmerKeyHasher> khash;
                       */
    google::dense_hash_map<uint64_t,
                        rapmap::utils::SAInterval,
                        rapmap::utils::KmerKeyHasher> khash;
    khash.set_empty_key(std::numeric_limits<uint64_t>::max());

    /*
    concatText.erase(std::remove_if(concatText.begin(),
                                    concatText.end(),
                                    [] (const char a) -> bool { return !isprint(a); }),
                     concatText.end());
                     */

    // The start and stop of the current interval
    uint32_t start = 0, stop = 0;
    // An iterator to the beginning of the text
    auto textB = concatText.begin();
    auto textE = concatText.end();
    // The current k-mer as a string
    rapmap::utils::my_mer mer;
    bool currentValid{false};
    std::string currentKmer;
    std::string nextKmer;
    while (stop < tlen) {
        // Check if the string starting at the
        // current position is valid (i.e. doesn't contain $)
        // and is <= k bases from the end of the string
        nextKmer = concatText.substr(SA[stop], k);
        if (nextKmer.length() == k and
            nextKmer.find_first_of('$') == std::string::npos) {
            // If this is a new k-mer, then hash the current k-mer
            if (nextKmer != currentKmer) {
                if (currentKmer.length() == k and
                    currentKmer.find_first_of('$') == std::string::npos) {
                    mer = rapmap::utils::my_mer(currentKmer);
                    auto bits = mer.get_bits(0, 2*k);
                    auto hashIt = khash.find(bits);
                    if (hashIt == khash.end()) {
                        if (start > 1) {
                            if (concatText.substr(SA[start-1], k) ==
                                concatText.substr(SA[start], k)) {
                                std::cerr << "T[SA["
                                          << start-1 << "]:" << k << "] = "
                                          << concatText.substr(SA[start-1], k)
                                          << " = T[SA[" << start << "]:" << k << "]\n";
                                std::cerr << "start = " << start << ", stop = " << stop << "\n";
                                std::cerr << "(1) THIS SHOULD NOT HAPPEN\n";
                                std::exit(1);
                            }
                        }
                        if (start == stop) {
                            std::cerr << "AHH (1) : Interval is empty! (start = " << start
                                      << ") = (stop =  " << stop << ")\n";
                        }
                        if (start == stop) {
                            std::cerr << "AHH (2) : Interval is empty! (start = " << start
                                << ") = (stop =  " << stop << ")\n";
                        }

                        khash[bits] = {start, stop};
                    } else {
                        std::cerr << "\nERROR (1): trying to add same suffix "
                                  << currentKmer << " (len = "
                                  << currentKmer.length() << ") multiple times!\n";
                        auto prevInt = hashIt->second;
                        std::cerr << "existing interval is ["
                                  << prevInt.begin << ", " << prevInt.end << ")\n";
                        for (auto x = prevInt.begin; x < prevInt.end; ++x) {
                            auto suff = concatText.substr(SA[x], k);
                            for (auto c : suff) {
                                std::cerr << "*" << c << "*";
                            }
                            std::cerr << " (len = " << suff.length() <<")\n";
                        }
                        std::cerr << "new interval is ["
                                  << start << ", " << stop << ")\n";
                        for (auto x = start; x < stop; ++x) {
                            auto suff = concatText.substr(SA[x], k);
                            for (auto c : suff) {
                                std::cerr << "*" << c << "*";
                            }
                            std::cerr << "\n";
                        }
                    }
                }
                currentKmer = nextKmer;
                start = stop;
            }
        } else {
            // If this isn't a valid suffix (contains a $)

            // If the previous interval was valid, put it
            // in the hash.
            if (currentKmer.length() == k and
                currentKmer.find_first_of('$') == std::string::npos) {
                mer = rapmap::utils::my_mer(currentKmer);
                auto bits = mer.get_bits(0, 2*k);
                auto hashIt = khash.find(bits);
                if (hashIt == khash.end()) {
                    if (start > 2) {
                        if (concatText.substr(SA[start-1], k) ==
                            concatText.substr(SA[start], k)) {
                            std::cerr << "T[SA["
                                << start-1 << "]:" << k << "] = "
                                << concatText.substr(SA[start-1], k)
                                << " = T[SA[" << start << "]:" << k << "]\n";
                            std::cerr << "start = " << start << ", stop = " << stop << "\n";
                            std::cerr << "(2) THIS SHOULD NOT HAPPEN\n";
                            std::exit(1);
                        }
                    }
                    khash[bits] = {start, stop};
                } else {
                    std::cerr << "\nERROR (2): trying to add same suffix "
                              << currentKmer << "multiple times!\n";
                    auto prevInt = hashIt->second;
                    std::cerr << "existing interval is ["
                        << prevInt.begin << ", " << prevInt.end << ")\n";
                    for (auto x = prevInt.begin; x < prevInt.end; ++x) {
                        std::cerr << concatText.substr(SA[x], k) << "\n";
                    }
                    std::cerr << "new interval is ["
                        << start << ", " << stop << ")\n";
                    for (auto x = start; x < stop; ++x) {
                        std::cerr << concatText.substr(SA[x], k) << "\n";
                    }
                }

            }
            // The current interval is invalid and empty
            currentKmer = nextKmer;
            start = stop;
        }
        if (stop % 1000000 == 0) {
            std::cerr << "\r\rprocessed " << stop << " positions";
        }
        // We always update the end position
        ++stop;
    }
    if (start < tlen) {
        if (currentKmer.length() == k and
            currentKmer.find_first_of('$') != std::string::npos) {
            mer = rapmap::utils::my_mer(currentKmer);
            khash[mer.get_bits(0, 2*k)] = {start, stop};
        }
    }
    std::cerr << "\nkhash had " << khash.size() << " keys\n";
    std::ofstream hashStream(outputDir + "hash.bin", std::ios::binary);
    {
        ScopedTimer timer;
        std::cerr << "saving hash to disk . . . ";
        cereal::BinaryOutputArchive hashArchive(hashStream);
        hashArchive(k);
        khash.serialize(google::dense_hash_map<uint64_t,
                        rapmap::utils::SAInterval,
                        rapmap::utils::KmerKeyHasher>::NopointerSerializer(), &hashStream);
        //hashArchive(khash);
        std::cerr << "done\n";
    }
    hashStream.close();


    std::string indexVersion = "q0";
    IndexHeader header(IndexType::QUASI, indexVersion, true, k);
    // Finally (since everything presumably succeeded) write the header
    std::ofstream headerStream(outputDir + "header.json");
    {
	cereal::JSONOutputArchive archive(headerStream);
	archive(header);
    }
    headerStream.close();

}