void suffixArray(int n, const T *str) { int m = mapCharToInt(++n, str); sais(n, m, s, t, p); for (int i = 0; i < n; i++) rk[sa[i]] = i; for (int i = 0, h = ht[0] = 0; i < n-1; i++) { int j = sa[rk[i]-1]; while (i+h < n && j+h < n && s[i+h] == s[j+h]) h++; if (ht[rk[i]] = h) h--; } }
// Construct an extended suffix array for some string of length n.s void constructESA(const char *s, int n, ESA *esa, ESA_FLAGS flags) { esa->t = s; esa->n = n; esa->flags = flags; // TODO: Change these to malloc's later. esa->SA = calloc( (n+2), sizeof(int) ); esa->LCP = calloc( (n+2), sizeof(int) ); //printf("Constructing SA/LCP\n"); // Construct the SA and LCP in linear time. sais((unsigned char*)s, esa->SA, esa->LCP, n); // This is needed for the child table values to be computed // correctly. esa->LCP[n] = 0; if (! (flags & NO_CHILD_TAB) ) { //printf("Constructing Child table\n"); // Child table, we attempt standard construction first, // then optimise it to occupy just one field. esa->up = calloc( (n+2), sizeof(int) ); esa->down = calloc( (n+2), sizeof(int) ); esa->accross = calloc( (n+2), sizeof(int) ); // Create the child table. constructChildValues( esa ); } // Construct the inverse suffix array. if (! (flags & NO_INV) ) { //printf("Constructing SAi\n"); esa->SAi = calloc( (n+2), sizeof(int) ); for (int i=0; i<n; i++) esa->SAi[esa->SA[i]] = i; } // Initialise the RMQ structure. if (! (flags & NO_RMQ) ) { //printf("initialising RMQ\n"); RMQ_succinct(esa->LCP, n); } //printf("Done ESA\n"); }
// Recursive suffix-array construction step over the integer string s[0..n).
//
// Parameters:
//   n - number of symbols in s.
//   m - not read directly in this body other than being replaced by ch+1 on
//       recursion; presumably the alphabet size consumed by inducedSort
//       (TODO confirm).
//   s - input symbols. s[n .. n+n1) is overwritten: it is reused as `s1`, the
//       reduced string for the next recursion level, so the buffer must extend
//       past n.
//   t - per-position type flags, written here. t[n-1] = 0; otherwise t[i] copies
//       t[i+1] when s[i] == s[i+1], else it is (s[i] > s[i+1]). The recursion
//       uses t+n as its own flag storage.
//   p - receives, in increasing order, every position i >= 1 with t[i-1] set and
//       t[i] clear; n1 counts them. The recursion uses p+n1.
//
// Also writes the file-level arrays `rk` (rk[i] = index of i within p, or -1 if
// i is not such a position) and `sa`.
//
// Steps:
//   1. Classify positions into t, collect the marked positions into p.
//   2. inducedSort(p).
//   3. Scan sa in order; each marked position gets a name `ch` stored in
//      s1[index-in-p]. A new name is issued when the substring between this
//      marked position and the next differs in length or in any (symbol, type)
//      pair from that of the previously scanned marked position `y`.
//      While ch < 1 a new name is always issued without comparing, so `y` is
//      never read before it has been assigned (it is set on each of the first
//      two hits).
//   4. If names are not all distinct (ch+1 < n1) recurse on s1; otherwise the
//      order follows directly from the names (sa[s1[i]] = i).
//   5. Map the sorted reduced indices back to text positions (s1[i] = p[sa[i]])
//      and run inducedSort(s1) to produce the final order in sa.
//
// NOTE(review): inducedSort is not defined in this block. If it is a macro it
// may refer to the local names n, m, s, t and n1 -- confirm before renaming
// anything here.
// NOTE(review): p[x+1] is evaluated in step 3; for x == n1-1 that is p[n1],
// which this function does not write. It appears to be avoided only because the
// first two hits short-circuit on `ch < 1` -- confirm the last marked position
// always sorts first.
void sais(int n, int m, int *s, int *t, int *p) { int n1 = t[n-1] = 0, ch = rk[0] = -1, *s1 = s+n; for (int i = n-2; ~i; i--) t[i] = s[i] == s[i+1] ? t[i+1] : s[i] > s[i+1]; for (int i = 1; i < n; i++) rk[i] = t[i-1] && !t[i] ? (p[n1] = i, n1++) : -1; inducedSort(p); for (int i = 0, x, y; i < n; i++) if (~(x = rk[sa[i]])) { if (ch < 1 || p[x+1] - p[x] != p[y+1] - p[y]) ch++; else for (int j = p[x], k = p[y]; j <= p[x+1]; j++, k++) if ((s[j]<<1|t[j]) != (s[k]<<1|t[k])) {ch++; break;} s1[y = x] = ch; } if (ch+1 < n1) sais(n1, ch+1, s1, t+n, p+n1); else for (int i = 0; i < n1; i++) sa[s1[i]] = i; for (int i = 0; i < n1; i++) s1[i] = p[sa[i]]; inducedSort(s1); }
void indexTranscriptsSA(ParserT* parser, std::string& outputDir, std::mutex& iomutex) { // Seed with a real random value, if available std::random_device rd; // Create a random uniform distribution std::default_random_engine eng(rd()); std::uniform_int_distribution<> dis(0, 3); uint32_t n{0}; uint32_t k = rapmap::utils::my_mer::k(); std::vector<std::string> transcriptNames; std::vector<uint32_t> transcriptStarts; //std::vector<uint32_t> positionIDs; constexpr char bases[] = {'A', 'C', 'G', 'T'}; uint32_t polyAClipLength{10}; uint32_t numPolyAsClipped{0}; std::string polyA(polyAClipLength, 'A'); using TranscriptList = std::vector<uint32_t>; using eager_iterator = MerMapT::array::eager_iterator; using KmerBinT = uint64_t; size_t numDistinctKmers{0}; size_t numKmers{0}; size_t currIndex{0}; std::cerr << "\n[Step 1 of 4] : counting k-mers\n"; //rsdic::RSDicBuilder rsdb; std::vector<uint32_t> onePos; // Positions in the bit array where we should write a '1' fmt::MemoryWriter txpSeqStream; { ScopedTimer timer; while(true) { typename ParserT::job j(*parser); if(j.is_empty()) break; for(size_t i = 0; i < j->nb_filled; ++i) { // For each sequence std::string& readStr = j->data[i].seq; readStr.erase(std::remove_if(readStr.begin(), readStr.end(), [](const char a) -> bool { return !(isprint(a)); }), readStr.end()); // Do Kallisto-esque clipping of polyA tails if (readStr.size() > polyAClipLength and readStr.substr(readStr.length() - polyAClipLength) == polyA) { auto newEndPos = readStr.find_last_not_of("Aa"); // If it was all As if (newEndPos == std::string::npos) { readStr.resize(0); } else { readStr.resize(newEndPos + 1); } ++numPolyAsClipped; } uint32_t readLen = readStr.size(); uint32_t txpIndex = n++; // The name of the current transcript transcriptNames.push_back(j->data[i].header); // The position at which this transcript starts transcriptStarts.push_back(currIndex); bool firstBase{true}; rapmap::utils::my_mer mer; mer.polyT(); for (size_t b = 0; b < readLen; ++b) { 
readStr[b] = ::toupper(readStr[b]); int c = jellyfish::mer_dna::code(readStr[b]); // Replace non-ACGT bases with pseudo-random bases if (jellyfish::mer_dna::not_dna(c)) { char rbase = bases[dis(eng)]; c = jellyfish::mer_dna::code(rbase); readStr[b] = rbase; } //positionIDs.push_back(txpIndex); //rsdb.PushBack(0); } txpSeqStream << readStr; txpSeqStream << '$'; //positionIDs.push_back(txpIndex); //rsdb.PushBack(1); currIndex += readLen + 1; onePos.push_back(currIndex - 1); } if (n % 10000 == 0) { std::cerr << "\r\rcounted k-mers for " << n << " transcripts"; } } } std::cerr << "\n"; std::cerr << "Clipped poly-A tails from " << numPolyAsClipped << " transcripts\n"; // Put the concatenated text in a string std::string concatText = txpSeqStream.str(); // And clear the stream txpSeqStream.clear(); // Make our dense bit arrray BIT_ARRAY* bitArray = bit_array_create(concatText.length()); for (auto p : onePos) { bit_array_set_bit(bitArray, p); } /** SANITY CHECKS RELATED TO THE RANK structure **/ /* uint64_t nextSetBit{0}; uint64_t offset{0}; auto numBits = bit_array_length(bitArray); while (offset < numBits and bit_array_find_next_set_bit(bitArray, offset, &nextSetBit)) { if (concatText[nextSetBit] != '$') { std::cerr << "Bit # " << nextSetBit << " is set to 1, but the " << "corresponding character in the text is " << concatText[nextSetBit] << "\n"; } offset = nextSetBit + 1; } if (bit_array_num_bits_set(bitArray) != onePos.size()) { std::cerr << "ERROR: Bit array has " << bit_array_num_bits_set(bitArray) << " bits set, but this should be " << onePos.size() << "!\n"; std::exit(1); } rank9b bitmap(bitArray->words, bitArray->num_of_bits); for (size_t i = 0; i < onePos.size() - 1; ++i) { auto pos = onePos[i]; auto r = bitmap.rank(pos+1); if (r != i+1) { std::cerr << "rank should be " << i+1 << " but it's " << r << "\n"; std::cerr << "text is " << concatText[pos] < "\n\n"; std::cerr << "bit vector says " << (bit_array_get_bit(bitArray, pos) ? 
'1' : '0') << "\n"; } } std::ofstream rsStream(outputDir + "rsdSafe.bin", std::ios::binary); { ScopedTimer timer; rsdic::RSDic rsd; rsdb.Build(rsd); rsd.Save(rsStream); std::cerr << "done\n"; } rsStream.close(); */ /** END OF SANITY CHECK **/ onePos.clear(); onePos.shrink_to_fit(); std::string rsFileName = outputDir + "rsd.bin"; FILE* rsFile = fopen(rsFileName.c_str(), "w"); { ScopedTimer timer; std::cerr << "Building rank-select dictionary and saving to disk "; bit_array_save(bitArray, rsFile); std::cerr << "done\n"; } fclose(rsFile); bit_array_free(bitArray); std::ofstream seqStream(outputDir + "txpInfo.bin", std::ios::binary); { ScopedTimer timer; std::cerr << "Writing sequence data to file . . . "; cereal::BinaryOutputArchive seqArchive(seqStream); seqArchive(transcriptNames); seqArchive(transcriptStarts); //seqArchive(positionIDs); seqArchive(concatText); std::cerr << "done\n"; } seqStream.close(); // clear stuff we no longer need //positionIDs.clear(); //positionIDs.shrink_to_fit(); transcriptStarts.clear(); transcriptStarts.shrink_to_fit(); transcriptNames.clear(); transcriptNames.shrink_to_fit(); // done clearing // Build the suffix array size_t tlen = concatText.length(); std::vector<int> SA(tlen, 0); std::ofstream saStream(outputDir + "sa.bin", std::ios::binary); { ScopedTimer timer; std::cerr << "Building suffix array . . . "; auto ret = sais(reinterpret_cast<unsigned char*>( const_cast<char*>(concatText.c_str())), SA.data(), tlen + 1); if (ret == 0) { std::cerr << "success\n"; { ScopedTimer timer2; std::cerr << "saving to disk . . . 
"; cereal::BinaryOutputArchive saArchive(saStream); saArchive(SA); // don't actually need the LCP right now // saArchive(LCP); std::cerr << "done\n"; } } else { std::cerr << "FAILURE: return code from sais() was " << ret << "\n"; std::exit(1); } std::cerr << "done\n"; } saStream.close(); // clear things we don't need //LCP.clear(); // LCP.shrink_to_fit(); // done clearing // Now, build the k-mer lookup table /* std::unordered_map<uint64_t, rapmap::utils::SAInterval, rapmap::utils::KmerKeyHasher> khash; */ google::dense_hash_map<uint64_t, rapmap::utils::SAInterval, rapmap::utils::KmerKeyHasher> khash; khash.set_empty_key(std::numeric_limits<uint64_t>::max()); /* concatText.erase(std::remove_if(concatText.begin(), concatText.end(), [] (const char a) -> bool { return !isprint(a); }), concatText.end()); */ // The start and stop of the current interval uint32_t start = 0, stop = 0; // An iterator to the beginning of the text auto textB = concatText.begin(); auto textE = concatText.end(); // The current k-mer as a string rapmap::utils::my_mer mer; bool currentValid{false}; std::string currentKmer; std::string nextKmer; while (stop < tlen) { // Check if the string starting at the // current position is valid (i.e. 
doesn't contain $) // and is <= k bases from the end of the string nextKmer = concatText.substr(SA[stop], k); if (nextKmer.length() == k and nextKmer.find_first_of('$') == std::string::npos) { // If this is a new k-mer, then hash the current k-mer if (nextKmer != currentKmer) { if (currentKmer.length() == k and currentKmer.find_first_of('$') == std::string::npos) { mer = rapmap::utils::my_mer(currentKmer); auto bits = mer.get_bits(0, 2*k); auto hashIt = khash.find(bits); if (hashIt == khash.end()) { if (start > 1) { if (concatText.substr(SA[start-1], k) == concatText.substr(SA[start], k)) { std::cerr << "T[SA[" << start-1 << "]:" << k << "] = " << concatText.substr(SA[start-1], k) << " = T[SA[" << start << "]:" << k << "]\n"; std::cerr << "start = " << start << ", stop = " << stop << "\n"; std::cerr << "(1) THIS SHOULD NOT HAPPEN\n"; std::exit(1); } } if (start == stop) { std::cerr << "AHH (1) : Interval is empty! (start = " << start << ") = (stop = " << stop << ")\n"; } if (start == stop) { std::cerr << "AHH (2) : Interval is empty! (start = " << start << ") = (stop = " << stop << ")\n"; } khash[bits] = {start, stop}; } else { std::cerr << "\nERROR (1): trying to add same suffix " << currentKmer << " (len = " << currentKmer.length() << ") multiple times!\n"; auto prevInt = hashIt->second; std::cerr << "existing interval is [" << prevInt.begin << ", " << prevInt.end << ")\n"; for (auto x = prevInt.begin; x < prevInt.end; ++x) { auto suff = concatText.substr(SA[x], k); for (auto c : suff) { std::cerr << "*" << c << "*"; } std::cerr << " (len = " << suff.length() <<")\n"; } std::cerr << "new interval is [" << start << ", " << stop << ")\n"; for (auto x = start; x < stop; ++x) { auto suff = concatText.substr(SA[x], k); for (auto c : suff) { std::cerr << "*" << c << "*"; } std::cerr << "\n"; } } } currentKmer = nextKmer; start = stop; } } else { // If this isn't a valid suffix (contains a $) // If the previous interval was valid, put it // in the hash. 
if (currentKmer.length() == k and currentKmer.find_first_of('$') == std::string::npos) { mer = rapmap::utils::my_mer(currentKmer); auto bits = mer.get_bits(0, 2*k); auto hashIt = khash.find(bits); if (hashIt == khash.end()) { if (start > 2) { if (concatText.substr(SA[start-1], k) == concatText.substr(SA[start], k)) { std::cerr << "T[SA[" << start-1 << "]:" << k << "] = " << concatText.substr(SA[start-1], k) << " = T[SA[" << start << "]:" << k << "]\n"; std::cerr << "start = " << start << ", stop = " << stop << "\n"; std::cerr << "(2) THIS SHOULD NOT HAPPEN\n"; std::exit(1); } } khash[bits] = {start, stop}; } else { std::cerr << "\nERROR (2): trying to add same suffix " << currentKmer << "multiple times!\n"; auto prevInt = hashIt->second; std::cerr << "existing interval is [" << prevInt.begin << ", " << prevInt.end << ")\n"; for (auto x = prevInt.begin; x < prevInt.end; ++x) { std::cerr << concatText.substr(SA[x], k) << "\n"; } std::cerr << "new interval is [" << start << ", " << stop << ")\n"; for (auto x = start; x < stop; ++x) { std::cerr << concatText.substr(SA[x], k) << "\n"; } } } // The current interval is invalid and empty currentKmer = nextKmer; start = stop; } if (stop % 1000000 == 0) { std::cerr << "\r\rprocessed " << stop << " positions"; } // We always update the end position ++stop; } if (start < tlen) { if (currentKmer.length() == k and currentKmer.find_first_of('$') != std::string::npos) { mer = rapmap::utils::my_mer(currentKmer); khash[mer.get_bits(0, 2*k)] = {start, stop}; } } std::cerr << "\nkhash had " << khash.size() << " keys\n"; std::ofstream hashStream(outputDir + "hash.bin", std::ios::binary); { ScopedTimer timer; std::cerr << "saving hash to disk . . . 
"; cereal::BinaryOutputArchive hashArchive(hashStream); hashArchive(k); khash.serialize(google::dense_hash_map<uint64_t, rapmap::utils::SAInterval, rapmap::utils::KmerKeyHasher>::NopointerSerializer(), &hashStream); //hashArchive(khash); std::cerr << "done\n"; } hashStream.close(); std::string indexVersion = "q0"; IndexHeader header(IndexType::QUASI, indexVersion, true, k); // Finally (since everything presumably succeeded) write the header std::ofstream headerStream(outputDir + "header.json"); { cereal::JSONOutputArchive archive(headerStream); archive(header); } headerStream.close(); }