void base_case() { //sort suffixes auto sa = _wm; const unsigned char* t = (const unsigned char*) _old_block.c_str(); divsufsort64(t, sa, this->_current_block_size); //obtain ISA auto isa = &_wm[_current_block_size]; for (int64_t i = 0; i < _current_block_size; i++) { isa[sa[i]] = i; } //build psi auto psi_values = &_wm[_current_block_size * 2]; for (int64_t i = 0; i < _current_block_size; i++) { psi_values[i] = isa[(sa[i] + 1) % _current_block_size]; } std::vector<int64_t> _new_symbol_counter(_alphabet_size + 1, 0); for (uint64_t i = 0; i < _old_block.size(); i++) { _new_symbol_counter[_old_block[i]]++; } int64_t temp = _new_symbol_counter[0]; _new_symbol_counter[0] = 0; for (int64_t i = 1; i <= _alphabet_size; i++) { int64_t aux = _new_symbol_counter[i]; _new_symbol_counter[i] = _new_symbol_counter[i - 1] + temp; temp = aux; } _symbol_counter = std::move(_new_symbol_counter); //compress psi_information for next iterations _old_psi.configure(); for (int64_t i = 0; i < _current_block_size; i++) { _old_psi.set(i, psi_values[i]); } _old_psi.post_configure(); }
void calculate_sa(const unsigned char* c, typename int_vector<fixedIntWidth>::size_type len, int_vector<fixedIntWidth>& sa) { typedef typename int_vector<fixedIntWidth>::size_type size_type; if (len <= 1) { // handle special case sa = int_vector<fixedIntWidth>(len,0); return; } bool small_file = (sizeof(len) <= 4 or len < 0x7FFFFFFFULL); if (small_file) { uint8_t oldIntWidth = sa.width(); if (32 == fixedIntWidth or (0==fixedIntWidth and 32 >= oldIntWidth)) { sa.width(32); sa.resize(len); divsufsort(c, (int32_t*)sa.m_data, len); // copy integers back to the right positions if (oldIntWidth!=32) { for (size_type i=0; i<len; ++i) { sa.set_int(i*oldIntWidth, sa.get_int(i<<5, 32), oldIntWidth); } sa.width(oldIntWidth); sa.resize(len); } } else { if (sa.width() < bits::hi(len)+1) { throw std::logic_error("width of int_vector is to small for the text!!!"); } int_vector<> sufarray(len,0,32); divsufsort(c, (int32_t*)sufarray.m_data, len); for (size_type i=0; i<len; ++i) { sa[i] = sufarray[i]; } } } else { uint8_t oldIntWidth = sa.width(); sa.width(64); sa.resize(len); divsufsort64(c, (int64_t*)sa.m_data, len); // copy integers back to the right positions if (oldIntWidth!=64) { for (size_type i=0; i<len; ++i) { sa.set_int(i*oldIntWidth, sa.get_int(i<<6, 64), oldIntWidth); } sa.width(oldIntWidth); sa.resize(len); } } }
unsigned int DegeneratePatternMatch::prepareForRMQ(){ /* Prepare SP$ */ const INT len = (sLen + pLen) + 1; unsigned char* text = new unsigned char[len]; std::copy(sequence,sequence+sLen ,text ); std::copy(pattern,pattern+pLen ,text+sLen ); text[len-1] = DELIM; /* Compute Suffix Array */ INT* sa = new INT[len]; if(sa == NULL){ fprintf (stderr,"Cannot allocate memory for SA."); return (0); } #ifdef _USE_64 if(divsufsort64(text, sa, len) != 0){ fprintf (stderr, "SA computation failed"); return (0); } #endif #ifdef _USE_32 if(divsufsort(text, sa, len) != 0){ fprintf (stderr,"SA computation failed"); return (0); } #endif /* Compute Rank array */ rank = new INT[len]; if(rank == NULL){ fprintf (stderr,"Cannot allocate memory for Rank Array"); return (0); } for(INT i = 0; i < len; i++ ){ rank [sa[i]] = i; } /* Compute LCP array */ lcp = new INT[len]; if(lcp == NULL){ fprintf (stderr,"Cannot allocate memory for LCP Array"); return (0); } if(constructLCPArray(text, len, sa) != 1){ fprintf (stderr,"LCP computation failed"); return (0); } // Delete SA and text as they are not needed now delete[] sa; delete[] text; /* Prepare LCP array for RMQ */ // create a vector of length len and initialize it with 0s sdsl::int_vector<> v(len , 0 ); for(INT i = 0; i < len; i++){ v[i] = lcp[i]; } rmq = new sdsl::rmq_succinct_sct<>(&v); // v is not required now sdsl::util::clear(v); //cout << "prepared for rmq" << endl; }
/* This creates a sparse suffix array. * S - a string of characters * nS - the size of S * M - the size of memory available for S (N < M) * K - the sparseness of the suffix array * It returns a sparse suffix array if successful, NULL otherwise. */ alder_ssa_core_t * alder_ssa_core_init_withS(char *S, const int64_t nS, const int64_t M, const int64_t K) { int s = 0; int64_t N = 0; if (K == 0 || K == 1 || K > sMaxK) return NULL; s = alder_ssa_khan09_appendDollarSign(S, nS, M, K, &N); if (s != 0) return NULL; alder_ssa_core_t *sparseSA = malloc(sizeof(alder_ssa_core_t)); int64_t *intSA = malloc((N/K+1)*sizeof(int64_t)); uint8_t *t_new = malloc((N/K+1)*sizeof(uint8_t)); int64_t *BucketBegin = malloc(BUCKETBEGINSIZE * sizeof(int64_t)); int64_t *SA = malloc((N/K)*sizeof(int64_t)); int64_t *ISA = malloc((N/K)*sizeof(int64_t)); int64_t *LCP = malloc((N/K)*sizeof(int64_t)); if (sparseSA == NULL || intSA == NULL || t_new == NULL || BucketBegin == NULL || SA == NULL || ISA == NULL || LCP == NULL) { XFREE(sparseSA); XFREE(intSA); XFREE(t_new); XFREE(BucketBegin); XFREE(SA); XFREE(ISA); XFREE(LCP); return NULL; } /////////////////////////////////////////////////////////////////////////// // Build SA /////////////////////////////////////////////////////////////////////////// sparseSA->S = S; sparseSA->K = K; sparseSA->N = N; sparseSA->logN = (int64_t)ceil(log2((double)(N/K))); sparseSA->NKm1 = N/K-1; assert(N%K==0); uint16_t bucketNr = 1; for(int64_t i = 0; i < N/K+1; i++) intSA[i] = i; alder_ssa_khan09_radixStep(S, K, BUCKETBEGINSIZE, t_new, intSA, &bucketNr, BucketBegin, 0, N/K-1, 0); assert(intSA[N/K] == N/K); t_new[N/K] = 0; // Terminate new integer string. s = (saint_t) divsufsort64(t_new, intSA, N/K+1); if (s != 0) { // Error! } //////////////////////////////////////////////// // Translate suffix array: set sparseSA->SA. for (int64_t i = 0; i < N/K; i++) SA[i] = intSA[i+1] * K; /////////////////////////////////////////////////////////////////////////// // Build ISA using sparse SA. for (int64_t i = 0; i < N/K; i++) ISA[SA[i]/K] = i; /////////////////////////////////////////////////////////////////////////// // Initialize LCP : SA + ISA -> LCP alder_ssa_khan09_computeLCP(S, SA, ISA, LCP, N, K); sparseSA->SA = SA; sparseSA->ISA = ISA; sparseSA->LCP = LCP; XFREE(intSA); XFREE(t_new); XFREE(BucketBegin); return sparseSA; }
/** * This function builds a suffix array using files with sequences. * Argument: * ref - file names. * refType - 0 for fasta, 1 for fastq. * K - more spaces for dollar signs in a sparse suffix array. * Return: * a sparse suffix array if successful, and NULL otherwise. */ alder_sparse_sa_t *alder_sparse_sa_alloc_file (const struct bstrList *ref, const int refType, const int64_t K) { assert(K == 2 || K== 3); assert(sizeof(size_t) == sizeof(int64_t)); alder_sparse_sa_t *sparseSA = malloc(sizeof(alder_sparse_sa_t)); if (sparseSA == NULL) { logc_logWarning(ERROR_LOGGER, "cannot a sparse suffix array."); GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL); } /////////////////////////////////////////////////////////////////////////// // Set sparseSA->fS, and initialize others K, N, logN, NKm1. /////////////////////////////////////////////////////////////////////////// int64_t tSeq = 0; int64_t tBase = 0; if (refType == 0) { alder_fasta_list_length(ref, &tSeq, &tBase); sparseSA->fS = alder_fasta_list_alloc(ref, tSeq, tBase, K); } else if (refType == 1) { } if (sparseSA->fS == NULL) { free(sparseSA); GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL); } sparseSA->S = sparseSA->fS->data; int64_t S_length = sparseSA->fS->numberOfBase; logc_log(MAIN_LOGGER, LOG_FINE, "Number of files=%d", ref->qty); logc_log(MAIN_LOGGER, LOG_FINE, "Number of sequences=%lld", tSeq); logc_log(MAIN_LOGGER, LOG_FINE, "Number of bases=%lld", tBase); // Increase string length so divisible by K. // Don't forget to count $ termination character. int64_t appendK = S_length % K == 0 ? K : K - S_length % K; if (sparseSA->fS->sizeCapacity < S_length + appendK + 1) { alder_fasta_list_free(sparseSA->fS); free(sparseSA); GSL_ERROR_VAL("sparse sa alloc failed: increase sSizeCapacity" "in alder_fasta.rl", GSL_ENOMEM, NULL); } assert(appendK > 0); for(int64_t i = 0; i < appendK; i++) { sparseSA->fS->data[sparseSA->fS->numberOfBase + i] = '$'; } sparseSA->fS->data[sparseSA->fS->numberOfBase + appendK] = '\0'; sparseSA->fS->sizeOfDataWithDollar = sparseSA->fS->numberOfBase + appendK; sparseSA->K = K; sparseSA->N = sparseSA->fS->sizeOfDataWithDollar; int64_t N = sparseSA->N; sparseSA->logN = (int64_t)ceil(log2((double)(N/K))); sparseSA->NKm1 = N/K-1; assert(N%K==0); if (K == 0 || K == 1 || K > sMaxK) { alder_fasta_list_free(sparseSA->fS); free(sparseSA); GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL); } /////////////////////////////////////////////////////////////////////////// // Build SA /////////////////////////////////////////////////////////////////////////// assert(1 < K && K < 4); uint16_t bucketNr = 1; int64_t *intSA = malloc((N/K+1)*sizeof(int64_t)); if (intSA == NULL) { alder_fasta_list_free(sparseSA->fS); free(sparseSA); GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL); } for(int64_t i = 0; i < N/K+1; i++) intSA[i] = i; // Init SA. uint8_t *t_new = malloc((N/K+1)*sizeof(uint8_t)); if (t_new == NULL) { free(intSA); alder_fasta_list_free(sparseSA->fS); free(sparseSA); GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL); } int64_t *BucketBegin = malloc(BUCKETBEGINSIZE * sizeof(int64_t)); if (BucketBegin == NULL) { free(t_new); free(intSA); alder_fasta_list_free(sparseSA->fS); free(sparseSA); GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL); } //////////////////////////////////////////////// // Radix sort. //////////////////////////////////////////////// // intSA: size of N/K+1 initialized 0 to N/K. // t_new: could be the result? // bucketNr: ??? // BucketBegin: some temp we may not need this. // l: 0 // r: N/K-1 // h: 0 // What are these l, r, h? alder_radixStep(sparseSA, t_new, intSA, &bucketNr, BucketBegin, 0, N/K-1, 0); assert(intSA[N/K] == N/K); t_new[N/K] = 0; // Terminate new integer string. free(BucketBegin); logc_log(MAIN_LOGGER, LOG_FINE, "bucketNr=%d", bucketNr); logc_log(MAIN_LOGGER, LOG_FINE, "N/K=%lld", N/K); #ifdef MAIN_LOGGER for (size_t i = 0; i < N/K + 1; i++) { logc_log(MAIN_LOGGER, LOG_FINEST, "t_new and intSA [%zd] %d\t%lld", i, (int)t_new[i], intSA[i]); } #endif //////////////////////////////////////////////// // Replace this with libdivsufsort64. //////////////////////////////////////////////// // t_new is the input // intSA is the suffix array of t_new. // size of intSA is N/K. // 0 .. bucketNr - 1 // Makes suffix array p of x. x becomes inverse of p. p and x are both of size // n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original // contents of x[n] is disregarded, the n-th symbol being regarded as // end-of-string smaller than all other symbols. // // void suffixsort(int *x, int *p, int n, int k, int l) // suffixsort(t_new, intSA_int, (int)(N/K), bucketNr_int, 0); // // t_new is uint8_t // intSA is int64_t // N/K is the size of the arrays. // N/K+1 makes more sense; 0 at N/K position is used as the terminator. // because SA takes values from 1-st position not from 0-th position. // and the original source mentioned that the last character is // the terminator. suffixsort seems to mention x[n] or the last // character is disregarded, or ithe n-th symbol being regarded as // end-of-string smaller than all other symbols. // N/K was the original input; saint_t status = divsufsort64(t_new, intSA, N/K+1); if (status != 0) { fprintf(stderr, "error: divsufsort64\n"); } #ifdef MAIN_LOGGER logc_log(MAIN_LOGGER, LOG_FINE, "divsufsort64 is called"); for (size_t i = 0; i < N/K + 1; i++) { logc_log(MAIN_LOGGER, LOG_FINEST, "intSA [%zd] %lld", i, intSA[i]); } #endif free(t_new); //////////////////////////////////////////////// // Translate suffix array: set sparseSA->SA. //////////////////////////////////////////////// sparseSA->SA = malloc((N/K)*sizeof(int64_t)); if (sparseSA->SA == NULL) { free(intSA); alder_fasta_list_free(sparseSA->fS); free(sparseSA); GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL); } for (int64_t i = 0; i < N/K; i++) { sparseSA->SA[i] = intSA[i+1] * K; } free(intSA); #ifdef MAIN_LOGGER logc_log(MAIN_LOGGER, LOG_FINEST, "SA is computed using intSA (deleted)"); for (size_t i = 0; i < N/K; i++) { logc_log(MAIN_LOGGER, LOG_FINEST, "SA [%zd] %lld", i, sparseSA->SA[i]); } #endif /////////////////////////////////////////////////////////////////////////// // Build ISA using sparse SA. /////////////////////////////////////////////////////////////////////////// sparseSA->ISA = malloc((N/K)*sizeof(int64_t)); if (sparseSA->ISA == NULL) { free(sparseSA->SA); alder_fasta_list_free(sparseSA->fS); free(sparseSA); GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL); } for (int64_t i = 0; i < N/K; i++) { sparseSA->ISA[sparseSA->SA[i]/K] = i; } /////////////////////////////////////////////////////////////////////////// // Initialize LCP : SA + ISA -> LCP /////////////////////////////////////////////////////////////////////////// sparseSA->LCP = malloc((N/K)*sizeof(int64_t)); if (sparseSA->LCP == NULL) { free(sparseSA->ISA); free(sparseSA->SA); alder_fasta_list_free(sparseSA->fS); free(sparseSA); GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL); } alder_computeLCP(sparseSA); #ifdef MAIN_LOGGER logc_log(MAIN_LOGGER, LOG_FINE, "ISA is computed using SA."); for (size_t i = 0; i < N/K; i++) { logc_log(MAIN_LOGGER, LOG_FINEST, "ISA [%zd] %lld", i, sparseSA->ISA[i]); } for (size_t i = 0; i < N/K; i++) { logc_log(MAIN_LOGGER, LOG_FINEST, "LCP [%zd] %lld", i, sparseSA->LCP[i]); } #endif return sparseSA; }