// Base case: builds the suffix array, inverse suffix array and psi values of
// the current block, rebuilds the cumulative symbol counter and stores psi in
// _old_psi.
//
// Working-memory layout used below (all offsets are in elements of _wm):
//   [0, n)    suffix array
//   [n, 2n)   inverse suffix array
//   [2n, 3n)  psi values
// where n == _current_block_size, so _wm has to provide at least 3n entries.
//
// NOTE(review): the return code of divsufsort64 is not checked here.
void base_case() {
     // Sort the suffixes of the block; the text is handed over as unsigned bytes.
     auto sa = _wm;
     const unsigned char* t = (const unsigned char*) _old_block.c_str();
     divsufsort64(t, sa, this->_current_block_size);

     // Invert the suffix array: isa[p] is the rank of the suffix starting at p.
     auto isa = &_wm[_current_block_size];
     for (int64_t i = 0; i < _current_block_size; i++) {
         isa[sa[i]] = i;
     }

     // psi[i] is the rank of the suffix that starts one position after the
     // suffix of rank i; the modulo wraps the last text position to position 0.
     auto psi_values = &_wm[_current_block_size * 2];
     for (int64_t i = 0; i < _current_block_size; i++) {
         psi_values[i] = isa[(sa[i] + 1) % _current_block_size];
     }

     // Count symbol occurrences. The symbol must be read as an unsigned byte:
     // with a signed char, bytes >= 0x80 would produce a negative index and
     // write outside the counter. This also matches how the sort above
     // interprets the text.
     std::vector<int64_t> _new_symbol_counter(_alphabet_size + 1, 0);
     for (uint64_t i = 0; i < _old_block.size(); i++) {
         _new_symbol_counter[static_cast<unsigned char>(_old_block[i])]++;
     }

     // Turn the counts into an exclusive prefix sum: afterwards entry c holds
     // the number of symbols smaller than c.
     int64_t temp = _new_symbol_counter[0];
     _new_symbol_counter[0] = 0;
     for (int64_t i = 1; i <= _alphabet_size; i++) {
         int64_t aux = _new_symbol_counter[i];
         _new_symbol_counter[i] = _new_symbol_counter[i - 1] + temp;
         temp = aux;
     }
     _symbol_counter = std::move(_new_symbol_counter);

     // Hand the psi values over to _old_psi for the next iterations.
     // NOTE(review): configure()/post_configure() presumably bracket the
     // bulk load of the compressed structure -- confirm against its class.
     _old_psi.configure();
     for (int64_t i = 0; i < _current_block_size; i++) {
         _old_psi.set(i, psi_values[i]);
     }
     _old_psi.post_configure();
 }
// Computes the suffix array of the text c[0..len-1] into sa by calling
// divsufsort (32-bit indices) or divsufsort64 (64-bit indices).
//
// c   - text bytes, passed unchanged to the sorter.
// len - number of text bytes.
// sa  - receives the suffix array. In the in-place paths it is resized to len
//       and its width on entry is restored before returning. The copy path
//       writes sa[0..len-1] without resizing, so the caller presumably sized
//       it beforehand -- TODO confirm.
//
// Throws std::logic_error in the copy path when sa.width() is smaller than
// bits::hi(len)+1.
//
// NOTE(review): fixedIntWidth is not declared in this block; presumably it is
// a template parameter of the enclosing declaration.
// NOTE(review): the return codes of divsufsort/divsufsort64 are not checked.
void calculate_sa(const unsigned char* c, typename int_vector<fixedIntWidth>::size_type len, int_vector<fixedIntWidth>& sa)
{
    typedef typename int_vector<fixedIntWidth>::size_type size_type;
    if (len <= 1) { // handle special case
        // Replace sa by a freshly constructed vector built from (len, 0);
        // the sorter is not called at all.
        sa = int_vector<fixedIntWidth>(len,0);
        return;
    }
    // 32-bit indices suffice when size_type itself has at most 4 bytes or the
    // text is shorter than 0x7FFFFFFF bytes.
    bool small_file = (sizeof(len) <= 4 or len < 0x7FFFFFFFULL);
    if (small_file) {
        uint8_t oldIntWidth = sa.width();
        // In-place path: taken when the width is fixed to 32 bits, or when it
        // is variable (0) and the current width is at most 32 bits.
        if (32 == fixedIntWidth or (0==fixedIntWidth and 32 >= oldIntWidth)) {
            // Switch sa to 32-bit entries so that its storage can be handed to
            // divsufsort as an int32_t array.
            sa.width(32);
            sa.resize(len);
            divsufsort(c, (int32_t*)sa.m_data, len);
            // copy integers back to the right positions
            if (oldIntWidth!=32) {
                // Repack front to back: entry i is read as 32 bits at bit
                // offset i*32 (i<<5) and rewritten with oldIntWidth bits at bit
                // offset i*oldIntWidth. oldIntWidth is below 32 here, so the
                // write offset never passes the read offset.
                // NOTE(review): argument meaning of set_int/get_int is inferred
                // from this call -- confirm against int_vector.
                for (size_type i=0; i<len; ++i) {
                    sa.set_int(i*oldIntWidth, sa.get_int(i<<5, 32), oldIntWidth);
                }
                sa.width(oldIntWidth);
                sa.resize(len);
            }
        } else {
            // Copy path: sort into a temporary 32-bit vector and copy entry by
            // entry; sa keeps its own width.
            if (sa.width() < bits::hi(len)+1) {
                throw std::logic_error("width of int_vector is to small for the text!!!");
            }
            int_vector<> sufarray(len,0,32);
            divsufsort(c, (int32_t*)sufarray.m_data, len);
            for (size_type i=0; i<len; ++i) {
                sa[i] = sufarray[i];
            }
        }
    } else {
        // Large text: same in-place scheme as above with 64-bit entries and
        // divsufsort64.
        uint8_t oldIntWidth = sa.width();
        sa.width(64);
        sa.resize(len);
        divsufsort64(c, (int64_t*)sa.m_data, len);
        // copy integers back to the right positions
        if (oldIntWidth!=64) {
            // Entry i is read as 64 bits at bit offset i*64 (i<<6) and
            // rewritten with oldIntWidth bits at bit offset i*oldIntWidth.
            for (size_type i=0; i<len; ++i) {
                sa.set_int(i*oldIntWidth, sa.get_int(i<<6, 64), oldIntWidth);
            }
            sa.width(oldIntWidth);
            sa.resize(len);
        }
    }
}
  /*
   * Builds the rank array, the LCP array and the RMQ structure for the text
   * sequence + pattern + DELIM.
   *
   * On success the members rank, lcp and rmq are set and 1 is returned.
   * On failure a message is written to stderr and 0 is returned; the local
   * text and suffix-array buffers are released in every case. Member arrays
   * that were already allocated at the point of failure are left in place.
   *
   * NOTE(review): a plain new[] reports failure by throwing std::bad_alloc,
   * so the NULL checks below only matter with a non-throwing allocator.
   * NOTE(review): if neither _USE_64 nor _USE_32 is defined, sa is never
   * filled before it is read.
   */
  unsigned int
  DegeneratePatternMatch::prepareForRMQ(){
    /* Prepare SP$  */
    const INT len = (sLen + pLen) + 1;
    unsigned char* text = new unsigned char[len];
    std::copy(sequence,sequence+sLen ,text );
    std::copy(pattern,pattern+pLen ,text+sLen );
    text[len-1] = DELIM;

    /* Compute Suffix Array */
    INT* sa = new INT[len];
    if(sa == NULL){
      fprintf (stderr,"Cannot allocate memory for SA.");
      delete[] text;
      return (0);
    }
#ifdef _USE_64
    if(divsufsort64(text, sa, len) != 0){
      fprintf (stderr, "SA computation failed");
      delete[] sa;
      delete[] text;
      return (0);
    }
#endif
#ifdef _USE_32
    if(divsufsort(text, sa, len) != 0){
      fprintf (stderr,"SA computation failed");
      delete[] sa;
      delete[] text;
      return (0);
    }
#endif
    /* Compute Rank array: rank[p] is the position of suffix p in sa */
    rank = new INT[len];
    if(rank == NULL){
      fprintf (stderr,"Cannot allocate memory for Rank Array");
      delete[] sa;
      delete[] text;
      return (0);
    }
    for(INT i = 0; i < len; i++ ){
      rank [sa[i]] = i;
    }

    /* Compute LCP array */
    lcp = new INT[len];
    if(lcp == NULL){
      fprintf (stderr,"Cannot allocate memory for LCP Array");
      delete[] sa;
      delete[] text;
      return (0);
    }
    if(constructLCPArray(text, len, sa) != 1){
      fprintf (stderr,"LCP computation failed");
      delete[] sa;
      delete[] text;
      return (0);
    }

    // Delete SA and text as they are not needed now
    delete[] sa;
    delete[] text;

    /* Prepare LCP array for RMQ */
    // create a vector of length len and initialize it with 0s
    sdsl::int_vector<> v(len , 0 );
    for(INT i = 0; i < len; i++){
      v[i] = lcp[i];
    }
    rmq = new sdsl::rmq_succinct_sct<>(&v);
    // v is not required now
    sdsl::util::clear(v);
    //cout << "prepared for rmq" << endl;

    // Report success explicitly: flowing off the end of a function that
    // returns a value is undefined behaviour.
    return (1);
  }
Exemple #4
0
/* This creates a sparse suffix array.
 * S - a string of characters
 * nS - the size of S
 * M - the size of memory available for S (N < M)
 * K - the sparseness of the suffix array; must satisfy 2 <= K <= sMaxK
 * It returns a sparse suffix array if successful, NULL otherwise.
 *
 * NULL is returned when K is out of range, when appending the terminator
 * fails, when any allocation fails, or when divsufsort64 reports an error.
 * In every failure case all memory allocated here is released. S is only
 * referenced by the result (sparseSA->S) and is never freed here.
 */
alder_ssa_core_t * alder_ssa_core_init_withS(char *S,
                                             const int64_t nS,
                                             const int64_t M,
                                             const int64_t K)
{
    int s = 0;
    int64_t N = 0;
    // K is used as a divisor and in allocation sizes, so zero, one and
    // negative values are all rejected up front.
    if (K < 2 || K > sMaxK) return NULL;
    s = alder_ssa_khan09_appendDollarSign(S, nS, M, K, &N);
    if (s != 0) return NULL;
    
    alder_ssa_core_t *sparseSA = malloc(sizeof(alder_ssa_core_t));
    int64_t *intSA = malloc((N/K+1)*sizeof(int64_t));
    uint8_t *t_new = malloc((N/K+1)*sizeof(uint8_t));
    int64_t *BucketBegin = malloc(BUCKETBEGINSIZE * sizeof(int64_t));
    int64_t *SA = malloc((N/K)*sizeof(int64_t));
    int64_t *ISA = malloc((N/K)*sizeof(int64_t));
    int64_t *LCP = malloc((N/K)*sizeof(int64_t));
    if (sparseSA == NULL || intSA == NULL || t_new == NULL ||
        BucketBegin == NULL || SA == NULL || ISA == NULL || LCP == NULL) {
        XFREE(sparseSA);
        XFREE(intSA);
        XFREE(t_new);
        XFREE(BucketBegin);
        XFREE(SA);
        XFREE(ISA);
        XFREE(LCP);
        return NULL;
    }
    
    ///////////////////////////////////////////////////////////////////////////
    // Build SA
    ///////////////////////////////////////////////////////////////////////////
    sparseSA->S = S;
    sparseSA->K = K;
    sparseSA->N = N;
    sparseSA->logN = (int64_t)ceil(log2((double)(N/K)));
    sparseSA->NKm1 = N/K-1;
    assert(N%K==0);
    uint16_t bucketNr = 1;
    for(int64_t i = 0; i < N/K+1; i++) intSA[i] = i;
    alder_ssa_khan09_radixStep(S, K, BUCKETBEGINSIZE, t_new, intSA, &bucketNr,
                               BucketBegin, 0, N/K-1, 0);
    assert(intSA[N/K] == N/K);
    t_new[N/K] = 0; // Terminate new integer string.
    s = (saint_t) divsufsort64(t_new, intSA, N/K+1);
    if (s != 0) {
        // intSA is not a valid suffix array; everything derived from it
        // would be garbage, so give up instead of returning it.
        XFREE(sparseSA);
        XFREE(intSA);
        XFREE(t_new);
        XFREE(BucketBegin);
        XFREE(SA);
        XFREE(ISA);
        XFREE(LCP);
        return NULL;
    }
    ////////////////////////////////////////////////
    // Translate suffix array: set sparseSA->SA.
    // intSA[0] belongs to the terminator appended at t_new[N/K] and is
    // skipped; the remaining entries are scaled back to positions in S.
    for (int64_t i = 0; i < N/K; i++) SA[i] = intSA[i+1] * K;
    ///////////////////////////////////////////////////////////////////////////
    // Build ISA using sparse SA.
    for (int64_t i = 0; i < N/K; i++) ISA[SA[i]/K] = i;
    ///////////////////////////////////////////////////////////////////////////
    // Initialize LCP : SA + ISA -> LCP
    alder_ssa_khan09_computeLCP(S, SA, ISA, LCP, N, K);
    
    sparseSA->SA = SA;
    sparseSA->ISA = ISA;
    sparseSA->LCP = LCP;
    XFREE(intSA);
    XFREE(t_new);
    XFREE(BucketBegin);
    return sparseSA;
}
Exemple #5
0
/**
 * This function builds a sparse suffix array using files with sequences.
 * Argument:
 * ref - file names.
 * refType - 0 for fasta, 1 for fastq. Only fasta input is loaded here; any
 *           other value leaves no sequence data and the call fails.
 * K - sparseness of the suffix array; the text is padded with '$' so that
 *     its length is a multiple of K. Asserted to be 2 or 3.
 * Return:
 * a sparse suffix array if successful, and NULL otherwise.
 *
 * Failures are reported through GSL_ERROR_VAL, except a divsufsort64 error,
 * which is printed to stderr. Everything allocated here is released before
 * a failure return.
 */
alder_sparse_sa_t *alder_sparse_sa_alloc_file (const struct bstrList *ref,
                                               const int refType,
                                               const int64_t K)
{
    assert(K == 2 || K== 3);
    assert(sizeof(size_t) == sizeof(int64_t));
    // K is used as a divisor below, so it has to be validated before any
    // arithmetic; this matters when the asserts are compiled out.
    if (K < 2 || K > sMaxK) {
        GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL);
    }
    
    alder_sparse_sa_t *sparseSA = malloc(sizeof(alder_sparse_sa_t));
    if (sparseSA == NULL) {
        logc_logWarning(ERROR_LOGGER, "cannot a sparse suffix array.");
        GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL);
    }
    // malloc does not initialize the struct; without this the NULL test
    // below would read an indeterminate pointer whenever refType != 0.
    sparseSA->fS = NULL;
    
    ///////////////////////////////////////////////////////////////////////////
    // Set sparseSA->fS, and initialize others K, N, logN, NKm1.
    ///////////////////////////////////////////////////////////////////////////
    int64_t tSeq = 0;
    int64_t tBase = 0;
    if (refType == 0) {
        alder_fasta_list_length(ref, &tSeq, &tBase);
        sparseSA->fS = alder_fasta_list_alloc(ref, tSeq, tBase, K);
    } else if (refType == 1) {
        // fastq input is not handled; fS stays NULL and the call fails below.
    }
    
    if (sparseSA->fS == NULL) {
        free(sparseSA);
        GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL);
    }
    sparseSA->S = sparseSA->fS->data;
    int64_t S_length = sparseSA->fS->numberOfBase;
    logc_log(MAIN_LOGGER, LOG_FINE, "Number of files=%d", ref->qty);
    logc_log(MAIN_LOGGER, LOG_FINE, "Number of sequences=%lld", tSeq);
    logc_log(MAIN_LOGGER, LOG_FINE, "Number of bases=%lld", tBase);
    
    // Increase string length so divisible by K.
    // Don't forget to count $ termination character.
    // At least one '$' is always appended: a length that is already a
    // multiple of K gets a full extra group of K.
    int64_t appendK = S_length % K == 0 ? K : K - S_length % K;
    if (sparseSA->fS->sizeCapacity < S_length + appendK + 1) {
        alder_fasta_list_free(sparseSA->fS);
        free(sparseSA);
        GSL_ERROR_VAL("sparse sa alloc failed: increase sSizeCapacity"
                      "in alder_fasta.rl",
                      GSL_ENOMEM, NULL);
    }
    assert(appendK > 0);
    for(int64_t i = 0; i < appendK; i++) {
        sparseSA->fS->data[sparseSA->fS->numberOfBase + i] = '$';
    }
    sparseSA->fS->data[sparseSA->fS->numberOfBase + appendK] = '\0';
    sparseSA->fS->sizeOfDataWithDollar = sparseSA->fS->numberOfBase + appendK;
    sparseSA->K = K;
    sparseSA->N = sparseSA->fS->sizeOfDataWithDollar;
    int64_t N = sparseSA->N;
    sparseSA->logN = (int64_t)ceil(log2((double)(N/K)));
    sparseSA->NKm1 = N/K-1;
    assert(N%K==0);
    
    ///////////////////////////////////////////////////////////////////////////
    // Build SA
    ///////////////////////////////////////////////////////////////////////////
    assert(1 < K && K < 4);
    uint16_t bucketNr = 1;
    int64_t *intSA = malloc((N/K+1)*sizeof(int64_t));
    if (intSA == NULL) {
        alder_fasta_list_free(sparseSA->fS);
        free(sparseSA);
        GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL);
    }
    for(int64_t i = 0; i < N/K+1; i++) intSA[i] = i; // Init SA.
    uint8_t *t_new = malloc((N/K+1)*sizeof(uint8_t));
    if (t_new == NULL) {
        free(intSA);
        alder_fasta_list_free(sparseSA->fS);
        free(sparseSA);
        GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL);
    }
    int64_t *BucketBegin = malloc(BUCKETBEGINSIZE * sizeof(int64_t));
    if (BucketBegin == NULL) {
        free(t_new);
        free(intSA);
        alder_fasta_list_free(sparseSA->fS);
        free(sparseSA);
        GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL);
    }
    
    ////////////////////////////////////////////////
    // Radix sort.
    ////////////////////////////////////////////////
    // intSA: N/K+1 entries, initialized to 0 .. N/K.
    // t_new: receives the new integer string (one symbol per K-group).
    // bucketNr: in/out bucket counter, logged below.
    // BucketBegin: scratch buffer, released right after this step.
    // The trailing arguments 0, N/K-1, 0 are the initial l, r, h values.
    // NOTE(review): the meaning of l, r, h is not documented here.
    alder_radixStep(sparseSA, t_new, intSA, &bucketNr, BucketBegin, 0, N/K-1, 0);
    assert(intSA[N/K] == N/K);
    t_new[N/K] = 0; // Terminate new integer string.
    free(BucketBegin);
    logc_log(MAIN_LOGGER, LOG_FINE, "bucketNr=%d", bucketNr);
    logc_log(MAIN_LOGGER, LOG_FINE, "N/K=%lld", N/K);
#ifdef MAIN_LOGGER
    for (size_t i = 0; i < N/K + 1; i++) {
        logc_log(MAIN_LOGGER, LOG_FINEST,
                 "t_new and intSA [%zd] %d\t%lld",
                 i, (int)t_new[i], intSA[i]);
    }
#endif
    
    ////////////////////////////////////////////////
    // Suffix-sort the integer string with libdivsufsort64.
    ////////////////////////////////////////////////
    // t_new (uint8_t) is the input and intSA (int64_t) receives its suffix
    // array. This call replaced an earlier
    //   suffixsort(t_new, intSA_int, (int)(N/K), bucketNr_int, 0);
    // which treated position n as an implicit end-of-string symbol smaller
    // than all others. Here the terminator is explicit: the 0 stored at
    // t_new[N/K] is part of the input, hence the length N/K+1, and the
    // translation below reads the result starting at intSA[1].
    saint_t status = divsufsort64(t_new, intSA, N/K+1);
    if (status != 0) {
        fprintf(stderr, "error: divsufsort64\n");
        // intSA is not a valid suffix array; building SA/ISA/LCP from it
        // would produce garbage, so release everything and fail.
        free(t_new);
        free(intSA);
        alder_fasta_list_free(sparseSA->fS);
        free(sparseSA);
        return NULL;
    }
#ifdef MAIN_LOGGER
    logc_log(MAIN_LOGGER, LOG_FINE, "divsufsort64 is called");
    for (size_t i = 0; i < N/K + 1; i++) {
        logc_log(MAIN_LOGGER, LOG_FINEST,
                 "intSA [%zd] %lld",
                 i, intSA[i]);
    }
#endif
    free(t_new);
    
    ////////////////////////////////////////////////
    // Translate suffix array: set sparseSA->SA.
    ////////////////////////////////////////////////
    sparseSA->SA = malloc((N/K)*sizeof(int64_t));
    if (sparseSA->SA == NULL) {
        free(intSA);
        alder_fasta_list_free(sparseSA->fS);
        free(sparseSA);
        GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL);
    }
    // Skip intSA[0] and scale the remaining entries back to text positions.
    for (int64_t i = 0; i < N/K; i++) {
        sparseSA->SA[i] = intSA[i+1] * K;
    }
    free(intSA);
#ifdef MAIN_LOGGER
    logc_log(MAIN_LOGGER, LOG_FINEST,
             "SA is computed using intSA (deleted)");
    for (size_t i = 0; i < N/K; i++) {
        logc_log(MAIN_LOGGER, LOG_FINEST,
                 "SA [%zd] %lld",
                 i, sparseSA->SA[i]);
    }
#endif
    
    ///////////////////////////////////////////////////////////////////////////
    // Build ISA using sparse SA.
    ///////////////////////////////////////////////////////////////////////////
    sparseSA->ISA = malloc((N/K)*sizeof(int64_t));
    if (sparseSA->ISA == NULL) {
        free(sparseSA->SA);
        alder_fasta_list_free(sparseSA->fS);
        free(sparseSA);
        GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL);
    }
    for (int64_t i = 0; i < N/K; i++) {
        sparseSA->ISA[sparseSA->SA[i]/K] = i;
    }
    
    ///////////////////////////////////////////////////////////////////////////
    // Initialize LCP : SA + ISA -> LCP
    ///////////////////////////////////////////////////////////////////////////
    sparseSA->LCP = malloc((N/K)*sizeof(int64_t));
    if (sparseSA->LCP == NULL) {
        free(sparseSA->ISA);
        free(sparseSA->SA);
        alder_fasta_list_free(sparseSA->fS);
        free(sparseSA);
        GSL_ERROR_VAL("sparse sa alloc failed", GSL_ENOMEM, NULL);
    }
    alder_computeLCP(sparseSA);
    
#ifdef MAIN_LOGGER
    logc_log(MAIN_LOGGER, LOG_FINE, "ISA is computed using SA.");
    for (size_t i = 0; i < N/K; i++) {
        logc_log(MAIN_LOGGER, LOG_FINEST,
                 "ISA [%zd] %lld",
                 i, sparseSA->ISA[i]);
    }
    for (size_t i = 0; i < N/K; i++) {
        logc_log(MAIN_LOGGER, LOG_FINEST,
                 "LCP [%zd] %lld",
                 i, sparseSA->LCP[i]);
    }
#endif
    
    return sparseSA;
}