Exemple #1
0
/**
 * Computes the rank array (inverse suffix array) and the LCP array of `s`.
 *
 * The suffix array is built with divsufsort() into a temporary buffer that is
 * released before returning. On return, for every text position i,
 * rank[i] is the index of suffix i in the suffix array, and for every x > 0,
 * lcp[x] is the length of the longest common prefix of the suffixes at
 * suffix-array indices x-1 and x. lcp[0] is set to 0.
 *
 * @param s     text to index
 * @param lcp   output array; entries [0, s.size()) are written
 * @param rank  output array; entries [0, s.size()) are written
 *
 * For an empty string nothing is written to either array.
 *
 * NOTE(review): the return value of divsufsort() is not checked here, and
 * lengths that do not fit in an int are treated like an empty text.
 */
static void Create_LCP_RANK(const std::string &s, size_t lcp[], size_t rank[]){
  const unsigned char * text = reinterpret_cast<const unsigned char *>(s.c_str());
  const int N = static_cast<int>(s.size());

  // Empty text (or a length that overflowed int): there is no lcp[0] to
  // write, so leave the output arrays untouched.
  if(N <= 0) return;

  int * sa = new int[N];

  // create suffix array, then derive rank as its inverse
  divsufsort(text, sa, N);
  for(int i = 0; i < N; i++) rank[sa[i]] = i;

  // LCP array: walk the suffixes in text order and reuse the previous match
  // length minus one as the starting point for the next comparison.
  lcp[0] = 0;
  const unsigned char * const ep = text + N;
  int h = 0;
  for(int i = 0; i < N; i++){
    const int x = static_cast<int>(rank[i]);
    if(x > 0){
      const int j = sa[x-1];              // suffix preceding suffix i in sorted order
      const unsigned char * p1 = text + i + h;
      const unsigned char * p0 = text + j + h;
      while((p0 != ep) && (p1 != ep) && (*p1 == *p0)){
        p1++; p0++; h++;
      }
      lcp[x] = h;
      if(h > 0) h--;
    }
  }
  delete [] sa; // we don't need suffix array anymore
}
/**
 * Allocates and fills the suffix array (m_sufarr), the inverse suffix array
 * (m_invsufarr) and the LCP array (m_lcparr) for the m_strLen bytes at m_str.
 *
 * m_lcparr[k] holds the length of the longest common prefix of the suffixes
 * at suffix-array indices k-1 and k; m_lcparr[0] is 0.
 */
void suffix::SuffixArray::build()
{

    m_sufarr = new saidx_t[m_strLen];
    m_invsufarr = new saidx_t[m_strLen];
    m_lcparr = new saidx_t[m_strLen];

    // The call must live outside assert(): with NDEBUG defined the assert
    // expression is not evaluated and the suffix array would never be built.
    const int status = divsufsort(m_str, m_sufarr, (saidx_t)m_strLen);
    assert(0 == status);
    (void)status; // silence "unused" warnings in NDEBUG builds

    // compute inverse suffix array
    for (saidx_t i = 0; i < m_strLen; ++i)
    {
        m_invsufarr[m_sufarr[i]] = i;
    }

    // compute lcp array
    saidx_t k,j;

    for(saidx_t i = 0, l = 0; i < m_strLen; ++i)
    {
        k = m_invsufarr[i];
        if (k == 0)
        {
            // The smallest suffix has no predecessor; reading m_sufarr[k-1]
            // here would index one element before the array.
            m_lcparr[0] = 0;
            l = 0;
            continue;
        }
        j = m_sufarr[k-1];
        // the bounds checks on i+l and j+l are required unless the text ends
        // with a unique terminator character
        while (i+l < m_strLen && j+l < m_strLen && m_str[i+l]==m_str[j+l]) ++l;
        m_lcparr[k] = l;
        if (l > 0) --l;
    }

}
Exemple #3
0
/* Get the suffix array (SA) of text.
 *
 * The SA is cached in "<filename>.sa". If that file is readable it is loaded
 * with readArray(); otherwise the SA is built with divsufsort() and written
 * with saveArray(). When filename is NULL, "stdin" is used, and for the names
 * "stdin" and "stdin.rev" the SA is always rebuilt (and still saved).
 *
 * text:     input bytes
 * length:   number of bytes in text
 * filename: name used to derive the cache file, may be NULL
 *
 * Returns the suffix array, or NULL if construction failed. A failure to
 * save the cache file is reported on stderr but the array is still returned.
 *
 * NOTE(review): the array comes from new[] on one path and from readArray()
 * on the other; confirm callers release it in a way valid for both. The
 * length read from the cache file is not compared against `length`.
 */
unsigned int* getSA(unsigned char const* text, int length, const char* filename){
    char filename_temp[] = "stdin";
    char filename_sa[1024];
    unsigned int* sa;
    if(filename == NULL)
        filename = filename_temp;
    /*filename = basename(filename);*/ /*not sure about this*/
    snprintf(filename_sa,1024,"%s.sa",filename);
    /* checks if we already computed the SA*/
    if(access(filename_sa, R_OK)!=0 || strcmp(filename,"stdin") == 0 || strcmp(filename, "stdin.rev") == 0){
        sa = new unsigned int[length];
        if(divsufsort(text, (saidx_t*)sa, (saidx_t)length) != 0){
            fprintf(stderr, "ERROR: SA: constructing the SA\n");
            /* The buffer content is indeterminate: do not cache it on disk
             * and do not hand it to the caller. */
            delete [] sa;
            return NULL;
        }
        unsigned int err = saveArray((unsigned char*)sa, length * sizeof(unsigned int), filename_sa);
        if(err != 0){
            fprintf(stderr,"ERROR: SA: saving the SA\n");
        }
    }else{
        unsigned int salen;
        sa = (unsigned int*)readArray(filename_sa, &salen, sizeof(unsigned int));
    }
    return sa;
}
void calculate_sa(const unsigned char* c, typename int_vector<fixedIntWidth>::size_type len, int_vector<fixedIntWidth>& sa)
{
    typedef typename int_vector<fixedIntWidth>::size_type size_type;
    if (len <= 1) { // handle special case
        sa = int_vector<fixedIntWidth>(len,0);
        return;
    }
    bool small_file = (sizeof(len) <= 4 or len < 0x7FFFFFFFULL);
    if (small_file) {
        uint8_t oldIntWidth = sa.width();
        if (32 == fixedIntWidth or (0==fixedIntWidth and 32 >= oldIntWidth)) {
            sa.width(32);
            sa.resize(len);
            divsufsort(c, (int32_t*)sa.m_data, len);
            // copy integers back to the right positions
            if (oldIntWidth!=32) {
                for (size_type i=0; i<len; ++i) {
                    sa.set_int(i*oldIntWidth, sa.get_int(i<<5, 32), oldIntWidth);
                }
                sa.width(oldIntWidth);
                sa.resize(len);
            }
        } else {
            if (sa.width() < bits::hi(len)+1) {
                throw std::logic_error("width of int_vector is to small for the text!!!");
            }
            int_vector<> sufarray(len,0,32);
            divsufsort(c, (int32_t*)sufarray.m_data, len);
            for (size_type i=0; i<len; ++i) {
                sa[i] = sufarray[i];
            }
        }
    } else {
        uint8_t oldIntWidth = sa.width();
        sa.width(64);
        sa.resize(len);
        divsufsort64(c, (int64_t*)sa.m_data, len);
        // copy integers back to the right positions
        if (oldIntWidth!=64) {
            for (size_type i=0; i<len; ++i) {
                sa.set_int(i*oldIntWidth, sa.get_int(i<<6, 64), oldIntWidth);
            }
            sa.width(oldIntWidth);
            sa.resize(len);
        }
    }
}
Exemple #5
0
/*
 * Build the suffix array SA for the specified byte array T of length n.
 *
 * The suffix array is a sorted array of the byte array's suffixes, represented
 * by indices into the byte array.  It can equivalently be viewed as a mapping
 * from suffix rank to suffix position.
 *
 * To build the suffix array, we use libdivsufsort, which uses an
 * induced-sorting-based algorithm.  In practice, this seems to be the fastest
 * suffix array construction algorithm currently available.
 *
 * Parameters:
 *	SA	Output array that receives the suffix array.
 *	T	Input byte array.
 *	n	Number of bytes in T.
 *	tmp	Temporary space handed through to divsufsort(); its required
 *		size is fixed by the modified divsufsort() and is not visible
 *		here (TODO confirm against the divsufsort() declaration).
 *
 * References:
 *
 *	Y. Mori.  libdivsufsort, a lightweight suffix-sorting library.
 *	https://code.google.com/p/libdivsufsort/.
 *
 *	G. Nong, S. Zhang, and W.H. Chan.  2009.  Linear Suffix Array
 *	Construction by Almost Pure Induced-Sorting.  Data Compression
 *	Conference, 2009.  DCC '09.  pp. 193 - 202.
 *
 *	S.J. Puglisi, W.F. Smyth, and A. Turpin.  2007.  A Taxonomy of Suffix
 *	Array Construction Algorithms.  ACM Computing Surveys (CSUR) Volume 39
 *	Issue 2, 2007 Article No. 4.
 */
static void
build_SA(u32 SA[], const u8 T[], u32 n, u32 *tmp)
{
	/* Note: divsufsort() requires a fixed amount of temporary space.  The
	 * implementation of divsufsort() has been modified from the original to
	 * use the provided temporary space instead of allocating its own, since
	 * we don't want to have to deal with malloc() failures here.  */
	divsufsort(T, SA, n, tmp);
}
Exemple #6
0
/*
 * Build the suffix array of an upper-cased copy of `genome`.
 *
 * Each of the `gsize` input bytes is mapped through the `uppercase` lookup
 * table into a temporary buffer, the buffer is sorted with divsufsort(), and
 * the temporary buffer is released.
 *
 * Returns a malloc()'ed array of `gsize` entries that the caller must
 * free(), or NULL when either allocation fails.
 *
 * NOTE(review): the result of divsufsort() is not checked because its return
 * type is not visible from here.
 */
int64_t *
compute_sa
(
 char    * genome,
 uint64_t  gsize
)
{
   char * data = malloc(gsize);
   if (data == NULL) return NULL;
   /* Index the table with an unsigned value: a plain char may be negative
    * for bytes >= 0x80, which would read before the start of the table.
    * Assumes `uppercase` has an entry for every byte value -- TODO confirm. */
   for (uint64_t i = 0; i < gsize; i++) data[i] = uppercase[(unsigned char)genome[i]];
   int64_t * sa = malloc(gsize*sizeof(int64_t));
   if (sa == NULL) {
      free(data);   /* do not leak the upper-cased copy on failure */
      return NULL;
   }
   divsufsort((unsigned char *) data, sa, gsize);
   free(data);
   return sa;
}
Exemple #7
0
/*
 * Command-line entry point: loads one input completely into memory and
 * builds its suffix array with divsufsort().
 *
 * argv[1] is either a file name or "-" for standard input. The suffix array
 * is not written anywhere; the timing output and the sufcheck() verification
 * are commented out, so the observable result is the exit status plus any
 * diagnostics on stderr.
 *
 * Returns 0 on success. Every error path prints a message to stderr and
 * calls exit(EXIT_FAILURE).
 */
int
main(int argc, const char *argv[]) {
  FILE *fp;
  const char *fname;
  sauchar_t *T;
  saidx_t *SA;
  LFS_OFF_T n;
  clock_t start, finish; /* only referenced by the commented-out timing code */
  saint_t needclose = 1; /* cleared when reading from stdin, which is not closed */

  /* Check arguments. */
  /* print_help() presumably does not return -- TODO confirm. */
  if((argc == 1) ||
     (strcmp(argv[1], "-h") == 0) ||
     (strcmp(argv[1], "--help") == 0)) { print_help(argv[0], EXIT_SUCCESS); }
  if(argc != 2) { print_help(argv[0], EXIT_FAILURE); }

  /* Open a file for reading. */
  if(strcmp(argv[1], "-") != 0) {
    /* Both preprocessor branches open the same `if` block, which is closed
     * once after the #endif. */
#if HAVE_FOPEN_S
    if(fopen_s(&fp, fname = argv[1], "rb") != 0) {
#else
    if((fp = LFS_FOPEN(fname = argv[1], "rb")) == NULL) {
#endif
      fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], fname);
      perror(NULL);
      exit(EXIT_FAILURE);
    }
  } else {
    /* Where _setmode/_fileno are available, switch stdin to binary mode. */
#if HAVE__SETMODE && HAVE__FILENO
    if(_setmode(_fileno(stdin), _O_BINARY) == -1) {
      fprintf(stderr, "%s: Cannot set mode: ", argv[0]);
      perror(NULL);
      exit(EXIT_FAILURE);
    }
#endif
    fp = stdin;
    fname = "stdin";
    needclose = 0;
  }

  /* Get the file size. */
  /* The size is found by seeking to the end, so the input must be seekable. */
  if(LFS_FSEEK(fp, 0, SEEK_END) == 0) {
    n = LFS_FTELL(fp);
    rewind(fp);
    if(n < 0) {
      fprintf(stderr, "%s: Cannot ftell `%s': ", argv[0], fname);
      perror(NULL);
      exit(EXIT_FAILURE);
    }
    /* Inputs of 0x7fffffff bytes or more are rejected. */
    if(0x7fffffff <= n) {
      fprintf(stderr, "%s: Input file `%s' is too big.\n", argv[0], fname);
      exit(EXIT_FAILURE);
    }
  } else {
    fprintf(stderr, "%s: Cannot fseek `%s': ", argv[0], fname);
    perror(NULL);
    exit(EXIT_FAILURE);
  }

  /* Allocate 5n bytes of memory. */
  /* That is n bytes for the text plus n suffix-array entries (the "5n"
   * figure holds when sizeof(saidx_t) is 4). */
  T = (sauchar_t *)malloc((size_t)n * sizeof(sauchar_t));
  SA = (saidx_t *)malloc((size_t)n * sizeof(saidx_t));
  if((T == NULL) || (SA == NULL)) {
    fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]);
    exit(EXIT_FAILURE);
  }

  /* Read n bytes of data. */
  if(fread(T, sizeof(sauchar_t), (size_t)n, fp) != (size_t)n) {
    fprintf(stderr, "%s: %s `%s': ",
      argv[0],
      (ferror(fp) || !feof(fp)) ? "Cannot read from" : "Unexpected EOF in",
      argv[1]);
    perror(NULL);
    exit(EXIT_FAILURE);
  }
  if(needclose & 1) { fclose(fp); }

  /* Construct the suffix array. */
//~  fprintf(stderr, "%s: %" PRIdOFF_T " bytes ... ", fname, n);
//~  start = clock();
  /* Any non-zero result from divsufsort() is reported as an allocation failure. */
  if(divsufsort(T, SA, (saidx_t)n) != 0) {
    fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]);
    exit(EXIT_FAILURE);
  }
//~  finish = clock();
//~  fprintf(stderr, "%.4f sec\n", (double)(finish - start) / (double)CLOCKS_PER_SEC);

  /* Check the suffix array. */
//~  if(sufcheck(T, SA, (saidx_t)n, 1) != 0) { exit(EXIT_FAILURE); }

  /* Deallocate memory. */
  free(SA);
  free(T);

  return 0;
}
  /**
   * Builds the structures needed for range-minimum queries over the LCP
   * array of the text formed by `sequence`, followed by `pattern`, followed
   * by DELIM.
   *
   * Fills the members `rank` (inverse suffix array), `lcp` (through
   * constructLCPArray()) and `rmq` (an sdsl::rmq_succinct_sct built over a
   * copy of `lcp`). The temporary text and suffix array are released before
   * returning, on success and on failure alike.
   *
   * @return 1 on success, 0 on failure (a message is printed to stderr).
   *
   * NOTE(review): the suffix array is only filled when _USE_64 or _USE_32 is
   * defined; `rank` and `lcp` stay allocated when a later step fails.
   */
  unsigned int
  DegeneratePatternMatch::prepareForRMQ(){
    /* Prepare SP$  */
    const INT len = (sLen + pLen) + 1;
    unsigned char* text = new unsigned char[len];
    std::copy(sequence,sequence+sLen ,text );
    std::copy(pattern,pattern+pLen ,text+sLen );
    text[len-1] = DELIM;

    /* Compute Suffix Array */
    INT* sa = new INT[len];
    if(sa == NULL){
      fprintf (stderr,"Cannot allocate memory for SA.");
      delete[] text;
      return (0);
    }
#ifdef _USE_64
    if(divsufsort64(text, sa, len) != 0){
      fprintf (stderr, "SA computation failed");
      delete[] sa;
      delete[] text;
      return (0);
    }
#endif
#ifdef _USE_32
    if(divsufsort(text, sa, len) != 0){
      fprintf (stderr,"SA computation failed");
      delete[] sa;
      delete[] text;
      return (0);
    }
#endif
    /* Compute Rank array */
    rank = new INT[len];
    if(rank == NULL){
      fprintf (stderr,"Cannot allocate memory for Rank Array");
      delete[] sa;
      delete[] text;
      return (0);
    }
    for(INT i = 0; i < len; i++ ){
      rank [sa[i]] = i;
    }

    /* Compute LCP array */
    lcp = new INT[len];
    if(lcp == NULL){
      fprintf (stderr,"Cannot allocate memory for LCP Array");
      delete[] sa;
      delete[] text;
      return (0);
    }
    if(constructLCPArray(text, len, sa) != 1){
      fprintf (stderr,"LCP computation failed");
      delete[] sa;
      delete[] text;
      return (0);
    }

    // Delete SA and text as they are not needed now
    delete[] sa;
    delete[] text;

    /* Prepare LCP array for RMQ */
    // create a vector of length len and initialize it with 0s
    sdsl::int_vector<> v(len , 0 );
    for(INT i = 0; i < len; i++){
      v[i] = lcp[i];
    }
    rmq = new sdsl::rmq_succinct_sct<>(&v);
    // v is not required now
    sdsl::util::clear(v);
    //cout << "prepared for rmq" << endl;

    // The function is declared to return a value; flowing off the end of it
    // is undefined behaviour. Failure paths return 0, so success returns 1.
    return (1);
  }
/**
 * Loads the next block of reads from block.fp into block.text and prepares
 * the per-block state.
 *
 * The shared semaphores and counters are (re)initialised, up to block.nreads
 * lines are read (the first block stops at block.raw_nreads lines if that is
 * reached first), and the encoding entry of every loaded read is reset. For
 * every block except the first, the suffix array of the block text is built
 * and block.index is replaced by a new MatchingStatistics object.
 *
 * @return false if no line could be read, true otherwise.
 */
bool Encode::next_block()
{
    // A previous block exists: account for its reads and release its semaphores.
    if (block.count > 0) {
        read.processed += block.nreads;

        sem_destroy(&shared.mutex);
        sem_destroy(&shared.nempty);
        sem_destroy(&shared.nstored);
    }

    sem_init(&shared.mutex, 0, 1);
    sem_init(&shared.nempty, 0, MAX_THREADS);
    sem_init(&shared.nstored, 0, 0);

    shared.nread = 0;
    shared.nwrite = 0;
    shared.finished = false;

    block.nreads = config.block_size / read.length;

    char *cursor = (char*)block.text;
    size_t total_bytes = 0;
    size_t loaded = 0;

    while (fgets(cursor, BUFSIZ, block.fp)) {
        const size_t line_len = strlen(cursor);
        assert(line_len == read.length);

        cursor[line_len - 1] = '\0'; // Chomp.

        cursor += line_len;
        total_bytes += line_len;
        ++loaded;

        // The initial raw block may end early, at raw_nreads lines.
        const bool raw_block_done = (block.count == 0 && loaded == block.raw_nreads);
        if (raw_block_done || loaded == block.nreads)
            break;
    }

    if (loaded == 0)
        return false;

    block.nbytes = total_bytes;
    block.nreads = loaded;

    for (size_t r = 0; r < block.nreads; r++) {
        encoding[r].reference = SIZE_MAX;
        encoding[r].ref_position = 0;
        encoding[r].dst_position = 0;
        encoding[r].length = 0;
        encoding[r].isrcomplement = false;
        encoding[r].nbits = ceil_log2(read.processed + 1) + (2 * (read.length - 1));
    }

    block.count++;

    // No index is needed for the initial raw block.
    if (block.count == 1)
        return true;

    divsufsort(block.text, block.sa, (int32_t)block.nbytes);

    // Replace the index of the previous block; deleting a null pointer is a no-op.
    delete block.index;
    block.index = new MatchingStatistics(block.text, block.sa, block.nbytes);

    return true;
}
int main(int argc,char** argv){
	sauchar_t* str;
	saidx_t* SA;
	saidx_t* SAI;
	word n;
	saidx_t* LCP;
	int h,j,k;
    int i;
    text* T = callocx(1,sizeof(text));
    text* P = callocx(1,sizeof(text));
    text* Index = callocx(1,sizeof(text));
    if(argc<4 || argc >5){
        fprintf(stderr,"%s","Error\n");
        fprintf(stderr,"%s%s%s","Usage = ",argv[0]," -create <inputText> <indexFile>\n");
        fprintf(stderr,"%s%s%s","Usage = ",argv[0]," -search <indexFile> <inputText> <patternFile>\n");
        exit(EXIT_FAILURE);
    }
    if(strcmp(argv[1],"-create")==0){
        T->file = fopen(argv[2],"r");
        Index->file = fopen(argv[3],"wb");

        if(T->file==NULL){
            perror("Error: ");
            exit(EXIT_FAILURE);
        }
        if(Index->file==NULL){
            perror("Error: ");
            exit(EXIT_FAILURE);
        }

        fseek(T->file,0,SEEK_END);
        T->length = ftell(T->file);
        rewind(T->file);

        str = callocx(sizeof(sauchar_t),T->length);
        SA = callocx(sizeof(saidx_t),T->length);
        SAI = callocx(sizeof(saidx_t),T->length);
        LCP = callocx(sizeof(saidx_t),T->length);
        fread(str,sizeof(sauchar_t),T->length,T->file);
        if(divsufsort(str,SA,T->length)){
            perror("suffix sort error");
            exit(EXIT_FAILURE);
        }

        for(i=0;i<T->length;i++){
            SAI[SA[i]] = i;
        }

        for(i=0;i<T->length;i++){
            h=0;
            if(SAI[i] !=  T->length -1){
                k = SA[SAI[i]+1];
                j=0;
                while(str[i+h]==str[k+h]) h++;
                LCP[SAI[i]] = h;
                if(h) h--;
            }
            else{
                LCP[SAI[i]] = 0;
            }
        }
        if(Index->file==NULL){
            perror("Error");
            exit(EXIT_FAILURE);
        }
        fwrite(&(T->length),sizeof(word),1,Index->file);
        fwrite(SA,sizeof(word),T->length,Index->file);
        fwrite(LCP,sizeof(word),T->length,Index->file);
        fclose(Index->file);
        free(SA);
        free(SAI);
        free (LCP);
        free(str);

    }
    else if(strcmp(argv[1],"-search")==0){
        P->file = fopen(argv[4],"r");
        T->file = fopen(argv[3],"r");
        Index->file = fopen(argv[2],"rb");
        if(T->file==NULL){
            perror("Error: ");
            exit(EXIT_FAILURE);
        }
        if(P->file == NULL){
            perror("Error: ");
            exit(EXIT_FAILURE);
        }
        if(Index->file==NULL){
            perror("Error: ");
            exit(EXIT_FAILURE);
        }
        fseek(T->file,0,SEEK_END);
        T->length = ftell(T->file);
        rewind(T->file);
        T->textStart = 0;
        T->textEnd = T->length-1;
        fseek(P->file,0,SEEK_END);
        P->length = ftell(P->file);
        rewind(P->file);
        fread(&n,sizeof(word),1,Index->file);
        SA = callocx(sizeof(saidx_t),n);
        LCP = callocx(sizeof(saidx_t),n);
        fread(SA,sizeof(saidx_t),n,Index->file);
        fread(LCP,sizeof(saidx_t),n,Index->file);
        searchPattern(T,P,SA,LCP,n);
        free(SA);
        free(LCP);
    }
    else{
        fprintf(stderr,"%s%s%s","Usage = ",argv[0]," -create <inputText> <indexFile>\n");
        fprintf(stderr,"%s%s%s","Usage = ",argv[0]," -search <inputText> <indexFile> <patternFile>\n");
        exit(EXIT_FAILURE);
    }
    freeText(T);
    freeText(P);
    freeText(Index);
    return(0);
}
Exemple #11
0
/**
 * Builds the FM-index over the n bytes at T.
 *
 * Steps, in order: remap the alphabet with remap0(), fill the cumulative
 * symbol counts C[], build the suffix array with divsufsort(), sample suffix
 * array entries every `samplerate` text positions (into `suffixes`, the
 * `sampled` bit sequence and `positions`), derive the BWT, and build a
 * wavelet tree (`T_bwt`) over it. Finally the timing and space usage are
 * reported through info().
 *
 * @param T           input text; it is released with free() inside this function
 * @param n           number of bytes in T
 * @param samplerate  distance between sampled suffix array positions
 *
 * NOTE(review): the buffer returned by remap0() is not released here, and the
 * second sampling loop iterates up to this->n rather than the parameter n;
 * confirm both are intended.
 */
void
FM::build(uint8_t* T,uint32_t n,uint32_t samplerate) {
    uint8_t* X;
    uint8_t* X_bwt;
    int32_t* SA;
    uint32_t i,prev,tmp,start,stop;
    float elapsed;

    start = gettime();

    info("building index.");

    /* remap if 0 in text */
    info("- remapping alphabet.");
    X = remap0(T,n);
    free(T);

    /* create cumulative counts */
    info("- creating cumulative counts C[].");
    for (i=0;i<size_uchar+1;i++) C[i]=0;
    for (i=0;i<n;++i) C[X[i]]++;
    /* turn the per-symbol counts into exclusive prefix sums */
    prev=C[0];C[0]=0;
    for (i=1;i<size_uchar+1;i++) {
      tmp = C[i];
      C[i]=C[i-1]+prev;
      prev = tmp;
    }

    /* perform k-BWT */
    info("- performing bwt.");
    SA = (int32_t*) safe_malloc( n * sizeof(int32_t)  );
    if( divsufsort(X,SA,n) != 0 ) {
        fatal("error divsufsort");
    }

    /* sample SA for locate() */
    info("- sample SA locations.");
    suffixes = (uint32_t*) safe_malloc( ((n/samplerate)+1) * sizeof(uint32_t));
    BitString B(n);
    tmp = 0;
    for(i=0;i<n;i++) {
        if( SA[i] % samplerate == 0) {
            suffixes[tmp] = SA[i];
            B.setBit(i,true);
            tmp++;
        } else B.setBit(i,false);
    }
    /* enable rank on context vector */
    this->sampled = new BitSequenceRRR(B,RRR_SAMPLERATE);

    /* sample SA for display() */
    positions = (uint32_t*) safe_malloc( ((n/samplerate)+2) * sizeof(uint32_t));
    for (i=0;i<this->n;i++) {
        if (SA[i] % samplerate == 0) this->positions[SA[i]/samplerate] = i;
    }
    positions[(this->n-1)/samplerate+1] = positions[0];

    info("- creating bwt output.");
    X_bwt = (uint8_t*) safe_malloc( n * sizeof(uint8_t)  );
    for(i=0;i<n;i++) {
        if(SA[i]==0) {
            /* suffix 0 is preceded (cyclically) by the last symbol */
            X_bwt[i] = X[n-1];
            this->I = i;
        } else X_bwt[i] = X[SA[i]-1];
    }
    free(SA);

    info("- create RRR wavelet tree over bwt.");
    MapperNone * map = new MapperNone();
    BitSequenceBuilder * bsb = new BitSequenceBuilderRRR(RRR_SAMPLERATE);
    T_bwt = new WaveletTreeNoptrs((uint32_t*)X_bwt,n,sizeof(uint8_t)*8,bsb,map,true);

    stop = gettime();
    elapsed = (float)(stop-start)/1000000;

    /* build aux data */
    info("build FM-Index done. (%.3f sec)",elapsed);

    /* A literal percent sign in a printf-style format is "%%". The previous
     * "\%" is not a valid escape and produced the undefined conversion "%)".
     * `bytes` is unsigned, hence %u. */
    uint32_t bytes;
    info("space usage:");
    bytes = sigma * sizeof(uint8_t);
    info("- remap_reverse: %u bytes (%.2f%%)",bytes,(float)bytes/getSize()*100);
    bytes = sizeof(this->C);
    info("- C: %u bytes (%.2f%%)",bytes,(float)bytes/getSize()*100);
    bytes = ((n/samplerate)+1) * sizeof(uint32_t);
    info("- Suffixes: %u bytes (%.2f%%)",bytes,(float)bytes/getSize()*100);
    bytes = ((n/samplerate)+2) * sizeof(uint32_t);
    info("- Positions: %u bytes (%.2f%%)",bytes,(float)bytes/getSize()*100);
    bytes = sampled->getSize();
    info("- Sampled: %u bytes (%.2f%%)",bytes,(float)bytes/getSize()*100);
    bytes = T_bwt->getSize();
    info("- T_bwt: %u bytes (%.2f%%)",bytes,(float)bytes/getSize()*100);
    info("input Size n = %lu bytes\n",this->n);
    info("index Size = %lu bytes (%.2f n)",getSize(),getSizeN());
}
/* Function:  buildAndWriteFMIndex()
 * Synopsis:  Take text as input, along with several pre-allocated variables,
 *            and produce BWT and corresponding FM-index, then write it all
 *            to the output file.
 *
 *            if SAsamp == NULL, don't store/write T or SAsamp
 *
 * Details:   The suffix array of T is built with divsufsort() into SA. The
 *            BWT, the sampled suffix array (every meta->freq_SA-th entry)
 *            and the block / super-block occurrence counts are derived from
 *            it in one pass. For the fm_DNA and fm_DNA_full alphabets the
 *            BWT (and T, when it is stored) are packed 4 resp. 2 characters
 *            per byte before writing.
 *
 *            When SAsamp != NULL, T is reversed before the suffix array is
 *            built and reversed back before it is packed. The values in
 *            T[0..N-2] are decremented while the index is built and
 *            incremented again before returning; T[N-1] is set to 0.
 *
 * Returns:   eslOK on success. eslFAIL through the ERROR label, which is
 *            presumably the jump target of ESL_ALLOC on allocation failure
 *            (TODO confirm). Write errors and a negative divsufsort() status
 *            are reported with esl_fatal().
 *
 * Note:      SAsamp[0] is not written by this function: the sampling loop
 *            starts at j=1.
 */
int buildAndWriteFMIndex (FM_METADATA *meta, uint32_t seq_offset, uint32_t ambig_offset,
                        uint16_t seq_cnt, uint16_t ambig_cnt, uint32_t overlap,
                        uint8_t *T, uint8_t *BWT,
                        int *SA, uint32_t *SAsamp,
                        uint32_t *occCnts_sb, uint32_t *cnts_sb,
                        uint16_t *occCnts_b, uint16_t *cnts_b,
                        uint64_t N, FILE *fp
    ) {


  int status;
  uint64_t i,j,c,joffset;
  int chars_per_byte = 8/meta->charBits;
  uint32_t compressed_bytes =   ((chars_per_byte-1+N)/chars_per_byte);
  // Position of the terminal '$' in the BWT. The scan below only assigns it
  // for j >= 1, so start from 0 to cover the case SA[0]==0 instead of
  // writing an uninitialized value to the file.
  uint32_t term_loc = 0;

  int num_freq_cnts_b  = 1+ceil((double)N/(meta->freq_cnt_b));
  int num_freq_cnts_sb = 1+ceil((double)N/meta->freq_cnt_sb);
  int num_SA_samples   = 1+floor((double)N/meta->freq_SA);


  uint8_t *Tcompressed = NULL;
  if (SAsamp != NULL) {
    ESL_ALLOC (Tcompressed, compressed_bytes * sizeof(uint8_t));

    // Reverse the text T, so the BWT will be on reversed T.  Only used for the 1st pass
    fm_reverseString ((char*)T, N-1);
  }

  // Construct the Suffix Array on text T
  status = divsufsort(T, SA, N);
  if ( status < 0 )
    esl_fatal("buildAndWriteFMIndex: Error building BWT.\n");

  // Construct the BWT, SA landmarks, and FM-index
  for (c=0; c<meta->alph_size; c++) {
    cnts_sb[c] = 0;
    cnts_b[c] = 0;
    FM_OCC_CNT(sb, 0, c ) = 0;
    FM_OCC_CNT(b, 0, c ) = 0;
  }


  // Loop bounds are written as "j+1 < N" rather than "j < N-1": N is
  // unsigned, so N-1 (and N-3 further down) would wrap around for tiny N.
  for(j=0; j+1 < N; ++j) {
    T[j]--;  //move values down so 'a'=0...'t'=3; store 'a' in place of '$'
  }
  T[N-1]=0;

  BWT[0] =  SA[0]==0 ? 0 /* '$' */ : T[ SA[0]-1] ;

  cnts_sb[BWT[0]]++;
  cnts_b[BWT[0]]++;

  //Scan through SA to build the BWT and FM index structures
  for(j=1; j < N; ++j) {
    if (SA[j]==0) { //'$'
      term_loc = j;
      BWT[j] =  0; //store 'a' in place of '$'
    } else {
      BWT[j] =  T[ SA[j]-1] ;
    }


    //sample the SA
    if (SAsamp != NULL) {
      if ( !(j % meta->freq_SA) )
        SAsamp[ j/meta->freq_SA ] = ( SA[j] == N - 1 ? -1 : SA[j] ) ; // handle the wrap-around '$'
    }

    cnts_sb[BWT[j]]++;
    cnts_b[BWT[j]]++;

    joffset = j+1;
    if ( !(  joffset % meta->freq_cnt_b) ) {  // (j+1)%freq_cnt_b==0  , i.e. every freq_cnt_bth position, noting that it's a zero-based count

      for (c=0; c<meta->alph_size; c++)
        FM_OCC_CNT(b, (joffset/meta->freq_cnt_b), c ) = cnts_b[c];

      if ( !(joffset % meta->freq_cnt_sb) ) {  // (j+1)%freq_cnt_sb==0
        // block counts restart at every super-block boundary
        for (c=0; c<meta->alph_size; c++) {
          FM_OCC_CNT(sb, (joffset/meta->freq_cnt_sb), c ) = cnts_sb[c];
          cnts_b[c] = 0;
        }
      }
    }
  }

  //wrap up the counting;
  for (c=0; c<meta->alph_size; c++) {
    FM_OCC_CNT(b, num_freq_cnts_b-1, c ) = cnts_b[c];
    FM_OCC_CNT(sb, num_freq_cnts_sb-1, c ) = cnts_sb[c];
  }



  // Convert BWT to its packed version if appropriate.
  if (meta->alph_type == fm_DNA) {
     //4 chars per byte.  Counting will be done based on quadruples 0..3; 4..7; 8..11; etc.
      for(i=0; i+3 < N; i+=4)
        BWT[i/4]           = BWT[i]<<6 | BWT[i+1]<<4 | BWT[i+2]<<2 | BWT[i+3];
      // pack the remaining 1-3 characters, if any, into the last byte
      if (i < N)
        BWT[i/4]           =  BWT[i]<<6;
      if (i+1 < N)
        BWT[i/4]           |=  BWT[i+1]<<4;
      if (i+2 < N)
        BWT[i/4]           |=  BWT[i+2]<<2;

  } else if (meta->alph_type == fm_DNA_full ) {
    //2 chars per byte.
      for(i=0; i+1 < N; i+=2)
        BWT[i/2]           = BWT[i]<<4 | BWT[i+1];
      if (i+1 == N)
        BWT[i/2]           =  BWT[i]<<4 ;
  }



  //If this is the 1st (reversed text) BWT, de-reverse it, then compress it
  if (SAsamp != NULL) {
    fm_reverseString ((char*)T, N-1);
    // Convert T to its packed version if appropriate.
    if (meta->alph_type == fm_DNA ) {
       //4 chars per byte.  Counting will be done based on quadruples 0..3; 4..7; 8..11; etc.
      for(i=0; i+3 < N; i+=4)
        Tcompressed[i/4] =  T[i]<<6 |   T[i+1]<<4 |   T[i+2]<<2 | T[i+3];

      if (i < N)
        Tcompressed[i/4] =   T[i]<<6;
      if (i+1 < N)
        Tcompressed[i/4] |=   T[i+1]<<4;
      if (i+2 < N)
        Tcompressed[i/4] |=   T[i+2]<<2;

    } else if (meta->alph_type == fm_DNA_full) {
      //2 chars per byte.
      for(i=0; i+1 < N; i+=2)
        Tcompressed[i/2] =   T[i]<<4 |   T[i+1];
      if (i+1 == N)
        Tcompressed[i/2] =    T[i]<<4 ;
    } else {
      for(i=0; i+1 < N; i++)
        Tcompressed[i] =    T[i];
    }
  }


  for(j=0; j+1 < N; ++j) {
      T[j]++;  //move values back up, in case the reverse FM needs to be built
  }
  T[N-1] = 0;


  // Write the FM-index meta data
  if(fwrite(&N, sizeof(uint64_t), 1, fp) !=  1)
    esl_fatal( "buildAndWriteFMIndex: Error writing block_length in FM index.\n");
  if(fwrite(&term_loc, sizeof(uint32_t), 1, fp) !=  1)
    esl_fatal( "buildAndWriteFMIndex: Error writing terminal location in FM index.\n");
  if(fwrite(&seq_offset, sizeof(uint32_t), 1, fp) !=  1)
    esl_fatal( "buildAndWriteFMIndex: Error writing seq_offset in FM index.\n");
  if(fwrite(&ambig_offset, sizeof(uint32_t), 1, fp) !=  1)
    esl_fatal( "buildAndWriteFMIndex: Error writing ambig_offset in FM index.\n");
  if(fwrite(&overlap, sizeof(uint32_t), 1, fp) !=  1)
    esl_fatal( "buildAndWriteFMIndex: Error writing overlap in FM index.\n");
  if(fwrite(&seq_cnt, sizeof(uint16_t), 1, fp) !=  1)
    esl_fatal( "buildAndWriteFMIndex: Error writing seq_cnt in FM index.\n");
  if(fwrite(&ambig_cnt, sizeof(uint16_t), 1, fp) !=  1)
    esl_fatal( "buildAndWriteFMIndex: Error writing ambig_cnt in FM index.\n");

  // don't write Tcompressed or SAsamp if SAsamp == NULL
  if(Tcompressed != NULL && fwrite(Tcompressed, sizeof(uint8_t), compressed_bytes, fp) != compressed_bytes)
    esl_fatal( "buildAndWriteFMIndex: Error writing T in FM index.\n");
  if(fwrite(BWT, sizeof(uint8_t), compressed_bytes, fp) != compressed_bytes)
    esl_fatal( "buildAndWriteFMIndex: Error writing BWT in FM index.\n");
  if(SAsamp != NULL && fwrite(SAsamp, sizeof(uint32_t), (size_t)num_SA_samples, fp) != (size_t)num_SA_samples)
    esl_fatal( "buildAndWriteFMIndex: Error writing SA in FM index.\n");
  if(fwrite(occCnts_b, sizeof(uint16_t)*(meta->alph_size), (size_t)num_freq_cnts_b, fp) != (size_t)num_freq_cnts_b)
    esl_fatal( "buildAndWriteFMIndex: Error writing occCnts_b in FM index.\n");
  if(fwrite(occCnts_sb, sizeof(uint32_t)*(meta->alph_size), (size_t)num_freq_cnts_sb, fp) != (size_t)num_freq_cnts_sb)
    esl_fatal( "buildAndWriteFMIndex: Error writing occCnts_sb in FM index.\n");


  if (Tcompressed)         free(Tcompressed);

  return eslOK;

ERROR:
  /* Deallocate memory. */
  if (Tcompressed)         free(Tcompressed);
  return eslFAIL;

}
Exemple #13
0
int
main(int argc, const char *argv[]) {
  FILE *fp;
  sauchar_t *T;
  saidx_t *SA;
  saidx_t n;
  clock_t start, finish;
#if HAVE_SYS_STAT_H
  struct stat s;
#endif

  /* Check argument. */
  if((argc != 2) ||
     (strcmp(argv[1], "-h") == 0) ||
     (strcmp(argv[1], "--help") == 0)) {
    fprintf(stderr,
      "suftest, a suffixsort tester, version %s.\n"
      , divsufsort_version());
    fprintf(stderr,
      "usage: %s FILE\n\n"
      , argv[0]);
    exit(EXIT_FAILURE);
  }

  /* Get a file's status information. */
#if HAVE_SYS_STAT_H
  if(stat(argv[1], &s) != 0) {
    fprintf(stderr, "%s: Cannot stat file `%s': ", argv[0], argv[1]);
    perror(NULL);
    exit(EXIT_FAILURE);
  }
  n = s.st_size;
#endif

  /* Open a file for reading. */
  if((fp = fopen(argv[1], "rb")) == NULL) {
    fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], argv[1]);
    perror(NULL);
    exit(EXIT_FAILURE);
  }

#if !HAVE_SYS_STAT_H
  fseek(fp, 0, SEEK_END);
  n = ftell(fp);
  rewind(fp);
#endif

  /* Allocate n+4(n+1) bytes of memory. */
  if(((T = malloc(n * sizeof(sauchar_t))) == NULL) ||
     ((SA = malloc((n + 1) * sizeof(saidx_t))) == NULL)) {
    fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]);
    exit(EXIT_FAILURE);
  }

  /* Read n bytes of data. */
  if(fread(T, sizeof(sauchar_t), n, fp) != n) {
    fprintf(stderr, "%s: %s `%s': ",
      argv[0],
      (ferror(fp) || !feof(fp)) ? "Cannot read from" : "Unexpected EOF in",
      argv[1]);
    perror(NULL);
    exit(EXIT_FAILURE);
  }
  fclose(fp);

  /* Construct the suffix array. */
  fprintf(stderr, "%s: %d bytes ... ", argv[1], (int)n);
  start = clock();
  divsufsort(T, SA, n);
  finish = clock();
  fprintf(stderr, "%.4f sec\n",
    (double)(finish - start) / (double)CLOCKS_PER_SEC);

  /* Check the suffix array. */
  if(sufcheck(T, SA, n, 3) != 0) {
    exit(EXIT_FAILURE);
  }

  /* Deallocate memory. */
  free(SA);
  free(T);

  return 0;
}