static void Create_LCP_RANK(const std::string &s, size_t lcp[], size_t rank[]){ const unsigned char * text = reinterpret_cast<const unsigned char *>(s.c_str()); int i, h, x; int N = s.size(); int * sa = new int[N]; // create suffix array, rank array, lcp array, ////////// divsufsort(text, sa, N); for(i = 0; i < N; i++) rank[sa[i]] = i; lcp[0] = 0; for(i = h = 0; i < N; i++){ x = rank[i]; if(x > 0){ const unsigned char * p0, * p1, * ep = text + N; int j = sa[x-1]; p1 = text + i + h; p0 = text + j + h; while((p0 != ep) && (p1 != ep) && (*p1 == *p0)){ p1++; p0++; h++; } lcp[x] = h; if(h > 0) h--; } } delete [] sa; // we don't need suffix array anymore }
void suffix::SuffixArray::build() { m_sufarr = new saidx_t[m_strLen]; m_invsufarr = new saidx_t[m_strLen]; m_lcparr = new saidx_t[m_strLen]; assert(0 == divsufsort(m_str, m_sufarr, (saidx_t)m_strLen)); // compute inverse suffix array for (saidx_t i = 0; i < m_strLen; ++i) { m_invsufarr[m_sufarr[i]] = i; } // compute lcp array saidx_t k,j; for(saidx_t i = 0, l = 0; i < m_strLen; ++i) { k = m_invsufarr[i]; j = m_sufarr[k-1]; // i+l, j+l are not necessary unless it's terminated by ! or zero character while (i+l < m_strLen && j+l < m_strLen && m_str[i+l]==m_str[j+l]) ++l; m_lcparr[k] = l; if (l > 0) --l; } }
/* get the SA of text * if filename is given search if the SA is already computed */ unsigned int* getSA(unsigned char const* text, int length, const char* filename){ char filename_temp[] = "stdin"; char filename_sa[1024]; unsigned int* sa; if(filename == NULL) filename = filename_temp; /*filename = basename(filename);*/ /*not sure about this*/ snprintf(filename_sa,1024,"%s.sa",filename); /* checks if we already computed the SA*/ if(access(filename_sa, R_OK)!=0 || strcmp(filename,"stdin") == 0 || strcmp(filename, "stdin.rev") == 0){ sa = new unsigned int[length]; unsigned int err = divsufsort(text, (saidx_t*)sa, (saidx_t)length); if(err){ fprintf(stderr, "ERROR: SA: constructing the SA\n"); } err = saveArray((unsigned char*)sa, length * sizeof(unsigned int), filename_sa); if(err != 0){ fprintf(stderr,"ERROR: SA: saving the SA\n"); } }else{ unsigned int salen; sa = (unsigned int*)readArray(filename_sa, &salen, sizeof(unsigned int)); } return sa; }
// Computes the suffix array of the text c[0..len-1] and stores it in sa.
//
// Texts with len <= 1 get an int_vector of len zeros. Otherwise the 32-bit
// divsufsort() is used when the length fits (see small_file below) and
// divsufsort64() is used for everything else.
//
// Throws std::logic_error in the 32-bit path when sa cannot be sorted into
// directly and its width is smaller than bits::hi(len)+1.
void calculate_sa(const unsigned char* c, typename int_vector<fixedIntWidth>::size_type len, int_vector<fixedIntWidth>& sa)
{
    typedef typename int_vector<fixedIntWidth>::size_type size_type;
    if (len <= 1) { // handle special case
        sa = int_vector<fixedIntWidth>(len,0);
        return;
    }
    // The 32-bit sorter is usable when size_type itself is at most 4 bytes
    // wide or when len is below 0x7FFFFFFF.
    bool small_file = (sizeof(len) <= 4 or len < 0x7FFFFFFFULL);
    if (small_file) {
        uint8_t oldIntWidth = sa.width();
        if (32 == fixedIntWidth or (0==fixedIntWidth and 32 >= oldIntWidth)) {
            // Sort directly into sa's own storage, temporarily viewed as an
            // array of 32-bit integers.
            sa.width(32);
            sa.resize(len);
            divsufsort(c, (int32_t*)sa.m_data, len);
            // copy integers back to the right positions
            // NOTE(review): this branch does not check that oldIntWidth is
            // large enough to hold values up to len-1, unlike the else branch
            // below - confirm callers guarantee it.
            if (oldIntWidth!=32) {
                // Entry i is read as 32 bits at offset i<<5 and rewritten as
                // oldIntWidth bits at offset i*oldIntWidth. The loop runs in
                // ascending order; presumably this is safe because
                // oldIntWidth <= 32 here, so a write never lands beyond the
                // entry that is being read - TODO confirm.
                for (size_type i=0; i<len; ++i) {
                    sa.set_int(i*oldIntWidth, sa.get_int(i<<5, 32), oldIntWidth);
                }
                sa.width(oldIntWidth);
                sa.resize(len);
            }
        } else {
            // sa cannot be used as a 32-bit buffer; sort into a temporary
            // 32-bit vector and copy element by element.
            if (sa.width() < bits::hi(len)+1) {
                throw std::logic_error("width of int_vector is to small for the text!!!");
            }
            int_vector<> sufarray(len,0,32);
            divsufsort(c, (int32_t*)sufarray.m_data, len);
            for (size_type i=0; i<len; ++i) {
                sa[i] = sufarray[i];
            }
        }
    } else {
        // Large text: same in-place scheme as above, with 64-bit entries.
        uint8_t oldIntWidth = sa.width();
        sa.width(64);
        sa.resize(len);
        divsufsort64(c, (int64_t*)sa.m_data, len);
        // copy integers back to the right positions
        if (oldIntWidth!=64) {
            // Entry i is read as 64 bits at offset i<<6 and rewritten with
            // the original width.
            for (size_type i=0; i<len; ++i) {
                sa.set_int(i*oldIntWidth, sa.get_int(i<<6, 64), oldIntWidth);
            }
            sa.width(oldIntWidth);
            sa.resize(len);
        }
    }
}
/*
 * build_SA() - construct the suffix array of a byte buffer.
 *
 * SA[]  receives the suffix array of T[].
 * T[]   is the input byte array, n bytes long.
 * tmp   is caller-provided scratch space handed through to divsufsort().
 *
 * A suffix array lists the starting positions of all suffixes of T in sorted
 * order; put differently, it maps each suffix rank to the position in T where
 * that suffix begins.
 *
 * Construction is delegated to libdivsufsort, which is based on induced
 * sorting and, in practice, appears to be the fastest suffix array
 * construction algorithm currently available.
 *
 * The copy of divsufsort() used here differs from the upstream one: it needs
 * only a fixed amount of temporary space and takes that space from `tmp`
 * rather than allocating it, so that no malloc() failure has to be handled at
 * this point.
 *
 * References:
 *
 *	Y. Mori.  libdivsufsort, a lightweight suffix-sorting library.
 *	https://code.google.com/p/libdivsufsort/.
 *
 *	G. Nong, S. Zhang, and W.H. Chan.  2009.  Linear Suffix Array
 *	Construction by Almost Pure Induced-Sorting.  Data Compression
 *	Conference, 2009.  DCC '09.  pp. 193 - 202.
 *
 *	S.J. Puglisi, W.F. Smyth, and A. Turpin.  2007.  A Taxonomy of Suffix
 *	Array Construction Algorithms.  ACM Computing Surveys (CSUR) Volume 39
 *	Issue 2, 2007 Article No. 4.
 */
static void
build_SA(u32 SA[], const u8 T[], u32 n, u32 *tmp)
{
	divsufsort(T, SA, n, tmp);
}
/*
 * Build the suffix array of `genome` (gsize bytes) after mapping every
 * character through the `uppercase` table.
 *
 * Returns a malloc()ed array of gsize int64_t entries that the caller must
 * free(), or NULL if memory could not be allocated or the sort failed.
 * `genome` itself is not modified; the mapped copy is released before return.
 */
int64_t *
compute_sa
(
   char     * genome,
   uint64_t   gsize
)
{
   char * data = malloc(gsize);
   if (data == NULL) return NULL;

   /* Index through unsigned char: a plain char may be negative, which would
    * read before the start of the table.
    * NOTE(review): assumes `uppercase` has an entry for every byte value. */
   for (uint64_t i = 0; i < gsize; i++)
      data[i] = uppercase[(unsigned char) genome[i]];

   int64_t * sa = malloc(gsize*sizeof(int64_t));
   if (sa == NULL) {
      free(data);   /* was leaked on this path */
      return NULL;
   }

   /* A non-zero status means the array contents are not a valid suffix array. */
   if (divsufsort((unsigned char *) data, sa, gsize) != 0) {
      free(sa);
      sa = NULL;
   }

   free(data);
   return sa;
}
int main(int argc, const char *argv[]) { FILE *fp; const char *fname; sauchar_t *T; saidx_t *SA; LFS_OFF_T n; clock_t start, finish; saint_t needclose = 1; /* Check arguments. */ if((argc == 1) || (strcmp(argv[1], "-h") == 0) || (strcmp(argv[1], "--help") == 0)) { print_help(argv[0], EXIT_SUCCESS); } if(argc != 2) { print_help(argv[0], EXIT_FAILURE); } /* Open a file for reading. */ if(strcmp(argv[1], "-") != 0) { #if HAVE_FOPEN_S if(fopen_s(&fp, fname = argv[1], "rb") != 0) { #else if((fp = LFS_FOPEN(fname = argv[1], "rb")) == NULL) { #endif fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], fname); perror(NULL); exit(EXIT_FAILURE); } } else { #if HAVE__SETMODE && HAVE__FILENO if(_setmode(_fileno(stdin), _O_BINARY) == -1) { fprintf(stderr, "%s: Cannot set mode: ", argv[0]); perror(NULL); exit(EXIT_FAILURE); } #endif fp = stdin; fname = "stdin"; needclose = 0; } /* Get the file size. */ if(LFS_FSEEK(fp, 0, SEEK_END) == 0) { n = LFS_FTELL(fp); rewind(fp); if(n < 0) { fprintf(stderr, "%s: Cannot ftell `%s': ", argv[0], fname); perror(NULL); exit(EXIT_FAILURE); } if(0x7fffffff <= n) { fprintf(stderr, "%s: Input file `%s' is too big.\n", argv[0], fname); exit(EXIT_FAILURE); } } else { fprintf(stderr, "%s: Cannot fseek `%s': ", argv[0], fname); perror(NULL); exit(EXIT_FAILURE); } /* Allocate 5n bytes of memory. */ T = (sauchar_t *)malloc((size_t)n * sizeof(sauchar_t)); SA = (saidx_t *)malloc((size_t)n * sizeof(saidx_t)); if((T == NULL) || (SA == NULL)) { fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]); exit(EXIT_FAILURE); } /* Read n bytes of data. */ if(fread(T, sizeof(sauchar_t), (size_t)n, fp) != (size_t)n) { fprintf(stderr, "%s: %s `%s': ", argv[0], (ferror(fp) || !feof(fp)) ? "Cannot read from" : "Unexpected EOF in", argv[1]); perror(NULL); exit(EXIT_FAILURE); } if(needclose & 1) { fclose(fp); } /* Construct the suffix array. */ //~ fprintf(stderr, "%s: %" PRIdOFF_T " bytes ... 
", fname, n); //~ start = clock(); if(divsufsort(T, SA, (saidx_t)n) != 0) { fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]); exit(EXIT_FAILURE); } //~ finish = clock(); //~ fprintf(stderr, "%.4f sec\n", (double)(finish - start) / (double)CLOCKS_PER_SEC); /* Check the suffix array. */ //~ if(sufcheck(T, SA, (saidx_t)n, 1) != 0) { exit(EXIT_FAILURE); } /* Deallocate memory. */ free(SA); free(T); return 0; }
unsigned int DegeneratePatternMatch::prepareForRMQ(){ /* Prepare SP$ */ const INT len = (sLen + pLen) + 1; unsigned char* text = new unsigned char[len]; std::copy(sequence,sequence+sLen ,text ); std::copy(pattern,pattern+pLen ,text+sLen ); text[len-1] = DELIM; /* Compute Suffix Array */ INT* sa = new INT[len]; if(sa == NULL){ fprintf (stderr,"Cannot allocate memory for SA."); return (0); } #ifdef _USE_64 if(divsufsort64(text, sa, len) != 0){ fprintf (stderr, "SA computation failed"); return (0); } #endif #ifdef _USE_32 if(divsufsort(text, sa, len) != 0){ fprintf (stderr,"SA computation failed"); return (0); } #endif /* Compute Rank array */ rank = new INT[len]; if(rank == NULL){ fprintf (stderr,"Cannot allocate memory for Rank Array"); return (0); } for(INT i = 0; i < len; i++ ){ rank [sa[i]] = i; } /* Compute LCP array */ lcp = new INT[len]; if(lcp == NULL){ fprintf (stderr,"Cannot allocate memory for LCP Array"); return (0); } if(constructLCPArray(text, len, sa) != 1){ fprintf (stderr,"LCP computation failed"); return (0); } // Delete SA and text as they are not needed now delete[] sa; delete[] text; /* Prepare LCP array for RMQ */ // create a vector of length len and initialize it with 0s sdsl::int_vector<> v(len , 0 ); for(INT i = 0; i < len; i++){ v[i] = lcp[i]; } rmq = new sdsl::rmq_succinct_sct<>(&v); // v is not required now sdsl::util::clear(v); //cout << "prepared for rmq" << endl; }
bool Encode::next_block() { if (block.count > 0) { read.processed += block.nreads; sem_destroy(&shared.mutex); sem_destroy(&shared.nempty); sem_destroy(&shared.nstored); } sem_init(&shared.mutex, 0, 1); sem_init(&shared.nempty, 0, MAX_THREADS); sem_init(&shared.nstored, 0, 0); shared.nread = 0; shared.nwrite = 0; shared.finished = false; block.nreads = config.block_size / read.length; char *ptr = (char*)block.text; size_t nbytes = 0; size_t i = 0; while (fgets(ptr, BUFSIZ, block.fp)) { size_t length = strlen(ptr); assert(length == read.length); ptr[length - 1] = '\0'; // Chomp. ptr += length; nbytes += length; i++; if (block.count == 0 && i == block.raw_nreads) break; if (i == block.nreads) break; } if (i == 0) return false; block.nbytes = nbytes; block.nreads = i; for (i = 0; i < block.nreads; i++) { encoding[i].reference = SIZE_MAX; encoding[i].ref_position = 0; encoding[i].dst_position = 0; encoding[i].length = 0; encoding[i].isrcomplement = false; encoding[i].nbits = ceil_log2(read.processed + 1) + (2 * (read.length - 1)); } block.count++; if (block.count == 1) return true; // We not need to build index for the initial raw block. divsufsort(block.text, block.sa, (int32_t)block.nbytes); if (block.index) delete block.index; block.index = new MatchingStatistics(block.text, block.sa, block.nbytes); return true; }
int main(int argc,char** argv){ sauchar_t* str; saidx_t* SA; saidx_t* SAI; word n; saidx_t* LCP; int h,j,k; int i; text* T = callocx(1,sizeof(text)); text* P = callocx(1,sizeof(text)); text* Index = callocx(1,sizeof(text)); if(argc<4 || argc >5){ fprintf(stderr,"%s","Error\n"); fprintf(stderr,"%s%s%s","Usage = ",argv[0]," -create <inputText> <indexFile>\n"); fprintf(stderr,"%s%s%s","Usage = ",argv[0]," -search <indexFile> <inputText> <patternFile>\n"); exit(EXIT_FAILURE); } if(strcmp(argv[1],"-create")==0){ T->file = fopen(argv[2],"r"); Index->file = fopen(argv[3],"wb"); if(T->file==NULL){ perror("Error: "); exit(EXIT_FAILURE); } if(Index->file==NULL){ perror("Error: "); exit(EXIT_FAILURE); } fseek(T->file,0,SEEK_END); T->length = ftell(T->file); rewind(T->file); str = callocx(sizeof(sauchar_t),T->length); SA = callocx(sizeof(saidx_t),T->length); SAI = callocx(sizeof(saidx_t),T->length); LCP = callocx(sizeof(saidx_t),T->length); fread(str,sizeof(sauchar_t),T->length,T->file); if(divsufsort(str,SA,T->length)){ perror("suffix sort error"); exit(EXIT_FAILURE); } for(i=0;i<T->length;i++){ SAI[SA[i]] = i; } for(i=0;i<T->length;i++){ h=0; if(SAI[i] != T->length -1){ k = SA[SAI[i]+1]; j=0; while(str[i+h]==str[k+h]) h++; LCP[SAI[i]] = h; if(h) h--; } else{ LCP[SAI[i]] = 0; } } if(Index->file==NULL){ perror("Error"); exit(EXIT_FAILURE); } fwrite(&(T->length),sizeof(word),1,Index->file); fwrite(SA,sizeof(word),T->length,Index->file); fwrite(LCP,sizeof(word),T->length,Index->file); fclose(Index->file); free(SA); free(SAI); free (LCP); free(str); } else if(strcmp(argv[1],"-search")==0){ P->file = fopen(argv[4],"r"); T->file = fopen(argv[3],"r"); Index->file = fopen(argv[2],"rb"); if(T->file==NULL){ perror("Error: "); exit(EXIT_FAILURE); } if(P->file == NULL){ perror("Error: "); exit(EXIT_FAILURE); } if(Index->file==NULL){ perror("Error: "); exit(EXIT_FAILURE); } fseek(T->file,0,SEEK_END); T->length = ftell(T->file); rewind(T->file); T->textStart = 0; T->textEnd = T->length-1; 
fseek(P->file,0,SEEK_END); P->length = ftell(P->file); rewind(P->file); fread(&n,sizeof(word),1,Index->file); SA = callocx(sizeof(saidx_t),n); LCP = callocx(sizeof(saidx_t),n); fread(SA,sizeof(saidx_t),n,Index->file); fread(LCP,sizeof(saidx_t),n,Index->file); searchPattern(T,P,SA,LCP,n); free(SA); free(LCP); } else{ fprintf(stderr,"%s%s%s","Usage = ",argv[0]," -create <inputText> <indexFile>\n"); fprintf(stderr,"%s%s%s","Usage = ",argv[0]," -search <inputText> <indexFile> <patternFile>\n"); exit(EXIT_FAILURE); } freeText(T); freeText(P); freeText(Index); return(0); }
void FM::build(uint8_t* T,uint32_t n,uint32_t samplerate) { uint8_t* X; uint8_t* X_bwt; int32_t* SA; uint32_t i,prev,tmp,start,stop; float elapsed; start = gettime(); info("building index."); /* remap if 0 in text */ info("- remapping alphabet."); X = remap0(T,n); free(T); /* create cumulative counts */ info("- creating cumulative counts C[]."); for (i=0;i<size_uchar+1;i++) C[i]=0; for (i=0;i<n;++i) C[X[i]]++; prev=C[0];C[0]=0; for (i=1;i<size_uchar+1;i++) { tmp = C[i]; C[i]=C[i-1]+prev; prev = tmp; } /* perform k-BWT */ info("- performing bwt."); SA = (int32_t*) safe_malloc( n * sizeof(int32_t) ); if( divsufsort(X,SA,n) != 0 ) { fatal("error divsufsort"); } /* sample SA for locate() */ info("- sample SA locations."); suffixes = (uint32_t*) safe_malloc( ((n/samplerate)+1) * sizeof(uint32_t)); BitString B(n); tmp = 0; for(i=0;i<n;i++) { if( SA[i] % samplerate == 0) { suffixes[tmp] = SA[i]; B.setBit(i,true); tmp++; } else B.setBit(i,false); } /* enable rank on context vector */ this->sampled = new BitSequenceRRR(B,RRR_SAMPLERATE); /* sample SA for display() */ positions = (uint32_t*) safe_malloc( ((n/samplerate)+2) * sizeof(uint32_t)); for (i=0;i<this->n;i++) { if (SA[i] % samplerate == 0) this->positions[SA[i]/samplerate] = i; } positions[(this->n-1)/samplerate+1] = positions[0]; info("- creating bwt output."); X_bwt = (uint8_t*) safe_malloc( n * sizeof(uint8_t) ); for(i=0;i<n;i++) { if(SA[i]==0) { X_bwt[i] = X[n-1]; this->I = i; } else X_bwt[i] = X[SA[i]-1]; } free(SA); info("- create RRR wavelet tree over bwt."); MapperNone * map = new MapperNone(); BitSequenceBuilder * bsb = new BitSequenceBuilderRRR(RRR_SAMPLERATE); T_bwt = new WaveletTreeNoptrs((uint32_t*)X_bwt,n,sizeof(uint8_t)*8,bsb,map,true); stop = gettime(); elapsed = (float)(stop-start)/1000000; /* build aux data */ info("build FM-Index done. 
(%.3f sec)",elapsed); uint32_t bytes; info("space usage:"); bytes = sigma * sizeof(uint8_t); info("- remap_reverse: %d bytes (%.2f\%)",bytes,(float)bytes/getSize()*100); bytes = sizeof(this->C); info("- C: %d bytes (%.2f\%)",bytes,(float)bytes/getSize()*100); bytes = ((n/samplerate)+1) * sizeof(uint32_t); info("- Suffixes: %d bytes (%.2f\%)",bytes,(float)bytes/getSize()*100); bytes = ((n/samplerate)+2) * sizeof(uint32_t); info("- Positions: %d bytes (%.2f\%)",bytes,(float)bytes/getSize()*100); bytes = sampled->getSize(); info("- Sampled: %d bytes (%.2f\%)",bytes,(float)bytes/getSize()*100); bytes = T_bwt->getSize(); info("- T_bwt: %d bytes (%.2f\%)",bytes,(float)bytes/getSize()*100); info("input Size n = %lu bytes\n",this->n); info("index Size = %lu bytes (%.2f n)",getSize(),getSizeN()); }
/* Function:  buildAndWriteFMIndex()
 * Synopsis:  Take text as input, along with several pre-allocated variables,
 *            and produce BWT and corresponding FM-index, then write it all
 *            to the output file.
 *
 *            T holds N characters, the last of which is the terminator slot;
 *            the function requires N >= 1. T is modified while the index is
 *            built and restored (values shifted back up, T[N-1] = 0) before
 *            the data is written. BWT, SA, cnts_sb, cnts_b and the occCnts
 *            arrays are caller-allocated work/output buffers.
 *
 *            if SAsamp == NULL, don't store/write T or SAsamp
 *
 * Returns:   eslOK on success, eslFAIL if the temporary buffer for the
 *            compressed text cannot be allocated. Write errors and a failed
 *            suffix sort are fatal (esl_fatal).
 */
int buildAndWriteFMIndex (FM_METADATA *meta, uint32_t seq_offset, uint32_t ambig_offset,
                    uint16_t seq_cnt, uint16_t ambig_cnt, uint32_t overlap,
                    uint8_t *T, uint8_t *BWT,
                    int *SA, uint32_t *SAsamp,
                    uint32_t *occCnts_sb, uint32_t *cnts_sb,
                    uint16_t *occCnts_b, uint16_t *cnts_b,
                    uint64_t N, FILE *fp
    ) {

  int status;
  uint64_t i,j,c,joffset;
  int chars_per_byte = 8/meta->charBits;
  uint32_t compressed_bytes =   ((chars_per_byte-1+N)/chars_per_byte);
  /* Position of the terminator in the BWT. Initialised because the scan
   * below starts at j=1 and would otherwise leave it unset when SA[0]==0,
   * in which case an indeterminate value was written to the file. */
  uint32_t term_loc = 0;

  int num_freq_cnts_b  = 1+ceil((double)N/(meta->freq_cnt_b));
  int num_freq_cnts_sb = 1+ceil((double)N/meta->freq_cnt_sb);
  int num_SA_samples   = 1+floor((double)N/meta->freq_SA);

  uint8_t *Tcompressed = NULL;

  if (SAsamp != NULL) {
    ESL_ALLOC (Tcompressed, compressed_bytes * sizeof(uint8_t));

    // Reverse the text T, so the BWT will be on reversed T.  Only used for the 1st pass
    fm_reverseString ((char*)T, N-1);
  }

  // Construct the Suffix Array on text T
  status = divsufsort(T, SA, N);
  if ( status < 0 )
    esl_fatal("buildAndWriteFMIndex: Error building BWT.\n");

  // Construct the BWT, SA landmarks, and FM-index
  for (c=0; c<meta->alph_size; c++) {
    cnts_sb[c] = 0;
    cnts_b[c] = 0;
    FM_OCC_CNT(sb, 0, c ) = 0;
    FM_OCC_CNT(b, 0, c ) = 0;
  }

  for(j=0; j < N-1; ++j) {
    T[j]--;  //move values down so 'a'=0...'t'=3; store 'a' in place of '$'
  }
  T[N-1]=0;

  BWT[0] =  SA[0]==0 ? 0 /* '$' */ : T[ SA[0]-1] ;

  cnts_sb[BWT[0]]++;
  cnts_b[BWT[0]]++;

  //Scan through SA to build the BWT and FM index structures
  for(j=1; j < N; ++j) {
    if (SA[j]==0) { //'$'
      term_loc = j;
      BWT[j] =  0; //store 'a' in place of '$'
    } else {
      BWT[j] =  T[ SA[j]-1] ;
    }

    //sample the SA
    // NOTE(review): the scan starts at j=1, so SAsamp[0] is never assigned
    // here - confirm the caller initialises it or that readers ignore it.
    if (SAsamp != NULL) {
      if ( !(j % meta->freq_SA) )
        SAsamp[ j/meta->freq_SA ] = ( SA[j] == N - 1 ? -1 : SA[j] ) ;   // handle the wrap-around '$'
    }

    cnts_sb[BWT[j]]++;
    cnts_b[BWT[j]]++;

    joffset = j+1;
    if ( !(  joffset % meta->freq_cnt_b) ) {  // (j+1)%freq_cnt_b==0  , i.e. every freq_cnt_bth position, noting that it's a zero-based count

      for (c=0; c<meta->alph_size; c++)
        FM_OCC_CNT(b, (joffset/meta->freq_cnt_b), c ) = cnts_b[c];

      if ( !(joffset % meta->freq_cnt_sb) ) {  // (j+1)%freq_cnt_sb==0
        for (c=0; c<meta->alph_size; c++) {
          FM_OCC_CNT(sb, (joffset/meta->freq_cnt_sb), c ) = cnts_sb[c];
          cnts_b[c] = 0;
        }
      }
    }
  }

  //wrap up the counting;
  for (c=0; c<meta->alph_size; c++) {
    FM_OCC_CNT(b, num_freq_cnts_b-1, c ) = cnts_b[c];
    FM_OCC_CNT(sb, num_freq_cnts_sb-1, c ) = cnts_sb[c];
  }

  // Convert BWT and T to packed versions if appropriate.
  // The loop bounds are written as i+k < N rather than i < N-k: N is unsigned,
  // so N-3 (or N-1) wraps around to a huge value when N is small.
  if (meta->alph_type == fm_DNA) {
     //4 chars per byte.  Counting will be done based on quadruples 0..3; 4..7; 8..11; etc.
      for(i=0; i+3 < N; i+=4)
        BWT[i/4] = BWT[i]<<6 | BWT[i+1]<<4 | BWT[i+2]<<2 | BWT[i+3];
      // up to three leftover characters go into the high bits of the last byte
      if (i   < N)  BWT[i/4] =  BWT[i]<<6;
      if (i+1 < N)  BWT[i/4] |= BWT[i+1]<<4;
      if (i+2 < N)  BWT[i/4] |= BWT[i+2]<<2;

  } else if (meta->alph_type == fm_DNA_full ) {
    //2 chars per byte.  Packing is done in pairs 0..1; 2..3; 4..5; etc.
      for(i=0; i+1 < N; i+=2)
        BWT[i/2] = BWT[i]<<4 | BWT[i+1];
      if (i==N-1)
        BWT[i/2] =  BWT[i]<<4 ;
  }

  //If this is the 1st (reversed text) BWT, de-reverse it, then compress it
  if (SAsamp != NULL) {
    fm_reverseString ((char*)T, N-1);

    // Convert BWT and T to packed versions if appropriate.
    if (meta->alph_type == fm_DNA ) {
       //4 chars per byte.  Counting will be done based on quadruples 0..3; 4..7; 8..11; etc.
        for(i=0; i+3 < N; i+=4)
          Tcompressed[i/4] = T[i]<<6 | T[i+1]<<4 | T[i+2]<<2 | T[i+3];
        if (i   < N)  Tcompressed[i/4] =  T[i]<<6;
        if (i+1 < N)  Tcompressed[i/4] |= T[i+1]<<4;
        if (i+2 < N)  Tcompressed[i/4] |= T[i+2]<<2;

    } else if (meta->alph_type == fm_DNA_full) {
      //2 chars per byte.  Packing is done in pairs 0..1; 2..3; 4..5; etc.
        for(i=0; i+1 < N; i+=2)
          Tcompressed[i/2] = T[i]<<4 | T[i+1];
        if (i==N-1)
          Tcompressed[i/2] =  T[i]<<4 ;
    } else {
      // NOTE(review): only N-1 bytes are copied although compressed_bytes
      // bytes are written below - confirm the last byte is never read back.
      for(i=0; i < N-1; i++)
        Tcompressed[i] =    T[i];
    }

  }

  for(j=0; j < N-1; ++j) {
      T[j]++;  //move values back up, in case the reverse FM needs to be built
  }
  T[N-1] = 0;

  // Write the FM-index meta data
  if(fwrite(&N, sizeof(uint64_t), 1, fp) !=  1)
    esl_fatal( "buildAndWriteFMIndex: Error writing block_length in FM index.\n");
  if(fwrite(&term_loc, sizeof(uint32_t), 1, fp) !=  1)
    esl_fatal( "buildAndWriteFMIndex: Error writing terminal location in FM index.\n");
  if(fwrite(&seq_offset, sizeof(uint32_t), 1, fp) !=  1)
    esl_fatal( "buildAndWriteFMIndex: Error writing seq_offset in FM index.\n");
  if(fwrite(&ambig_offset, sizeof(uint32_t), 1, fp) !=  1)
    esl_fatal( "buildAndWriteFMIndex: Error writing ambig_offset in FM index.\n");
  if(fwrite(&overlap, sizeof(uint32_t), 1, fp) !=  1)
    esl_fatal( "buildAndWriteFMIndex: Error writing overlap in FM index.\n");
  if(fwrite(&seq_cnt, sizeof(uint16_t), 1, fp) !=  1)
    esl_fatal( "buildAndWriteFMIndex: Error writing seq_cnt in FM index.\n");
  if(fwrite(&ambig_cnt, sizeof(uint16_t), 1, fp) !=  1)
    esl_fatal( "buildAndWriteFMIndex: Error writing ambig_cnt in FM index.\n");

  // don't write Tcompressed or SAsamp if SAsamp == NULL
  if(Tcompressed != NULL && fwrite(Tcompressed, sizeof(uint8_t), compressed_bytes, fp) != compressed_bytes)
    esl_fatal( "buildAndWriteFMIndex: Error writing T in FM index.\n");
  if(fwrite(BWT, sizeof(uint8_t), compressed_bytes, fp) != compressed_bytes)
    esl_fatal( "buildAndWriteFMIndex: Error writing BWT in FM index.\n");
  if(SAsamp != NULL && fwrite(SAsamp, sizeof(uint32_t), (size_t)num_SA_samples, fp) != (size_t)num_SA_samples)
    esl_fatal( "buildAndWriteFMIndex: Error writing SA in FM index.\n");
  if(fwrite(occCnts_b, sizeof(uint16_t)*(meta->alph_size), (size_t)num_freq_cnts_b, fp) != (size_t)num_freq_cnts_b)
    esl_fatal( "buildAndWriteFMIndex: Error writing occCnts_b in FM index.\n");
  if(fwrite(occCnts_sb, sizeof(uint32_t)*(meta->alph_size), (size_t)num_freq_cnts_sb, fp) != (size_t)num_freq_cnts_sb)
    esl_fatal( "buildAndWriteFMIndex: Error writing occCnts_sb in FM index.\n");

  if (Tcompressed) free(Tcompressed);
  return eslOK;

ERROR:
  /* Deallocate memory. */
  if (Tcompressed) free(Tcompressed);
  return eslFAIL;
}
int main(int argc, const char *argv[]) { FILE *fp; sauchar_t *T; saidx_t *SA; saidx_t n; clock_t start, finish; #if HAVE_SYS_STAT_H struct stat s; #endif /* Check argument. */ if((argc != 2) || (strcmp(argv[1], "-h") == 0) || (strcmp(argv[1], "--help") == 0)) { fprintf(stderr, "suftest, a suffixsort tester, version %s.\n" , divsufsort_version()); fprintf(stderr, "usage: %s FILE\n\n" , argv[0]); exit(EXIT_FAILURE); } /* Get a file's status information. */ #if HAVE_SYS_STAT_H if(stat(argv[1], &s) != 0) { fprintf(stderr, "%s: Cannot stat file `%s': ", argv[0], argv[1]); perror(NULL); exit(EXIT_FAILURE); } n = s.st_size; #endif /* Open a file for reading. */ if((fp = fopen(argv[1], "rb")) == NULL) { fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], argv[1]); perror(NULL); exit(EXIT_FAILURE); } #if !HAVE_SYS_STAT_H fseek(fp, 0, SEEK_END); n = ftell(fp); rewind(fp); #endif /* Allocate n+4(n+1) bytes of memory. */ if(((T = malloc(n * sizeof(sauchar_t))) == NULL) || ((SA = malloc((n + 1) * sizeof(saidx_t))) == NULL)) { fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]); exit(EXIT_FAILURE); } /* Read n bytes of data. */ if(fread(T, sizeof(sauchar_t), n, fp) != n) { fprintf(stderr, "%s: %s `%s': ", argv[0], (ferror(fp) || !feof(fp)) ? "Cannot read from" : "Unexpected EOF in", argv[1]); perror(NULL); exit(EXIT_FAILURE); } fclose(fp); /* Construct the suffix array. */ fprintf(stderr, "%s: %d bytes ... ", argv[1], (int)n); start = clock(); divsufsort(T, SA, n); finish = clock(); fprintf(stderr, "%.4f sec\n", (double)(finish - start) / (double)CLOCKS_PER_SEC); /* Check the suffix array. */ if(sufcheck(T, SA, n, 3) != 0) { exit(EXIT_FAILURE); } /* Deallocate memory. */ free(SA); free(T); return 0; }