/* ********************************************************************
   Run-length + hierarchical coding of an mtf sequence
   (Fenwick's three-level model).

   input
     uchar *in          mtf sequence to be coded
     int   len          number of entries in the mtf sequence
     int   alpha_size   size of the alphabet
   output
     the coded sequence is emitted through bit_write()

   Coding scheme, as implemented below:
     - a run of zero ranks is emitted as the 2-bit code 0 followed by an
       8-bit value holding (run length - 1); a run is cut after 256 zeroes
     - ranks 1 and 2 are written directly on 2 bits
     - ranks 3..9 are written as the 2-bit escape 3 followed by
       (rank - 3) on 3 bits
     - ranks 10..mtf_size are written as the 5-bit escape 31 followed by
       (rank - 10) on int_log2(mtf_size - 10) bits
     - whenever the rank equals mtf_size = MIN(Mtf_save, alpha_size) the
       next entry of in[] is a character rather than a rank: it is
       consumed here and written on int_log2(alpha_size) bits
   ******************************************************************** */
void rle_hierarchical(uchar *in, int len, int alpha_size)
{
  int int_log2(int);
  void bit_write(int,int);
  int pos, rank, zero_run;
  int char_bits, mtf_size;
  uchar ch;

  mtf_size  = MIN(Mtf_save,alpha_size);
  char_bits = int_log2(alpha_size);
  zero_run  = -1;                     /* (# of pending zeroes) - 1 */

  pos = 0;
  while(pos < len) {
    rank = in[pos];
    assert(rank < alpha_size);

    if(rank == 0) {
      /* extend the current run; emit it as soon as it holds 256 zeroes */
      zero_run++;
      if(zero_run == 255) {
        bit_write(2,0);
        bit_write(8,255);
        zero_run = -1;
      }
      pos++;
      continue;
    }

    /* a nonzero rank closes any pending run of zeroes */
    if(zero_run >= 0) {
      bit_write(2,0);
      bit_write(8,zero_run);
      zero_run = -1;
    }

    /* the three levels are mutually exclusive */
    if(rank <= 2) {
      bit_write(2,rank);                         /* first level */
    }
    else if(rank <= 9) {
      bit_write(2,3);                            /* escape to level 2 */
      bit_write(3,rank-3);
    }
    else if(rank <= mtf_size) {
      bit_write(5,31);                           /* escape to level 3 */
      bit_write(int_log2(mtf_size-10),rank-10);
    }

    /* rank == mtf_size announces an explicit character in the next slot */
    if(rank == mtf_size) {
      ch = in[++pos];
      assert(ch < alpha_size);
      bit_write(char_bits,ch);
    }
    pos++;
  }

  /* flush a run of zeroes that reaches the end of the sequence */
  if(zero_run >= 0) {
    bit_write(2,0);
    bit_write(8,zero_run);
  }
}
// ===========================================================================
// FFT driver: runs one fft_myrmics_step() pass per level, alternating the
// roles of the two buffers between passes.
//   x=in, y=out, w=exp(2*pi*i*k/n), k=0..n/2-1
//
// NOTE(review): the copy performed before the last pass moves x into y for
// n floats only. The final pass reads from x, so if the most recent data is
// in y at that point (toggle set) this looks inverted and possibly too
// short -- confirm against fft_myrmics_step() and the buffer layout before
// relying on the numerical result. Behaviour is kept exactly as it was.
// ===========================================================================
void fft_myrmics_cfft2(int n, float *x, float *y, float *w) {

  const int levels = int_log2(n);
  int stride = 1;
  int toggle = 1;
  int pass;
  int idx;

  // First pass: x -> y
  fft_myrmics_step(n, stride, x, x + n, y, y + 2 * stride, w);

  // Middle passes: ping-pong between the two buffers
  for (pass = 0; pass < levels - 2; pass++) {
    stride *= 2;
    if (toggle) {
      fft_myrmics_step(n, stride, y, y + n, x, x + 2 * stride, w);
    }
    else {
      fft_myrmics_step(n, stride, x, x + n, y, y + 2 * stride, w);
    }
    toggle = !toggle;
  }

  if (toggle) {
    for (idx = 0; idx < n; idx++) {
      y[idx] = x[idx];
    }
  }

  // Last pass: x -> y with the widest stride
  stride = n / 2;
  fft_myrmics_step(n, stride, x, x + n, y, y + 2 * stride, w);
}
/* *****************************************************************
   Back-patch the prologue once all buckets have been written:
     - the starting position of the occ-explicit list
       (Start_prologue_occ) is stored at byte 19 of the output file
     - the starting position of each of the Num_bucs_lev2 buckets is
       stored in the table that ends right where the first bucket
       begins, i.e. at s->start_lev2[0]
   Every bucket position uses a whole number of bytes, enough to hold
   int_log2(s->text_size) bits.
   The program exits if a seek on the output file fails.
   ***************************************************************** */
void write_susp_infos(bwi_input *s)
{
  void bit_write(int,int);
  void uint_write(int);
  void bit_flush(void);
  int int_log2(int);
  int buc, table_start, pos_bytes;

  /* -- starting position of the occ-explicit list -- */
  /* warning! the constant 19 depends on the structure of the prologue */
  if (fseek(Outfile,19,SEEK_SET) != 0) {
    fprintf(stderr,"seek error on output file -write_susp_infos-\n");
    exit(1);
  }
  uint_write(Start_prologue_occ);
  bit_flush();

  /* -- table of bucket starting positions -- */
  /* The table offset depends on the structure of the prologue too;
     s->start_lev2[0] has been set by compress_superbucket()         */
  pos_bytes   = (int_log2(s->text_size)+7)/8;
  table_start = s->start_lev2[0] - Num_bucs_lev2*pos_bytes;
  if (fseek(Outfile,table_start,SEEK_SET) != 0) {
    fprintf(stderr,"seek error on output file -write_susp_infos-\n");
    exit(1);
  }

  buc = 0;
  while(buc < Num_bucs_lev2) {
    bit_write(pos_bytes*8,s->start_lev2[buc]);
    buc++;
  }
  bit_flush();

  /* the table must end exactly where the first bucket starts */
  assert(ftell(Outfile)==(int)s->start_lev2[0]);
}
// Initialize for edge generation rmat_iterator(RandomGenerator& gen, vertices_size_type n, edges_size_type m, double a, double b, double c, double d, bool permute_vertices = true) : gen(), n(n), a(a), b(b), c(c), d(d), edge(m), permute_vertices(permute_vertices), SCALE(int_log2(n)) { this->gen.reset(new uniform_01<RandomGenerator>(gen)); assert(boost::test_tools::check_is_close(a + b + c + d, 1., boost::test_tools::fraction_tolerance(1.e-5))); if (permute_vertices) generate_permutation_vector(gen, vertexPermutation, n); // TODO: Generate the entire adjacency matrix then "Clip and flip" if undirected graph // Generate the first edge vertices_size_type u, v; boost::tie(u, v) = generate_edge(this->gen, n, SCALE, a, b, c, d); if (permute_vertices) current = std::make_pair(vertexPermutation[u], vertexPermutation[v]); else current = std::make_pair(u, v); --edge; }
/* Floating-point operation count charged to an FFT of size N:
 *   (5*N - 2) * int_log2(N) + 2*(N + 1)
 */
double FFT_num_flops(int N)
{
    const double size = (double) N;
    const double levels = (double) int_log2(N);

    return (5.0*size-2)*levels + 2*(size+1);
}
/*
 * Decode the prefix of a bucket up to the position that corresponds to
 * the absolute position k, storing the decoded (bucket-unmapped) chars in
 * s->mtf_seq[] and updating occ[] for every decoded char.
 *
 * The offset inside the bucket is k % bucket_size_lev2; for an odd bucket
 * (is_odd != 0) the offset is mirrored and the counters in occ[] are
 * decremented instead of incremented, with a final +1 correction for the
 * last decoded char.
 *
 * Encoding read here: the first symbol on int_log2(alpha_size_b) bits,
 * then for every further position one flag bit (0 = same as previous,
 * 1 = a new symbol follows on int_log2(alpha_size_b) bits). A bucket with
 * a single distinct char stores no symbols at all.
 *
 * Returns the char decoded at the requested position.
 */
uchar get_b_multihuf(ulong k, ulong * occ, fm_index * s, int is_odd) {

	int changed, sym_bits = int_log2(s->alpha_size_b);
	ulong last, pos;
	uchar cur;

	last = k % s->bucket_size_lev2;
	if (is_odd)
		last = s->bucket_size_lev2 - last - 1;

	/* special case: bucket with only one char, nothing to read */
	if (s->alpha_size_b == 1) {
		cur = s->inv_map_b[0];
		for (pos = 0; pos <= last; pos++) {
			s->mtf_seq[pos] = cur;
			if (is_odd)
				occ[cur]--;
			else
				occ[cur]++;
		}
		if (is_odd)
			occ[cur]++;
		return cur;
	}

	/* first symbol is always stored explicitly */
	fm_bit_read24(sym_bits, cur);
	cur = s->inv_map_b[cur];
	s->mtf_seq[0] = cur;
	if (is_odd)
		occ[cur]--;
	else
		occ[cur]++;

	pos = 1;
	while (pos <= last) {
		fm_bit_read24(1, changed);
		if (changed) {
			/* a new symbol follows the flag bit */
			fm_bit_read24(sym_bits, cur);
			cur = s->inv_map_b[cur];
		}
		s->mtf_seq[pos] = cur;

		if (is_odd)
			occ[cur]--;
		else
			occ[cur]++;
		pos++;
	}

	if (is_odd)
		occ[cur]++;
	return cur;
}
/*
 * Prepare an fm_index for use: read the basic prologue and initialise the
 * fields derived from it.
 *
 * Returns the error reported by fm_read_basic_prologue() if any, FM_OK
 * otherwise. For a small text only skip and smalltext are set.
 */
int fm_use_index (fm_index *fmindex) {

	int error = fm_read_basic_prologue (fmindex);
	if (error)
		return error;

	if (fmindex->smalltext) {
		fmindex->skip = 0;
		/* texts that are small, but not below SMALLSMALLFILESIZE,
		 * are flagged with smalltext == 2 */
		if (!(fmindex->text_size < SMALLSMALLFILESIZE))
			fmindex->smalltext = 2;
		return FM_OK;
	}

	/*
	 * init some var
	 */
	fmindex->int_dec_bits =
		int_log2 (int_log2
			  (fmindex->bucket_size_lev1 - fmindex->bucket_size_lev2));

	if (fmindex->skip <= 1) {
		fmindex->num_marked_rows = 0;
	} else {
		/* the marked rows span the bwt_occ range of the special char */
		fmindex->occcharinf = fmindex->bwt_occ[fmindex->specialchar];

		if (fmindex->specialchar == fmindex->alpha_size-1)
			fmindex->occcharsup = fmindex->text_size-1;
		else
			fmindex->occcharsup = fmindex->bwt_occ[fmindex->specialchar+1];

		fmindex->num_marked_rows = fmindex->occcharsup-fmindex->occcharinf;
	}

	fmindex->log2_row = int_log2(fmindex->text_size);

	/* log2textsize rounded up to a whole number of bytes, in bits */
	fmindex->var_byte_rappr = ((fmindex->log2textsize + 7) / 8)*8;

	return FM_OK;
}
// Initialize for edge generation unique_rmat_iterator(RandomGenerator& gen, vertices_size_type n, edges_size_type m, double a, double b, double c, double d, bool permute_vertices = true, EdgePredicate ep = keep_all_edges()) : gen(), done(false) { assert(boost::test_tools::check_is_close(a + b + c + d, 1., boost::test_tools::fraction_tolerance(1.e-5))); this->gen.reset(new uniform_01<RandomGenerator>(gen)); std::vector<vertices_size_type> vertexPermutation; if (permute_vertices) generate_permutation_vector(gen, vertexPermutation, n); int SCALE = int_log2(n); std::map<value_type, bool> edge_map; edges_size_type edges = 0; do { vertices_size_type u, v; boost::tie(u, v) = generate_edge(this->gen, n, SCALE, a, b, c, d); // Lowest vertex number always comes first // (this means we don't have to worry about i->j and j->i being in the edge list) if (u > v && is_same<directed_category, undirected_tag>::value) std::swap(u, v); if (edge_map.find(std::make_pair(u, v)) == edge_map.end()) { edge_map[std::make_pair(u, v)] = true; if (permute_vertices) { if (ep(vertexPermutation[u], vertexPermutation[v])) values.push_back(std::make_pair(vertexPermutation[u], vertexPermutation[v])); } else { if (ep(u, v)) values.push_back(std::make_pair(u, v)); } edges++; } } while (edges < m); // NGE - Asking for more than n^2 edges will result in an infinite loop here // Asking for a value too close to n^2 edges may as well current = values.back(); values.pop_back(); }
/* **********************************************************
   Open filename and write p[0] .. p[n-1] using int_log2(n)
   bits for each value.

   input
     char *filename   name of the output file (opened in "wb" mode)
     int  *p          values to be written
     int   n          number of values

   If the file cannot be opened the error is reported with
   perror() and nothing is written.
   ********************************************************** */
void write_sa(char *filename, int *p, int n)
{
  int int_log2(int);
  void init_bit_buffer(void);
  void fbit_write(FILE *,int,int), fbit_flush( FILE * );
  FILE *sa;
  Int32 psize, i;

  if(_ds_Verbose)
    fprintf(stderr,"Writing sa to file %s\n",filename);

  sa = fopen(filename,"wb");
  if(sa==NULL) {
    /* never hand a NULL stream to fbit_write()/fclose() */
    perror(filename);
    return;
  }

  init_bit_buffer();
  psize = int_log2(n);           /* bits used for each value */
  for(i=0;i<n;i++)
    fbit_write(sa,psize,p[i]);
  fbit_flush(sa);
  fclose(sa);
}
// Initialize for edge generation sorted_rmat_iterator(RandomGenerator& gen, vertices_size_type n, edges_size_type m, double a, double b, double c, double d, bool permute_vertices = true, EdgePredicate ep = keep_all_edges()) : gen(), permute_vertices(permute_vertices), values(sort_pair<vertices_size_type>()), done(false) { assert(boost::test_tools::check_is_close(a + b + c + d, 1., boost::test_tools::fraction_tolerance(1.e-5))); this->gen.reset(new uniform_01<RandomGenerator>(gen)); std::vector<vertices_size_type> vertexPermutation; if (permute_vertices) generate_permutation_vector(gen, vertexPermutation, n); // TODO: "Clip and flip" if undirected graph int SCALE = int_log2(n); for (edges_size_type i = 0; i < m; ++i) { vertices_size_type u, v; boost::tie(u, v) = generate_edge(this->gen, n, SCALE, a, b, c, d); if (permute_vertices) { if (ep(vertexPermutation[u], vertexPermutation[v])) values.push(std::make_pair(vertexPermutation[u], vertexPermutation[v])); } else { if (ep(u, v)) values.push(std::make_pair(u, v)); } } current = values.top(); values.pop(); }
/*
 * Compress and write a bucket of length "len" starting at in[0].
 *
 * Steps:
 *   1. the chars of the bucket (already remapped to 0..alphasize-1) are
 *      remapped again to a compact local alphabet, since only a few
 *      distinct chars are expected in a single bucket; the boolean map of
 *      the chars that occur is written first, one bit per char;
 *   2. for MULTIH compression the remapped bucket is written as the first
 *      symbol followed, for each further position, by a flag bit
 *      (0 = same symbol as before, 1 = a new symbol follows). Symbols use
 *      int_log2(local_alpha_size) bits. A bucket with a single distinct
 *      char stores nothing after the map.
 *
 * Note that in[] is overwritten with the locally remapped chars.
 *
 * Returns FM_OK on success, FM_COMPNOTSUP if s->type_compression is not
 * supported.
 */
int compress_bucket(fm_index *s, uchar *in, ulong len, suint alphasize) {

	int fm_multihuf_compr(uchar *, int, int);

	suint local_alpha_size, j;
	ulong pos;	/* same width as len, so the scans cannot wrap */
	uchar c, local_bool_map[256], local_map[256];

	/* ---------- compute and write local boolean map ------ */
	for (j = 0; j < alphasize; j++) {
		local_bool_map[j] = 0;
		local_map[j] = 0;
	}
	local_alpha_size = 0;

	for (pos = 0; pos < len; pos++) {	/* compute local boolean map */
		c = in[pos];			/* remapped char */
		assert(c < alphasize);
		local_bool_map[c] = 1;
	}

	for (j = 0; j < alphasize; j++)		/* compute local map */
		if (local_bool_map[j])
			local_map[j] = local_alpha_size++;

	for (j = 0; j < alphasize; j++)		/* write bool char map to file */
		if (local_bool_map[j]) {fm_bit_write24(1,1);}
		else {fm_bit_write24(1,0);}

	for (pos = 0; pos < len; pos++)		/* remap bucket */
		in[pos] = local_map[in[pos]];

	switch ( s->type_compression ) {

	case ( MULTIH ): {
		/* the braces keep the declarations below out of the scope
		 * that the default: label jumps into */
		if (local_alpha_size == 1) {
			fm_bit_flush();
			return FM_OK;
		}

		int bit = int_log2(local_alpha_size);
		/* unsigned, like the data: a plain char would never compare
		 * equal to symbols >= 128 and runs would not be collapsed */
		uchar prev = in[0];

		fm_bit_write(bit, in[0]);
		for (pos = 1; pos < len; pos++) {
			if (prev == in[pos]) {
				fm_bit_write(1, 0);
			} else {
				fm_bit_write(1, 1);
				fm_bit_write(bit, in[pos]);
			}
			prev = in[pos];
		}

		/* compute mtf picture */
		/*mtf_string(in, s->mtf_seq, len, local_alpha_size);
		error = fm_multihuf_compr(s->mtf_seq, len, local_alpha_size);
		if ( error < 0 ) return error;*/
		fm_bit_flush();
		break;
	}

	default:
		return FM_COMPNOTSUP;
	}

	return FM_OK;
}
static void frag_report(const char *filename) { struct statfs fsinfo; #ifdef HAVE_FSTAT64 struct stat64 fileinfo; #else struct stat fileinfo; #endif int bs; long fd; unsigned long block, last_block = 0, numblocks, i, count; long bpib; /* Blocks per indirect block */ long cylgroups; int num_extents = 0, expected; int is_ext2 = 0; static int once = 1; unsigned int flags; int rc; #ifdef HAVE_OPEN64 fd = open64(filename, O_RDONLY); #else fd = open(filename, O_RDONLY); #endif if (fd < 0) { perror("open"); return; } if (statfs(filename, &fsinfo) < 0) { perror("statfs"); return; } #ifdef HAVE_FSTAT64 if (stat64(filename, &fileinfo) < 0) { #else if (stat(filename, &fileinfo) < 0) { #endif perror("stat"); return; } if (ioctl(fd, EXT3_IOC_GETFLAGS, &flags) < 0) flags = 0; if (!(flags & EXT4_EXTENTS_FL) && ((fsinfo.f_type == 0xef51) || (fsinfo.f_type == 0xef52) || (fsinfo.f_type == 0xef53))) is_ext2++; if (verbose && once) printf("Filesystem type is: %lx\n", (unsigned long) fsinfo.f_type); cylgroups = div_ceil(fsinfo.f_blocks, fsinfo.f_bsize*8); if (verbose && is_ext2 && once) printf("Filesystem cylinder groups is approximately %ld\n", cylgroups); physical_width = int_log10(fsinfo.f_blocks); if (physical_width < 8) physical_width = 8; if (ioctl(fd, FIGETBSZ, &bs) < 0) { /* FIGETBSZ takes an int */ perror("FIGETBSZ"); close(fd); return; } if (no_bs) bs = 1024; bpib = bs / 4; numblocks = (fileinfo.st_size + (bs-1)) / bs; logical_width = int_log10(numblocks); if (logical_width < 7) logical_width = 7; filesize = (long long)fileinfo.st_size; if (verbose) printf("File size of %s is %lld (%ld block%s, blocksize %d)\n", filename, (long long) fileinfo.st_size, numblocks, numblocks == 1 ? 
"" : "s", bs); if (force_bmap || filefrag_fiemap(fd, int_log2(bs), &num_extents) != 0) { for (i = 0, count = 0; i < numblocks; i++) { if (is_ext2 && last_block) { if (((i-EXT2_DIRECT) % bpib) == 0) last_block++; if (((i-EXT2_DIRECT-bpib) % (bpib*bpib)) == 0) last_block++; if (((i-EXT2_DIRECT-bpib-bpib*bpib) % (bpib*bpib*bpib)) == 0) last_block++; } rc = get_bmap(fd, i, &block); if (block == 0) continue; if (!num_extents) num_extents++; count++; if (last_block && (block != last_block+1) ) { if (verbose) printf("Discontinuity: Block %ld is at " "%lu (was %lu)\n", i, block, last_block+1); num_extents++; } last_block = block; } } if (num_extents == 1) printf("%s: 1 extent found", filename); else printf("%s: %d extents found", filename, num_extents); expected = (count/((bs*8)-(fsinfo.f_files/8/cylgroups)-3))+1; if (is_ext2 && expected < num_extents) printf(", perfection would be %d extent%s\n", expected, (expected>1) ? "s" : ""); else fputc('\n', stdout); close(fd); once = 0; } static void usage(const char *progname) { fprintf(stderr, "Usage: %s [-Bbvsx] file ...\n", progname); exit(1); }
/* ***************************************************************
   Build the suffix array of s->text and store it in s->sa.

   If the file Safile_name exists and is not empty, the suffix
   array is read from it: the file must hold s->text_size pointers
   of int_log2(s->text_size) bits each, otherwise the program stops
   with a fatal error. If the file is missing or empty the suffix
   array is built from scratch, with larsson-sadakane when
   Use_larsson_sada is set and with the 5n algorithm otherwise.
   *************************************************************** */
void build_sa(bwi_input *s)
{
  int scmp3(unsigned char *p, unsigned char *q, int maxl);
  void init_bit_buffer(void);
  int fbit_read(FILE *,int);
  int *larsson_sada_sufsort(uchar *, int, int);
  int *suffixsort5n(uchar *, int);
  void out_of_mem(char *s);
  int int_log2(int);
  int i, n, pointer_size,q,r,sa_size;
  FILE *safile;

  /* ------------ check sa file ---------------- */
  n=0;
  safile = fopen(Safile_name,"rb");
  if(safile!=NULL) {
    fseek(safile,0L,SEEK_END);
    n=ftell(safile);
  }

  if (n==0) {
    // an existing but empty sa file is of no use: release it
    if(safile!=NULL)
      fclose(safile);

    // ------- build sa using larsson-sada or 5n
    if(Verbose) fprintf(stderr, " from scratch ");
    if(Use_larsson_sada) {
      if(Verbose) fprintf(stderr, "(using ls) ... ");
      s->sa = larsson_sada_sufsort(s->text,s->text_size,s->alpha_size);
    }
    else {
      if(Verbose) fprintf(stderr, "(using 5n) ... ");
      s->sa = suffixsort5n(s->text,s->text_size);
    }
  }
  else {
    // ------ read sa from file --------
    pointer_size = int_log2(s->text_size);

    // --- compute sa_size = (s->text_size * pointer_size + 7)/8
    // --- use q and r to avoid overflow
    q = s->text_size/8;
    r = s->text_size % 8;
    sa_size = (q*pointer_size) + (r*pointer_size+7)/8;
    if (n != sa_size)
      fatal_error("Invalid .sa file\n");
    if(Verbose) fprintf(stderr, " by reading it from file... ");

    // allocate space for the suffix array
    s->sa = (int *) malloc(s->text_size * sizeof(int));
    if(s->sa==NULL) out_of_mem("build_sa");

    rewind(safile);
    init_bit_buffer();
    for(i=0; i<s->text_size; i++)   // read one suffix-array pointer at a time
      s->sa[i] = fbit_read(safile,pointer_size);
    fclose(safile);
  }

  // check the suffix array
#if 0
   for (i=0; i<s->text_size-1; ++i)
     if (scmp3(s->text+s->sa[i], s->text+s->sa[i+1],
                MIN(s->text_size-s->sa[i], s->text_size-s->sa[i+1]))>=0) {
       fprintf(stderr, "Suffix array check failed at position %d\n", i);
       exit(1);
     }
#endif
}
/* *********************************************************
   The current format of the prologue is the following:

     8 bits    type of compression (2=Hier, 4=Multi Table Huff)
     1 int     size of input file
     1 int     position of eof in s->bw
     1 uint16  size of a super bucket divided by 1024
     1 uchar   size of a bucket divided by 1024 (divides the previous)
     1 uchar   size-1 mtf_list stored in each bucket
     1 uchar   size-1 of the compacted alphabet of the text
     1 uchar   remapped char selected for occurrence list
     1 int     # skipped occ of chosen_char in bwt
     1 int     starting byte of occ-explicit list
     256 bits  boolean map of chars in the text (S = # of 1)
     S   int   prefix sum of character occurrences

     for each superbucket
       S' bytes  map of compact_alph chars occurring in THIS superbucket
                 (S' = (S+7)/8 -- it is byte aligned)
                 ****FLUSH****
       S  int    # occ of all compact_alphabet chars in prev superbuckets
                 (missing for the first superbucket)

     finally:
       NB x L    starting position of each bucket in the compressed file
                 NB is the number of buckets and L is the number of bits
                 sufficient to represent that length (byte_aligned)

   -------- Body of the compressed file [byte-aligned] -------------

     for each bucket (let R be the # of distinct chars in the superbucket)

       R 7x8val  # of occ of each char in the previous buckets of the same
                 superbucket. Each value is represented with the 7x8
                 encoding. This information is missing for the first
                 bucket of each superbucket
       R bits    map of chars appearing in this bucket. Let R' be the # of
                 distinct chars in this bucket and L' the # of bits
                 required to represent R'
       L' x M bits  Initial move to front list for this bucket
                    M = min(R',Mtf_save)
       ... bits  needed to byte-align in case ONLY of Arith-coding
       ??? bits  compressed bucket in mtf + rle + [Ari|Hier|Una] format
       --- bits  ****FLUSH**** to have byte alignment

   -------- Body of the occ explicit list [byte-aligned] -------------
   ---------------------------------------------------------------------
   --- URL: we have occ for text positions and rows ---

     ... L bits  list of positions where character ch occurs in the
                 original text. ch = character that occurs close to
                 Marked_char_freq times in the text.

   This routine writes the prologue only. The starting byte of the
   occ-explicit list and the bucket starting positions are written as
   zeroes here, which reserves their space in the output.
   **************************************************************** */
void write_prologue(bwi_input *s)
{
  void init_bit_buffer(void);
  int int_log2(int);
  void uint_write(int);
  void bit_write(int,int);
  void bit_flush(void);
  void write7x8(int);
  bucket_lev1 *sb;
  int ch, sbuc, buc, pos_bytes;

  /* ----- write file and bucket size ------ */
  init_bit_buffer();
  bit_write(8,Type_compression);
  uint_write(s->text_size);
  uint_write(s->bwt_eof_pos);

  /* superbucket size: a multiple of 1024, stored in units of 1024 */
  assert(Bucket_size_lev1>>10<65536);
  assert((Bucket_size_lev1 & 0x3ff) == 0);
  bit_write(16,Bucket_size_lev1>>10);

  /* bucket size: a multiple of 1024, stored in units of 1024 */
  assert(Bucket_size_lev2>>10<256);
  assert((Bucket_size_lev2 & 0x3ff) == 0);
  bit_write(8,Bucket_size_lev2>>10);

  /* ---- mtf and alphabet information: both stored as size-1 ---- */
  assert(Mtf_save>0 && Mtf_save<=256);
  bit_write(8,Mtf_save-1);

  assert(s->alpha_size>0 && s->alpha_size<=256);
  bit_write(8,s->alpha_size-1);

  /* ---- write chosen_char & starting byte of occ-list ---- */
  bit_write(8,s->chosen_char);
  uint_write(s->skip);
  uint_write(0);                 /* placeholder for the occ-list start */

  /* ---- boolean alphabet char map ---- */
  for(ch=0; ch<256; ch++)
    bit_write(1, s->bool_char_map[ch] ? 1 : 0);

  /* ---- write prefix sum of char occ ---- */
  for(ch=0; ch<s->alpha_size; ch++)
    uint_write(s->pfx_char_occ[ch]);

  /* ----- process superbuckets ----- */
  for(sbuc=0; sbuc<Num_bucs_lev1; sbuc++) {
    sb = &s->buclist_lev1[sbuc];

    for(ch=0; ch<s->alpha_size; ch++)       /* boolean char_map */
      bit_write(1, sb->bool_char_map[ch] ? 1 : 0);
    bit_flush();                            /* we keep everything byte aligned */

    if(sbuc == 0)
      continue;                             /* no prefix-occ for the first one */
    for(ch=0; ch<s->alpha_size; ch++)       /* write prefix-occ */
      uint_write(sb->occ[ch]);
  }

  /* ----- leave space for storing the start positions of buckets ----- */
  pos_bytes = (int_log2(s->text_size)+7)/8;   /* it's byte-aligned */
  for(buc=0; buc<Num_bucs_lev2; buc++)
    bit_write(pos_bytes * 8,0);
}
/* ************************************************************
 *                                                            *
 *           main compression routine                         *
 *                                                            *
 * Reads the text from Infile and writes the compressed file  *
 * to Outfile. The steps are, in order:                       *
 *   read text, remap alphabet, build suffix array, compute   *
 *   bwt, compute the locations of the marked occurrences,    *
 *   compute superbucket and bucket infos, write prologue,    *
 *   compress every superbucket, back-patch the prologue      *
 *   (write_susp_infos), write the list of marked             *
 *   occurrences after the last bucket.                       *
 * Progress messages go to stderr according to Verbose.       *
 * The program exits if a seek on the output file fails.      *
 * ********************************************************** */
void compress_file(void)
{
  void read_text(FILE *, bwi_input *s);
  void remap_alphabet(bwi_input *s);
  void build_sa(bwi_input *s);
  void compute_bwt(bwi_input *s);
  void compute_info_superbuckets(bwi_input *s);
  void compute_info_buckets(bwi_input *s);
  void write_prologue(bwi_input *s);
  void compress_superbucket(bwi_input *s, int);
  int compute_locations(bwi_input *s);
  int compute_locations_dict(bwi_input *s, int*);
  int compute_ranks_dict(bwi_input *s, int*);
  int compute_locations_huffword(bwi_input *s, int *);
  void bit_flush( void );
  void bit_write(int,int);
  void init_bit_buffer(void);
  void write_susp_infos(bwi_input *s);
  bwi_input s;
  int i,len, retr_occ, retr_occ2, loc_occ_range;
  int Start_prologue_ranks;  // file offset right after the first occ list

  /* --------- Load the text file from disk ------- */
  if(Verbose) fprintf(stderr,"Reading input file... ");
  read_text(Infile, &s);
  if(Verbose) fprintf(stderr,"done! (%f seconds)\n",getTime());

  /* --------- Compact alphabet ------- */
  if(Verbose>1) fprintf(stderr,"Remapping alphabet... ");
  remap_alphabet(&s);
  if(Verbose>1) fprintf(stderr,"done! (%f seconds). ",getTime());
  if(Verbose>1) fprintf(stderr,"Compact alphabet size = %d\n",s.alpha_size);

  /* --------- Build suffix array ------- */
  if(Verbose) fprintf(stderr,"Building suffix array");
  build_sa(&s);
  if(Verbose) fprintf(stderr,"done! (%f seconds)\n",getTime());

  /* --------- Compute BWT ------- */
  if(Verbose>1) fprintf(stderr,"Computing BWT... ");
  compute_bwt(&s);
  if(Verbose>1) fprintf(stderr,"done! (%f seconds)\n",getTime());

  /* ------- mark chars and compute locations ----- */
  // Only the standard variant leaves loc_occ_range untouched; it is
  // read below only when one of the three special modes is active.
  if (Is_dictionary)
    retr_occ = compute_locations_dict(&s,&loc_occ_range);    // dictionary
  else if (Is_huffword)
    retr_occ = compute_locations_huffword(&s,&loc_occ_range);// huffword
  else if (Is_URL)
    retr_occ = compute_ranks_dict(&s,&loc_occ_range);        // URL
  else
    retr_occ = compute_locations(&s);                        // standard

  /* --------- Compute various infos for each superbucket ------- */
  if(Verbose>1) fprintf(stderr,"Computing infos superbukets... ");
  compute_info_superbuckets(&s);
  if(Verbose>1) fprintf(stderr,"done! (%f seconds)\n", getTime());

  /* --------- Compute various infos for each bucket ------- */
  if(Verbose>1) fprintf(stderr,"Computing infos buckets... ");
  compute_info_buckets(&s);
  if(Verbose>1) fprintf(stderr,"done! (%f seconds)\n", getTime());

  /* --------- Writing the compressed file ------- */
  Infile_size = s.text_size;
  Outfile_size=0;

  write_prologue(&s);
  if(Verbose) fprintf(stderr,"Prologue --> %d bytes!\n",Outfile_size);

  for(i=0;i<Num_bucs_lev1;i++)
    compress_superbucket(&s,i);

  /* ---- keep starting positions of occ-explicit list ---- */
  Start_prologue_occ = Outfile_size;

  /* -- write the starting position of buckets -- */
  write_susp_infos(&s);

  // write_susp_infos() seeks back into the prologue: return to the
  // position right after the last bucket before appending the occ list
  if (fseek(Outfile,Start_prologue_occ,SEEK_SET)) {
    fprintf(stderr, "Seek error on output file -compress_file-\n");
    exit(1);
  }

  /* -- write the position of the marked chars ---- */
  init_bit_buffer();
  if(Is_dictionary || Is_huffword || Is_URL)
    len = int_log2(loc_occ_range);     // bits required for each rank
  else
    len = int_log2(s.text_size);       // bits required for each pos

  for(i=0; i < retr_occ; i++)
    bit_write(len,s.loc_occ[i]);

  bit_flush();

  Start_prologue_ranks = (int)ftell(Outfile);

  if(Verbose)
    fprintf(stderr,"List of %d marked ranks --> %d bytes!\n",
            retr_occ,Start_prologue_ranks-Start_prologue_occ);

  /* -- in the case of URL we also store the DICT info -- */
  /* It should be put together with the computation above --*/
  /* Thus removing these differences in the code --*/
  /* Hence Start_prologue_occ indicates the starting position of RANKS. */
  /* After retr_occ RANKS start the LOCATIONS, which are again retr_occ */
  /* in number. The value of retr_occ can be computed at decompression time */
  /* by using the same formula adopted in compute_ranks_dict() */
  if (Is_URL) {
    // NOTE(review): len is not recomputed here although
    // compute_locations_dict() receives &loc_occ_range again -- confirm
    // that the bit width used for the ranks is also right for locations.
    retr_occ2 = compute_locations_dict(&s,&loc_occ_range);  // DICT
    if (retr_occ != retr_occ2)
      out_of_mem("Unequal number of sampled NULLs\n");

    for(i=0; i < retr_occ; i++)
      bit_write(len,s.loc_occ[i]);

    bit_flush();

    if(Verbose)
      fprintf(stderr,"List of %d marked locations --> %d bytes!\n",
              retr_occ2,(int)ftell(Outfile) - Start_prologue_ranks);
  }
}
/* **********************************************************************
   Compress and write a bucket of length "len" starting at in[0].

   The bucket is processed as follows:
     1. its chars are remapped to a compact local alphabet (only a few
        distinct chars are expected in a single bucket) and the boolean
        map of the chars that occur is written, one bit per char
     2. the move-to-front sequence is computed by mtf_string() and the
        first MIN(Mtf_save,local_alpha_size) entries of the mtf list are
        written
     3. the mtf sequence is coded with the routine selected by
        Type_compression (HIER3 or MULTIH; the other known types are no
        longer available and stop the program)
     4. the output is flushed so that the next bucket is byte aligned

   Note that in[] is overwritten with the locally remapped chars.
   ********************************************************************** */
void compress_bucket(uchar *in, int len, int alpha_size)
{
  int int_log2(int);
  void init_bit_buffer(void);
  void bit_write(int,int);
  void bit_flush( void );
  void out_of_mem(char *);
  int mtf_string(uchar *, uchar *, uchar *, int);
  void rle_hierarchical(uchar *, int, int);
  void multihuf_compr(uchar *, int, int);
  int ch, pos, char_bits, local_alpha_size, mtf_len, mtf_saved;
  uchar sym, mtf[256], local_bool_map[256], local_map[256];
  uchar *mtf_seq;

  /* ---------- init ------------ */
  init_bit_buffer();

  /* ---------- compute and write local boolean map ------ */
  for(ch=0; ch<alpha_size; ch++) {
    local_bool_map[ch] = 0;
    local_map[ch] = 0;
  }

  for(pos=0; pos<len; pos++) {             /* which chars occur here? */
    sym = in[pos];                         /* remapped char */
    assert(sym<alpha_size);
    local_bool_map[sym] = 1;
  }

  local_alpha_size = 0;
  for(ch=0; ch<alpha_size; ch++) {         /* number them compactly */
    if(local_bool_map[ch]) {
      local_map[ch] = local_alpha_size;
      local_alpha_size++;
    }
  }

  for(pos=0; pos<len; pos++)               /* remap bucket */
    in[pos] = local_map[in[pos]];

  for(ch=0; ch<alpha_size; ch++)           /* write bool char map to file */
    bit_write(1, local_bool_map[ch] ? 1 : 0);

  /* ----------- compute and write mtf picture ------------- */
  mtf_seq = (uchar *) malloc(2*len*sizeof(uchar));   /* mtf temporary buffer */
  if(mtf_seq==NULL)
    out_of_mem("compress_bucket (mtf_seq)");
  mtf_len = mtf_string(in,mtf_seq,mtf,len);   /* mtf_seq=mtf(in), init mtf-list */

  char_bits = int_log2(local_alpha_size);
  mtf_saved = MIN(Mtf_save,local_alpha_size);
  for(ch=0; ch<mtf_saved; ch++)               /* write mtf to file */
    bit_write(char_bits,mtf[ch]);

  /* -- Applies the proper compression routine -- */
  switch (Type_compression) {

  case HIER3:   /* ---- three-leveled model: Fenwick's proposal ----- */
    rle_hierarchical(mtf_seq, mtf_len,local_alpha_size);
    break;

  case MULTIH:  /* ---- RLE + MultiHuffman compression of the bucket ----- */
    multihuf_compr(mtf_seq,mtf_len,local_alpha_size);
    break;

  case ARITH:   /* ---- Arithmetic compression of the bucket ----- */
    fatal_error("Arithmetic coding no longer available -compress_bucket-\n");
    exit(1);

  case UNARY:   /* ---- Unary compression of mtf-ranks with escape ----- */
    fatal_error("Unary coding no longer available -compress_bucket-\n");
    exit(1);

  default:
    fprintf(stderr,"\n Compression algorithm unknown! ");
    fprintf(stderr,"-compress_superbucket-\n");
    exit(1);
  }

  bit_flush();         /* Byte-align the next compressed bucket */
  free(mtf_seq);
}
/*
 * In-place radix-2 decimation-in-time FFT over N doubles that hold N/2
 * interleaved (real, imaginary) pairs.
 *
 *   N          number of doubles in data[]
 *   data       the values, transformed in place
 *   direction  multiplies the angle of the twiddle factors, so its sign
 *              selects the direction of the transform
 *
 * An empty input and a single complex value are returned untouched.
 */
static void FFT_transform_internal (int N, double *data, int direction) {
    int n = N/2;
    int bit = 0;
    int logn;
    int dual = 1;

    /* Nothing to transform. This guard must come before int_log2(): the
       logarithm of an empty length is not meaningful. */
    if (N == 0) return;
    if (n == 1) return;         /* Identity operation! */
    logn = int_log2(n);

    /* bit reverse the input data for decimation in time algorithm */
    FFT_bitreverse(N, data) ;

    /* apply fft recursion */
    /* this loop executed int_log2(N) times */
    for (bit = 0; bit < logn; bit++, dual *= 2) {
      double w_real = 1.0;
      double w_imag = 0.0;
      int a;
      int b;

      double theta = 2.0 * direction * PI / (2.0 * (double) dual);
      double s = sin(theta);
      double t = sin(theta / 2.0);
      double s2 = 2.0 * t * t;

      /* a = 0: the twiddle factor is 1, no multiplication needed */
      for (a=0, b = 0; b < n; b += 2 * dual) {
        int i = 2*b ;
        int j = 2*(b + dual);

        double wd_real = data[j] ;
        double wd_imag = data[j+1] ;

        data[j]   = data[i]   - wd_real;
        data[j+1] = data[i+1] - wd_imag;
        data[i]  += wd_real;
        data[i+1]+= wd_imag;
      }

      /* a = 1 .. (dual-1) */
      for (a = 1; a < dual; a++) {
        /* trignometric recurrence for w-> exp(i theta) w */
        {
          double tmp_real = w_real - s * w_imag - s2 * w_real;
          double tmp_imag = w_imag + s * w_real - s2 * w_imag;
          w_real = tmp_real;
          w_imag = tmp_imag;
        }
        for (b = 0; b < n; b += 2 * dual) {
          int i = 2*(b + a);
          int j = 2*(b + a + dual);

          double z1_real = data[j];
          double z1_imag = data[j+1];

          /* wd = w * z1 (complex product) */
          double wd_real = w_real * z1_real - w_imag * z1_imag;
          double wd_imag = w_real * z1_imag + w_imag * z1_real;

          data[j]   = data[i]   - wd_real;
          data[j+1] = data[i+1] - wd_imag;
          data[i]  += wd_real;
          data[i+1]+= wd_imag;
        }
      }
    }
}
/*
 * Read the basic prologue from the compressed data in s->compress and
 * fill in the corresponding fields of s.
 *
 * A text smaller than SMALLFILESIZE only gets text_size and smalltext set.
 *
 * Returns FM_OK on success and FM_COMPNOTCORR when the prologue is not
 * consistent: eof position past the end of the text, a bucket size of
 * zero, or a superbucket size that is not a multiple of the bucket size.
 */
static int
fm_read_basic_prologue (fm_index * s)
{
	int i;
	ulong size;

	fm_init_bit_reader (s->compress);
	s->text_size = fm_uint_read ();
	if(s->text_size< SMALLFILESIZE){
		s->smalltext=1;
		return FM_OK;
	}
	s->smalltext = 0;
	s->type_compression = fm_bit_read (8);
	s->log2textsize = int_log2 (s->text_size - 1);
	s->bwt_eof_pos = fm_uint_read ();
	if (s->bwt_eof_pos > s->text_size)
		return FM_COMPNOTCORR;

	s->bucket_size_lev1 = fm_bit_read (16) << 10;
	s->bucket_size_lev2 = fm_bit_read (16);

	/* the sizes come from the file: a zero bucket size would make the
	 * modulo and the divisions below divide by zero */
	if (s->bucket_size_lev2 == 0)
		return FM_COMPNOTCORR;
	if (s->bucket_size_lev1 % s->bucket_size_lev2)
		return FM_COMPNOTCORR;

	s->num_bucs_lev1 =
		(s->text_size + s->bucket_size_lev1 - 1) / s->bucket_size_lev1;
	s->num_bucs_lev2 =
		(s->text_size + s->bucket_size_lev2 - 1) / s->bucket_size_lev2;

	/* mtf & alphabet information */
	s->alpha_size = fm_bit_read (8) + 1;

	/* read Mark mode & starting position of occ list */
	s->specialchar = (uchar) fm_bit_read (8);
	s->skip = fm_bit_read (32);
	uint start = fm_uint_read ();
	s->start_prologue_occ = s->compress + start;
	s->start_prologue_info_sb = fm_uint_read ();
	s->pos_marked_row_extr = fm_uint_read ();
	s->subchar = (uchar) fm_bit_read (8);	/* remapped compress alphabet */

	/* some information for the user */
	#if 0
	fprintf (stdout, "Compression type %d\n", s->type_compression);
	fprintf (stdout, "Text Size %lu\n", s->text_size);
	fprintf (stdout, "Bwt EOF %lu\n",s->bwt_eof_pos);
	fprintf (stdout, "alphasize %d\n",s->alpha_size);
	fprintf(stdout, "start prologue %lu\n", s->start_prologue_occ);
	fprintf (stdout, "Compression method: ");
	switch (s->type_compression)
	{
	case MULTIH:
		fprintf (stdout, "Huffman with multiple tables.\n");
		break;

	default:
		return FM_COMPNOTSUP;
	}
	#endif

	/* alphabet info and inverse char maps */
	for (i = 0; i < ALPHASIZE; i++)
		s->bool_char_map[i] = fm_bit_read (1);

	for (i = 0, size = 0; i < ALPHASIZE; i++)
		if (s->bool_char_map[i]) {
			s->char_map[i] = size;
			s->inv_char_map[size++] = (uchar) i;
		}
	assert (size == s->alpha_size);

	/* prefix summed char-occ info, stored with s->log2textsize bits */
	for (i = 0; i < s->alpha_size; i++)
	{	/* read the character-occurrence sums */
		s->bwt_occ[i] = fm_bit_read (s->log2textsize);
	}

	/*
	 * compute the number of occurrences of each character in the text
	 */
	for (i = 1; i < s->alpha_size; i++)
		s->char_occ[i - 1] = (s->bwt_occ[i]) - (s->bwt_occ[i - 1]);

	s->char_occ[(s->alpha_size) - 1] =
		(s->text_size) - (s->bwt_occ[(s->alpha_size) - 1]);

	/*
	 * Compute the starting position of the bucket info: it follows the
	 * superbucket info, made of one bitmap per superbucket plus
	 * alpha_size ulong values for each superbucket but the first
	 */
	s->sb_bitmap_size = (s->alpha_size+7)/8;
	s->start_prologue_info_b = s->start_prologue_info_sb +
		(s->sb_bitmap_size*s->num_bucs_lev1)
		+ (s->alpha_size * sizeof(ulong) * (s->num_bucs_lev1 - 1));

	return FM_OK;
}
/* *****************************************************************
   compute locations of "marked" occurrences. This procedure does
   the following:
     1) compute the desired # of marked chars (=desired_marked_chars)
     2) compute the best pair i,j such that (occ[i]/2^j) is as close
        as possible (but <= ) to desired_marked_chars
        write i in s->chosen_char and 2^j in s->skip
     3) scan s->bwt[] and "select" one out of s->skip occurrences of
        s->chosen_char. For each selected occurrence write in s->loc_occ
        its position in the original text.

   Returns the number of marked chars, that is the number of entries
   written in s->loc_occ. When Marked_char_freq is 0 nothing is marked,
   s->skip and s->chosen_char are set to 0 and 0 is returned.
   The program exits if s->loc_occ cannot be allocated.

   I think this procedure could be improved (that is, simplified and
   made faster in doing the search with bwhuffw) using the ideas
   introduced in compute_locations_dict() and
   compute_locations_huffword(), more precisely
     1) remove s->chosen_char and mark simply one row every s->skip
        (this would make the marked chars more evenly distributed
        in the text).
     2) consider the row starting with a "marked char" rather than
        ending (this would simplify the code)
   **************************************************************** */
int compute_locations(bwi_input *s)
{
  int i,max,j,count,chosen_occ;
  int ch_occ[256],rescaled,skip;
  int exponent, desired_marked_chars, marked_chars;

  if(Marked_char_freq==0) {
    s->skip=0; s->chosen_char = 0;
    return 0;
  }
  /* ------ compute the desired number of marked chars ------ */
  desired_marked_chars =  (int) (s->text_size * Marked_char_freq);
  if(desired_marked_chars==0)
    desired_marked_chars=1;

  // ---- Count occurrences for each character
  ch_occ[s->alpha_size-1]= s->text_size-s->pfx_char_occ[s->alpha_size-1];
  for(i=0;i<s->alpha_size-1;i++)
    ch_occ[i]=s->pfx_char_occ[i+1]-s->pfx_char_occ[i];

  // ----- select best (char,skip) pair
  for(i=0, max=-1; i<s->alpha_size; i++){
    if(i==s->bwt[0]) continue;  // Exclude bwt-first-char (see below)
    /* --- determine the number of skipped char for i */
    if (ch_occ[i] > desired_marked_chars) {
      exponent = int_log2(ch_occ[i]/desired_marked_chars);
      assert(exponent > 0);
      skip = int_pow2(exponent);
    }
    else
      skip = 1;
    /* --- check if this is the best choice seen so far --- */
    rescaled = ch_occ[i] / skip;
    if(rescaled>max && rescaled <= desired_marked_chars) {
      max = rescaled;
      s->chosen_char = i;
      s->skip = skip;
    }
  }
  assert(max > 0);
  assert(s->skip>0);
  if(Verbose>1) {
    // map the chosen (remapped) char back to its ascii code
    for(i=0;i<256;i++)
      if(s->char_map[i]==s->chosen_char) break;
    fprintf(stderr,"Marked char is ascii %d; ", i);
    fprintf(stderr,"one occ every %d is marked; ",s->skip);
  }

  // ------- compute number of marked chars: ceil(chosen_occ / skip)
  chosen_occ = ch_occ[s->chosen_char];
  if(chosen_occ % s->skip)
    marked_chars = chosen_occ/s->skip + 1;
  else
    marked_chars = chosen_occ/s->skip;

  // -------- alloc s->loc_occ
  s->loc_occ = (int *) malloc(sizeof(int) * (marked_chars));
  if(s->loc_occ==NULL) {
    // the loop below writes through this pointer
    fprintf(stderr,"Out of memory -compute_locations-\n");
    exit(1);
  }

  // write the text location of the ROWS ending with ch
  for(i=1,j=0,count=0; i<s->text_size; i++) {
    // bwt[0] is not the marked char (see above)
    if (s->bwt[i] == s->chosen_char) {
      if ((count % s->skip) == 0) {
        // rows up to the eof position are shifted by one with
        // respect to the suffix array
        if (i <= s->bwt_eof_pos) {
          s->loc_occ[j] = (int) s->sa[i-1];
          assert(s->text[s->loc_occ[j]-1] == s->chosen_char);
        }
        else {
          s->loc_occ[j] = s->sa[i];
          assert(s->text[s->loc_occ[j]-1] == s->chosen_char);
        }
        j++;
      }
      count++;
    }
  }
  // j is the number of marked chars
  if(Verbose>1)
    fprintf(stderr,"%d chars marked.\n", j);
  assert(j == marked_chars);
  assert(count == chosen_occ);
  return j;
}
int load_index (char * filename, void ** index) { int error; fm_index *fmindex; fmindex = (fm_index *) malloc (sizeof (fm_index)); if (fmindex == NULL) return FM_OUTMEM; fmindex->compress_owner = 1; fmindex->owner = 0; /* * Load index file */ error = open_file (filename, &(fmindex->compress), &(fmindex->compress_size)); if (error) return error; error = fm_read_basic_prologue (fmindex); if (error) return error; if(fmindex->smalltext) { fmindex->skip = 0; if(fmindex->text_size<SMALLSMALLFILESIZE) { fmindex->text = fmindex->compress+4; *index = fmindex; return FM_OK; } fmindex->owner = 1; fmindex->smalltext = 2; error = fm_bwt_uncompress(fmindex); if (error < 0) return error; *index = fmindex; return FM_OK; } /* * init some var */ fmindex->int_dec_bits = int_log2 (int_log2 (fmindex->bucket_size_lev1 - fmindex->bucket_size_lev2)); if(fmindex->skip >1) { fmindex->occcharinf = fmindex->bwt_occ[fmindex->specialchar]; if(fmindex->specialchar==fmindex->alpha_size-1) fmindex->occcharsup = fmindex->text_size-1; else fmindex->occcharsup = fmindex->bwt_occ[fmindex->specialchar+1]; fmindex->num_marked_rows = fmindex->occcharsup-fmindex->occcharinf; } else fmindex->num_marked_rows = 0; fmindex->log2_row = int_log2(fmindex->text_size); fmindex->mtf_seq = (uchar *) malloc (fmindex->bucket_size_lev2 * sizeof (uchar)); if (fmindex->mtf_seq == NULL) return FM_OUTMEM; fmindex->var_byte_rappr = ((fmindex->log2textsize + 7) / 8)*8; *index = fmindex; return FM_OK; }
// True when shifting 1 left by int_log2(n) gives back n exactly.
bool int_is_pow2( int n )
{
    const int rebuilt = 1 << int_log2( n );
    return rebuilt == n;
}