/*
  Unpack one packed (compressed) record into 'to'.

  'from' points at 'reclength' bytes of packed data; each field is expanded
  in place by the per-field unpack function chosen when the pack header was
  read (see _nisam_read_pack_info / get_unpack_function).

  Returns 0 on success, -1 on a corrupt record (my_errno is set to
  HA_ERR_WRONG_IN_RECORD and the record is marked as not active).
*/

int _nisam_pack_rec_unpack(register N_INFO *info, register byte *to, byte *from, uint reclength)
{
  byte *end_field;
  reg3 N_RECINFO *end;
  N_RECINFO *current_field;
  ISAM_SHARE *share=info->s;
  DBUG_ENTER("_nisam_pack_rec_unpack");

  /* Feed the packed bytes into the shared bit reader */
  init_bit_buffer(&info->bit_buff,from,reclength);

  /* Unpack every field; 'to' advances by the field's unpacked length */
  for (current_field=share->rec, end=current_field+share->base.fields ;
       current_field < end ;
       current_field++,to=end_field)
  {
    end_field=to+current_field->base.length;
    (*current_field->unpack)(current_field,&info->bit_buff,(uchar*) to,
                             (uchar*) end_field);
  }
  /*
    Sanity check: with no decode error, the reader must have consumed
    exactly 'reclength' bytes (pos minus the bits still cached in the
    buffer must land on 'end').
  */
  if (! info->bit_buff.error &&
      info->bit_buff.pos - info->bit_buff.bits/8 == info->bit_buff.end)
    DBUG_RETURN(0);
  my_errno=HA_ERR_WRONG_IN_RECORD;
  info->update&= ~HA_STATE_AKTIV;   /* record contents are unusable */
  DBUG_RETURN(-1);
} /* _nisam_pack_rec_unpack */
/*
 * Flush the bit buffer to its output stream and reset it.
 *
 * A partially filled last byte is padded out (counted as a whole byte)
 * before writing.  The fwrite result is now checked — the original left
 * it unchecked (flagged "$$$ should check result"); a short write is
 * reported on stderr so corruption of the output is at least diagnosed.
 */
static void flush_bits (struct bit_buffer *buf)
{
  size_t written;

  if (buf->bit_idx != 8) {      /* pad the partially used final byte */
    buf->byte_idx++;
    buf->bit_idx = 8;
  }
  written = fwrite (& buf->data [0], 1, buf->byte_idx, buf->f);
  if (written != (size_t) buf->byte_idx) {
    /* NOTE(review): no error channel exists in this interface, so we can
       only report the failure; callers should eventually check ferror(). */
    fprintf (stderr, "flush_bits: short write (%lu of %lu bytes)\n",
             (unsigned long) written, (unsigned long) buf->byte_idx);
  }
  init_bit_buffer (buf);
}
/* **********************************************************
   Open 'filename' and write p[0] .. p[n-1] using log(n) bits
   per entry.
   Bug fix: the original called perror() when fopen failed but
   then FELL THROUGH and used the NULL FILE* (undefined
   behavior); we now return after reporting.  The fclose result
   is also checked so buffered-write failures are reported.
   ********************************************************** */
void write_sa(char *filename, int *p, int n)
{
  int int_log2(int);
  void init_bit_buffer(void);
  void fbit_write(FILE *,int,int), fbit_flush( FILE * );
  FILE *sa;
  Int32 psize, i;

  if(_ds_Verbose)
    fprintf(stderr,"Writing sa to file %s\n",filename);
  if((sa=fopen(filename,"wb"))==NULL) {
    perror(filename);
    return;                       /* was: fell through with sa==NULL */
  }
  init_bit_buffer();
  psize = int_log2(n);            /* bits per suffix-array pointer */
  for(i=0;i<n;i++)
    fbit_write(sa,psize,p[i]);
  fbit_flush(sa);                 /* flush any partially filled byte */
  if(fclose(sa)!=0)               /* fclose flushes; a failure here loses data */
    perror(filename);
}
/*
  Build (or load) the suffix array for s->text into s->sa.

  If a non-empty precomputed .sa file (Safile_name) exists, the suffix
  array is read from it, one int_log2(text_size)-bit pointer at a time;
  otherwise it is computed from scratch with larsson-sada or the 5n
  suffix sorter.

  Bug fix: when the .sa file existed but was empty (n==0), the opened
  FILE* was leaked — it is now closed before falling back to building
  the array from scratch.
*/
void build_sa(bwi_input *s)
{
  int scmp3(unsigned char *p, unsigned char *q, int maxl);
  void init_bit_buffer(void);
  int fbit_read(FILE *,int);
  int *larsson_sada_sufsort(uchar *, int, int);
  int *suffixsort5n(uchar *, int);
  void out_of_mem(char *s);
  int int_log2(int);
  int i, n, pointer_size,q,r,sa_size;
  FILE *safile;

  /* ------------ check sa file ---------------- */
  n=0;
  safile = fopen(Safile_name,"rb");
  if(safile!=NULL) {
    fseek(safile,0L,SEEK_END);
    n=ftell(safile);                  /* n = size of the .sa file in bytes */
  }

  if (n==0) {    // ------- build sa using larsson-sada or 5n
    if(safile!=NULL)
      fclose(safile);                 /* fix: don't leak an empty .sa file */
    if(Verbose) fprintf(stderr, " from scratch ");
    if(Use_larsson_sada) {
      if(Verbose) fprintf(stderr, "(using ls) ... ");
      s->sa = larsson_sada_sufsort(s->text,s->text_size,s->alpha_size);
    }
    else {
      if(Verbose) fprintf(stderr, "(using 5n) ... ");
      s->sa = suffixsort5n(s->text,s->text_size);
    }
  }
  else {        // ------ read sa from file --------
    pointer_size = int_log2(s->text_size);   /* bits per stored pointer */
    // --- compute sa_size = (s->text_size * pointer_size + 7)/8
    // --- use q and r to avoid overflow
    q = s->text_size/8;
    r = s->text_size % 8;
    sa_size = (q*pointer_size) + (r*pointer_size+7)/8;
    if (n != sa_size)
      fatal_error("Invalid .sa file\n");
    if(Verbose) fprintf(stderr, " by reading it from file... ");
    // allocate space for the suffix array
    s->sa = (int *) malloc(s->text_size * sizeof(int));
    if(s->sa==NULL) out_of_mem("build_sa");
    rewind(safile);
    init_bit_buffer();
    for(i=0; i<s->text_size; i++)  // read one suffix-array pointer at a time
      s->sa[i] = fbit_read(safile,pointer_size);
    fclose(safile);
  }

  // check the suffix array (disabled: O(n^2) in the worst case)
#if 0
  for (i=0; i<s->text_size-1; ++i)
    if (scmp3(s->text+s->sa[i], s->text+s->sa[i+1],
              MIN(s->text_size-s->sa[i], s->text_size-s->sa[i+1]))>=0) {
      fprintf(stderr, "Suffix array check failed at position %d\n", i);
      exit(1);
    }
#endif
}
/* ********************************************************* The current format of the prologue is the following: 8 bits type of compression (2=Hier, 4=Multi Table Huff) 1 int size of input file 1 int position of eof in s->bw 1 uint16 size of a super bucket divided by 1024 1 uchar size of a bucket divided by 1024 (divides the previous) 1 uchar size-1 mtf_list stored in each bucket 1 uchar size-1 of the compacted alphabet of the text 1 uchar remapped char selected for occurrence list 1 int # skipped occ of chosen_char in bwt 1 int starting byte of occ-explicit list 256 bits boolean map of chars in the text (S = # of 1) S int prefix sum of character occurrences for each superbucket S' bytes map of compact_alph chars occurring in THIS superbucket (S' = (S+7)/8 -- it is byte aligned) ****FLUSH**** S int # occ of all compact_alphabet chars in prev superbuckets finally: NB x L starting position of each bucket in the compressed file NB is the number of buckets and L is the number of bits sufficient to represent that length (byte_aligned) -------- Body of the compressed file [byte-aligned] ------------- for each bucket (let R be the # of distinct chars in the superbucket) R 7x8val # of occ of each char in the previous buckets of the same superbucket. Each value is represented with the 7x8 encoding. This information is missing for the first bucket of each superbucket R bits map of chars appearing in this bucket. Let R' be the # of distinct chars in this bucket and L' the # of bits required to represent R' L' x M bits Initial move to front list for this bucket M = min(R',Mtf_save) ... bits needed to byte-align in case ONLY of Arith-coding ??? bits compressed bucket in mtf + rle + [Ari|Hier|Una] format --- bits ****FLUSH**** to have byte alignment -------- Body of the occ explicit list [byte-aligned] ------------- --------------------------------------------------------------------- --- URL: we have occ for text positions and rows --- ... 
   L bits       list of positions where character ch occurs in the
                original text.  ch = character that occurs close to
                Marked_char_freq times in the text.
   **************************************************************** */

/*
  Write the compressed-file prologue described in the comment above.
  NOTE: the exact order and width of every bit_write/uint_write below
  IS the on-disk format — do not reorder.
  Space for the bucket start positions is only reserved here (zeros);
  the real values are patched in later (presumably by write_susp_infos
  — TODO confirm).
*/
void write_prologue(bwi_input *s)
{
  void init_bit_buffer(void);
  int int_log2(int);
  void uint_write(int);
  void bit_write(int,int);
  void bit_flush(void);
  void write7x8(int);
  bucket_lev1 sb;
  int i,len,k;

  /* ----- write file and bucket size ------ */
  init_bit_buffer();
  bit_write(8,Type_compression);          /* 2=Hier, 4=Multi Table Huff */
  uint_write(s->text_size);
  uint_write(s->bwt_eof_pos);
  /* bucket sizes are stored divided by 1024, so they must be multiples
     of 1K and fit the 16/8-bit fields */
  assert(Bucket_size_lev1>>10<65536);
  assert((Bucket_size_lev1 & 0x3ff) == 0);
  bit_write(16,Bucket_size_lev1>>10);
  assert(Bucket_size_lev2>>10<256);
  assert((Bucket_size_lev2 & 0x3ff) == 0);
  bit_write(8,Bucket_size_lev2>>10);

  // ---- mtf and alphabet information (both stored as size-1)
  assert(Mtf_save>0 && Mtf_save<=256);
  bit_write(8,Mtf_save-1);
  assert(s->alpha_size>0 && s->alpha_size<=256);
  bit_write(8,s->alpha_size-1);

  // ---- write chosen_char & starting byte of occ-list
  bit_write(8,s->chosen_char);
  uint_write(s->skip);
  uint_write(0);                  /* placeholder for the occ-list start byte */

  // ---- boolean alphabet char map (256 bits)
  for(i=0;i<256;i++)
    if(s->bool_char_map[i]) bit_write(1,1);
    else bit_write(1,0);

  // ---- write prefix sum of char occ
  for(i=0; i<s->alpha_size; i++)
    uint_write(s->pfx_char_occ[i]);

  // ----- process superbuckets
  for(i=0;i<Num_bucs_lev1;i++) {
    sb = s->buclist_lev1[i];
    for(k=0;k<s->alpha_size;k++)       // boolean char_map
      if(sb.bool_char_map[k]) bit_write(1,1);
      else bit_write(1,0);
    bit_flush();     // we keep everything byte aligned
    if(i>0)          // write prefix-occ (absent for the first superbucket)
      for(k=0;k<s->alpha_size;k++)
        uint_write(sb.occ[k]);
  }

  // ----- leave space for storing the start positions of buckets
  len = (int_log2(s->text_size)+7)/8;  // it's byte-aligned
  for(i=0;i<Num_bucs_lev2;i++)
    bit_write(len * 8,0);
}
/* ************************************************************
 *
 *  Main compression routine.
 *
 *  Pipeline: read text -> remap alphabet -> suffix array -> BWT
 *  -> mark chars / compute locations -> per-(super)bucket stats
 *  -> write prologue + compressed buckets -> patch bucket start
 *  positions -> append the marked-position (or rank) list.
 *  The exact write/seek order defines the output file layout.
 *
 ********************************************************** */
void compress_file(void)
{
  void read_text(FILE *, bwi_input *s);
  void remap_alphabet(bwi_input *s);
  void build_sa(bwi_input *s);
  void compute_bwt(bwi_input *s);
  void compute_info_superbuckets(bwi_input *s);
  void compute_info_buckets(bwi_input *s);
  void write_prologue(bwi_input *s);
  void compress_superbucket(bwi_input *s, int);
  int compute_locations(bwi_input *s);
  int compute_locations_dict(bwi_input *s, int*);
  int compute_ranks_dict(bwi_input *s, int*);
  int compute_locations_huffword(bwi_input *s, int *);
  void bit_flush( void );
  void bit_write(int,int);
  void init_bit_buffer(void);
  void write_susp_infos(bwi_input *s);
  bwi_input s;
  int i,len, retr_occ, retr_occ2, loc_occ_range;
  int Start_prologue_ranks;

  /* --------- Load the text file from disk ------- */
  if(Verbose) fprintf(stderr,"Reading input file... ");
  read_text(Infile, &s);
  if(Verbose) fprintf(stderr,"done! (%f seconds)\n",getTime());

  /* --------- Compact alphabet ------- */
  if(Verbose>1) fprintf(stderr,"Remapping alphabet... ");
  remap_alphabet(&s);
  if(Verbose>1) fprintf(stderr,"done! (%f seconds). ",getTime());
  if(Verbose>1) fprintf(stderr,"Compact alphabet size = %d\n",s.alpha_size);

  /* --------- Build suffix array ------- */
  if(Verbose) fprintf(stderr,"Building suffix array");
  build_sa(&s);
  if(Verbose) fprintf(stderr,"done! (%f seconds)\n",getTime());

  /* --------- Compute BWT ------- */
  if(Verbose>1) fprintf(stderr,"Computing BWT... ");
  compute_bwt(&s);
  if(Verbose>1) fprintf(stderr,"done! (%f seconds)\n",getTime());

  /* ------- mark chars and compute locations -----
     the input flavor decides what gets sampled (ranks vs positions) */
  if (Is_dictionary)
    retr_occ = compute_locations_dict(&s,&loc_occ_range);    // dictionary
  else if (Is_huffword)
    retr_occ = compute_locations_huffword(&s,&loc_occ_range);// huffword
  else if (Is_URL)
    retr_occ = compute_ranks_dict(&s,&loc_occ_range);        // URL
  else
    retr_occ = compute_locations(&s);                        // standard

  /* --------- Compute various infos for each superbucket ------- */
  if(Verbose>1) fprintf(stderr,"Computing infos superbukets... ");
  compute_info_superbuckets(&s);
  if(Verbose>1) fprintf(stderr,"done! (%f seconds)\n", getTime());

  /* --------- Compute various infos for each bucket ------- */
  if(Verbose>1) fprintf(stderr,"Computing infos buckets... ");
  compute_info_buckets(&s);
  if(Verbose>1) fprintf(stderr,"done! (%f seconds)\n", getTime());

  /* --------- Writing the compressed file ------- */
  Infile_size = s.text_size;
  Outfile_size=0;                  /* running byte count of output written */
  write_prologue(&s);
  if(Verbose) fprintf(stderr,"Prologue --> %d bytes!\n",Outfile_size);
  for(i=0;i<Num_bucs_lev1;i++)
    compress_superbucket(&s,i);

  /* ---- keep starting positions of occ-explicit list ---- */
  Start_prologue_occ = Outfile_size;

  /* -- write the starting position of buckets -- */
  write_susp_infos(&s);

  /* move back to where the occ-explicit list starts */
  if (fseek(Outfile,Start_prologue_occ,SEEK_SET)) {
    fprintf(stderr, "Seek error on output file -compress_file-\n");
    exit(1);
  }

  /* -- write the position of the marked chars ---- */
  init_bit_buffer();
  if(Is_dictionary || Is_huffword || Is_URL)
    len = int_log2(loc_occ_range);  // bits required for each rank
  else
    len = int_log2(s.text_size);    // bits required for each pos
  for(i=0; i < retr_occ; i++)
    bit_write(len,s.loc_occ[i]);
  bit_flush();
  Start_prologue_ranks = (int)ftell(Outfile);
  if(Verbose)
    fprintf(stderr,"List of %d marked ranks --> %d bytes!\n",
            retr_occ,Start_prologue_ranks-Start_prologue_occ);

  /* -- in the case of URL we also store the DICT info -- */
  /* It should be put together with the computation above --*/
  /* Thus removing these differences in the code --*/
  /* Hence Start_prologue_occ indicates the starting position of RANKS. */
  /* After retr_occ RANKS start the LOCATIONS, which are again retr_occ */
  /* in number. The value of retr_occ can be computed at decompression time */
  /* by using the same formula adopted in compute_ranks_dict() */
  if (Is_URL) {
    retr_occ2 = compute_locations_dict(&s,&loc_occ_range); // DICT
    if (retr_occ != retr_occ2)
      out_of_mem("Unequal number of sampled NULLs\n");
    for(i=0; i < retr_occ; i++)
      bit_write(len,s.loc_occ[i]);
    bit_flush();
    if(Verbose)
      fprintf(stderr,"List of %d marked locations --> %d bytes!\n",
              retr_occ2,(int)ftell(Outfile) - Start_prologue_ranks);
  }
}
/* **********************************************************************
   Compress and write to file a bucket of length "len" starting at in[0].
   The compression is done as follows: first the characters are remapped
   (we expect only a few distinct chars in a single bucket), then we use
   mtf (with a list of size Mtf_save), then we rle and compress using a
   unary code.
   NOTE: in[] is remapped IN PLACE to the bucket-local alphabet.
   ********************************************************************** */
void compress_bucket(uchar *in, int len, int alpha_size)
{
  int int_log2(int);
  void init_bit_buffer(void);
  void bit_write(int,int);
  void bit_flush( void );
  void out_of_mem(char *);
  int mtf_string(uchar *, uchar *, uchar *, int);
  void rle_hierarchical(uchar *, int, int);
  void multihuf_compr(uchar *, int, int);
  int k,j,bits_x_char,local_alpha_size,mtf_len;
  uchar c,mtf[256],local_bool_map[256], local_map[256];
  uchar *mtf_seq;

  /* ---------- init ------------ */
  init_bit_buffer();

  /* ---------- compute and write local boolean map ------ */
  for(k=0;k<alpha_size;k++)
    local_bool_map[k]=local_map[k]=0;
  local_alpha_size=0;
  for(j=0;j<len;j++) {          // compute local boolean map
    c=in[j];                    // remapped char
    assert(c<alpha_size);
    local_bool_map[c]=1;
  }
  for(k=0;k<alpha_size;k++)     // compute local map
    if(local_bool_map[k])
      local_map[k]=local_alpha_size++;
  for(j=0;j<len;j++)            // remap bucket
    in[j]=local_map[in[j]];
  for(k=0;k<alpha_size;k++)     // write bool char map to file
    if(local_bool_map[k]) bit_write(1,1);
    else bit_write(1,0);

  /* ----------- compute and write mtf picture ------------- */
  mtf_seq = (uchar *) malloc(2*len*sizeof(uchar)); // mtf temporary buffer
  if(mtf_seq==NULL) out_of_mem("compress_bucket (mtf_seq)");
  mtf_len = mtf_string(in,mtf_seq,mtf,len); // mtf_seq=mtf(in), init mtf-list

  bits_x_char = int_log2(local_alpha_size); // write mtf to file
  for(k=0;k<MIN(Mtf_save,local_alpha_size);k++) {
    bit_write(bits_x_char,mtf[k]);
  }

  // -- Applies the proper compression routine --
  switch (Type_compression) {
  case ARITH:     // ---- Arithmetic compression of the bucket -----
    fatal_error("Arithmetic coding no longer available -compress_bucket-\n");
    exit(1);
  case HIER3:     // ---- three-leveled model: Fenwick's proposal -----
    rle_hierarchical(mtf_seq, mtf_len,local_alpha_size);
    break;
  case UNARY:     // ---- Unary compression of mtf-ranks with escape -----
    fatal_error("Unary coding no longer available -compress_bucket-\n");
    exit(1);
  case MULTIH:    // ---- RLE + MultiHuffman compression of the bucket -----
    multihuf_compr(mtf_seq,mtf_len,local_alpha_size);
    break;
  default:
    fprintf(stderr,"\n Compression algorithm unknown! ");
    fprintf(stderr,"-compress_bucket-\n");  /* fix: said -compress_superbucket- */
    exit(1);
  }
  bit_flush();       // Byte-align the next compressed bucket
  free(mtf_seq);
}
/*
  Read the pack (compression) header of a packed ISAM data file and set
  up everything needed to unpack records: per-field unpack functions,
  Huffman decode trees and their quick-lookup tables.

  fix_keys != 0 also adjusts key lengths for a changed record-reference
  length.

  Returns 0 on success, 1 on error (my_errno is set; any allocations
  made here are freed on the error paths).
*/
my_bool _nisam_read_pack_info(N_INFO *info, pbool fix_keys)
{
  File file;
  int diff_length;
  uint i,trees,huff_tree_bits,rec_reflength,length;
  uint16 *decode_table,*tmp_buff;
  ulong elements,intervall_length;
  char *disk_cache,*intervall_buff;
  uchar header[32];
  ISAM_SHARE *share=info->s;
  BIT_BUFF bit_buff;
  DBUG_ENTER("_nisam_read_pack_info");

  /* Clamp the quick-table size to a sane range */
  if (nisam_quick_table_bits < 4)
    nisam_quick_table_bits=4;
  else if (nisam_quick_table_bits > MAX_QUICK_TABLE_BITS)
    nisam_quick_table_bits=MAX_QUICK_TABLE_BITS;

  file=info->dfile;
  my_errno=0;
  if (my_read(file,(byte*) header,sizeof(header),MYF(MY_NABP)))
  {
    if (!my_errno)
      my_errno=HA_ERR_END_OF_FILE;
    DBUG_RETURN(1);
  }
  /* Verify the pack-file magic before trusting any header field */
  if (memcmp((byte*) header,(byte*) nisam_pack_file_magic,4))
  {
    my_errno=HA_ERR_WRONG_IN_RECORD;
    DBUG_RETURN(1);
  }
  /* Fixed-offset header fields (little-endian korr macros) */
  share->pack.header_length=uint4korr(header+4);
  share->min_pack_length=(uint) uint4korr(header+8);
  share->max_pack_length=(uint) uint4korr(header+12);
  set_if_bigger(share->base.pack_reclength,share->max_pack_length);
  elements=uint4korr(header+16);          /* total Huffman-tree elements */
  intervall_length=uint4korr(header+20);  /* bytes of interval data */
  trees=uint2korr(header+24);             /* number of Huffman trees */
  share->pack.ref_length=header[26];
  rec_reflength=header[27];
  diff_length=(int) rec_reflength - (int) share->base.rec_reflength;
  if (fix_keys)
    share->rec_reflength=rec_reflength;
  share->base.min_block_length=share->min_pack_length+share->pack.ref_length;

  /* One allocation holds the decode trees followed by the interval buffer */
  if (!(share->decode_trees=(DECODE_TREE*)
        my_malloc((uint) (trees*sizeof(DECODE_TREE)+
                          intervall_length*sizeof(byte)),
                  MYF(MY_WME))))
    DBUG_RETURN(1);
  intervall_buff=(byte*) (share->decode_trees+trees);

  /*
    One allocation holds: decode tables (length entries), a 512-entry
    scratch buffer (tmp_buff), then a cache for the rest of the on-disk
    header (disk_cache).
  */
  length=(uint) (elements*2+trees*(1 << nisam_quick_table_bits));
  if (!(share->decode_tables=(uint16*)
        my_malloc((length+512)*sizeof(uint16)+
                  (uint) (share->pack.header_length+7),
                  MYF(MY_WME | MY_ZEROFILL))))
  {
    my_free((gptr) share->decode_trees,MYF(0));
    DBUG_RETURN(1);
  }
  tmp_buff=share->decode_tables+length;
  disk_cache=(byte*) (tmp_buff+512);

  /* Slurp the remainder of the pack header into memory */
  if (my_read(file,disk_cache,
              (uint) (share->pack.header_length-sizeof(header)),
              MYF(MY_NABP)))
  {
    my_free((gptr) share->decode_trees,MYF(0));
    my_free((gptr) share->decode_tables,MYF(0));
    DBUG_RETURN(1);
  }

  /* Bits needed to encode a tree index (0 trees -> 0 bits) */
  huff_tree_bits=max_bit(trees ? trees-1 : 0);
  init_bit_buffer(&bit_buff,disk_cache,
                  (uint) (share->pack.header_length-sizeof(header)));
  /* Read new info for each field */
  for (i=0 ; i < share->base.fields ; i++)
  {
    share->rec[i].base_type=(enum en_fieldtype) get_bits(&bit_buff,4);
    share->rec[i].pack_type=(uint) get_bits(&bit_buff,4);
    share->rec[i].space_length_bits=get_bits(&bit_buff,4);
    share->rec[i].huff_tree=share->decode_trees+(uint) get_bits(&bit_buff,
                                                                huff_tree_bits);
    share->rec[i].unpack=get_unpack_function(share->rec+i);
  }
  skipp_to_next_byte(&bit_buff);
  /* Read all Huffman trees; decode_table advances past each one */
  decode_table=share->decode_tables;
  for (i=0 ; i < trees ; i++)
    read_huff_table(&bit_buff,share->decode_trees+i,&decode_table,
                    &intervall_buff,tmp_buff);
  /* Shrink the tables to what was actually used ... */
  decode_table=(uint16*)
    my_realloc((gptr) share->decode_tables,
               (uint) ((byte*) decode_table - (byte*) share->decode_tables),
               MYF(MY_HOLD_ON_ERROR));
  {
    /* ... and relocate every tree's table pointer if realloc moved it */
    my_ptrdiff_t diff=PTR_BYTE_DIFF(decode_table,share->decode_tables);
    share->decode_tables=decode_table;
    for (i=0 ; i < trees ; i++)
      share->decode_trees[i].table=ADD_TO_PTR(share->decode_trees[i].table,
                                              diff, uint16*);
  }

  /* Fix record-ref-length for keys */
  if (fix_keys)
  {
    for (i=0 ; i < share->base.keys ; i++)
    {
      share->keyinfo[i].base.keylength+=(uint16) diff_length;
      share->keyinfo[i].base.minlength+=(uint16) diff_length;
      share->keyinfo[i].base.maxlength+=(uint16) diff_length;
      share->keyinfo[i].seg[share->keyinfo[i].base.keysegs].base.length=
        (uint16) rec_reflength;
    }
  }
  /* The bit reader must have consumed the whole header exactly */
  if (bit_buff.error || bit_buff.pos < bit_buff.end)
  {                                     /* info_length was wrong */
    my_errno=HA_ERR_WRONG_IN_RECORD;
    my_free((gptr) share->decode_trees,MYF(0));
    my_free((gptr) share->decode_tables,MYF(0));
    DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}