/**
 * Writes one code word of `lengthToWrite` bits into `s`, starting at bit
 * offset `bitAt`, and advances `bitAt` by `lengthToWrite`.
 *
 * The bits are OR-ed in arithmetically (via +=), so the affected bits of the
 * target bytes are expected to be zero on entry.
 *
 * NOTE(review): the exact pattern depends on the global `powersOf2` table,
 * which is defined outside this view. If powersOf2[i] == 2^(i+1)-2 (with
 * powersOf2[8] == 0), the code word is (lengthToWrite-1) one bits followed
 * by a single zero bit - confirm against the table definition.
 *
 * @param s              target buffer
 * @param bitAt          in/out: bit position at which to write
 * @param lengthToWrite  number of bits in the code word
 */
void WriteBitsToString (_String&s, long& bitAt, char lengthToWrite)
{
    long byteIndex     = bitAt/8,
         bitsAvailable = 8-bitAt%8;   // unused bits left in the current byte

    if (bitsAvailable < lengthToWrite) {
        // The code word spills over: use up what is left of the current byte ...
        unsigned char head = (unsigned char)s[byteIndex];
        head += powersOf2[bitsAvailable-1]+1;
        s[byteIndex] = head;

        // ... then emit every byte that is covered completely ...
        for (char wholeBytes = (lengthToWrite-bitsAvailable-1)/8; wholeBytes; wholeBytes--) {
            s[++byteIndex] = 255;
        }

        // ... and finish with the partially used last byte.
        ++byteIndex;
        s[byteIndex] = 254-powersOf2[8-(lengthToWrite-bitsAvailable)%8];
    } else {
        // The whole code word fits into the current byte.
        unsigned char packed = s.getUChar(byteIndex);
        packed += powersOf2[bitsAvailable-1]-powersOf2[bitsAvailable-lengthToWrite];
        s[byteIndex] = packed;
    }

    bitAt += lengthToWrite;
}
/**
 * Attempts to compress this string with a frequency-ranked variable-length code.
 *
 * The symbols of the selected alphabet are ranked by how often they occur in
 * this string (most frequent first); the symbol of 0-based rank r receives a
 * code word of r+1 bits. The compressed image consists of a table holding one
 * 5-bit code length per alphabet symbol (in alphabet order), followed by the
 * code words written by WriteBitsToString, with the tail of the last byte padded.
 *
 * @param theAlpha  alphabet selector, passed to SelectAlpha and (on success) to SetFlag
 * @param doit      when false, only the predicted size ratio is returned and the
 *                  string is left untouched
 * @return 1 when nothing was attempted (alphabet longer than 31 symbols, or the
 *         string contains a symbol outside the alphabet and no fallback applies);
 *         otherwise the ratio compressed size / original size. The string is
 *         replaced by its compressed form only when that ratio is below 1.
 */
_Parameter _CString::FrequencyCompress(unsigned char theAlpha,bool doit)
{
    _String* theAlphabet = SelectAlpha (theAlpha);
    if (theAlphabet->sLength>31) {
        // a code length can be as large as the alphabet size and must fit
        // into a 5-bit table entry
        return 1.;
    }

    const long alphabetSize = theAlphabet->sLength;

    char codeLength[256];
    long freqs [256],
         maxOccurences[256],
         locationsOfMaxSymbols[256],
         j;

    // locationsOfMaxSymbols is cleared as well: the insertion step below
    // shifts entries before all of them have been assigned.
    for (j=0; j<256; j++) {
        freqs[j]                 = 0;
        codeLength[j]            = 0;
        maxOccurences[j]         = 0;
        locationsOfMaxSymbols[j] = 0;
    }

    // analyze the frequency distribution of the symbols in this string
    for (j=0; j<sLength; j++) {
        freqs[getUChar(j)]++;
    }

    // Flip the sign of the counters that belong to alphabet symbols; any
    // counter that is still positive afterwards is a symbol outside of the
    // alphabet. The *selected* alphabet must be indexed here: indexing
    // NuclAlphabet with the length of another alphabet marks the wrong
    // symbols and reads past the end of NuclAlphabet when it is shorter.
    for (j=0; j<alphabetSize; j++) {
        freqs[theAlphabet->getUChar(j)]*=-1;
    }

    bool hasForeignSymbol = false;
    for (j=0; j<256; j++) {
        if (freqs[j]>0) {
            hasForeignSymbol = true;
            break;
        }
        freqs[j]*=-1; // restore the real (non-negative) count
    }

    if (hasForeignSymbol) {
        // make sure that the alphabet is "large" enough for the nucleotide case
        if (theAlphabet == &NuclAlphabet) {
            return FrequencyCompress (FULLNUCLALPHABET, doit);
        }
        return 1;
    }

    // Rank the alphabet symbols by descending frequency (insertion sort).
    // ">=" means that of two equally frequent symbols the one appearing later
    // in the alphabet is ranked first.
    for (j=0; j<alphabetSize; j++) {
        const unsigned char symbol = theAlphabet->getUChar(j);
        const long          count  = freqs[symbol];

        for (long k = 0; k<alphabetSize; k++) {
            if (count>=maxOccurences[k]) {
                for (long l=alphabetSize-1; l>k; l--) {
                    maxOccurences[l]         = maxOccurences[l-1];
                    locationsOfMaxSymbols[l] = locationsOfMaxSymbols[l-1];
                }
                maxOccurences[k]         = count;
                locationsOfMaxSymbols[k] = symbol;
                break;
            }
        }
    }

    // Predicted bit length of the compressed string: the code length table
    // (5 bits per symbol, rounded up to whole bytes) ...
    long predictedBits = alphabetSize*5;
    predictedBits = 8*((predictedBits%8)?(predictedBits/8+1):predictedBits/8);

    // ... plus (code length * occurrences) for every symbol. The code length
    // table is filled in on the way.
    for (long k = 0; k<alphabetSize; k++) {
        const unsigned char symbol = theAlphabet->getUChar(k);
        for (long l=0; l<alphabetSize; l++) {
            if (symbol==locationsOfMaxSymbols[l]) {
                predictedBits     += (l+1)*freqs[symbol];
                codeLength[symbol] = l+1;
                break;
            }
        }
    }

    if (!doit) {
        return predictedBits/8.0/sLength;
    }

    // allocate the output string
    // NOTE(review): the "+=" updates below rely on the buffer starting out
    // zero-filled - presumably guaranteed by this _String constructor; confirm.
    _String result ((unsigned long)(predictedBits%8?predictedBits/8+1:predictedBits/8));

    long csize = 0, // current bit position in the target string
         t     = 0; // current byte position in the target string

    // first write out the encoding table: 5 bits of code length per symbol
    for (j=0; j<alphabetSize; j++, csize+=5, t = csize/8) {
        const char length   = codeLength[theAlphabet->getUChar(j)];
        const long leftover = 8-csize%8; // free bits in the current byte

        if (leftover>=5) {
            // the entry fits; left-align it inside the free bits
            unsigned char value = result[t];
            value += length << (leftover-5);
            result[t]=value;
        } else {
            // the entry straddles a byte boundary: high part here, low part
            // at the top of the next byte
            result[t]+=length/realPowersOf2[5-leftover];
            result[++t]=(length%realPowersOf2[5-leftover])*realPowersOf2[3+leftover];
        }
    }

    // The sequence data starts on the byte after the one t refers to.
    // NOTE(review): when 5*alphabetSize is a multiple of 8, t already refers
    // to the first unused byte here, so this skips one byte that the size
    // prediction above does not account for. Left unchanged because the
    // decoder (not visible here) presumably expects this layout - confirm.
    t++;
    t*=8; // from here on t is a bit position

    // now encode the actual sequence
    for (j=0; j<sLength; j++) {
        WriteBitsToString (result,t,codeLength[(unsigned char)sData[j]]);
    }

    // pad the rest of the last byte in the string by ones
    if (t%8) {
        unsigned char value = result [t/8];
        value += powersOf2[7-t%8]+1;
        result[t/8]=value;
    }

    // done - store the compression flags and replace the string with its
    // compressed form, but only if that actually saves space
    _Parameter factor = result.sLength/(_Parameter)sLength;
    if (factor<1) {
        DuplicateErasing(&result);
        SetFlag (FREQCOMPRESSION);
        SetFlag (theAlpha);
    }
    return factor;
}