示例#1
0
// Writes one code word of lengthToWrite bits into the bit stream stored in s.
//
//   s             - byte buffer receiving the bits; the bytes touched are updated in place
//   bitAt         - bit offset of the write; advanced by lengthToWrite before returning
//   lengthToWrite - width of the code word in bits
//
// The actual bit pattern comes from the external powersOf2 table.
// NOTE(review): the arithmetic looks like it emits (lengthToWrite-1) one-bits followed by a
// single zero-bit, which would need powersOf2[k] == 2^(k+1)-2 for k<8 and powersOf2[8] == 0;
// the table is not visible from here - confirm.
void    WriteBitsToString (_String&s, long& bitAt, char lengthToWrite)
{
    long       byteIndex = bitAt/8;
    const long bitsFree  = 8-bitAt%8;   // bits of the byte at byteIndex not yet consumed

    if (lengthToWrite <= bitsFree) {
        // the whole code word fits into the current byte
        unsigned char updated = s.getUChar(byteIndex);
        updated += powersOf2[bitsFree-1]-powersOf2[bitsFree-lengthToWrite];
        s[byteIndex] = updated;
    } else {
        // the code word spills over: use up the rest of the current byte first ...
        unsigned char updated = (unsigned char)s[byteIndex];
        updated += powersOf2[bitsFree-1]+1;
        s[byteIndex] = updated;

        // ... then every byte that lies completely inside the code word ...
        for (char wholeBytes = (lengthToWrite-bitsFree-1)/8; wholeBytes; wholeBytes--) {
            s[++byteIndex] = 255;
        }

        // ... and finally the byte holding the tail of the code word
        s[++byteIndex] = 254-powersOf2[8-(lengthToWrite-bitsFree)%8];
    }

    bitAt += lengthToWrite;
}
示例#2
0
// Frequency-based compression of this string over the alphabet selected by theAlpha.
//
// The alphabet symbols are ranked by how often they occur in the string; the symbol
// with rank r (0 = most frequent) is assigned a code of r+1 bits, which is emitted
// through WriteBitsToString. The output consists of
//   1. a table holding one 5-bit code length per alphabet symbol, in alphabet order;
//   2. the encoded symbols, starting at byte (5 * alphabet length)/8 + 1;
//   3. filler written into the unused bits of the last byte.
//
//   theAlpha - alphabet selector handed to SelectAlpha; also stored via SetFlag on success
//   doit     - when false nothing is modified and only the predicted ratio is returned
//
// Returns
//   - 1 if the alphabet has more than 31 symbols, or if the string contains a symbol
//     that is not in the alphabet (when the alphabet is NuclAlphabet the call is first
//     retried with FULLNUCLALPHABET);
//   - predicted compressed bytes / sLength when doit is false;
//   - actual compressed bytes / sLength otherwise. The string is replaced by the
//     compressed form (and the flags are set) only when this ratio is below 1.
//
// NOTE(review): sLength == 0 leads to a floating point division by zero in both
// ratio computations - confirm callers never pass an empty string.
_Parameter      _CString::FrequencyCompress(unsigned char theAlpha,bool doit)
{

    _String* theAlphabet = SelectAlpha (theAlpha);

    if (theAlphabet->sLength>31) {
        return 1.;    // alphabet is too large: code lengths are written as 5-bit fields below
    }
    char codeLength[256];
    long freqs [256],j,t;
    long maxOccurences[256], locationsOfMaxSymbols[256] ;

    // start from a clean slate; locationsOfMaxSymbols is cleared as well because the
    // ranking loop below shifts its entries before all of them have been assigned
    for (j=0; j<256; j++) {
        freqs[j]                 = 0;
        codeLength[j]            = 0;
        maxOccurences[j]         = 0;
        locationsOfMaxSymbols[j] = 0;
    }

    // count the occurrences of every byte value in the string
    for (j=0; j<sLength; j++) {
        freqs[getUChar(j)]++;
    }

    // temporarily flip the sign of the counts of alphabet symbols, so that a strictly
    // positive count identifies a symbol that is NOT part of the selected alphabet
    for (j=0; j<theAlphabet->sLength; j++) {
        freqs[theAlphabet->getUChar(j)]*=-1;
    }

    // make sure that the alphabet is "large" enough for the nucleotide case
    // NEW 03/29/98
    t = 0; // becomes 1 when a symbol outside of the alphabet is found
    for (j=0; j<256; j++)
        if (freqs[j]>0) {
            t = 1;
            break;
        } else {
            freqs[j]*=-1; // restore the original (non-negative) count
        }
    if (t) {
        if (theAlphabet == &NuclAlphabet) {
            // retry with the extended nucleotide alphabet
            return FrequencyCompress (FULLNUCLALPHABET, doit);
        } else {
            return 1;
        }
    }



    // now build the prefix code for the alphabet
    // first rank all alphabet symbols by decreasing frequency (insertion sort);
    // because of '>=', among equally frequent symbols the one appearing later
    // in the alphabet is ranked first

    for (j=0; j<(*theAlphabet).sLength; j++) {
        for (long k = 0; k<(*theAlphabet).sLength; k++)
            if (freqs[theAlphabet->getUChar(j)]>=maxOccurences[k]) {
                // make room at rank k by shifting the lower ranks down by one
                for (long l=(*theAlphabet).sLength-1; l>=k+1; l--) {
                    maxOccurences[l]=maxOccurences[l-1];
                    locationsOfMaxSymbols[l]=locationsOfMaxSymbols[l-1];
                }
                maxOccurences[k]=freqs[theAlphabet->getUChar(j)];
                locationsOfMaxSymbols[k]=(*theAlphabet)[j];
                break;
            }
    }


    // compute efficiency
    // j will store the predicted bit length of the compressed string

    // translation table size: 5 bits per symbol. The encoder below always starts the
    // data at byte (table bits)/8 + 1, so reserve exactly that many bytes; rounding
    // up instead would come out one byte short whenever the table ends on a byte boundary
    j = 8*(((*theAlphabet).sLength*5)/8+1);

    // we are also ready to build the code table

    for (long k = 0; k<(*theAlphabet).sLength; k++) {
        long l;
        for (l=0; l<(*theAlphabet).sLength; l++)
            if ((*theAlphabet)[k]==locationsOfMaxSymbols[l]) {
                // the symbol with rank l costs l+1 bits per occurrence
                j+=(l+1)*freqs[theAlphabet->getUChar(k)];
                codeLength [locationsOfMaxSymbols[l]] = l+1;
                break;
            }
    }

//  if (j>Length()*8) return 1;
// no compression could be performed
    if (!doit) {
        return j/8.0/sLength; // predicted bytes per original character
    }

    // allocate output string
    // NOTE(review): the code below adds to the bytes of result, so it relies on this
    // constructor zero-filling the buffer - confirm
    _String result ((unsigned long)(j%8?j/8+1:j/8));


// let's roll!!
    long csize = 0; //will indicate the current bit position in the target string
    t = 0; // current position in the string
    //first we must write out the encoding table as 5 bits of length per each

    for (j=0; j<(*theAlphabet).sLength; j++, csize+=5, t = csize/8) {
        long leftover = 8-csize%8; // bits still free in the byte at index t
        if (leftover>=5) {
            // the 5-bit field fits into the current byte: shift it so that it
            // occupies the topmost free bits
            unsigned char value = result[t];
            switch (leftover) {
            case 5:
                value+=codeLength[theAlphabet->getUChar(j)];
                break;
            case 6:
                value+=codeLength[theAlphabet->getUChar(j)]*2;
                break;
            case 7:
                value+=codeLength[theAlphabet->getUChar(j)]*4;
                break;
            default:
                value+=codeLength[theAlphabet->getUChar(j)]*8;
            }
            result[t]=value;
        } else {
            // the field straddles two bytes: quotient goes into the current byte,
            // remainder into the next one
            // NOTE(review): presumably realPowersOf2[k] == 2^k - table not visible here
            result[t]+=codeLength[theAlphabet->getUChar(j)]/realPowersOf2[5-leftover];
            result[++t]=(codeLength[theAlphabet->getUChar(j)]%realPowersOf2[5-leftover])*realPowersOf2[3+leftover];
        }
    }



    //  result[++t]=0;
    // mark the end of tabular encoding
    t++;
    // now encode the actual sequence; from here on t is a bit offset
    t*=8;
    //t+=8;

    for (j=0; j<sLength; j++) {
        WriteBitsToString (result,t,codeLength[(unsigned char)sData[j]]);
    }

    // pad the rest of the last byte in the string by ones

    if (t%8) {
        unsigned char value = result [t/8];
        value += powersOf2[7-t%8]+1;
        result[t/8]=value;
    }

    // yahoo! we are done - store compression flag and replace the string with compressed string
    _Parameter factor = result.sLength/(_Parameter)sLength;
    if (factor<1) { // compression took place
        DuplicateErasing(&result);
        SetFlag( FREQCOMPRESSION);
        SetFlag (theAlpha);
    }
    return factor;

}