示例#1
0
/*
 * Compress the NUL-terminated message `in` and copy the resulting bit stream
 * into `out`.
 *
 *   in     - message passed to stats3_compress_bits()
 *   inlen  - used here only to size the temporary range coder (inlen*2)
 *   out    - destination for the compressed bytes
 *   outlen - receives the number of bytes copied to `out`
 *
 * Always returns 0.
 */
int stats3_compress(unsigned char *in,int inlen,unsigned char *out, int *outlen)
{
  range_coder *coder=range_new_coder(inlen*2);

  stats3_compress_bits(coder,in);
  range_conclude(coder);

  /* Whole bytes, plus one more if there is a partially filled byte. */
  int nbytes=(coder->bits_used>>3)+((coder->bits_used&7)?1:0);
  *outlen=nbytes;
  bcopy(coder->bit_stream,out,nbytes);

  range_coder_free(coder);
  return 0;
}
示例#2
0
/*
 * Decompress `inlen` bytes from `in` into `out`.
 *
 *   in     - compressed byte stream
 *   inlen  - number of bytes in `in`
 *   out    - passed through to stats3_decompress_bits() as the output buffer
 *   outlen - passed through to stats3_decompress_bits()
 *
 * Always returns 0.
 */
int stats3_decompress(unsigned char *in,int inlen,unsigned char *out, int *outlen)
{
  range_coder *decoder=range_new_coder(inlen);

  /* Load the compressed bytes into the coder; the stream length is in bits. */
  bcopy(in,decoder->bit_stream,inlen);
  decoder->bit_stream_length=inlen*8;

  /* Start reading at the first bit with the full 32-bit interval. */
  decoder->bits_used=0;
  decoder->high=0xffffffff;
  decoder->low=0;
  range_decode_prefetch(decoder);

  stats3_decompress_bits(decoder,out,outlen);

  range_coder_free(decoder);
  return 0;
}
示例#3
0
/* Emit the complete packed-ASCII form of message m into coder pc: the two
   model-selection symbols, the message length, the packed characters, and
   finally the range coder conclusion. */
static void stats3_emit_packed_ascii(range_coder *pc,unsigned char *m)
{
  range_encode_equiprobable(pc,2,1); // not raw ASCII
  range_encode_symbol(pc,&probPackedASCII,2,0); // is packed ASCII
  encodeLength(pc,strlen((char *)m));
  encodePackedASCII(pc,(char *)m);
  range_conclude(pc);
}

/*
 * Encode the NUL-terminated message m into range coder c, choosing the
 * shortest of three forms:
 *
 *   1. the statistical model (length, non-alpha characters, lower-cased
 *      letters/spaces, then case information);
 *   2. packed ASCII, tried only when the message has no non-alpha characters
 *      and the model used at least 7 bits per character;
 *   3. the raw 8-bit bytes of the message, used only when the chosen form
 *      takes at least 8 bits per character and the top bit of the first
 *      byte is clear.
 *
 * The entropy spent on each stage of the statistical model is added to the
 * file-level total_*_bits counters.  Always returns 0.
 */
int stats3_compress_bits(range_coder *c,unsigned char *m)
{
  unsigned char lettersOnly[1024]; // message with all non alpha/spaces removed
  unsigned char lowered[1024];     // lettersOnly folded to lower-case

  /* Model selector: statistical model rather than raw or packed ASCII. */
  range_encode_equiprobable(c,2,1); // not raw ASCII
  range_encode_symbol(c,&probPackedASCII,2,1); // not packed ASCII
  total_model_bits+=c->entropy;
  double mark=c->entropy;   // entropy at the end of the previous stage

  /* Message length. */
  encodeLength(c,strlen((char *)m));
  total_length_bits+=c->entropy-mark;
  mark=c->entropy;

  /* Non-alpha characters, then strip them out for the later stages. */
  encodeNonAlpha(c,m);
  stripNonAlpha(m,lettersOnly);
  int nonAlphaChars=strlen((char *)m)-strlen((char *)lettersOnly);
  total_nonalpha_bits+=c->entropy-mark;
  mark=c->entropy;

  /* Lower-cased letters and spaces. */
  stripCase(lettersOnly,lowered);
  encodeLCAlphaSpace(c,lowered);
  total_alpha_bits+=c->entropy-mark;
  mark=c->entropy;

  /* Case is encoded after the symbols, so that the number of letters and
     the positions of the word breaks are already known. */
  mungeCase((char *)lettersOnly);
  encodeCaseModel1(c,lettersOnly);
  total_case_bits+=c->entropy-mark;

  range_conclude(c);
  total_finalisation_bits+=c->bits_used-c->entropy;

  if (nonAlphaChars==0&&c->bits_used>=7*strlen((char *)m)) {
    /* Can we code it more efficiently without statistical modelling?
       Try packed ASCII in a scratch coder first, and only redo it in c
       if it really is shorter. */
    range_coder *trial=range_new_coder(1024);
    stats3_emit_packed_ascii(trial,m);
    if (trial->bits_used<c->bits_used) {
      range_coder_reset(c);
      stats3_emit_packed_ascii(c,m);
    }
    range_coder_free(trial);
  }

  if (c->bits_used>=8*strlen((char*)m)&&!(m[0]&0x80)) {
    /* Nothing beat plain 8-bit characters, so store the bytes verbatim.
       This is only possible when the MSB of the first character is 0,
       because the first bit of the stream indicates whether the message
       is compressed or not. */
    range_coder_reset(c);
    int n=0;
    while (m[n]) {
      c->bit_stream[n]=m[n];
      n++;
    }
    c->bits_used=8*n;
    c->entropy=8*n;
  }

  return 0;
}
示例#4
0
/*
 * Range-encode count-tree node n and append it to out, after first writing
 * (recursively) every child whose count reaches threshold.  The encoded node
 * is then decoded again with extractNodeAt() and the per-character counts are
 * compared with the originals; any mismatch is reported and the process exits.
 *
 *   out        - output stream; the node is appended at the current offset
 *   n          - node to write
 *   s          - character sequence this node represents (used for messages
 *                and as the prefix of child labels)
 *   totalCountIncludingTerminations
 *              - upper bound used as the range when encoding this node's
 *                total count
 *   threshold  - nodes whose summed counts are below this are not written
 *
 * Returns the file offset at which the node was written, or 0 if the node was
 * skipped because its summed count is below threshold.
 */
unsigned int writeNode(FILE *out,struct countnode *n,char *s,
		       /* Terminations don't get counted internally in a node,
			  but are used when encoding and decoding the node,
			  so we have to pass it in here. */
		       int totalCountIncludingTerminations,int threshold)
{
  nodesWritten++;
  char schild[128];
  int i;

  long long totalCount=0;

  int debug=0;

  for(i=0;i<CHARCOUNT;i++) totalCount+=getCount(n,i);
  if (totalCount!=n->count) {
    fprintf(stderr,"Sequence '%s' counts don't add up: %lld vs %lld\n",
	    s,totalCount,n->count);
  }

  if (debug) fprintf(stderr,"sequence '%s' occurs %lld times (%d inc. terminals).\n",
		 s,totalCount,totalCountIncludingTerminations);
  /* Don't go any deeper if the sequence is too rare */
  if (totalCount<threshold) return 0;

  range_coder *c=range_new_coder(1024);

  int childAddresses[CHARCOUNT];
  int childCount=0;
  int storedChildren=0;

  /* Encode children first so that we know where they live */
  for(i=0;i<CHARCOUNT;i++) {
    childAddresses[i]=0;

    struct countnode **nn;
    nn=getChild(n,i,0);
    if (nn&&*nn&&(*nn)->count>=threshold) {
      if (0) fprintf(stderr,"n->children[%d]->count=%lld\n",i,(*nn)->count);
      snprintf(schild,128,"%s%c",s,chars[i]);
      childAddresses[i]=writeNode(out,*nn,schild,totalCount,threshold);
      /* writeNode() returns 0 when it skips a node.  The stored flags
         written further down are driven by childAddresses[i] being
         non-zero, so only count children that really have an address;
         otherwise the stored-children total in the header would disagree
         with the flags that follow it. */
      if (childAddresses[i]) storedChildren++;
    }
    if (getCount(n,i)) {
      childCount++;
    }
  }

  /* Write total count in this node */
  range_encode_equiprobable(c,totalCountIncludingTerminations+1,totalCount);
  /* Write number of children with counts */
  range_encode_equiprobable(c,CHARCOUNT+1,childCount);
  /* Now number of children that we are storing sub-nodes for */
  range_encode_equiprobable(c,CHARCOUNT+1,storedChildren);

  /* Every child was written before this point, so its address lies in
     [lowAddr, highAddr]. */
  unsigned int highAddr=ftello(out);
  unsigned int lowAddr=0;
  if (debug) fprintf(stderr,"  lowAddr=0x%x, highAddr=0x%x\n",lowAddr,highAddr);

  if (debug)
    fprintf(stderr,
	    "wrote: childCount=%d, storedChildren=%d, count=%lld, superCount=%d @ 0x%x\n",
	    childCount,storedChildren,totalCount,totalCountIncludingTerminations,
	    (unsigned int)ftello(out));

  unsigned int remainingCount=totalCount;
  // XXX - we can improve on these probabilities by adjusting them
  // according to the remaining number of children and stored children.
  for(i=0;i<CHARCOUNT;i++) {
    /* Probability (scaled to 0xffffff) that this slot has no count, given
       how many slots and counted children remain. */
    unsigned int hasCount=(CHARCOUNT-i-childCount)*0xffffff/(CHARCOUNT-i);

    if (getCount(n,i)) {
      snprintf(schild,128,"%c%s",chars[i],s);
      if (debug)
	fprintf(stderr, "writing: '%s' x %d\n",
		schild,getCount(n,i));
      if (debug) fprintf(stderr,":  writing %d of %d count for '%c'\n",
			 getCount(n,i),(int)(remainingCount+1),chars[i]);

      range_encode_symbol(c,&hasCount,2,1);
      range_encode_equiprobable(c,remainingCount+1,getCount(n,i));

      remainingCount-=getCount(n,i);
      childCount--;
    } else {
      range_encode_symbol(c,&hasCount,2,0);
    }
  }

  for(i=0;i<CHARCOUNT;i++) {
    /* Same scheme as above, for the "child node is stored" flag. */
    unsigned int isStored=(CHARCOUNT-i-storedChildren)*0xffffff/(CHARCOUNT-i);
    if (childAddresses[i]) {
      range_encode_symbol(c,&isStored,2,1);
      if (debug) fprintf(stderr,":    writing child %d (address attached)\n",i);

      /* Encode address of child node compactly.
	 For starters, we know that it must precede us in the bit stream.
	 We also know that we write them in order, so once we know the address
	 of a previous one, we can narrow the range further. */
      range_encode_equiprobable(c,highAddr-lowAddr+1,childAddresses[i]-lowAddr);
      if (debug) fprintf(stderr,":    writing addr = %d of %d (lowAddr=%d)\n",
			 (int)(childAddresses[i]-lowAddr),
			 (int)(highAddr-lowAddr+1),(int)lowAddr);
      lowAddr=childAddresses[i];
      storedChildren--;
    } else {
      range_encode_symbol(c,&isStored,2,0);
    }
  }

  range_conclude(c);

  /* Unaccounted for observations are observations that terminate at this point.
     They are totally normal and expected. */
  if (debug)
    if (remainingCount) {
      fprintf(stderr,"'%s' Count incomplete: %d of %lld not accounted for.\n",
	      s,(int)remainingCount,totalCount);
    }

  unsigned int addr = ftello(out);
  int bytes=c->bits_used>>3;
  if (c->bits_used&7) bytes++;
  fwrite(c->bit_stream,bytes,1,out);

  /* Verify */
  {
    /* Make pretend stats handle to extract from.  Zero it first so that any
       field not assigned below has a defined value. */
    stats_handle h = {0};
    h.file=(FILE*)0xdeadbeef;
    h.mmap=c->bit_stream;
    h.dummyOffset=addr;
    h.fileLength=addr+bytes;
    if (0) fprintf(stderr,"verifying node @ 0x%x\n",addr);
    struct node *v=extractNodeAt(NULL,0,addr,totalCountIncludingTerminations,&h,
				 0 /* don't extract whole tree */,debug);
    if (!v) {
      /* Nothing to compare against: treat like any other verify failure. */
      fprintf(stderr,"Verify error: could not extract node for '%s' @ 0x%x\n",
	      s,addr);
      exit(-1);
    }

    int error=0;
    for(i=0;i<CHARCOUNT;i++)
      {
	if (v->counts[i]!=getCount(n,i)) {
	  if (!error) {
	    fprintf(stderr,"Verify error writing node for '%s'\n",s);
	    fprintf(stderr,"  n->count=%lld, totalCount=%lld\n",
		    n->count,totalCount);
	  }
	  fprintf(stderr,"  '%c' (%d) : %d versus %d written.\n",
		  chars[i],i,v->counts[i],getCount(n,i));
	  error++;
	}
      }
    if (error) {
      fprintf(stderr,"Bit stream (%d bytes):",bytes);
      for(i=0;i<bytes;i++) fprintf(stderr," %02x",c->bit_stream[i]);
      fprintf(stderr,"\n");
      exit(-1);
    }
#ifdef DEBUG
    if ((!strcmp(s,"esae"))||(!strcmp(s,"esael")))
      {
	fprintf(stderr,"%s 0x%x (%f bits) totalCountIncTerms=%d\n",
		s,addr,c->entropy,totalCountIncludingTerminations);
	dumpNode(v);
      }
#endif
    node_free(v);
  }
  range_coder_free(c);

  return addr;
}