Example #1
0
/* Encode the integer *pp as one symbol out of the (high-low+1) equally
   likely values of the inclusive interval [low,high], using range coder c.

   low, high - inclusive bounds of the interval
   pp        - pointer to the value to encode (only read here)
   c         - range coder that receives the symbol

   When built with DEBUG, a triple that violates low <= *pp <= high is
   reported on stdout and the process sleeps for 60 seconds; encoding is
   still attempted afterwards. */
inline void binary_encode(int low,int *pp,int high,range_coder *c)
{
    const int current=*pp;
    /* Number of distinct values the interval can hold */
    const int span=(high-low)+1;
    /* Zero-based position of the value inside the interval */
    const int offset=current-low;

#ifdef DEBUG
    if ((current<low)||(current>high)||(low>high))
    {
        printf("menc Illegal triple encountered: [%d,%d,%d]\n",
               low,current,high);
        sleep(60);
    }
#endif

    range_encode_equiprobable(c,span,offset);
}
Example #2
0
/* Compress the NUL-terminated message m into range coder c.

   Three encodings are considered, and c ends up holding the one chosen:
     1. Statistical model: length, non-alpha characters, lower-cased
        letters/spaces, then case information.
     2. Packed ASCII: only tried when the message has no non-alpha
        characters and the model needed at least 7 bits per character;
        used if it is strictly shorter than the model output.
     3. Raw 8-bit copy of the message: used when the best result so far
        needs at least 8 bits per character and the first byte has its
        top bit clear.

   The total_*_bits accumulators are updated with the entropy spent on each
   stage of the statistical encoding.

   Returns 0 on success, or -1 (with c untouched) if the message does not
   fit in the 1024-byte work buffers. */
int stats3_compress_bits(range_coder *c,unsigned char *m)
{
  unsigned char alpha[1024]; // message with all non alpha/spaces removed
  unsigned char lcalpha[1024]; // message with all alpha chars folded to lower-case

  /* alpha and lcalpha receive derived copies of the message, so a message
     that does not fit (including its terminator) would overrun the stack.
     NOTE(review): assumes stripNonAlpha()/stripCase() never emit more bytes
     than they are given - confirm against their definitions. */
  const size_t msg_len=strlen((char *)m);
  if (msg_len>=sizeof(alpha)) return -1;

  /* Use model instead of just packed ASCII */
  range_encode_equiprobable(c,2,1); // not raw ASCII
  range_encode_symbol(c,&probPackedASCII,2,1); // not packed ASCII

  total_model_bits+=c->entropy;
  double lastEntropy=c->entropy;

  /* Encode length of message */
  encodeLength(c,msg_len);

  total_length_bits+=c->entropy-lastEntropy;
  lastEntropy=c->entropy;

  /* encode any non-ASCII characters */
  encodeNonAlpha(c,m);
  stripNonAlpha(m,alpha);
  int nonAlphaChars=msg_len-strlen((char *)alpha);

  total_nonalpha_bits+=c->entropy-lastEntropy;
  lastEntropy=c->entropy;

  /* compress lower-caseified version of message */
  stripCase(alpha,lcalpha);
  encodeLCAlphaSpace(c,lcalpha);

  total_alpha_bits+=c->entropy-lastEntropy;
  lastEntropy=c->entropy;

  /* case must be encoded after symbols, so we know how many
     letters and where word breaks are.
  */
  mungeCase((char *)alpha);
  encodeCaseModel1(c,alpha);

  total_case_bits+=c->entropy-lastEntropy;

  range_conclude(c);
  /* Difference between the bits actually emitted and the modelled entropy */
  total_finalisation_bits+=c->bits_used-c->entropy;

  if ((!nonAlphaChars)&&c->bits_used>=7*msg_len)
    {
      /* Can we code it more efficiently without statistical modelling?
         Trial-encode into a scratch coder first so that c is only
         overwritten when the packed form really is shorter. */
      range_coder *c2=range_new_coder(1024);
      if (c2)
        {
          range_encode_equiprobable(c2,2,1); // not raw ASCII
          range_encode_symbol(c2,&probPackedASCII,2,0); // is packed ASCII
          encodeLength(c2,msg_len);
          encodePackedASCII(c2,(char *)m);
          range_conclude(c2);
          if (c2->bits_used<c->bits_used) {
            range_coder_reset(c);
            range_encode_equiprobable(c,2,1); // not raw ASCII
            range_encode_symbol(c,&probPackedASCII,2,0); // is packed ASCII
            encodeLength(c,msg_len);
            encodePackedASCII(c,(char *)m);
            range_conclude(c);
          }
          range_coder_free(c2);
        }
      /* If the scratch coder could not be allocated we simply keep the
         statistical encoding already held in c. */
    }

  if ((c->bits_used>=8*msg_len)
      &&(!(m[0]&0x80)))
    {
      /* we can't encode it more efficiently than 8-bit raw.
         We can only do this if MSB of first char of message is 0, as we use
         the first bit of the message to indicate if it is compressed or not. */
      int i;
      range_coder_reset(c);
      for(i=0;m[i];i++) c->bit_stream[i]=m[i];
      c->bits_used=8*i;
      c->entropy=8*i;
    }

  return 0;
}
Example #3
0
File: recipe.c Project: Azizou/smac
int recipe_encode_field(struct recipe *recipe,stats_handle *stats, range_coder *c,
			int fieldnumber,char *value)
{
  int normalised_value;
  int minimum;
  int maximum;
  int precision;
  int h,m,s,d,y;
  float lat,lon;
  int ilat,ilon;

  precision=recipe->fields[fieldnumber].precision;

  switch (recipe->fields[fieldnumber].type) {
  case FIELDTYPE_INTEGER:
    normalised_value=atoi(value)-recipe->fields[fieldnumber].minimum;
    minimum=recipe->fields[fieldnumber].minimum;
    maximum=recipe->fields[fieldnumber].maximum;
    if (maximum<=minimum) {
      fprintf(stderr,"Illegal range: min=%d, max=%d\n",minimum,maximum);
      LOGI("Illegal range: min=%d, max=%d\n",minimum,maximum);
      return -1;
    }
    if (normalised_value<0||normalised_value>(maximum-minimum+1)) {
      fprintf(stderr,"Illegal value: min=%d, max=%d, value=%d\n",
                     minimum,maximum,atoi(value));
      LOGI("Illegal value: min=%d, max=%d, value=%d\n",
                     minimum,maximum,atoi(value));
      range_encode_equiprobable(c,maximum-minimum+2,maximum-minimum+1);
      int r=stats3_compress_append(c,(unsigned char *)value,strlen(value),stats,
				   NULL);
      return r;
    }
    return range_encode_equiprobable(c,maximum-minimum+2,normalised_value);
  case FIELDTYPE_FLOAT:
    {
      float f = atof(value);
      int sign=0;
      int exponent=0;
      int mantissa=0;
      if (f<0) { sign=1; f=-f; } else sign=0;
      double m = frexp(f,&exponent);
      mantissa = m * 0xffffff;
      if (exponent<-127) exponent=-127;
      if (exponent>127) exponent=127;
      fprintf(stderr,"encoding sign=%d, exp=%d, mantissa=%x, f=%f\n",
	      sign,exponent,mantissa,atof(value));
      // Sign
      range_encode_equiprobable(c,2,sign);
      // Exponent
      range_encode_equiprobable(c,256,exponent+128);
      // Mantissa
      range_encode_equiprobable(c,256,(mantissa>>16)&0xff);
      range_encode_equiprobable(c,256,(mantissa>>8)&0xff);
      return range_encode_equiprobable(c,256,(mantissa>>0)&0xff);
    }    
  case FIELDTYPE_FIXEDPOINT:
  case FIELDTYPE_BOOLEAN:
    normalised_value=recipe_parse_boolean(value);
    minimum=0;
    maximum=1;
    return range_encode_equiprobable(c,maximum-minimum+1,normalised_value);
  case FIELDTYPE_TIMEOFDAY:
    if (sscanf(value,"%d:%d.%d",&h,&m,&s)<2) return -1;
    // XXX - We don't support leap seconds
    if (h<0||h>23||m<0||m>59||s<0||s>59) return -1;
    normalised_value=h*3600+m*60+s;
    minimum=0;
    maximum=24*60*60;
    if (precision==0) precision=17; // 2^16 < 24*60*60 < 2^17
    if (precision<17) {
      normalised_value=normalised_value >> (17 - precision);
      minimum=minimum >> (17 - precision);
      maximum=maximum >> (17 - precision);
      maximum+=1; // make sure that normalised_value cannot = maximum
    }
    return range_encode_equiprobable(c,maximum-minimum+1,normalised_value);
  case FIELDTYPE_TIMEDATE:
    {
      struct tm tm;
      int tzh=0,tzm=0;
      int r;
      bzero(&tm,sizeof(tm));
      if ((r=sscanf(value,"%d-%d-%dT%d:%d:%d.%*d+%d:%d",
		 &tm.tm_year,&tm.tm_mon,&tm.tm_mday,
		 &tm.tm_hour,&tm.tm_min,&tm.tm_sec,
		    &tzh,&tzm))<6) {
	printf("r=%d\n",r);
	return -1;
      }
#if defined(__sgi) || defined(__sun)
#else
      tm.tm_gmtoff=tzm*60+tzh*3600;
#endif
      tm.tm_year-=1900;
      tm.tm_mon-=1;
      time_t t = mktime(&tm);
      minimum=1;
      maximum=0x7fffffff;
      normalised_value=t;

      int b;
      b=range_encode_equiprobable(c,0x8000,t>>16);
      b=range_encode_equiprobable(c,0x10000,t&0xffff);
      printf("TIMEDATE: encoding t=%d\n",(int)t);
      return b;
    }
  case FIELDTYPE_MAGPITIMEDATE:
    {
      struct tm tm;
      // int tzh=0,tzm=0;
      int r;
      bzero(&tm,sizeof(tm));
      if ((r=sscanf(value,"%d-%d-%d %d:%d:%d",
		 &tm.tm_year,&tm.tm_mon,&tm.tm_mday,
		 &tm.tm_hour,&tm.tm_min,&tm.tm_sec))<6) {
	printf("r=%d\n",r);
	return -1;
      }

      // Validate fields
      if (tm.tm_year<0||tm.tm_year>9999) return -1;
      if (tm.tm_mon<1||tm.tm_mon>12) return -1;
      if (tm.tm_mday<1||tm.tm_mday>31) return -1;
      if (tm.tm_hour<0||tm.tm_hour>24) return -1;
      if (tm.tm_min<0||tm.tm_min>59) return -1;
      if (tm.tm_sec<0||tm.tm_sec>61) return -1;

      // Encode each field: requires about 40 bits, but safely encodes all values
      // without risk of timezone munging on Android
      range_encode_equiprobable(c,10000,tm.tm_year);
      range_encode_equiprobable(c,12,tm.tm_mon-1);
      range_encode_equiprobable(c,31,tm.tm_mday-1);
      range_encode_equiprobable(c,25,tm.tm_hour);
      range_encode_equiprobable(c,60,tm.tm_min);
      return range_encode_equiprobable(c,62,tm.tm_sec);
    }
  case FIELDTYPE_DATE:
    // ODK does YYYY/MM/DD
    // Magpi does DD-MM-YYYY
    // The different delimiter allows us to discern between the two
    fprintf(stderr,"Parsing FIELDTYPE_DATE value '%s'\n",value);
    if (sscanf(value,"%d/%d/%d",&y,&m,&d)==3) { }
    else if (sscanf(value,"%d-%d-%d",&d,&m,&y)==3) { }
    else return -1;

    // XXX Not as efficient as it could be (assumes all months have 31 days)
    if (y<1||y>9999||m<1||m>12||d<1||d>31) {
      fprintf(stderr,"Invalid field value\n");
      return -1;
    }
    normalised_value=y*372+(m-1)*31+(d-1);
    minimum=0;
    maximum=10000*372;
    if (precision==0) precision=22; // 2^21 < maximum < 2^22
    if (precision<22) {
      normalised_value=normalised_value >> (22 - precision);
      minimum=minimum >> (22 - precision);
      maximum=maximum >> (22 - precision);
      maximum+=1; // make sure that normalised_value cannot = maximum
    }
Example #4
0
/* Write count node n to out, after first writing (recursively) every child
   whose count is at least threshold, so that child addresses are known when
   this node is encoded.

   out       - stream the encoded nodes are appended to
   n         - node to write
   s         - character sequence this node stands for; used in diagnostics
               and as the prefix of the names passed to child calls
   totalCountIncludingTerminations
             - upper bound used when encoding this node's total count
   threshold - nodes (and children) with a smaller count are not written

   The encoded record holds, in order: the node's total count, the number of
   characters with a non-zero count, the number of stored children, one
   has-count flag (plus count) per character, and one is-stored flag (plus
   address) per character.  After writing, the record is decoded again with
   extractNodeAt() and compared against n.

   Returns the offset in out at which this node's record starts, or 0 when
   the node's count is below threshold and nothing was written for it.
   Calls exit(-1) if the coder cannot be allocated, the record cannot be
   written, or verification fails. */
unsigned int writeNode(FILE *out,struct countnode *n,char *s,
		       /* Terminations don't get counted internally in a node,
			  but are used when encoding and decoding the node,
			  so we have to pass it in here. */
		       int totalCountIncludingTerminations,int threshold)
{
  nodesWritten++;
  char schild[128];
  int i;

  long long totalCount=0;

  int debug=0;

  for(i=0;i<CHARCOUNT;i++) totalCount+=getCount(n,i);
  if (totalCount!=n->count) {
    fprintf(stderr,"Sequence '%s' counts don't add up: %lld vs %lld\n",
	    s,totalCount,n->count);
  }

  if (debug) fprintf(stderr,"sequence '%s' occurs %lld times (%d inc. terminals).\n",
		 s,totalCount,totalCountIncludingTerminations);
  /* Don't go any deeper if the sequence is too rare */
  if (totalCount<threshold) return 0;

  range_coder *c=range_new_coder(1024);
  if (!c) {
    fprintf(stderr,"Could not allocate range coder for node '%s'\n",s);
    exit(-1);
  }

  int childAddresses[CHARCOUNT];
  int childCount=0;
  int storedChildren=0;

  /* Encode children first so that we know where they live.
     NOTE(review): an address of 0 doubles as "child not stored" below, so
     this relies on no child record ever starting at offset 0 of out. */
  for(i=0;i<CHARCOUNT;i++) {
    childAddresses[i]=0;

    struct countnode **nn;
    nn=getChild(n,i,0);
    if (nn&&*nn&&(*nn)->count>=threshold) {
      if (0) fprintf(stderr,"n->children[%d]->count=%lld\n",i,(*nn)->count);
      snprintf(schild,128,"%s%c",s,chars[i]);
      childAddresses[i]=writeNode(out,*nn,schild,totalCount,threshold);
      storedChildren++;
    }
    if (getCount(n,i)) {
      childCount++;
    }
  }

  /* Write total count in this node */
  range_encode_equiprobable(c,totalCountIncludingTerminations+1,totalCount);
  /* Write number of children with counts */
  range_encode_equiprobable(c,CHARCOUNT+1,childCount);
  /* Now number of children that we are storing sub-nodes for */
  range_encode_equiprobable(c,CHARCOUNT+1,storedChildren);

  unsigned int highAddr=ftell(out);
  unsigned int lowAddr=0;
  if (debug) fprintf(stderr,"  lowAddr=0x%x, highAddr=0x%x\n",lowAddr,highAddr);

  if (debug)
    fprintf(stderr,
	    "wrote: childCount=%d, storedChildren=%d, count=%lld, superCount=%d @ 0x%x\n",
	    childCount,storedChildren,totalCount,totalCountIncludingTerminations,
	    (unsigned int)ftello(out));

  unsigned int remainingCount=totalCount;
  // XXX - we can improve on these probabilities by adjusting them
  // according to the remaining number of children and stored children.
  for(i=0;i<CHARCOUNT;i++) {
    /* Probability (scaled to 0xffffff) that this character has no count,
       given how many counted characters are still to come. */
    unsigned int hasCount=(CHARCOUNT-i-childCount)*0xffffff/(CHARCOUNT-i);

    if (getCount(n,i)) {
      snprintf(schild,128,"%c%s",chars[i],s);
      if (debug)
	fprintf(stderr, "writing: '%s' x %d\n",
		schild,getCount(n,i));
      if (debug) fprintf(stderr,":  writing %d of %d count for '%c'\n",
			 getCount(n,i),remainingCount+1,chars[i]);

      range_encode_symbol(c,&hasCount,2,1);
      /* The count cannot exceed what is left of the node's total */
      range_encode_equiprobable(c,remainingCount+1,getCount(n,i));

      remainingCount-=getCount(n,i);
      childCount--;
    } else {
      range_encode_symbol(c,&hasCount,2,0);
    }
  }

  for(i=0;i<CHARCOUNT;i++) {
    /* Same scheme as above, for "is a sub-node stored for this character" */
    unsigned int isStored=(CHARCOUNT-i-storedChildren)*0xffffff/(CHARCOUNT-i);
    if (childAddresses[i]) {
      range_encode_symbol(c,&isStored,2,1);
      if (debug) fprintf(stderr,":    writing child %d (address attached)\n",i);

      /* Encode address of child node compactly.
	 For starters, we know that it must precede us in the bit stream.
	 We also know that we write them in order, so once we know the address
	 of a previous one, we can narrow the range further. */
      range_encode_equiprobable(c,highAddr-lowAddr+1,childAddresses[i]-lowAddr);
      if (debug) fprintf(stderr,":    writing addr = %d of %d (lowAddr=%d)\n",
			 childAddresses[i]-lowAddr,highAddr-lowAddr+1,lowAddr);
      lowAddr=childAddresses[i];
      storedChildren--;
    } else {
      range_encode_symbol(c,&isStored,2,0);
    }
  }

  range_conclude(c);

  /* Unaccounted for observations are observations that terminate at this point.
     They are totally normal and expected. */
  if (debug)
    if (remainingCount) {
      fprintf(stderr,"'%s' Count incomplete: %d of %lld not accounted for.\n",
	      s,remainingCount,totalCount);
    }

  unsigned int addr = ftello(out);
  /* Round the bit count up to whole bytes */
  int bytes=c->bits_used>>3;
  if (c->bits_used&7) bytes++;
  size_t written=fwrite(c->bit_stream,bytes,1,out);
  if (bytes>0&&written!=1) {
    /* A short write would leave a corrupt record that later nodes point at */
    fprintf(stderr,"Failed to write %d byte node for '%s'\n",bytes,s);
    exit(-1);
  }

  /* Verify */
  {
    /* Make pretend stats handle to extract from.  Start from all-zero so
       that fields not set below have a defined value. */
    stats_handle h = {0};
    h.file=(FILE*)0xdeadbeef;
    h.mmap=c->bit_stream;
    h.dummyOffset=addr;
    h.fileLength=addr+bytes;
    if (0) fprintf(stderr,"verifying node @ 0x%x\n",addr);
    struct node *v=extractNodeAt(NULL,0,addr,totalCountIncludingTerminations,&h,
				 0 /* don't extract whole tree */,debug);
    if (!v) {
      fprintf(stderr,"Verify error: could not extract node for '%s'\n",s);
      exit(-1);
    }

    int error=0;
    for(i=0;i<CHARCOUNT;i++)
      {
	if (v->counts[i]!=getCount(n,i)) {
	  if (!error) {
	    fprintf(stderr,"Verify error writing node for '%s'\n",s);
	    fprintf(stderr,"  n->count=%lld, totalCount=%lld\n",
		    n->count,totalCount);
	  }
	  fprintf(stderr,"  '%c' (%d) : %d versus %d written.\n",
		  chars[i],i,v->counts[i],getCount(n,i));
	  error++;
	}
      }
    if (error) {
      fprintf(stderr,"Bit stream (%d bytes):",bytes);
      for(i=0;i<bytes;i++) fprintf(stderr," %02x",c->bit_stream[i]);
      fprintf(stderr,"\n");
      exit(-1);
    }
#ifdef DEBUG
    if ((!strcmp(s,"esae"))||(!strcmp(s,"esael")))
      {
	fprintf(stderr,"%s 0x%x (%f bits) totalCountIncTerms=%d\n",
		s,addr,c->entropy,totalCountIncludingTerminations);
	dumpNode(v);
      }
#endif
    node_free(v);
  }
  range_coder_free(c);

  return addr;
}