/*
 * Encode the integer *pp as one symbol drawn uniformly from the closed
 * interval [low, high].
 *
 *   low, high - inclusive bounds of the interval
 *   pp        - pointer to the value to encode (only read here)
 *   c         - range coder that receives the symbol
 *
 * In DEBUG builds an out-of-range triple is reported on stdout and the
 * process sleeps for 60 seconds before the encode is attempted anyway.
 */
inline void binary_encode(int low,int *pp,int high,range_coder *c)
{
  const int current=*pp;

#ifdef DEBUG
  if (!(low<=high&&current<=high&&current>=low)) {
    printf("menc Illegal triple encountered: [%d,%d,%d]\n",
	   low,current,high);
    sleep(60);
  }
#endif

  /* The interval holds (high-low)+1 distinct values; the symbol is the
     offset of the value from the bottom of the interval. */
  range_encode_equiprobable(c,(high-low)+1,current-low);
}
/*
 * Compress the NUL-terminated message m into range coder c.
 *
 * Up to three encodings are produced and the smallest is left in c:
 *   1. The statistical model: message length, non-alpha symbols, the
 *      lower-cased text, and finally the case information.
 *   2. Packed ASCII, tried only when the message has no non-alpha
 *      characters and the model used at least 7 bits per character.
 *   3. Raw 8-bit bytes, used when the result is still at least 8 bits per
 *      character and the top bit of the first byte is clear.
 *
 * The per-stage bit costs of the statistical encoding are accumulated into
 * the total_*_bits counters.
 *
 * Returns 0 on success, or -1 (before anything is written to c) when the
 * message is too long for the fixed-size working buffers.
 */
int stats3_compress_bits(range_coder *c,unsigned char *m)
{
  unsigned char alpha[1024]; // message with all non alpha/spaces removed
  unsigned char lcalpha[1024]; // message with all alpha chars folded to lower-case

  /* NOTE(review): the length is computed once and reused below; this
     assumes none of the encode/strip helpers modify m -- confirm. */
  size_t len=strlen((char *)m);

  /* stripNonAlpha() and stripCase() are given no buffer size, so a message
     that does not fit in alpha[] (including its terminator) could overrun
     the stack.  Refuse it up front rather than corrupt memory. */
  if (len>=sizeof(alpha)) return -1;

  /* Use model instead of just packed ASCII */
  range_encode_equiprobable(c,2,1); // not raw ASCII
  range_encode_symbol(c,&probPackedASCII,2,1); // not packed ASCII

  total_model_bits+=c->entropy;
  double lastEntropy=c->entropy;

  /* Encode length of message */
  encodeLength(c,len);

  total_length_bits+=c->entropy-lastEntropy;
  lastEntropy=c->entropy;

  /* encode any non-ASCII characters */
  encodeNonAlpha(c,m);
  stripNonAlpha(m,alpha);
  int nonAlphaChars=len-strlen((char *)alpha);

  total_nonalpha_bits+=c->entropy-lastEntropy;
  lastEntropy=c->entropy;

  /* compress lower-caseified version of message */
  stripCase(alpha,lcalpha);
  encodeLCAlphaSpace(c,lcalpha);

  total_alpha_bits+=c->entropy-lastEntropy;
  lastEntropy=c->entropy;

  /* case must be encoded after symbols, so we know how many
     letters and where word breaks are.
  */
  mungeCase((char *)alpha);
  encodeCaseModel1(c,alpha);

  total_case_bits+=c->entropy-lastEntropy;

  range_conclude(c);

  total_finalisation_bits+=c->bits_used-c->entropy;

  if ((!nonAlphaChars)&&c->bits_used>=7*len) {
    /* Can we code it more efficiently without statistical modelling?
       Encode into a scratch coder first so that c is only overwritten
       when packed ASCII really is smaller. */
    range_coder *c2=range_new_coder(1024);
    if (c2) {
      range_encode_equiprobable(c2,2,1); // not raw ASCII
      range_encode_symbol(c2,&probPackedASCII,2,0); // is packed ASCII
      encodeLength(c2,len);
      encodePackedASCII(c2,(char *)m);
      range_conclude(c2);
      if (c2->bits_used<c->bits_used) {
	range_coder_reset(c);
	range_encode_equiprobable(c,2,1); // not raw ASCII
	range_encode_symbol(c,&probPackedASCII,2,0); // is packed ASCII
	encodeLength(c,len);
	encodePackedASCII(c,(char *)m);
	range_conclude(c);
      }
      range_coder_free(c2);
    }
    /* If the scratch coder could not be allocated we simply keep the
       statistical encoding already held in c. */
  }

  if ((c->bits_used>=8*len)
      &&(!(m[0]&0x80))) {
    /* we can't encode it more efficiently than 8-bit raw.
       We can only do this if MSB of first char of message is 0, as we use
       the first bit of the message to indicate if it is compressed or not. */
    int i;
    range_coder_reset(c);
    for(i=0;m[i];i++) c->bit_stream[i]=m[i];
    c->bits_used=8*i;
    c->entropy=8*i;
  }

  return 0;
}
int recipe_encode_field(struct recipe *recipe,stats_handle *stats, range_coder *c, int fieldnumber,char *value) { int normalised_value; int minimum; int maximum; int precision; int h,m,s,d,y; float lat,lon; int ilat,ilon; precision=recipe->fields[fieldnumber].precision; switch (recipe->fields[fieldnumber].type) { case FIELDTYPE_INTEGER: normalised_value=atoi(value)-recipe->fields[fieldnumber].minimum; minimum=recipe->fields[fieldnumber].minimum; maximum=recipe->fields[fieldnumber].maximum; if (maximum<=minimum) { fprintf(stderr,"Illegal range: min=%d, max=%d\n",minimum,maximum); LOGI("Illegal range: min=%d, max=%d\n",minimum,maximum); return -1; } if (normalised_value<0||normalised_value>(maximum-minimum+1)) { fprintf(stderr,"Illegal value: min=%d, max=%d, value=%d\n", minimum,maximum,atoi(value)); LOGI("Illegal value: min=%d, max=%d, value=%d\n", minimum,maximum,atoi(value)); range_encode_equiprobable(c,maximum-minimum+2,maximum-minimum+1); int r=stats3_compress_append(c,(unsigned char *)value,strlen(value),stats, NULL); return r; } return range_encode_equiprobable(c,maximum-minimum+2,normalised_value); case FIELDTYPE_FLOAT: { float f = atof(value); int sign=0; int exponent=0; int mantissa=0; if (f<0) { sign=1; f=-f; } else sign=0; double m = frexp(f,&exponent); mantissa = m * 0xffffff; if (exponent<-127) exponent=-127; if (exponent>127) exponent=127; fprintf(stderr,"encoding sign=%d, exp=%d, mantissa=%x, f=%f\n", sign,exponent,mantissa,atof(value)); // Sign range_encode_equiprobable(c,2,sign); // Exponent range_encode_equiprobable(c,256,exponent+128); // Mantissa range_encode_equiprobable(c,256,(mantissa>>16)&0xff); range_encode_equiprobable(c,256,(mantissa>>8)&0xff); return range_encode_equiprobable(c,256,(mantissa>>0)&0xff); } case FIELDTYPE_FIXEDPOINT: case FIELDTYPE_BOOLEAN: normalised_value=recipe_parse_boolean(value); minimum=0; maximum=1; return range_encode_equiprobable(c,maximum-minimum+1,normalised_value); case FIELDTYPE_TIMEOFDAY: if 
(sscanf(value,"%d:%d.%d",&h,&m,&s)<2) return -1; // XXX - We don't support leap seconds if (h<0||h>23||m<0||m>59||s<0||s>59) return -1; normalised_value=h*3600+m*60+s; minimum=0; maximum=24*60*60; if (precision==0) precision=17; // 2^16 < 24*60*60 < 2^17 if (precision<17) { normalised_value=normalised_value >> (17 - precision); minimum=minimum >> (17 - precision); maximum=maximum >> (17 - precision); maximum+=1; // make sure that normalised_value cannot = maximum } return range_encode_equiprobable(c,maximum-minimum+1,normalised_value); case FIELDTYPE_TIMEDATE: { struct tm tm; int tzh=0,tzm=0; int r; bzero(&tm,sizeof(tm)); if ((r=sscanf(value,"%d-%d-%dT%d:%d:%d.%*d+%d:%d", &tm.tm_year,&tm.tm_mon,&tm.tm_mday, &tm.tm_hour,&tm.tm_min,&tm.tm_sec, &tzh,&tzm))<6) { printf("r=%d\n",r); return -1; } #if defined(__sgi) || defined(__sun) #else tm.tm_gmtoff=tzm*60+tzh*3600; #endif tm.tm_year-=1900; tm.tm_mon-=1; time_t t = mktime(&tm); minimum=1; maximum=0x7fffffff; normalised_value=t; int b; b=range_encode_equiprobable(c,0x8000,t>>16); b=range_encode_equiprobable(c,0x10000,t&0xffff); printf("TIMEDATE: encoding t=%d\n",(int)t); return b; } case FIELDTYPE_MAGPITIMEDATE: { struct tm tm; // int tzh=0,tzm=0; int r; bzero(&tm,sizeof(tm)); if ((r=sscanf(value,"%d-%d-%d %d:%d:%d", &tm.tm_year,&tm.tm_mon,&tm.tm_mday, &tm.tm_hour,&tm.tm_min,&tm.tm_sec))<6) { printf("r=%d\n",r); return -1; } // Validate fields if (tm.tm_year<0||tm.tm_year>9999) return -1; if (tm.tm_mon<1||tm.tm_mon>12) return -1; if (tm.tm_mday<1||tm.tm_mday>31) return -1; if (tm.tm_hour<0||tm.tm_hour>24) return -1; if (tm.tm_min<0||tm.tm_min>59) return -1; if (tm.tm_sec<0||tm.tm_sec>61) return -1; // Encode each field: requires about 40 bits, but safely encodes all values // without risk of timezone munging on Android range_encode_equiprobable(c,10000,tm.tm_year); range_encode_equiprobable(c,12,tm.tm_mon-1); range_encode_equiprobable(c,31,tm.tm_mday-1); range_encode_equiprobable(c,25,tm.tm_hour); 
range_encode_equiprobable(c,60,tm.tm_min); return range_encode_equiprobable(c,62,tm.tm_sec); } case FIELDTYPE_DATE: // ODK does YYYY/MM/DD // Magpi does DD-MM-YYYY // The different delimiter allows us to discern between the two fprintf(stderr,"Parsing FIELDTYPE_DATE value '%s'\n",value); if (sscanf(value,"%d/%d/%d",&y,&m,&d)==3) { } else if (sscanf(value,"%d-%d-%d",&d,&m,&y)==3) { } else return -1; // XXX Not as efficient as it could be (assumes all months have 31 days) if (y<1||y>9999||m<1||m>12||d<1||d>31) { fprintf(stderr,"Invalid field value\n"); return -1; } normalised_value=y*372+(m-1)*31+(d-1); minimum=0; maximum=10000*372; if (precision==0) precision=22; // 2^21 < maximum < 2^22 if (precision<22) { normalised_value=normalised_value >> (22 - precision); minimum=minimum >> (22 - precision); maximum=maximum >> (22 - precision); maximum+=1; // make sure that normalised_value cannot = maximum }
unsigned int writeNode(FILE *out,struct countnode *n,char *s, /* Terminations don't get counted internally in a node, but are used when encoding and decoding the node, so we have to pass it in here. */ int totalCountIncludingTerminations,int threshold) { nodesWritten++; char schild[128]; int i; long long totalCount=0; int debug=0; for(i=0;i<CHARCOUNT;i++) totalCount+=getCount(n,i); if (totalCount!=n->count) { fprintf(stderr,"Sequence '%s' counts don't add up: %lld vs %lld\n", s,totalCount,n->count); } if (debug) fprintf(stderr,"sequence '%s' occurs %lld times (%d inc. terminals).\n", s,totalCount,totalCountIncludingTerminations); /* Don't go any deeper if the sequence is too rare */ if (totalCount<threshold) return 0; int children=0; if (n->allChildren) { for(i=0;i<CHARCOUNT;i++) if (n->allChildren[i]) children++; } else { for(i=0;i<FEW;i++) if (n->fewChildIds[i]) children++; } range_coder *c=range_new_coder(1024); int childAddresses[CHARCOUNT]; int childCount=0; int storedChildren=0; /* Encode children first so that we know where they live */ for(i=0;i<CHARCOUNT;i++) { childAddresses[i]=0; struct countnode **nn; nn=getChild(n,i,0); if (nn&&*nn&&(*nn)->count>=threshold) { if (0) fprintf(stderr,"n->children[%d]->count=%lld\n",i,(*nn)->count); snprintf(schild,128,"%s%c",s,chars[i]); childAddresses[i]=writeNode(out,*nn,schild,totalCount,threshold); storedChildren++; } if (getCount(n,i)) { childCount++; } } /* Write total count in this node */ range_encode_equiprobable(c,totalCountIncludingTerminations+1,totalCount); /* Write number of children with counts */ range_encode_equiprobable(c,CHARCOUNT+1,childCount); /* Now number of children that we are storing sub-nodes for */ range_encode_equiprobable(c,CHARCOUNT+1,storedChildren); unsigned int highAddr=ftell(out); unsigned int lowAddr=0; if (debug) fprintf(stderr," lowAddr=0x%x, highAddr=0x%x\n",lowAddr,highAddr); if (debug) fprintf(stderr, "wrote: childCount=%d, storedChildren=%d, count=%lld, superCount=%d @ 0x%x\n", 
childCount,storedChildren,totalCount,totalCountIncludingTerminations, (unsigned int)ftello(out)); unsigned int remainingCount=totalCount; // XXX - we can improve on these probabilities by adjusting them // according to the remaining number of children and stored children. unsigned int hasCount=(CHARCOUNT-childCount)*0xffffff/CHARCOUNT; unsigned int isStored=(CHARCOUNT-storedChildren)*0xffffff/CHARCOUNT; for(i=0;i<CHARCOUNT;i++) { hasCount=(CHARCOUNT-i-childCount)*0xffffff/(CHARCOUNT-i); if (getCount(n,i)) { snprintf(schild,128,"%c%s",chars[i],s); if (debug) fprintf(stderr, "writing: '%s' x %d\n", schild,getCount(n,i)); if (debug) fprintf(stderr,": writing %d of %d count for '%c'\n", getCount(n,i),remainingCount+1,chars[i]); range_encode_symbol(c,&hasCount,2,1); range_encode_equiprobable(c,remainingCount+1,getCount(n,i)); remainingCount-=getCount(n,i); childCount--; } else { range_encode_symbol(c,&hasCount,2,0); } } for(i=0;i<CHARCOUNT;i++) { isStored=(CHARCOUNT-i-storedChildren)*0xffffff/(CHARCOUNT-i); if (childAddresses[i]) { range_encode_symbol(c,&isStored,2,1); if (debug) fprintf(stderr,": writing child %d (address attached)\n",i); /* Encode address of child node compactly. For starters, we know that it must preceed us in the bit stream. We also know that we write them in order, so once we know the address of a previous one, we can narrow the range further. */ range_encode_equiprobable(c,highAddr-lowAddr+1,childAddresses[i]-lowAddr); if (debug) fprintf(stderr,": writing addr = %d of %d (lowAddr=%d)\n", childAddresses[i]-lowAddr,highAddr-lowAddr+1,lowAddr); lowAddr=childAddresses[i]; storedChildren--; } else { range_encode_symbol(c,&isStored,2,0); } } range_conclude(c); /* Unaccounted for observations are observations that terminate at this point. They are totall normal and expected. 
*/ if (debug) if (remainingCount) { fprintf(stderr,"'%s' Count incomplete: %d of %lld not accounted for.\n", s,remainingCount,totalCount); } unsigned int addr = ftello(out); int bytes=c->bits_used>>3; if (c->bits_used&7) bytes++; fwrite(c->bit_stream,bytes,1,out); /* Verify */ { /* Make pretend stats handle to extract from */ stats_handle h; h.file=(FILE*)0xdeadbeef; h.mmap=c->bit_stream; h.dummyOffset=addr; h.fileLength=addr+bytes; if (0) fprintf(stderr,"verifying node @ 0x%x\n",addr); struct node *v=extractNodeAt(NULL,0,addr,totalCountIncludingTerminations,&h, 0 /* don't extract whole tree */,debug); int i; int error=0; for(i=0;i<CHARCOUNT;i++) { if (v->counts[i]!=getCount(n,i)) { if (!error) { fprintf(stderr,"Verify error writing node for '%s'\n",s); fprintf(stderr," n->count=%lld, totalCount=%lld\n", n->count,totalCount); } fprintf(stderr," '%c' (%d) : %d versus %d written.\n", chars[i],i,v->counts[i],getCount(n,i)); error++; } } if (error) { fprintf(stderr,"Bit stream (%d bytes):",bytes); for(i=0;i<bytes;i++) fprintf(stderr," %02x",c->bit_stream[i]); fprintf(stderr,"\n"); exit(-1); } #ifdef DEBUG if ((!strcmp(s,"esae"))||(!strcmp(s,"esael"))) { fprintf(stderr,"%s 0x%x (%f bits) totalCountIncTerms=%d\n", s,addr,c->entropy,totalCountIncludingTerminations); dumpNode(v); } #endif node_free(v); } range_coder_free(c); return addr; }