int get_vector(codebook *b,FILE *in,int start, int n,float *a){ int i; const static_codebook *c=b->c; while(1){ if(v_sofar==n || get_line_value(in,a)){ reset_next_value(); if(get_next_value(in,a)) break; for(i=0;i<start;i++){ sequence_base=*a; get_line_value(in,a); } } for(i=1;i<c->dim;i++) if(get_line_value(in,a+i)) break; if(i==c->dim){ float temp=a[c->dim-1]; for(i=0;i<c->dim;i++)a[i]-=sequence_base; if(c->q_sequencep)sequence_base=temp; v_sofar++; return(0); } sequence_base=0.f; } return(-1); }
int get_next_value(FILE *in,float *value){ while(1){ if(get_line_value(in,value)){ value_line_buff=get_line(in); if(!value_line_buff)return(-1); }else{ return(0); } } }
static int getline(FILE *in,float *vec,int begin,int n){ int i,next=0; reset_next_value(); if(get_next_value(in,vec))return(0); if(begin){ for(i=1;i<begin;i++) get_line_value(in,vec); next=0; }else{ next=1; } for(i=next;i<n;i++) if(get_line_value(in,vec+i)){ fprintf(stderr,"ran out of columns in input data\n"); exit(1); } return(1); }
static int getval(FILE *in,int begin,int n,int group,int max){ float v; int i; long val=0; if(nsofar>=n || get_line_value(in,&v)){ reset_next_value(); nsofar=0; if(get_next_value(in,&v)) return(-1); for(i=1;i<=begin;i++) get_line_value(in,&v); } val=(int)v; nsofar++; for(i=1;i<group;i++,nsofar++) if(nsofar>=n || get_line_value(in,&v)) return(getval(in,begin,n,group,max)); else val = val*max+(int)v; return(val); }
int main(int argc,char *argv[]){ char *basename; codebook **b=_ogg_calloc(1,sizeof(codebook *)); int *addmul=_ogg_calloc(1,sizeof(int)); int books=0; int input=0; int interleave=0; int j; int start=0; int num=-1; argv++; if(*argv==NULL){ process_usage(); exit(1); } /* yes, this is evil. However, it's very convenient to parse file extentions */ while(*argv){ if(*argv[0]=='-'){ /* option */ if(argv[0][1]=='s'){ /* subvector */ if(sscanf(argv[1],"%d,%d",&start,&num)!=2){ num= -1; if(sscanf(argv[1],"%d",&start)!=1){ fprintf(stderr,"Syntax error using -s\n"); exit(1); } } argv+=2; } if(argv[0][1]=='i'){ /* interleave */ interleave=1; argv+=1; } }else{ /* input file. What kind? */ char *dot; char *ext=NULL; char *name=strdup(*argv++); dot=strrchr(name,'.'); if(dot) ext=dot+1; else ext=""; /* codebook */ if(!strcmp(ext,"vqh")){ int multp=0; if(input){ fprintf(stderr,"specify all input data (.vqd) files following\n" "codebook header (.vqh) files\n"); exit(1); } /* is it additive or multiplicative? */ if(name[0]=='*'){ multp=1; name++; } if(name[0]=='+')name++; basename=strrchr(name,'/'); if(basename) basename=strdup(basename)+1; else basename=strdup(name); dot=strrchr(basename,'.'); if(dot)*dot='\0'; b=_ogg_realloc(b,sizeof(codebook *)*(books+2)); b[books]=codebook_load(name); addmul=_ogg_realloc(addmul,sizeof(int)*(books+1)); addmul[books++]=multp; b[books]=NULL; } /* data file */ if(!strcmp(ext,"vqd")){ int cols; long lines=0; char *line; float *vec; FILE *in=fopen(name,"r"); if(!in){ fprintf(stderr,"Could not open input file %s\n",name); exit(1); } if(!input){ process_preprocess(b,basename); input++; } reset_next_value(); line=setup_line(in); /* count cols before we start reading */ { char *temp=line; while(*temp==' ')temp++; for(cols=0;*temp;cols++){ while(*temp>32)temp++; while(*temp==' ')temp++; } } vec=alloca(cols*sizeof(float)); while(line){ lines++; for(j=0;j<cols;j++) if(get_line_value(in,vec+j)){ fprintf(stderr,"Too few columns on line %ld in data file\n",lines); exit(1); } /* ignores -s for now */ process_vector(b,addmul,interleave,vec,cols); line=setup_line(in); } fclose(in); } } } /* take any data from stdin */ { struct stat st; if(fstat(STDIN_FILENO,&st)==-1){ fprintf(stderr,"Could not stat STDIN\n"); exit(1); } if((S_IFIFO|S_IFREG|S_IFSOCK)&st.st_mode){ int cols; char *line; long lines=0; float *vec; if(!input){ process_preprocess(b,basename); input++; } line=setup_line(stdin); /* count cols before we start reading */ { char *temp=line; while(*temp==' ')temp++; for(cols=0;*temp;cols++){ while(*temp>32)temp++; while(*temp==' ')temp++; } } vec=alloca(cols*sizeof(float)); while(line){ lines++; for(j=0;j<cols;j++) if(get_line_value(stdin,vec+j)){ fprintf(stderr,"Too few columns on line %ld in data file\n",lines); exit(1); } /* ignores -s for now */ process_vector(b,addmul,interleave,vec,cols); line=setup_line(stdin); } } } process_postprocess(b,basename); return 0; }
int main(int argc,char *argv[]){ char *basename; codebook *b=NULL; int entries=0; int dim=0; long i,j,target=-1,protect=-1; FILE *out=NULL; int argnum=0; argv++; if(*argv==NULL){ usage(); exit(1); } while(*argv){ if(*argv[0]=='-'){ argv++; }else{ switch (argnum++){ case 0:case 1: { /* yes, this is evil. However, it's very convenient to parse file extentions */ /* input file. What kind? */ char *dot; char *ext=NULL; char *name=strdup(*argv++); dot=strrchr(name,'.'); if(dot) ext=dot+1; else{ ext=""; } /* codebook */ if(!strcmp(ext,"vqh")){ basename=strrchr(name,'/'); if(basename) basename=strdup(basename)+1; else basename=strdup(name); dot=strrchr(basename,'.'); if(dot)*dot='\0'; b=codebook_load(name); dim=b->dim; entries=b->entries; } /* data file; we do actually need to suck it into memory */ /* we're dealing with just one book, so we can de-interleave */ if(!strcmp(ext,"vqd") && !points){ int cols; long lines=0; char *line; float *vec; FILE *in=fopen(name,"r"); if(!in){ fprintf(stderr,"Could not open input file %s\n",name); exit(1); } reset_next_value(); line=setup_line(in); /* count cols before we start reading */ { char *temp=line; while(*temp==' ')temp++; for(cols=0;*temp;cols++){ while(*temp>32)temp++; while(*temp==' ')temp++; } } vec=alloca(cols*sizeof(float)); /* count, then load, to avoid fragmenting the hell out of memory */ while(line){ lines++; for(j=0;j<cols;j++) if(get_line_value(in,vec+j)){ fprintf(stderr,"Too few columns on line %ld in data file\n",lines); exit(1); } if((lines&0xff)==0)spinnit("counting samples...",lines*cols); line=setup_line(in); } pointlist=_ogg_malloc((cols*lines+entries*dim)*sizeof(float)); rewind(in); line=setup_line(in); while(line){ lines--; for(j=0;j<cols;j++) if(get_line_value(in,vec+j)){ fprintf(stderr,"Too few columns on line %ld in data file\n",lines); exit(1); } /* deinterleave, add to heap */ add_vector(b,vec,cols); if((lines&0xff)==0)spinnit("loading samples...",lines*cols); line=setup_line(in); } fclose(in); } } break; case 2: target=atol(*argv++); if(target==0)target=entries; break; case 3: protect=atol(*argv++); break; case 4: { char *buff=alloca(strlen(*argv)+5); sprintf(buff,"%s.vqh",*argv); basename=*argv++; out=fopen(buff,"w"); if(!out){ fprintf(stderr,"unable ot open %s for output",buff); exit(1); } } break; default: usage(); } } } if(!entries || !points || !out)usage(); if(target==-1)usage(); /* add guard points */ for(i=0;i<entries;i++) for(j=0;j<dim;j++) pointlist[points++]=b->valuelist[i*dim+j]; points/=dim; /* set up auxiliary vectors for error tracking */ { encode_aux_nearestmatch *nt=NULL; long pointssofar=0; long *pointindex; long indexedpoints=0; long *entryindex; long *reventry; long *membership=_ogg_malloc(points*sizeof(long)); long *firsthead=_ogg_malloc(entries*sizeof(long)); long *secondary=_ogg_malloc(points*sizeof(long)); long *secondhead=_ogg_malloc(entries*sizeof(long)); long *cellcount=_ogg_calloc(entries,sizeof(long)); long *cellcount2=_ogg_calloc(entries,sizeof(long)); float *cellerror=_ogg_calloc(entries,sizeof(float)); float *cellerrormax=_ogg_calloc(entries,sizeof(float)); long cellsleft=entries; for(i=0;i<points;i++)membership[i]=-1; for(i=0;i<entries;i++)firsthead[i]=-1; for(i=0;i<points;i++)secondary[i]=-1; for(i=0;i<entries;i++)secondhead[i]=-1; for(i=0;i<points;i++){ /* assign vectors to the nearest cell. Also keep track of second nearest for error statistics */ float *ppt=pointlist+i*dim; int firstentry=closest(b,ppt,-1); int secondentry=closest(b,ppt,firstentry); float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt); float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt); if(!(i&0xff))spinnit("initializing... ",points-i); membership[i]=firsthead[firstentry]; firsthead[firstentry]=i; secondary[i]=secondhead[secondentry]; secondhead[secondentry]=i; if(i<points-entries){ cellerror[firstentry]+=secondmetric-firstmetric; cellerrormax[firstentry]=max(cellerrormax[firstentry], _heuristic(b,ppt,secondentry)); cellcount[firstentry]++; cellcount2[secondentry]++; } } /* which cells are most heavily populated? Protect as many from dispersal as the user has requested */ { long **countindex=_ogg_calloc(entries,sizeof(long *)); for(i=0;i<entries;i++)countindex[i]=cellcount+i; qsort(countindex,entries,sizeof(long *),longsort); for(i=0;i<protect;i++){ int ptr=countindex[i]-cellcount; cellerrormax[ptr]=9e50f; } } { fprintf(stderr,"\r"); for(i=0;i<entries;i++){ /* decompose index */ int entry=i; for(j=0;j<dim;j++){ fprintf(stderr,"%d:",entry%b->c->thresh_tree->quantvals); entry/=b->c->thresh_tree->quantvals; } fprintf(stderr,":%ld/%ld, ",cellcount[i],cellcount2[i]); } fprintf(stderr,"\n"); } /* do the automatic cull request */ while(cellsleft>target){ int bestcell=-1; float besterror=0; float besterror2=0; long head=-1; char spinbuf[80]; sprintf(spinbuf,"cells left to eliminate: %ld : ",cellsleft-target); /* find the cell with lowest removal impact */ for(i=0;i<entries;i++){ if(b->c->lengthlist[i]>0){ if(bestcell==-1 || cellerrormax[i]<=besterror2){ if(bestcell==-1 || cellerrormax[i]<besterror2 || besterror>cellerror[i]){ besterror=cellerror[i]; besterror2=cellerrormax[i]; bestcell=i; } } } } fprintf(stderr,"\reliminating cell %d \n" " dispersal error of %g max/%g total (%ld hits)\n", bestcell,besterror2,besterror,cellcount[bestcell]); /* disperse it. move each point out, adding it (properly) to the second best */ b->c->lengthlist[bestcell]=0; head=firsthead[bestcell]; firsthead[bestcell]=-1; while(head!=-1){ /* head is a point number */ float *ppt=pointlist+head*dim; int firstentry=closest(b,ppt,-1); int secondentry=closest(b,ppt,firstentry); float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt); float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt); long next=membership[head]; if(head<points-entries){ cellcount[firstentry]++; cellcount[bestcell]--; cellerror[firstentry]+=secondmetric-firstmetric; cellerrormax[firstentry]=max(cellerrormax[firstentry], _heuristic(b,ppt,secondentry)); } membership[head]=firsthead[firstentry]; firsthead[firstentry]=head; head=next; if(cellcount[bestcell]%128==0) spinnit(spinbuf,cellcount[bestcell]+cellcount2[bestcell]); } /* now see that all points that had the dispersed cell as second choice have second choice reassigned */ head=secondhead[bestcell]; secondhead[bestcell]=-1; while(head!=-1){ float *ppt=pointlist+head*dim; /* who are we assigned to now? */ int firstentry=closest(b,ppt,-1); /* what is the new second closest match? */ int secondentry=closest(b,ppt,firstentry); /* old second closest is the cell being disbanded */ float oldsecondmetric=_dist(dim,b->valuelist+dim*bestcell,ppt); /* new second closest error */ float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt); long next=secondary[head]; if(head<points-entries){ cellcount2[secondentry]++; cellcount2[bestcell]--; cellerror[firstentry]+=secondmetric-oldsecondmetric; cellerrormax[firstentry]=max(cellerrormax[firstentry], _heuristic(b,ppt,secondentry)); } secondary[head]=secondhead[secondentry]; secondhead[secondentry]=head; head=next; if(cellcount2[bestcell]%128==0) spinnit(spinbuf,cellcount2[bestcell]); } cellsleft--; } /* paring is over. Build decision trees using points that now fall through the thresh matcher. */ /* we don't free membership; we flatten it in order to use in lp_split */ for(i=0;i<entries;i++){ long head=firsthead[i]; spinnit("rearranging membership cache... ",entries-i); while(head!=-1){ long next=membership[head]; membership[head]=i; head=next; } } free(secondhead); free(firsthead); free(cellerror); free(cellerrormax); free(secondary); pointindex=_ogg_malloc(points*sizeof(long)); /* make a point index of fall-through points */ for(i=0;i<points;i++){ int best=_best(b,pointlist+i*dim,1); if(best==-1) pointindex[indexedpoints++]=i; spinnit("finding orphaned points... ",points-i); } /* make an entry index */ entryindex=_ogg_malloc(entries*sizeof(long)); target=0; for(i=0;i<entries;i++){ if(b->c->lengthlist[i]>0) entryindex[target++]=i; } /* make working space for a reverse entry index */ reventry=_ogg_malloc(entries*sizeof(long)); /* do the split */ nt=b->c->nearest_tree= _ogg_calloc(1,sizeof(encode_aux_nearestmatch)); nt->alloc=4096; nt->ptr0=_ogg_malloc(sizeof(long)*nt->alloc); nt->ptr1=_ogg_malloc(sizeof(long)*nt->alloc); nt->p=_ogg_malloc(sizeof(long)*nt->alloc); nt->q=_ogg_malloc(sizeof(long)*nt->alloc); nt->aux=0; fprintf(stderr,"Leaves added: %d \n", lp_split(pointlist,points, b,entryindex,target, pointindex,indexedpoints, membership,reventry, 0,&pointssofar)); free(membership); free(reventry); free(pointindex); /* hack alert. I should just change the damned splitter and codebook writer */ for(i=0;i<nt->aux;i++)nt->p[i]*=dim; for(i=0;i<nt->aux;i++)nt->q[i]*=dim; /* recount hits. Build new lengthlist. reuse entryindex storage */ for(i=0;i<entries;i++)entryindex[i]=1; for(i=0;i<points-entries;i++){ int best=_best(b,pointlist+i*dim,1); float *a=pointlist+i*dim; if(!(i&0xff))spinnit("counting hits...",i); if(best==-1){ fprintf(stderr,"\nINTERNAL ERROR; a point count not be matched to a\n" "codebook entry. The new decision tree is broken.\n"); exit(1); } entryindex[best]++; } for(i=0;i<nt->aux;i++)nt->p[i]/=dim; for(i=0;i<nt->aux;i++)nt->q[i]/=dim; /* the lengthlist builder doesn't actually deal with 0 hit entries. So, we pack the 'sparse' hit list into a dense list, then unpack the lengths after the build */ { int upper=0; long *lengthlist=_ogg_calloc(entries,sizeof(long)); for(i=0;i<entries;i++){ if(b->c->lengthlist[i]>0) entryindex[upper++]=entryindex[i]; else{ if(entryindex[i]>1){ fprintf(stderr,"\nINTERNAL ERROR; _best matched to unused entry\n"); exit(1); } } } /* sanity check */ if(upper != target){ fprintf(stderr,"\nINTERNAL ERROR; packed the wrong number of entries\n"); exit(1); } build_tree_from_lengths(upper,entryindex,lengthlist); upper=0; for(i=0;i<entries;i++){ if(b->c->lengthlist[i]>0) b->c->lengthlist[i]=lengthlist[upper++]; } } } /* we're done. write it out. */ write_codebook(out,basename,b->c); fprintf(stderr,"\r \nDone.\n"); return(0); }