Example #1
0
int get_vector(codebook *b,FILE *in,int start, int n,float *a){
  int i;
  const static_codebook *c=b->c;

  while(1){

    if(v_sofar==n || get_line_value(in,a)){
      reset_next_value();
      if(get_next_value(in,a))
	break;
      for(i=0;i<start;i++){
	sequence_base=*a;
	get_line_value(in,a);
      }
    }

    for(i=1;i<c->dim;i++)
      if(get_line_value(in,a+i))
	break;
    
    if(i==c->dim){
      float temp=a[c->dim-1];
      for(i=0;i<c->dim;i++)a[i]-=sequence_base;
      if(c->q_sequencep)sequence_base=temp;
      v_sofar++;
      return(0);
    }
    sequence_base=0.f;
  }

  return(-1);
}
Example #2
0
int get_next_value(FILE *in,float *value){
  while(1){
    if(get_line_value(in,value)){
      value_line_buff=get_line(in);
      if(!value_line_buff)return(-1);
    }else{
      return(0);
    }
  }
}
Example #3
0
static int getline(FILE *in,float *vec,int begin,int n){
  int i,next=0;

  reset_next_value();
  if(get_next_value(in,vec))return(0);
  if(begin){
    for(i=1;i<begin;i++)
      get_line_value(in,vec);
    next=0;
  }else{
    next=1;
  }

  for(i=next;i<n;i++)
    if(get_line_value(in,vec+i)){
      fprintf(stderr,"ran out of columns in input data\n");
      exit(1);
    }
  
  return(1);
}
Example #4
0
static int getval(FILE *in,int begin,int n,int group,int max){
  float v;
  int i;
  long val=0;

  if(nsofar>=n || get_line_value(in,&v)){
    reset_next_value();
    nsofar=0;
    if(get_next_value(in,&v))
      return(-1);
    for(i=1;i<=begin;i++)
      get_line_value(in,&v);
  }

  val=(int)v;
  nsofar++;

  for(i=1;i<group;i++,nsofar++)
    if(nsofar>=n || get_line_value(in,&v))
      return(getval(in,begin,n,group,max));
    else
      val = val*max+(int)v;
  return(val);
}
Example #5
0
int main(int argc,char *argv[]){
  char *basename;
  codebook **b=_ogg_calloc(1,sizeof(codebook *));
  int *addmul=_ogg_calloc(1,sizeof(int));
  int books=0;
  int input=0;
  int interleave=0;
  int j;
  int start=0;
  int num=-1;
  argv++;

  if(*argv==NULL){
    process_usage();
    exit(1);
  }

  /* yes, this is evil.  However, it's very convenient to parse file
     extentions */

  while(*argv){
    if(*argv[0]=='-'){
      /* option */
      if(argv[0][1]=='s'){
	/* subvector */
	if(sscanf(argv[1],"%d,%d",&start,&num)!=2){
	  num= -1;
	  if(sscanf(argv[1],"%d",&start)!=1){
	    fprintf(stderr,"Syntax error using -s\n");
	    exit(1);
	  }
	}
	argv+=2;
      }
      if(argv[0][1]=='i'){
	/* interleave */
	interleave=1;
	argv+=1;
      }
    }else{
      /* input file.  What kind? */
      char *dot;
      char *ext=NULL;
      char *name=strdup(*argv++);
      dot=strrchr(name,'.');
      if(dot)
	ext=dot+1;
      else
	ext="";

      /* codebook */
      if(!strcmp(ext,"vqh")){
	int multp=0;
	if(input){
	  fprintf(stderr,"specify all input data (.vqd) files following\n"
		  "codebook header (.vqh) files\n");
	  exit(1);
	}
	/* is it additive or multiplicative? */
	if(name[0]=='*'){
	  multp=1;
	  name++;
	}
	if(name[0]=='+')name++;

	basename=strrchr(name,'/');
	if(basename)
	  basename=strdup(basename)+1;
	else
	  basename=strdup(name);
	dot=strrchr(basename,'.');
	if(dot)*dot='\0';

	b=_ogg_realloc(b,sizeof(codebook *)*(books+2));
	b[books]=codebook_load(name);
	addmul=_ogg_realloc(addmul,sizeof(int)*(books+1));
	addmul[books++]=multp;
	b[books]=NULL;
      }

      /* data file */
      if(!strcmp(ext,"vqd")){
	int cols;
	long lines=0;
	char *line;
	float *vec;
	FILE *in=fopen(name,"r");
	if(!in){
	  fprintf(stderr,"Could not open input file %s\n",name);
	  exit(1);
	}

	if(!input){
	  process_preprocess(b,basename);
	  input++;
	}

	reset_next_value();
	line=setup_line(in);
	/* count cols before we start reading */
	{
	  char *temp=line;
	  while(*temp==' ')temp++;
	  for(cols=0;*temp;cols++){
	    while(*temp>32)temp++;
	    while(*temp==' ')temp++;
	  }
	}
	vec=alloca(cols*sizeof(float));
	while(line){
	  lines++;
	  for(j=0;j<cols;j++)
	    if(get_line_value(in,vec+j)){
	      fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
	      exit(1);
	    }
	  /* ignores -s for now */
	  process_vector(b,addmul,interleave,vec,cols);

	  line=setup_line(in);
	}
	fclose(in);
      }
    }
  }

  /* take any data from stdin */
  {
    struct stat st;
    if(fstat(STDIN_FILENO,&st)==-1){
      fprintf(stderr,"Could not stat STDIN\n");
      exit(1);
    }
    if((S_IFIFO|S_IFREG|S_IFSOCK)&st.st_mode){
      int cols;
      char *line;
      long lines=0;
      float *vec;
      if(!input){
	process_preprocess(b,basename);
	input++;
      }
      
      line=setup_line(stdin);
      /* count cols before we start reading */
      {
	char *temp=line;
	while(*temp==' ')temp++;
	for(cols=0;*temp;cols++){
	  while(*temp>32)temp++;
	  while(*temp==' ')temp++;
	}
      }
      vec=alloca(cols*sizeof(float));
      while(line){
	lines++;
	for(j=0;j<cols;j++)
	  if(get_line_value(stdin,vec+j)){
	    fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
	    exit(1);
	  }
	/* ignores -s for now */
	process_vector(b,addmul,interleave,vec,cols);
	
	line=setup_line(stdin);
      }
    }
  }

  process_postprocess(b,basename);

  return 0;
}
Example #6
0
int main(int argc,char *argv[]){
  char *basename;
  codebook *b=NULL;
  int entries=0;
  int dim=0;
  long i,j,target=-1,protect=-1;
  FILE *out=NULL;

  int argnum=0;

  argv++;
  if(*argv==NULL){
    usage();
    exit(1);
  }

  while(*argv){
    if(*argv[0]=='-'){

      argv++;
	
    }else{
      switch (argnum++){
      case 0:case 1:
	{
	  /* yes, this is evil.  However, it's very convenient to parse file
	     extentions */
	  
	  /* input file.  What kind? */
	  char *dot;
	  char *ext=NULL;
	  char *name=strdup(*argv++);
	  dot=strrchr(name,'.');
	  if(dot)
	    ext=dot+1;
	  else{
	    ext="";
	    
	  }
	  
	  
	  /* codebook */
	  if(!strcmp(ext,"vqh")){
	    
	    basename=strrchr(name,'/');
	    if(basename)
	      basename=strdup(basename)+1;
	    else
	      basename=strdup(name);
	    dot=strrchr(basename,'.');
	    if(dot)*dot='\0';
	    
	    b=codebook_load(name);
	    dim=b->dim;
	    entries=b->entries;
	  }
	  
	  /* data file; we do actually need to suck it into memory */
	  /* we're dealing with just one book, so we can de-interleave */ 
	  if(!strcmp(ext,"vqd") && !points){
	    int cols;
	    long lines=0;
	    char *line;
	    float *vec;
	    FILE *in=fopen(name,"r");
	    if(!in){
	      fprintf(stderr,"Could not open input file %s\n",name);
	      exit(1);
	    }
	    
	    reset_next_value();
	    line=setup_line(in);
	    /* count cols before we start reading */
	    {
	      char *temp=line;
	      while(*temp==' ')temp++;
	      for(cols=0;*temp;cols++){
		while(*temp>32)temp++;
		while(*temp==' ')temp++;
	      }
	    }
	    vec=alloca(cols*sizeof(float));
	    /* count, then load, to avoid fragmenting the hell out of
	       memory */
	    while(line){
	      lines++;
	      for(j=0;j<cols;j++)
		if(get_line_value(in,vec+j)){
		  fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
		  exit(1);
		}
	      if((lines&0xff)==0)spinnit("counting samples...",lines*cols);
	      line=setup_line(in);
	    }
	    pointlist=_ogg_malloc((cols*lines+entries*dim)*sizeof(float));
	    
	    rewind(in);
	    line=setup_line(in);
	    while(line){
	      lines--;
	      for(j=0;j<cols;j++)
		if(get_line_value(in,vec+j)){
		  fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
		  exit(1);
		}
	      /* deinterleave, add to heap */
	      add_vector(b,vec,cols);
	      if((lines&0xff)==0)spinnit("loading samples...",lines*cols);
	      
	      line=setup_line(in);
	    }
	    fclose(in);
	  }
	}
	break;
      case 2:
	target=atol(*argv++);
	if(target==0)target=entries;
	break;
      case 3:
	protect=atol(*argv++);
	break;
      case 4:
	{
	  char *buff=alloca(strlen(*argv)+5);
	  sprintf(buff,"%s.vqh",*argv);
	  basename=*argv++;

	  out=fopen(buff,"w");
	  if(!out){
	    fprintf(stderr,"unable ot open %s for output",buff);
	    exit(1);
	  }
	}
	break;
      default:
	usage();
      }
    }
  }
  if(!entries || !points || !out)usage();
  if(target==-1)usage();

  /* add guard points */
  for(i=0;i<entries;i++)
    for(j=0;j<dim;j++)
      pointlist[points++]=b->valuelist[i*dim+j];
  
  points/=dim;

  /* set up auxiliary vectors for error tracking */
  {
    encode_aux_nearestmatch *nt=NULL;
    long pointssofar=0;
    long *pointindex;
    long indexedpoints=0;
    long *entryindex;
    long *reventry;
    long *membership=_ogg_malloc(points*sizeof(long));
    long *firsthead=_ogg_malloc(entries*sizeof(long));
    long *secondary=_ogg_malloc(points*sizeof(long));
    long *secondhead=_ogg_malloc(entries*sizeof(long));

    long *cellcount=_ogg_calloc(entries,sizeof(long));
    long *cellcount2=_ogg_calloc(entries,sizeof(long));
    float *cellerror=_ogg_calloc(entries,sizeof(float));
    float *cellerrormax=_ogg_calloc(entries,sizeof(float));
    long cellsleft=entries;
    for(i=0;i<points;i++)membership[i]=-1;
    for(i=0;i<entries;i++)firsthead[i]=-1;
    for(i=0;i<points;i++)secondary[i]=-1;
    for(i=0;i<entries;i++)secondhead[i]=-1;

    for(i=0;i<points;i++){
      /* assign vectors to the nearest cell.  Also keep track of second
	 nearest for error statistics */
      float *ppt=pointlist+i*dim;
      int    firstentry=closest(b,ppt,-1);
      int    secondentry=closest(b,ppt,firstentry);
      float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
      float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
      
      if(!(i&0xff))spinnit("initializing... ",points-i);
    
      membership[i]=firsthead[firstentry];
      firsthead[firstentry]=i;
      secondary[i]=secondhead[secondentry];
      secondhead[secondentry]=i;

      if(i<points-entries){
	cellerror[firstentry]+=secondmetric-firstmetric;
	cellerrormax[firstentry]=max(cellerrormax[firstentry],
				     _heuristic(b,ppt,secondentry));
	cellcount[firstentry]++;
	cellcount2[secondentry]++;
      }
    }

    /* which cells are most heavily populated?  Protect as many from
       dispersal as the user has requested */
    {
      long **countindex=_ogg_calloc(entries,sizeof(long *));
      for(i=0;i<entries;i++)countindex[i]=cellcount+i;
      qsort(countindex,entries,sizeof(long *),longsort);
      for(i=0;i<protect;i++){
	int ptr=countindex[i]-cellcount;
	cellerrormax[ptr]=9e50f;
      }
    }

    {
      fprintf(stderr,"\r");
      for(i=0;i<entries;i++){
	/* decompose index */
	int entry=i;
	for(j=0;j<dim;j++){
	  fprintf(stderr,"%d:",entry%b->c->thresh_tree->quantvals);
	  entry/=b->c->thresh_tree->quantvals;
	}
	
	fprintf(stderr,":%ld/%ld, ",cellcount[i],cellcount2[i]);
      }
      fprintf(stderr,"\n");
    }

    /* do the automatic cull request */
    while(cellsleft>target){
      int bestcell=-1;
      float besterror=0;
      float besterror2=0;
      long head=-1;
      char spinbuf[80];
      sprintf(spinbuf,"cells left to eliminate: %ld : ",cellsleft-target);

      /* find the cell with lowest removal impact */
      for(i=0;i<entries;i++){
	if(b->c->lengthlist[i]>0){
	  if(bestcell==-1 || cellerrormax[i]<=besterror2){
	    if(bestcell==-1 || cellerrormax[i]<besterror2 || 
	       besterror>cellerror[i]){
	      besterror=cellerror[i];
	      besterror2=cellerrormax[i];
	      bestcell=i;
	    }
	  }
	}
      }

      fprintf(stderr,"\reliminating cell %d                              \n"
	      "     dispersal error of %g max/%g total (%ld hits)\n",
	      bestcell,besterror2,besterror,cellcount[bestcell]);

      /* disperse it.  move each point out, adding it (properly) to
         the second best */
      b->c->lengthlist[bestcell]=0;
      head=firsthead[bestcell];
      firsthead[bestcell]=-1;
      while(head!=-1){
	/* head is a point number */
	float *ppt=pointlist+head*dim;
	int firstentry=closest(b,ppt,-1);
	int secondentry=closest(b,ppt,firstentry);
	float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
	float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
	long next=membership[head];

	if(head<points-entries){
	  cellcount[firstentry]++;
	  cellcount[bestcell]--;
	  cellerror[firstentry]+=secondmetric-firstmetric;
	  cellerrormax[firstentry]=max(cellerrormax[firstentry],
				       _heuristic(b,ppt,secondentry));
	}

	membership[head]=firsthead[firstentry];
	firsthead[firstentry]=head;
	head=next;
	if(cellcount[bestcell]%128==0)
	  spinnit(spinbuf,cellcount[bestcell]+cellcount2[bestcell]);

      }

      /* now see that all points that had the dispersed cell as second
         choice have second choice reassigned */
      head=secondhead[bestcell];
      secondhead[bestcell]=-1;
      while(head!=-1){
	float *ppt=pointlist+head*dim;
	/* who are we assigned to now? */
	int firstentry=closest(b,ppt,-1);
	/* what is the new second closest match? */
	int secondentry=closest(b,ppt,firstentry);
	/* old second closest is the cell being disbanded */
	float oldsecondmetric=_dist(dim,b->valuelist+dim*bestcell,ppt);
	/* new second closest error */
	float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
	long next=secondary[head];

	if(head<points-entries){
	  cellcount2[secondentry]++;
	  cellcount2[bestcell]--;
	  cellerror[firstentry]+=secondmetric-oldsecondmetric;
	  cellerrormax[firstentry]=max(cellerrormax[firstentry],
				       _heuristic(b,ppt,secondentry));
	}
	
	secondary[head]=secondhead[secondentry];
	secondhead[secondentry]=head;
	head=next;

	if(cellcount2[bestcell]%128==0)
	  spinnit(spinbuf,cellcount2[bestcell]);
      }

      cellsleft--;
    }

    /* paring is over.  Build decision trees using points that now fall
       through the thresh matcher. */
    /* we don't free membership; we flatten it in order to use in lp_split */

    for(i=0;i<entries;i++){
      long head=firsthead[i];
      spinnit("rearranging membership cache... ",entries-i);
      while(head!=-1){
	long next=membership[head];
	membership[head]=i;
	head=next;
      }
    }

    free(secondhead);
    free(firsthead);
    free(cellerror);
    free(cellerrormax);
    free(secondary);

    pointindex=_ogg_malloc(points*sizeof(long));
    /* make a point index of fall-through points */
    for(i=0;i<points;i++){
      int best=_best(b,pointlist+i*dim,1);
      if(best==-1)
	pointindex[indexedpoints++]=i;
      spinnit("finding orphaned points... ",points-i);
    }

    /* make an entry index */
    entryindex=_ogg_malloc(entries*sizeof(long));
    target=0;
    for(i=0;i<entries;i++){
      if(b->c->lengthlist[i]>0)
	entryindex[target++]=i;
    }

    /* make working space for a reverse entry index */
    reventry=_ogg_malloc(entries*sizeof(long));

    /* do the split */
    nt=b->c->nearest_tree=
      _ogg_calloc(1,sizeof(encode_aux_nearestmatch));

    nt->alloc=4096;
    nt->ptr0=_ogg_malloc(sizeof(long)*nt->alloc);
    nt->ptr1=_ogg_malloc(sizeof(long)*nt->alloc);
    nt->p=_ogg_malloc(sizeof(long)*nt->alloc);
    nt->q=_ogg_malloc(sizeof(long)*nt->alloc);
    nt->aux=0;

    fprintf(stderr,"Leaves added: %d              \n",
            lp_split(pointlist,points,
                     b,entryindex,target,
                     pointindex,indexedpoints,
                     membership,reventry,
                     0,&pointssofar));
    free(membership);
    free(reventry);
    free(pointindex);

    /* hack alert.  I should just change the damned splitter and
       codebook writer */
    for(i=0;i<nt->aux;i++)nt->p[i]*=dim;
    for(i=0;i<nt->aux;i++)nt->q[i]*=dim;
    
    /* recount hits.  Build new lengthlist. reuse entryindex storage */
    for(i=0;i<entries;i++)entryindex[i]=1;
    for(i=0;i<points-entries;i++){
      int best=_best(b,pointlist+i*dim,1);
      float *a=pointlist+i*dim;
      if(!(i&0xff))spinnit("counting hits...",i);
      if(best==-1){
	fprintf(stderr,"\nINTERNAL ERROR; a point count not be matched to a\n"
		"codebook entry.  The new decision tree is broken.\n");
	exit(1);
      }
      entryindex[best]++;
    }
    for(i=0;i<nt->aux;i++)nt->p[i]/=dim;
    for(i=0;i<nt->aux;i++)nt->q[i]/=dim;
    
    /* the lengthlist builder doesn't actually deal with 0 hit entries.
       So, we pack the 'sparse' hit list into a dense list, then unpack
       the lengths after the build */
    {
      int upper=0;
      long *lengthlist=_ogg_calloc(entries,sizeof(long));
      for(i=0;i<entries;i++){
	if(b->c->lengthlist[i]>0)
	  entryindex[upper++]=entryindex[i];
	else{
	  if(entryindex[i]>1){
	    fprintf(stderr,"\nINTERNAL ERROR; _best matched to unused entry\n");
	    exit(1);
	  }
	}
      }
      
      /* sanity check */
      if(upper != target){
	fprintf(stderr,"\nINTERNAL ERROR; packed the wrong number of entries\n");
	exit(1);
      }
    
      build_tree_from_lengths(upper,entryindex,lengthlist);
      
      upper=0;
      for(i=0;i<entries;i++){
	if(b->c->lengthlist[i]>0)
	  b->c->lengthlist[i]=lengthlist[upper++];
      }

    }
  }
  /* we're done.  write it out. */
  write_codebook(out,basename,b->c);

  fprintf(stderr,"\r                                        \nDone.\n");
  return(0);
}