Exemple #1
0
void find_most_violated_constraint(SVECTOR **fydelta, double *rhs, 
				   EXAMPLE *ex, SVECTOR *fycached, long n, 
				   STRUCTMODEL *sm, STRUCT_LEARN_PARM *sparm,
				   double *rt_viol, double *rt_psi, 
				   long *argmax_count)
     /* returns fydelta=fy-fybar and rhs scalar value that correspond
	to the most violated constraint for example ex */
{
  double      rt2=0;
  LABEL       ybar;
  SVECTOR     *fybar, *fy;
  double      factor,lossval;

  if(struct_verbosity>=2) rt2=get_runtime();
  (*argmax_count)++;
  if(sparm->loss_type == SLACK_RESCALING) 
    ybar=find_most_violated_constraint_slackrescaling(ex->x,ex->y,sm,sparm);
  else
    ybar=find_most_violated_constraint_marginrescaling(ex->x,ex->y,sm,sparm);
  if(struct_verbosity>=2) (*rt_viol)+=MAX(get_runtime()-rt2,0);
  
  if(empty_label(ybar)) {
    printf("ERROR: empty label was returned for example\n");
    /* exit(1); */
    /* continue; */
  }
  
  /**** get psi(x,y) and psi(x,ybar) ****/
  if(struct_verbosity>=2) rt2=get_runtime();
  if(fycached)
    fy=copy_svector(fycached); 
  else 
    fy=psi(ex->x,ex->y,sm,sparm);
  fybar=psi(ex->x,ybar,sm,sparm);
  if(struct_verbosity>=2) (*rt_psi)+=MAX(get_runtime()-rt2,0);
  lossval=loss(ex->y,ybar,sparm);
  free_label(ybar);
  
  /**** scale feature vector and margin by loss ****/
  if(sparm->loss_type == SLACK_RESCALING)
    factor=lossval/n;
  else                 /* do not rescale vector for */
    factor=1.0/n;      /* margin rescaling loss type */
  mult_svector_list(fy,factor);
  mult_svector_list(fybar,-factor);
  append_svector_list(fybar,fy);   /* compute fy-fybar */

  (*fydelta)=fybar;
  (*rhs)=lossval/n;
}
Exemple #2
0
void svm_learn_struct(SAMPLE sample, STRUCT_LEARN_PARM *sparm,
		      LEARN_PARM *lparm, KERNEL_PARM *kparm, 
		      STRUCTMODEL *sm)
{
  int         i,j;
  int         numIt=0;
  long        newconstraints=0, activenum=0; 
  int         opti_round, *opti;
  long        old_numConst=0;
  double      epsilon;
  long        tolerance;
  double      lossval,factor;
  double      margin=0;
  double      slack, *slacks, slacksum;
  long        sizePsi;
  double      *alpha=NULL;
  CONSTSET    cset;
  SVECTOR     *diff=NULL;
  SVECTOR     *fy, *fybar, *f;
  SVECTOR     *slackvec;
  WORD        slackv[2];
  MODEL       *svmModel=NULL;
  KERNEL_CACHE *kcache=NULL;
  LABEL       ybar;
  DOC         *doc;

  long        n=sample.n;
  EXAMPLE     *ex=sample.examples;
  double      rt_total=0.0, rt_opt=0.0;
  long        rt1,rt2;

  init_struct_model(sample,sm,sparm); 
  sizePsi=sm->sizePsi+1;          /* sm must contain size of psi on return */

  /* initialize example selection heuristic */ 
  opti=(int*)my_malloc(n*sizeof(int));
  for(i=0;i<n;i++) {
    opti[i]=0;
  }
  opti_round=0;

  if(sparm->slack_norm == 1) {
    lparm->svm_c=sparm->C;          /* set upper bound C */
    lparm->sharedslack=1;
  }
  else if(sparm->slack_norm == 2) {
    lparm->svm_c=999999999999999.0; /* upper bound C must never be reached */
    lparm->sharedslack=0;
    if(kparm->kernel_type != LINEAR) {
      printf("ERROR: Kernels are not implemented for L2 slack norm!"); 
      fflush(stdout);
      exit(0);
    }
  }
  else {
    printf("ERROR: Slack norm must be L1 or L2!"); fflush(stdout);
    exit(0);
  }


  epsilon=1.0;                    /* start with low precision and
				     increase later */
  tolerance=n/100;                /* increase precision, whenever less
                                     than that number of constraints
                                     is not fulfilled */
  lparm->biased_hyperplane=0;     /* set threshold to zero */

  cset=init_struct_constraints(sample, sm, sparm);
  if(cset.m > 0) {
    alpha=realloc(alpha,sizeof(double)*cset.m);
    for(i=0; i<cset.m; i++) 
      alpha[i]=0;
  }

  /* set initial model and slack variables*/
  svmModel=(MODEL *)my_malloc(sizeof(MODEL));
  svm_learn_optimization(cset.lhs,cset.rhs,cset.m,sizePsi+n,
			 lparm,kparm,NULL,svmModel,alpha);
  add_weight_vector_to_linear_model(svmModel);
  sm->svm_model=svmModel;
  sm->w=svmModel->lin_weights; /* short cut to weight vector */

  printf("Starting Iterations\n");

    /*****************/
   /*** main loop ***/
  /*****************/
  do { /* iteratively increase precision */

    epsilon=MAX(epsilon*0.09999999999,sparm->epsilon);
    if(epsilon == sparm->epsilon)   /* for final precision, find all SV */
      tolerance=0;
    lparm->epsilon_crit=epsilon/2;  /* svm precision must be higher than eps */
    if(struct_verbosity>=1)
      printf("Setting current working precision to %g.\n",epsilon);

    do { /* iteration until (approx) all SV are found for current
            precision and tolerance */
      
      old_numConst=cset.m;
      opti_round++;
      activenum=n;

      do { /* go through examples that keep producing new constraints */

	if(struct_verbosity>=1) { 
	  printf("--Iteration %i (%ld active): ",++numIt,activenum); 
	  fflush(stdout);
	}
	
	for(i=0; i<n; i++) { /*** example loop ***/
	  
	  rt1=get_runtime();
	    
	  if(opti[i] != opti_round) {/* if the example is not shrunk
	                                away, then see if it is necessary to 
					add a new constraint */
	    if(sparm->loss_type == SLACK_RESCALING) 
	      ybar=find_most_violated_constraint_slackrescaling(ex[i].x,
								ex[i].y,sm,
								sparm);
	    else
	      ybar=find_most_violated_constraint_marginrescaling(ex[i].x,
								 ex[i].y,sm,
								 sparm);
	    
	    if(empty_label(ybar)) {
	      if(opti[i] != opti_round) {
		activenum--;
		opti[i]=opti_round; 
	      }
	      if(struct_verbosity>=2)
		printf("no-incorrect-found(%i) ",i);
	      continue;
	    }
	  
	    /**** get psi(y)-psi(ybar) ****/
	    fy=psi(ex[i].x,ex[i].y,sm,sparm);
	    fybar=psi(ex[i].x,ybar,sm,sparm);
	    
	    /**** scale feature vector and margin by loss ****/
	    lossval=loss(ex[i].y,ybar,sparm);
	    if(sparm->slack_norm == 2)
	      lossval=sqrt(lossval);
	    if(sparm->loss_type == SLACK_RESCALING)
	      factor=lossval;
	    else               /* do not rescale vector for */
	      factor=1.0;      /* margin rescaling loss type */
	    for(f=fy;f;f=f->next)
	      f->factor*=factor;
	    for(f=fybar;f;f=f->next)
	      f->factor*=-factor;
	    margin=lossval;

	    /**** create constraint for current ybar ****/
	    append_svector_list(fy,fybar);/* append the two vector lists */
	    doc=create_example(cset.m,0,i+1,1,fy);

	    /**** compute slack for this example ****/
	    slack=0;
	    for(j=0;j<cset.m;j++) 
	      if(cset.lhs[j]->slackid == i+1) {
		if(sparm->slack_norm == 2) /* works only for linear kernel */
		  slack=MAX(slack,cset.rhs[j]
			          -(classify_example(svmModel,cset.lhs[j])
				    -sm->w[sizePsi+i]/(sqrt(2*sparm->C))));
		else
		  slack=MAX(slack,
			   cset.rhs[j]-classify_example(svmModel,cset.lhs[j]));
	      }
	    
	    /**** if `error' add constraint and recompute ****/
	    if((classify_example(svmModel,doc)+slack)<(margin-epsilon)) { 
	      if(struct_verbosity>=2)
		{printf("(%i) ",i); fflush(stdout);}
	      if(struct_verbosity==1)
		{printf("."); fflush(stdout);}
	      
	      /**** resize constraint matrix and add new constraint ****/
	      cset.m++;
	      cset.lhs=realloc(cset.lhs,sizeof(DOC *)*cset.m);
	      if(kparm->kernel_type == LINEAR) {
		diff=add_list_ss(fy); /* store difference vector directly */
		if(sparm->slack_norm == 1) 
		  cset.lhs[cset.m-1]=create_example(cset.m-1,0,i+1,1,
						    copy_svector(diff));
		else if(sparm->slack_norm == 2) {
		  /**** add squared slack variable to feature vector ****/
		  slackv[0].wnum=sizePsi+i;
		  slackv[0].weight=1/(sqrt(2*sparm->C));
		  slackv[1].wnum=0; /*terminator*/
		  slackvec=create_svector(slackv,"",1.0);
		  cset.lhs[cset.m-1]=create_example(cset.m-1,0,i+1,1,
						    add_ss(diff,slackvec));
		  free_svector(slackvec);
		}
		free_svector(diff);
	      }
	      else { /* kernel is used */
		if(sparm->slack_norm == 1) 
		  cset.lhs[cset.m-1]=create_example(cset.m-1,0,i+1,1,
						    copy_svector(fy));
		else if(sparm->slack_norm == 2)
		  exit(1);
	      }
	      cset.rhs=realloc(cset.rhs,sizeof(double)*cset.m);
	      cset.rhs[cset.m-1]=margin;
	      alpha=realloc(alpha,sizeof(double)*cset.m);
	      alpha[cset.m-1]=0;
	      newconstraints++;
	    }
	    else {
	      printf("+"); fflush(stdout); 
	      if(opti[i] != opti_round) {
		activenum--;
		opti[i]=opti_round; 
	      }
	    }

	    free_example(doc,0);
	    free_svector(fy); /* this also free's fybar */
	    free_label(ybar);
	  }

	  /**** get new QP solution ****/
	  if((newconstraints >= sparm->newconstretrain) 
	     || ((newconstraints > 0) && (i == n-1))) {
	    if(struct_verbosity>=1) {
	      printf("*");fflush(stdout);
	    }
	    rt2=get_runtime();
	    free_model(svmModel,0);
	    svmModel=(MODEL *)my_malloc(sizeof(MODEL));
	    /* Always get a new kernel cache. It is not possible to use the
	       same cache for two different training runs */
	    if(kparm->kernel_type != LINEAR)
	      kcache=kernel_cache_init(cset.m,lparm->kernel_cache_size);
	    /* Run the QP solver on cset. */
	    svm_learn_optimization(cset.lhs,cset.rhs,cset.m,sizePsi+n,
				   lparm,kparm,kcache,svmModel,alpha);
	    if(kcache)
	      kernel_cache_cleanup(kcache);
	    /* Always add weight vector, in case part of the kernel is
	       linear. If not, ignore the weight vector since its
	       content is bogus. */
	    add_weight_vector_to_linear_model(svmModel);
	    sm->svm_model=svmModel;
	    sm->w=svmModel->lin_weights; /* short cut to weight vector */
	    rt_opt+=MAX(get_runtime()-rt2,0);
	    
	    newconstraints=0;
	  }	

	  rt_total+=MAX(get_runtime()-rt1,0);
	} /* end of example loop */

	if(struct_verbosity>=1)
	  printf("(NumConst=%d, SV=%ld, Eps=%.4f)\n",cset.m,svmModel->sv_num-1,
		 svmModel->maxdiff);

      } while(activenum > 0);   /* repeat until all examples produced no
				   constraint at least once */

    } while((cset.m - old_numConst) > tolerance) ;

  } while(epsilon > sparm->epsilon);  

  if(struct_verbosity>=1) {
    /**** compute sum of slacks ****/
    slacks=(double *)my_malloc(sizeof(double)*(n+1));
    for(i=0; i<=n; i++) { 
      slacks[i]=0;
    }
    if(sparm->slack_norm == 1) {
      for(j=0;j<cset.m;j++) 
	slacks[cset.lhs[j]->slackid]=MAX(slacks[cset.lhs[j]->slackid],
			   cset.rhs[j]-classify_example(svmModel,cset.lhs[j]));
      }
    else if(sparm->slack_norm == 2) {
      for(j=0;j<cset.m;j++) 
	slacks[cset.lhs[j]->slackid]=MAX(slacks[cset.lhs[j]->slackid],
		cset.rhs[j]
	         -(classify_example(svmModel,cset.lhs[j])
		   -sm->w[sizePsi+cset.lhs[j]->slackid-1]/(sqrt(2*sparm->C))));
    }
    slacksum=0;
    for(i=0; i<=n; i++)  
      slacksum+=slacks[i];
    free(slacks);

    printf("Final epsilon on KKT-Conditions: %.5f\n",
	   MAX(svmModel->maxdiff,epsilon));
    printf("Total number of constraints added: %i\n",(int)cset.m);
    if(sparm->slack_norm == 1) {
      printf("Number of SV: %ld \n",svmModel->sv_num-1);
      printf("Number of non-zero slack variables: %ld (out of %ld)\n",
	     svmModel->at_upper_bound,n);
      printf("Norm of weight vector: |w|=%.5f\n",
	     model_length_s(svmModel,kparm));
    }
    else if(sparm->slack_norm == 2){ 
      printf("Number of SV: %ld (including %ld at upper bound)\n",
	     svmModel->sv_num-1,svmModel->at_upper_bound);
      printf("Norm of weight vector (including L2-loss): |w|=%.5f\n",
	     model_length_s(svmModel,kparm));
    }
    printf("Sum of slack variables: sum(xi_i)=%.5f\n",slacksum);
    printf("Norm of longest difference vector: ||Psi(x,y)-Psi(x,ybar)||=%.5f\n",
	   length_of_longest_document_vector(cset.lhs,cset.m,kparm));
    printf("Runtime in cpu-seconds: %.2f (%.2f%% for SVM optimization)\n",
	   rt_total/100.0, 100.0*rt_opt/rt_total);
  }
  if(struct_verbosity>=4)
    printW(sm->w,sizePsi,n,lparm->svm_c);

  if(svmModel) {
    sm->svm_model=copy_model(svmModel);
    sm->w=sm->svm_model->lin_weights; /* short cut to weight vector */
  }

  print_struct_learning_stats(sample,sm,cset,alpha,sparm);

  if(svmModel)
    free_model(svmModel,0);
  free(alpha); 
  free(opti); 
  free(cset.rhs); 
  for(i=0;i<cset.m;i++) 
    free_example(cset.lhs[i],1);
  free(cset.lhs);
}
int main (int argc, char* argv[])
{
  long correct=0,incorrect=0,no_accuracy=0;
  long i;
  double t1,runtime=0;
  double avgloss=0,l;
  FILE *predfl;
  STRUCTMODEL model; 
  STRUCT_LEARN_PARM sparm;
  STRUCT_TEST_STATS teststats;
  SAMPLE testsample;
  LABEL y;

  svm_struct_classify_api_init(argc,argv);

  read_input_parameters(argc,argv,testfile,modelfile,predictionsfile,&sparm,
			&verbosity,&struct_verbosity);

  if(struct_verbosity>=1) {
    printf("Reading model..."); fflush(stdout);
  }
  model=read_struct_model(modelfile,&sparm);
  if(struct_verbosity>=1) {
    fprintf(stdout, "done.\n");
  }

  if(model.svm_model->kernel_parm.kernel_type == LINEAR) { /* linear kernel */
    /* compute weight vector */
    //add_weight_vector_to_linear_model(model.svm_model);
    //model.w=model.svm_model->lin_weights;
  }
  
  if(struct_verbosity>=1) {
    printf("Reading test examples..."); fflush(stdout);
  }
  testsample=read_struct_examples(testfile,&sparm);
  if(struct_verbosity>=1) {
    printf("done.\n"); fflush(stdout);
  }

  if(struct_verbosity>=1) {
    printf("Classifying test examples..."); fflush(stdout);
  }

  if ((predfl = fopen (predictionsfile, "w")) == NULL)
  { perror (predictionsfile); exit (1); }

  for(i=0;i<testsample.n;i++) {
    t1=get_runtime();
    y=classify_struct_example(testsample.examples[i].x,&model,&sparm);
    runtime+=(get_runtime()-t1);

    write_label(predfl,y);
    l=loss(testsample.examples[i].y,y,&sparm);
    avgloss+=l;
    if(l == 0) 
      correct++;
    else
      incorrect++;
    eval_prediction(i,testsample.examples[i],y,&model,&sparm,&teststats);

    if(empty_label(testsample.examples[i].y)) 
      { no_accuracy=1; } /* test data is not labeled */
    if(struct_verbosity>=2) {
      if((i+1) % 100 == 0) {
	printf("%ld..",i+1); fflush(stdout);
      }
    }
    free_label(y);
  }  
  avgloss/=testsample.n;
  fclose(predfl);

  if(struct_verbosity>=1) {
    printf("done\n");
    printf("Runtime (without IO) in cpu-seconds: %.2f\n",
	   (float)(runtime/100.0));    
  }
  if((!no_accuracy) && (struct_verbosity>=1)) {
    printf("Average loss on test set: %.4f\n",(float)avgloss);
    printf("Zero/one-error on test set: %.2f%% (%ld correct, %ld incorrect, %d total)\n",(float)100.0*incorrect/testsample.n,correct,incorrect,testsample.n);
  }
  print_struct_testing_stats(testsample,&model,&sparm,&teststats);
  free_struct_sample(testsample);
  free_struct_model(model);

  svm_struct_classify_api_exit();

  return(0);
}
Exemple #4
0
void svm_learn_struct(SAMPLE sample, STRUCT_LEARN_PARM *sparm,
		      LEARN_PARM *lparm, KERNEL_PARM *kparm, 
		      STRUCTMODEL *sm, int alg_type)
{
  int         i,j;
  int         numIt=0;
  long        argmax_count=0;
  long        newconstraints=0, totconstraints=0, activenum=0; 
  int         opti_round, *opti, fullround, use_shrinking;
  long        old_totconstraints=0;
  double      epsilon,svmCnorm;
  long        tolerance,new_precision=1,dont_stop=0;
  double      lossval,factor,dist;
  double      margin=0;
  double      slack, *slacks, slacksum, ceps;
  double      dualitygap,modellength,alphasum;
  long        sizePsi;
  double      *alpha=NULL;
  long        *alphahist=NULL,optcount=0,lastoptcount=0;
  CONSTSET    cset;
  SVECTOR     *diff=NULL;
  SVECTOR     *fy, *fybar, *f, **fycache=NULL;
  SVECTOR     *slackvec;
  WORD        slackv[2];
  MODEL       *svmModel=NULL;
  KERNEL_CACHE *kcache=NULL;
  LABEL       ybar;
  DOC         *doc;

  long        n=sample.n;
  EXAMPLE     *ex=sample.examples;
  double      rt_total=0, rt_opt=0, rt_init=0, rt_psi=0, rt_viol=0;
  double      rt1,rt2;

  rt1=get_runtime();

  init_struct_model(sample,sm,sparm,lparm,kparm); 
  sizePsi=sm->sizePsi+1;          /* sm must contain size of psi on return */

  /* initialize shrinking-style example selection heuristic */ 
  if(alg_type == NSLACK_SHRINK_ALG)
    use_shrinking=1;
  else
    use_shrinking=0;
  opti=(int*)my_malloc(n*sizeof(int));
  for(i=0;i<n;i++) {
    opti[i]=0;
  }
  opti_round=0;

  /* normalize regularization parameter C by the number of training examples */
  svmCnorm=sparm->C/n;

  if(sparm->slack_norm == 1) {
    lparm->svm_c=svmCnorm;          /* set upper bound C */
    lparm->sharedslack=1;
  }
  else if(sparm->slack_norm == 2) {
    lparm->svm_c=999999999999999.0; /* upper bound C must never be reached */
    lparm->sharedslack=0;
    if(kparm->kernel_type != LINEAR_KERNEL) {
      printf("ERROR: Kernels are not implemented for L2 slack norm!"); 
      fflush(stdout);
      exit(0); 
    }
  }
  else {
    printf("ERROR: Slack norm must be L1 or L2!"); fflush(stdout);
    exit(0);
  }


  epsilon=100.0;                  /* start with low precision and
				     increase later */
  tolerance=MIN(n/3,MAX(n/100,5));/* increase precision, whenever less
                                     than that number of constraints
                                     is not fulfilled */
  lparm->biased_hyperplane=0;     /* set threshold to zero */

  cset=init_struct_constraints(sample, sm, sparm);
  if(cset.m > 0) {
    alpha=(double *)realloc(alpha,sizeof(double)*cset.m);
    alphahist=(long *)realloc(alphahist,sizeof(long)*cset.m);
    for(i=0; i<cset.m; i++) {
      alpha[i]=0;
      alphahist[i]=-1; /* -1 makes sure these constraints are never removed */
    }
  }

  /* set initial model and slack variables*/
  svmModel=(MODEL *)my_malloc(sizeof(MODEL));
  lparm->epsilon_crit=epsilon;
  if(kparm->kernel_type != LINEAR_KERNEL)
    kcache=kernel_cache_init(MAX(cset.m,1),lparm->kernel_cache_size);
  svm_learn_optimization(cset.lhs,cset.rhs,cset.m,sizePsi+n,
			 lparm,kparm,kcache,svmModel,alpha);
  if(kcache)
    kernel_cache_cleanup(kcache);
  add_weight_vector_to_linear_model(svmModel);
  sm->svm_model=svmModel;
  sm->w=svmModel->lin_weights; /* short cut to weight vector */

  /* create a cache of the feature vectors for the correct labels */
  if(USE_FYCACHE) {
    fycache=(SVECTOR **)my_malloc(n*sizeof(SVECTOR *));
    for(i=0;i<n;i++) {
      fy=psi(ex[i].x,ex[i].y,sm,sparm);
      if(kparm->kernel_type == LINEAR_KERNEL) {
	diff=add_list_ss(fy); /* store difference vector directly */
	free_svector(fy);
	fy=diff;
      }
      fycache[i]=fy;
    }
  }

  rt_init+=MAX(get_runtime()-rt1,0);
  rt_total+=MAX(get_runtime()-rt1,0);

    /*****************/
   /*** main loop ***/
  /*****************/
  do { /* iteratively increase precision */

    epsilon=MAX(epsilon*0.49999999999,sparm->epsilon);
    new_precision=1;
    if(epsilon == sparm->epsilon)   /* for final precision, find all SV */
      tolerance=0; 
    lparm->epsilon_crit=epsilon/2;  /* svm precision must be higher than eps */
    if(struct_verbosity>=1)
      printf("Setting current working precision to %g.\n",epsilon);

    do { /* iteration until (approx) all SV are found for current
            precision and tolerance */
      
      opti_round++;
      activenum=n;
      dont_stop=0;
      old_totconstraints=totconstraints;

      do { /* with shrinking turned on, go through examples that keep
	      producing new constraints */

	if(struct_verbosity>=1) { 
	  printf("Iter %i (%ld active): ",++numIt,activenum); 
	  fflush(stdout);
	}
	
	ceps=0;
	fullround=(activenum == n);

	for(i=0; i<n; i++) { /*** example loop ***/
	  
	  rt1=get_runtime();
	    
	  if((!use_shrinking) || (opti[i] != opti_round)) {
	                                /* if the example is not shrunk
	                                away, then see if it is necessary to 
					add a new constraint */
	    rt2=get_runtime();
	    argmax_count++;
	    if(sparm->loss_type == SLACK_RESCALING) 
	      ybar=find_most_violated_constraint_slackrescaling(ex[i].x,
								ex[i].y,sm,
								sparm);
	    else
	      ybar=find_most_violated_constraint_marginrescaling(ex[i].x,
								 ex[i].y,sm,
								 sparm);
	    rt_viol+=MAX(get_runtime()-rt2,0);
	    
	    if(empty_label(ybar)) {
	      if(opti[i] != opti_round) {
		activenum--;
		opti[i]=opti_round; 
	      }
	      if(struct_verbosity>=2)
		printf("no-incorrect-found(%i) ",i);
	      continue;
	    }
	  
	    /**** get psi(y)-psi(ybar) ****/
	    rt2=get_runtime();
	    if(fycache) 
	      fy=copy_svector(fycache[i]);
	    else
	      fy=psi(ex[i].x,ex[i].y,sm,sparm);
	    fybar=psi(ex[i].x,ybar,sm,sparm);
	    rt_psi+=MAX(get_runtime()-rt2,0);
	    
	    /**** scale feature vector and margin by loss ****/
	    lossval=loss(ex[i].y,ybar,sparm);
	    if(sparm->slack_norm == 2)
	      lossval=sqrt(lossval);
	    if(sparm->loss_type == SLACK_RESCALING)
	      factor=lossval;
	    else               /* do not rescale vector for */
	      factor=1.0;      /* margin rescaling loss type */
	    for(f=fy;f;f=f->next)
	      f->factor*=factor;
	    for(f=fybar;f;f=f->next)
	      f->factor*=-factor;
	    margin=lossval;

	    /**** create constraint for current ybar ****/
	    append_svector_list(fy,fybar);/* append the two vector lists */
	    doc=create_example(cset.m,0,i+1,1,fy);

	    /**** compute slack for this example ****/
	    slack=0;
	    for(j=0;j<cset.m;j++) 
	      if(cset.lhs[j]->slackid == i+1) {
		if(sparm->slack_norm == 2) /* works only for linear kernel */
		  slack=MAX(slack,cset.rhs[j]
			          -(classify_example(svmModel,cset.lhs[j])
				    -sm->w[sizePsi+i]/(sqrt(2*svmCnorm))));
		else
		  slack=MAX(slack,
			   cset.rhs[j]-classify_example(svmModel,cset.lhs[j]));
	      }
	    
	    /**** if `error' add constraint and recompute ****/
	    dist=classify_example(svmModel,doc);
	    ceps=MAX(ceps,margin-dist-slack);
	    if(slack > (margin-dist+0.0001)) {
	      printf("\nWARNING: Slack of most violated constraint is smaller than slack of working\n");
	      printf("         set! There is probably a bug in 'find_most_violated_constraint_*'.\n");
	      printf("Ex %d: slack=%f, newslack=%f\n",i,slack,margin-dist);
	      /* exit(1); */
	    }
	    if((dist+slack)<(margin-epsilon)) { 
	      if(struct_verbosity>=2)
		{printf("(%i,eps=%.2f) ",i,margin-dist-slack); fflush(stdout);}
	      if(struct_verbosity==1)
		{printf("."); fflush(stdout);}
	      
	      /**** resize constraint matrix and add new constraint ****/
	      cset.m++;
	      cset.lhs=(DOC **)realloc(cset.lhs,sizeof(DOC *)*cset.m);
	      if(kparm->kernel_type == LINEAR_KERNEL) {
		diff=add_list_ss(fy); /* store difference vector directly */
		if(sparm->slack_norm == 1) 
		  cset.lhs[cset.m-1]=create_example(cset.m-1,0,i+1,1,
						    copy_svector(diff));
		else if(sparm->slack_norm == 2) {
		  /**** add squared slack variable to feature vector ****/
		  slackv[0].wnum=sizePsi+i;
		  slackv[0].weight=1/(sqrt(2*svmCnorm));
		  slackv[1].wnum=0; /*terminator*/
		  slackvec=create_svector(slackv,NULL,1.0);
		  cset.lhs[cset.m-1]=create_example(cset.m-1,0,i+1,1,
						    add_ss(diff,slackvec));
		  free_svector(slackvec);
		}
		free_svector(diff);
	      }
	      else { /* kernel is used */
		if(sparm->slack_norm == 1) 
		  cset.lhs[cset.m-1]=create_example(cset.m-1,0,i+1,1,
						    copy_svector(fy));
		else if(sparm->slack_norm == 2)
		  exit(1);
	      }
	      cset.rhs=(double *)realloc(cset.rhs,sizeof(double)*cset.m);
	      cset.rhs[cset.m-1]=margin;
	      alpha=(double *)realloc(alpha,sizeof(double)*cset.m);
	      alpha[cset.m-1]=0;
	      alphahist=(long *)realloc(alphahist,sizeof(long)*cset.m);
	      alphahist[cset.m-1]=optcount;
	      newconstraints++;
	      totconstraints++;
	    }
	    else {
	      printf("+"); fflush(stdout); 
	      if(opti[i] != opti_round) {
		activenum--;
		opti[i]=opti_round; 
	      }
	    }

	    free_example(doc,0);
	    free_svector(fy); /* this also free's fybar */
	    free_label(ybar);
	  }

	  /**** get new QP solution ****/
	  if((newconstraints >= sparm->newconstretrain) 
	     || ((newconstraints > 0) && (i == n-1))
	     || (new_precision && (i == n-1))) {
	    if(struct_verbosity>=1) {
	      printf("*");fflush(stdout);
	    }
	    rt2=get_runtime();
	    free_model(svmModel,0);
	    svmModel=(MODEL *)my_malloc(sizeof(MODEL));
	    /* Always get a new kernel cache. It is not possible to use the
	       same cache for two different training runs */
	    if(kparm->kernel_type != LINEAR_KERNEL)
	      kcache=kernel_cache_init(MAX(cset.m,1),lparm->kernel_cache_size);
	    /* Run the QP solver on cset. */
	    svm_learn_optimization(cset.lhs,cset.rhs,cset.m,sizePsi+n,
				   lparm,kparm,kcache,svmModel,alpha);
	    if(kcache)
	      kernel_cache_cleanup(kcache);
	    /* Always add weight vector, in case part of the kernel is
	       linear. If not, ignore the weight vector since its
	       content is bogus. */
	    add_weight_vector_to_linear_model(svmModel);
	    sm->svm_model=svmModel;
	    sm->w=svmModel->lin_weights; /* short cut to weight vector */
	    optcount++;
	    /* keep track of when each constraint was last
	       active. constraints marked with -1 are not updated */
	    for(j=0;j<cset.m;j++) 
	      if((alphahist[j]>-1) && (alpha[j] != 0))  
		alphahist[j]=optcount;
	    rt_opt+=MAX(get_runtime()-rt2,0);
	    
	    if(new_precision && (epsilon <= sparm->epsilon))  
	      dont_stop=1; /* make sure we take one final pass */
	    new_precision=0;
	    newconstraints=0;
	  }	

	  rt_total+=MAX(get_runtime()-rt1,0);

	} /* end of example loop */

	rt1=get_runtime();
	
	if(struct_verbosity>=1)
	  printf("(NumConst=%d, SV=%ld, CEps=%.4f, QPEps=%.4f)\n",cset.m,
		 svmModel->sv_num-1,ceps,svmModel->maxdiff);
	
	/* Check if some of the linear constraints have not been
	   active in a while. Those constraints are then removed to
	   avoid bloating the working set beyond necessity. */
	if(struct_verbosity>=2)
	  printf("Reducing working set...");fflush(stdout);
	remove_inactive_constraints(&cset,alpha,optcount,alphahist,
				    MAX(50,optcount-lastoptcount));
	lastoptcount=optcount;
	if(struct_verbosity>=2)
	  printf("done. (NumConst=%d)\n",cset.m);
	
	rt_total+=MAX(get_runtime()-rt1,0);
	
      } while(use_shrinking && (activenum > 0)); /* when using shrinking, 
						    repeat until all examples 
						    produced no constraint at
						    least once */

    } while(((totconstraints - old_totconstraints) > tolerance) || dont_stop);

  } while((epsilon > sparm->epsilon) 
	  || finalize_iteration(ceps,0,sample,sm,cset,alpha,sparm));  

  if(struct_verbosity>=1) {
    /**** compute sum of slacks ****/
    /**** WARNING: If positivity constraints are used, then the
	  maximum slack id is larger than what is allocated
	  below ****/
    slacks=(double *)my_malloc(sizeof(double)*(n+1));
    for(i=0; i<=n; i++) { 
      slacks[i]=0;
    }
    if(sparm->slack_norm == 1) {
      for(j=0;j<cset.m;j++) 
	slacks[cset.lhs[j]->slackid]=MAX(slacks[cset.lhs[j]->slackid],
			   cset.rhs[j]-classify_example(svmModel,cset.lhs[j]));
      }
    else if(sparm->slack_norm == 2) {
      for(j=0;j<cset.m;j++) 
	slacks[cset.lhs[j]->slackid]=MAX(slacks[cset.lhs[j]->slackid],
		cset.rhs[j]
	         -(classify_example(svmModel,cset.lhs[j])
		   -sm->w[sizePsi+cset.lhs[j]->slackid-1]/(sqrt(2*svmCnorm))));
    }
    slacksum=0;
    for(i=1; i<=n; i++)  
      slacksum+=slacks[i];
    free(slacks);
    alphasum=0;
    for(i=0; i<cset.m; i++)  
      alphasum+=alpha[i]*cset.rhs[i];
    modellength=model_length_s(svmModel);
    dualitygap=(0.5*modellength*modellength+svmCnorm*(slacksum+n*ceps))
               -(alphasum-0.5*modellength*modellength);
    
    printf("Final epsilon on KKT-Conditions: %.5f\n",
	   MAX(svmModel->maxdiff,epsilon));
    printf("Upper bound on duality gap: %.5f\n", dualitygap);
    printf("Dual objective value: dval=%.5f\n",
	    alphasum-0.5*modellength*modellength);
    printf("Total number of constraints in final working set: %i (of %i)\n",(int)cset.m,(int)totconstraints);
    printf("Number of iterations: %d\n",numIt);
    printf("Number of calls to 'find_most_violated_constraint': %ld\n",argmax_count);
    if(sparm->slack_norm == 1) {
      printf("Number of SV: %ld \n",svmModel->sv_num-1);
      printf("Number of non-zero slack variables: %ld (out of %ld)\n",
	     svmModel->at_upper_bound,n);
      printf("Norm of weight vector: |w|=%.5f\n",modellength);
    }
    else if(sparm->slack_norm == 2){ 
      printf("Number of SV: %ld (including %ld at upper bound)\n",
	     svmModel->sv_num-1,svmModel->at_upper_bound);
      printf("Norm of weight vector (including L2-loss): |w|=%.5f\n",
	     modellength);
    }
    printf("Norm. sum of slack variables (on working set): sum(xi_i)/n=%.5f\n",slacksum/n);
    printf("Norm of longest difference vector: ||Psi(x,y)-Psi(x,ybar)||=%.5f\n",
	   length_of_longest_document_vector(cset.lhs,cset.m,kparm));
    printf("Runtime in cpu-seconds: %.2f (%.2f%% for QP, %.2f%% for Argmax, %.2f%% for Psi, %.2f%% for init)\n",
	   rt_total/100.0, (100.0*rt_opt)/rt_total, (100.0*rt_viol)/rt_total, 
	   (100.0*rt_psi)/rt_total, (100.0*rt_init)/rt_total);
  }
  if(struct_verbosity>=4)
    printW(sm->w,sizePsi,n,lparm->svm_c);

  if(svmModel) {
    sm->svm_model=copy_model(svmModel);
    sm->w=sm->svm_model->lin_weights; /* short cut to weight vector */
  }

  print_struct_learning_stats(sample,sm,cset,alpha,sparm);

  if(fycache) {
    for(i=0;i<n;i++)
      free_svector(fycache[i]);
    free(fycache);
  }
  if(svmModel)
    free_model(svmModel,0);
  free(alpha); 
  free(alphahist); 
  free(opti); 
  free(cset.rhs); 
  for(i=0;i<cset.m;i++) 
    free_example(cset.lhs[i],1);
  free(cset.lhs);
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
#endif
{
	long correct=0,incorrect=0,no_accuracy=0;
	long i;
	double t1,runtime=0;
	double avgloss=0,l;
#ifndef COMPILE_MEX_INTERFACE
	FILE *predfl;
#endif
	STRUCTMODEL model; 
	STRUCT_LEARN_PARM sparm;
	STRUCT_TEST_STATS teststats;
	SAMPLE testsample;
	LABEL y;
#ifdef COMPILE_MEX_INTERFACE
	int argc;
	char **argv;
	if (nrhs < 3) {
		print_help();
		return;
	}
	else if (nrhs==3) {
		argc=1;
		argv=(char **)my_malloc(MAX_ARGVS*sizeof(char *));
		argv[0]="OLR";
	}
	else
		create_argc_argv(prhs[3],&argc,&argv);
#endif
	svm_struct_classify_api_init(argc,argv);
	
#ifndef COMPILE_MEX_INTERFACE
	read_input_parameters(argc,argv,testfile,modelfile,predictionsfile,&sparm,
						  &verbosity,&struct_verbosity);
#else
	read_input_parameters(argc,argv,&sparm,&verbosity,&struct_verbosity);
#endif
	
	if(struct_verbosity>=1) {
		printf("Reading model..."); fflush(stdout);
	}
#ifndef COMPILE_MEX_INTERFACE
	model=read_struct_model(modelfile,&sparm);
#else
	model=read_struct_model(prhs[2],&sparm);
#endif
	if(struct_verbosity>=1) {
		fprintf(stdout, "done.\n");
	}
	
	if(model.svm_model->kernel_parm.kernel_type == LINEAR) { /* linear kernel */
		/* compute weight vector */
		add_weight_vector_to_linear_model(model.svm_model);
		model.w=model.svm_model->lin_weights;
	}
	
	if(struct_verbosity>=1) {
		printf("Reading test examples..."); fflush(stdout);
	}
#ifndef COMPILE_MEX_INTERFACE
	testsample=read_struct_examples(testfile,&sparm);
#else
	testsample=read_struct_examples(prhs,&sparm);
#endif
	if(struct_verbosity>=1) {
		printf("done.\n"); fflush(stdout);
	}
	
	if(struct_verbosity>=1) {
		printf("Classifying test examples..."); fflush(stdout);
	}
#ifndef COMPILE_MEX_INTERFACE
	if ((predfl = fopen (predictionsfile, "w")) == NULL)
	{ perror (predictionsfile); exit (1); }
#else
	mwSize rows=mxGetM(prhs[0]);
	mxArray *predictions=mxCreateDoubleMatrix(rows,1,mxREAL);
	double *pred_ptr=mxGetPr(predictions);
#endif
	for(i=0;i<testsample.n;i++) {
		t1=get_runtime();
		y=classify_struct_example(testsample.examples[i].x,&model,&sparm);
		runtime+=(get_runtime()-t1);
#ifndef COMPILE_MEX_INTERFACE
		write_label(predfl,y);
#else
		write_label(&pred_ptr,y);
#endif
		l=loss(testsample.examples[i].y,y,&sparm);
		avgloss+=l;
		if(l == 0) 
			correct++;
		else
			incorrect++;
		eval_prediction(i,testsample.examples[i],y,&model,&sparm,&teststats);
		
		if(empty_label(testsample.examples[i].y)) 
		{ no_accuracy=1; } /* test data is not labeled */
		if(struct_verbosity>=2) {
			if((i+1) % 100 == 0) {
				printf("%ld..",i+1); fflush(stdout);
			}
		}
		free_label(y);
	}
	avgloss/=testsample.n;
#ifndef COMPILE_MEX_INTERFACE
	fclose(predfl);
#endif
	if(struct_verbosity>=1) {
		printf("done\n");
		printf("Runtime (without IO) in cpu-seconds: %.2f\n",
			   (float)(runtime/100.0));    
	}
	if((!no_accuracy) && (struct_verbosity>=1)) {
		printf("Average loss on test set: %.4f\n",(float)avgloss);
		printf("Zero/one-error on test set: %.2f%% (%ld correct, %ld incorrect, %d total)\n",(float)100.0*incorrect/testsample.n,correct,incorrect,testsample.n);
	}
	print_struct_testing_stats(testsample,&model,&sparm,&teststats);
	free_struct_sample(testsample);
	free_struct_model(model);
	
	svm_struct_classify_api_exit();
#ifndef COMPILE_MEX_INTERFACE
	return(0);
#else
	plhs[0]=predictions;
#endif
}
void svm_learn_struct_joint(SAMPLE sample, STRUCT_LEARN_PARM *sparm,
			    LEARN_PARM *lparm, KERNEL_PARM *kparm, 
			    STRUCTMODEL *sm, int alg_type)
{
  int         i,j;
  int         numIt=0;
  long        argmax_count=0;
  long        totconstraints=0;
  long        kernel_type_org;
  double      epsilon,epsilon_cached;
  double      lossval,factor,dist;
  double      margin=0;
  double      slack, slacksum, ceps;
  double      dualitygap,modellength,alphasum;
  long        sizePsi;
  double      *alpha=NULL;
  long        *alphahist=NULL,optcount=0;
  CONSTSET    cset;
  SVECTOR     *diff=NULL;
  double      *diff_n=NULL;
  SVECTOR     *fy, *fybar, *f, **fycache, *lhs;
  MODEL       *svmModel=NULL;
  LABEL       ybar;
  DOC         *doc;

  long        n=sample.n;
  EXAMPLE     *ex=sample.examples;
  double      rt_total=0,rt_opt=0,rt_init=0,rt_psi=0,rt_viol=0,rt_kernel=0;
  double      rt1,rt2;
  double      progress,progress_old;

  /*
  SVECTOR     ***fydelta_cache=NULL;
  double      **loss_cache=NULL;
  int         cache_size=0;
  */
  CCACHE      *ccache=NULL;
  int         cached_constraint;

  rt1=get_runtime();

  init_struct_model(sample,sm,sparm,lparm,kparm); 
  sizePsi=sm->sizePsi+1;          /* sm must contain size of psi on return */

  if(sparm->slack_norm == 1) {
    lparm->svm_c=sparm->C;          /* set upper bound C */
    lparm->sharedslack=1;
  }
  else if(sparm->slack_norm == 2) {
    printf("ERROR: The joint algorithm does not apply to L2 slack norm!"); 
    fflush(stdout);
    exit(0); 
  }
  else {
    printf("ERROR: Slack norm must be L1 or L2!"); fflush(stdout);
    exit(0);
  }


  lparm->biased_hyperplane=0;     /* set threshold to zero */
  epsilon=100.0;                  /* start with low precision and
				     increase later */
  epsilon_cached=epsilon;         /* epsilon to use for iterations
				     using constraints constructed
				     from the constraint cache */

  cset=init_struct_constraints(sample, sm, sparm);
  if(cset.m > 0) {
    alpha=(double *)realloc(alpha,sizeof(double)*cset.m);
    alphahist=(long *)realloc(alphahist,sizeof(long)*cset.m);
    for(i=0; i<cset.m; i++) {
      alpha[i]=0;
      alphahist[i]=-1; /* -1 makes sure these constraints are never removed */
    }
  }
  kparm->gram_matrix=NULL;
  if((alg_type == DUAL_ALG) || (alg_type == DUAL_CACHE_ALG))
    kparm->gram_matrix=init_kernel_matrix(&cset,kparm);

  /* set initial model and slack variables */
  svmModel=(MODEL *)my_malloc(sizeof(MODEL));
  lparm->epsilon_crit=epsilon;
  svm_learn_optimization(cset.lhs,cset.rhs,cset.m,sizePsi+n,
			 lparm,kparm,NULL,svmModel,alpha);
  add_weight_vector_to_linear_model(svmModel);
  sm->svm_model=svmModel;
  sm->w=svmModel->lin_weights; /* short cut to weight vector */

  /* create a cache of the feature vectors for the correct labels */
  fycache=(SVECTOR **)malloc(n*sizeof(SVECTOR *));
  for(i=0;i<n;i++) {
    fy=psi(ex[i].x,ex[i].y,sm,sparm);
    if(kparm->kernel_type == LINEAR) {
      diff=add_list_ss(fy); /* store difference vector directly */
      free_svector(fy);
      fy=diff;
    }
    fycache[i]=fy;
  }

  /* initialize the constraint cache */
  if(alg_type == DUAL_CACHE_ALG) {
    ccache=create_constraint_cache(sample,sparm);
  }

  rt_init+=MAX(get_runtime()-rt1,0);
  rt_total+=MAX(get_runtime()-rt1,0);

    /*****************/
   /*** main loop ***/
  /*****************/
  do { /* iteratively find and add constraints to working set */

      if(struct_verbosity>=1) { 
	printf("Iter %i: ",++numIt); 
	fflush(stdout);
      }
      
      rt1=get_runtime();

      /**** compute current slack ****/
      slack=0;
      for(j=0;j<cset.m;j++) 
	slack=MAX(slack,cset.rhs[j]-classify_example(svmModel,cset.lhs[j]));
      
      /**** find a violated joint constraint ****/
      lhs=NULL;
      dist=0;
      if(alg_type == DUAL_CACHE_ALG) {
	/* see if it is possible to construct violated constraint from cache */
	update_constraint_cache_for_model(ccache, svmModel);
	dist=find_most_violated_joint_constraint_in_cache(ccache,&lhs,&margin);
      }

      rt_total+=MAX(get_runtime()-rt1,0);

      /* Is there a sufficiently violated constraint in cache? */
      if(dist-slack > MAX(epsilon/10,sparm->epsilon)) { 
	/* use constraint from cache */
	rt1=get_runtime();
	cached_constraint=1;
	if(kparm->kernel_type == LINEAR) {
	  diff=add_list_ns(lhs); /* Linear case: compute weighted sum */
	  free_svector_shallow(lhs);
	}
	else { /* Non-linear case: make sure we have deep copy for cset */
	  diff=copy_svector(lhs); 
	  free_svector_shallow(lhs);
	}
	rt_total+=MAX(get_runtime()-rt1,0);
      }
      else { 
	/* do not use constraint from cache */
	rt1=get_runtime();
	cached_constraint=0;
	if(lhs)
	  free_svector_shallow(lhs);
	lhs=NULL;
	if(kparm->kernel_type == LINEAR) {
	  diff_n=create_nvector(sm->sizePsi);
	  clear_nvector(diff_n,sm->sizePsi);
	}
	margin=0;
	progress=0;
	progress_old=progress;
	rt_total+=MAX(get_runtime()-rt1,0);

	/**** find most violated joint constraint ***/
	for(i=0; i<n; i++) {
	  
	  rt1=get_runtime();
      
	  progress+=10.0/n;
	  if((struct_verbosity==1) && (((int)progress_old) != ((int)progress)))
	    {printf(".");fflush(stdout); progress_old=progress;}
	  if(struct_verbosity>=2)
	    {printf("."); fflush(stdout);}

	  rt2=get_runtime();
	  argmax_count++;
	  if(sparm->loss_type == SLACK_RESCALING) 
	    ybar=find_most_violated_constraint_slackrescaling(ex[i].x,
							      ex[i].y,sm,
							      sparm);
	  else
	    ybar=find_most_violated_constraint_marginrescaling(ex[i].x,
							       ex[i].y,sm,
							       sparm);
	  rt_viol+=MAX(get_runtime()-rt2,0);
	  
	  if(empty_label(ybar)) {
	    printf("ERROR: empty label was returned for example (%i)\n",i);
	    /* exit(1); */
	    continue;
	  }
	  
	  /**** get psi(x,y) and psi(x,ybar) ****/
	  rt2=get_runtime();
	  fy=copy_svector(fycache[i]); /*<= fy=psi(ex[i].x,ex[i].y,sm,sparm);*/
	  fybar=psi(ex[i].x,ybar,sm,sparm);
	  rt_psi+=MAX(get_runtime()-rt2,0);
	  lossval=loss(ex[i].y,ybar,sparm);
	  free_label(ybar);
	  
	  /**** scale feature vector and margin by loss ****/
	  if(sparm->loss_type == SLACK_RESCALING)
	    factor=lossval/n;
	  else                 /* do not rescale vector for */
	    factor=1.0/n;      /* margin rescaling loss type */
	  for(f=fy;f;f=f->next)
	    f->factor*=factor;
	  for(f=fybar;f;f=f->next)
	    f->factor*=-factor;
	  append_svector_list(fybar,fy);   /* compute fy-fybar */
	  
	  /**** add current fy-fybar and loss to cache ****/
	  if(alg_type == DUAL_CACHE_ALG) {
	    if(kparm->kernel_type == LINEAR) 
	      add_constraint_to_constraint_cache(ccache,svmModel,i,
						 add_list_ss(fybar),
						 lossval/n,sparm->ccache_size);
	    else
	      add_constraint_to_constraint_cache(ccache,svmModel,i,
						 copy_svector(fybar),
						 lossval/n,sparm->ccache_size);
	  }

	  /**** add current fy-fybar to constraint and margin ****/
	  if(kparm->kernel_type == LINEAR) {
	    add_list_n_ns(diff_n,fybar,1.0); /* add fy-fybar to sum */
	    free_svector(fybar);
	  }
	  else {
	    append_svector_list(fybar,lhs);  /* add fy-fybar to vector list */
	    lhs=fybar;
	  }
	  margin+=lossval/n;                 /* add loss to rhs */
	  
	  rt_total+=MAX(get_runtime()-rt1,0);

	} /* end of example loop */

	rt1=get_runtime();

	/* create sparse vector from dense sum */
	if(kparm->kernel_type == LINEAR) {
	  diff=create_svector_n(diff_n,sm->sizePsi,"",1.0);
	  free_nvector(diff_n);
	}
	else {
	  diff=lhs;
	}

	rt_total+=MAX(get_runtime()-rt1,0);

      } /* end of finding most violated joint constraint */

      rt1=get_runtime();

      /**** if `error', then add constraint and recompute QP ****/
      doc=create_example(cset.m,0,1,1,diff);
      dist=classify_example(svmModel,doc);
      ceps=MAX(0,margin-dist-slack);
      if(slack > (margin-dist+0.000001)) {
	printf("\nWARNING: Slack of most violated constraint is smaller than slack of working\n");
	printf("         set! There is probably a bug in 'find_most_violated_constraint_*'.\n");
	printf("slack=%f, newslack=%f\n",slack,margin-dist);
	/* exit(1); */
      }
      if(ceps > sparm->epsilon) { 
	/**** resize constraint matrix and add new constraint ****/
	cset.lhs=(DOC **)realloc(cset.lhs,sizeof(DOC *)*(cset.m+1));
	if(sparm->slack_norm == 1) 
	  cset.lhs[cset.m]=create_example(cset.m,0,1,1,diff);
	else if(sparm->slack_norm == 2)
	  exit(1);
	cset.rhs=(double *)realloc(cset.rhs,sizeof(double)*(cset.m+1));
	cset.rhs[cset.m]=margin;
	alpha=(double *)realloc(alpha,sizeof(double)*(cset.m+1));
	alpha[cset.m]=0;
	alphahist=(long *)realloc(alphahist,sizeof(long)*(cset.m+1));
	alphahist[cset.m]=optcount;
	cset.m++;
	totconstraints++;
	if((alg_type == DUAL_ALG) || (alg_type == DUAL_CACHE_ALG)) {
	  if(struct_verbosity>=1) {
	    printf(":");fflush(stdout);
	  }
	  rt2=get_runtime();
	  kparm->gram_matrix=update_kernel_matrix(kparm->gram_matrix,cset.m-1,
						  &cset,kparm);
	  rt_kernel+=MAX(get_runtime()-rt2,0);
	}
	
	/**** get new QP solution ****/
	if(struct_verbosity>=1) {
	  printf("*");fflush(stdout);
	}
	rt2=get_runtime();
	/* set svm precision so that higher than eps of most violated constr */
	if(cached_constraint) {
	  epsilon_cached=MIN(epsilon_cached,MAX(ceps,sparm->epsilon)); 
	  lparm->epsilon_crit=epsilon_cached/2; 
	}
	else {
	  epsilon=MIN(epsilon,MAX(ceps,sparm->epsilon)); /* best eps so far */
	  lparm->epsilon_crit=epsilon/2; 
	  epsilon_cached=epsilon;
	}
	free_model(svmModel,0);
	svmModel=(MODEL *)my_malloc(sizeof(MODEL));
	/* Run the QP solver on cset. */
	kernel_type_org=kparm->kernel_type;
	if((alg_type == DUAL_ALG) || (alg_type == DUAL_CACHE_ALG))
	  kparm->kernel_type=GRAM; /* use kernel stored in kparm */
	svm_learn_optimization(cset.lhs,cset.rhs,cset.m,sizePsi+n,
			       lparm,kparm,NULL,svmModel,alpha);
	kparm->kernel_type=kernel_type_org; 
	svmModel->kernel_parm.kernel_type=kernel_type_org;
	/* Always add weight vector, in case part of the kernel is
	   linear. If not, ignore the weight vector since its
	   content is bogus. */
	add_weight_vector_to_linear_model(svmModel);
	sm->svm_model=svmModel;
	sm->w=svmModel->lin_weights; /* short cut to weight vector */
	optcount++;
	/* keep track of when each constraint was last
	   active. constraints marked with -1 are not updated */
	for(j=0;j<cset.m;j++) 
	  if((alphahist[j]>-1) && (alpha[j] != 0))  
	    alphahist[j]=optcount;
	rt_opt+=MAX(get_runtime()-rt2,0);
	
	/* Check if some of the linear constraints have not been
	   active in a while. Those constraints are then removed to
	   avoid bloating the working set beyond necessity. */
	if(struct_verbosity>=2)
	  printf("Reducing working set...");fflush(stdout);
	remove_inactive_constraints(&cset,alpha,optcount,alphahist,50);
	if(struct_verbosity>=2)
	  printf("done. (NumConst=%d) ",cset.m);
      }
      else {
	free_svector(diff);
      }

      if(struct_verbosity>=1)
	printf("(NumConst=%d, SV=%ld, CEps=%.4f, QPEps=%.4f)\n",cset.m,
	       svmModel->sv_num-1,ceps,svmModel->maxdiff);

      free_example(doc,0);
	
      rt_total+=MAX(get_runtime()-rt1,0);

  } while((ceps > sparm->epsilon) || 
	  finalize_iteration(ceps,cached_constraint,sample,sm,cset,alpha,sparm)
	 );
  

  if(struct_verbosity>=1) {
    /**** compute sum of slacks ****/
    /**** WARNING: If positivity constraints are used, then the
	  maximum slack id is larger than what is allocated
	  below ****/
    slacksum=0;
    if(sparm->slack_norm == 1) {
      for(j=0;j<cset.m;j++) 
	slacksum=MAX(slacksum,
		     cset.rhs[j]-classify_example(svmModel,cset.lhs[j]));
      }
    else if(sparm->slack_norm == 2) {
      exit(1);
    }
    alphasum=0;
    for(i=0; i<cset.m; i++)  
      alphasum+=alpha[i]*cset.rhs[i];
    modellength=model_length_s(svmModel,kparm);
    dualitygap=(0.5*modellength*modellength+sparm->C*(slacksum+ceps))
               -(alphasum-0.5*modellength*modellength);
    
    printf("Final epsilon on KKT-Conditions: %.5f\n",
	   MAX(svmModel->maxdiff,ceps));
    printf("Upper bound on duality gap: %.5f\n", dualitygap);
    printf("Dual objective value: dval=%.5f\n",
	    alphasum-0.5*modellength*modellength);
    printf("Total number of constraints in final working set: %i (of %i)\n",(int)cset.m,(int)totconstraints);
    printf("Number of iterations: %d\n",numIt);
    printf("Number of calls to 'find_most_violated_constraint': %ld\n",argmax_count);
    if(sparm->slack_norm == 1) {
      printf("Number of SV: %ld \n",svmModel->sv_num-1);
      printf("Norm of weight vector: |w|=%.5f\n",
	     model_length_s(svmModel,kparm));
    }
    else if(sparm->slack_norm == 2){ 
      printf("Number of SV: %ld (including %ld at upper bound)\n",
	     svmModel->sv_num-1,svmModel->at_upper_bound);
      printf("Norm of weight vector (including L2-loss): |w|=%.5f\n",
	     model_length_s(svmModel,kparm));
    }
    printf("Value of slack variable (on working set): xi=%.5f\n",slacksum);
    printf("Norm of longest difference vector: ||Psi(x,y)-Psi(x,ybar)||=%.5f\n",
	   length_of_longest_document_vector(cset.lhs,cset.m,kparm));
    printf("Runtime in cpu-seconds: %.2f (%.2f%% for QP, %.2f%% for kernel, %.2f%% for Argmax, %.2f%% for Psi, %.2f%% for init)\n",
	   rt_total/100.0, (100.0*rt_opt)/rt_total, (100.0*rt_kernel)/rt_total,
	   (100.0*rt_viol)/rt_total, (100.0*rt_psi)/rt_total, 
	   (100.0*rt_init)/rt_total);
  }
  if(ccache) {
    long cnum=0;
    CCACHEELEM *celem;
    for(i=0;i<n;i++) 
      for(celem=ccache->constlist[i];celem;celem=celem->next) 
	cnum++;
    printf("Final number of constraints in cache: %ld\n",cnum);
  }
  if(struct_verbosity>=4)
    printW(sm->w,sizePsi,n,lparm->svm_c);

  if(svmModel) {
    sm->svm_model=copy_model(svmModel);
    sm->w=sm->svm_model->lin_weights; /* short cut to weight vector */
  }

  print_struct_learning_stats(sample,sm,cset,alpha,sparm);

  if(ccache)    
    free_constraint_cache(ccache);
  for(i=0;i<n;i++)
    free_svector(fycache[i]);
  free(fycache);
  if(svmModel)
    free_model(svmModel,0);
  free(alpha); 
  free(alphahist); 
  free(cset.rhs); 
  for(i=0;i<cset.m;i++) 
    free_example(cset.lhs[i],1);
  free(cset.lhs);
  if(kparm->gram_matrix)
    free_matrix(kparm->gram_matrix);
}