Example #1
SAMPLE      read_struct_examples(char *file, STRUCT_LEARN_PARM *sparm)
{
	/* Reads training examples and returns them in sample. The number of
	examples must be written into sample.n */
	SAMPLE   sample;  /* result: examples plus their count */
	EXAMPLE  *examples;
	long     n;       /* number of examples */
	DOC **docs;       /* examples in original SVM-light format */
	double *target;   /* class labels read alongside the docs */
	long totwords, i, num_classes = 0;

	/* Using the read_documents function from SVM-light */
	read_documents(file, &docs, &target, &totwords, &n);
	examples = (EXAMPLE *)my_malloc(sizeof(EXAMPLE)*n);
	for( i=0; i<n; i++ )     /* find highest class label */
		if(num_classes < (target[i]+0.1))  /* +0.1 so the double label truncates to the right integer */
			num_classes = target[i]+0.1;
	for( i=0; i<n; i++ )     /* make sure all class labels are positive */
		if(target[i]<1) {
			printf("\nERROR: The class label '%lf' of example number %ld is smaller than '1'!\n",target[i],i+1);
			exit(1);
		}

	for( i=0; i<n; i++ ) {          /* copy docs over into new datastructure */
		examples[i].x.doc = docs[i];
		examples[i].y.classlabel = target[i]+0.1;
		examples[i].y.scores = NULL;
		examples[i].y.num_classes = num_classes;
	}
	free(target);
	free(docs);
	sample.n = n;
	sample.examples = examples;

	if(struct_verbosity>=0)
		printf(" (%d examples) ",sample.n);
	return(sample);
}
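
For reference, a minimal sketch of how the returned SAMPLE might be consumed by a caller. This is an assumption based on the usual svm_struct_api conventions (free_struct_sample and the LABEL field names as used above), not code from this file:

/* Hypothetical caller -- assumes the struct API shown above. */
SAMPLE sample = read_struct_examples("train.dat", &sparm);
long i;
for(i = 0; i < sample.n; i++) {
	/* each example pairs a document (x.doc) with a multiclass label (y) */
	printf("example %ld: class %d of %ld classes\n", i + 1,
	       (int)sample.examples[i].y.classlabel,
	       (long)sample.examples[i].y.num_classes);
}
free_struct_sample(sample);  /* frees the examples and the docs they reference */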
Example #2
int _svm_learn (int argc, char* argv[])
{  
  char docfile[200];           /* file with training examples */
  char modelfile[200];         /* file for resulting classifier */
  char restartfile[200];       /* file with initial alphas */
  DOC **docs;  /* training examples */
  long totwords,totdoc,i;
  double *target;
  double *alpha_in=NULL;
  KERNEL_CACHE *kernel_cache;
  LEARN_PARM learn_parm;
  KERNEL_PARM kernel_parm;
  MODEL *model=(MODEL *)my_malloc(sizeof(MODEL));

  HIDEO_ENV *hideo_env=create_env();

  model->td_pred=NULL;
  model->n_td_pred=0;

  _read_input_parameters(argc,argv,docfile,modelfile,restartfile,&verbosity,
			&learn_parm,&kernel_parm);
  read_documents(docfile,&docs,&target,&totwords,&totdoc);
  if(restartfile[0]) alpha_in=read_alphas(restartfile,totdoc);

  if(kernel_parm.kernel_type == LINEAR) { /* don't need the cache */
    kernel_cache=NULL;
  }
  else {
    /* Always get a new kernel cache. It is not possible to use the
       same cache for two different training runs */
    kernel_cache=kernel_cache_init(totdoc,learn_parm.kernel_cache_size);
  }

  if(learn_parm.type == CLASSIFICATION) {
    svm_learn_classification(docs,target,totdoc,totwords,&learn_parm,
			     &kernel_parm,kernel_cache,model,alpha_in,hideo_env);
  }
  else if(learn_parm.type == REGRESSION) {
    /* Regression (and ranking below) take a pointer to the cache pointer,
       since they may replace the cache internally. */
    svm_learn_regression(docs,target,totdoc,totwords,&learn_parm,
			 &kernel_parm,&kernel_cache,model,hideo_env);
  }
  else if(learn_parm.type == RANKING) {
    svm_learn_ranking(docs,target,totdoc,totwords,&learn_parm,
		      &kernel_parm,&kernel_cache,model,hideo_env);
  }
  else if(learn_parm.type == OPTIMIZATION) {
    svm_learn_optimization(docs,target,totdoc,totwords,&learn_parm,
			   &kernel_parm,kernel_cache,model,alpha_in,hideo_env);
  }

  if(kernel_cache) {
    /* Free the memory used for the cache. */
    kernel_cache_cleanup(kernel_cache);
  }

  /* Warning: The model contains references to the original data 'docs'.
     If you want to free the original data, and only keep the model, you 
     have to make a deep copy of 'model'. */
  /* deep_copy_of_model=copy_model(model); */
  write_model(modelfile,model);

  free(alpha_in);
  free_model(model,0);
  for(i=0;i<totdoc;i++) 
    free_example(docs[i],1);
  free(docs);
  free(target);
  free_env(hideo_env);

  return(0);
}
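
Because _svm_learn takes an argv-style vector, an embedding caller has to assemble the arguments by hand. A minimal sketch, assuming the usual SVM-light command-line syntax (svm_learn [options] example_file model_file); the option values and file names are illustrative:

/* Illustrative only: linear kernel (-t 0), C = 1.0 (-c 1.0). */
char *train_argv[] = {
  "svm_learn",   /* argv[0]: program name, skipped by the option parser */
  "-t", "0",     /* kernel type: 0 = linear */
  "-c", "1.0",   /* trade-off between training error and margin */
  "train.dat",   /* training examples */
  "model.dat"    /* file the resulting classifier is written to */
};
_svm_learn(sizeof(train_argv)/sizeof(train_argv[0]), train_argv);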
Example #3
int main (int argc, char* argv[])
{  
  char docfile[200];           /* file with training examples */
  char modelfile[200];         /* file for resulting classifier */
  DOC *docs;  /* training examples */
  long max_docs,max_words_doc;
  long totwords,totdoc,ll,i;
  long kernel_cache_size;
  double *target;
  KERNEL_CACHE kernel_cache;
  LEARN_PARM learn_parm;
  KERNEL_PARM kernel_parm;
  MODEL model;

  read_input_parameters(argc,argv,docfile,modelfile,&verbosity,
			&kernel_cache_size,&learn_parm,&kernel_parm);

  if(verbosity>=1) {
    printf("Scanning examples..."); fflush(stdout);
  }
  nol_ll(docfile,&max_docs,&max_words_doc,&ll); /* scan size of input file */
  max_words_doc+=10;
  ll+=10;
  max_docs+=2;
  if(verbosity>=1) {
    printf("done\n"); fflush(stdout);
  }

  docs = (DOC *)my_malloc(sizeof(DOC)*max_docs);         /* feature vectors */
  target = (double *)my_malloc(sizeof(double)*max_docs); /* target values */
  /* printf("\nMax docs: %ld, approximated number of feature occurrences %ld, maximal length of a line %ld\n\n",max_docs,max_words_doc,ll); */
  read_documents(docfile,docs,target,max_words_doc,ll,&totwords,&totdoc,&kernel_parm);
  printf("\nNumber of examples: %ld, linear space size: %ld\n\n",totdoc,totwords);
 
  /* Open question for the Tree Kernel (kernel_type==5): the number of features
     is proportional to the number of parse trees, i.e. totdoc, so one could set
     totwords=totdoc here -- or keep totwords as the approximation used for
     svm_maxqpsize (see hideo.c). */
  /* if(kernel_parm.kernel_type==5) totwords=totdoc; */

  if(kernel_parm.kernel_type == LINEAR) { /* don't need the cache */
    if(learn_parm.type == CLASSIFICATION) {
      svm_learn_classification(docs,target,totdoc,totwords,&learn_parm,
			       &kernel_parm,NULL,&model);
    }
    else if(learn_parm.type == REGRESSION) {
      svm_learn_regression(docs,target,totdoc,totwords,&learn_parm,
			   &kernel_parm,NULL,&model);
    }
    else if(learn_parm.type == RANKING) {
      svm_learn_ranking(docs,target,totdoc,totwords,&learn_parm,
			&kernel_parm,NULL,&model);
    }
  }
  else {
    if(learn_parm.type == CLASSIFICATION) {
      /* Always get a new kernel cache. It is not possible to use the
         same cache for two different training runs */
      kernel_cache_init(&kernel_cache,totdoc,kernel_cache_size);
      svm_learn_classification(docs,target,totdoc,totwords,&learn_parm,
			       &kernel_parm,&kernel_cache,&model);
      /* Free the memory used for the cache. */
      kernel_cache_cleanup(&kernel_cache);
    }
    else if(learn_parm.type == REGRESSION) {
      /* Always get a new kernel cache. It is not possible to use the
         same cache for two different training runs */
      kernel_cache_init(&kernel_cache,2*totdoc,kernel_cache_size);
      svm_learn_regression(docs,target,totdoc,totwords,&learn_parm,
			   &kernel_parm,&kernel_cache,&model);
      /* Free the memory used for the cache. */
      kernel_cache_cleanup(&kernel_cache);
    }
    else if(learn_parm.type == RANKING) {
      printf("Learning rankings is not implemented for non-linear kernels in this version!\n");
      exit(1);
    }
    else if(learn_parm.type == PERCEPTRON) {
      perceptron_learn_classification(docs,target,totdoc,totwords,&learn_parm,
				      &kernel_parm,&kernel_cache,&model,modelfile);
    }
    else if(learn_parm.type == PERCEPTRON_BATCH) {
      batch_perceptron_learn_classification(docs,target,totdoc,totwords,&learn_parm,
					    &kernel_parm,kernel_cache_size,&model);
    }

  }

  /* Warning: The model contains references to the original data 'docs'.
     If you want to free the original data, and only keep the model, you 
     have to make a deep copy of 'model'. */
  write_model(modelfile,&model);

  free(model.supvec);
  free(model.alpha);
  free(model.index);
  
  for(i=0;i<totdoc;i++) {
    freeExample(&docs[i]);
  }
  
  free(docs);
  free(target);

  return(0);
}
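
Both mains carry the same warning: the trained model keeps pointers into the original 'docs', so freeing the training data invalidates the model unless it is deep-copied first. A hedged sketch of that pattern, written against example #2's heap-allocated model and using copy_model/free_model/free_example as declared in SVM-light's svm_common.c:

/* Sketch: keep the model, release the training data. */
MODEL *kept = copy_model(model);  /* deep-copies the support vectors */
free_model(model, 0);             /* 0 = free the shell only; 'docs' stay valid */
for(i = 0; i < totdoc; i++)
  free_example(docs[i], 1);       /* safe now: 'kept' owns its own copies */
free(docs);
write_model(modelfile, kept);
free_model(kept, 1);              /* 1 = also free the copied support vectors */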