Exemple #1
0
/*
 * Read all examples from the text file `docfile` into the caller-supplied
 * arrays `docs` and `label`, one example per input line.
 *
 *   docfile        path of the example file; opened read-only
 *   docs           output array, entry i receives the i-th parsed example
 *   label          output array, entry i receives the label of example i
 *   max_words_doc  forwarded unchanged to parse_document()
 *   ll             size in bytes of the line buffer; every input line
 *                  (including its newline) must fit in ll-1 characters
 *   totwords       reset to 0 here, then handed to parse_document()
 *   totdoc         set to the number of examples read
 *   kernel_parm    kernel settings; when kernel_type < 4 the parsed trees
 *                  are released and second_kernel is set to kernel_type
 *
 * The capacity of `docs` and `label` is not checked here; the caller must
 * size both arrays for the number of lines in the file.
 *
 * Any failure (file cannot be opened, empty or over-long line, parse error)
 * prints a message and terminates the process with exit status 1.
 */
void read_documents(char *docfile, DOC *docs, double *label,
                    long int max_words_doc, long int ll,
                    long int *totwords, long int *totdoc, KERNEL_PARM *kernel_parm)
{
  char *line;
  DOC doc;
  long dnum=0;
  double doc_label;
  FILE *docfl;
  size_t len;

  line = (char *)my_malloc(sizeof(char)*ll);

  if ((docfl = fopen (docfile, "r")) == NULL)
  { perror (docfile); exit (1); }

  if(verbosity>=1) {
    printf("Reading examples into memory..."); fflush(stdout);
  }
  (*totwords)=0;

  /* fgets() returning NULL covers both end-of-file and read errors, so no
     separate feof() test is needed. */
  while(fgets(line,(int)ll,docfl)) {

    doc.docnum=dnum+1;

    len = strlen(line);
    if(len==0){
      printf("\nERROR: empty line, missing end of line before end of file\n");
      exit(1);
    }

    /* No trailing newline means either the last line of the file lacks one
       (fine) or the line did not fit into the buffer.  Peek one character to
       tell the two apart: if more input follows, the line was cut and the
       remainder would otherwise be parsed as a separate, bogus example. */
    if(line[len-1] != '\n' && fgetc(docfl) != EOF) {
      printf("\nERROR: line %ld is longer than the %ld character line buffer\n",
             dnum+1, ll-1);
      exit(1);
    }

    if(!parse_document(line, &doc, &doc_label, totwords, max_words_doc, kernel_parm)) {
      /* report a 1-based line number, matching what editors display */
      printf("\nParsing error in line %ld!\n%s",dnum+1,line);
      exit(1);
    }

    label[dnum]=doc_label;

    docs[dnum].queryid = doc.queryid;
    docs[dnum].costfactor = doc.costfactor;

    docs[dnum].forest_vec = doc.forest_vec;
    docs[dnum].num_of_trees = doc.num_of_trees;
    docs[dnum].vectors = doc.vectors;
    docs[dnum].num_of_vectors = doc.num_of_vectors;

    /* Kernel types 0 to 3 are the original (non-tree) kernels, so the parsed
       trees are not needed: free them to save memory and clear the stored
       references so docs[dnum] does not point at released trees. */
    if (kernel_parm->kernel_type<4) {
      freeForest(&doc);
      docs[dnum].num_of_trees = 0;
      docs[dnum].forest_vec =NULL;
      kernel_parm->second_kernel=kernel_parm->kernel_type;
    }

    /* Feature vectors are always kept; freeing them for some kernel types
       to save memory is not enabled. */

    /* stored examples are numbered from 0 */
    docs[dnum].docnum=dnum;

    dnum++;
    if(verbosity>=1) {
      if((dnum % 100) == 0) {
        printf("%ld..",dnum); fflush(stdout);
      }
    }
  }

  fclose(docfl);
  free(line);

  if(verbosity>=1) {
    fprintf(stdout, "OK. (%ld examples read)\n", dnum);
  }

  fflush(stdout);

  (*totdoc)=dnum;
}
/*
 * Release the data attached to example `d`: first its vector set via
 * freeVectorSet(), then its trees via freeForest().
 * This function itself does not call free() on `d`.
 */
void freeExample(DOC *d)
{
  freeVectorSet(d);
  freeForest(d);
}