/*
 * Read every example from the text file `docfile`, one example per line,
 * into the caller-supplied arrays `docs` and `label`.
 *
 * Parameters:
 *   docfile       - path of the file to read; opened in text mode.
 *   docs          - output array; entry i receives the queryid, costfactor,
 *                   forest, vectors and docnum (= i) of the i-th line.
 *   label         - output array; entry i receives the label of the i-th line.
 *   max_words_doc - forwarded unchanged to parse_document().
 *   ll            - size in bytes of the line buffer; at most ll-1 characters
 *                   are read per fgets() call.
 *   totwords      - reset to 0 here, then forwarded to parse_document().
 *   totdoc        - receives the number of examples read.
 *   kernel_parm   - kernel_type is read; when kernel_type < 4 the parsed
 *                   trees are released and second_kernel is set to
 *                   kernel_type.
 *
 * The function takes no capacity argument for `docs`/`label`, so no bounds
 * check is performed: the caller must size both arrays for the whole file.
 *
 * On failure to open the file, on an empty string returned by fgets(), or on
 * a parse error, a message is printed and the process exits with status 1.
 */
void read_documents(char *docfile, DOC *docs, double *label,
                    long int max_words_doc, long int ll,
                    long int *totwords, long int *totdoc,
                    KERNEL_PARM *kernel_parm)
{
  char *line;
  DOC doc;
  long dnum = 0;
  double doc_label;
  FILE *docfl;

  line = (char *)my_malloc(sizeof(char) * ll);

  if ((docfl = fopen(docfile, "r")) == NULL) {
    perror(docfile);
    exit(1);
  }

  if (verbosity >= 1) {
    printf("Reading examples into memory...");
    fflush(stdout);
  }

  (*totwords) = 0;

  /* fgets() returns NULL at end of file, so no separate feof() test is needed. */
  while (fgets(line, (int)ll, docfl)) {
    /* Set before parsing so parse_document() sees a 1-based document number. */
    doc.docnum = dnum + 1;

    if (strlen(line) == 0) {
      printf("\nERROR: empty line, missing end of line before end of file\n");
      exit(1);
    }

    if (!parse_document(line, &doc, &doc_label, totwords, max_words_doc,
                        kernel_parm)) {
      /* dnum counts lines already stored, so the failing line is dnum+1. */
      printf("\nParsing error in line %ld!\n%s", dnum + 1, line);
      exit(1);
    }

    label[dnum] = doc_label;

    /* Hand the parsed payload over to the output slot (pointers are shared,
       not deep-copied). */
    docs[dnum].queryid        = doc.queryid;
    docs[dnum].costfactor     = doc.costfactor;
    docs[dnum].forest_vec     = doc.forest_vec;
    docs[dnum].num_of_trees   = doc.num_of_trees;
    docs[dnum].vectors        = doc.vectors;
    docs[dnum].num_of_vectors = doc.num_of_vectors;

    /* Kernel types 0..3 are the original (non-tree) kernels: the trees are
       not needed, so free them to save memory and clear the stored
       references to the released forest. */
    if (kernel_parm->kernel_type < 4) {
      freeForest(&doc);
      docs[dnum].num_of_trees = 0;
      docs[dnum].forest_vec = NULL;
      kernel_parm->second_kernel = kernel_parm->kernel_type;
    }

    /* establish some interval to free vectors
       if(kernel_parm->kernel_type>20){
         docs[dnum].vectors = NULL;
         docs[dnum].num_of_vectors = 0;
         freeVectorSet(&doc);   (save memory by freeing vectors)
       } */

    /* The stored document number is the 0-based array index. */
    docs[dnum].docnum = dnum;

    dnum++;

    /* Progress indicator every 100 examples. */
    if (verbosity >= 1) {
      if ((dnum % 100) == 0) {
        printf("%ld..", dnum);
        fflush(stdout);
      }
    }
  }

  fclose(docfl);
  free(line);

  if (verbosity >= 1) {
    fprintf(stdout, "OK. (%ld examples read)\n", dnum);
  }
  fflush(stdout);

  (*totdoc) = dnum;
}
/*
 * Release the data attached to example `d` by calling freeVectorSet()
 * and then freeForest() on it. The DOC structure itself is not freed here.
 */
void freeExample(DOC *d)
{
  freeVectorSet(d);
  freeForest(d);
}