/*
 * si_compile - compile an SI (Service Indication) XML document into its
 * binary SIbxml form.
 *
 * si_doc    in/out: SI source text; blanks are stripped and its charset
 *           declaration normalised in place before parsing.
 * charset   character set of the source document.
 * si_binary out: on success, a newly created Octstr holding the binary
 *           encoding (caller owns and must destroy); on failure set to NULL.
 *
 * Returns parse_document()'s status (0 on success) or -1 when libxml
 * cannot parse the source at all.
 *
 * Fixes vs. previous version: the failure branch no longer calls
 * xmlFreeDoc(pDoc) when pDoc is known to be NULL (a no-op at best), and
 * *si_binary is reset to NULL after being destroyed so callers cannot be
 * left holding a dangling pointer on the error path.
 */
int si_compile(Octstr *si_doc, Octstr *charset, Octstr **si_binary)
{
    simple_binary_t *sibxml;
    int ret;
    xmlDocPtr pDoc;
    size_t size;
    char *si_c_text;

    *si_binary = octstr_create("");
    sibxml = simple_binary_create();

    /* Normalise the source before handing it to libxml. */
    octstr_strip_blanks(si_doc);
    set_charset(si_doc, charset);
    size = octstr_len(si_doc);
    si_c_text = octstr_get_cstr(si_doc);

    pDoc = xmlParseMemory(si_c_text, size);
    if (pDoc == NULL) {
        octstr_destroy(*si_binary);
        *si_binary = NULL;   /* don't leave caller with a dangling pointer */
        simple_binary_destroy(sibxml);
        error(0, "SI: No document to parse. Probably an error in SI source");
        return -1;
    }

    ret = parse_document(pDoc, charset, &sibxml);
    simple_binary_output(*si_binary, sibxml);
    xmlFreeDoc(pDoc);
    simple_binary_destroy(sibxml);

    return ret;
}
SdfNode* sdf_parse_file(const char* filename) { // Open the file FILE * file = fopen(filename, "r"); // Check to see if we actually opened the file OK if (file == NULL) { printf("Unable to open file \"%s\"", filename); return NULL; } // Prepare the parser status object ParserData data; data.file = file; data.filename = filename; data.buffer_size = INITIAL_BUFFER_SIZE; data.line = malloc(data.buffer_size * sizeof (char)); data.linenumber = 0; data.eof = false; next_line(&data); // Build the AST SdfNode* root = parse_document(&data); // Clean up before return fclose(file); free(data.line); // Return return root; }
/*
 * ota_compile - compile an OTA (over-the-air settings) XML document into
 * its binary form.
 *
 * ota_doc    in/out: OTA source text; blanks are stripped/shrunk and its
 *            charset declaration normalised in place before parsing.
 * charset    character set of the source document.
 * ota_binary out: on success, a newly created Octstr holding the binary
 *            encoding (caller owns); on failure set to NULL.
 *
 * Returns parse_document()'s status (0 on success) or -1 when libxml
 * cannot parse the source at all.
 *
 * Fixes vs. previous version: the failure branch no longer calls
 * xmlFreeDoc(pDoc) when pDoc is known to be NULL, and *ota_binary is reset
 * to NULL after being destroyed so callers cannot be left holding a
 * dangling pointer on the error path.
 */
int ota_compile(Octstr *ota_doc, Octstr *charset, Octstr **ota_binary)
{
    simple_binary_t *otabxml;
    int ret;
    xmlDocPtr pDoc;
    size_t size;
    char *ota_c_text;

    *ota_binary = octstr_create("");
    otabxml = simple_binary_create();

    /* Normalise the source before handing it to libxml.  Unlike
     * si_compile(), OTA also collapses runs of blanks. */
    octstr_strip_blanks(ota_doc);
    octstr_shrink_blanks(ota_doc);
    set_charset(ota_doc, charset);
    size = octstr_len(ota_doc);
    ota_c_text = octstr_get_cstr(ota_doc);

    pDoc = xmlParseMemory(ota_c_text, size);
    if (pDoc == NULL) {
        octstr_destroy(*ota_binary);
        *ota_binary = NULL;  /* don't leave caller with a dangling pointer */
        simple_binary_destroy(otabxml);
        error(0, "OTA: No document to parse. Probably an error in OTA source");
        return -1;
    }

    ret = parse_document(pDoc, charset, &otabxml);
    simple_binary_output(*ota_binary, otabxml);
    xmlFreeDoc(pDoc);
    simple_binary_destroy(otabxml);

    return ret;
}
/*
 * read_documents - load an SVM-light example file into memory.
 *
 * Scans 'docfile' once with nol_ll() to size the buffers, then reads it
 * line by line; lines starting with '#' are skipped.  Each remaining line
 * is handed to parse_document(), which fills the shared 'words' scratch
 * buffer, the label, query/slack ids and the cost factor.
 *
 * Outputs (all caller-owned, allocated here with my_malloc):
 *   *docs     - array of DOC pointers built via create_example/create_svector
 *   *label    - array of target values, parallel to *docs
 *   *totwords - highest feature number seen (words[wpos-2].wnum per line)
 *   *totdoc   - number of examples actually read
 *
 * Exits the process (exit(1)) on a parse error or when a feature number
 * exceeds MAXFEATNUM.  Progress dots are printed when verbosity >= 1.
 * dpos/dneg/dunlab count positive/negative/unlabelled examples but are
 * only used locally.
 */
void read_documents(char *docfile, DOC ***docs, double **label, long int *totwords, long int *totdoc) { char *line,*comment; WORD *words; long dnum=0,wpos,dpos=0,dneg=0,dunlab=0,queryid,slackid,max_docs; long max_words_doc, ll; double doc_label,costfactor; FILE *docfl; if(verbosity>=1) { printf("Scanning examples..."); fflush(stdout); } nol_ll(docfile,&max_docs,&max_words_doc,&ll); /* scan size of input file */ max_words_doc+=2; ll+=2; max_docs+=2; if(verbosity>=1) { printf("done\n"); fflush(stdout); } (*docs) = (DOC **)my_malloc(sizeof(DOC *)*max_docs); /* feature vectors */ (*label) = (double *)my_malloc(sizeof(double)*max_docs); /* target values */ line = (char *)my_malloc(sizeof(char)*ll); if ((docfl = fopen (docfile, "r")) == NULL) { perror (docfile); exit (1); } words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10)); if(verbosity>=1) { printf("Reading examples into memory..."); fflush(stdout); } dnum=0; (*totwords)=0; while((!feof(docfl)) && fgets(line,(int)ll,docfl)) { if(line[0] == '#') continue; /* line contains comments */ if(!parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor, &wpos,max_words_doc,&comment)) { printf("\nParsing error in line %ld!\n%s",dnum,line); exit(1); } (*label)[dnum]=doc_label; /* printf("docnum=%ld: Class=%f ",dnum,doc_label); */ if(doc_label > 0) dpos++; if (doc_label < 0) dneg++; if (doc_label == 0) dunlab++; if((wpos>1) && ((words[wpos-2]).wnum>(*totwords))) (*totwords)=(words[wpos-2]).wnum; if((*totwords) > MAXFEATNUM) { printf("\nMaximum feature number exceeds limit defined in MAXFEATNUM!\n"); printf("LINE: %s\n",line); exit(1); } (*docs)[dnum] = create_example(dnum,queryid,slackid,costfactor, create_svector(words,comment,1.0)); /* printf("\nNorm=%f\n",((*docs)[dnum]->fvec)->twonorm_sq); */ dnum++; if(verbosity>=1) { if((dnum % 100) == 0) { printf("%ld..",dnum); fflush(stdout); } } } fclose(docfl); my_free(line); my_free(words); if(verbosity>=1) { fprintf(stdout, "OK. 
(%ld examples read)\n", dnum); } (*totdoc)=dnum; }
/*
 * read_model - read an SVM-light model file written by svm_learn.
 *
 * Sizes its buffers with nol_ll(), verifies the "SVM-light Version" header
 * against VERSION (exits on mismatch), reads kernel parameters and counts
 * with fscanf, then parses one support vector per line into
 * model->supvec[1..sv_num-1] / model->alpha[1..sv_num-1] (index 0 is
 * deliberately unused, matching sv_num being "count plus 1").
 *
 * Returns a freshly my_malloc'd MODEL; the caller owns it and its arrays.
 * Exits the process on open failure, version mismatch, or a parse error.
 *
 * NOTE(review): the fscanf/fgets return values are never checked, so a
 * truncated or malformed model file silently leaves fields uninitialised
 * before the per-SV loop -- consider validating each read.
 */
MODEL *read_model(char *modelfile) { FILE *modelfl; long i,queryid,slackid; double costfactor; long max_sv,max_words,ll,wpos; char *line,*comment; WORD *words; char version_buffer[100]; MODEL *model; if(verbosity>=1) { printf("Reading model..."); fflush(stdout); } nol_ll(modelfile,&max_sv,&max_words,&ll); /* scan size of model file */ max_words+=2; ll+=2; words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10)); line = (char *)my_malloc(sizeof(char)*ll); model = (MODEL *)my_malloc(sizeof(MODEL)); if ((modelfl = fopen (modelfile, "r")) == NULL) { perror (modelfile); exit (1); } fscanf(modelfl,"SVM-light Version %s\n",version_buffer); if(strcmp(version_buffer,VERSION)) { perror ("Version of model-file does not match version of svm_classify!"); exit (1); } fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type); fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree); fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma); fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin); fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const); fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom); fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords); fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc); fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num); fscanf(modelfl,"%lf%*[^\n]\n", &model->b); model->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num); model->alpha = (double *)my_malloc(sizeof(double)*model->sv_num); model->index=NULL; model->lin_weights=NULL; for(i=1;i<model->sv_num;i++) { fgets(line,(int)ll,modelfl); if(!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid, &costfactor,&wpos,max_words,&comment)) { printf("\nParsing error while reading model file in SV %ld!\n%s", i,line); exit(1); } model->supvec[i] = create_example(-1, 0,0, 0.0, create_svector(words,comment,1.0)); } fclose(modelfl); my_free(line); my_free(words); if(verbosity>=1) { fprintf(stdout, "OK. 
(%d support vectors read)\n",(int)(model->sv_num-1)); } return(model); }
/*
 * main_classify - SVM-light classification driver.
 *
 * Reads command-line parameters (docfile/modelfile/predictionsfile appear
 * to be file-scope buffers filled by read_input_parameters), loads the
 * model, and classifies every non-comment line of the test file.  For a
 * linear kernel (kernel_type == 0) it precomputes the weight vector and
 * uses classify_example_linear(); otherwise classify_example().  Features
 * whose number exceeds model->totwords are zeroed out of the example
 * before linear classification.
 *
 * Predictions are written per pred_format: 0 = legacy "d:+1 -d:-1" pairs,
 * 1 = raw decision value.  Accuracy and precision/recall are printed at
 * the end unless any label was not +/-1 (no_accuracy).  Runtime is
 * accumulated around the classification calls only (get_runtime ticks of
 * 0.01s, hence the /100.0).
 *
 * Returns 0.  Exits the process if the test or prediction file cannot be
 * opened.
 *
 * NOTE(review): 'line' and 'words' are allocated with my_malloc but
 * released with plain free() -- confirm my_malloc is a thin malloc wrapper
 * before relying on that pairing.
 */
int main_classify (int argc, char* argv[]) { DOC *doc; /* test example */ WORDSVM *words; long max_docs,max_words_doc,lld; long totdoc=0,queryid,slackid; long correct=0,incorrect=0,no_accuracy=0; long res_a=0,res_b=0,res_c=0,res_d=0,wnum,pred_format; long j; double t1,runtime=0; double dist,doc_label,costfactor; char *line,*comment; FILE *predfl,*docfl; MODEL *model; read_input_parameters(argc,argv,docfile,modelfile,predictionsfile, &verbosity,&pred_format); nol_ll(docfile,&max_docs,&max_words_doc,&lld); /* scan size of input file */ max_words_doc+=2; lld+=2; line = (char *)my_malloc(sizeof(char)*lld); words = (WORDSVM *)my_malloc(sizeof(WORDSVM)*(max_words_doc+10)); model=read_model(modelfile); if(model->kernel_parm.kernel_type == 0) { /* linear kernel */ /* compute weight vector */ add_weight_vector_to_linear_model(model); } if(verbosity>=2) { printf("Classifying test examples.."); fflush(stdout); } if ((docfl = fopen (docfile, "r")) == NULL) { perror (docfile); exit (1); } if ((predfl = fopen (predictionsfile, "w")) == NULL) { perror (predictionsfile); exit (1); } while((!feof(docfl)) && fgets(line,(int)lld,docfl)) { if(line[0] == '#') continue; /* line contains comments */ parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor,&wnum, max_words_doc,&comment); totdoc++; if(model->kernel_parm.kernel_type == 0) { /* linear kernel */ for(j=0;(words[j]).wnum != 0;j++) { /* Check if feature numbers */ if((words[j]).wnum>model->totwords) /* are not larger than in */ (words[j]).wnum=0; /* model. Remove feature if */ } /* necessary. 
*/ doc = create_example(-1,0,0,0.0,create_svector(words,comment,1.0)); t1=get_runtime(); dist=classify_example_linear(model,doc); runtime+=(get_runtime()-t1); free_example(doc,1); } else { /* non-linear kernel */ doc = create_example(-1,0,0,0.0,create_svector(words,comment,1.0)); t1=get_runtime(); dist=classify_example(model,doc); runtime+=(get_runtime()-t1); free_example(doc,1); } if(dist>0) { if(pred_format==0) { /* old weired output format */ fprintf(predfl,"%.8g:+1 %.8g:-1\n",dist,-dist); } if(doc_label>0) correct++; else incorrect++; if(doc_label>0) res_a++; else res_b++; } else { if(pred_format==0) { /* old weired output format */ fprintf(predfl,"%.8g:-1 %.8g:+1\n",-dist,dist); } if(doc_label<0) correct++; else incorrect++; if(doc_label>0) res_c++; else res_d++; } if(pred_format==1) { /* output the value of decision function */ fprintf(predfl,"%.8g\n",dist); } if((int)(0.01+(doc_label*doc_label)) != 1) { no_accuracy=1; } /* test data is not binary labeled */ if(verbosity>=2) { if(totdoc % 100 == 0) { printf("%ld..",totdoc); fflush(stdout); } } } free(line); free(words); free_model(model,1); if(verbosity>=2) { printf("done\n"); /* Note by Gary Boone Date: 29 April 2000 */ /* o Timing is inaccurate. The timer has 0.01 second resolution. */ /* Because classification of a single vector takes less than */ /* 0.01 secs, the timer was underflowing. */ printf("Runtime (without IO) in cpu-seconds: %.2f\n", (float)(runtime/100.0)); } if((!no_accuracy) && (verbosity>=1)) { printf("Accuracy on test set: %.2f%% (%ld correct, %ld incorrect, %ld total)\n",(float)(correct)*100.0/totdoc,correct,incorrect,totdoc); printf("Precision/recall on test set: %.2f%%/%.2f%%\n",(float)(res_a)*100.0/(res_a+res_b),(float)(res_a)*100.0/(res_a+res_c)); } return(0); }
// Construct a RelaxNGSchema from an in-memory Document: allocate the
// private implementation (pimpl idiom) and immediately parse the given
// document into it via parse_document().
// NOTE(review): 'document' itself does not appear to be retained here --
// confirm in Impl/parse_document whether its lifetime must outlast this
// constructor call.
RelaxNGSchema::RelaxNGSchema(const Document* document) : pimpl_(new Impl) { parse_document(document); }
/*
 * read_documents - tree-kernel variant: load examples into a caller-
 * provided DOC array.
 *
 * Reads 'docfile' line by line (buffer of 'll' bytes, allocated here and
 * freed on exit); each line is parsed by parse_document() into the local
 * scratch 'doc', whose fields (queryid, costfactor, forest_vec, vectors,
 * counts) are then copied into docs[dnum].  An empty line or a parse
 * failure aborts the process with exit(1).
 *
 * For basic kernels (kernel_parm->kernel_type < 4, i.e. the original
 * SVM-light kernels 0..3) the parsed trees are not needed: freeForest()
 * releases them immediately to save memory, the doc's tree fields are
 * cleared, and second_kernel is forced to the basic kernel type.
 *
 * Outputs: label[] filled in parallel with docs[], *totwords reset to 0
 * (not recomputed here), *totdoc = number of examples read.
 * dpos/dneg/dunlab count label signs but are only used locally.
 */
void read_documents(char *docfile, DOC *docs, double *label, long int max_words_doc, long int ll, long int *totwords, long int *totdoc, KERNEL_PARM *kernel_parm) { char *line; DOC doc; long dnum=0,dpos=0,dneg=0,dunlab=0; double doc_label; FILE *docfl; line = (char *)my_malloc(sizeof(char)*ll); if ((docfl = fopen (docfile, "r")) == NULL) { perror (docfile); exit (1); } if(verbosity>=1) { printf("Reading examples into memory..."); fflush(stdout); } dnum=0; (*totwords)=0; while((!feof(docfl)) && fgets(line,(int)ll,docfl)) { doc.docnum=dnum+1; if(strlen(line)==0){ printf("\nERROR: empty line, missing end of line before end of file\n"); exit(1); } if(!parse_document(line, &doc, &doc_label, totwords, max_words_doc, kernel_parm)) { printf("\nParsing error in line %ld!\n%s",dnum,line); exit(1); } label[dnum]=doc_label; /* printf("Class=%ld ",doc_label); */ if (doc_label > 0) dpos++; if (doc_label < 0) dneg++; if (doc_label == 0) dunlab++; docs[dnum].queryid = doc.queryid; docs[dnum].costfactor = doc.costfactor; docs[dnum].forest_vec = doc.forest_vec; docs[dnum].num_of_trees = doc.num_of_trees; docs[dnum].vectors = doc.vectors; docs[dnum].num_of_vectors = doc.num_of_vectors; // less than 5 basic kernels and greater than 50 only vectors (to save memory) if (kernel_parm->kernel_type<4) { // from 0 to 3 are original kernels => no trees freeForest(&doc); // save memory by freeing trees docs[dnum].num_of_trees = 0; docs[dnum].forest_vec =NULL; kernel_parm->second_kernel=kernel_parm->kernel_type; } // establish some interval to free vectors // if(kernel_parm->kernel_type>20){ // docs[dnum].vectors = NULL; // docs[dnum].num_of_vectors = 0; // freeVectorSet(&doc); // save memory by freeing vectors // } docs[dnum].docnum=dnum; /* printf("\nNorm=%f\n",docs[dnum].twonorm_sq); */ /*printf("parse tree number %d: ",dnum); writeTreeString(doc.root); */ /* printf("%d\t",(int)doc_label); */ dnum++; if(verbosity>=1) { if((dnum % 100) == 0) { printf("%ld..",dnum); fflush(stdout); } } } 
fclose(docfl); free(line); if(verbosity>=1) { fprintf(stdout, "OK. (%ld examples read)\n", dnum); } fflush(stdout); (*totdoc)=dnum; }
/*
 * libraryReadDocuments - SVM-light read_documents adapted for GMUM.R.
 *
 * Two input modes, selected by use_gmumr:
 *   - false: behaves like upstream SVM-light -- nol_ll() sizes the
 *     buffers, lines are fgets()-read from 'docfile'.
 *   - true:  sizes come from 'config' (target.n_rows, getDataDim()) and
 *     each "line" is synthesised from the config via
 *     SVMConfigurationToSVMLightLearnInputLine(), copied into a
 *     new char[] buffer.
 *
 * Each line is parsed with parse_document() into the shared 'words'
 * scratch buffer; examples are appended to *docs with labels in *label
 * (both my_malloc'd here, caller-owned).  *totwords tracks the highest
 * feature number seen; exceeding MAXFEATNUM aborts via EXIT(1).
 * Unlabelled examples (label 0) are only accepted when
 * config.use_transductive_learning is set.  *totdoc = examples read.
 *
 * NOTE(review): in the gmumr mode 'line' is allocated with new char[]
 * once per example and never delete[]'d (free(line) runs only in the
 * file mode) -- this leaks one buffer per example; consider reusing a
 * single buffer or freeing at the loop bottom.
 */
void SVMLightRunner::libraryReadDocuments ( char *docfile, DOC ***docs, double **label, long int *totwords, long int *totdoc, bool use_gmumr, SVMConfiguration &config ) { LOG( config.log, LogLevel::DEBUG_LEVEL, __debug_prefix__ + ".libraryReadDocuments() Started." ); char *line,*comment; WORD *words; long dnum=0,wpos,dpos=0,dneg=0,dunlab=0,queryid,slackid,max_docs; long max_words_doc, ll; double doc_label,costfactor; FILE *docfl; if(verbosity>=1) { C_PRINTF("Scanning examples..."); C_FFLUSH(stdout); } // GMUM.R changes { if (!use_gmumr) { nol_ll(docfile,&max_docs,&max_words_doc,&ll); /* scan size of input file */ } else { max_docs = config.target.n_rows; max_words_doc = config.getDataDim(); // ll used only for file reading } // GMUM.R changes } max_words_doc+=2; ll+=2; max_docs+=2; if(verbosity>=1) { C_PRINTF("done\n"); C_FFLUSH(stdout); } (*docs) = (DOC **)my_malloc(sizeof(DOC *)*max_docs); /* feature vectors */ (*label) = (double *)my_malloc(sizeof(double)*max_docs); /* target values */ // GMUM.R changes { if (!use_gmumr) { line = (char *)my_malloc(sizeof(char)*ll); if ((docfl = fopen (docfile, "r")) == NULL) { perror (docfile); EXIT (1); } } // GMUM.R changes } words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10)); if(verbosity>=1) { C_PRINTF("Reading examples into memory..."); C_FFLUSH(stdout); } dnum=0; (*totwords)=0; // GMUM.R changes { bool newline; if (!use_gmumr) { newline = (!feof(docfl)) && fgets(line,(int)ll,docfl); } else { newline = false; if (dnum < config.target.n_rows) { newline = true; std::string str = SVMConfigurationToSVMLightLearnInputLine(config, dnum); line = new char[str.size() + 1]; std::copy(str.begin(), str.end(), line); line[str.size()] = '\0'; } } while(newline) { if (use_gmumr) { std::string stringline = ""; } // GMUM.R changes } if(line[0] == '#') continue; /* line contains comments */ if(!parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor, &wpos,max_words_doc,&comment)) { C_PRINTF("\nParsing error in line 
%ld!\n%s",dnum,line); EXIT(1); } (*label)[dnum]=doc_label; /* C_PRINTF("docnum=%ld: Class=%f ",dnum,doc_label); */ if(doc_label > 0) dpos++; if (doc_label < 0) dneg++; if (doc_label == 0) { if(config.use_transductive_learning){ dunlab++; }else{ C_PRINTF("Please for transductive learning pass use_transductive_learning\n"); EXIT(1); } } if((wpos>1) && ((words[wpos-2]).wnum>(*totwords))) (*totwords)=(words[wpos-2]).wnum; if((*totwords) > MAXFEATNUM) { C_PRINTF("\nMaximum feature number exceeds limit defined in MAXFEATNUM!\n"); EXIT(1); } (*docs)[dnum] = create_example(dnum,queryid,slackid,costfactor, create_svector(words,comment,1.0)); /* C_PRINTF("\nNorm=%f\n",((*docs)[dnum]->fvec)->twonorm_sq); */ dnum++; if(verbosity>=1) { if((dnum % 100) == 0) { C_PRINTF("%ld..",dnum); C_FFLUSH(stdout); } } // GMUM.R changes { if (!use_gmumr) { newline = (!feof(docfl)) && fgets(line,(int)ll,docfl); } else { newline = false; if (dnum < config.target.n_rows) { newline = true; std::string str = SVMConfigurationToSVMLightLearnInputLine(config, dnum); line = new char[str.size() + 1]; std::copy(str.begin(), str.end(), line); line[str.size()] = '\0'; } } // GMUM.R changes } } if (!use_gmumr) { fclose(docfl); free(line); }; free(words); if(verbosity>=1) { C_FPRINTF(stdout, "OK. (%ld examples read)\n", dnum); } (*totdoc)=dnum; }
/*
 * libraryReadModel - SVM-light read_model adapted for GMUM.R.
 *
 * Two modes, selected by use_gmumr:
 *   - false: upstream behaviour -- sizes buffers via nol_ll(), checks the
 *     "SVM-light Version" header against VERSION (EXITs on mismatch),
 *     reads kernel parameters/counts with fscanf, then parses one support
 *     vector per fgets() line.
 *   - true:  fills the MODEL fields directly from 'config' (kernel type,
 *     degree, gamma, coef0, totwords, totdoc, sv_num = l+1, b = -config.b
 *     to match classify_example_linear's "sum - b" convention) and builds
 *     each SV line via SVMConfigurationToSVMLightModelSVLine().
 *
 * In both modes supvec/alpha are indexed 1..sv_num-1 (slot 0 unused).
 * Returns a freshly my_malloc'd MODEL; caller owns it.
 *
 * NOTE(review): in the gmumr branch,
 *     char *model_kernel_parm_custom = model->kernel_parm.custom;
 *     model_kernel_parm_custom = kernel_parm_custom;
 * only reassigns the local pointer copy -- the "empty" string is never
 * written into model->kernel_parm.custom, which stays uninitialised.
 * A strcpy/snprintf into model->kernel_parm.custom is almost certainly
 * what was intended.
 * NOTE(review): fscanf return values are unchecked in the file mode.
 */
MODEL * SVMLightRunner::libraryReadModel( char *modelfile, bool use_gmumr, SVMConfiguration &config ) { LOG( config.log, LogLevel::DEBUG_LEVEL, __debug_prefix__ + ".libraryReadModel() Started." ); FILE *modelfl; long i,queryid,slackid; double costfactor; long max_sv,max_words,ll,wpos; char *line,*comment; WORD *words; char version_buffer[100]; MODEL *model; if(verbosity>=1) { C_PRINTF("Reading model..."); C_FFLUSH(stdout); } // GMUM.R changes { model = (MODEL *)my_malloc(sizeof(MODEL)); if (!use_gmumr) { nol_ll(modelfile,&max_sv,&max_words,&ll); /* scan size of model file */ max_words+=2; ll+=2; words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10)); line = (char *)my_malloc(sizeof(char)*ll); if ((modelfl = fopen (modelfile, "r")) == NULL) { perror (modelfile); EXIT (1); } fscanf(modelfl,"SVM-light Version %s\n",version_buffer); if(strcmp(version_buffer,VERSION)) { perror ("Version of model-file does not match version of svm_classify!"); EXIT (1); } fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type); fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree); fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma); fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin); fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const); fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom); fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords); fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc); fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num); fscanf(modelfl,"%lf%*[^\n]\n", &model->b); } else { // use_gmumr max_words = config.getDataDim(); words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10)); LOG( config.log, LogLevel::DEBUG_LEVEL, __debug_prefix__ + ".libraryReadModel() Converting config to model..." ); /* 0=linear, 1=poly, 2=rbf, 3=sigmoid, 4=custom -- same as GMUM.R! 
*/ model->kernel_parm.kernel_type = static_cast<long int>(config.kernel_type); // -d int -> parameter d in polynomial kernel model->kernel_parm.poly_degree = config.degree; // -g float -> parameter gamma in rbf kernel model->kernel_parm.rbf_gamma = config.gamma; // -s float -> parameter s in sigmoid/poly kernel model->kernel_parm.coef_lin = config.gamma; // -r float -> parameter c in sigmoid/poly kernel model->kernel_parm.coef_const = config.coef0; // -u string -> parameter of user defined kernel char kernel_parm_custom[50] = "empty"; char * model_kernel_parm_custom = model->kernel_parm.custom; model_kernel_parm_custom = kernel_parm_custom; // highest feature index model->totwords = config.getDataDim(); // number of training documents model->totdoc = config.target.n_rows; // number of support vectors plus 1 (!) model->sv_num = config.l + 1; /* Threshold b (has opposite sign than SVMClient::predict()) * In svm_common.c:57 in double classify_example_linear(): * return(sum-model->b); */ model->b = - config.b; LOG( config.log, LogLevel::DEBUG_LEVEL, __debug_prefix__ + ".libraryReadModel() Converting config done." 
); } // GMUM.R changes } model->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num); model->alpha = (double *)my_malloc(sizeof(double)*model->sv_num); model->index=NULL; model->lin_weights=NULL; // GMUM.R changes { if (!use_gmumr) { for(i=1;i<model->sv_num;i++) { fgets(line,(int)ll,modelfl); if(!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid, &costfactor,&wpos,max_words,&comment)) { C_PRINTF("\nParsing error while reading model file in SV %ld!\n%s", i,line); EXIT(1); } model->supvec[i] = create_example(-1, 0,0, 0.0, create_svector(words,comment,1.0)); } fclose(modelfl); free(line); } else { for(i = 1; i < model->sv_num; ++i) { line = SVMConfigurationToSVMLightModelSVLine(config, i-1); if(!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid, &costfactor,&wpos,max_words,&comment)) { C_PRINTF("\nParsing error while reading model file in SV %ld!\n%s", i,line); EXIT(1); } model->supvec[i] = create_example(-1, 0,0, 0.0, create_svector(words,comment,1.0)); free(line); } } // GMUM.R changes } free(words); if(verbosity>=1) { C_FPRINTF(stdout, "OK. (%d support vectors read)\n",(int)(model->sv_num-1)); } LOG( config.log, LogLevel::DEBUG_LEVEL, __debug_prefix__ + ".libraryReadModel() Done." ); return(model); }
/*
 * librarySVMClassifyMain - SVM-light svm_classify main adapted for GMUM.R.
 *
 * Two modes, selected by use_gmumr:
 *   - false: upstream behaviour -- reads test examples from 'docfile',
 *     writes predictions to 'predictionsfile' per pred_format
 *     (0 = legacy pair format, 1 = raw decision value).
 *   - true:  examples are synthesised from 'config' via
 *     SVMConfigurationToSVMLightLearnInputLine(); pred_format is forced
 *     to -1 so nothing is written to a file, and each decision value is
 *     stored in config.result[totdoc-1] instead.
 *
 * Loads the model via libraryReadModel(); for a linear kernel
 * (kernel_type == 0) precomputes the weight vector and strips features
 * numbered above model->totwords before classify_example_linear();
 * otherwise uses classify_example().  Accuracy and precision/recall are
 * printed at the end unless a label was not +/-1 (no_accuracy).
 * Runtime accumulates only around the classification calls (get_runtime
 * ticks of 0.01s, hence /100.0).  Returns 0.
 *
 * NOTE(review): in the gmumr mode 'line' is allocated with new char[]
 * once per example and never delete[]'d (free(line) runs only in the
 * file mode) -- one buffer leaks per example.
 */
int SVMLightRunner::librarySVMClassifyMain( int argc, char **argv, bool use_gmumr, SVMConfiguration &config ) { LOG( config.log, LogLevel::DEBUG_LEVEL, __debug_prefix__ + ".librarySVMClassifyMain() Started." ); DOC *doc; /* test example */ WORD *words; long max_docs,max_words_doc,lld; long totdoc=0,queryid,slackid; long correct=0,incorrect=0,no_accuracy=0; long res_a=0,res_b=0,res_c=0,res_d=0,wnum,pred_format; long j; double t1,runtime=0; double dist,doc_label,costfactor; char *line,*comment; FILE *predfl,*docfl; MODEL *model; // GMUM.R changes { librarySVMClassifyReadInputParameters( argc, argv, docfile, modelfile, predictionsfile, &verbosity, &pred_format, use_gmumr, config); if (!use_gmumr) { nol_ll(docfile,&max_docs,&max_words_doc,&lld); /* scan size of input file */ lld+=2; line = (char *)my_malloc(sizeof(char)*lld); } else { max_docs = config.target.n_rows; max_words_doc = config.getDataDim(); config.result = arma::zeros<arma::vec>(max_docs); // Prevent writing to the file pred_format = -1; // lld used only for file reading } max_words_doc+=2; words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10)); // GMUM.R changes } model=libraryReadModel(modelfile, use_gmumr, config); // GMUM.R changes } if(model->kernel_parm.kernel_type == 0) { /* linear kernel */ /* compute weight vector */ add_weight_vector_to_linear_model(model); } if(verbosity>=2) { C_PRINTF("Classifying test examples.."); C_FFLUSH(stdout); } // GMUM.R changes { bool newline; if (!use_gmumr) { if ((predfl = fopen (predictionsfile, "w")) == NULL) { perror (predictionsfile); EXIT (1); } if ((docfl = fopen (docfile, "r")) == NULL) { perror (docfile); EXIT (1); } newline = (!feof(docfl)) && fgets(line,(int)lld,docfl); } else { newline = false; if (totdoc < config.getDataExamplesNumber()) { newline = true; std::string str = SVMConfigurationToSVMLightLearnInputLine(config, totdoc); line = new char[str.size() + 1]; std::copy(str.begin(), str.end(), line); line[str.size()] = '\0'; } } while(newline) { if 
(use_gmumr) { std::string stringline = ""; } // GMUM.R changes } if(line[0] == '#') continue; /* line contains comments */ parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor,&wnum, max_words_doc,&comment); totdoc++; if(model->kernel_parm.kernel_type == 0) { /* linear kernel */ for(j=0;(words[j]).wnum != 0;j++) { /* Check if feature numbers */ if((words[j]).wnum>model->totwords) /* are not larger than in */ (words[j]).wnum=0; /* model. Remove feature if */ } /* necessary. */ doc = create_example(-1,0,0,0.0,create_svector(words,comment,1.0)); t1=get_runtime(); dist=classify_example_linear(model,doc); runtime+=(get_runtime()-t1); free_example(doc,1); } else { /* non-linear kernel */ doc = create_example(-1,0,0,0.0,create_svector(words,comment,1.0)); t1=get_runtime(); dist=classify_example(model,doc); runtime+=(get_runtime()-t1); free_example(doc,1); } if(dist>0) { if(pred_format==0) { /* old weired output format */ C_FPRINTF(predfl,"%.8g:+1 %.8g:-1\n",dist,-dist); } if(doc_label>0) correct++; else incorrect++; if(doc_label>0) res_a++; else res_b++; } else { if(pred_format==0) { /* old weired output format */ C_FPRINTF(predfl,"%.8g:-1 %.8g:+1\n",-dist,dist); } if(doc_label<0) correct++; else incorrect++; if(doc_label>0) res_c++; else res_d++; } if(pred_format==1) { /* output the value of decision function */ C_FPRINTF(predfl,"%.8g\n",dist); } if((int)(0.01+(doc_label*doc_label)) != 1) { no_accuracy=1; } /* test data is not binary labeled */ if(verbosity>=2) { if(totdoc % 100 == 0) { C_PRINTF("%ld..",totdoc); C_FFLUSH(stdout); } } // GMUM.R changes { if (!use_gmumr) { newline = (!feof(docfl)) && fgets(line,(int)lld,docfl); } else { newline = false; // Store prediction result in config config.result[totdoc-1] = dist; // Read next line if (totdoc < config.getDataExamplesNumber()) { newline = true; std::string str = SVMConfigurationToSVMLightLearnInputLine(config, totdoc); line = new char[str.size() + 1]; std::copy(str.begin(), str.end(), line); 
line[str.size()] = '\0'; } } } if (!use_gmumr) { fclose(predfl); fclose(docfl); free(line); } // GMUM.R changes } free(words); free_model(model,1); if(verbosity>=2) { C_PRINTF("done\n"); /* Note by Gary Boone Date: 29 April 2000 */ /* o Timing is inaccurate. The timer has 0.01 second resolution. */ /* Because classification of a single vector takes less than */ /* 0.01 secs, the timer was underflowing. */ C_PRINTF("Runtime (without IO) in cpu-seconds: %.2f\n", (float)(runtime/100.0)); } if((!no_accuracy) && (verbosity>=1)) { C_PRINTF("Accuracy on test set: %.2f%% (%ld correct, %ld incorrect, %ld total)\n",(float)(correct)*100.0/totdoc,correct,incorrect,totdoc); C_PRINTF("Precision/recall on test set: %.2f%%/%.2f%%\n",(float)(res_a)*100.0/(res_a+res_b),(float)(res_a)*100.0/(res_a+res_c)); } return(0); }
/*
 * read_struct_model - read an SVM-multiclass structural model file.
 * Used only by the prediction module, not by learning.
 *
 * Sizes buffers with nol_ll(), checks the "SVM-multiclass Version" header
 * against INST_VERSION (exits on mismatch), reads the struct parameters
 * (num_classes, num_features, loss_function) into *sparm and the kernel
 * parameters/counts into a fresh MODEL, then parses one support vector
 * per line into supvec[1..sv_num-1]/alpha[1..sv_num-1] (slot 0 unused).
 * Each SV's queryid is stashed in supvec[i]->fvec->kernel_id.
 *
 * Returns a STRUCTMODEL by value with svm_model set, sizePsi = totwords,
 * and w = NULL (weight vector built later).  Exits on open failure,
 * version mismatch, or parse error.
 *
 * NOTE(review): the "%d" conversions into sparm->num_classes etc. assume
 * those fields are int; fscanf/fgets returns are unchecked, so a
 * truncated file yields undefined field values -- consider validating.
 */
STRUCTMODEL read_struct_model(char *file, STRUCT_LEARN_PARM *sparm) { /* Reads structural model sm from file file. This function is used only in the prediction module, not in the learning module. */ FILE *modelfl; STRUCTMODEL sm; long i,queryid,slackid; double costfactor; long max_sv,max_words,ll,wpos; char *line,*comment; TOKEN *words; char version_buffer[100]; MODEL *model; nol_ll(file,&max_sv,&max_words,&ll); /* scan size of model file */ max_words+=2; ll+=2; words = (TOKEN *)my_malloc(sizeof(TOKEN)*(max_words+10)); line = (char *)my_malloc(sizeof(char)*ll); model = (MODEL *)my_malloc(sizeof(MODEL)); if ((modelfl = fopen (file, "r")) == NULL) { perror (file); exit (1); } fscanf(modelfl,"SVM-multiclass Version %s\n",version_buffer); if(strcmp(version_buffer,INST_VERSION)) { perror ("Version of model-file does not match version of svm_struct_classify!"); exit (1); } fscanf(modelfl,"%d%*[^\n]\n", &sparm->num_classes); fscanf(modelfl,"%d%*[^\n]\n", &sparm->num_features); fscanf(modelfl,"%d%*[^\n]\n", &sparm->loss_function); fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type); fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree); fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma); fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin); fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const); fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom); fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords); fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc); fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num); fscanf(modelfl,"%lf%*[^\n]\n", &model->b); model->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num); model->alpha = (double *)my_malloc(sizeof(double)*model->sv_num); model->index=NULL; model->lin_weights=NULL; for(i=1;i<model->sv_num;i++) { fgets(line,(int)ll,modelfl); if(!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid, &costfactor,&wpos,max_words,&comment, true)) { printf("\nParsing error while reading 
model file in SV %ld!\n%s", i,line); exit(1); } model->supvec[i] = create_example(-1,0,0,0.0, create_svector(words,comment,1.0)); model->supvec[i]->fvec->kernel_id=queryid; } fclose(modelfl); free(line); free(words); if(verbosity>=1) { fprintf(stdout, " (%d support vectors read) ",(int)(model->sv_num-1)); } sm.svm_model=model; sm.sizePsi=model->totwords; sm.w=NULL; return(sm); }
/*
 * read_documents - early SVM-light variant: load examples into a caller-
 * provided DOC array, copying each word vector.
 *
 * Reads 'docfile' line by line (buffer of 'll' bytes, allocated and freed
 * here); '#' lines are skipped.  Each line is parsed into the shared
 * scratch 'doc' (whose 'words' buffer of max_words_doc+10 entries is
 * allocated once and freed at the end); a right-sized WORD array of wpos
 * entries is then my_malloc'd for docs[dnum] and the parsed words copied
 * in, along with queryid, costfactor and twonorm_sq.
 *
 * Outputs: label[] filled in parallel with docs[], *totwords = highest
 * feature number seen (doc.words[wpos-2].wnum), *totdoc = examples read.
 * Exits the process on a parse error; progress dots when verbosity >= 1.
 * dpos/dneg/dunlab count label signs but are only used locally.
 */
void read_documents(char *docfile, DOC *docs, double *label, long int max_words_doc, long int ll, long int *totwords, long int *totdoc) { char *line; DOC doc; long dnum=0,wpos,i,dpos=0,dneg=0,dunlab=0; double doc_label; FILE *docfl; line = (char *)my_malloc(sizeof(char)*ll); if ((docfl = fopen (docfile, "r")) == NULL) { perror (docfile); exit (1); } doc.words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10)); if(verbosity>=1) { printf("Reading examples into memory..."); fflush(stdout); } dnum=0; (*totwords)=0; while((!feof(docfl)) && fgets(line,(int)ll,docfl)) { if(line[0] == '#') continue; /* line contains comments */ if(!parse_document(line,&doc,&doc_label,&wpos,max_words_doc)) { printf("\nParsing error in line %ld!\n%s",dnum,line); exit(1); } label[dnum]=doc_label; /* printf("Class=%ld ",doc_label); */ if(doc_label > 0) dpos++; if (doc_label < 0) dneg++; if (doc_label == 0) dunlab++; if((wpos>1) && ((doc.words[wpos-2]).wnum>(*totwords))) (*totwords)=(doc.words[wpos-2]).wnum; docs[dnum].queryid = doc.queryid; docs[dnum].costfactor = doc.costfactor; docs[dnum].words = (WORD *)my_malloc(sizeof(WORD)*(wpos)); docs[dnum].docnum=dnum; for(i=0;i<wpos;i++) { docs[dnum].words[i]=doc.words[i]; /* printf("%ld::%f ",(docs[dnum].words[i]).wnum,(docs[dnum].words[i]).weight); */ } docs[dnum].twonorm_sq=doc.twonorm_sq; /* printf("\nNorm=%f\n",docs[dnum].twonorm_sq); */ dnum++; if(verbosity>=1) { if((dnum % 100) == 0) { printf("%ld..",dnum); fflush(stdout); } } } fclose(docfl); free(line); free(doc.words); if(verbosity>=1) { fprintf(stdout, "OK. (%ld examples read)\n", dnum); } (*totdoc)=dnum; }