/*
 * Scan the first `l` lines of `input_file` and store each line's query id.
 *
 * Every line must start with a numeric label followed by at least one
 * "<name>:<value>" pair. When the first pair's name is "qid", its integer
 * value is written to query[i]; otherwise query[i] is left untouched.
 *
 * input_file  path of the text file to scan
 * query       output array with room for at least `l` entries
 * l           number of lines to read
 *
 * Terminates the process if the file cannot be opened; malformed or missing
 * lines are reported through exit_input_error() with a 1-based line number.
 */
void statistic_queries(char *input_file, int *query, int l)
{
	char *endptr;
	char *idx, *val, *label;
	FILE *fp = fopen(input_file,"r");

	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",input_file);
		exit(1);
	}

	for(int i=0;i<l;i++)
	{
		// Fewer than `l` lines available: do not re-parse the previous buffer.
		if(readline(fp) == NULL)
			exit_input_error(i+1);

		label = strtok(line," \t\n");
		if(label == NULL)
			exit_input_error(i+1);
		// The label value itself is not needed here; it is parsed only to validate it.
		(void) strtod(label,&endptr);
		if(endptr == label || *endptr != '\0')
			exit_input_error(i+1);

		idx = strtok(NULL,":");
		val = strtok(NULL," \t");
		// val == NULL also covers idx == NULL: once strtok runs out of tokens it keeps returning NULL.
		if(val == NULL)
			exit_input_error(i+1);

		if(!strcmp(idx,"qid"))
		{
			errno = 0;
			query[i] = (int) strtol(val, &endptr, 10);
			if(endptr == val || errno !=0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(i+1);
		}
	}
	fclose(fp);
}
/*
 * Load a comma-separated file in which every line is
 *     <label>,<value 1>,<value 2>,...
 * The values carry no explicit index; they are numbered 1, 2, ... in the
 * order they appear on the line.
 *
 * For each line the label is appended to prob.y, the running element count
 * before the line is appended to prob.x_ptr, one node per value is appended
 * to prob.x, and the number of values is appended to prob.x_interval.
 * prob.l counts the lines and prob.max_feature is the largest per-line
 * value count.
 *
 * A line whose first field is empty is reported via exit_input_error().
 * If the file cannot be opened a diagnostic is printed and an empty problem
 * (l == 0, max_feature == 0) is returned, as before.
 */
problem Flexible_vector::Read_file_without_index(std::string filename)
{
	std::cout<<"IN Read_file_without_index"<<std::endl;
	std::ifstream file_in(filename.c_str());
	if(!file_in.is_open())
		std::cerr<<"can't open input file "<<filename<<std::endl;

	std::string e,line;
	problem prob;
	int elements, max_index, inst_max_index;

	prob.l = 0;
	elements = 0;
	max_index = 0;

	while (std::getline(file_in, line))
	{
		inst_max_index = 0;
		std::stringstream lineStream(line);

		// label
		std::getline(lineStream, e,',');
		if(e.empty()) // empty line
			exit_input_error(prob.l+1);
		prob.y.push_back(stringToNum<double>(e));

		// offset of this instance's first node inside prob.x
		prob.x_ptr.push_back(elements);

		// features
		while(std::getline(lineStream, e,','))
		{
			inst_max_index++;
			node x_tmp;
			x_tmp.index = inst_max_index;
			x_tmp.value = stringToNum<double>(e);
			prob.x.push_back(x_tmp);
			elements++;
		}

		prob.x_interval.push_back(inst_max_index);
		if(inst_max_index > max_index)
			max_index = inst_max_index;
		prob.l++;
	}

	prob.max_feature = max_index;
	return prob;
}
void predict(FILE *input, FILE *output) { int correct = 0; int total = 0; double error = 0; double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0; int svm_type=svm_get_svm_type(model); int nr_class=svm_get_nr_class(model); double *prob_estimates=NULL; int j; if(predict_probability) { if (svm_type==NU_SVR || svm_type==EPSILON_SVR) printf("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma=%g\n",svm_get_svr_probability(model)); else { int *labels=(int *) malloc(nr_class*sizeof(int)); svm_get_labels(model,labels); prob_estimates = (double *) malloc(nr_class*sizeof(double)); fprintf(output,"labels"); for(j=0;j<nr_class;j++) fprintf(output," %d",labels[j]); fprintf(output,"\n"); free(labels); } } max_line_len = 1024; line = (char *)malloc(max_line_len*sizeof(char)); while(readline(input) != NULL) { int i = 0; double target_label, predict_label; char *idx, *val, *label, *endptr; int inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0 label = strtok(line," \t"); target_label = strtod(label,&endptr); if(endptr == label) exit_input_error(total+1); while(1) { if(i>=max_nr_attr-1) // need one more for index = -1 { max_nr_attr *= 2; x = (struct svm_node *) realloc(x,max_nr_attr*sizeof(struct svm_node)); } idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; x[i].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index) exit_input_error(total+1); else inst_max_index = x[i].index; errno = 0; x[i].value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(total+1); ++i; } x[i].index = -1; if (predict_probability && (svm_type==C_SVC || svm_type==NU_SVC)) { predict_label = svm_predict_probability(model,x,prob_estimates); fprintf(output,"%g",predict_label); for(j=0;j<nr_class;j++) fprintf(output," 
%g",prob_estimates[j]); fprintf(output,"\n"); } else { predict_label = svm_predict(model,x); fprintf(output,"%g\n",predict_label); } if(predict_label == target_label) ++correct; error += (predict_label-target_label)*(predict_label-target_label); sump += predict_label; sumt += target_label; sumpp += predict_label*predict_label; sumtt += target_label*target_label; sumpt += predict_label*target_label; ++total; } if (svm_type==NU_SVR || svm_type==EPSILON_SVR) { printf("Mean squared error = %g (regression)\n",error/total); printf("Squared correlation coefficient = %g (regression)\n", ((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/ ((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt)) ); } else printf("Accuracy = %g%% (%d/%d) (classification)\n", (double)correct/total*100,correct,total); if(predict_probability) free(prob_estimates); }
void binary_class_predict(FILE *input, FILE *output){ int total = 0; int *labels; int max_nr_attr = 64; struct svm_node *x = Malloc(struct svm_node, max_nr_attr); dvec_t dec_values; ivec_t true_labels; int svm_type=svm_get_svm_type(model); if (svm_type==NU_SVR || svm_type==EPSILON_SVR){ fprintf(stderr, "wrong svm type."); exit(1); } labels = Malloc(int, svm_get_nr_class(model)); svm_get_labels(model, labels); max_line_len = 1024; line = (char *)malloc(max_line_len*sizeof(char)); while(readline(input) != NULL) { int i = 0; double target_label, predict_label; char *idx, *val, *label, *endptr; int inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0 label = strtok(line," \t"); target_label = strtod(label,&endptr); if(endptr == label) exit_input_error(total+1); while(1) { if(i>=max_nr_attr - 2) // need one more for index = -1 { max_nr_attr *= 2; x = (struct svm_node *) realloc(x,max_nr_attr*sizeof(struct svm_node)); } idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; x[i].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index) exit_input_error(total+1); else inst_max_index = x[i].index; errno = 0; x[i].value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(total+1); ++i; } x[i].index = -1; predict_label = svm_predict(model,x); fprintf(output,"%g\n",predict_label); double dec_value; svm_predict_values(model, x, &dec_value); true_labels.push_back((target_label > 0)? 1: -1); if(labels[0] <= 0) dec_value *= -1; dec_values.push_back(dec_value); } // validation_function(dec_values, true_labels); accuracy(dec_values, true_labels); bac(dec_values, true_labels); free(labels); free(x); }
/*
 * Read a training file ("<label> <index>:<value> ..." per line) into the
 * file-scope `prob`, using one of two layouts chosen at compile time:
 *
 *   _DENSE_REP defined  - every instance owns a `values` array and a `dim`
 *                         count; indices skipped in the file are filled
 *                         with 0.0.
 *   otherwise           - all instances share one node array (x_space),
 *                         each instance terminated by a node with index -1.
 *
 * _FLOAT_REP selects float instead of double storage for the values.
 *
 * The file is read twice: once to size the arrays, once to fill them.
 * Afterwards param.gamma defaults to 1/max_index when it is 0, and for the
 * precomputed kernel types the first column of every instance is validated.
 *
 * Exits the process if the file cannot be opened; malformed lines are
 * reported through exit_input_error() with a 1-based line number.
 */
void read_problem(const char *filename)
{
	int elements, max_index, inst_max_index, i, j;
#ifdef _DENSE_REP
	double value;
#endif
	FILE *fp = fopen(filename,"r");
	char *endptr;
	char *idx, *val, *label;

	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",filename);
		exit(1);
	}

	prob.l = 0;
	elements = 0;

	max_line_len = 1024;
	line = Malloc(char,max_line_len);
#ifdef _DENSE_REP
	// Pass 1 (dense): count lines and find the widest instance by looking at
	// the index of the last "<index>:<value>" pair on each line.
	max_index = 1;
	while(readline(fp) != NULL)
	{
		char *p;
		p = strrchr(line, ':');
		if(p != NULL)
		{
			// walk back from the last ':' to the blank that precedes its index
			while(*p != ' ' && *p != '\t' && p > line)
				p--;
			if(p > line)
				max_index = (int) strtol(p,&endptr,10) + 1;
		}
		// max_index is not reset per line, so it carries over to lines without a usable ':'
		if(max_index > elements)
			elements = max_index;
		++prob.l;
	}

	rewind(fp);

	prob.y = Malloc(double,prob.l);
	prob.x = Malloc(struct svm_node,prob.l);

	// Pass 2 (dense): parse each line into its own values array.
	for(i=0;i<prob.l;i++)
	{
		int *d;
		int instance_elements = elements;
		// triangular precomputed kernels: row i gets i+2 slots
		if(param.kernel_type == TRI_PRECOMPUTED || param.kernel_type == TRI_PRECOMPUTED_RBF)
			instance_elements = (i+2);
#ifdef _FLOAT_REP
		(prob.x+i)->values = Malloc(float,instance_elements);
#else
		(prob.x+i)->values = Malloc(double,instance_elements);
#endif
		(prob.x+i)->dim = 0;
		inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0
		readline(fp);
		// NOTE(review): unlike the sparse branch below, label is not checked for NULL here.
		label = strtok(line," \t");
		prob.y[i] = strtod(label,&endptr);
		if(endptr == label)
			exit_input_error(i+1);

		while((prob.x+i)->dim<instance_elements)
		{
			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;

			errno = 0;
			j = (int) strtol(idx,&endptr,10);
			// indices must parse cleanly and be strictly increasing within a line
			if(endptr == idx || errno != 0 || *endptr != '\0' || j <= inst_max_index)
				exit_input_error(i+1);
			else
				inst_max_index = j;

			errno = 0;
			value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(i+1);

			d = &((prob.x+i)->dim);
			// zero-fill the positions skipped between the previous index and j
			while (*d < j)
				(prob.x+i)->values[(*d)++] = 0.0;
#ifdef _FLOAT_REP
			(prob.x+i)->values[(*d)++] = (float) value;
#else
			(prob.x+i)->values[(*d)++] = value;
#endif
		}
	}
	max_index = elements-1;
#else
	// Pass 1 (sparse): count lines and nodes (features plus one terminator per line).
	while(readline(fp)!=NULL)
	{
		char *p = strtok(line," \t"); // label

		// features
		while(1)
		{
			p = strtok(NULL," \t");
			if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature
				break;
			++elements;
		}
		++elements; // slot for the index = -1 terminator
		++prob.l;
	}
	rewind(fp);

	prob.y = Malloc(double,prob.l);
	prob.x = Malloc(struct svm_node *,prob.l);
	x_space = Malloc(struct svm_node,elements);

	// Pass 2 (sparse): fill x_space; j is the next free node.
	max_index = 0;
	j=0;
	for(i=0;i<prob.l;i++)
	{
		inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0
		readline(fp);
		prob.x[i] = &x_space[j];
		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(i+1);

		prob.y[i] = strtod(label,&endptr);
		if(endptr == label || *endptr != '\0')
			exit_input_error(i+1);

		while(1)
		{
			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;

			errno = 0;
			x_space[j].index = (int) strtol(idx,&endptr,10);
			// indices must parse cleanly and be strictly increasing within a line
			if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[j].index <= inst_max_index)
				exit_input_error(i+1);
			else
				inst_max_index = x_space[j].index;

			errno = 0;
#ifdef _FLOAT_REP
			x_space[j].value = strtof(val,&endptr);
#else
			x_space[j].value = strtod(val,&endptr);
#endif
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(i+1);

			++j;
		}

		if(inst_max_index > max_index)
			max_index = inst_max_index;
		x_space[j++].index = -1; // terminator node
	}
#endif

	// default gamma: 1 / (largest index seen)
	if(param.gamma == 0 && max_index > 0)
		param.gamma = 1.0/max_index;

	// Precomputed kernels: validate the first column of every instance.
	if(param.kernel_type == PRECOMPUTED || param.kernel_type == TRI_PRECOMPUTED || param.kernel_type == PRECOMPUTED_RBF || param.kernel_type == TRI_PRECOMPUTED_RBF)
		for(i=0;i<prob.l;i++)
		{
#ifdef _DENSE_REP
			if ((prob.x+i)->dim == 0 || (prob.x+i)->values[0] == 0.0)
			{
				fprintf(stderr,"Wrong input format: first column must be 0:sample_serial_number\n");
				exit(1);
			}
			if ((int)(prob.x+i)->values[0] < 0 || (int)(prob.x+i)->values[0] > max_index)
			{
				fprintf(stderr,"Wrong input format: sample_serial_number out of range\n");
				exit(1);
			}
#else
			if (prob.x[i][0].index != 0)
			{
				fprintf(stderr,"Wrong input format: first column must be 0:sample_serial_number\n");
				exit(1);
			}
			if ((int)prob.x[i][0].value <= 0 || (int)prob.x[i][0].value > max_index)
			{
				fprintf(stderr,"Wrong input format: sample_serial_number out of range\n");
				exit(1);
			}
#endif
		}

	fclose(fp);
}
void do_predict(FILE *input, FILE *output, struct model* model_) { int correct = 0; int total = 0; int nr_class=get_nr_class(model_); double *prob_estimates=NULL; int j, n; int nr_feature=get_nr_feature(model_); if(model_->bias>=0) n=nr_feature+1; else n=nr_feature; if(flag_predict_probability) { int *labels; if(!check_probability_model(model_)) { fprintf(stderr, "probability output is only supported for logistic regression\n"); exit(1); } labels=(int *) malloc(nr_class*sizeof(int)); get_labels(model_,labels); prob_estimates = (double *) malloc(nr_class*sizeof(double)); fprintf(output,"labels"); for(j=0;j<nr_class;j++) fprintf(output," %d",labels[j]); fprintf(output,"\n"); free(labels); } max_line_len = 1024; line = (char *)malloc(max_line_len*sizeof(char)); while(readline(input) != NULL) { int i = 0; int target_label, predict_label; char *idx, *val, *label, *endptr; int inst_max_index = 0; // strtol gives 0 if wrong format label = strtok(line," \t"); target_label = (int) strtol(label,&endptr,10); if(endptr == label) exit_input_error(total+1); while(1) { if(i>=max_nr_attr-2) // need one more for index = -1 { max_nr_attr *= 2; x = (struct feature_node *) realloc(x,max_nr_attr*sizeof(struct feature_node)); } idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; x[i].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index) exit_input_error(total+1); else inst_max_index = x[i].index; errno = 0; x[i].value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(total+1); // feature indices larger than those in training are not used if(x[i].index <= nr_feature) ++i; } if(model_->bias>=0) { x[i].index = n; x[i].value = model_->bias; i++; } x[i].index = -1; if(flag_predict_probability) { int j; predict_label = predict_probability(model_,x,prob_estimates); fprintf(output,"%d",predict_label); for(j=0;j<model_->nr_class;j++) 
fprintf(output," %g",prob_estimates[j]); fprintf(output,"\n"); } else { predict_label = predict(model_,x); fprintf(output,"%d\n",predict_label); } if(predict_label == target_label) ++correct; ++total; } printf("Accuracy = %g%% (%d/%d)\n",(double) correct/total*100,correct,total); if(flag_predict_probability) free(prob_estimates); }
void predict(FILE *input, FILE *output) { int correct = 0; int total = 0; double error = 0; double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0; int svm_type=svm_get_svm_type(model); int nr_class=svm_get_nr_class(model); double *prob_estimates=NULL; int j; // This block by Jianxin Wu, for average accuracy computation int ii,label_index; // number of correct predictions in each category int* correct_sub = (int *)malloc(nr_class*sizeof(int)); for(ii=0;ii<nr_class;ii++) correct_sub[ii] = 0; // number of testing examples in each category int* total_sub = (int *)malloc(nr_class*sizeof(int)); for(ii=0;ii<nr_class;ii++) total_sub[ii] = 0; int* labels_avg = (int*)malloc(nr_class*sizeof(int)); svm_get_labels(model,labels_avg); if(predict_probability) { if (svm_type==NU_SVR || svm_type==EPSILON_SVR) printf("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma=%g\n",svm_get_svr_probability(model)); else { int *labels=(int *) malloc(nr_class*sizeof(int)); svm_get_labels(model,labels); prob_estimates = (double *) malloc(nr_class*sizeof(double)); fprintf(output,"labels"); for(j=0;j<nr_class;j++) fprintf(output," %d",labels[j]); fprintf(output,"\n"); free(labels); } } max_line_len = 1024; line = (char *)malloc(max_line_len*sizeof(char)); while(readline(input) != NULL) { int i = 0; double target_label, predict_label; char *idx, *val, *label, *endptr; int inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0 label = strtok(line," \t"); target_label = strtod(label,&endptr); if(endptr == label) exit_input_error(total+1); while(1) { if(i>=max_nr_attr-1) // need one more for index = -1 { max_nr_attr *= 2; x = (struct svm_node *) realloc(x,max_nr_attr*sizeof(struct svm_node)); } idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; x[i].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index 
<= inst_max_index) exit_input_error(total+1); else inst_max_index = x[i].index; errno = 0; x[i].value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(total+1); ++i; } x[i].index = -1; if (predict_probability && (svm_type==C_SVC || svm_type==NU_SVC)) { predict_label = svm_predict_probability(model,x,prob_estimates); fprintf(output,"%g",predict_label); for(j=0;j<nr_class;j++) fprintf(output," %g",prob_estimates[j]); fprintf(output,"\n"); } else { predict_label = svm_predict(model,x); fprintf(output,"%g\n",predict_label); } // This block by Jianxin Wu, for average accuracy label_index = FindLabel((int)target_label,labels_avg); total_sub[label_index]++; if(predict_label == target_label) correct_sub[label_index]++; if(predict_label == target_label) ++correct; error += (predict_label-target_label)*(predict_label-target_label); sump += predict_label; sumt += target_label; sumpp += predict_label*predict_label; sumtt += target_label*target_label; sumpt += predict_label*target_label; ++total; } if (svm_type==NU_SVR || svm_type==EPSILON_SVR) { printf("Mean squared error = %g (regression)\n",error/total); printf("Squared correlation coefficient = %g (regression)\n", ((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/ ((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt)) ); } else printf("Accuracy = %g%% (%d/%d) (classification)\n", (double)correct/total*100,correct,total); if(predict_probability) free(prob_estimates); // This block (till endo of function) by Jianxin WU // Print per-category accuracy and average accuracy of categories double sub_score = 0; int nonempty_category = 0; for(ii=0;ii<nr_class;ii++) { if(total_sub[ii]>0) { sub_score += (correct_sub[ii]*1.0/total_sub[ii]); nonempty_category++; } } printf("-----------\n"); for(ii=0;ii<nr_class;ii++) { printf("%d / %d (Category %d)\n",correct_sub[ii],total_sub[ii],labels_avg[ii]); } printf("-----------\n"); printf("Mean Accuray across classes = 
%g%%\n",sub_score*100.0/nonempty_category); free(correct_sub); free(total_sub); free(labels_avg); }
void read_problem(const char *filename) { int elements, max_index, inst_max_index, i, j; FILE *fp = fopen(filename,"r"); char *endptr; char *idx, *val, *label; if(fp == NULL) { fprintf(stderr,"can't open input file %s\n",filename); exit(1); } prob.l = 0; elements = 0; max_line_len = 1024; line = Malloc(char,max_line_len); while(readline(fp)!=NULL) { char *p = strtok(line," \t"); // label // features while(1) { p = strtok(NULL," \t"); if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature break; ++elements; } ++elements; ++prob.l; } rewind(fp); prob.y = Malloc(double,prob.l); prob.x = Malloc(struct svm_node *,prob.l); prob.W = Malloc(double,prob.l); x_space = Malloc(struct svm_node,elements); max_index = 0; j=0; for(i=0;i<prob.l;i++) { inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0 readline(fp); prob.x[i] = &x_space[j]; label = strtok(line," \t\n"); if(label == NULL) // empty line exit_input_error(i+1); prob.y[i] = strtod(label,&endptr); if(endptr == label || *endptr != '\0') exit_input_error(i+1); prob.W[i] = 1; while(1) { idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; x_space[j].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[j].index <= inst_max_index) exit_input_error(i+1); else inst_max_index = x_space[j].index; errno = 0; x_space[j].value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(i+1); ++j; } if(inst_max_index > max_index) max_index = inst_max_index; x_space[j++].index = -1; } if(param.gamma == 0 && max_index > 0) param.gamma = 1.0/max_index; if(param.kernel_type == PRECOMPUTED) for(i=0;i<prob.l;i++) { if (prob.x[i][0].index != 0) { fprintf(stderr,"Wrong input format: first column must be 0:sample_serial_number\n"); exit(1); } if ((int)prob.x[i][0].value <= 0 || (int)prob.x[i][0].value > max_index) { fprintf(stderr,"Wrong 
input format: sample_serial_number out of range\n"); exit(1); } } fclose(fp); if(weight_file) { fp = fopen(weight_file,"r"); for(i=0;i<prob.l;i++) fscanf(fp,"%lf",&prob.W[i]); fclose(fp); } }
// read in a problem (in libsvm format) // clicks shows qid:1 f1:val1 f2:val2 void read_problem(const char *filename) { int max_index, inst_max_index, i; long int elements, j; FILE *fp = fopen(filename,"r"); char *endptr; char *idx, *val; //char *label; int clicks = 0; int shows = 0; int lines = 0; if(fp == NULL) { fprintf(stderr,"can't open input file %s\n",filename); exit(1); } prob.l = 0; elements = 0; max_line_len = 1024; line = Malloc(char,max_line_len); while(readline(fp)!=NULL) { char *p = strtok(line," \t"); // clicks clicks = atoi(p); p = strtok(NULL," \t"); // shows shows = atoi(p); p = strtok(NULL," \t"); // qid if (shows <=0 || clicks > shows) { lines++; continue; } // features while(1) { p = strtok(NULL," \t"); if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature break; elements += shows; } elements += shows; // for bias term prob.l += shows; lines++; } rewind(fp); /* printf("lines:%d\n",lines); printf("prob length:%d\n", prob.l); printf("elements:%d\n", elements); */ prob.bias=bias; prob.y = Malloc(double,prob.l); prob.x = Malloc(struct feature_node *,prob.l); x_space = Malloc(struct feature_node,elements+prob.l); max_index = 0; j = 0; long int start = 0; long int end = 0; long int instances = 0; long int feature_len = 0; int non_clicks = 0; int feature_node_size = sizeof(struct feature_node); for(i=0;i<lines;i++) { inst_max_index = 0; // strtol gives 0 if wrong format readline(fp); char *p = strtok(line," \t\n"); if(p == NULL) // empty line exit_input_error(i+1); clicks = atoi(p); p = strtok(NULL," \t"); // shows shows = atoi(p); if (shows <=0 || clicks > shows) { continue; } p = strtok(NULL," \t"); // qid start = end; j=end; prob.x[instances] = &x_space[j]; prob.y[instances] = 0; // label nonclicks while(1) { idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; x_space[j].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[j].index <= 
inst_max_index) exit_input_error(i+1); else inst_max_index = x_space[j].index; errno = 0; x_space[j].value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(i+1); ++j; } if(inst_max_index > max_index) max_index = inst_max_index; if(prob.bias >= 0) x_space[j++].value = prob.bias; x_space[j++].index = -1; end = j; feature_len = end - start; non_clicks = shows - clicks; instances++; for(int k=1; k<shows; k++) { if (k < non_clicks) { prob.y[instances] = 0; } else { prob.y[instances] = 1; } prob.x[instances] = &x_space[end]; memcpy(&x_space[end], &x_space[start], feature_len * feature_node_size); /* for(j = 0; j<feature_len;j++) { x_space[end+j].index = x_space[start+j].index; x_space[end+j].value = x_space[start+j].value; } */ start = end; end += feature_len; instances++; } } /* for(i=0;i<prob.l;i++) { printf("prob[%d]:%f\n",i,prob.y[i]); } for(i=0;i<elements+prob.l;i++) { printf("x_space[%d]:%d,%f\n",i,x_space[i].index,x_space[i].value); } */ if(prob.bias >= 0) { prob.n=max_index+1; for(i=1;i<lines;i++) (prob.x[i]-2)->index = prob.n; x_space[j-2].index = prob.n; } else prob.n=max_index; fclose(fp); }
void do_predict(FILE *input, FILE *output) { int total=0; int n; int nr_feature=get_nr_feature(model_); double *dvec_t; double *ivec_t; int *query; n=nr_feature; max_line_len = 1024; line = (char *)malloc(max_line_len*sizeof(char)); while(readline(input) != NULL) total++; rewind(input); dvec_t = new double[total]; ivec_t = new double[total]; query = new int[total]; total = 0; while(readline(input) != NULL) { int i = 0; double target_label, predict_label; char *idx, *val, *label, *endptr; int inst_max_index = 0; // strtol gives 0 if wrong format query[total] = 0; label = strtok(line," \t\n"); if(label == NULL) // empty line exit_input_error(total+1); target_label = strtod(label,&endptr); if(endptr == label || *endptr != '\0') exit_input_error(total+1); ivec_t[total] = target_label; while(1) { if(i>=max_nr_attr-2) // need one more for index = -1 { max_nr_attr *= 2; x = (struct feature_node *) realloc(x,max_nr_attr*sizeof(struct feature_node)); } idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; if (strcmp(idx,"qid") == 0) { errno = 0; query[total] = (int) strtol(val,&endptr,10); if(endptr == val || errno != 0 || *endptr != '\0') exit_input_error(i+1); continue; } errno = 0; x[i].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index) exit_input_error(total+1); else inst_max_index = x[i].index; errno = 0; x[i].value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(total+1); // feature indices larger than those in training are not used if(x[i].index <= nr_feature) ++i; } x[i].index = -1; predict_label = predict(model_,x); fprintf(output,"%.10f\n",predict_label); dvec_t[total++] = predict_label; } double result[3]; eval_list(ivec_t,dvec_t,query,total,result); info("Pairwise Accuracy = %g%%\n",result[0]*100); info("MeanNDCG (LETOR) = %g\n",result[1]); info("NDCG (YAHOO) = %g\n",result[2]); }
void split(char *input_file, int l, int machines, int nr_query, struct Query_Machine *q_machine, int *query) { int machine_id = 0; double y; int len = 0; FILE *fp = fopen(input_file,"r"); char *idx, *val, *endptr; char *label; char **out_file = (char**)malloc(sizeof(char*)*machines); for(int i=0;i<machines;i++) out_file[i] = (char*)malloc(sizeof(char)*1024); FILE **f = (FILE**)malloc(sizeof(FILE*)*machines); if(mkdir("temp_dir",0777)==0) { printf("Directory was successfully created!\n"); } else { printf("Directory has existed!!\n"); } for(int i=0;i<machines;i++) { sprintf(out_file[i],"temp_dir/train.txt.%d",i); f[i] = fopen(out_file[i],"w"); } char *copy_line = (char*)malloc(sizeof(char)*2048); for(int j=0;j<l;j++) { readline(fp); len = (int)strlen(line); //printf("len=%d for line:%d\n",len,j+1); if(len > 2048) { copy_line = (char*)realloc(copy_line,len*sizeof(char)); } sprintf(copy_line, "%s", line); //strcpy(copy_line,line); //printf("copy_line:%s",copy_line); //printf("line:%s",line); label = strtok(line, " \t\n"); if(label == NULL) exit_input_error(j+1); y = strtod(label, &endptr); if(endptr == label || *endptr != '\0') exit_input_error(j+1); idx = strtok(NULL,":"); val = strtok(NULL, " \t"); if(val == NULL) exit_input_error(j+1); if(!strcmp(idx,"qid")) { errno = 0; query[j] = (int)strtol(val, &endptr, 10); if(endptr == val || errno !=0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(j+1); } for(int i=0;i<nr_query;i++) { if(query[j] == q_machine[i].query) { machine_id = q_machine[i].machine_id; break; } } fprintf(f[machine_id],"%s",copy_line); } free(copy_line); for(int i=0;i<machines;i++) free(out_file[i]); free(out_file); for(int i=0;i<machines;i++) fclose(f[i]); rewind(fp); fclose(fp); }
void do_predict(FILE *input, FILE *output) { int total = 0; int nr_class=get_nr_class(model_); double *prob_estimates=NULL; int n; int nr_feature=get_nr_feature(model_); if(model_->bias>=0) n=nr_feature+1; else n=nr_feature; if(!check_probability_model(model_)) { fprintf(stderr, "probability output is only supported for logistic regression\n"); exit(1); } prob_estimates = (double *) malloc(nr_class*sizeof(double)); max_line_len = 1024; line = (char *)malloc(max_line_len*sizeof(char)); int clicks = 0; int shows = 0; while(readline(input) != NULL) { int i = 0; double target_ctr, predict_ctr; char *idx, *val, *endptr; int inst_max_index = 0; // strtol gives 0 if wrong format char *p = strtok(line," \t\n"); //clicks if(p == NULL) // empty line exit_input_error(total+1); clicks = atoi(p); p = strtok(NULL," \t"); // shows shows = atoi(p); p = strtok(NULL," \t"); // qid:1 if (shows <=0 || clicks > shows) { continue; } target_ctr = (double)clicks / shows; while(1) { if(i>=max_nr_attr-2) // need one more for index = -1 { max_nr_attr *= 2; x = (struct feature_node *) realloc(x,max_nr_attr*sizeof(struct feature_node)); } idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; x[i].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index) exit_input_error(total+1); else inst_max_index = x[i].index; errno = 0; x[i].value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(total+1); // feature indices larger than those in training are not used if(x[i].index <= nr_feature) ++i; } if(model_->bias>=0) { x[i].index = n; x[i].value = model_->bias; i++; } x[i].index = -1; predict_probability(model_,x,prob_estimates); fprintf(output,"%d %d ", clicks, shows); predict_ctr = prob_estimates[0]; fprintf(output," %g\n", predict_ctr); } info("total:%d\n",total); free(prob_estimates); }
void read_problem(const char *filename) { int elements[2], inst_max_index, i[2], j[2]; FILE *fp = fopen(filename,"r"); char *endptr; char *idx, *val, *label; if(fp == NULL) { fprintf(stderr,"can't open input file %s\n",filename); exit(1); } prob[0].l = 0; prob[1].l = 0; elements[0] = 0; elements[1] = 0; int current_set; max_line_len = 1024; line = Malloc(char,max_line_len); while(readline(fp)!=NULL) { if(line[0] == '-') current_set = 1; else current_set = 0; char *p = strtok(line," \t"); // label // features while(1) { p = strtok(NULL," \t"); if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature break; ++elements[current_set]; } ++elements[current_set]; ++prob[current_set].l; } rewind(fp); prob[0].y = Malloc(double,prob[0].l); prob[0].x = Malloc(struct svm_node *,prob[0].l); x_space[0] = Malloc(struct svm_node,elements[0]); prob[1].y = Malloc(double,prob[1].l); prob[1].x = Malloc(struct svm_node *,prob[1].l); x_space[1] = Malloc(struct svm_node,elements[1]); // max_index = 0; j[0]=0; j[1]=0; i[0] = 0; i[1] = 0; while( i[0] < prob[0].l || i[1] < prob[1].l ) // for(i=0;i<prob.l;i++) { inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0 readline(fp); if(line[0] == '-') current_set = 1; else current_set = 0; prob[current_set].x[i[current_set]] = &x_space[current_set][j[current_set]]; label = strtok(line," \t"); prob[current_set].y[i[current_set]] = strtod(label,&endptr); if(endptr == label) exit_input_error(i[0]+i[1]+1); while(1) { idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; x_space[current_set][j[current_set]].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[current_set][j[current_set]].index <= inst_max_index) exit_input_error(i[0]+i[1]+1); else inst_max_index = x_space[current_set][j[current_set]].index; errno = 0; x_space[current_set][j[current_set]].value = strtod(val,&endptr); if(endptr == val || 
errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(i[0]+i[1]+1); ++j[current_set]; } if(inst_max_index > max_index) max_index = inst_max_index; x_space[current_set][j[current_set]++].index = -1; i[current_set]++; } fclose(fp); }
/** * This code is largely borrowed from LIBLINEAR. * TODO: This can be made faster with SIMD. **/ size_t create_dw_corpus(std::string filename, const size_t n_elements, const size_t n_examples, double * & p_examples, long * & p_cols, long * & p_rows){ p_examples = new double[n_elements]; p_cols = new long[n_elements]; p_rows = new long[n_examples]; FILE *fp = fopen(filename.c_str(),"r"); ssize_t read; size_t max_line_len = 1024; char * line = new char[max_line_len]; char *endptr; char *idx, *val, *label; double * y = new double[n_examples]; int input_idx; double input_label; double input_value; if(fp == NULL){ fprintf(stderr,"can't open input file %s\n",filename.c_str()); exit(1); } size_t maxidx = 0; int inst_max_index, j=0; size_t ct = 0; for(int i=0;i<n_examples;i++){ inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0 read = getline(&line, &max_line_len, fp); label = strtok(line," \t\n"); if(label == NULL) // empty line exit_input_error(i+1); input_label = strtod(label,&endptr); if(endptr == label || *endptr != '\0') exit_input_error(i+1); p_rows[i] = ct; while(1){ idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; input_idx = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0') exit_input_error(i+1); errno = 0; input_value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(i+1); ++j; p_examples[ct] = input_value; p_cols[ct] = input_idx; ct ++; //std::cout << input_idx << std::endl; if(input_idx > maxidx){ maxidx = input_idx; } } p_examples[ct] = (input_label+1)/2; // normalize +1/-1 to 1/0 p_cols[ct] = -1; ct ++; } maxidx ++; return maxidx; }
void problem::Load(const char* filename,const NUMBER bias,const int maxdim) { // revised from the LIBLINEAR/LIBSVM read_problem() function timeb TimingMilliSeconds; ftime(&TimingMilliSeconds); Clear(); FILE *fp = fopen(filename,"r"); if(fp == NULL) { fprintf(stderr,"can't open input file %s\n",filename); exit(1); } max_line_len = 1024; line = Malloc(char,max_line_len); l = 0; allocated = 0; while(readline(fp)!=NULL) { char *p = strtok(line," \t"); // label // features while(1) { p = strtok(NULL," \t"); if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature break; allocated++; } allocated++; l++; } rewind(fp); if(bias>0) allocated += l; index_buf = new int[allocated]; value_buf = new NUMBER[allocated]; y = new int[l]; indexes = new int*[l]; values = new NUMBER*[l]; int max_index = 0; long int j=0; char *endptr; char *idx, *val, *label; for(int i=0;i<l;i++) { int inst_max_index = 0; // strtol gives 0 if wrong format readline(fp); indexes[i] = &index_buf[j]; values[i] = &value_buf[j]; label = strtok(line," \t"); y[i] = (int) strtol(label,&endptr,10); if(endptr == label) exit_input_error(i+1); while(1) { if(bias>0) { index_buf[j] = 1; value_buf[j] = bias; } idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; index_buf[j] = (int) strtol(idx,&endptr,10)+(bias>0); if(endptr == idx || errno != 0 || *endptr != '\0' || index_buf[j] <= inst_max_index) exit_input_error(i+1); else inst_max_index = index_buf[j]; errno = 0; value_buf[j] = (NUMBER)strtod(val,&endptr); if(value_buf[j]<0) value_buf[j] = 0.001; //else if(value_buf[j]>=1) //value_buf[j] = 0.999; if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(i+1); ++j; } if(inst_max_index > max_index) max_index = inst_max_index; index_buf[j++] = -1; } n=max_index; if(maxdim>0 && n>maxdim) n=maxdim; fclose(fp); free(line); line = NULL; struct timeb now; ftime(&now); std::cout<<"Dataset loaded in "<<int( 
(now.time-TimingMilliSeconds.time)*1000+(now.millitm-TimingMilliSeconds.millitm) )<<" msec."<<std::endl; }
//struct feature_node //{ // int index; // double value; //}; //void _parse_command_line(int argc, char **argv, char *input_file_name, char *model_file_name) //#endif //{ // int i; // void (*print_func)(const char*) = NULL; // default printing to stdout // // // default values // _param.solver_type = L2R_L2LOSS_SVC; // _param.C = 1; // _param.eps = INF; // see setting below // _param.p = 0; // _param.nr_weight = 0; // _param.weight_label = NULL; // _param.weight = NULL; // _param.ite = 100; //default // // parse options // for(i=1;i<argc;i++) // { // if(argv[i][0] != '-') break; // if(++i>=argc) // exit_with_help(); // switch(argv[i-1][1]) // { // case 's': // _param.solver_type = atoi(argv[i]); // break; // // case 'c': // _param.C = atof(argv[i]); // break; // // case 'p': // _param.p = atof(argv[i]); // break; // // case 'e': // _param.eps = atof(argv[i]); // break; // // case 'i': // _param.ite = atof(argv[i]); // break; // // case 'q': // print_func = &print_null; // i--; // break; // // default: // fprintf(stderr,"unknown option: -%c\n", argv[i-1][1]); // exit_with_help(); // break; // } // } // // set_print_string_function(print_func); // // // determine filenames // if(i>=argc) // exit_with_help(); // // strcpy(input_file_name, argv[i]); //#ifdef FIGURE56 // if(i<argc-1) // strcpy(test_file_name,argv[i+1]); // else // { // exit_with_help(); // } //#else // if(i<argc-1) // strcpy(model_file_name,argv[i+1]); // else // { // //strrchr() 函数查找字符在指定字符串中从后面开始的第一次出现的位置,如果成功,则返回从该位置到字符串结尾的所有字符,如果失败,则返回 false。 // //与之相对应的是strchr()函数,它查找字符串中首次出现指定字符的位置。 // char *p = strrchr(argv[i],'/'); // if(p==NULL)//there are no parent directories in the path of train data, that means train data are in the current directory // p = argv[i]; // else//the train data contain parent directoryis, ++p: the pointer move from char '/' to the first char of the input train data // ++p; // sprintf(model_file_name,"%s.model",p);//int sprintf ( char * str, const char * format, ... 
);Write formatted data to string // } //#endif // // if(_param.eps == INF) // { // switch(_param.solver_type) // { // case L2R_L2LOSS_SVC: // _param.eps = 0.01; // break; // case L2R_L2LOSS_SVR: // case WX_RBTREE: // case Y_RBTREE: // case AVLTREE: // case AATREE: // case DIRECT_COUNT: // case SELECTION_TREE: // case PRSVMP: // _param.eps = 0.001; // break; // case L2R_L1LOSS_SVR_DUAL: // _param.eps = 0.1; // break; // } // } //} void read_problem(const char *filename) { int max_index, inst_max_index, i; long int elements, j; FILE *fp = fopen(filename,"r"); char *endptr; char *idx, *val, *label; if(fp == NULL) { fprintf(stderr,"can't open input file %s\n",filename); exit(1); } prob.l = 0; elements = 0; max_line_len = 1024; line = Malloc(char,max_line_len); while(readline(fp)!=NULL) { char *p = strtok(line," \t"); // label // features while(1) { p = strtok(NULL," \t"); if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature break; elements++; } elements++; // for bias term prob.l++; } rewind(fp); // struct problem // { // int l, n;//l: total instance number(starts from 0);n:total feature number // int *query; // double *y;//y:label value // struct feature_node **x; // }; //prob.l: instances number of input file //prob.y: label array of each instance of input file //prob.x: pointer array of each instance's features //prob.query:query array of each instance of input file prob.y = Malloc(double,prob.l); prob.x = Malloc(struct feature_node *,prob.l); prob.query = Malloc(int,prob.l); //x_space:apply enough spaces to store all instances' features x_space = Malloc(struct feature_node,elements+prob.l); max_index = 0; j=0; for(i=0;i<prob.l;i++)//iterate all instances { prob.query[i] = 0; inst_max_index = 0; // strtol gives 0 if wrong format readline(fp); prob.x[i] = &x_space[j]; label = strtok(line," \t\n"); if(label == NULL) // empty line exit_input_error(i+1); prob.y[i] = strtod(label,&endptr); if(endptr == label || *endptr != '\0') 
exit_input_error(i+1); while(1) { idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; if (!strcmp(idx,"qid"))//qid { errno = 0; prob.query[i] = (int) strtol(val, &endptr,10); if(endptr == val || errno !=0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(i+1); } else//feature { errno = 0; x_space[j].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[j].index <= inst_max_index) exit_input_error(i+1); else inst_max_index = x_space[j].index; errno = 0; x_space[j].value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(i+1); ++j; /*for debug if(j==46) printf("%d",j);*/ } }//finish parsing one line of data if(inst_max_index > max_index) max_index = inst_max_index; x_space[j++].index = -1; }// iterate all instances prob.n=max_index; fclose(fp); }
// read in a problem (in libsvm format) void read_problem(const char *filename) { int max_index, inst_max_index, i; size_t elements, j; FILE *fp = fopen(filename,"r"); char *endptr; char *idx, *val, *label; if(fp == NULL) { fprintf(stderr,"can't open input file %s\n",filename); exit(1); } prob.l = 0; elements = 0; max_line_len = 1024; line = Malloc(char,max_line_len); while(readline(fp)!=NULL) { char *p = strtok(line," \t"); // label // features while(1) { p = strtok(NULL," \t"); if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature break; elements++; } elements++; // for bias term prob.l++; } rewind(fp); prob.bias=bias; prob.y = Malloc(double,prob.l); prob.x = Malloc(struct feature_node *,prob.l); x_space = Malloc(struct feature_node,elements+prob.l); max_index = 0; j=0; for(i=0;i<prob.l;i++) { inst_max_index = 0; // strtol gives 0 if wrong format readline(fp); prob.x[i] = &x_space[j]; label = strtok(line," \t\n"); if(label == NULL) // empty line exit_input_error(i+1); prob.y[i] = strtod(label,&endptr); if(endptr == label || *endptr != '\0') exit_input_error(i+1); while(1) { idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; x_space[j].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[j].index <= inst_max_index) exit_input_error(i+1); else inst_max_index = x_space[j].index; errno = 0; x_space[j].value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(i+1); ++j; } if(inst_max_index > max_index) max_index = inst_max_index; if(prob.bias >= 0) x_space[j++].value = prob.bias; x_space[j++].index = -1; } if(prob.bias >= 0) { prob.n=max_index+1; for(i=1;i<prob.l;i++) (prob.x[i]-2)->index = prob.n; x_space[j-2].index = prob.n; } else prob.n=max_index; fclose(fp); }
double * predict(FILE *input, int &size) { int correct = 0; int total = 0; double error = 0; double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0; int svm_type=svm_get_svm_type(model); int nr_class=svm_get_nr_class(model); double *prob_estimates=NULL; size = 0; int tmp; while ( (tmp=fgetc(input)) != EOF) { if (tmp == '\n') size++; } rewind(input); double * res = (double *) malloc (size*sizeof(double)); target = (double *) malloc (size*sizeof(double)); max_line_len = 1024; line = (char *)malloc(max_line_len*sizeof(char)); int j = 0; while(readline(input) != NULL) { int i = 0; double target_label, predict_label; char *idx, *val, *label, *endptr; int inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0 label = strtok(line," \t\n"); if(label == NULL) // empty line exit_input_error(total+1); target_label = strtod(label,&endptr); if(endptr == label || *endptr != '\0') exit_input_error(total+1); while(1) { if(i>=max_nr_attr-1) // need one more for index = -1 { max_nr_attr *= 2; x = (struct svm_node *) realloc(x,max_nr_attr*sizeof(struct svm_node)); } idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; x[i].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index) exit_input_error(total+1); else inst_max_index = x[i].index; errno = 0; x[i].value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(total+1); ++i; } x[i].index = -1; predict_label = svm_predict(model,x); res[j] = predict_label; target[j] = target_label; j++; } return res; }
void do_predict(FILE *input, FILE *output) { int correct = 0; int total = 0; double error = 0; double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0; int nr_class=get_nr_class(model_); double *prob_estimates=NULL; int j, n; int nr_feature=get_nr_feature(model_); if(model_->bias>=0) n=nr_feature+1; else n=nr_feature; if(flag_predict_probability) { int *labels; if(!check_probability_model(model_)) { fprintf(stderr, "probability output is only supported for logistic regression\n"); exit(1); } labels=(int *) malloc(nr_class*sizeof(int)); get_labels(model_,labels); prob_estimates = (double *) malloc(nr_class*sizeof(double)); fprintf(output,"labels"); for(j=0;j<nr_class;j++) fprintf(output," %d",labels[j]); fprintf(output,"\n"); free(labels); } max_line_len = 1024; line = (char *)malloc(max_line_len*sizeof(char)); while(readline(input) != NULL) { int i = 0; double target_label, predict_label; char *idx, *val, *label, *endptr; int inst_max_index = 0; // strtol gives 0 if wrong format label = strtok(line," \t\n"); if(label == NULL) // empty line exit_input_error(total+1); target_label = strtod(label,&endptr); if(endptr == label || *endptr != '\0') exit_input_error(total+1); while(1) { if(i>=max_nr_attr-2) // need one more for index = -1 { max_nr_attr *= 2; x = (struct feature_node *) realloc(x,max_nr_attr*sizeof(struct feature_node)); } idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; x[i].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index) exit_input_error(total+1); else inst_max_index = x[i].index; errno = 0; x[i].value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(total+1); // feature indices larger than those in training are not used if(x[i].index <= nr_feature) ++i; } if(model_->bias>=0) { x[i].index = n; x[i].value = model_->bias; i++; } x[i].index = -1; if(model_->normal){ double length = 0; 
for(int kk = 0; x[kk].index != -1; kk++) length += x[kk].value * x[kk].value; length = sqrt(length); for(int kk = 0; x[kk].index != -1; kk++) x[kk].value /= length; } if(flag_predict_probability) { int j; predict_label = predict_probability(model_,x,prob_estimates); fprintf(output,"%g",predict_label); for(j=0;j<model_->nr_class;j++) fprintf(output," %g",prob_estimates[j]); fprintf(output,"\n"); } else { predict_label = predict(model_,x); fprintf(output,"%g\n",predict_label); } if(predict_label == target_label) ++correct; error += (predict_label-target_label)*(predict_label-target_label); sump += predict_label; sumt += target_label; sumpp += predict_label*predict_label; sumtt += target_label*target_label; sumpt += predict_label*target_label; ++total; } if(model_->param.solver_type==L2R_L2LOSS_SVR || model_->param.solver_type==L2R_L1LOSS_SVR_DUAL || model_->param.solver_type==L2R_L2LOSS_SVR_DUAL) { info("Mean squared error = %g (regression)\n",error/total); info("Squared correlation coefficient = %g (regression)\n", ((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/ ((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt)) ); } else info("Accuracy = %g%% (%d/%d)\n",(double) correct/total*100,correct,total); if(flag_predict_probability) free(prob_estimates); }
void do_predict(FILE *input, FILE *output) { int correct = 0; int total = 0; double error = 0; double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0; int nr_class=get_nr_class(model_[0]); double *prob_estimates=NULL; int j, n; int nr_feature=get_nr_feature(model_[0]); if(model_[0]->bias>=0) n=nr_feature+1; else n=nr_feature; if(flag_predict_probability) { int *labels; if(!check_probability_model(model_[0])) { fprintf(stderr, "probability output is only supported for logistic regression\n"); exit(1); } labels=(int *) malloc(nr_class*sizeof(int)); get_labels(model_[0],labels); prob_estimates = (double *) malloc(nr_class*sizeof(double)); fprintf(output,"labels"); for(j=0;j<nr_class;j++) fprintf(output," %d",labels[j]); fprintf(output,"\n"); free(labels); } max_line_len = 1024; line = (char *)malloc(max_line_len*sizeof(char)); while(readline(input) != NULL) { int i = 0; double target_label, predict_label; char *idx, *val, *label, *endptr; int inst_max_index = 0; // strtol gives 0 if wrong format label = strtok(line," \t\n"); if(label == NULL) // empty line exit_input_error(total+1); // target_label = strtod(label,&endptr); switch (label[0]) { case 'A': target_label = 0; break; case 'B': target_label = 1; break; case 'C': target_label = 1; break; case 'D': target_label = 1; break; } // if(endptr == label || *endptr != '\0') // exit_input_error(total+1); for (int pid = 0; pid < sum_pro; pid++) { while(1) { if(i>=max_nr_attr-2) // need one more for index = -1 { max_nr_attr *= 2; x = (struct feature_node *) realloc(x,max_nr_attr*sizeof(struct feature_node)); } idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; x[i].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index) exit_input_error(total+1); else inst_max_index = x[i].index; errno = 0; x[i].value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) 
exit_input_error(total+1); // feature indices larger than those in training are not used if(x[i].index <= nr_feature) ++i; } if(model_[pid]->bias>=0) { x[i].index = n; x[i].value = model_[pid]->bias; i++; } x[i].index = -1; if(flag_predict_probability) { int j; predict_label = predict_probability(model_[pid],x,prob_estimates); fprintf(output,"%g",predict_label); for(j=0;j<model_[pid]->nr_class;j++) fprintf(output," %g",prob_estimates[j]); fprintf(output,"\n"); } else { p_label[pid] = predict(model_[pid],x); fprintf(output,"%g", p_label[pid]); // printf("pid%dhas done\n",pid ); } fprintf(output, "\n" ); } int count = 0; predict_label = 0; // for ( int l = 0; l < BLOCK ; l++) { // for (int m = 0;m < BLOCK * N; m++) { // // printf("%f\t", p_label[l * BLOCK + m]); // if ( p_label[l * BLOCK + m] == 1) { // // p_label[l] = 1; // // break; // p_label[l]++; // // count++;* 4 // } // } // if (p_label[l] < 4) { // count++; // } // // if ( p_label[l] == 1) { // // predict_label = 1; // // } // // else { // // predict_label = 0; // // } // // if (count >0) { // // predict_label = 1; // // } // // else { // // predict_label = 0; // // } // } // if (count > 0 ) { // predict_label = 0; // } // else { // predict_label = 1; // } // /printf("\n"); // fprintf(output,"%g\n",predict_label); // if(predict_label == target_label) // ++correct; // error += (predict_label-target_label)*(predict_label-target_label); // sump += predict_label; // sumt += target_label; // sumpp += predict_label*predict_label; // sumtt += target_label*target_label; // sumpt += predict_label*target_label; // ++total; } // if(check_regression_model(model_[0])) // { // info("Mean squared error = %g (regression)\n",error/total); // info("Squared correlation coefficient = %g (regression)\n", // ((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/ // ((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt)) // ); // } // else // info("Accuracy = %g%% (%d/%d)\n",(double) correct/total*100,correct,total); // 
if(flag_predict_probability) // free(prob_estimates); }