Beispiel #1
0
/*
 * Scan the first `l` lines of `input_file` and record each line's query id.
 *
 * Every line must begin with a numeric label followed by an "<idx>:<val>"
 * token.  When <idx> is "qid", <val> is parsed as a base-10 integer and stored
 * in query[i]; for any other <idx>, query[i] is left untouched.
 *
 * Uses the file-level `line` buffer filled by readline().  Malformed input is
 * reported through exit_input_error() with the 1-based line number
 * (presumably that helper does not return -- confirm against its definition).
 */
void statistic_queries(char *input_file, int *query, int l)
{
	char *endptr;
	char *idx, *val, *label;
	FILE *fp = fopen(input_file,"r");

	// Fail early instead of handing a NULL stream to readline().
	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",input_file);
		exit(1);
	}

	for(int i=0;i<l;i++)
	{
		// The file has fewer than l lines: report it rather than
		// re-parsing whatever the previous call left in `line`.
		if(readline(fp) == NULL)
			exit_input_error(i+1);

		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(i+1);

		// The label value itself is not needed; it is only validated.
		strtod(label,&endptr);
		if(endptr == label || *endptr != '\0')
			exit_input_error(i+1);

		idx = strtok(NULL,":");
		val = strtok(NULL," \t");
		if(val == NULL)
			exit_input_error(i+1);

		if(!strcmp(idx,"qid"))
		{
			errno = 0;
			query[i] = (int) strtol(val, &endptr, 10);
			if(endptr == val || errno !=0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(i+1);
		}
	}
	fclose(fp);
}
Beispiel #2
0
// Reads a comma-separated file in which every line has the form
// "<label>,<value>,<value>,...".  Features carry no explicit index in the
// file, so they are numbered 1, 2, 3, ... in the order they appear on the line.
//
// For each line the label is appended to y, the running node count (the
// offset of the line's first feature in x) is appended to x_ptr, and the
// number of features on the line is appended to x_interval.  max_feature
// receives the largest per-line feature count and l the number of lines.
// A line whose first field is empty is reported via exit_input_error().
problem Flexible_vector::Read_file_without_index(std::string filename)
{
    std::cout<<"IN Read_file_without_index"<<std::endl;

    problem result;
    result.l = 0;

    std::ifstream source(filename.c_str());
    std::string field, row;
    int total_nodes = 0;   // number of nodes stored in result.x so far
    int widest_row = 0;    // largest feature count seen on any line

    while (std::getline(source, row))
    {
        std::stringstream fields(row);

        // The first field is the label; an empty one means an empty line.
        std::getline(fields, field, ',');
        if (field.empty())
            exit_input_error(result.l + 1);
        result.y.push_back(stringToNum<double>(field));

        // Remember where this line's features begin inside result.x.
        result.x_ptr.push_back(total_nodes);

        // Every remaining field is a feature value.
        int column = 0;
        while (std::getline(fields, field, ','))
        {
            ++column;
            node entry;
            entry.index = column;
            entry.value = stringToNum<double>(field);
            result.x.push_back(entry);
            ++total_nodes;
        }

        result.x_interval.push_back(column);
        if (widest_row < column)
            widest_row = column;
        ++result.l;
    }

    result.max_feature = widest_row;
    return result;
}
Beispiel #3
0
/*
 * Run the file-level SVM `model` over every instance read from `input`.
 *
 * Each input line has the form "<label> <index>:<value> ..." with strictly
 * increasing indices.  One prediction is written to `output` per line; when
 * probability output is enabled for a C_SVC/NU_SVC model the line also
 * carries one estimate per class and a "labels ..." header is written first.
 * A summary is printed to stdout at the end: mean squared error and squared
 * correlation coefficient for NU_SVR/EPSILON_SVR, accuracy otherwise.
 *
 * Uses the file-level `predict_probability`, `x`, `max_nr_attr`, `line` and
 * `max_line_len`.  The buffer allocated here is stored in the global `line`
 * and is not released by this function.
 */
void predict(FILE *input, FILE *output)
{
	int correct = 0;
	int total = 0;
	double error = 0;
	double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0;

	int svm_type=svm_get_svm_type(model);
	int nr_class=svm_get_nr_class(model);
	double *prob_estimates=NULL;
	int j;

	if(predict_probability)
	{
		if (svm_type==NU_SVR || svm_type==EPSILON_SVR)
			printf("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma=%g\n",svm_get_svr_probability(model));
		else
		{
			// Header line listing the class labels in model order.
			int *labels=(int *) malloc(nr_class*sizeof(int));
			svm_get_labels(model,labels);
			prob_estimates = (double *) malloc(nr_class*sizeof(double));
			fprintf(output,"labels");
			for(j=0;j<nr_class;j++)
				fprintf(output," %d",labels[j]);
			fprintf(output,"\n");
			free(labels);
		}
	}

	max_line_len = 1024;
	line = (char *)malloc(max_line_len*sizeof(char));
	while(readline(input) != NULL)
	{
		int i = 0;
		double target_label, predict_label;
		char *idx, *val, *label, *endptr;
		int inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0

		// strtok returns NULL for a line holding only separators; passing
		// that to strtod would be undefined behaviour.
		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(total+1);

		target_label = strtod(label,&endptr);
		if(endptr == label)
			exit_input_error(total+1);

		while(1)
		{
			if(i>=max_nr_attr-1)	// need one more for index = -1
			{
				max_nr_attr *= 2;
				x = (struct svm_node *) realloc(x,max_nr_attr*sizeof(struct svm_node));
			}

			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;
			errno = 0;
			x[i].index = (int) strtol(idx,&endptr,10);
			// Indices must parse completely and be strictly increasing.
			if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index)
				exit_input_error(total+1);
			else
				inst_max_index = x[i].index;

			errno = 0;
			x[i].value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(total+1);

			++i;
		}
		x[i].index = -1;	// terminator node

		if (predict_probability && (svm_type==C_SVC || svm_type==NU_SVC))
		{
			predict_label = svm_predict_probability(model,x,prob_estimates);
			fprintf(output,"%g",predict_label);
			for(j=0;j<nr_class;j++)
				fprintf(output," %g",prob_estimates[j]);
			fprintf(output,"\n");
		}
		else
		{
			predict_label = svm_predict(model,x);
			fprintf(output,"%g\n",predict_label);
		}

		// Running sums for accuracy, MSE and the correlation coefficient.
		if(predict_label == target_label)
			++correct;
		error += (predict_label-target_label)*(predict_label-target_label);
		sump += predict_label;
		sumt += target_label;
		sumpp += predict_label*predict_label;
		sumtt += target_label*target_label;
		sumpt += predict_label*target_label;
		++total;
	}
	if (svm_type==NU_SVR || svm_type==EPSILON_SVR)
	{
		printf("Mean squared error = %g (regression)\n",error/total);
		printf("Squared correlation coefficient = %g (regression)\n",
		       ((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/
		       ((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt))
		       );
	}
	else
		printf("Accuracy = %g%% (%d/%d) (classification)\n",
		       (double)correct/total*100,correct,total);
	if(predict_probability)
		free(prob_estimates);
}
Beispiel #4
0
/*
 * Predict every instance in `input` with the file-level SVM `model`, write
 * the predicted label of each to `output`, and collect decision values and
 * true labels for the accuracy() and bac() evaluation helpers.
 *
 * Regression models (NU_SVR / EPSILON_SVR) are rejected with exit(1).
 * True labels are binarised as +1 when the target is > 0 and -1 otherwise.
 * The decision value is negated when the model's first label is <= 0.
 *
 * The buffer allocated for the global `line` is not released here.
 */
void binary_class_predict(FILE *input, FILE *output){
	int    total = 0;
	int    *labels;
	int    max_nr_attr = 64;
	struct svm_node *x = Malloc(struct svm_node, max_nr_attr);
	dvec_t dec_values;
	ivec_t true_labels;


	int svm_type=svm_get_svm_type(model);

	if (svm_type==NU_SVR || svm_type==EPSILON_SVR){
		fprintf(stderr, "wrong svm type.");
		exit(1);
	}

	labels = Malloc(int, svm_get_nr_class(model));
	svm_get_labels(model, labels);

	max_line_len = 1024;
	line = (char *)malloc(max_line_len*sizeof(char));
	while(readline(input) != NULL)
	{
		int i = 0;
		double target_label, predict_label;
		char *idx, *val, *label, *endptr;
		int inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0

		// strtok returns NULL for a line holding only separators; passing
		// that to strtod would be undefined behaviour.
		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(total+1);

		target_label = strtod(label,&endptr);
		if(endptr == label)
			exit_input_error(total+1);

		while(1)
		{
			if(i>=max_nr_attr - 2)	// need one more for index = -1
			{
				max_nr_attr *= 2;
				x = (struct svm_node *) realloc(x,max_nr_attr*sizeof(struct svm_node));
			}

			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;
			errno = 0;
			x[i].index = (int) strtol(idx,&endptr,10);
			// Indices must parse completely and be strictly increasing.
			if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index)
				exit_input_error(total+1);
			else
				inst_max_index = x[i].index;

			errno = 0;
			x[i].value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(total+1);

			++i;
		}
		x[i].index = -1;	// terminator node

		predict_label = svm_predict(model,x);
		fprintf(output,"%g\n",predict_label);


		double dec_value;
		svm_predict_values(model, x, &dec_value);
		true_labels.push_back((target_label > 0)? 1: -1);
		// Flip the sign when the model's first label is not the positive one.
		if(labels[0] <= 0) dec_value *= -1;
		dec_values.push_back(dec_value);
	}

	// validation_function(dec_values, true_labels);
	accuracy(dec_values, true_labels);
	bac(dec_values, true_labels);

	free(labels);
	free(x);
}
/*
 * Load a training set in LIBSVM text format ("<label> <index>:<value> ...")
 * from `filename` into the file-level `prob`.
 *
 * The file is read twice: the first pass sizes the storage, the second pass
 * parses labels and features.  With _DENSE_REP each instance is stored as a
 * dense `values` array (gaps filled with 0.0); otherwise instances are stored
 * as -1-terminated runs of svm_node inside the shared `x_space` buffer.
 *
 * If param.gamma is 0 it is set to 1/max_index.  For the precomputed kernel
 * types the first column of every instance is validated as a sample serial
 * number.  Any format error terminates the program (exit_input_error / exit).
 */
void read_problem(const char *filename)
{
	int elements, max_index, inst_max_index, i, j;
#ifdef _DENSE_REP
	double value;
#endif
	FILE *fp = fopen(filename,"r");
	char *endptr;
	char *idx, *val, *label;

	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",filename);
		exit(1);
	}

	prob.l = 0;
	elements = 0;

	max_line_len = 1024;
	line = Malloc(char,max_line_len);
#ifdef _DENSE_REP
	// Pass 1: the last "<index>:" on each line gives that line's width;
	// `elements` ends up as the widest line (largest index + 1).
	max_index = 1;
	while(readline(fp) != NULL)
	{
		char *p;
		p = strrchr(line, ':');
		if(p != NULL)
		{
			// Walk back to the separator in front of the last index.
			while(*p != ' ' && *p != '\t' && p > line)
				p--;
			if(p > line)
			 	max_index = (int) strtol(p,&endptr,10) + 1;
		}
		if(max_index > elements)
			elements = max_index;
		++prob.l;
	}

	rewind(fp);

	prob.y = Malloc(double,prob.l);
	prob.x = Malloc(struct svm_node,prob.l);

	// Pass 2: parse every instance into its own dense array.
	for(i=0;i<prob.l;i++)
	{
		int *d;
		int instance_elements = elements;
		// Triangular precomputed kernels: row i gets i+2 slots.
		if(param.kernel_type == TRI_PRECOMPUTED ||
		   param.kernel_type == TRI_PRECOMPUTED_RBF)
			instance_elements = (i+2);
#ifdef _FLOAT_REP
		(prob.x+i)->values = Malloc(float,instance_elements);
#else
		(prob.x+i)->values = Malloc(double,instance_elements);
#endif
		(prob.x+i)->dim = 0;

		inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0
		readline(fp);

		// Same empty-line guard as the sparse branch: strtok returns NULL
		// for a line holding only separators.
		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(i+1);

		prob.y[i] = strtod(label,&endptr);
		if(endptr == label)
			exit_input_error(i+1);

		while((prob.x+i)->dim<instance_elements)
		{
			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;

			errno = 0;
			j = (int) strtol(idx,&endptr,10);
			if(endptr == idx || errno != 0 || *endptr != '\0' || j <= inst_max_index)
				exit_input_error(i+1);
			else
				inst_max_index = j;

			// values[] holds instance_elements slots; an index at or past
			// that would make the zero-fill below write outside the array.
			if(j >= instance_elements)
				exit_input_error(i+1);

			errno = 0;
			value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(i+1);

			// Zero-fill any skipped indices, then store the value at j.
			d = &((prob.x+i)->dim);
			while (*d < j)
				(prob.x+i)->values[(*d)++] = 0.0;
#ifdef _FLOAT_REP
			(prob.x+i)->values[(*d)++] = (float) value;
#else
			(prob.x+i)->values[(*d)++] = value;
#endif
		}
	}
	max_index = elements-1;

#else
	// Pass 1: count instances and nodes (features plus one terminator each).
	while(readline(fp)!=NULL)
	{
		char *p = strtok(line," \t"); // label

		// features
		while(1)
		{
			p = strtok(NULL," \t");
			if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature
				break;
			++elements;
		}
		++elements;
		++prob.l;
	}
	rewind(fp);

	prob.y = Malloc(double,prob.l);
	prob.x = Malloc(struct svm_node *,prob.l);
	x_space = Malloc(struct svm_node,elements);

	// Pass 2: parse every instance into x_space.
	max_index = 0;
	j=0;
	for(i=0;i<prob.l;i++)
	{
		inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0
		readline(fp);
		prob.x[i] = &x_space[j];
		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(i+1);

		prob.y[i] = strtod(label,&endptr);
		if(endptr == label || *endptr != '\0')
			exit_input_error(i+1);

		while(1)
		{
			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;

			errno = 0;
			x_space[j].index = (int) strtol(idx,&endptr,10);
			if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[j].index <= inst_max_index)
				exit_input_error(i+1);
			else
				inst_max_index = x_space[j].index;

			errno = 0;
#ifdef _FLOAT_REP
			x_space[j].value = strtof(val,&endptr);
#else
			x_space[j].value = strtod(val,&endptr);
#endif
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(i+1);

			++j;
		}

		if(inst_max_index > max_index)
			max_index = inst_max_index;
		x_space[j++].index = -1;	// terminator node
	}
#endif

	if(param.gamma == 0 && max_index > 0)
		param.gamma = 1.0/max_index;

	// Precomputed kernels: column 0 must carry the sample serial number.
	if(param.kernel_type == PRECOMPUTED ||
	   param.kernel_type == TRI_PRECOMPUTED ||
	   param.kernel_type == PRECOMPUTED_RBF ||
	   param.kernel_type == TRI_PRECOMPUTED_RBF)
		for(i=0;i<prob.l;i++)
		{
#ifdef _DENSE_REP
			if ((prob.x+i)->dim == 0 || (prob.x+i)->values[0] == 0.0)
			{
				fprintf(stderr,"Wrong input format: first column must be 0:sample_serial_number\n");
				exit(1);
			}
			if ((int)(prob.x+i)->values[0] < 0 || (int)(prob.x+i)->values[0] > max_index)
			{
				fprintf(stderr,"Wrong input format: sample_serial_number out of range\n");
				exit(1);
			}
#else
			if (prob.x[i][0].index != 0)
			{
				fprintf(stderr,"Wrong input format: first column must be 0:sample_serial_number\n");
				exit(1);
			}
			if ((int)prob.x[i][0].value <= 0 || (int)prob.x[i][0].value > max_index)
			{
				fprintf(stderr,"Wrong input format: sample_serial_number out of range\n");
				exit(1);
			}
#endif
		}
	fclose(fp);
}
Beispiel #6
0
/*
 * Predict every instance read from `input` with the linear model `model_`
 * and write one integer label per line to `output`.
 *
 * Input lines have the form "<label> <index>:<value> ..." with strictly
 * increasing indices starting at 1.  Features whose index exceeds the
 * model's feature count are dropped; when the model has a bias (>= 0) a
 * bias node is appended to every instance.  With flag_predict_probability
 * set, a "labels ..." header is written first and each output line also
 * carries one probability estimate per class.
 *
 * Prints the accuracy to stdout.  Uses the file-level `x`, `max_nr_attr`,
 * `line` and `max_line_len`; the buffer stored in `line` is not released here.
 */
void do_predict(FILE *input, FILE *output, struct model* model_)
{
	int correct = 0;
	int total = 0;

	int nr_class=get_nr_class(model_);
	double *prob_estimates=NULL;
	int j, n;
	int nr_feature=get_nr_feature(model_);
	// n is the index given to the bias node (one past the last feature).
	if(model_->bias>=0)
		n=nr_feature+1;
	else
		n=nr_feature;

	if(flag_predict_probability)
	{
		int *labels;

		if(!check_probability_model(model_))
		{
			fprintf(stderr, "probability output is only supported for logistic regression\n");
			exit(1);
		}

		labels=(int *) malloc(nr_class*sizeof(int));
		get_labels(model_,labels);
		prob_estimates = (double *) malloc(nr_class*sizeof(double));
		fprintf(output,"labels");
		for(j=0;j<nr_class;j++)
			fprintf(output," %d",labels[j]);
		fprintf(output,"\n");
		free(labels);
	}

	max_line_len = 1024;
	line = (char *)malloc(max_line_len*sizeof(char));
	while(readline(input) != NULL)
	{
		int i = 0;
		int target_label, predict_label;
		char *idx, *val, *label, *endptr;
		int inst_max_index = 0; // strtol gives 0 if wrong format

		// strtok returns NULL for a line holding only separators; passing
		// that to strtol would be undefined behaviour.
		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(total+1);

		target_label = (int) strtol(label,&endptr,10);
		if(endptr == label)
			exit_input_error(total+1);

		while(1)
		{
			if(i>=max_nr_attr-2)	// need one more for index = -1
			{
				max_nr_attr *= 2;
				x = (struct feature_node *) realloc(x,max_nr_attr*sizeof(struct feature_node));
			}

			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;
			errno = 0;
			x[i].index = (int) strtol(idx,&endptr,10);
			// Indices must parse completely and be strictly increasing.
			if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index)
				exit_input_error(total+1);
			else
				inst_max_index = x[i].index;

			errno = 0;
			x[i].value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(total+1);

			// feature indices larger than those in training are not used
			if(x[i].index <= nr_feature)
				++i;
		}

		if(model_->bias>=0)
		{
			x[i].index = n;
			x[i].value = model_->bias;
			i++;
		}
		x[i].index = -1;	// terminator node

		if(flag_predict_probability)
		{
			predict_label = predict_probability(model_,x,prob_estimates);
			fprintf(output,"%d",predict_label);
			for(j=0;j<model_->nr_class;j++)
				fprintf(output," %g",prob_estimates[j]);
			fprintf(output,"\n");
		}
		else
		{
			predict_label = predict(model_,x);
			fprintf(output,"%d\n",predict_label);
		}

		if(predict_label == target_label)
			++correct;
		++total;
	}
	printf("Accuracy = %g%% (%d/%d)\n",(double) correct/total*100,correct,total);
	if(flag_predict_probability)
		free(prob_estimates);
}
/*
 * Run the file-level SVM `model` over every instance read from `input`,
 * write one prediction per line to `output`, and print summary statistics
 * to stdout.
 *
 * Besides the overall accuracy (or MSE and squared correlation coefficient
 * for NU_SVR/EPSILON_SVR), this variant tracks per-class counts and prints
 * correct/total for every class plus the mean of the per-class accuracies,
 * averaged over the classes that have at least one test example.
 *
 * Uses the file-level `predict_probability`, `x`, `max_nr_attr`, `line` and
 * `max_line_len`; the buffer stored in `line` is not released here.
 */
void predict(FILE *input, FILE *output)
{
	int correct = 0;
	int total = 0;
	double error = 0;
	double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0;

	int svm_type=svm_get_svm_type(model);
	int nr_class=svm_get_nr_class(model);
	double *prob_estimates=NULL;
	int j;

    // This block by Jianxin Wu, for average accuracy computation
    int ii,label_index;
    // number of correct predictions in each category
	int* correct_sub = (int *)malloc(nr_class*sizeof(int));
	for(ii=0;ii<nr_class;ii++) correct_sub[ii] = 0;
    // number of testing examples in each category
	int* total_sub = (int *)malloc(nr_class*sizeof(int));
	for(ii=0;ii<nr_class;ii++) total_sub[ii] = 0;
    // class labels in model order, used to map a target label to its slot
	int* labels_avg = (int*)malloc(nr_class*sizeof(int));
	svm_get_labels(model,labels_avg);

	if(predict_probability)
	{
		if (svm_type==NU_SVR || svm_type==EPSILON_SVR)
			printf("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma=%g\n",svm_get_svr_probability(model));
		else
		{
			int *labels=(int *) malloc(nr_class*sizeof(int));
			svm_get_labels(model,labels);
			prob_estimates = (double *) malloc(nr_class*sizeof(double));
			fprintf(output,"labels");
			for(j=0;j<nr_class;j++)
				fprintf(output," %d",labels[j]);
			fprintf(output,"\n");
			free(labels);
		}
	}

	max_line_len = 1024;
	line = (char *)malloc(max_line_len*sizeof(char));
	while(readline(input) != NULL)
	{
		int i = 0;
		double target_label, predict_label;
		char *idx, *val, *label, *endptr;
		int inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0

		// strtok returns NULL for a line holding only separators; passing
		// that to strtod would be undefined behaviour.
		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(total+1);

		target_label = strtod(label,&endptr);
		if(endptr == label)
			exit_input_error(total+1);

		while(1)
		{
			if(i>=max_nr_attr-1)	// need one more for index = -1
			{
				max_nr_attr *= 2;
				x = (struct svm_node *) realloc(x,max_nr_attr*sizeof(struct svm_node));
			}

			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;
			errno = 0;
			x[i].index = (int) strtol(idx,&endptr,10);
			// Indices must parse completely and be strictly increasing.
			if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index)
				exit_input_error(total+1);
			else
				inst_max_index = x[i].index;

			errno = 0;
			x[i].value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(total+1);

			++i;
		}
		x[i].index = -1;	// terminator node

		if (predict_probability && (svm_type==C_SVC || svm_type==NU_SVC))
		{
			predict_label = svm_predict_probability(model,x,prob_estimates);
			fprintf(output,"%g",predict_label);
			for(j=0;j<nr_class;j++)
				fprintf(output," %g",prob_estimates[j]);
			fprintf(output,"\n");
		}
		else
		{
			predict_label = svm_predict(model,x);
			fprintf(output,"%g\n",predict_label);
		}

        // This block by Jianxin Wu, for average accuracy
        // NOTE(review): FindLabel's result for a label the model does not
        // know is not visible here -- confirm it is a valid index.
        label_index = FindLabel((int)target_label,labels_avg);
		total_sub[label_index]++;
		if(predict_label == target_label) correct_sub[label_index]++;

		if(predict_label == target_label)
			++correct;
		error += (predict_label-target_label)*(predict_label-target_label);
		sump += predict_label;
		sumt += target_label;
		sumpp += predict_label*predict_label;
		sumtt += target_label*target_label;
		sumpt += predict_label*target_label;
		++total;
	}
	if (svm_type==NU_SVR || svm_type==EPSILON_SVR)
	{
		printf("Mean squared error = %g (regression)\n",error/total);
		printf("Squared correlation coefficient = %g (regression)\n",
		       ((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/
		       ((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt))
		       );
	}
	else
		printf("Accuracy = %g%% (%d/%d) (classification)\n",
		       (double)correct/total*100,correct,total);
	if(predict_probability)
		free(prob_estimates);

    // This block (till end of function) by Jianxin Wu
    // Print per-category accuracy and average accuracy of categories
    double sub_score = 0;
    int nonempty_category = 0;
	for(ii=0;ii<nr_class;ii++)
	{
		// Classes without test examples do not take part in the average.
		if(total_sub[ii]>0)
        {
            sub_score += (correct_sub[ii]*1.0/total_sub[ii]);
            nonempty_category++;
        }
	}
    printf("-----------\n");
    for(ii=0;ii<nr_class;ii++)
    {
        printf("%d / %d (Category %d)\n",correct_sub[ii],total_sub[ii],labels_avg[ii]);
    }
    printf("-----------\n");
	printf("Mean Accuray across classes = %g%%\n",sub_score*100.0/nonempty_category);
	free(correct_sub);
	free(total_sub);
	free(labels_avg);

}
Beispiel #8
0
/*
 * Load a training set in LIBSVM text format ("<label> <index>:<value> ...")
 * from `filename` into the file-level `prob`, including per-instance weights.
 *
 * The file is read twice: the first pass counts instances and nodes, the
 * second parses them into the shared `x_space` buffer as -1-terminated runs.
 * Every weight prob.W[i] starts at 1; when the file-level `weight_file` is
 * set, weights are then read from it in order, one number per instance.
 * Instances for which no number can be read keep the default weight of 1.
 *
 * If param.gamma is 0 it is set to 1/max_index.  For the PRECOMPUTED kernel
 * the first column of every instance is validated as a sample serial number.
 * Any format error or unreadable file terminates the program.
 */
void read_problem(const char *filename)
{
	int elements, max_index, inst_max_index, i, j;
	FILE *fp = fopen(filename,"r");
	char *endptr;
	char *idx, *val, *label;

	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",filename);
		exit(1);
	}

	prob.l = 0;
	elements = 0;

	max_line_len = 1024;
	line = Malloc(char,max_line_len);

	// Pass 1: count instances and nodes (features plus one terminator each).
	while(readline(fp)!=NULL)
	{
		char *p = strtok(line," \t"); // label

		// features
		while(1)
		{
			p = strtok(NULL," \t");
			if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature
				break;
			++elements;
		}
		++elements;
		++prob.l;
	}
	rewind(fp);

	prob.y = Malloc(double,prob.l);
	prob.x = Malloc(struct svm_node *,prob.l);
	prob.W = Malloc(double,prob.l);
	x_space = Malloc(struct svm_node,elements);

	// Pass 2: parse every instance into x_space.
	max_index = 0;
	j=0;
	for(i=0;i<prob.l;i++)
	{
		inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0
		readline(fp);
		prob.x[i] = &x_space[j];
		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(i+1);

		prob.y[i] = strtod(label,&endptr);
		if(endptr == label || *endptr != '\0')
			exit_input_error(i+1);
		prob.W[i] = 1;	// default weight

		while(1)
		{
			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;

			errno = 0;
			x_space[j].index = (int) strtol(idx,&endptr,10);
			// Indices must parse completely and be strictly increasing.
			if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[j].index <= inst_max_index)
				exit_input_error(i+1);
			else
				inst_max_index = x_space[j].index;

			errno = 0;
			x_space[j].value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(i+1);

			++j;
		}

		if(inst_max_index > max_index)
			max_index = inst_max_index;
		x_space[j++].index = -1;	// terminator node
	}

	if(param.gamma == 0 && max_index > 0)
		param.gamma = 1.0/max_index;

	// Precomputed kernel: column 0 must carry the sample serial number.
	if(param.kernel_type == PRECOMPUTED)
		for(i=0;i<prob.l;i++)
		{
			if (prob.x[i][0].index != 0)
			{
				fprintf(stderr,"Wrong input format: first column must be 0:sample_serial_number\n");
				exit(1);
			}
			if ((int)prob.x[i][0].value <= 0 || (int)prob.x[i][0].value > max_index)
			{
				fprintf(stderr,"Wrong input format: sample_serial_number out of range\n");
				exit(1);
			}
		}

	fclose(fp);

	if(weight_file)
	{
		fp = fopen(weight_file,"r");
		// Without this check a missing weight file would hand a NULL
		// stream to fscanf/fclose.
		if(fp == NULL)
		{
			fprintf(stderr,"can't open weight file %s\n",weight_file);
			exit(1);
		}
		for(i=0;i<prob.l;i++)
		{
			// Once a read fails (end of file or a non-numeric token) every
			// later read would fail the same way, so the remaining weights
			// simply keep their default of 1.
			if(fscanf(fp,"%lf",&prob.W[i]) != 1)
				break;
		}
		fclose(fp);
	}
}
Beispiel #9
0
// Read a click-log problem in a LIBSVM-like format into the file-level `prob`.
// Each line is "<clicks> <shows> qid:<id> <index>:<value> ...".
//
// A line is expanded into <shows> instances that share the same features:
// (shows - clicks) of them are labelled 0 and <clicks> are labelled 1.
// Lines with shows <= 0 or clicks > shows are skipped.  When the file-level
// `bias` is >= 0 every instance gets an extra bias node whose index is one
// past the largest feature index seen in the file.
//
// The file is read twice: the first pass sizes the storage, the second pass
// parses the features.  Format errors go through exit_input_error().
void read_problem(const char *filename)
{
	int max_index, inst_max_index, i;
	long int elements, j;
	FILE *fp = fopen(filename,"r");
	char *endptr;
	char *idx, *val;

	int clicks = 0;
	int shows = 0;
	int lines = 0;

	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",filename);
		exit(1);
	}

	prob.l = 0;
	elements = 0;
	max_line_len = 1024;
	line = Malloc(char,max_line_len);

	// Pass 1: count lines, instances and feature nodes.
	while(readline(fp)!=NULL)
	{
		char *p = strtok(line," \t\n"); // clicks
		if(p == NULL) // empty line
			exit_input_error(lines+1);
		clicks = atoi(p);
		p = strtok(NULL," \t"); // shows
		if(p == NULL) // shows column missing; atoi(NULL) would be undefined
			exit_input_error(lines+1);
		shows = atoi(p);
		p = strtok(NULL," \t"); // qid

		if (shows <=0 || clicks > shows) {
			lines++;
			continue;
		}

		// features
		while(1)
		{
			p = strtok(NULL," \t");
			if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature
				break;
			elements += shows;
		}
		elements += shows; // one terminator node per instance
		prob.l += shows;
		lines++;
	}
	rewind(fp);

	prob.bias=bias;

	prob.y = Malloc(double,prob.l);
	prob.x = Malloc(struct feature_node *,prob.l);
	// The extra prob.l nodes leave room for one bias node per instance.
	x_space = Malloc(struct feature_node,elements+prob.l);

	max_index = 0;
	j = 0;

	long int start = 0;       // first node of the most recently stored instance
	long int end = 0;         // one past the last node stored so far
	long int instances = 0;   // instances stored so far
	long int feature_len = 0; // nodes per instance of the current line
	int non_clicks = 0;
	int feature_node_size = sizeof(struct feature_node);

	// Pass 2: parse each kept line once, then replicate it shows-1 times.
	for(i=0;i<lines;i++)
	{
		inst_max_index = 0; // strtol gives 0 if wrong format
		readline(fp);

		char *p = strtok(line," \t\n");
		if(p == NULL) // empty line
			exit_input_error(i+1);

		clicks = atoi(p);
		p = strtok(NULL," \t"); // shows
		if(p == NULL) // shows column missing
			exit_input_error(i+1);
		shows = atoi(p);
		if (shows <=0 || clicks > shows) {
			continue;
		}
		p = strtok(NULL," \t"); // qid

		non_clicks = shows - clicks;

		start = end;
		j=end;
		prob.x[instances] = &x_space[j];
		// The first instance is a non-click only when the line has at least
		// one non-click; with clicks == shows every instance is a click.
		prob.y[instances] = (non_clicks > 0) ? 0 : 1;

		while(1)
		{
			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;

			errno = 0;
			x_space[j].index = (int) strtol(idx,&endptr,10);
			// Indices must parse completely and be strictly increasing.
			if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[j].index <= inst_max_index)
				exit_input_error(i+1);
			else
				inst_max_index = x_space[j].index;

			errno = 0;
			x_space[j].value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(i+1);

			++j;
		}

		if(inst_max_index > max_index)
			max_index = inst_max_index;

		// The bias node's index is filled in after the loop, once the
		// largest feature index of the whole file is known.
		if(prob.bias >= 0)
			x_space[j++].value = prob.bias;

		x_space[j++].index = -1;	// terminator node

		end = j;

		feature_len = end - start;
		instances++;
		// Remaining shows-1 copies: non-clicks first, then clicks.
		for(int k=1; k<shows; k++) {
			if (k < non_clicks) {
				prob.y[instances] = 0;
			} else {
				prob.y[instances] = 1;
			}
			prob.x[instances] = &x_space[end];
			memcpy(&x_space[end], &x_space[start], feature_len * feature_node_size);
			start = end;
			end += feature_len;
			instances++;
		}
	}

	if(prob.bias >= 0)
	{
		prob.n=max_index+1;
		// Instances are stored back to back as [features..., bias, -1], so
		// the bias node of instance k-1 sits two nodes before instance k.
		// Iterate over instances (not input lines): every replicated copy
		// needs its bias index set, and prob.x only has `instances` entries.
		for(long int k=1;k<instances;k++)
			(prob.x[k]-2)->index = prob.n;
		if(instances > 0)
			x_space[end-2].index = prob.n;
	}
	else
		prob.n=max_index;

	fclose(fp);
}
Beispiel #10
0
/*
 * Predict a score for every instance in `input` with the file-level linear
 * model `model_`, write one score per line to `output` ("%.10f"), and report
 * ranking metrics computed by eval_list().
 *
 * Input lines have the form "<label> [qid:<id>] <index>:<value> ...".  The
 * file is read twice (so `input` must support rewind): once to count lines,
 * once to parse them.  A line without a qid token gets query id 0.  Features
 * whose index exceeds the model's feature count are dropped.
 *
 * Uses the file-level `x`, `max_nr_attr`, `line` and `max_line_len`; the
 * buffer stored in `line` is not released here.
 */
void do_predict(FILE *input, FILE *output)
{
	int total=0;
	int nr_feature=get_nr_feature(model_);
	double *dvec_t;	// predicted scores, one per line
	double *ivec_t;	// target labels, one per line
	int *query;	// query ids, one per line

	max_line_len = 1024;
	line = (char *)malloc(max_line_len*sizeof(char));

	// Pass 1: count the lines so the result arrays can be sized.
	while(readline(input) != NULL)
		total++;
	rewind(input);
	dvec_t = new double[total];
	ivec_t = new double[total];
	query = new int[total];
	total = 0;

	// Pass 2: parse and predict.
	while(readline(input) != NULL)
	{
		int i = 0;
		double target_label, predict_label;
		char *idx, *val, *label, *endptr;
		int inst_max_index = 0; // strtol gives 0 if wrong format

		query[total] = 0;
		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(total+1);

		target_label = strtod(label,&endptr);
		if(endptr == label || *endptr != '\0')
			exit_input_error(total+1);
		ivec_t[total] = target_label;

		while(1)
		{
			if(i>=max_nr_attr-2)	// need one more for index = -1
			{
				max_nr_attr *= 2;
				x = (struct feature_node *) realloc(x,max_nr_attr*sizeof(struct feature_node));
			}

			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;

			if (strcmp(idx,"qid") == 0)
			{
				errno = 0;
				query[total] = (int) strtol(val,&endptr,10);
				// Report the input line number, not the feature position.
				if(endptr == val || errno != 0 || *endptr != '\0')
					exit_input_error(total+1);
				continue;
			}
			errno = 0;
			x[i].index = (int) strtol(idx,&endptr,10);
			// Indices must parse completely and be strictly increasing.
			if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index)
				exit_input_error(total+1);
			else
				inst_max_index = x[i].index;

			errno = 0;
			x[i].value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(total+1);

			// feature indices larger than those in training are not used
			if(x[i].index <= nr_feature)
				++i;
		}
		x[i].index = -1;	// terminator node

		predict_label = predict(model_,x);
		fprintf(output,"%.10f\n",predict_label);
		dvec_t[total++] = predict_label;
	}
	double result[3];
	eval_list(ivec_t,dvec_t,query,total,result);
	info("Pairwise Accuracy = %g%%\n",result[0]*100);
	info("MeanNDCG (LETOR) = %g\n",result[1]);
	info("NDCG (YAHOO) = %g\n",result[2]);

	// The arrays are only needed for the evaluation above.
	// NOTE(review): assumes eval_list does not keep these pointers -- confirm.
	delete[] dvec_t;
	delete[] ivec_t;
	delete[] query;
}
Beispiel #11
0
/*
 * Distribute the first `l` lines of `input_file` over `machines` output
 * files named "temp_dir/train.txt.<k>".
 *
 * For each line the query id is read from a leading "qid:<id>" token (stored
 * in query[j]) and looked up in the `nr_query` entries of `q_machine`; the
 * unmodified line is appended to the file of the matching machine.  If no
 * entry matches, the machine chosen for the previous line is reused (machine
 * 0 for the first line).  If the token after the label is not "qid",
 * query[j] keeps whatever value the caller put there.
 *
 * Uses the file-level `line` buffer filled by readline().  Unreadable or
 * malformed input terminates the program.
 */
void split(char *input_file, int l, int machines, int nr_query, struct Query_Machine *q_machine, int *query)
{
	int machine_id = 0;
	FILE *fp = fopen(input_file,"r");
	char *idx, *val, *endptr;
	char *label;

	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",input_file);
		exit(1);
	}

	char **out_file = (char**)malloc(sizeof(char*)*machines);
	for(int i=0;i<machines;i++)
		out_file[i] = (char*)malloc(sizeof(char)*1024);
    FILE **f = (FILE**)malloc(sizeof(FILE*)*machines);

    if(mkdir("temp_dir",0777)==0)
    {
        printf("Directory was successfully created!\n");
    }
    else
    {
        printf("Directory has existed!!\n");
    }

    for(int i=0;i<machines;i++)
	{
		sprintf(out_file[i],"temp_dir/train.txt.%d",i);
		f[i] = fopen(out_file[i],"w");
		// A NULL stream would crash the fprintf/fclose calls below.
		if(f[i] == NULL)
		{
			fprintf(stderr,"can't open output file %s\n",out_file[i]);
			exit(1);
		}
	}

	// strtok() modifies `line`, so an untouched copy is kept for output.
	size_t copy_cap = 2048;
	char *copy_line = (char*)malloc(sizeof(char)*copy_cap);
	for(int j=0;j<l;j++)
	{
		// Fewer than l lines: report it rather than re-use a stale buffer.
		if(readline(fp) == NULL)
			exit_input_error(j+1);

		// The copy needs room for the text plus its terminating NUL.
		size_t len = strlen(line);
		if(len + 1 > copy_cap)
		{
			copy_cap = len + 1;
			copy_line = (char*)realloc(copy_line,copy_cap*sizeof(char));
		}
		memcpy(copy_line, line, len + 1);

		label = strtok(line, " \t\n");
		if(label == NULL)
			exit_input_error(j+1);
		// The label value itself is not needed; it is only validated.
		strtod(label, &endptr);
		if(endptr == label || *endptr != '\0')
			exit_input_error(j+1);
		idx = strtok(NULL,":");
		val = strtok(NULL, " \t");
		if(val == NULL)
			exit_input_error(j+1);
		if(!strcmp(idx,"qid"))
		{
			errno = 0;
			query[j] = (int)strtol(val, &endptr, 10);
			if(endptr == val || errno !=0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(j+1);
		}

		// Find the machine that owns this query.
		for(int i=0;i<nr_query;i++)
		{
			if(query[j] == q_machine[i].query)
			{
				machine_id = q_machine[i].machine_id;
				break;
			}
		}
		fprintf(f[machine_id],"%s",copy_line);

	}
	free(copy_line);
	for(int i=0;i<machines;i++)
		free(out_file[i]);
	free(out_file);
	for(int i=0;i<machines;i++)
		fclose(f[i]);
	free(f);
	fclose(fp);
}
Beispiel #12
0
/*
 * Predict a click-through rate for every usable line of `input` with the
 * file-level linear model `model_`.
 *
 * Input lines have the form "<clicks> <shows> qid:<id> <index>:<value> ...".
 * Lines with shows <= 0 or clicks > shows are skipped.  For every other line
 * "<clicks> <shows>  <estimate>" is written to `output`, where <estimate> is
 * the first entry filled in by predict_probability().  Features whose index
 * exceeds the model's feature count are dropped; when the model has a bias
 * (>= 0) a bias node is appended.
 *
 * Exits if the model does not support probability output.  The number of
 * predictions written is reported through info().  Uses the file-level `x`,
 * `max_nr_attr`, `line` and `max_line_len`; `line` is not released here.
 */
void do_predict(FILE *input, FILE *output)
{
    int total = 0;      // predictions written
    int line_no = 0;    // input lines read, used for error reporting

    int nr_class=get_nr_class(model_);
    double *prob_estimates=NULL;
    int n;
    int nr_feature=get_nr_feature(model_);
    // n is the index given to the bias node (one past the last feature).
    if(model_->bias>=0)
        n=nr_feature+1;
    else
        n=nr_feature;

    if(!check_probability_model(model_))
    {
        fprintf(stderr, "probability output is only supported for logistic regression\n");
        exit(1);
    }

    prob_estimates = (double *) malloc(nr_class*sizeof(double));

    max_line_len = 1024;
    line = (char *)malloc(max_line_len*sizeof(char));
    int clicks = 0;
    int shows = 0;
    while(readline(input) != NULL)
    {
        int i = 0;
        double predict_ctr;
        char *idx, *val, *endptr;

        int inst_max_index = 0; // strtol gives 0 if wrong format

        ++line_no;

        char *p = strtok(line," \t\n"); //clicks
        if(p == NULL) // empty line
            exit_input_error(line_no);

        clicks = atoi(p);
        p = strtok(NULL," \t"); // shows
        if(p == NULL) // shows column missing; atoi(NULL) would be undefined
            exit_input_error(line_no);
        shows = atoi(p);
        p = strtok(NULL," \t"); // qid:1

        if (shows <=0 || clicks > shows) {
            continue;
        }

        while(1)
        {
            if(i>=max_nr_attr-2)	// need one more for index = -1
            {
                max_nr_attr *= 2;
                x = (struct feature_node *) realloc(x,max_nr_attr*sizeof(struct feature_node));
            }

            idx = strtok(NULL,":");
            val = strtok(NULL," \t");

            if(val == NULL)
                break;
            errno = 0;
            x[i].index = (int) strtol(idx,&endptr,10);
            // Indices must parse completely and be strictly increasing.
            if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index)
                exit_input_error(line_no);
            else
                inst_max_index = x[i].index;

            errno = 0;
            x[i].value = strtod(val,&endptr);
            if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
                exit_input_error(line_no);

            // feature indices larger than those in training are not used
            if(x[i].index <= nr_feature)
                ++i;
        }

        if(model_->bias>=0)
        {
            x[i].index = n;
            x[i].value = model_->bias;
            i++;
        }
        x[i].index = -1;    // terminator node

        predict_probability(model_,x,prob_estimates);
        fprintf(output,"%d %d ", clicks, shows);
        // NOTE(review): assumes entry 0 is the estimate for the click class
        // -- confirm against the model's label order.
        predict_ctr = prob_estimates[0];
        fprintf(output," %g\n", predict_ctr);
        ++total;
    }

    info("total:%d\n",total);

    free(prob_estimates);
}
Beispiel #13
0
// Read a LIBSVM-format file ("<label> <index>:<value> ...") and split it into
// two problems in a single call.
//
// A line whose first character is '-' is stored in prob[1] / x_space[1]; every
// other line is stored in prob[0] / x_space[0].
//
// The file is scanned twice: the first pass counts instances and nodes per set
// so the arrays can be allocated in one piece, the second pass parses labels
// and features. Within one line feature indices must be strictly increasing;
// any malformed line is handed to exit_input_error() with its 1-based number.
//
// Writes the file-level variables prob, x_space, line, max_line_len and
// max_index. max_index is only ever raised here, never reset.
void read_problem(const char *filename)
{
	int elements[2], inst_max_index, i[2], j[2];
	FILE *fp = fopen(filename,"r");
	char *endptr;
	char *idx, *val, *label;

	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",filename);
		exit(1);
	}

	prob[0].l = 0;
	prob[1].l = 0;
	elements[0] = 0;
	elements[1] = 0;

	int current_set;

	// Pass 1: count instances and nodes for each of the two sets.
	max_line_len = 1024;
	line = Malloc(char,max_line_len);
	while(readline(fp)!=NULL)
	{
		if(line[0] == '-')
			current_set = 1;
		else
			current_set = 0;

		char *p = strtok(line," \t"); // label

		// features
		while(1)
		{
			p = strtok(NULL," \t");
			if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature
				break;
			++elements[current_set];
		}
		++elements[current_set]; // slot for the index = -1 terminator
		++prob[current_set].l;
	}
	rewind(fp);

	prob[0].y = Malloc(double,prob[0].l);
	prob[0].x = Malloc(struct svm_node *,prob[0].l);
	x_space[0] = 	Malloc(struct svm_node,elements[0]);

	prob[1].y = Malloc(double,prob[1].l);
	prob[1].x = Malloc(struct svm_node *,prob[1].l);
	x_space[1] = 	Malloc(struct svm_node,elements[1]);

//	max_index = 0;
	j[0]=0;
	j[1]=0;

	i[0] = 0;
	i[1] = 0;

	// Pass 2: parse every line into the set selected by its first character.
	while( i[0] < prob[0].l || i[1] < prob[1].l )
	{
		inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0
		readline(fp);
		if(line[0] == '-')
			current_set = 1;
		else
			current_set = 0;

		prob[current_set].x[i[current_set]] = &x_space[current_set][j[current_set]];

		// '\n' is a delimiter here so that a label-only line yields a clean
		// token, which in turn allows rejecting trailing garbage below.
		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(i[0]+i[1]+1);

		prob[current_set].y[i[current_set]] = strtod(label,&endptr);
		if(endptr == label || *endptr != '\0')
			exit_input_error(i[0]+i[1]+1);

		while(1)
		{
			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;

			errno = 0;
			x_space[current_set][j[current_set]].index = (int) strtol(idx,&endptr,10);
			if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[current_set][j[current_set]].index <= inst_max_index)
				exit_input_error(i[0]+i[1]+1);
			else
				inst_max_index = x_space[current_set][j[current_set]].index;

			errno = 0;
			x_space[current_set][j[current_set]].value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(i[0]+i[1]+1);

			++j[current_set];
		}

		if(inst_max_index > max_index)
			max_index = inst_max_index;
		x_space[current_set][j[current_set]++].index = -1; // terminator node
		i[current_set]++;
	}

	fclose(fp);
}
Beispiel #14
0
/**
 * This code is largely borrowed from LIBLINEAR.
 * TODO: This can be made faster with SIMD.
 *
 * Loads the first n_examples lines of a LIBSVM-format text file
 * ("<label> <index>:<value> ...") into three newly allocated arrays:
 *
 *   p_rows[i]      offset into p_examples/p_cols at which example i starts
 *   p_cols[k]      feature index of entry k, or -1 for the entry that closes
 *                  an example
 *   p_examples[k]  feature value of entry k; for the closing entry, the label
 *                  mapped through (label+1)/2 (so +1/-1 becomes 1/0)
 *
 * @param filename    path of the input file
 * @param n_elements  capacity of p_examples and p_cols; has to cover every
 *                    feature entry plus one closing entry per example
 * @param n_examples  number of lines to read
 * @param p_examples  out: values, allocated here with new[]
 * @param p_cols      out: column indices, allocated here with new[]
 * @param p_rows      out: row start offsets, allocated here with new[]
 * @return the largest feature index seen, plus one
 *
 * The process exits with status 1 if the file cannot be opened or holds more
 * entries than n_elements. Missing or malformed lines are reported through
 * exit_input_error() with their 1-based line number.
 **/
size_t create_dw_corpus(std::string filename, const size_t n_elements, const size_t n_examples,
  double * & p_examples, long * & p_cols, long * & p_rows){

  FILE *fp = fopen(filename.c_str(),"r");
  if(fp == NULL){
    fprintf(stderr,"can't open input file %s\n",filename.c_str());
    exit(1);
  }

  p_examples = new double[n_elements];
  p_cols = new long[n_elements];
  p_rows = new long[n_examples];

  // getline() grows this buffer with realloc(), so it must start out as NULL
  // (or malloc'd memory) and be released with free() - never new[]/delete[].
  char *line = NULL;
  size_t max_line_len = 0;
  char *endptr;
  char *idx, *val, *label;

  int input_idx;
  double input_label;
  double input_value;

  size_t maxidx = 0;
  size_t ct = 0;
  for(size_t i=0;i<n_examples;i++){
    const int line_no = static_cast<int>(i)+1;

    // Fewer lines than promised: stop instead of re-parsing a stale buffer.
    if(getline(&line, &max_line_len, fp) == -1)
      exit_input_error(line_no);

    label = strtok(line," \t\n");
    if(label == NULL) // empty line
      exit_input_error(line_no);

    input_label = strtod(label,&endptr);
    if(endptr == label || *endptr != '\0')
      exit_input_error(line_no);

    p_rows[i] = ct;

    while(1){
      idx = strtok(NULL,":");
      val = strtok(NULL," \t");

      if(val == NULL)
        break;

      errno = 0;
      input_idx = (int) strtol(idx,&endptr,10);
      if(endptr == idx || errno != 0 || *endptr != '\0')
        exit_input_error(line_no);

      errno = 0;
      input_value = strtod(val,&endptr);
      if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
        exit_input_error(line_no);

      if(ct >= n_elements){
        fprintf(stderr,"input file %s holds more entries than n_elements=%lu (line %d)\n",
          filename.c_str(),(unsigned long) n_elements,line_no);
        exit(1);
      }
      p_examples[ct] = input_value;
      p_cols[ct] = input_idx;
      ct ++;

      // Compare in the signed domain first: a negative index must not be
      // converted to a huge unsigned value.
      if(input_idx > 0 && static_cast<size_t>(input_idx) > maxidx){
        maxidx = static_cast<size_t>(input_idx);
      }
    }

    if(ct >= n_elements){
      fprintf(stderr,"input file %s holds more entries than n_elements=%lu (line %d)\n",
        filename.c_str(),(unsigned long) n_elements,line_no);
      exit(1);
    }
    p_examples[ct] = (input_label+1)/2; // normalize +1/-1 to 1/0
    p_cols[ct] = -1;
    ct ++;

  }
  maxidx ++;

  free(line);
  fclose(fp);

  return maxidx;
}
Beispiel #15
0
// Load a data set in LIBSVM/LIBLINEAR text format ("<label> <index>:<value> ...")
// into this problem object.
//
// filename: path of the text file to read; the process exits if it cannot be opened.
// bias:     when > 0, one extra slot per instance is reserved and every parsed
//           feature index is shifted up by one (see the review note in the loop).
// maxdim:   when > 0, the stored dimensionality n is capped at this value.
//
// After the call: l is the number of lines read, y[i] the integer label of
// line i, indexes[i]/values[i] point at that line's entries inside
// index_buf/value_buf (each instance is closed by index -1), and n is the
// largest stored index (subject to the maxdim cap).
void problem::Load(const char* filename,const NUMBER bias,const int maxdim)
{	// revised from the LIBLINEAR/LIBSVM read_problem() function
	// Remember the start time; the elapsed time is printed at the end.
	timeb TimingMilliSeconds;
	ftime(&TimingMilliSeconds);
	Clear();
	
	FILE *fp = fopen(filename,"r");
	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",filename);
		exit(1);
	}

	// Pass 1: count lines (l) and the number of buffer slots needed (allocated).
	max_line_len = 1024;
	line = Malloc(char,max_line_len);
	l = 0;
	allocated = 0;
	while(readline(fp)!=NULL)
	{
		char *p = strtok(line," \t"); // label
		// features
		while(1)
		{
			p = strtok(NULL," \t");
			if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature
				break;
			allocated++;
		}
		allocated++; // slot for the index = -1 terminator of this instance
		l++;
	}
	rewind(fp);

	// One additional slot per instance when a bias is requested.
	if(bias>0) allocated += l;
	index_buf = new int[allocated];
	value_buf = new NUMBER[allocated];
	y = new int[l];
	indexes = new int*[l];
	values = new NUMBER*[l];
	
	// Pass 2: parse labels and features into the buffers.
	int max_index = 0;
	long int j=0; // next free slot in index_buf / value_buf
	char *endptr;
	char *idx, *val, *label;
	for(int i=0;i<l;i++)
	{
		int inst_max_index = 0; // strtol gives 0 if wrong format
		readline(fp);
		indexes[i] = &index_buf[j];
		values[i] = &value_buf[j];
		label = strtok(line," \t");
		// The label is read as a base-10 integer.
		y[i] = (int) strtol(label,&endptr,10);
		if(endptr == label) exit_input_error(i+1);
		
		while(1)
		{
			// NOTE(review): this writes a bias entry (index 1) into slot j but j is
			// not advanced, so the entry is overwritten by the next parsed feature
			// or by the -1 terminator after the loop. As written, no index-1 entry
			// survives in index_buf - confirm whether that is intended.
			if(bias>0)
			{
				index_buf[j] = 1;
				value_buf[j] = bias;
			}
			
			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;

			// Indices are shifted by one when bias > 0 and must be strictly
			// increasing within a line.
			errno = 0;
			index_buf[j] = (int) strtol(idx,&endptr,10)+(bias>0);
			if(endptr == idx || errno != 0 || *endptr != '\0' || index_buf[j] <= inst_max_index)
				exit_input_error(i+1);
			else
				inst_max_index = index_buf[j];

			errno = 0;
			value_buf[j] = (NUMBER)strtod(val,&endptr);
			// Negative feature values are replaced by a small positive constant.
			if(value_buf[j]<0)
				value_buf[j] = 0.001;
			//else if(value_buf[j]>=1)
				//value_buf[j] = 0.999;

			// Validate the strtod() result (checked after the clamp above).
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(i+1);

			++j;
		}
		if(inst_max_index > max_index) max_index = inst_max_index;
		index_buf[j++] = -1; // terminator for this instance
	}
	// Dimensionality is the largest stored index, optionally capped by maxdim.
	n=max_index;
	if(maxdim>0 && n>maxdim) n=maxdim;

	fclose(fp);
	free(line);
	line = NULL;
	
	// Report the elapsed loading time in milliseconds.
	struct timeb now;
    ftime(&now);
    std::cout<<"Dataset loaded in "<<int( (now.time-TimingMilliSeconds.time)*1000+(now.millitm-TimingMilliSeconds.millitm) )<<" msec."<<std::endl;
}
Beispiel #16
0
//struct feature_node
//{
//	int index;
//	double value;
//};
//void _parse_command_line(int argc, char **argv, char *input_file_name, char *model_file_name)
//#endif
//{
//	int i;
//	void (*print_func)(const char*) = NULL;	// default printing to stdout
//
//	// default values
//	_param.solver_type = L2R_L2LOSS_SVC;
//	_param.C = 1;
//	_param.eps = INF; // see setting below
//	_param.p = 0;
//	_param.nr_weight = 0;
//	_param.weight_label = NULL;
//	_param.weight = NULL;
//	_param.ite = 100; //default
//	// parse options
//	for(i=1;i<argc;i++)
//	{
//		if(argv[i][0] != '-') break;
//		if(++i>=argc)
//			exit_with_help();
//		switch(argv[i-1][1])
//		{
//			case 's':
//				_param.solver_type = atoi(argv[i]);
//				break;
//
//			case 'c':
//				_param.C = atof(argv[i]);
//				break;
//
//			case 'p':
//				_param.p = atof(argv[i]);
//				break;
//
//			case 'e':
//				_param.eps = atof(argv[i]);
//				break;
//
//			case 'i':
//				_param.ite = atof(argv[i]);
//				break;
//
//			case 'q':
//				print_func = &print_null;
//				i--;
//				break;
//
//			default:
//				fprintf(stderr,"unknown option: -%c\n", argv[i-1][1]);
//				exit_with_help();
//				break;
//		}
//	}
//
//	set_print_string_function(print_func);
//
//	// determine filenames
//	if(i>=argc)
//		exit_with_help();
//
//	strcpy(input_file_name, argv[i]);
//#ifdef FIGURE56
//	if(i<argc-1)
//		strcpy(test_file_name,argv[i+1]);
//	else
//	{
//		exit_with_help();
//	}
//#else
//	if(i<argc-1)
//		strcpy(model_file_name,argv[i+1]);
//	else
//	{
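//		//strrchr() searches a string for the LAST occurrence of a character; on success it returns a pointer to that position (i.e. the tail of the string starting there), on failure it returns NULL.
//		//Its counterpart strchr() searches for the FIRST occurrence of the character.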
//		char *p = strrchr(argv[i],'/');
//		if(p==NULL)//there are no parent directories in the path of train data, that means train data are in the current directory
//			p = argv[i];
//		else//the train data contain parent directoryis, ++p: the pointer move from char '/' to the first char of the input train data
//			++p;
//		sprintf(model_file_name,"%s.model",p);//int sprintf ( char * str, const char * format, ... );Write formatted data to string
//	}
//#endif
//
//	if(_param.eps == INF)
//	{
//		switch(_param.solver_type)
//		{
//			case L2R_L2LOSS_SVC:
//				_param.eps = 0.01;
//				break;
//			case L2R_L2LOSS_SVR:
//			case WX_RBTREE:
//			case Y_RBTREE:
//			case AVLTREE:
//			case AATREE:
//			case DIRECT_COUNT:
//			case SELECTION_TREE:
//			case PRSVMP:
//				_param.eps = 0.001;
//				break;
//			case L2R_L1LOSS_SVR_DUAL:
//				_param.eps = 0.1;
//				break;
//		}
//	}
//}
void read_problem(const char *filename)
{
	int max_index, inst_max_index, i;
	long int elements, j;
	FILE *fp = fopen(filename,"r");
	char *endptr;
	char *idx, *val, *label;

	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",filename);
		exit(1);
	}

	prob.l = 0;
	elements = 0;
	max_line_len = 1024;
	line = Malloc(char,max_line_len);
	while(readline(fp)!=NULL)
	{
		char *p = strtok(line," \t"); // label

		// features
		while(1)
		{
			p = strtok(NULL," \t");
			if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature
				break;
			elements++;
		}
		elements++; // for bias term
		prob.l++;
	}
	rewind(fp);
	// struct problem
	// {
	// int l, n;//l: total instance number(starts from 0);n:total feature number
	// int *query;
	// double *y;//y:label value
	// struct feature_node **x;
	// };
	//prob.l: instances number of input file
	//prob.y: label array of each instance of input file
	//prob.x: pointer array of each instance's features
	//prob.query:query array of each instance of input file
	prob.y = Malloc(double,prob.l);
	prob.x = Malloc(struct feature_node *,prob.l);
	prob.query = Malloc(int,prob.l);
	//x_space:apply enough spaces to store all instances' features
	x_space = Malloc(struct feature_node,elements+prob.l);
	max_index = 0;
	j=0;
	for(i=0;i<prob.l;i++)//iterate all instances
	{
		prob.query[i] = 0;
		inst_max_index = 0; // strtol gives 0 if wrong format
		readline(fp);
		prob.x[i] = &x_space[j];
		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(i+1);

		prob.y[i] = strtod(label,&endptr);
		if(endptr == label || *endptr != '\0')
			exit_input_error(i+1);

		while(1)
		{
			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;
			if (!strcmp(idx,"qid"))//qid
			{
				errno = 0;
				prob.query[i] = (int) strtol(val, &endptr,10);
				if(endptr == val || errno !=0 || (*endptr != '\0' && !isspace(*endptr)))
					exit_input_error(i+1);
			}
			else//feature
			{
				errno = 0;
				x_space[j].index = (int) strtol(idx,&endptr,10);
				if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[j].index <= inst_max_index)
					exit_input_error(i+1);
				else
					inst_max_index = x_space[j].index;

				errno = 0;
				x_space[j].value = strtod(val,&endptr);
				if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
					exit_input_error(i+1);

				++j;
				/*for debug
				if(j==46)
					printf("%d",j);*/
			}
		}//finish parsing one line of data

		if(inst_max_index > max_index)
			max_index = inst_max_index;

		x_space[j++].index = -1;
	}// iterate all instances
	prob.n=max_index;
	fclose(fp);
}
Beispiel #17
0
// read in a problem (in libsvm format)
void read_problem(const char *filename)
{
	int max_index, inst_max_index, i;
	size_t elements, j;
	FILE *fp = fopen(filename,"r");
	char *endptr;
	char *idx, *val, *label;

	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",filename);
		exit(1);
	}

	prob.l = 0;
	elements = 0;
	max_line_len = 1024;
	line = Malloc(char,max_line_len);
	while(readline(fp)!=NULL)
	{
		char *p = strtok(line," \t"); // label

		// features
		while(1)
		{
			p = strtok(NULL," \t");
			if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature
				break;
			elements++;
		}
		elements++; // for bias term
		prob.l++;
	}
	rewind(fp);

	prob.bias=bias;

	prob.y = Malloc(double,prob.l);
	prob.x = Malloc(struct feature_node *,prob.l);
	x_space = Malloc(struct feature_node,elements+prob.l);

	max_index = 0;
	j=0;
	for(i=0;i<prob.l;i++)
	{
		inst_max_index = 0; // strtol gives 0 if wrong format
		readline(fp);
		prob.x[i] = &x_space[j];
		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(i+1);

		prob.y[i] = strtod(label,&endptr);
		if(endptr == label || *endptr != '\0')
			exit_input_error(i+1);

		while(1)
		{
			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;

			errno = 0;
			x_space[j].index = (int) strtol(idx,&endptr,10);
			if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[j].index <= inst_max_index)
				exit_input_error(i+1);
			else
				inst_max_index = x_space[j].index;

			errno = 0;
			x_space[j].value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(i+1);

			++j;
		}

		if(inst_max_index > max_index)
			max_index = inst_max_index;

		if(prob.bias >= 0)
			x_space[j++].value = prob.bias;

		x_space[j++].index = -1;
	}

	if(prob.bias >= 0)
	{
		prob.n=max_index+1;
		for(i=1;i<prob.l;i++)
			(prob.x[i]-2)->index = prob.n;
		x_space[j-2].index = prob.n;
	}
	else
		prob.n=max_index;

	fclose(fp);
}
Beispiel #18
0
// Run svm_predict() with the file-level `model` on every instance of `input`
// (LIBSVM text format).
//
// input: open, seekable stream; it is read once to count lines, rewound, and
//        read again to parse the instances.
// size:  out - number of lines counted in `input`, which is the length of
//        both result arrays.
// Returns a malloc'd array with the predicted label of each line. The true
// labels are stored in the file-level `target` array (also malloc'd here).
// Neither array is released by this function.
//
// Malformed lines are passed to exit_input_error() with their 1-based number.
double * predict(FILE *input, int &size)
{
	// Pass 1: count the lines so both arrays can be sized up front. A final
	// line without a trailing newline has to be counted too; otherwise the
	// parsing loop could store one entry more than was allocated.
	int ch;
	int last_ch = '\n';
	size = 0;
	while ( (ch=fgetc(input)) != EOF) {
		if (ch == '\n')
			size++;
		last_ch = ch;
	}
	if (last_ch != '\n')
		size++;
	rewind(input);

	double * res = (double *) malloc (size*sizeof(double));
	target = (double *) malloc (size*sizeof(double));

	max_line_len = 1024;
	line = (char *)malloc(max_line_len*sizeof(char));

	// Pass 2: parse and predict. j is the 0-based line number and the write
	// position; the j < size test keeps writes inside the allocated arrays
	// even if readline() were to deliver more lines than were counted.
	int j = 0;
	while(j < size && readline(input) != NULL)
	{
		int i = 0;
		double target_label, predict_label;
		char *idx, *val, *label, *endptr;
		int inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0

		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(j+1);

		target_label = strtod(label,&endptr);
		if(endptr == label || *endptr != '\0')
			exit_input_error(j+1);

		while(1)
		{
			if(i>=max_nr_attr-1)	// need one more for index = -1
			{
				max_nr_attr *= 2;
				x = (struct svm_node *) realloc(x,max_nr_attr*sizeof(struct svm_node));
			}

			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;
			errno = 0;
			x[i].index = (int) strtol(idx,&endptr,10);
			if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index)
				exit_input_error(j+1);
			else
				inst_max_index = x[i].index;

			errno = 0;
			x[i].value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(j+1);

			++i;
		}
		x[i].index = -1;

		predict_label = svm_predict(model,x);
		res[j] = predict_label;
		target[j] = target_label;
		j++;
	}
	return res;
}
Beispiel #19
0
// Predict every instance of `input` (LIBSVM text format) with the file-level
// model `model_` and write one result line per instance to `output`.
//
// With flag_predict_probability set, a "labels ..." header is written first
// and each result line holds the predicted label followed by the per-class
// probability estimates; otherwise each line holds only the predicted label.
//
// When model_->normal is set, the feature vector (including the bias node, if
// any) is scaled to unit Euclidean length before prediction.
//
// After the last line, either regression statistics or the classification
// accuracy are reported through info(), depending on the model's solver type.
// Malformed lines are passed to exit_input_error() with their 1-based number.
void do_predict(FILE *input, FILE *output)
{
	int correct = 0;
	int total = 0;
	double error = 0;
	double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0;

	int nr_class=get_nr_class(model_);
	double *prob_estimates=NULL;
	int j, n;
	int nr_feature=get_nr_feature(model_);
	// n is the index given to the bias node
	if(model_->bias>=0)
		n=nr_feature+1;
	else
		n=nr_feature;

	if(flag_predict_probability)
	{
		int *labels;

		if(!check_probability_model(model_))
		{
			fprintf(stderr, "probability output is only supported for logistic regression\n");
			exit(1);
		}

		labels=(int *) malloc(nr_class*sizeof(int));
		get_labels(model_,labels);
		prob_estimates = (double *) malloc(nr_class*sizeof(double));
		fprintf(output,"labels");
		for(j=0;j<nr_class;j++)
			fprintf(output," %d",labels[j]);
		fprintf(output,"\n");
		free(labels);
	}

	max_line_len = 1024;
	line = (char *)malloc(max_line_len*sizeof(char));
	while(readline(input) != NULL)
	{
		int i = 0;
		double target_label, predict_label;
		char *idx, *val, *label, *endptr;
		int inst_max_index = 0; // strtol gives 0 if wrong format

		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(total+1);

		target_label = strtod(label,&endptr);
		if(endptr == label || *endptr != '\0')
			exit_input_error(total+1);

		while(1)
		{
			if(i>=max_nr_attr-2)	// need two more: bias node and index = -1
			{
				max_nr_attr *= 2;
				x = (struct feature_node *) realloc(x,max_nr_attr*sizeof(struct feature_node));
			}

			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;
			errno = 0;
			x[i].index = (int) strtol(idx,&endptr,10);
			if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index)
				exit_input_error(total+1);
			else
				inst_max_index = x[i].index;

			errno = 0;
			x[i].value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(total+1);

			// feature indices larger than those in training are not used
			if(x[i].index <= nr_feature)
				++i;
		}

		if(model_->bias>=0)
		{
			x[i].index = n;
			x[i].value = model_->bias;
			i++;
		}
		x[i].index = -1;

		if(model_->normal){
			double length = 0;
			for(int kk = 0; x[kk].index != -1; kk++)
				length += x[kk].value * x[kk].value;

			length = sqrt(length);

			// An all-zero vector has no direction: leave it untouched instead
			// of dividing by zero and feeding NaN values to the model.
			if(length > 0)
			{
				for(int kk = 0; x[kk].index != -1; kk++)
					x[kk].value /= length;
			}
		}

		if(flag_predict_probability)
		{
			predict_label = predict_probability(model_,x,prob_estimates);
			fprintf(output,"%g",predict_label);
			for(j=0;j<model_->nr_class;j++)
				fprintf(output," %g",prob_estimates[j]);
			fprintf(output,"\n");
		}
		else
		{
			predict_label = predict(model_,x);
			fprintf(output,"%g\n",predict_label);
		}

		// running sums for accuracy / regression statistics
		if(predict_label == target_label)
			++correct;
		error += (predict_label-target_label)*(predict_label-target_label);
		sump += predict_label;
		sumt += target_label;
		sumpp += predict_label*predict_label;
		sumtt += target_label*target_label;
		sumpt += predict_label*target_label;
		++total;
	}
	if(model_->param.solver_type==L2R_L2LOSS_SVR ||
	   model_->param.solver_type==L2R_L1LOSS_SVR_DUAL ||
	   model_->param.solver_type==L2R_L2LOSS_SVR_DUAL)
	{
		info("Mean squared error = %g (regression)\n",error/total);
		info("Squared correlation coefficient = %g (regression)\n",
			((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/
			((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt))
			);
	}
	else
		info("Accuracy = %g%% (%d/%d)\n",(double) correct/total*100,correct,total);
	if(flag_predict_probability)
		free(prob_estimates);
}
Beispiel #20
0
// Predict every instance of `input` with each of the sum_pro models in the
// file-level array `model_` and write one result line per (instance, model)
// pair to `output`.
//
// The first token of a line is a letter label ('A' maps to 0; 'B', 'C', 'D'
// map to 1), followed by "<index>:<value>" features. Features are parsed once
// per line; for each model a bias node (when that model's bias >= 0) and the
// index = -1 terminator are appended to the parsed features before predicting.
//
// With flag_predict_probability set, a "labels ..." header is written first
// and each result consists of the predicted label plus probability estimates;
// otherwise the prediction is stored in p_label[pid] and written out.
//
// NOTE(review): nr_feature and the bias index n are taken from model_[0] and
// reused for every model - presumably all models share one feature space;
// confirm. The aggregation/evaluation code at the end is disabled, so
// target_label, correct, error and the sum* accumulators are currently unused.
// Features are parsed before the model loop, so input is validated even when
// sum_pro is 0.
void do_predict(FILE *input, FILE *output)
{
	int correct = 0;
	int total = 0;
	double error = 0;
	double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0;

	int nr_class=get_nr_class(model_[0]);
	double *prob_estimates=NULL;
	int j, n;
	int nr_feature=get_nr_feature(model_[0]);
	if(model_[0]->bias>=0)
		n=nr_feature+1;
	else
		n=nr_feature;

	if(flag_predict_probability)
	{
		int *labels;

		if(!check_probability_model(model_[0]))
		{
			fprintf(stderr, "probability output is only supported for logistic regression\n");
			exit(1);
		}

		labels=(int *) malloc(nr_class*sizeof(int));
		get_labels(model_[0],labels);
		prob_estimates = (double *) malloc(nr_class*sizeof(double));
		fprintf(output,"labels");
		for(j=0;j<nr_class;j++)
			fprintf(output," %d",labels[j]);
		fprintf(output,"\n");
		free(labels);
	}

	max_line_len = 1024;
	line = (char *)malloc(max_line_len*sizeof(char));
	while(readline(input) != NULL)
	{
		int i = 0;
		// target_label starts at 0 so that it is never left uninitialised for
		// a label letter outside A-D; only the disabled evaluation code reads it.
		double target_label = 0, predict_label;
		char *idx, *val, *label, *endptr;
		int inst_max_index = 0; // strtol gives 0 if wrong format

		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(total+1);

		// target_label = strtod(label,&endptr);
		switch (label[0]) {
			case 'A': target_label = 0; break;
			case 'B': target_label = 1; break;
			case 'C': target_label = 1; break;
			case 'D': target_label = 1; break;
		}
		// if(endptr == label || *endptr != '\0')
		// 	exit_input_error(total+1);

		// Parse the features once. strtok() consumes the line, so this cannot
		// be repeated per model.
		while(1)
		{
			if(i>=max_nr_attr-2)	// need two more: bias node and index = -1
			{
				max_nr_attr *= 2;
				x = (struct feature_node *) realloc(x,max_nr_attr*sizeof(struct feature_node));
			}

			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;
			errno = 0;
			x[i].index = (int) strtol(idx,&endptr,10);
			if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index)
				exit_input_error(total+1);
			else
				inst_max_index = x[i].index;

			errno = 0;
			x[i].value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(total+1);

			// feature indices larger than those in training are not used
			if(x[i].index <= nr_feature)
				++i;
		}
		const int nr_parsed = i; // number of feature nodes kept for this line

		for (int pid = 0; pid < sum_pro; pid++) {
			// Start again right after the parsed features so that the bias
			// node of a previous model is overwritten instead of accumulating
			// one more bias node per model.
			i = nr_parsed;

			if(model_[pid]->bias>=0)
			{
				x[i].index = n;
				x[i].value = model_[pid]->bias;
				i++;
			}
			x[i].index = -1;

			if(flag_predict_probability)
			{
				predict_label = predict_probability(model_[pid],x,prob_estimates);
				fprintf(output,"%g",predict_label);
				for(j=0;j<model_[pid]->nr_class;j++)
					fprintf(output," %g",prob_estimates[j]);
				fprintf(output,"\n");
			}
			else
			{
				p_label[pid] = predict(model_[pid],x);
				fprintf(output,"%g", p_label[pid]);

				// printf("pid%dhas done\n",pid );
			}
			fprintf(output, "\n" );
		}
		int count = 0;
		predict_label = 0;
		// for ( int l = 0; l < BLOCK ; l++) {
		// 	for (int m = 0;m < BLOCK * N; m++) {
		// 		// printf("%f\t", p_label[l * BLOCK + m]);
		// 		if ( p_label[l * BLOCK + m] == 1) {
		// 			// p_label[l] = 1;
		// 			// break;
		// 			p_label[l]++;
		// 			// count++;* 4
		// 		}
		// 	}
		// 	if (p_label[l] < 4) {
		// 		count++;
		// 	}
		// 	// if ( p_label[l] == 1) {
		// 	// 	predict_label = 1;
		// 	// }
		// 	// else {
		// 	// 	predict_label = 0;
		// 	// }
		// 	// if (count >0) {
		// 	// 	predict_label = 1;
		// 	// }
		// 	// else {
		// 	// 	predict_label = 0;
		// 	// }
		// }

		// if (count > 0 ) {
		// 	predict_label = 0;
		// 	}
		// else {
		// 	predict_label = 1;
		// }
		// /printf("\n");
		// fprintf(output,"%g\n",predict_label);

		// if(predict_label == target_label)
		// 	++correct;
		// error += (predict_label-target_label)*(predict_label-target_label);
		// sump += predict_label;
		// sumt += target_label;
		// sumpp += predict_label*predict_label;
		// sumtt += target_label*target_label;
		// sumpt += predict_label*target_label;

		// Keep the line counter moving so that exit_input_error() reports the
		// real line number instead of always 1.
		++total;
	}
	// if(check_regression_model(model_[0]))
	// {
	// 	info("Mean squared error = %g (regression)\n",error/total);
	// 	info("Squared correlation coefficient = %g (regression)\n",
	// 		((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/
	// 		((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt))
	// 		);
	// }
	// else
	// 	info("Accuracy = %g%% (%d/%d)\n",(double) correct/total*100,correct,total);

	// prob_estimates is NULL unless probability output was requested;
	// free(NULL) is a no-op.
	free(prob_estimates);
}