bool MEModel::loadTrainingExamples( const char *trainFile )
{
	FILE *fin = fopen(trainFile,"r");
	if ( fin == NULL )
	{
		fprintf( stderr, "open train file %s failed!\n", trainFile );
		return false;
	}
	int lineNum = 0;
	int fieldNum = 0;
	int ch;
	bool endOfFile = false;
	int curFieldCnt = 0;
	while( !endOfFile )
	{
		ch = fgetc(fin);
		curFieldCnt = 0;
		while( true )
		{
			while( !isLineEnd(ch) && isSpace(ch) )
				ch = fgetc(fin);
			if( isLineEnd(ch) )
				break;
			while( !isLineEnd(ch) && !isSpace(ch) )
				ch = fgetc(fin);
			curFieldCnt ++;
		}
		if ( curFieldCnt > 0 )
		{
			lineNum ++;
			fieldNum += curFieldCnt;
		}
		if( ch == EOF )
			endOfFile = true;
	}
	rewind(fin);
	M = lineNum;
	VarStr word;
	endOfFile = false;
	xspace = new SpElem[fieldNum];
	x = new SpElem*[M];
	y = new int[M];
	lineNum = 0;
	int xIndex = 0;
	while( !endOfFile )
	{
		ch = fgetc(fin);
		curFieldCnt = 0;
		int curY = 0, curX = 0;
		char *ystr = NULL;
		while( true )
		{
			while( !isLineEnd(ch) && isSpace(ch) )
				ch = fgetc(fin);
			if( isLineEnd(ch) )
				break;
			word.clear();
			while( !isLineEnd(ch) && !isSpace(ch) )
			{
				word.add(ch);
				ch = fgetc(fin);
			}
			curFieldCnt ++;
			if( curFieldCnt == 1 )
			{
				ystr = new char[word.length()+1];
				strcpy(ystr,word.c_str());
			}
			else
			{
				curX = feaIdMap.add(word.c_str());
				xspace[xIndex].index = curX;
				xspace[xIndex].value = 1;
				if ( curFieldCnt == 2 )
					x[lineNum] = &xspace[xIndex];
				xIndex ++;
			}
		}
		if ( curFieldCnt > 1 )
		{
			xspace[xIndex].index = -1; xspace[xIndex].value = 0;
			xIndex ++;
			curY = classNameIdMap.add(ystr);
			y[lineNum] = curY;
			lineNum ++;
		}
		if( ystr != NULL )
			delete []ystr;
		if ( ch == EOF )
			endOfFile = true;
	}
	
	fclose(fin);
	M = lineNum;
	classNumber = (int)classNameIdMap.size();

	needDestroyTrain = true;
	return true;
}
// Example #2
// 0
// check whether string is a float
bool htmInterface::isFloat(const VarStr &str) {
  if(str.empty()) return false;
  uint32 len = str.length();
  return (strspn(str.data(),"+-.e0123456789") == len) ? true : false ;
}
bool MaxEnt::parseLine( FILE *fin, FILE *fout, bool withClassLabel )
{
	vector<string> feature;
	int c;
	int cnt = 0;
	enum State{ SPACE, WORD, NONE };
	int state = NONE;
	VarStr word;
	bool first = true;
	while( (c=fgetc(fin)) != EOF )
	{
		if ( c == '\n' )
			break;
		cnt ++;
		if ( state == NONE )
		{
			if ( isSpace(c) )
				state = SPACE;
			else
			{
				word.add(c);
				state = WORD;
			}
			continue;
		}
		if ( isSpace(c) )
		{
			if ( state == WORD )
			{
				if ( !first )
					feature.push_back(word.c_str());
				else
				{
					if ( !withClassLabel )
						feature.push_back(word.c_str());
					first = false;
				}
				state = SPACE;
				word.clear();
			}
		}
		else
		{
			if ( state == SPACE )
			{
				word.add(c);
				state = WORD;
			}
			else
				word.add(c);
		}
	}
	if ( c == EOF && cnt == 0 )
		return false;
	if ( word.length() > 0 )
	{
		if ( !first || (first&&!withClassLabel) )
			feature.push_back(word.c_str());
	}

	if ( feature.size() > 0 )
	{
		vector<pair<string,double> > labelProbs;
		int pcls = predict(feature,labelProbs);
		fprintf(fout,"%s",labelProbs[pcls].first.c_str());
		for( int i = 0; i < labelProbs.size(); i ++ )
			fprintf(fout, "\t%s\t%lf", labelProbs[i].first.c_str(), labelProbs[i].second);
		fprintf(fout,"\n");
	}
	return true;
}