Esempio n. 1
0
// Saves the dictionary into the files derived from GrammarFileName via MakeFName:
//   "forms_autom" - written by m_pFormAutomat->Save,
//   "annot"       - flexia models, accent models, prefixes, lemma infos and NPSs,
//   "bases"       - written by m_Bases.WriteShortStringHolder.
// Returns true when every part was written, false otherwise (an error is reported
// through ErrorMessage or stderr). Any exception is caught and turned into "false".
bool CMorphDict::Save(string GrammarFileName) const
{
	// Declared outside the try block so that the handler below can still close the file.
	FILE * fp = NULL;
	try {
		string AutomatFile = MakeFName(GrammarFileName,"forms_autom");
		if (!m_pFormAutomat->Save(AutomatFile))
		{
			ErrorMessage (Format("Cannot write to %s", AutomatFile.c_str()));
			return false;
		}

		string PrecompiledFile = MakeFName(GrammarFileName,"annot");
		fp = fopen(PrecompiledFile.c_str(), "wb");
		if (!fp)
		{
			ErrorMessage (Format("Cannot write to %s", PrecompiledFile.c_str()));
			return false;
		};

		WriteFlexiaModels(fp, m_FlexiaModels);

		WriteAccentModels(fp, m_AccentModels);

		assert (!m_Prefixes.empty() && m_Prefixes[0].empty());
		// do not write the first empty prefix, instead add it manually each time during loading
		// (sizes are cast explicitly: "%i" expects an int, not a size_t)
		fprintf(fp, "%i\n", static_cast<int>(m_Prefixes.size()-1));

		for (size_t i=1; i < m_Prefixes.size(); i++)
			fprintf (fp, "%s\n",m_Prefixes[i].c_str());

		fprintf(fp, "%i\n", static_cast<int>(m_LemmaInfos.size()));
		if (!WriteVectorInner(fp, m_LemmaInfos))
		{
			fclose(fp);
			fp = NULL;
			return false;
		}

		assert (m_NPSs.size()  == m_FlexiaModels.size());
		fprintf(fp, "%i\n", static_cast<int>(m_NPSs.size()));
		if (!WriteVectorInner(fp, m_NPSs))
		{
			fclose(fp);
			fp = NULL;
			return false;
		}

		// fclose flushes the buffered data, so a failure here means the file is incomplete
		FILE * ToClose = fp;
		fp = NULL;
		if (fclose(ToClose) != 0)
		{
			ErrorMessage (Format("Cannot write to %s", PrecompiledFile.c_str()));
			return false;
		}

		if (!m_Bases.WriteShortStringHolder(MakeFName(GrammarFileName,"bases")))
		{
			fprintf(stderr, "Cannot save bases\n");
			return false;
		};

		return true;
	}
	catch (...)
	{
		// do not leak the "annot" handle when one of the writers throws
		if (fp)
			fclose(fp);
		fprintf (stderr, "Cannot save CMorphDict");
		return false;
	};
};
Esempio n. 2
0
// Builds the binary bigram files and the word index for the bigrams file given
// as the only command-line argument:
//   1. reads word frequencies from the "wrd_freq" file,
//   2. builds the first binary file from the bigrams file as it is,
//   3. sorts the bigrams file by columns 2-3 with the external "gsort" tool into a
//      temporary "rev" file, builds the second binary file from it and removes it,
//   4. writes the "wrd_idx" text index (word, frequency, two offset/length pairs).
// Returns 0 on success and 1 on any failure.
int main(int argc, char* argv[])
{
	if (argc != 2)
		PrintUsageAndExit();

	vector<CWordInfo> WordInfos;
	string BigramsFileName =  argv[1];
	string WordFreqFileName =  MakeFName(BigramsFileName, "wrd_freq");

	// read word statistics
	if (!ReadWordFreqs(WordFreqFileName, WordInfos))
		return 1;

	// create the binary file for bigrams
	if (!BuildBigramsBin(BigramsFileName, WordInfos, true))
		return 1;

	{
		string RevFile = MakeFName(BigramsFileName, "rev");
		// The paths are quoted so that file names containing spaces survive the shell.
		// NOTE(review): a path that itself contains a double quote (or, under a POSIX
		// shell, "$" or a backquote) is still interpreted by the shell.
		string Command = Format ("gsort -k 2,3 <\"%s\" >\"%s\"", BigramsFileName.c_str(), RevFile.c_str());
		fprintf (stderr, "%s\n", Command.c_str());
		if (system (Command.c_str()) != 0)
		{
			fprintf (stderr,"!!! an exception occurred (cannot sort) !!!\n");
			return 1;
		};

		if (!BuildBigramsBin(RevFile, WordInfos, false))
			return 1;
		fprintf (stderr, "remove %s\n", RevFile.c_str());
		remove(RevFile.c_str());
	}


	string OutIndexFile = MakeFName(BigramsFileName, "wrd_idx");
	fprintf (stderr, "create file %s\n", OutIndexFile.c_str());
	FILE* fp = fopen (OutIndexFile.c_str(), "w");
	if (!fp)
	{
		fprintf (stderr, "cannot open file %s\n", OutIndexFile.c_str());
		return 1;
	}
	for (vector<CWordInfo>::const_iterator it = WordInfos.begin(); it != WordInfos.end(); ++it)
	{
		fprintf (fp, "%s %u %u %u %u %u\n",it->m_WordStr.c_str(), 
			it->m_Freq, 
			it->m_FileOffset1,it->m_FileLen1,
			it->m_FileOffset2,it->m_FileLen2
			);
	}
	// a truncated index must not be reported as success
	const bool bWriteError = (ferror(fp) != 0);
	if ((fclose (fp) != 0) || bWriteError)
	{
		fprintf (stderr, "cannot write to file %s\n", OutIndexFile.c_str());
		return 1;
	}
	return 0;
}
Esempio n. 3
0
// Loads the file named by CommandLine into the input buffer and runs GraphmatMain.
// The names of the derived files ("gra", "xml", "tds") are computed with MakeFName.
// For a file accepted by IsHtmlFile the text comes from HTML::getText(), and it is
// additionally dumped to the "tds" file when m_bSaveHtmlFileToTdsFile is set;
// any other file is read as is with LoadFileToString.
// Returns the result of GraphmatMain, or false with m_LastError set on failure.
bool CGraphmatFile :: LoadFileToGraphan (const string&  CommandLine)
{
    try
    {
        m_SourceFileName = CommandLine.c_str();
        m_GraFileName = MakeFName (m_SourceFileName,"gra");
        m_XmlMacSynName = MakeFName (m_SourceFileName,"xml");
        m_SaveTxtName = MakeFName (m_SourceFileName,"tds");

        if (IsHtmlFile(m_SourceFileName))
        {
            HTML Convert(m_SourceFileName);
            string Text = Convert.getText();

            if (!InitInputBuffer(Text))
            {
                // "%i" expects an int, so the size_t length is cast explicitly
                m_LastError = Format("Cannot init input buffer for %i bytes", static_cast<int>(Text.length()));
                return false;
            }

            if   (m_bSaveHtmlFileToTdsFile)
                WriteVector(m_SaveTxtName, GetInputBuffer());
        }
        else
        {
            // mode 04 asks for read permission
            if (access(m_SourceFileName.c_str(), 04) != 0)
            {
                // report the reason instead of failing silently
                m_LastError = Format("Cannot read file %s", m_SourceFileName.c_str());
                return  false;
            }
            string Text;
            LoadFileToString(m_SourceFileName, Text);
            if (!InitInputBuffer(Text))
            {
                m_LastError = Format("Cannot init input buffer for %i bytes", static_cast<int>(Text.length()));
                return false;
            };

        };



        return  GraphmatMain ();

    }
    catch (CExpc& C)
    {
        // NOTE(review): confirm that m_ErrorCode is a text field; if it is numeric,
        // this assignment does not store a readable message.
        m_LastError = C.m_ErrorCode;
        return false;
    }
    catch (...)
    {
        m_LastError = "general exception";
        return false;
    };
};
Esempio n. 4
0
// Converts every "word1<TAB>word2<TAB>frequency" line of in_fp into a binary record
// in out_fp (index of the other word in WordInfos, then the frequency) and fills
// m_FileOffset1/m_FileLen1 (bFirstOffset1 == true) or m_FileOffset2/m_FileLen2
// (bFirstOffset1 == false) of the key word. Lines that cannot be parsed are skipped.
// Returns false on the first unknown word, wrong line order or write failure.
static bool ConvertBigramLines(FILE* in_fp, FILE* out_fp, const string& BigramsFileName, vector<CWordInfo>& WordInfos, bool bFirstOffset1)
{
	char buffer[10000];
	size_t BigramsCount = 0;
	vector<CWordInfo>::iterator curr_it = WordInfos.end();
	while (fgets(buffer, sizeof(buffer), in_fp))
	{
		if ((BigramsCount % 100000) == 0)
			fprintf (stderr, "%u               \r", static_cast<unsigned int>(BigramsCount));
		char w1[500], w2[500];
		int Bigramsfreq;
		// the field widths keep an overlong word from overflowing w1/w2
		int Res = sscanf(buffer, "%499[^\t]\t%499[^\t]\t%i", w1, w2, &Bigramsfreq);
		if ( Res != 3)
		{
			fprintf (stderr, "%s: skip line %s (scanf returned %i)\n", BigramsFileName.c_str(), buffer, Res);
			continue;
		}
		// w is the key word of this pass, conv_w is the other item of the bigram
		const char* w  = bFirstOffset1 ? w1 : w2;
		const char* conv_w  = bFirstOffset1 ? w2 : w1;
		bool bChange = false;
		if	(		(curr_it == WordInfos.end())
				||	(w != curr_it->m_WordStr)
			)
		{
			// the key word has changed: continue the search after the previous key word
			if (curr_it == WordInfos.end())
				curr_it = lower_bound (WordInfos.begin(),  WordInfos.end(), w);
			else
			{
				++curr_it;
				curr_it = lower_bound (curr_it,  WordInfos.end(), w);
			}
			bChange = true;
		}

		// lower_bound only finds the first entry that is not less than w, so the
		// entry must also be compared with w (as it is done for conv_w below)
		if (	(curr_it == WordInfos.end())
			||	(curr_it->m_WordStr != w)
			)
		{
			fprintf (stderr, "Cannot find word \"%s\" in %s\n", w, buffer);
			return false;
		}

		if (bFirstOffset1)
		{
			// a new key word must not have been seen before, otherwise the input is not grouped
			if (bChange && (curr_it->m_FileOffset1 != UINT_MAX))
			{
				fprintf (stderr, "Wrong order \"%s\" from line \"%s\" (curr_it->m_FileOffset1=%u)\n", w, buffer, curr_it->m_FileOffset1);
				return  false;
			}

			if (curr_it->m_FileOffset1 == UINT_MAX)
				curr_it->m_FileOffset1 = BigramsCount;

			curr_it->m_FileLen1++;
		}
		else
		{
			if (bChange && (curr_it->m_FileOffset2 != UINT_MAX))
			{
				fprintf (stderr, "Wrong order \"%s\" from %s\n", w, buffer);
				return  false;
			}

			if (curr_it->m_FileOffset2 == UINT_MAX)
				curr_it->m_FileOffset2 = BigramsCount;

			curr_it->m_FileLen2++;
		}

		vector<CWordInfo>::const_iterator conv_curr_it = lower_bound(WordInfos.begin(), WordInfos.end(), conv_w);
		if (		conv_curr_it == WordInfos.end() 
				||	(conv_curr_it->m_WordStr != conv_w)
			)
		{
			fprintf (stderr, "Cannot find word \"%s\" in %s\n", conv_w, buffer);
			return false;
		}

		// NOTE(review): the record layout depends on sizeof(size_t) and sizeof(int)
		// of the build; it is kept unchanged so that existing readers stay compatible.
		size_t ConvWordNo =  conv_curr_it - WordInfos.begin();
		if (	// write the other item of the bigram
				(fwrite(&ConvWordNo, 1, sizeof(ConvWordNo), out_fp) != sizeof(ConvWordNo))
				// write the frequence of bigram
			||	(fwrite(&Bigramsfreq, 1, sizeof(Bigramsfreq), out_fp) != sizeof(Bigramsfreq))
			)
		{
			fprintf (stderr, "cannot write a bigram record\n");
			return false;
		}

		BigramsCount++;
	}
	fprintf (stderr, "%u               \n", static_cast<unsigned int>(BigramsCount));
	return true;
}

// Reads the text bigrams file BigramsFileName and writes its binary form to the
// "bin1" (bFirstOffset1 == true) or "bin2" (bFirstOffset1 == false) file next to it,
// updating the file offsets and lengths stored in WordInfos.
// Both files are closed on every path. Returns true on success.
bool BuildBigramsBin(string BigramsFileName, vector<CWordInfo>& WordInfos, bool bFirstOffset1)
{
	fprintf (stderr, "open file %s\n", BigramsFileName.c_str());
	FILE * in_fp = fopen (BigramsFileName.c_str(), "rb");
	if (!in_fp)
	{
		fprintf (stderr, "cannot open file %s\n", BigramsFileName.c_str());
		return false;
	}

	string BinName = MakeFName(BigramsFileName, bFirstOffset1 ? "bin1" : "bin2");
	fprintf (stderr, "write  to  file %s\n", BinName.c_str());
	FILE * out_fp = fopen (BinName.c_str(),  "wb");
	if (!out_fp)
	{
		fprintf (stderr, "cannot open file %s\n", BinName.c_str());
		fclose(in_fp);
		return false;
	}

	bool bResult = ConvertBigramLines(in_fp, out_fp, BigramsFileName, WordInfos, bFirstOffset1);

	fclose(in_fp);
	// fclose flushes the buffered records, so its failure means an incomplete file
	if (fclose (out_fp) != 0)
	{
		fprintf (stderr, "cannot write to file %s\n", BinName.c_str());
		bResult = false;
	}
	return bResult;
}
Esempio n. 5
0
//----------------------------------------------------------------------------
void CMorphwizardView::OnToolsImport() 
{
	int line_no = 0;
	CString PathName;

try {
   	CFileDialog D(TRUE, "slf", "paradigms.slf",OFN_ALLOWMULTISELECT|OFN_HIDEREADONLY | OFN_OVERWRITEPROMPT);
	char file_buffer[10000];
	file_buffer[0] = 0;
	D.m_ofn.lpstrFile = file_buffer;
	D.m_ofn.nMaxFile = 10000;
	if (D.DoModal() != IDOK) return;
	POSITION pos = D.GetStartPosition();
	if (pos == 0) return;
	bool bTestMode = ::MessageBox (0,"Test it or import?","MorphWizard",MB_YESNO   ) == IDYES;
	
	GetWizard()->m_bFullTrace = false;
	PathName = D.GetNextPathName(pos);
	while  (true) // all selected files 
	{

		FILE * fp = fopen (PathName,"r");
		if (!fp) 
		{
			AfxMessageBox ("Cannot open file");
			return;
		};
				
		GetWizard()->log (std::string("import file ") + (const char*)PathName);
				
		CProgressMeterRML meter;
		meter.SetFileMaxPos(fp);
		CString info = (bTestMode?"Test":"Import");
		info+= "ing file " + PathName + "...";
		meter.SetInfo(info);
				
		int ParadigmCount = 0;
		string Errors;
		line_no = 0;
		bool bError;
		CDumpParadigm P;
		while  (GetWizard()->ReadNextParadigmFromFile(fp, P, line_no, bError, Errors)) 
		{
			if (!bError)
			{
				int line_no_err = 0;
				ParadigmCount++;
				try 
				{
					if (bTestMode)
					{
						std::string lemm;
						CFlexiaModel Dummy1;
						CAccentModel AccentModel;
						BYTE Dummy;
						GetWizard()->slf_to_mrd (P.m_SlfStr, lemm, Dummy1, AccentModel, Dummy, line_no_err);
					}
					else
					{
						WORD SessionNo = GetWizard()->RegisterSession(P.m_Session);
						GetWizard()->add_lemma(P.m_SlfStr, P.m_TypeGrammemsStr, P.m_PrefixesStr, line_no_err, SessionNo);
					};

					meter.SetFilePos();		  
				}
				catch (CExpc C)
				{
					Errors += Format("%s (%s:%i) \n", C.m_strCause.c_str(), (const char*)PathName, P.m_FirstSlfLineNo+line_no_err);
				}
				catch  (...) 
				{
					Errors += Format("error at %s:%i \n", (const char*)PathName, P.m_FirstSlfLineNo+line_no_err);
				};
					
			}
		}
		fclose (fp);
		if (!Errors.empty())
		{
			try {
				string ErrorFile = MakeFName((const char*)PathName, "err");
				FILE * fp = fopen (ErrorFile.c_str(), "w");
				fprintf (fp, "%s",Errors.c_str());
				fclose(fp);
				ErrorMessage(Format("Errors were written to file %s",  ErrorFile.c_str()));
			} 
			catch (...){
				AfxMessageBox ("Cannot write errors to paradigms.err ");
			}
		}
		else
		{
			ErrorMessage(Format("Successfully %s %i paradigms from \"%s\"" , bTestMode? "tested":"imported", ParadigmCount, PathName), "Confirmation");
		};
		if (pos == 0) break;
		PathName = D.GetNextPathName(pos);
	} // all selected files

}
catch (...)
{
	ErrorMessage (Format("some error has occurred (file=%s, line=%i", PathName, line_no));
};
	GetWizard()->m_bFullTrace = true;
}
Esempio n. 6
0
bool CBigrams::Initialize(string BigramsFileName)
{
	m_Word2Infos.clear();
	m_CorpusSize = 0;
	
	string IndexFile = MakeFName(BigramsFileName, "wrd_idx");
	fprintf (stderr,"load %s into memory\n", IndexFile.c_str()  );
	FILE *fp = 	fopen (IndexFile.c_str(), "r");
	if (!fp)
	{
		fprintf (stderr,"Cannot open %s", IndexFile.c_str());
		return false;
	}
	char buffer[1000];
	while (fgets (buffer, 1000, fp))
	{
		char word[1000];
		CBigramsWordInfo I;
		if (sscanf(buffer, "%[^ ] %u %u %u %u %u", word, 
			&I.m_Freq, 
			&I.m_FileOffset1, &I.m_FileLen1,
			&I.m_FileOffset2, &I.m_FileLen2) != 6)
		{
			fprintf (stderr,"Bad format in  %s", IndexFile.c_str());
			fclose(fp);
			return false;
		}
		I.m_Word = word;
		if (lower_bound(m_Word2Infos.begin(),  m_Word2Infos.end(), word, IsLessBigramsWordInfo()) != m_Word2Infos.end() )
		{
			fprintf (stderr,"A dublicate \"%s\" is found", word);
			fclose(fp);
			return false;
		}
		m_Word2Infos.push_back( I );
		m_CorpusSize += I.m_Freq;
	}
	fclose(fp);
	fprintf (stderr,"  open %s \n", BigramsFileName.c_str()  );
	if (m_Bigrams) fclose (m_Bigrams);

	string Bin1File = MakeFName(BigramsFileName, "bin1");
	fprintf (stderr, "  open %s \n", Bin1File.c_str()  );
	m_Bigrams = fopen (Bin1File.c_str(), "rb");
	if (!m_Bigrams)
	{
		fprintf (stderr, "Cannot open file %s", Bin1File.c_str());
		return false;
	}

	
	if (m_BigramsRev) fclose (m_BigramsRev);
	string Bin2File = MakeFName(BigramsFileName, "bin2");
	fprintf (stderr, "  open %s \n", Bin2File.c_str()  );
	m_BigramsRev = fopen (Bin2File.c_str(), "rb");
	if (!m_BigramsRev)
	{
		fprintf (stderr, "Cannot open file %s", Bin2File.c_str());
		return false;
	}

	return true;
}
Esempio n. 7
0
// Reads one text line from fp and parses it as an element count.
// Returns false when the line cannot be read or the parsed count is negative.
static bool ReadAnnotCount(FILE * fp, int& Count)
{
	char buffer[256];
	if (!fgets(buffer, sizeof(buffer), fp)) return false;
	Count = atoi(buffer);
	return Count >= 0;
}

// Loads the dictionary from the files derived from GrammarFileName via MakeFName:
//   "forms_autom" - loaded by m_pFormAutomat->Load,
//   "annot"       - flexia models, accent models, prefixes, lemma infos and NPSs,
//   "bases"       - read by m_Bases.ReadShortStringHolder.
// The "annot" file is closed on every path. After loading, CreateModelsIndex is called.
// Returns false when a file cannot be opened or a count/prefix line is missing or invalid.
bool CMorphDict::Load(string GrammarFileName)
{
	//fprintf (stderr," open %s\n", GrammarFileName.c_str());
	if (!m_pFormAutomat->Load(MakeFName(GrammarFileName,"forms_autom")))
		return false;

	string PrecompiledFile = MakeFName(GrammarFileName,"annot");
	FILE * fp = fopen(PrecompiledFile.c_str(), "rb");
	if (!fp)
	{
		ErrorMessage (Format("Cannot open %s", PrecompiledFile.c_str()));
		return false;
	};
	
	ReadFlexiaModels(fp, m_FlexiaModels);

	ReadAccentModels(fp, m_AccentModels);

	int Count = 0;

	// prefixes: the count line is followed by one prefix per line
	if (!ReadAnnotCount(fp, Count))
	{
		fclose(fp);
		return false;
	}
	// add empty prefix
	m_Prefixes.resize(1,"");
	for (int i=0; i < Count; i++)
	{
		char buffer[256];
		if (!fgets(buffer, sizeof(buffer), fp))
		{
			fclose(fp);
			return false;
		}
		string q = buffer;
		Trim(q);
		assert (!q.empty());
		m_Prefixes.push_back(q);
	};

	// lemma infos
	if (!ReadAnnotCount(fp, Count))
	{
		fclose(fp);
		return false;
	}
	m_LemmaInfos.clear();
	ReadVectorInner(fp, m_LemmaInfos, Count);

	// NPSs, one per flexia model
	if (!ReadAnnotCount(fp, Count))
	{
		fclose(fp);
		return false;
	}
	m_NPSs.clear();
	ReadVectorInner(fp, m_NPSs, Count);
	assert (m_NPSs.size()  == m_FlexiaModels.size());

	fclose(fp);

	m_Bases.ReadShortStringHolder(MakeFName(GrammarFileName,"bases"));

	CreateModelsIndex();

	return true;
};