bool CMorphDict::Save(string GrammarFileName) const { try { if (!m_pFormAutomat->Save(MakeFName(GrammarFileName,"forms_autom"))) { ErrorMessage (Format("Cannot write to %s", MakeFName(GrammarFileName,"forms_autom").c_str())); return false; } string PrecompiledFile = MakeFName(GrammarFileName,"annot"); FILE * fp = fopen(PrecompiledFile.c_str(), "wb"); if (!fp) { ErrorMessage (Format("Cannot write to %s", PrecompiledFile.c_str())); return false; }; WriteFlexiaModels(fp, m_FlexiaModels); WriteAccentModels(fp, m_AccentModels); assert (!m_Prefixes.empty() && m_Prefixes[0].empty()); // do not write the first empty prefix, instead add it manually each time during loading fprintf(fp, "%i\n", m_Prefixes.size()-1); for (size_t i=1; i < m_Prefixes.size(); i++) fprintf (fp, "%s\n",m_Prefixes[i].c_str()); fprintf(fp, "%i\n", m_LemmaInfos.size()); if (!WriteVectorInner(fp, m_LemmaInfos)) return false; assert (m_NPSs.size() == m_FlexiaModels.size()); fprintf(fp, "%i\n", m_NPSs.size()); if (!WriteVectorInner(fp, m_NPSs)) return false; fclose(fp); if (!m_Bases.WriteShortStringHolder(MakeFName(GrammarFileName,"bases"))) { fprintf(stderr, "Cannot save bases\n"); return false; }; return true; } catch (...) { fprintf (stderr, "Cannot save CMorphDict"); return false; }; };
int main(int argc, char* argv[]) { if (argc != 2) PrintUsageAndExit(); vector<CWordInfo> WordInfos; string BigramsFileName = argv[1]; string WordFreqFileName = MakeFName(BigramsFileName, "wrd_freq"); // читаем статистику слов if (!ReadWordFreqs(WordFreqFileName, WordInfos)) return 1; // создаем бинарный файл для биграмм if (!BuildBigramsBin(BigramsFileName, WordInfos, true)) return 1; { string RevFile = MakeFName(BigramsFileName, "rev"); string Command = Format ("gsort -k 2,3 <%s >%s", BigramsFileName.c_str(), RevFile.c_str()); fprintf (stderr, "%s\n", Command.c_str()); if (system (Command.c_str()) != 0) { fprintf (stderr,"!!! an exception occurred (cannot sort) !!!\n"); return 1; }; if (!BuildBigramsBin(RevFile, WordInfos, false)) return 1; fprintf (stderr, "remove %s\n", RevFile.c_str()); remove(RevFile.c_str()); } string OutIndexFile = MakeFName(BigramsFileName, "wrd_idx"); fprintf (stderr, "create file %s\n", OutIndexFile.c_str()); FILE* fp = fopen (OutIndexFile.c_str(), "w"); if (!fp) { fprintf (stderr, "cannot open file %s\n", OutIndexFile.c_str()); return 1; } for (vector<CWordInfo>::const_iterator it = WordInfos.begin(); it != WordInfos.end(); it++) { fprintf (fp, "%s %u %u %u %u %u\n",it->m_WordStr.c_str(), it->m_Freq, it->m_FileOffset1,it->m_FileLen1, it->m_FileOffset2,it->m_FileLen2 ); } fclose (fp); return 0; }
bool CGraphmatFile :: LoadFileToGraphan (const string& CommandLine) { try { m_SourceFileName = CommandLine.c_str(); m_GraFileName = MakeFName (m_SourceFileName,"gra"); m_XmlMacSynName = MakeFName (m_SourceFileName,"xml"); m_SaveTxtName = MakeFName (m_SourceFileName,"tds"); if (IsHtmlFile(m_SourceFileName)) { HTML Convert(m_SourceFileName); string Text = Convert.getText(); if (!InitInputBuffer(Text)) { m_LastError = Format("Cannot init inpur buffer for %i bytes", Text.length()); return false; } if (m_bSaveHtmlFileToTdsFile) WriteVector(m_SaveTxtName, GetInputBuffer()); } else { if (access(m_SourceFileName.c_str(), 04) != 0) return false; string Text; LoadFileToString(m_SourceFileName, Text); if (!InitInputBuffer(Text)) { m_LastError = Format("Cannot init inpur buffer for %i bytes", Text.length()); return false; }; }; return GraphmatMain (); } catch (CExpc& C) { m_LastError = C.m_ErrorCode; return false; } catch (...) { m_LastError = "general exception"; return false; }; };
bool BuildBigramsBin(string BigramsFileName, vector<CWordInfo>& WordInfos, bool bFirstOffset1) { string WordFreqFileName = MakeFName(BigramsFileName, "wrd_freq"); fprintf (stderr, "open file %s\n", BigramsFileName.c_str()); FILE * in_fp = fopen (BigramsFileName.c_str(), "rb"); if (!in_fp) { fprintf (stderr, "cannot open file %s\n", BigramsFileName.c_str()); return false; } string BinName = MakeFName(BigramsFileName, bFirstOffset1 ? "bin1" : "bin2"); fprintf (stderr, "write to file %s\n", BinName.c_str()); FILE * out_fp = fopen (BinName.c_str(), "wb"); if (!out_fp) { fprintf (stderr, "cannot open file %s\n", BinName.c_str()); return false; } char buffer[10000]; size_t BigramsCount = 0; vector<CWordInfo>::iterator curr_it = WordInfos.end(); while (fgets(buffer, 10000, in_fp)) { if ((BigramsCount % 100000) == 0) fprintf (stderr, "%u \r", BigramsCount); char w1[500], w2[500]; int Bigramsfreq; int Res = sscanf(buffer, "%[^\t]\t%[^\t]\t%i", w1, w2, &Bigramsfreq); if ( Res != 3) { fprintf (stderr, "%s: skip line %s (scanf returned %i)\n", BigramsFileName.c_str(), buffer, Res); continue; } const char* w = bFirstOffset1 ? w1 : w2; const char* conv_w = bFirstOffset1 ? w2 : w1; bool bChange = false; if ( (curr_it == WordInfos.end()) || (w != curr_it->m_WordStr) ) { if (curr_it == WordInfos.end()) curr_it = lower_bound (WordInfos.begin(), WordInfos.end(), w); else { curr_it++; curr_it = lower_bound (curr_it, WordInfos.end(), w); } bChange = true; } if (curr_it == WordInfos.end()) { fprintf (stderr, "Cannot find word \"%s\" in %s\n", w, buffer); return false; } else { if (bFirstOffset1) { if (bChange && (curr_it->m_FileOffset1 != UINT_MAX)) { fprintf (stderr, "Wrong order \"%s\" from line \"%s\" (curr_it->m_FileOffset1=%u)\n", w, buffer, curr_it->m_FileOffset1); return false; } if (curr_it->m_FileOffset1 == UINT_MAX) curr_it->m_FileOffset1 = BigramsCount; curr_it->m_FileLen1++; } else { if (bChange && (curr_it->m_FileOffset2 != UINT_MAX)) { fprintf (stderr, "Wrong order \"%s\" from %s\n", w, buffer); return false; } if (curr_it->m_FileOffset2 == UINT_MAX) curr_it->m_FileOffset2 = BigramsCount; curr_it->m_FileLen2++; } vector<CWordInfo>::const_iterator conv_curr_it = lower_bound(WordInfos.begin(), WordInfos.end(), conv_w); if ( conv_curr_it == WordInfos.end() || (conv_curr_it->m_WordStr != conv_w) ) { fprintf (stderr, "Cannot find word \"%s\" in %s\n", conv_w, buffer); return false; } else { size_t ConvWordNo = conv_curr_it - WordInfos.begin(); // write the other item of the bigram fwrite(&ConvWordNo, 1, sizeof(ConvWordNo), out_fp); // write the frequence of bigram fwrite(&Bigramsfreq, 1, sizeof(Bigramsfreq), out_fp); } } BigramsCount++; } fprintf (stderr, "%u \n", BigramsCount); fclose(in_fp); fclose (out_fp); return true; }
//---------------------------------------------------------------------------- void CMorphwizardView::OnToolsImport() { int line_no = 0; CString PathName; try { CFileDialog D(TRUE, "slf", "paradigms.slf",OFN_ALLOWMULTISELECT|OFN_HIDEREADONLY | OFN_OVERWRITEPROMPT); char file_buffer[10000]; file_buffer[0] = 0; D.m_ofn.lpstrFile = file_buffer; D.m_ofn.nMaxFile = 10000; if (D.DoModal() != IDOK) return; POSITION pos = D.GetStartPosition(); if (pos == 0) return; bool bTestMode = ::MessageBox (0,"Test it or import?","MorphWizard",MB_YESNO ) == IDYES; GetWizard()->m_bFullTrace = false; PathName = D.GetNextPathName(pos); while (true) // all selected files { FILE * fp = fopen (PathName,"r"); if (!fp) { AfxMessageBox ("Cannot open file"); return; }; GetWizard()->log (std::string("import file ") + (const char*)PathName); CProgressMeterRML meter; meter.SetFileMaxPos(fp); CString info = (bTestMode?"Test":"Import"); info+= "ing file " + PathName + "..."; meter.SetInfo(info); int ParadigmCount = 0; string Errors; line_no = 0; bool bError; CDumpParadigm P; while (GetWizard()->ReadNextParadigmFromFile(fp, P, line_no, bError, Errors)) { if (!bError) { int line_no_err = 0; ParadigmCount++; try { if (bTestMode) { std::string lemm; CFlexiaModel Dummy1; CAccentModel AccentModel; BYTE Dummy; GetWizard()->slf_to_mrd (P.m_SlfStr, lemm, Dummy1, AccentModel, Dummy, line_no_err); } else { WORD SessionNo = GetWizard()->RegisterSession(P.m_Session); GetWizard()->add_lemma(P.m_SlfStr, P.m_TypeGrammemsStr, P.m_PrefixesStr, line_no_err, SessionNo); }; meter.SetFilePos(); } catch (CExpc C) { Errors += Format("%s (%s:%i) \n", C.m_strCause.c_str(), (const char*)PathName, P.m_FirstSlfLineNo+line_no_err); } catch (...) { Errors += Format("error at %s:%i \n", (const char*)PathName, P.m_FirstSlfLineNo+line_no_err); }; } } fclose (fp); if (!Errors.empty()) { try { string ErrorFile = MakeFName((const char*)PathName, "err"); FILE * fp = fopen (ErrorFile.c_str(), "w"); fprintf (fp, "%s",Errors.c_str()); fclose(fp); ErrorMessage(Format("Errors were written to file %s", ErrorFile.c_str())); } catch (...){ AfxMessageBox ("Cannot write errors to paradigms.err "); } } else { ErrorMessage(Format("Successfully %s %i paradigms from \"%s\"" , bTestMode? "tested":"imported", ParadigmCount, PathName), "Confirmation"); }; if (pos == 0) break; PathName = D.GetNextPathName(pos); } // all selected files } catch (...) { ErrorMessage (Format("some error has occurred (file=%s, line=%i", PathName, line_no)); }; GetWizard()->m_bFullTrace = true; }
bool CBigrams::Initialize(string BigramsFileName) { m_Word2Infos.clear(); m_CorpusSize = 0; string IndexFile = MakeFName(BigramsFileName, "wrd_idx"); fprintf (stderr,"load %s into memory\n", IndexFile.c_str() ); FILE *fp = fopen (IndexFile.c_str(), "r"); if (!fp) { fprintf (stderr,"Cannot open %s", IndexFile.c_str()); return false; } char buffer[1000]; while (fgets (buffer, 1000, fp)) { char word[1000]; CBigramsWordInfo I; if (sscanf(buffer, "%[^ ] %u %u %u %u %u", word, &I.m_Freq, &I.m_FileOffset1, &I.m_FileLen1, &I.m_FileOffset2, &I.m_FileLen2) != 6) { fprintf (stderr,"Bad format in %s", IndexFile.c_str()); fclose(fp); return false; } I.m_Word = word; if (lower_bound(m_Word2Infos.begin(), m_Word2Infos.end(), word, IsLessBigramsWordInfo()) != m_Word2Infos.end() ) { fprintf (stderr,"A dublicate \"%s\" is found", word); fclose(fp); return false; } m_Word2Infos.push_back( I ); m_CorpusSize += I.m_Freq; } fclose(fp); fprintf (stderr," open %s \n", BigramsFileName.c_str() ); if (m_Bigrams) fclose (m_Bigrams); string Bin1File = MakeFName(BigramsFileName, "bin1"); fprintf (stderr, " open %s \n", Bin1File.c_str() ); m_Bigrams = fopen (Bin1File.c_str(), "rb"); if (!m_Bigrams) { fprintf (stderr, "Cannot open file %s", Bin1File.c_str()); return false; } if (m_BigramsRev) fclose (m_BigramsRev); string Bin2File = MakeFName(BigramsFileName, "bin2"); fprintf (stderr, " open %s \n", Bin2File.c_str() ); m_BigramsRev = fopen (Bin2File.c_str(), "rb"); if (!m_BigramsRev) { fprintf (stderr, "Cannot open file %s", Bin2File.c_str()); return false; } return true; }
bool CMorphDict::Load(string GrammarFileName) { //fprintf (stderr," open %s\n", GrammarFileName.c_str()); if (!m_pFormAutomat->Load(MakeFName(GrammarFileName,"forms_autom"))) return false; string PrecompiledFile = MakeFName(GrammarFileName,"annot"); FILE * fp = fopen(PrecompiledFile.c_str(), "rb"); if (!fp) { ErrorMessage (Format("Cannot open %s", PrecompiledFile.c_str())); return false; }; ReadFlexiaModels(fp, m_FlexiaModels); ReadAccentModels(fp, m_AccentModels); int Count; char buffer[256]; { if (!fgets(buffer, 256, fp)) return false; Count = atoi(buffer); } // add empty prefix m_Prefixes.resize(1,""); for (size_t i=0; i < Count; i++) { char buffer[256]; if (!fgets(buffer, 256, fp)) return false; string q = buffer; Trim(q); assert (!q.empty()); m_Prefixes.push_back(q); }; { if (!fgets(buffer, 256, fp)) return false; Count = atoi(buffer); } m_LemmaInfos.clear(); ReadVectorInner(fp, m_LemmaInfos, Count); { if (!fgets(buffer, 256, fp)) return false; Count = atoi(buffer); } m_NPSs.clear(); ReadVectorInner(fp, m_NPSs, Count); assert (m_NPSs.size() == m_FlexiaModels.size()); fclose(fp); m_Bases.ReadShortStringHolder(MakeFName(GrammarFileName,"bases")); CreateModelsIndex(); return true; };