void CCmpLexicon::WriteToken(const char* inText, obit_stream& ioBits) { Lexicon::iterator i = lower_bound(lexicon.begin() + 1, lexicon.end(), LexEntry(inText, 0)); if (i != lexicon.end() && strcmp(inText, (*i).text) == 0) { Push(ioBits, (*i).code, (*i).cnt); } else { Push(ioBits, lexicon[0].code, lexicon[0].cnt); for (const char* p = inText; *p; ++p) { RestChar e; e.ch = static_cast<unsigned char>(*p); RestCharacters::iterator i = lower_bound(rest.begin(), rest.end(), e); assert(i != rest.end()); assert((*i).ch == static_cast<unsigned char>(*p)); Push(ioBits, (*i).code, (*i).cnt); } Push(ioBits, rest[0].code, rest[0].cnt); } }
bool InEnglish(const string &word) { for(Lexicon::iterator it = english.begin(); it != english.end(); ++it) if(word == *it) return true; return false; }
// Serialize the lexicon to `out`, one entry per line:
//   <word> [<tag> <count>]...
// The output bytes are unchanged from the original; only the per-line
// stream flush (std::endl) was removed.
void writeLexicon(ostream &out, Lexicon const &lexicon)
{
    for (Lexicon::const_iterator iter = lexicon.begin(); iter != lexicon.end(); ++iter)
    {
        out << iter->first;
        for (map<string, size_t>::const_iterator tagIter = iter->second.begin();
             tagIter != iter->second.end(); ++tagIter)
        {
            out << " " << tagIter->first << " " << tagIter->second;
        }
        // FIX: '\n' instead of endl — endl forced a flush after every
        // lexicon entry, which is needlessly slow for large lexica.
        out << '\n';
    }
}
//********************************************************************** // // M A I N // //********************************************************************** int main(int argc, char *argv[]) { QCoreApplication a(argc, argv); QsLogging::initQsLog(); if (argc<1) { cerr << USAGE; return EXIT_FAILURE; } QsLogging::initQsLog(); readCommandLineArguments(argc,argv); if (param.help) { cerr << HELP; return EXIT_FAILURE; } string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); if ( (!param.language.size()) && (!param.codeFile.size()) ) { cerr << "no codefile nor language specified !" << endl; cerr << "Use e.g option '-l fre'." << endl; cerr << "Option '-h' gives full help" << endl; return EXIT_FAILURE; } else if ( param.language.size() ) { param.codeFile=resourcesPath+"/LinguisticProcessings/"+param.language+"/code-"+param.language+".xml"; } cerr << "read proccodeManager from file " << param.codeFile << "..." << endl; PropertyCodeManager propcodemanager; propcodemanager.readFromXmlFile(param.codeFile); cerr << "get macroManager..." 
<< endl; const PropertyManager& macroManager = propcodemanager.getPropertyManager("MACRO"); const PropertyAccessor& propertyAccessor = macroManager.getPropertyAccessor(); set<LinguisticCode> referenceProperties; for ( std::vector<string>::const_iterator macro = param.macro.begin() ; macro != param.macro.end() ; macro++ ) { cerr << "referenceProperties.insert(" << *macro << ")" << endl; LinguisticCode referenceProperty = macroManager.getPropertyValue(*macro); referenceProperties.insert(referenceProperty); } cerr << "referencePropertySet= "; set<LinguisticCode>::iterator propIt = referenceProperties.begin(); if ( propIt != referenceProperties.end() ) { const std::string& symbol = macroManager.getPropertySymbolicValue(*propIt); cerr << symbol; propIt++; } for ( ; propIt != referenceProperties.end() ; propIt++ ) { const std::string& symbol = macroManager.getPropertySymbolicValue(*propIt); cerr << ", " << symbol; } cerr << endl; Lexicon lex; // read all files and count terms vector<string>::const_iterator file=param.inputFiles.begin(), file_end=param.inputFiles.end(); for (;file!=file_end; file++) { ifstream fileIn((*file).c_str(), std::ifstream::binary); if (! fileIn) { cerr << "cannot open input file [" << *file << "]" << endl; continue; } BoWBinaryReader reader; try { reader.readHeader(fileIn); } catch (exception& e) { cerr << "Error: " << e.what() << endl; return EXIT_FAILURE; } switch (reader.getFileType()) { case BOWFILE_TEXT: { cerr << "Build lexicon from BoWText [" << *file << "]" << endl; try { readBowFileText(fileIn,reader, lex, propertyAccessor, referenceProperties); } catch (exception& e) { cerr << "Error: " << e.what() << endl; } break; } case BOWFILE_DOCUMENTST: { cerr << "ReadBoWFile: file contains a BoWDocumentST -> not treated" << endl; } case BOWFILE_DOCUMENT: { cerr << "ReadBoWFile: build BoWdocument from " << *file<< endl; BoWDocument* document=new BoWDocument(); try { cerr << "ReadBoWFile: extract terms... 
" << endl; readDocuments(fileIn,document,reader, lex, macroManager, propertyAccessor, referenceProperties); } catch (exception& e) { cerr << "Error: " << e.what() << endl; } fileIn.close(); delete document; break; } default: { cerr << "format of file " << reader.getFileTypeString() << " not managed" << endl; return EXIT_FAILURE; } } } // output stream (default is 'cout') std::ostream *s_out; // Manage output if ( param.outputFilename.length() == 0) s_out=&std::cout; else s_out = new std::ofstream(param.outputFilename.c_str(), std::ios_base::out | std::ios_base::binary | std::ios_base::trunc); // output lexicon Lexicon::const_iterator w=lex.begin(), w_end=lex.end(); for (;w!=w_end; w++) { (*s_out) << Common::Misc::limastring2utf8stdstring((*w).second.second) << "|" << Common::Misc::limastring2utf8stdstring((*w).first) << "|" << (*w).second.first << endl; } // Close output file (if any) if ( param.outputFilename.length() != 0) dynamic_cast<std::ofstream*>(s_out)->close(); return EXIT_SUCCESS; }
void CCmpLexicon::Reduce(HStreamBase& inFile)
{
    // Build canonical Huffman code tables for (a) the words retained in
    // the lexicon and (b) the fallback per-character ("rest") table,
    // and serialize both tables to inFile. The least frequent words are
    // dropped (and re-counted as character sequences) until the total
    // text size fits within max_size.

    HSwapStream<net_swapper> data(inFile);

    // Entry 0 of the lexicon is the escape symbol used by WriteToken
    // for words that do not survive the reduction.
    lexicon.push_back(LexEntry("\x1b", 0));

    // One slot per byte value; count starts at 1 (not 0) so every
    // character gets a code even if it never occurs.
    RestCharacters chars;
    uint32 i;
    for (i = 0; i < 256; ++i)
    {
        chars.push_back(RestChar());
        chars[i].ch = static_cast<unsigned char>(i);
        chars[i].cnt = 1; // was 0
        chars[i].code = 0;
    }

    uint32 n, h;

    // try to reduce the lexicon size to something reasonable.
    // Layout: A[0..n) is a heap of indices into A[n..2n) (the counts);
    // str[] holds the word texts in the same order.
    LexiconSet::iterator w;
    n = word_set.size();
    HAutoBuf<uint32> A_(new uint32[n * 2]);
    uint32* A = A_.get();
    HAutoBuf<const char*> str(new const char*[n]);
    uint32 s = 0;
    i = 0;
    for (w = word_set.begin(); w != word_set.end(); ++w, ++i)
    {
        A[i] = i + n;
        A[i + n] = (*w).second;
        // NOTE(review): keeps a raw pointer to the key text; assumes the
        // text outlives word_set (e.g. keys are externally-owned
        // const char*) — confirm.
        str[i] = (*w).first;
        s += strlen(str[i]) + 1;
    }

    // word_set.clear();
    word_set = LexiconSet();

    // Pop the least frequent words until the remaining text fits;
    // their characters (plus a terminating NUL) are charged to the
    // rest-character counts, and each drop bumps the escape count.
    h = n;
    make_heap(A, A + h, CntCompare(A));
    while (s > max_size)
    {
        const char* t = str[A[0] - n];
        ++lexicon.front().cnt;
        for (const char* p = t; *p; ++p)
            ++chars[static_cast<unsigned char>(*p)].cnt;
        ++chars[0].cnt;   // end-of-word marker
        s -= strlen(t) + 1;
        A[0] = A[h - 1];
        --h;
        pop_heap(A, A + h, CntCompare(A));
    }

    // Survivors enter the lexicon; keep entry 0 (escape) in place and
    // sort the rest so WriteToken can binary-search.
    for (i = 0; i < h; ++i)
        lexicon.push_back(LexEntry(str[A[i] - n], A[A[i]]));
    sort(lexicon.begin() + 1, lexicon.end());

    // In-place Huffman construction: A[n..2n) holds leaf weights,
    // A[0..n) a heap of indices. Merging rewrites entries into parent
    // links; the pass below turns parent links into code lengths.
    n = lexicon.size();
    // FIX: the original did 'A = new uint32[n * 2];' here, leaking that
    // allocation (A_ still owned, and eventually freed, only the first
    // buffer). Reset the owner instead, as the rest-character section
    // below already does.
    A_.reset(new uint32[n * 2]);
    A = A_.get();
    for (i = 0; i < n; ++i)
    {
        A[i] = i + n;
        A[i + n] = lexicon[i].cnt;
    }
    h = n;
    make_heap(A, A + h, CntCompare(A));
    while (h > 1)
    {
        uint32 m1 = A[0];
        A[0] = A[h - 1];
        --h;
        pop_heap(A, A + h, CntCompare(A));
        uint32 m2 = A[0];
        A[0] = A[h - 1];
        A[h] = A[m1] + A[m2];
        A[0] = h;
        A[m1] = A[m2] = h;
        pop_heap(A, A + h);
    }
    A[1] = 0;
    for (i = 2; i < 2 * n; ++i)
        A[i] = A[A[i]] + 1;        // depth = parent depth + 1
    for (i = 0; i < n; ++i)
        lexicon[i].cnt = A[i + n]; // cnt now holds the code length

    // Canonical code assignment from the per-length symbol counts.
    uint32 numl[32];
    uint32 firstcode[32];
    uint32 nextcode[32];
    for (i = 0; i < 32; ++i)
        numl[i] = 0;
    for (i = 0; i < n; ++i)
        ++numl[A[i + n]];
    firstcode[31] = 0;
    for (int l = 30; l >= 0; --l)
        firstcode[l] = (firstcode[l + 1] + numl[l + 1]) / 2;
    for (int l = 0; l < 32; ++l)
        nextcode[l] = firstcode[l];

    HAutoBuf<uint32> symbol_table(new uint32[n]);
    uint32 six[32];
    six[0] = 0;
    for (i = 1; i < 32; ++i)
        six[i] = six[i - 1] + numl[i - 1];
    for (i = 0; i < n; ++i)
    {
        uint32 li = A[i + n];
        lexicon[i].code = nextcode[li];
        symbol_table[six[li] + nextcode[li] - firstcode[li]] = i;
        ++nextcode[li];
    }

    // Serialize the word table header.
    data << n;
    for (i = 0; i < 32; ++i)
        data << firstcode[i];
    for (i = 0; i < 32; ++i)
        data << six[i];

    // Concatenate the symbol texts (in symbol-table order) into one
    // owned buffer and repoint the lexicon entries into it.
    uint32 symbol_text_length = 0;
    for (i = 0; i < n; ++i)
        symbol_text_length += strlen(lexicon[symbol_table[i]].text) + 1;
    // NOTE(review): symbol_text is a member; any previous value is
    // leaked here — confirm Reduce is only called once per object.
    symbol_text = new char[symbol_text_length];
    char* d = symbol_text;
    for (i = 0; i < n; ++i)
    {
        strcpy(d, lexicon[symbol_table[i]].text);
        lexicon[symbol_table[i]].text = d;
        symbol_table[i] = static_cast<uint32>(d - symbol_text);
        d += strlen(d) + 1;
    }
    data << symbol_text_length;
    data.Write(symbol_text, symbol_text_length);

    // and now repeat all steps for the rest characters

    // Count how many characters we actually have:
    n = 0;
    rest = chars;
    // for (RestCharacters::iterator i = chars.begin(); i != chars.end(); ++i)
    // {
    //     if ((*i).cnt != 0)
    //     {
    //         rest.push_back(*i);
    //         rest.back().cnt = 0;
    //     }
    // }
    n = rest.size();

    A_.reset(new uint32[n * 2]);
    A = A_.get();
    for (i = 0; i < n; ++i)
    {
        A[i] = i + n;
        A[i + n] = rest[i].cnt;
    }
    h = n;
    make_heap(A, A + h, CntCompare(A));
    while (h > 1)
    {
        uint32 m1 = A[0];
        A[0] = A[h - 1];
        --h;
        pop_heap(A, A + h, CntCompare(A));
        uint32 m2 = A[0];
        A[0] = A[h - 1];
        A[h] = A[m1] + A[m2];
        A[0] = h;
        A[m1] = A[m2] = h;
        pop_heap(A, A + h);
    }
    A[1] = 0;
    for (i = 2; i < 2 * n; ++i)
        A[i] = A[A[i]] + 1;
    for (i = 0; i < n; ++i)
        rest[i].cnt = A[i + n];

    // Canonical codes for the character table, same scheme as above.
    for (i = 0; i < 32; ++i)
        numl[i] = 0;
    for (i = 0; i < n; ++i)
        ++numl[A[i + n]];
    firstcode[31] = 0;
    for (int l = 30; l >= 0; --l)
        firstcode[l] = (firstcode[l + 1] + numl[l + 1]) / 2;
    for (int l = 0; l < 32; ++l)
        nextcode[l] = firstcode[l];
    six[0] = 0;
    for (i = 1; i < 32; ++i)
        six[i] = six[i - 1] + numl[i - 1];

    HAutoBuf<unsigned char> char_symbol_table(new unsigned char[n]);
    for (i = 0; i < n; ++i)
    {
        uint32 li = A[i + n];
        rest[i].code = nextcode[li];
        char_symbol_table[six[li] + nextcode[li] - firstcode[li]] = rest[i].ch;
        ++nextcode[li];
    }
    data << n;
    for (i = 0; i < 32; ++i)
        data << firstcode[i];
    for (i = 0; i < 32; ++i)
        data << six[i];
    data.Write(char_symbol_table.get(), n);
}