Пример #1
0
void CCmpLexicon::WriteToken(const char* inText, obit_stream& ioBits)
{
	Lexicon::iterator i = lower_bound(lexicon.begin() + 1, lexicon.end(), LexEntry(inText, 0));
	if (i != lexicon.end() && strcmp(inText, (*i).text) == 0)
	{
		Push(ioBits, (*i).code, (*i).cnt);
	}
	else
	{
		Push(ioBits, lexicon[0].code, lexicon[0].cnt);

		for (const char* p = inText; *p; ++p)
		{
			RestChar e;
			e.ch = static_cast<unsigned char>(*p);
			RestCharacters::iterator i = lower_bound(rest.begin(), rest.end(), e); 
			assert(i != rest.end());
			assert((*i).ch == static_cast<unsigned char>(*p));
			
			Push(ioBits, (*i).code, (*i).cnt);
		}
			
		Push(ioBits, rest[0].code, rest[0].cnt);
	}
}
Пример #2
0
bool InEnglish(const string &word)
{
  for(Lexicon::iterator it = english.begin(); it != english.end(); ++it)
    if(word == *it) return true;

  return false;
}
Пример #3
0
// Writes `lexicon` to `out`, one entry per line.
//
// Each line starts with the entry key, followed by " <tag> <count>" for every
// element of the entry's tag map, and is terminated with endl (which also
// flushes the stream).
void writeLexicon(ostream &out, Lexicon const &lexicon)
{
	Lexicon::const_iterator entry = lexicon.begin();
	while (entry != lexicon.end())
	{
		out << entry->first;

		map<string, size_t> const &tags = entry->second;
		map<string, size_t>::const_iterator tag = tags.begin();
		while (tag != tags.end())
		{
			out << " " << tag->first << " " << tag->second;
			++tag;
		}

		out << endl;
		++entry;
	}
}
Пример #4
0
//**********************************************************************
//
// M A I N
//
//**********************************************************************
int main(int argc, char *argv[])
{
  QCoreApplication a(argc, argv);
  QsLogging::initQsLog();
  if (argc<1) {
        cerr << USAGE;
        return EXIT_FAILURE;
    }
    QsLogging::initQsLog();
    readCommandLineArguments(argc,argv);
    if (param.help) {
        cerr << HELP;
        return EXIT_FAILURE;
    }


    string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES"));
    string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF"));

    if ( (!param.language.size()) && (!param.codeFile.size()) ) {
        cerr << "no codefile nor language specified !" << endl;
        cerr << "Use e.g option '-l fre'." << endl;
        cerr << "Option '-h' gives full help" << endl;
        return EXIT_FAILURE;
    }
    else if ( param.language.size() ) {
        param.codeFile=resourcesPath+"/LinguisticProcessings/"+param.language+"/code-"+param.language+".xml";
    }

    cerr << "read proccodeManager from file " << param.codeFile << "..." << endl;
    PropertyCodeManager propcodemanager;
    propcodemanager.readFromXmlFile(param.codeFile);
    cerr << "get macroManager..." << endl;
    const PropertyManager& macroManager = propcodemanager.getPropertyManager("MACRO");
    const PropertyAccessor& propertyAccessor = macroManager.getPropertyAccessor();
    set<LinguisticCode> referenceProperties;
    for ( std::vector<string>::const_iterator macro = param.macro.begin() ;
            macro != param.macro.end() ; macro++ ) {
        cerr << "referenceProperties.insert(" << *macro << ")" << endl;
        LinguisticCode referenceProperty = macroManager.getPropertyValue(*macro);
        referenceProperties.insert(referenceProperty);
    }

    cerr << "referencePropertySet= ";
    set<LinguisticCode>::iterator propIt = referenceProperties.begin();
    if ( propIt != referenceProperties.end() ) {
        const std::string& symbol = macroManager.getPropertySymbolicValue(*propIt);
        cerr << symbol;
        propIt++;
    }
    for ( ; propIt != referenceProperties.end() ; propIt++ ) {
        const std::string& symbol = macroManager.getPropertySymbolicValue(*propIt);
        cerr << ", " << symbol;
    }
    cerr << endl;

    Lexicon lex;

    // read all files and count terms
    vector<string>::const_iterator
    file=param.inputFiles.begin(),
         file_end=param.inputFiles.end();
    for (;file!=file_end; file++) {

        ifstream fileIn((*file).c_str(), std::ifstream::binary);
        if (! fileIn) {
            cerr << "cannot open input file [" << *file << "]" << endl;
            continue;
        }
        BoWBinaryReader reader;
        try {
            reader.readHeader(fileIn);
        }
        catch (exception& e) {
            cerr << "Error: " << e.what() << endl;
            return EXIT_FAILURE;
        }

        switch (reader.getFileType()) {
        case BOWFILE_TEXT: {
            cerr << "Build lexicon from BoWText [" << *file << "]" << endl;
            try {
                readBowFileText(fileIn,reader, lex, propertyAccessor, referenceProperties);
            }
            catch (exception& e) {
                cerr << "Error: " << e.what() << endl;
            }
            break;
        }
        case BOWFILE_DOCUMENTST: {
            cerr << "ReadBoWFile: file contains a BoWDocumentST  -> not treated" << endl;
        }
        case BOWFILE_DOCUMENT: {
            cerr << "ReadBoWFile: build BoWdocument from  " << *file<< endl;
            BoWDocument* document=new BoWDocument();
            try {
                cerr << "ReadBoWFile: extract terms... " << endl;
                readDocuments(fileIn,document,reader, lex, macroManager, propertyAccessor, referenceProperties);
            }
            catch (exception& e) {
                cerr << "Error: " << e.what() << endl;
            }
            fileIn.close();
            delete document;
            break;
        }
        default: {
            cerr << "format of file " << reader.getFileTypeString() << " not managed"
                 << endl;
            return EXIT_FAILURE;
        }
        }
    }

    // output stream (default is 'cout')
    std::ostream *s_out;

    // Manage output
    if ( param.outputFilename.length() == 0) s_out=&std::cout;
    else s_out = new std::ofstream(param.outputFilename.c_str(), std::ios_base::out | std::ios_base::binary | std::ios_base::trunc);

    // output lexicon
    Lexicon::const_iterator
    w=lex.begin(),
      w_end=lex.end();
    for (;w!=w_end; w++) {
        (*s_out) << Common::Misc::limastring2utf8stdstring((*w).second.second) << "|"
        << Common::Misc::limastring2utf8stdstring((*w).first) << "|"
        << (*w).second.first << endl;
    }

    // Close output file (if any)
    if (  param.outputFilename.length() != 0)
        dynamic_cast<std::ofstream*>(s_out)->close();

    return EXIT_SUCCESS;
}
Пример #5
0
// Turns the collected word_set into the final lexicon and writes the
// resulting tables to inFile.
//
// Steps, in order:
//   1. Words are removed from the candidate set until the summed text size
//      (strlen + 1 per word) no longer exceeds max_size. Every removed word
//      bumps the count of lexicon entry 0 and the counts of its characters.
//   2. The surviving words are appended to `lexicon`, sorted (entry 0 stays
//      in front), and a code length and code value are computed per entry.
//   3. n, firstcode[32], six[32], the text length and the concatenated entry
//      texts are written to the stream.
//   4. The same length/code computation is repeated for the 256 byte values
//      (`rest`), and n, firstcode[32], six[32] and the byte table are written.
//
// inFile - stream receiving the tables (written through a net_swapper
//          swapping stream).
//
// NOTE(review): the heap handling below looks like the in-place
// minimum-redundancy (Huffman) code-length calculation; the statement order
// is significant and has been left untouched.
void CCmpLexicon::Reduce(HStreamBase& inFile)
{
	HSwapStream<net_swapper> data(inFile);
	
	// Entry 0: its cnt is incremented once for every word dropped below, and
	// WriteToken emits its code before spelling out an unknown token.
	lexicon.push_back(LexEntry("\x1b", 0));
	
	// One counter per byte value; every count starts at 1.
	RestCharacters chars;
	uint32 i;
	
	for (i = 0; i < 256; ++i)
	{
		chars.push_back(RestChar());
		chars[i].ch = static_cast<unsigned char>(i);
		chars[i].cnt = 1;	// was 0
		chars[i].code = 0;
	}

	uint32 n, h;

	// try to reduce the lexicon size to something reasonable
	LexiconSet::iterator w;

	n = word_set.size();
	HAutoBuf<uint32> A_(new uint32[n * 2]);
	uint32* A = A_.get();
	
	HAutoBuf<const char*> str(new const char*[n]);
	
	// A[0, n) holds indexes into A[n, 2n), which holds the word counts;
	// str[i] is the text of word i and s the total text size incl. NULs.
	uint32 s = 0;
	i = 0;
	for (w = word_set.begin(); w != word_set.end(); ++w, ++i)
	{
		A[i] = i + n;
		A[i + n] = (*w).second;
		str[i] = (*w).first;
		s += strlen(str[i]) + 1;
	}

//	word_set.clear();
	word_set = LexiconSet();
	
	h = n;
	make_heap(A, A + h, CntCompare(A));
	
	// Drop the word at the top of the heap until the text fits in max_size.
	// presumably CntCompare puts the lowest count on top - confirm.
	while (s > max_size)
	{
		const char* t = str[A[0] - n];
		
		++lexicon.front().cnt;
		
		// The dropped word will be spelled out, so count its characters and
		// one occurrence of character 0 per word.
		for (const char* p = t; *p; ++p)
			++chars[static_cast<unsigned char>(*p)].cnt;
		++chars[0].cnt;
		
		s -= strlen(t) + 1;
		A[0] = A[h - 1];
		--h;
		pop_heap(A, A + h, CntCompare(A));
	}
	
	// Whatever is still in the heap becomes a lexicon entry.
	for (i = 0; i < h; ++i)
		lexicon.push_back(LexEntry(str[A[i] - n], A[A[i]]));

	sort(lexicon.begin() + 1, lexicon.end());
	
	n = lexicon.size();
	// Go through the owning buffer: the previous array is released here and
	// the new one on scope exit. A bare new[] assigned to A was never freed.
	A_.reset(new uint32[n * 2]);
	A = A_.get();

	for (i = 0; i < n; ++i)
	{
		A[i] = i + n;
		A[i + n] = lexicon[i].cnt;
	}
	
	h = n;
	make_heap(A, A + h, CntCompare(A));
	
	// Repeatedly merge the two top items; A[h] receives their combined weight
	// and both items are redirected to h as their parent.
	while (h > 1)
	{
		uint32 m1 = A[0];
		A[0] = A[h - 1];
		--h;
		pop_heap(A, A + h, CntCompare(A));
		
		uint32 m2 = A[0];
		A[0] = A[h - 1];
		
		A[h] = A[m1] + A[m2];
		A[0] = h;
		A[m1] = A[m2] = h;
		
		// NOTE(review): no CntCompare here, unlike the other heap calls - confirm.
		pop_heap(A, A + h);
	}
	
	// Convert parent links into depths: each slot is its parent's depth + 1.
	A[1] = 0;
	for (i = 2; i < 2 * n; ++i)
		A[i] = A[A[i]] + 1;
	
	// From here on cnt holds the computed depth instead of the frequency.
	for (i = 0; i < n; ++i)
		lexicon[i].cnt = A[i + n];

	uint32 numl[32];
	uint32 firstcode[32];
	uint32 nextcode[32];
	
	for (i = 0; i < 32; ++i)
		numl[i] = 0;
	
	// numl[l] = number of entries with depth l.
	// NOTE(review): assumes every depth is below 32 - not checked here.
	for (i = 0; i < n; ++i)
		++numl[A[i + n]];
	
	firstcode[31] = 0;
	for (int l = 30; l >= 0; --l)
		firstcode[l] = (firstcode[l + 1] + numl[l + 1]) / 2;
	
	for (int l = 0; l < 32; ++l)
		nextcode[l] = firstcode[l];
	
	HAutoBuf<uint32> symbol_table(new uint32[n]);
	
	// six[l] = number of entries with a depth smaller than l.
	uint32 six[32];
	six[0] = 0;
	for (i = 1; i < 32; ++i)
		six[i] = six[i - 1] + numl[i - 1];
	
	// Hand out consecutive codes per depth and record, per table position,
	// which lexicon entry was placed there.
	for (i = 0; i < n; ++i)
	{
		uint32 li = A[i + n];
		
		lexicon[i].code = nextcode[li];
		symbol_table[six[li] + nextcode[li] - firstcode[li]] = i;
		++nextcode[li];
	}
	
	data << n;
	for (i = 0; i < 32; ++i)	data << firstcode[i];
	for (i = 0; i < 32; ++i)	data << six[i];

	uint32 symbol_text_length = 0;
	for (i = 0; i < n; ++i)
		symbol_text_length += strlen(lexicon[symbol_table[i]].text) + 1;
	
	// Copy all entry texts, in symbol_table order, into one contiguous buffer
	// and repoint every entry at its copy.
	symbol_text = new char[symbol_text_length];
	char* d = symbol_text;
	
	for (i = 0; i < n; ++i)
	{
		strcpy(d, lexicon[symbol_table[i]].text);
		lexicon[symbol_table[i]].text = d;
		// NOTE(review): this offset is not read again in this function.
		symbol_table[i] = static_cast<uint32>(d - symbol_text);
		d += strlen(d) + 1;
	}

	data << symbol_text_length;
	data.Write(symbol_text, symbol_text_length);

	// and now repeat all steps for the rest characters
	
	// Count how many characters we actually have:
	n = 0;
	
	rest = chars;
//	for (RestCharacters::iterator i = chars.begin(); i != chars.end(); ++i)
//	{
//		if ((*i).cnt != 0)
//		{
//			rest.push_back(*i);
//			rest.back().cnt = 0;
//		}
//	}

	n = rest.size();
	A_.reset(new uint32[n * 2]);
	A = A_.get();

	for (i = 0; i < n; ++i)
	{
		A[i] = i + n;
		A[i + n] = rest[i].cnt;
	}
	
	h = n;
	make_heap(A, A + h, CntCompare(A));
	
	while (h > 1)
	{
		uint32 m1 = A[0];
		A[0] = A[h - 1];
		--h;
		pop_heap(A, A + h, CntCompare(A));
		
		uint32 m2 = A[0];
		A[0] = A[h - 1];
		
		A[h] = A[m1] + A[m2];
		A[0] = h;
		A[m1] = A[m2] = h;
		
		// NOTE(review): no CntCompare here, unlike the other heap calls - confirm.
		pop_heap(A, A + h);
	}
	
	A[1] = 0;
	for (i = 2; i < 2 * n; ++i)
		A[i] = A[A[i]] + 1;
	
	// As for the lexicon: cnt now holds the computed depth.
	for (i = 0; i < n; ++i)
		rest[i].cnt = A[i + n];

	for (i = 0; i < 32; ++i)
		numl[i] = 0;
	
	for (i = 0; i < n; ++i)
		++numl[A[i + n]];
	
	firstcode[31] = 0;
	for (int l = 30; l >= 0; --l)
		firstcode[l] = (firstcode[l + 1] + numl[l + 1]) / 2;
	
	for (int l = 0; l < 32; ++l)
		nextcode[l] = firstcode[l];
	
	six[0] = 0;
	for (i = 1; i < 32; ++i)
		six[i] = six[i - 1] + numl[i - 1];
	
	// Table position -> byte value, filled in the same way as symbol_table.
	HAutoBuf<unsigned char> char_symbol_table(new unsigned char[n]);
	
	for (i = 0; i < n; ++i)
	{
		uint32 li = A[i + n];
		
		rest[i].code = nextcode[li];
		char_symbol_table[six[li] + nextcode[li] - firstcode[li]] = rest[i].ch;
		++nextcode[li];
	}
	
	data << n;
	for (i = 0; i < 32; ++i)	data << firstcode[i];
	for (i = 0; i < 32; ++i)	data << six[i];
	data.Write(char_symbol_table.get(), n);
}