Example #1
0
// Dependency parse every sentence in the given input file and write one
// formatted dependency tree per line to the given output file.
//
//   sInputFile  - path handed to CSentenceReader
//   sOutputFile - path opened with fopen(..., "w")
//   tokenize    - when true sentences are read with
//                 readSegmentedSentenceAndTokenize(), otherwise with
//                 readSegmentedSentence()
//
// Sentences with MAX_SENTENCE_SIZE or more tokens are not parsed; an empty
// line is written for them. The tagger and parser come from the global `zpm`.
// If the output file cannot be opened, an error is logged and nothing is
// processed.
extern "C" void dep_parse_file(const char *sInputFile, const char *sOutputFile, bool tokenize)
{

    std::cerr << "Processing file " <<  sInputFile << std::endl;

    // initialize the input reader
    CSentenceReader input_reader(sInputFile);

    // open the output file; never write through a null handle
    FILE *outfp = fopen(sOutputFile, "w");
    if (outfp == NULL) {
        std::cerr << "Could not open output file " << sOutputFile << std::endl;
        return;
    }

    // initialize the temporary sentence variables
    CStringVector tokenized_sent[1];
    CTwoStringVector tagged_sent[1];
    CDependencyParse parsed_sent[1];

    // get the tagger and the parser that were stored earlier
    CTagger *tagger = (CTagger *)zpm->tagger;
    CDepParser *depparser = (CDepParser *)zpm->depparser;

    for (;;)
    {
        // read in (and tokenize, if asked) the next sentence
        bool readSomething;
        if (tokenize) {
            readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
        }
        else {
            readSomething = input_reader.readSegmentedSentence(tokenized_sent);
        }
        if (!readSomething) {
            break;
        }

        // drop a trailing newline token; back() on an empty vector is undefined
        if ( !tokenized_sent->empty() && tokenized_sent->back() == "\n" )
        {
            tokenized_sent->pop_back();
        }

        std::string deptree = "";
        if(tokenized_sent->size() < MAX_SENTENCE_SIZE){
            tagger->tag(tokenized_sent, tagged_sent);
            depparser->parse(*tagged_sent, parsed_sent);
            deptree = format_dependency_tree(parsed_sent);
        } else {
            // print the tokens themselves; streaming the array would only
            // print its address
            std::cerr << "Sentence too long. Writing empty string. Input:";
            for (size_t i = 0; i < tokenized_sent->size(); ++i) {
                std::cerr << ' ' << (*tokenized_sent)[i];
            }
            std::cerr << std::endl;
        }

        fprintf(outfp, "%s\n", deptree.c_str());
    }

    // close the output file
    std::cerr << "Wrote output to " << sOutputFile << std::endl;
    fclose(outfp);
}
Example #2
0
// Dependency parse a single sentence using the tagger and parser held by the
// given session.
//
//   vzps           - opaque pointer to a zparSession_t
//   input_sentence - NUL-terminated sentence text
//   tokenize       - when true the sentence is read with
//                    readSegmentedSentenceAndTokenize(), otherwise with
//                    readSegmentedSentence()
//
// Returns zps->output_buffer, which holds the formatted dependency tree, or an
// empty string when the sentence has MAX_SENTENCE_SIZE or more tokens or when
// a std::string exception is caught. The buffer is released the next time it
// is replaced, so callers should copy the text if they need to keep it.
extern "C" char* dep_parse_sentence(void* vzps, const char *input_sentence, bool tokenize)
{
    zparSession_t* zps = static_cast<zparSession_t *>(vzps);

    try {

        // create a temporary string stream from the input char *
        CSentenceReader input_reader(std::string(input_sentence), false);

        // tokenize the sentence
        CStringVector tokenized_sent[1];
        if (tokenize) {
            input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
        }
        else {
            input_reader.readSegmentedSentence(tokenized_sent);
        }

        // The buffer is allocated with new[] below, so it must be released
        // with delete[] (which is a no-op on a null pointer).
        delete[] zps->output_buffer;
        zps->output_buffer = NULL;

        if(tokenized_sent->size() >= MAX_SENTENCE_SIZE){
            // The ZPar code asserts that length < MAX_SENTENCE_SIZE...
            std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_sentence << std::endl;
            zps->output_buffer = new char[1];
            zps->output_buffer[0] = '\0';
        } else {

            // initialize the variable that will hold the tagged and parsed sentences
            CTwoStringVector tagged_sent[1];
            CDependencyParse parsed_sent[1];

            // get the tagger and parser that were stored earlier
            CTagger *tagger = zps->tagger;
            CDepParser *depparser = zps->depparser;

            // tag and parse the sentence
            tagger->tag(tokenized_sent, tagged_sent);
            depparser->parse(*tagged_sent, parsed_sent);

            // now output the formatted dependency tree
            std::string deptree = format_dependency_tree(parsed_sent);
            zps->output_buffer = new char[deptree.length() + 1];
            strcpy(zps->output_buffer, deptree.c_str());
        }

    } catch (const std::string &e) {
        std::cerr << e << std::endl;
        // The exception may have been thrown before the old buffer was
        // released above; free it here so it is not leaked when overwritten.
        delete[] zps->output_buffer;
        zps->output_buffer = NULL;
        zps->output_buffer = new char[1];
        zps->output_buffer[0] = '\0';
    }

    return zps->output_buffer;
}
Example #3
0
// Tag every sentence in the given input file and write one formatted tagged
// sentence per line to the given output file.
//
//   sInputFile  - path handed to CSentenceReader
//   sOutputFile - path opened with fopen(..., "w")
//   tokenize    - when true sentences are read with
//                 readSegmentedSentenceAndTokenize(), otherwise with
//                 readSegmentedSentence()
//
// The tagger comes from the global `zpm`. If the output file cannot be
// opened, an error is logged and nothing is processed.
extern "C" void tag_file(const char *sInputFile, const char *sOutputFile, bool tokenize)
{

    std::cerr << "Processing file " <<  sInputFile << std::endl;

    // initialize the input reader
    CSentenceReader input_reader(sInputFile);

    // open the output file; never write through a null handle
    FILE *outfp = fopen(sOutputFile, "w");
    if (outfp == NULL) {
        std::cerr << "Could not open output file " << sOutputFile << std::endl;
        return;
    }

    // initialize the temporary sentence variables
    CStringVector tokenized_sent[1];
    CTwoStringVector tagged_sent[1];

    // get the tagger that was stored earlier
    CTagger *tagger = (CTagger *)zpm->tagger;

    for (;;)
    {
        // read in (and tokenize, if asked) the next sentence
        bool readSomething;
        if (tokenize) {
            readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
        }
        else {
            readSomething = input_reader.readSegmentedSentence(tokenized_sent);
        }
        if (!readSomething) {
            break;
        }

        // drop a trailing newline token; back() on an empty vector is undefined
        if ( !tokenized_sent->empty() && tokenized_sent->back() == "\n" )
        {
            tokenized_sent->pop_back();
        }

        // tag the sentence
        tagger->tag(tokenized_sent, tagged_sent);

        // write the formatted sentence to the output file
        std::string tagvec = format_tagged_vector(tagged_sent);
        fprintf(outfp, "%s\n", tagvec.c_str());
    }

    // close the output file
    std::cerr << "Wrote output to " << sOutputFile << std::endl;
    fclose(outfp);
}
Example #4
0
// Function to tag all sentence in the given input file
// and write tagged sentences to the given output file
extern "C" void tag_file(void* vzps, const char *sInputFile, const char *sOutputFile, bool tokenize)
{

    zparSession_t* zps = static_cast<zparSession_t *>(vzps);

    std::cerr << "Processing file " <<  sInputFile << std::endl;

    // initialize the input reader
    CSentenceReader input_reader(sInputFile);

    // initialize the temporary sentence variables
    CStringVector tokenized_sent[1];
    CTwoStringVector tagged_sent[1];

    // get the tagger and the parser that were stored earlier
    CTagger *tagger = zps->tagger;

    // initialize the output file writer
    std::string outputFileName = std::string(sOutputFile);
    CSentenceWriter output_writer(outputFileName);

    // read in and tokenize the given input file if asked
    bool readSomething;
    if (tokenize) {
        readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
    }
    else {
        readSomething = input_reader.readSegmentedSentence(tokenized_sent);
    }
    while ( readSomething )
    {
        if ( !tokenized_sent->empty() && tokenized_sent->back() == "\n" )
        {
            tokenized_sent->pop_back();
        }

        // tag the sentence
        tagger->tag(tokenized_sent, tagged_sent);

        // write the formatted sentence to the output file
        output_writer.writeSentence(tagged_sent, '/', true);

        if (tokenize) {
            readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
        }
        else {
            readSomething = input_reader.readSegmentedSentence(tokenized_sent);
        }
    }

    // close the output file
    std::cerr << "Wrote output to " << sOutputFile << std::endl;
}
Example #5
0
// Constituency parse a single sentence using the tagger and parser stored in
// the global `zpm`.
//
//   input_sentence - NUL-terminated sentence text
//   tokenize       - when true the sentence is read with
//                    readSegmentedSentenceAndTokenize(), otherwise with
//                    readSegmentedSentence()
//
// Returns zpm->output_buffer, which holds the unbinarized parse, or an empty
// string when the sentence has MAX_SENTENCE_SIZE or more tokens. The buffer is
// released the next time it is replaced, so callers should copy the text if
// they need to keep it.
extern "C" char* parse_sentence(const char *input_sentence, bool tokenize)
{

    // create a temporary string stream from the input char *
    CSentenceReader input_reader(std::string(input_sentence), false);

    // tokenize the sentence
    CStringVector tokenized_sent[1];
    if (tokenize) {
        input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
    }
    else {
        input_reader.readSegmentedSentence(tokenized_sent);
    }

    // The buffer is allocated with new[] below, so it must be released with
    // delete[] (which is a no-op on a null pointer).
    delete[] zpm->output_buffer;
    zpm->output_buffer = NULL;

    if(tokenized_sent->size() >= MAX_SENTENCE_SIZE){
        // The ZPar code asserts that length < MAX_SENTENCE_SIZE...
        std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_sentence << std::endl;
        zpm->output_buffer = new char[1];
        zpm->output_buffer[0] = '\0';
    } else {
        // initialize the variables that will hold the tagged and parsed sentence
        CTwoStringVector tagged_sent[1];
        english::CCFGTree parsed_sent[1];

        // get the tagger and parser that were stored earlier
        CTagger *tagger = (CTagger *)zpm->tagger;
        CConParser *conparser = (CConParser *)zpm->conparser;

        // tag and parse the sentence
        tagger->tag(tokenized_sent, tagged_sent);
        conparser->parse(*tagged_sent, parsed_sent);

        // copy the unbinarized parse into the output buffer
        std::string parse = parsed_sent->str_unbinarized();
        zpm->output_buffer = new char[parse.length() + 1];
        strcpy(zpm->output_buffer, parse.c_str());
    }

    return zpm->output_buffer;
}
Example #6
0
// Tag a single sentence using the tagger held by the given session.
//
//   vzps           - opaque pointer to a zparSession_t
//   input_sentence - NUL-terminated sentence text
//   tokenize       - when true the sentence is read with
//                    readSegmentedSentenceAndTokenize(), otherwise with
//                    readSegmentedSentence()
//
// Returns zps->output_buffer, which holds the formatted tagged sentence, or an
// empty string when a std::string exception is caught. The buffer is released
// the next time it is replaced, so callers should copy the text if they need
// to keep it.
extern "C" char* tag_sentence(void* vzps, const char *input_sentence, bool tokenize)
{

    zparSession_t* zps = static_cast<zparSession_t *>(vzps);

    try {
        // create a temporary string stream from the input char *
        CSentenceReader input_reader(std::string(input_sentence), false);

        // tokenize the sentence
        CStringVector input_sent[1];
        if (tokenize) {
            input_reader.readSegmentedSentenceAndTokenize(input_sent);
        }
        else {
            input_reader.readSegmentedSentence(input_sent);
        }

        // initialize the variable that will hold the tagged sentence
        CTwoStringVector tagged_sent[1];

        // get the tagger that was stored earlier
        CTagger *tagger = zps->tagger;

        // tag the sentence
        tagger->tag(input_sent, tagged_sent);

        // format the tagged sentence properly and return
        std::string tagvec = format_tagged_vector(tagged_sent);

        // The buffer is allocated with new[], so it must be released with
        // delete[] (which is a no-op on a null pointer).
        delete[] zps->output_buffer;
        zps->output_buffer = NULL;
        zps->output_buffer = new char[tagvec.length() + 1];
        strcpy(zps->output_buffer, tagvec.c_str());
    } catch (const std::string &e) {
        std::cerr << e << std::endl;
        // The previous buffer is normally still alive at this point; free it
        // so it is not leaked when overwritten.
        delete[] zps->output_buffer;
        zps->output_buffer = NULL;
        zps->output_buffer = new char[1];
        zps->output_buffer[0] = '\0';
    }
    return zps->output_buffer;
}
Example #7
0
// Run rTagger.Tagging() on every sentence in rSenVec, with interactive mode
// switched off via SetIMode(false), and report progress and throughput on
// stderr. Whatever Tagging() returns is ignored here.
void Tagging(vector<_SEN*> &rSenVec,   CTagger &rTagger)
{
	rTagger.SetIMode(false);
	const clock_t start = clock();
	for (size_t i = 0; i < rSenVec.size(); ++i)
	{
		// progress line every 500 sentences; size_t is cast so that the
		// argument really matches %lu on every platform
		if (i != 0 && i % 500 == 0)
			fprintf(stderr, "Tagging %lu sentence\r", static_cast<unsigned long>(i));
		rTagger.Tagging(rSenVec[i]);
	}

	const double secs = 1.0*(clock() - start)/CLOCKS_PER_SEC;
	// clock() can report no elapsed time for tiny inputs; avoid printing
	// inf/nan as the rate in that case
	const double sensPerSec = secs > 0.0 ? rSenVec.size()/secs : 0.0;
	fprintf(stderr, "Total %d sentences, %.2f secs eclipsed, %.2f sens per sec\n", 
		(int)rSenVec.size(),  secs,	 sensPerSec);
}
Example #8
0
// Tokenize and tag every sentence of sInputFile and write the word/tag pairs
// to sOutputFile, using the tagger model at "<sFeaturePath>/tagger".
// Start and finish messages, plus the total CPU time measured with clock(),
// are printed to stdout.
void tag(const std::string sInputFile, const std::string sOutputFile, const std::string sFeaturePath) {
   std::cout << "Tagging started" << std::endl;
   // keep the full clock_t; squeezing it into an int truncates long runs
   const clock_t time_start = clock();
   const std::string sTaggerFeatureFile = sFeaturePath + "/tagger";
   // the tagger stays on the heap as in the original code
   // NOTE(review): its size is not visible here, so it is not moved to the stack
   CTagger *tagger = new CTagger(sTaggerFeatureFile, false);

   try {
      CSentenceReader input_reader(sInputFile);
      CSentenceWriter outout_writer(sOutputFile);

      // sentence buffers live on the stack so they are released on every path
      const unsigned nBest = 1;
      CStringVector input_sent;
      CTwoStringVector outout_sent[nBest];

      int nCount=0;

      while( input_reader.readSegmentedSentenceAndTokenize(&input_sent) ) {
         TRACE("Sentence " << nCount);
         ++ nCount;
         //
         // Find decoder output
         //
         tagger->tag(&input_sent, outout_sent, nBest, NULL);
         //
         // Output sent
         //
         for (unsigned i=0; i<nBest; ++i)
            outout_writer.writeSentence(outout_sent+i, '/');
      }
   } catch (...) {
      // do not leak the tagger when reading, tagging or writing throws
      delete tagger;
      throw;
   }

   delete tagger;

   std::cout << "Tagging has finished successfully. Total time taken is: " << double(clock()-time_start)/CLOCKS_PER_SEC << std::endl;
}
Example #9
0
// Tag a single sentence using the tagger stored in the global `zpm`.
//
//   input_sentence - NUL-terminated sentence text
//   tokenize       - when true the sentence is read with
//                    readSegmentedSentenceAndTokenize(), otherwise with
//                    readSegmentedSentence()
//
// Returns zpm->output_buffer, which holds the formatted tagged sentence. The
// buffer is released the next time it is replaced, so callers should copy the
// text if they need to keep it.
extern "C" char* tag_sentence(const char *input_sentence, bool tokenize)
{

    // create a temporary string stream from the input char *
    CSentenceReader input_reader(std::string(input_sentence), false);

    // tokenize the sentence
    CStringVector input_sent[1];
    if (tokenize) {
        input_reader.readSegmentedSentenceAndTokenize(input_sent);
    }
    else {
        input_reader.readSegmentedSentence(input_sent);
    }

    // initialize the variable that will hold the tagged sentence
    CTwoStringVector tagged_sent[1];

    // get the tagger that was stored earlier
    CTagger *tagger = (CTagger *)zpm->tagger;

    // tag the sentence
    tagger->tag(input_sent, tagged_sent);

    // format the tagged sentence properly and return
    std::string tagvec = format_tagged_vector(tagged_sent);

    // The buffer is allocated with new[], so it must be released with
    // delete[] (which is a no-op on a null pointer).
    delete[] zpm->output_buffer;
    zpm->output_buffer = NULL;
    zpm->output_buffer = new char[tagvec.length() + 1];
    strcpy(zpm->output_buffer, tagvec.c_str());
    return zpm->output_buffer;
}
Example #10
0
// Constituency parse every sentence in the given input file and write one
// unbinarized parse per line to the given output file, using the tagger and
// parser held by the given session.
//
//   vzps        - opaque pointer to a zparSession_t
//   sInputFile  - path handed to CSentenceReader
//   sOutputFile - path opened with fopen(..., "w")
//   tokenize    - when true sentences are read with
//                 readSegmentedSentenceAndTokenize(), otherwise with
//                 readSegmentedSentence()
//
// Sentences with MAX_SENTENCE_SIZE or more tokens are not parsed; an empty
// line is written for them. If the output file cannot be opened, an error is
// logged and nothing is processed.
extern "C" void parse_file(void* vzps, const char *sInputFile, const char *sOutputFile, bool tokenize)
{

    zparSession_t* zps = static_cast<zparSession_t *>(vzps);

    std::cerr << "Processing file " <<  sInputFile << std::endl;

    // initialize the input reader
    CSentenceReader input_reader(sInputFile);

    // open the output file; never write through a null handle
    FILE *outfp = fopen(sOutputFile, "w");
    if (outfp == NULL) {
        std::cerr << "Could not open output file " << sOutputFile << std::endl;
        return;
    }

    // initialize the temporary sentence variables
    CStringVector tokenized_sent[1];
    CTwoStringVector tagged_sent[1];
    english::CCFGTree parsed_sent[1];

    // get the tagger and the parser that were stored earlier
    CTagger *tagger = zps->tagger;
    CConParser *conparser = zps->conparser;

    for (;;)
    {
        // read in (and tokenize, if asked) the next sentence
        bool readSomething;
        if (tokenize) {
            readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
        }
        else {
            readSomething = input_reader.readSegmentedSentence(tokenized_sent);
        }
        if (!readSomething) {
            break;
        }

        if ( !tokenized_sent->empty() && tokenized_sent->back() == "\n" )
        {
            tokenized_sent->pop_back();
        }

        std::string parse = "";
        if(tokenized_sent->size() < MAX_SENTENCE_SIZE){
            tagger->tag(tokenized_sent, tagged_sent);
            conparser->parse(*tagged_sent, parsed_sent);
            parse = parsed_sent->str_unbinarized();
        } else {
            // print the tokens themselves; streaming the array would only
            // print its address
            std::cerr << "Sentence too long. Writing empty string. Sentence:";
            for (size_t i = 0; i < tokenized_sent->size(); ++i) {
                std::cerr << ' ' << (*tokenized_sent)[i];
            }
            std::cerr << std::endl;
        }

        fprintf(outfp, "%s\n", parse.c_str());
    }

    // close the output file
    std::cerr << "Wrote output to " << sOutputFile << std::endl;
    fclose(outfp);
}