// Function to dependency parse all sentence in the given input file // and write parsed sentences to the given output file extern "C" void dep_parse_file(const char *sInputFile, const char *sOutputFile, bool tokenize) { std::cerr << "Processing file " << sInputFile << std::endl; // initialize the input reader CSentenceReader input_reader(sInputFile); // open the output file FILE *outfp = NULL; outfp = fopen(sOutputFile, "w"); // initialize the temporary sentence variables CStringVector tokenized_sent[1]; CTwoStringVector tagged_sent[1]; CDependencyParse parsed_sent[1]; // get the tagger and the parser that were stored earlier CTagger *tagger = (CTagger *)zpm->tagger; CDepParser *depparser = (CDepParser *)zpm->depparser; // read in and tokenize the given input file if asked bool readSomething; if (tokenize) { readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); } else { readSomething = input_reader.readSegmentedSentence(tokenized_sent); } while ( readSomething ) { if ( tokenized_sent->back() == "\n" ) { tokenized_sent->pop_back(); } std::string deptree = ""; if(tokenized_sent->size() < MAX_SENTENCE_SIZE){ tagger->tag(tokenized_sent, tagged_sent); depparser->parse(*tagged_sent, parsed_sent); deptree = format_dependency_tree(parsed_sent); } else { std::cerr << "Sentence too long. Writing empty string. Input:" << tokenized_sent << std::endl; } fprintf(outfp, "%s\n", deptree.c_str()); if (tokenize) { readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); } else { readSomething = input_reader.readSegmentedSentence(tokenized_sent); } } // close the output file std::cerr << "Wrote output to " << sOutputFile << std::endl; fclose(outfp); }
// Function to dependency parse a sentence extern "C" char* dep_parse_sentence(void* vzps, const char *input_sentence, bool tokenize) { zparSession_t* zps = static_cast<zparSession_t *>(vzps); try { // create a temporary string stream from the input char * CSentenceReader input_reader(std::string(input_sentence), false); // tokenize the sentence CStringVector tokenized_sent[1]; if (tokenize) { input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); } else { input_reader.readSegmentedSentence(tokenized_sent); } if (zps->output_buffer != NULL) { delete zps->output_buffer; zps->output_buffer = NULL; } if(tokenized_sent->size() >= MAX_SENTENCE_SIZE){ // The ZPar code asserts that length < MAX_SENTENCE_SIZE... std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_sentence << std::endl; zps->output_buffer = new char[1]; strcpy(zps->output_buffer, ""); } else { // initialize the variable that will hold the tagged and parsed sentences CTwoStringVector tagged_sent[1]; CDependencyParse parsed_sent[1]; // get the tagger and parser that were stored earlier CTagger *tagger = zps->tagger; CDepParser *depparser = zps->depparser; // tag and parse the sentence tagger->tag(tokenized_sent, tagged_sent); depparser->parse(*tagged_sent, parsed_sent); // now output the formatted dependency tree std::string deptree = format_dependency_tree(parsed_sent); int deptreelen = deptree.length(); zps->output_buffer = new char[deptreelen + 1]; strcpy(zps->output_buffer, deptree.c_str()); } } catch (const std::string &e) { std::cerr << e << std::endl; zps->output_buffer = new char[1]; strcpy(zps->output_buffer, ""); } return zps->output_buffer; }
// Function to tag all sentence in the given input file // and write tagged sentences to the given output file extern "C" void tag_file(const char *sInputFile, const char *sOutputFile, bool tokenize) { std::cerr << "Processing file " << sInputFile << std::endl; // initialize the input reader CSentenceReader input_reader(sInputFile); // open the output file FILE *outfp = NULL; outfp = fopen(sOutputFile, "w"); // initialize the temporary sentence variables CStringVector tokenized_sent[1]; CTwoStringVector tagged_sent[1]; // get the tagger and the parser that were stored earlier CTagger *tagger = (CTagger *)zpm->tagger; // read in and tokenize the given input file if asked bool readSomething; if (tokenize) { readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); } else { readSomething = input_reader.readSegmentedSentence(tokenized_sent); } while ( readSomething ) { if ( tokenized_sent->back() == "\n" ) { tokenized_sent->pop_back(); } // tag the sentence tagger->tag(tokenized_sent, tagged_sent); // write the formatted sentence to the output file std::string tagvec = format_tagged_vector(tagged_sent); fprintf(outfp, "%s\n", tagvec.c_str()); if (tokenize) { readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); } else { readSomething = input_reader.readSegmentedSentence(tokenized_sent); } } // close the output file std::cerr << "Wrote output to " << sOutputFile << std::endl; fclose(outfp); }
// Function to tag all sentence in the given input file // and write tagged sentences to the given output file extern "C" void tag_file(void* vzps, const char *sInputFile, const char *sOutputFile, bool tokenize) { zparSession_t* zps = static_cast<zparSession_t *>(vzps); std::cerr << "Processing file " << sInputFile << std::endl; // initialize the input reader CSentenceReader input_reader(sInputFile); // initialize the temporary sentence variables CStringVector tokenized_sent[1]; CTwoStringVector tagged_sent[1]; // get the tagger and the parser that were stored earlier CTagger *tagger = zps->tagger; // initialize the output file writer std::string outputFileName = std::string(sOutputFile); CSentenceWriter output_writer(outputFileName); // read in and tokenize the given input file if asked bool readSomething; if (tokenize) { readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); } else { readSomething = input_reader.readSegmentedSentence(tokenized_sent); } while ( readSomething ) { if ( !tokenized_sent->empty() && tokenized_sent->back() == "\n" ) { tokenized_sent->pop_back(); } // tag the sentence tagger->tag(tokenized_sent, tagged_sent); // write the formatted sentence to the output file output_writer.writeSentence(tagged_sent, '/', true); if (tokenize) { readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); } else { readSomething = input_reader.readSegmentedSentence(tokenized_sent); } } // close the output file std::cerr << "Wrote output to " << sOutputFile << std::endl; }
// Function to constituency parse a sentence extern "C" char* parse_sentence(const char *input_sentence, bool tokenize) { // create a temporary string stream from the input char * CSentenceReader input_reader(std::string(input_sentence), false); // tokenize the sentence CStringVector tokenized_sent[1]; if (tokenize) { input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); } else { input_reader.readSegmentedSentence(tokenized_sent); } if (zpm->output_buffer != NULL) { delete zpm->output_buffer; zpm->output_buffer = NULL; } if(tokenized_sent->size() >= MAX_SENTENCE_SIZE){ // The ZPar code asserts that length < MAX_SENTENCE_SIZE... std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_sentence << std::endl; zpm->output_buffer = new char[1]; strcpy(zpm->output_buffer, ""); } else { // initialize the variable that will hold the tagged sentence CTwoStringVector tagged_sent[1]; english::CCFGTree parsed_sent[1]; // get the tagger that was stored earlier CTagger *tagger = (CTagger *)zpm->tagger; CConParser *conparser = (CConParser *)zpm->conparser; // tag the sentence tagger->tag(tokenized_sent, tagged_sent); conparser->parse(*tagged_sent, parsed_sent); // now put the tagged_sent into a string stream std::string parse = parsed_sent->str_unbinarized(); int parselen = parse.length(); zpm->output_buffer = new char[parselen + 1]; strcpy(zpm->output_buffer, parse.c_str()); } return zpm->output_buffer; }
// Function to tag a sentence extern "C" char* tag_sentence(void* vzps, const char *input_sentence, bool tokenize) { zparSession_t* zps = static_cast<zparSession_t *>(vzps); try { // create a temporary string stream from the input char * CSentenceReader input_reader(std::string(input_sentence), false); // tokenize the sentence CStringVector input_sent[1]; if (tokenize) { input_reader.readSegmentedSentenceAndTokenize(input_sent); } else { input_reader.readSegmentedSentence(input_sent); } // initialize the variable that will hold the tagged sentence CTwoStringVector tagged_sent[1]; // get the tagger that was stored earlier CTagger *tagger = zps->tagger; // tag the sentence tagger->tag(input_sent, tagged_sent); // format the tagged sentence properly and return std::string tagvec = format_tagged_vector(tagged_sent); int tagveclen = tagvec.length(); if (zps->output_buffer != NULL) { delete zps->output_buffer; zps->output_buffer = NULL; } zps->output_buffer = new char[tagveclen + 1]; strcpy(zps->output_buffer, tagvec.c_str()); } catch (const std::string &e) { std::cerr << e << std::endl; zps->output_buffer = new char[1]; strcpy(zps->output_buffer, ""); } return zps->output_buffer; }
// Runs rTagger.Tagging() on every sentence in rSenVec, after calling
// rTagger.SetIMode(false), and reports progress and timing on stderr.
//
//   rSenVec - sentences to tag, processed in order
//   rTagger - tagger used for every sentence
void Tagging(vector<_SEN*> &rSenVec, CTagger &rTagger)
{
    rTagger.SetIMode(false);

    const clock_t start = clock();
    for (size_t i = 0; i < rSenVec.size(); ++i)
    {
        // progress line every 500 sentences; the cast makes the argument
        // match %lu on platforms where size_t is not unsigned long
        if (i != 0 && i % 500 == 0)
            fprintf(stderr, "Tagging %lu sentence\r", static_cast<unsigned long>(i));

        rTagger.Tagging(rSenVec[i]);
        // _SEN *pRes = rTagger.Tagging(rSenVec[i]);
        // for (int k = 0; k < pRes->Length(); ++k)
        //     if (rTagger.GetLexicon()->SeenWord(rSenVec[i]->Word(k)) == false)
        //         ++nOOV;
    }

    const double secs = 1.0 * (clock() - start) / CLOCKS_PER_SEC;
    // avoid dividing by zero when the run is shorter than the clock resolution
    const double sensPerSec = secs > 0.0 ? rSenVec.size() / secs : 0.0;
    fprintf(stderr, "Total %d sentences, %.2f secs eclipsed, %.2f sens per sec\n",
            (int)rSenVec.size(), secs, sensPerSec);
    // return nOOV;
}
void tag(const std::string sInputFile, const std::string sOutputFile, const std::string sFeaturePath) { std::cout << "Tagging started" << std::endl; int time_start = clock(); std::string sTaggerFeatureFile = sFeaturePath + "/tagger"; CTagger *tagger; tagger = new CTagger(sTaggerFeatureFile, false); // if (sKnowledgeBase.size()) // tagger.loadTagDictionary(sKnowledgeBase); CSentenceReader input_reader(sInputFile); CSentenceWriter outout_writer(sOutputFile); CStringVector *input_sent = new CStringVector; CTwoStringVector *outout_sent; int nCount=0; const unsigned nBest = 1; outout_sent = new CTwoStringVector[nBest]; while( input_reader.readSegmentedSentenceAndTokenize(input_sent) ) { TRACE("Sentence " << nCount); ++ nCount; // // Find decoder outout // tagger->tag(input_sent, outout_sent, nBest, NULL); // // Ouptut sent // for (int i=0; i<nBest; ++i) outout_writer.writeSentence(outout_sent+i, '/'); } delete input_sent; delete [] outout_sent; delete tagger; std::cout << "Tagging has finished successfully. Total time taken is: " << double(clock()-time_start)/CLOCKS_PER_SEC << std::endl; }
// Function to tag a sentence extern "C" char* tag_sentence(const char *input_sentence, bool tokenize) { // create a temporary string stream from the input char * CSentenceReader input_reader(std::string(input_sentence), false); // tokenize the sentence CStringVector input_sent[1]; if (tokenize) { input_reader.readSegmentedSentenceAndTokenize(input_sent); } else { input_reader.readSegmentedSentence(input_sent); } // initialize the variable that will hold the tagged sentence CTwoStringVector tagged_sent[1]; // get the tagger that was stored earlier CTagger *tagger = (CTagger *)zpm->tagger; // tag the sentence tagger->tag(input_sent, tagged_sent); // format the tagged sentence properly and return std::string tagvec = format_tagged_vector(tagged_sent); int tagveclen = tagvec.length(); if (zpm->output_buffer != NULL) { delete zpm->output_buffer; zpm->output_buffer = NULL; } zpm->output_buffer = new char[tagveclen + 1]; strcpy(zpm->output_buffer, tagvec.c_str()); return zpm->output_buffer; }
// Function to constituency parse all sentence in the given input file // and write parsed sentences to the given output file extern "C" void parse_file(void* vzps, const char *sInputFile, const char *sOutputFile, bool tokenize) { zparSession_t* zps = static_cast<zparSession_t *>(vzps); std::cerr << "Processing file " << sInputFile << std::endl; // initialize the input reader CSentenceReader input_reader(sInputFile); // open the output file FILE *outfp = NULL; outfp = fopen(sOutputFile, "w"); // initialize the temporary sentence variables CStringVector tokenized_sent[1]; CTwoStringVector tagged_sent[1]; english::CCFGTree parsed_sent[1]; // get the tagger and the parser that were stored earlier CTagger *tagger = zps->tagger; CConParser *conparser = zps->conparser; // read in and tokenize the given input file if asked bool readSomething; if (tokenize) { readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); } else { readSomething = input_reader.readSegmentedSentence(tokenized_sent); } while ( readSomething ) { if ( !tokenized_sent->empty() && tokenized_sent->back() == "\n" ) { tokenized_sent->pop_back(); } std::string parse = ""; if(tokenized_sent->size() < MAX_SENTENCE_SIZE){ tagger->tag(tokenized_sent, tagged_sent); conparser->parse(*tagged_sent, parsed_sent); parse = parsed_sent->str_unbinarized(); } else { std::cerr << "Sentence too long. Writing empty string. Sentence: " << tokenized_sent << std::endl; } fprintf(outfp, "%s\n", parse.c_str()); if (tokenize) { readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); } else { readSomething = input_reader.readSegmentedSentence(tokenized_sent); } } // close the output file std::cerr << "Wrote output to " << sOutputFile << std::endl; fclose(outfp); }