// Function to constituency parse all sentence in the given input file // and write parsed sentences to the given output file extern "C" void parse_file(const char *sInputFile, const char *sOutputFile, bool tokenize) { std::cerr << "Processing file " << sInputFile << std::endl; // initialize the input reader CSentenceReader input_reader(sInputFile); // open the output file FILE *outfp = NULL; outfp = fopen(sOutputFile, "w"); // initialize the temporary sentence variables CStringVector tokenized_sent[1]; CTwoStringVector tagged_sent[1]; english::CCFGTree parsed_sent[1]; // get the tagger and the parser that were stored earlier CTagger *tagger = (CTagger *)zpm->tagger; CConParser *conparser = (CConParser *)zpm->conparser; // read in and tokenize the given input file if asked bool readSomething; if (tokenize) { readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); } else { readSomething = input_reader.readSegmentedSentence(tokenized_sent); } while ( readSomething ) { if ( tokenized_sent->back() == "\n" ) { tokenized_sent->pop_back(); } std::string parse = ""; if(tokenized_sent->size() < MAX_SENTENCE_SIZE){ tagger->tag(tokenized_sent, tagged_sent); conparser->parse(*tagged_sent, parsed_sent); parse = parsed_sent->str_unbinarized(); } else { std::cerr << "Sentence too long. Writing empty string. Sentence: " << tokenized_sent << std::endl; } fprintf(outfp, "%s\n", parse.c_str()); if (tokenize) { readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); } else { readSomething = input_reader.readSegmentedSentence(tokenized_sent); } } // close the output file std::cerr << "Wrote output to " << sOutputFile << std::endl; fclose(outfp); }
void process(const std::string &sInputFile, const std::string &sOutputFile){ std::cerr<<"Converting start ..." << std::endl; std::ifstream is(sInputFile.c_str()); ASSERT(is.is_open(), "The training file is unaccessible."); std::ofstream os(sOutputFile.c_str()); static CSentenceParsed sent; int nCount=0; CConParser parser; CCoNLLOutput o_conll; is >> sent; while( ! sent.empty() ) { std::cerr << "Sentence " << nCount << " ... "; nCount ++; parser.convert( sent, &o_conll ); os<<o_conll; os.flush(); std::cerr << "done." << std::endl; is >> sent; } is.close(); os.close(); std::cerr << "Done. " << std::endl; }
// Function to constituency parse a sentence extern "C" char* parse_sentence(void* vzps, const char *input_sentence, bool tokenize) { zparSession_t* zps = static_cast<zparSession_t *>(vzps); try { // create a temporary string stream from the input char * CSentenceReader input_reader(std::string(input_sentence), false); // tokenize the sentence CStringVector tokenized_sent[1]; if (tokenize) { input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); } else { input_reader.readSegmentedSentence(tokenized_sent); } if (zps->output_buffer != NULL) { delete zps->output_buffer; zps->output_buffer = NULL; } if(tokenized_sent->size() >= MAX_SENTENCE_SIZE){ // The ZPar code asserts that length < MAX_SENTENCE_SIZE... std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_sentence << std::endl; zps->output_buffer = new char[1]; strcpy(zps->output_buffer, ""); } else { // initialize the variables that will hold the tagged and parsed sentences CTwoStringVector tagged_sent[1]; english::CCFGTree parsed_sent[1]; // get the tagger and parser that were stored earlier CTagger *tagger = zps->tagger; CConParser *conparser = zps->conparser; // tag and parse the sentence tagger->tag(tokenized_sent, tagged_sent); conparser->parse(*tagged_sent, parsed_sent); // now put the parsed sentence into a string stream std::string parse = parsed_sent->str_unbinarized(); int parselen = parse.length(); zps->output_buffer = new char[parselen + 1]; strcpy(zps->output_buffer, parse.c_str()); } } catch (const std::string &e) { std::cerr << e << std::endl; zps->output_buffer = new char[1]; strcpy(zps->output_buffer, ""); } return zps->output_buffer; }
extern "C" void parse_tagged_file(void* vzps, const char *sInputFile, const char *sOutputFile, const char seperator='/') { zparSession_t* zps = static_cast<zparSession_t *>(vzps); std::cerr << "Processing file " << sInputFile << std::endl; // initialize the input reader CSentenceReader input_reader(sInputFile); // open the output file FILE *outfp = NULL; outfp = fopen(sOutputFile, "w"); // initialize the temporary sentence variables CTwoStringVector tagged_sent[1]; english::CCFGTree parsed_sent[1]; // get the parser that was stored earlier CConParser *conparser = zps->conparser; // read in and tokenize the given input file if asked bool readSomething; readSomething = input_reader.readTaggedSentence(tagged_sent, false, seperator); while ( readSomething ) { std::string parse = ""; if(tagged_sent->size() < MAX_SENTENCE_SIZE){ conparser->parse(*tagged_sent, parsed_sent); parse = parsed_sent->str_unbinarized(); } else { std::cerr << "Sentence too long. Writing empty string. Sentence: " << tagged_sent << std::endl; } fprintf(outfp, "%s\n", parse.c_str()); readSomething = input_reader.readTaggedSentence(tagged_sent, false, seperator); } // close the output file std::cerr << "Wrote output to " << sOutputFile << std::endl; fclose(outfp); }