Beispiel #1
0
// Constituency-parse every sentence in the given input file and write the
// parses, one per line, to the given output file.
//
//   sInputFile  - path handed to CSentenceReader to read sentences from
//   sOutputFile - path of the output file (opened with mode "w")
//   tokenize    - if true, sentences are read with
//                 readSegmentedSentenceAndTokenize(); otherwise with
//                 readSegmentedSentence()
//
// Sentences with MAX_SENTENCE_SIZE or more tokens are not parsed; an empty
// line is written for them instead. If the output file cannot be opened, an
// error is reported on stderr and nothing is processed.
extern "C" void parse_file(const char *sInputFile, const char *sOutputFile, bool tokenize)
{

    std::cerr << "Processing file " <<  sInputFile << std::endl;

    // initialize the input reader
    CSentenceReader input_reader(sInputFile);

    // open the output file; bail out instead of handing a NULL stream to
    // fprintf()/fclose() further down
    FILE *outfp = fopen(sOutputFile, "w");
    if (outfp == NULL) {
        std::cerr << "Could not open output file " << sOutputFile << std::endl;
        return;
    }

    // initialize the temporary sentence variables
    CStringVector tokenized_sent[1];
    CTwoStringVector tagged_sent[1];
    english::CCFGTree parsed_sent[1];

    // get the tagger and the parser that were stored earlier
    CTagger *tagger = (CTagger *)zpm->tagger;
    CConParser *conparser = (CConParser *)zpm->conparser;

    for (;;)
    {
        // read in (and tokenize, if asked) the next sentence
        const bool readSomething = tokenize
            ? input_reader.readSegmentedSentenceAndTokenize(tokenized_sent)
            : input_reader.readSegmentedSentence(tokenized_sent);
        if (!readSomething) {
            break;
        }

        // drop a trailing newline token; the size check keeps back() from
        // being called on an empty sentence
        if ( tokenized_sent->size() > 0 && tokenized_sent->back() == "\n" )
        {
            tokenized_sent->pop_back();
        }

        std::string parse = "";
        if (tokenized_sent->size() < MAX_SENTENCE_SIZE) {
            tagger->tag(tokenized_sent, tagged_sent);
            conparser->parse(*tagged_sent, parsed_sent);
            parse = parsed_sent->str_unbinarized();
        } else {
            // Print the tokens themselves: streaming `tokenized_sent`
            // directly would only print the address of the array.
            // NOTE(review): assumes CStringVector is an indexable vector of
            // strings - confirm against its typedef.
            std::cerr << "Sentence too long. Writing empty string. Sentence:";
            for (size_t i = 0; i < tokenized_sent->size(); ++i) {
                std::cerr << ' ' << (*tokenized_sent)[i];
            }
            std::cerr << std::endl;
        }

        fprintf(outfp, "%s\n", parse.c_str());
    }

    // close the output file
    std::cerr << "Wrote output to " << sOutputFile << std::endl;
    fclose(outfp);
}
Beispiel #2
0
// Read parsed sentences from sInputFile one at a time, convert each with
// CConParser::convert() into a CCoNLLOutput, and write the result to
// sOutputFile. Reading stops at the first sentence that comes back empty.
// Progress is reported on stderr.
//
//   sInputFile  - path of the file to read sentences from
//   sOutputFile - path of the file the converted sentences are written to
//
// Both files are checked with ASSERT before any sentence is processed.
void process(const std::string &sInputFile, const std::string &sOutputFile){
	std::cerr<<"Converting start ..." << std::endl;

	std::ifstream is(sInputFile.c_str());
	ASSERT(is.is_open(), "The training file is unaccessible.");
	std::ofstream os(sOutputFile.c_str());
	// Without this check an unwritable output path would silently drop
	// every converted sentence.
	ASSERT(os.is_open(), "The output file is inaccessible.");

	static CSentenceParsed sent;

	int nCount=0;

	CConParser parser;
	CCoNLLOutput o_conll;
	for ( is >> sent; ! sent.empty(); is >> sent ) {
		std::cerr << "Sentence " << nCount << " ... ";
		nCount ++;
		parser.convert( sent, &o_conll );
		os<<o_conll;
		// flush after every sentence, as the original loop did
		os.flush();
		std::cerr << "done." << std::endl;
	}
	is.close();
	os.close();
	std::cerr << "Done. " << std::endl;
}
Beispiel #3
0
// Constituency-parse a single sentence.
//
//   vzps           - pointer to the zparSession_t holding the tagger, the
//                    parser and the output buffer
//   input_sentence - NUL-terminated sentence text; a NULL pointer is treated
//                    as an empty string
//   tokenize       - if true, the sentence is read with
//                    readSegmentedSentenceAndTokenize(); otherwise with
//                    readSegmentedSentence()
//
// Returns zps->output_buffer, a char array allocated with new[] that holds
// the unbinarized parse. The buffer holds an empty string when the sentence
// has MAX_SENTENCE_SIZE or more tokens, or when a std::string exception is
// caught. The buffer left by the previous call is released here, so the
// returned pointer is only valid until the next call on the same session.
extern "C" char* parse_sentence(void* vzps, const char *input_sentence, bool tokenize)
{

    zparSession_t* zps = static_cast<zparSession_t *>(vzps);

    try {

        // create a temporary string stream from the input char *
        // (std::string must not be constructed from a NULL pointer)
        CSentenceReader input_reader(std::string(input_sentence != NULL ? input_sentence : ""), false);

        // tokenize the sentence
        CStringVector tokenized_sent[1];
        if (tokenize) {
            input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
        }
        else {
            input_reader.readSegmentedSentence(tokenized_sent);
        }

        // release the previous result; it was allocated with new[], so it
        // has to be released with delete[] (a no-op on NULL)
        delete[] zps->output_buffer;
        zps->output_buffer = NULL;

        if (tokenized_sent->size() >= MAX_SENTENCE_SIZE) {
            // The ZPar code asserts that length < MAX_SENTENCE_SIZE...
            std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_sentence << std::endl;
            zps->output_buffer = new char[1];
            zps->output_buffer[0] = '\0';
        } else {
            // initialize the variables that will hold the tagged and parsed sentences
            CTwoStringVector tagged_sent[1];
            english::CCFGTree parsed_sent[1];

            // get the tagger and parser that were stored earlier
            CTagger *tagger = zps->tagger;
            CConParser *conparser = zps->conparser;

            // tag and parse the sentence
            tagger->tag(tokenized_sent, tagged_sent);
            conparser->parse(*tagged_sent, parsed_sent);

            // copy the parse, including its terminating NUL, into a fresh buffer
            const std::string parse = parsed_sent->str_unbinarized();
            zps->output_buffer = new char[parse.length() + 1];
            strcpy(zps->output_buffer, parse.c_str());
        }
    } catch (const std::string &e) {
        std::cerr << e << std::endl;
        // The exception may have been thrown before the old buffer was
        // released above; free it here so it is not leaked when the
        // pointer is overwritten.
        delete[] zps->output_buffer;
        zps->output_buffer = NULL;
        zps->output_buffer = new char[1];
        zps->output_buffer[0] = '\0';
    }

    return zps->output_buffer;
}
Beispiel #4
0
// Constituency-parse every pre-tagged sentence in the given input file and
// write the parses, one per line, to the given output file.
//
//   vzps        - pointer to the zparSession_t holding the parser
//   sInputFile  - path handed to CSentenceReader to read tagged sentences from
//   sOutputFile - path of the output file (opened with mode "w")
//   seperator   - character passed to readTaggedSentence() as the word/tag
//                 separator (defaults to '/')
//
// Sentences with MAX_SENTENCE_SIZE or more tokens are not parsed; an empty
// line is written for them instead. If the output file cannot be opened, an
// error is reported on stderr and nothing is processed.
extern "C" void parse_tagged_file(void* vzps, const char *sInputFile, const char *sOutputFile, const char seperator='/')
{

    zparSession_t* zps = static_cast<zparSession_t *>(vzps);

    std::cerr << "Processing file " <<  sInputFile << std::endl;

    // initialize the input reader
    CSentenceReader input_reader(sInputFile);

    // open the output file; bail out instead of handing a NULL stream to
    // fprintf()/fclose() further down
    FILE *outfp = fopen(sOutputFile, "w");
    if (outfp == NULL) {
        std::cerr << "Could not open output file " << sOutputFile << std::endl;
        return;
    }

    // initialize the temporary sentence variables
    CTwoStringVector tagged_sent[1];
    english::CCFGTree parsed_sent[1];

    // get the parser that was stored earlier
    CConParser *conparser = zps->conparser;

    // read the tagged sentences one at a time until the reader reports
    // that nothing more was read
    while ( input_reader.readTaggedSentence(tagged_sent, false, seperator) )
    {
        std::string parse = "";
        if (tagged_sent->size() < MAX_SENTENCE_SIZE) {
            conparser->parse(*tagged_sent, parsed_sent);
            parse = parsed_sent->str_unbinarized();
        } else {
            // Print the tokens themselves: streaming `tagged_sent` directly
            // would only print the address of the array.
            // NOTE(review): assumes CTwoStringVector elements are
            // (word, tag) pairs with .first/.second - confirm against its
            // typedef.
            std::cerr << "Sentence too long. Writing empty string. Sentence:";
            for (size_t i = 0; i < tagged_sent->size(); ++i) {
                std::cerr << ' ' << (*tagged_sent)[i].first
                          << seperator << (*tagged_sent)[i].second;
            }
            std::cerr << std::endl;
        }

        fprintf(outfp, "%s\n", parse.c_str());
    }

    // close the output file
    std::cerr << "Wrote output to " << sOutputFile << std::endl;
    fclose(outfp);
}