void CgRuleEngine::check(const Paragraph * paragraph) { fprintf(stderr, "CgRuleEngine::check\n"); for (size_t i = 0; i < paragraph->sentenceCount; i++) { cg3_sentence *sentence = 0; sentence = cg3_sentence_new(applicator); for(size_t j = 0; j < paragraph->sentences[i]->tokenCount; j++) { Token t = paragraph->sentences[i]->tokens[j]; if(wcscmp(t.str, L" ") == 0) { continue; } cg3_cohort *cohort = 0; cohort = cg3_cohort_create(sentence); //fprintf(stderr, "CgRuleEngine::check %ls (%d) %ld\n", paragraph->sentences[i]->tokens[j].str, t.isValidWord, t.analyses->size()); cg3_tag *tag = 0; string wordform = string("\"<") + utils::StringUtils::StringUtils::utf8FromUcs4(paragraph->sentences[i]->tokens[j].str) + string(">\""); //fprintf(stderr, "wordform is: %s\n", wordform.c_str()); tag = cg3_tag_create_u8(applicator, wordform.c_str()); cg3_cohort_setwordform(cohort, tag); list<morphology::Analysis *>::iterator it = t.analyses->begin(); unsigned int num_analyses = 0; while (it != t.analyses->end()) { morphology::Analysis *a = *it; cg3_reading *reading = 0; reading = cg3_reading_create(cohort); string baseform = string("\"") + utils::StringUtils::StringUtils::utf8FromUcs4((a->getValue("lemma"))) + string("\""); tag = cg3_tag_create_u8(applicator, baseform.c_str()); cg3_reading_addtag(reading, tag); string taglist = string(utils::StringUtils::StringUtils::utf8FromUcs4((a->getValue("tags")))); string buf = ""; for(string::iterator it2 = taglist.begin(); it2 != taglist.end(); it2++) { // This assumes + separated tags. if(*it2 == '+' && buf.length() > 0) { tag = cg3_tag_create_u8(applicator, buf.c_str()); cg3_reading_addtag(reading, tag); buf = ""; continue; } else if(*it2 == '+' && buf.length() == 0) { continue; } buf = buf + *it2; } tag = cg3_tag_create_u8(applicator, buf.c_str()); cg3_reading_addtag(reading, tag); buf = ""; num_analyses++; cg3_cohort_addreading(cohort, reading); ++it; } cg3_sentence_addcohort(sentence, cohort); //fprintf(stderr, "CgRuleEngine::check %d analyses\n", num_analyses); } int num_cohorts = cg3_sentence_numcohorts(sentence); fprintf(stderr, "CgRuleEngine::num_cohorts %d \n", num_cohorts); cg3_sentence_runrules(applicator, sentence); // We've run the grammar on the sentence, now we need to go through and look // for error tags which by convention begin with &, e.g. // // "<beassát>" // "beassi" G3 N Sg Acc PxSg2 @OBJ> &real-beassat #8->8 ADD:3178:beassat cg3_tag *tag = 0; cg3_cohort *cohort = 0; cg3_reading *reading = 0; size_t ci = 0, ce = 0, ri = 0, re = 0, ti = 0, te = 0; const char *tmp; size_t cur_pos = 0; for (ci = 0, ce = cg3_sentence_numcohorts(sentence) ; ci != ce ; ++ci) { cohort = cg3_sentence_getcohort(sentence, ci); cg3_tag *wftag = cg3_cohort_getwordform(cohort); const char *wftmp = cg3_tag_gettext_u8(wftag); fprintf(stderr, "CG[%ld, %ld]: %s\n", cur_pos, wcslen(cg3_tag_gettext_w(wftag)), wftmp); for (ri = 0, re = cg3_cohort_numreadings(cohort) ; ri != re ; ++ri) { reading = cg3_cohort_getreading(cohort, ri); fprintf(stderr, "CG: \t"); for (ti = 0, te = cg3_reading_numtags(reading) ; ti != te ; ++ti) { tag = cg3_reading_gettag(reading, ti); tmp = cg3_tag_gettext_u8(tag); fprintf(stderr, "%s ", tmp); if(tmp[0] == '&') { // We've found an error tag, mark the current cohort fprintf(stderr, "\nnew CacheEntry (%ld, %d, %d)\n", ci, GCERR_NEGATIVE_VERB_MISMATCH, 1); CacheEntry * e = new CacheEntry(0); e->error.legacyError.error_code = -1; e->error.error_id = tmp; e->error.checker = options->grammarChecker; cg3_tag *lwt = cg3_cohort_getwordform(cohort); // const char *lwf = cg3_tag_gettext_u8(lwt); e->error.startpos = cur_pos; e->error.errorlen = wcslen(cg3_tag_gettext_w(lwt)) - 4; options->grammarChecker->cache.appendError(e); fprintf(stderr, " pos: %ld; len: %ld\n", cur_pos, wcslen(cg3_tag_gettext_w(lwt)) - 4); } } for (ti = 0, te = cg3_reading_numtraces(reading) ; ti != te ; ++ti) { uint32_t rule_line = cg3_reading_gettrace(reading, ti); fprintf(stderr, "TRACE:%u ", rule_line); } fprintf(stderr, "\n"); } if(ci != 0) { // We don't count BOS >>> fprintf(stderr, "%ld + %ld = ", cur_pos, (wcslen(cg3_tag_gettext_w(wftag)) - 4)); cur_pos = cur_pos + (wcslen(cg3_tag_gettext_w(wftag)) - 4) + 1; fprintf(stderr, "%ld\n", cur_pos); } } } return; }
int main(int argc, char *argv[]) { cg3_grammar *grammar = 0; cg3_applicator *applicator = 0; cg3_sentence *sentence = 0; cg3_cohort *cohort = 0; cg3_reading *reading = 0; cg3_tag *tag = 0; size_t ci = 0, ce = 0, ri = 0, re = 0, ti = 0, te = 0; const char *tmp; if (argc < 2) { fprintf(stderr, "Error: First argument must be a CG-3 grammar to load!\n"); return 1; } if (!cg3_init(stdin, stdout, stderr)) { fprintf(stderr, "Error: Failed cg3_init()!\n"); return 2; } grammar = cg3_grammar_load(argv[1]); if (!grammar) { fprintf(stderr, "Error: Failed cg3_grammar_load( %s )!\n", argv[1]); return 3; } applicator = cg3_applicator_create(grammar); cg3_applicator_setflags(applicator, CG3F_TRACE); sentence = cg3_sentence_new(applicator); cohort = cg3_cohort_create(sentence); tag = cg3_tag_create_u8(applicator, "\"<wordform>\""); cg3_cohort_setwordform(cohort, tag); reading = cg3_reading_create(cohort); tag = cg3_tag_create_w(applicator, L"\"baseform\""); cg3_reading_addtag(reading, tag); tag = cg3_tag_create_u8(applicator, "notwanted"); cg3_reading_addtag(reading, tag); tag = cg3_tag_create_w(applicator, L"@mapping"); cg3_reading_addtag(reading, tag); cg3_cohort_addreading(cohort, reading); reading = cg3_reading_create(cohort); tag = cg3_tag_create_w(applicator, L"\"baseform\""); cg3_reading_addtag(reading, tag); tag = cg3_tag_create_u8(applicator, "wanted"); cg3_reading_addtag(reading, tag); cg3_cohort_addreading(cohort, reading); reading = cg3_reading_create(cohort); tag = cg3_tag_create_w(applicator, L"\"baseform\""); cg3_reading_addtag(reading, tag); tag = cg3_tag_create_u8(applicator, "alsonotwanted"); cg3_reading_addtag(reading, tag); cg3_cohort_addreading(cohort, reading); cg3_sentence_addcohort(sentence, cohort); cg3_sentence_runrules(applicator, sentence); for (ci = 0, ce = cg3_sentence_numcohorts(sentence); ci != ce; ++ci) { cohort = cg3_sentence_getcohort(sentence, ci); tag = cg3_cohort_getwordform(cohort); tmp = cg3_tag_gettext_u8(tag); fprintf(stdout, "%s\n", tmp); for (ri = 0, re = cg3_cohort_numreadings(cohort); ri != re; ++ri) { reading = cg3_cohort_getreading(cohort, ri); fprintf(stdout, "\t"); for (ti = 0, te = cg3_reading_numtags(reading); ti != te; ++ti) { tag = cg3_reading_gettag(reading, ti); tmp = cg3_tag_gettext_u8(tag); fprintf(stdout, "%s ", tmp); } for (ti = 0, te = cg3_reading_numtraces(reading); ti != te; ++ti) { uint32_t rule_line = cg3_reading_gettrace(reading, ti); fprintf(stdout, "TRACE:%u ", rule_line); } fprintf(stdout, "\n"); } } cg3_sentence_free(sentence); cg3_applicator_free(applicator); cg3_grammar_free(grammar); cg3_cleanup(); return 0; }