static void check_invariants (const char *text) { int len; PangoLogAttr *attrs; if (!g_utf8_validate (text, -1, NULL)) fail ("Invalid UTF-8 in test text"); len = g_utf8_strlen (text, -1); attrs = g_new0 (PangoLogAttr, len + 1); pango_get_log_attrs (text, -1, 0, pango_language_from_string ("C"), attrs, len + 1); check_line_invariants (text, attrs); check_sentence_invariants (text, attrs); check_grapheme_invariants (text, attrs); check_word_invariants (text, attrs); #if 0 print_sentences (text, attrs); #endif g_free (attrs); }
static int run_tokenizer(FILE *ofp) { int line; char *out = NULL; if (strlen(input_file) == 0 || strcmp(input_file, "-") == 0) { input_fp = stdin; } else { input_fp = fopen(input_file, "r"); if (input_fp == NULL) { fprintf(stderr, "Could not open file %s\n", input_file); return 0; } } char *save_text = new char[MAX_BUFF + 1]; option_ofp = ofp; // The ``preamble'' fprintf(option_ofp, "#%s/util/tagger\n", install_dir); if (option_silent) fprintf(option_ofp, "verbose 0\n"); if (strlen(option_untag) > 0) fprintf(option_ofp, "untag %s\n", option_untag); fprintf(option_ofp, "ngrams %s/medpost%s.ngrams\n", install_dir, mptok->suf); fprintf(option_ofp, "lex 30 %s/medpost%s.lex\n", install_dir, mptok->suf); fprintf(option_ofp, "backoff\n"); fprintf(option_ofp, "init 2\n"); fprintf(option_ofp, "smooth\n"); strcpy(input_pmid, ""); strcpy(input_loc, ""); strcpy(input_id, ""); char *text = new char[MAX_BUFF + 1]; int collect_text; collect_text = 0; // initialize mptok->init(install_dir); line = 0; while (input_fp && fgets(text, MAX_BUFF, input_fp)) { line++; // Remove space (including newline) at the end of the string for (int i = (int) strlen(text) - 1; i >= 0 && isspace(text[i]); --i) text[i] = '\0'; #if 0 fprintf(option_ofp, "\n%s\n\n", text); #endif if (strcmp(option_input, "itame") == 0) { if (strncmp(text, ".I", 2) == 0) { strcpy(input_pmid, text + 2); } else if (option_titles && strncmp(text, ".T", 2) == 0) { strcpy(input_loc, "T"); out = mptok->tokenize(text + 2); } else if (strncmp(text, ".A", 2) == 0) { strcpy(input_loc, "A"); out = mptok->tokenize(text + 2); } } else if (strcmp(option_input, "xml") == 0) { char *s1; char *s2; if ((s1 = strstr(text, "<PMID>")) && (s2 = strstr(text, "</PMID>")) && s2 > s1) { *s2 = '\0'; strcpy(input_pmid, s1 + 6); } else if (option_titles && (s1 = strstr(text, "<ArticleTitle>")) && (s2 = strstr(text, "</ArticleTitle>")) && s2 > s1) { strcpy(input_loc, "T"); *s2 = '\0'; out = mptok->tokenize(s1 + 14); } else if ((s1 = strstr(text, 
"<AbstractText>")) && (s2 = strstr(text, "</AbstractText>")) && s2 > s1) { strcpy(input_loc, "A"); *s2 = '\0'; out = mptok->tokenize(s1 + 14); } } else if (strcmp(option_input, "medline") == 0) { if (collect_text) { if (isspace(*text)) strcat(save_text, text); else { out = mptok->tokenize(save_text); collect_text = 0; } } if (strncmp(text, "PMID", 4) == 0) { strcpy(input_pmid, skip(text + 4, "- ")); } else if (option_titles && strncmp(text, "TI", 2) == 0) { strcpy(input_loc, "T"); strcpy(save_text, skip(text + 2, "- ")); collect_text = 1; } else if (strncmp(text, "AB", 2) == 0) { strcpy(input_loc, "A"); strcpy(save_text, skip(text + 2, "- ")); collect_text = 1; } } else if (strcmp(option_input, "text") == 0) { // If id is specified for text input // each line is preceded by an id if (option_textid && (line % 2)) { strcpy(input_id, text); } else { out = mptok->tokenize(text); } } else if (strcmp(option_input, "token") == 0) { // If id is specified for text input // each line is preceded by an id if (option_textid && (line % 2)) { strcpy(input_id, text); } else { out = mptok->tokenize_pre(text); } } if (out) { print_sentences(out); delete[] out; out = NULL; } } delete[] save_text; }
/*
 * Print the statistics report.
 *
 * Calls print_signficant_words(n) first, then print_sentences(search_file);
 * both arguments are forwarded unchanged.
 *
 * NOTE(review): n is presumably a limit on how many words are reported -
 * confirm against print_signficant_words.
 */
void print_statistics(const char *search_file, int n)
{
    print_signficant_words(n);
    print_sentences(search_file);
}