Esempio n. 1
0
/*
 * Run every break-invariant check against one piece of test text.
 *
 * The text is first validated as UTF-8; on invalid input fail() is called
 * with a diagnostic (fail() is defined elsewhere -- presumably it does not
 * return; verify).  Logical attributes are then computed for the text with
 * the "C" language and handed, in order, to the line, sentence, grapheme
 * and word invariant checkers.
 *
 * text: NUL-terminated string to check.
 */
static void
check_invariants (const char *text)
{
  PangoLogAttr *log_attrs;
  int n_chars;
  int n_attrs;

  if (!g_utf8_validate (text, -1, NULL))
    fail ("Invalid UTF-8 in test text");

  /* The attribute array is sized one larger than the character count. */
  n_chars = g_utf8_strlen (text, -1);
  n_attrs = n_chars + 1;
  log_attrs = g_new0 (PangoLogAttr, n_attrs);

  pango_get_log_attrs (text, -1, 0,
                       pango_language_from_string ("C"),
                       log_attrs, n_attrs);

  check_line_invariants (text, log_attrs);
  check_sentence_invariants (text, log_attrs);
  check_grapheme_invariants (text, log_attrs);
  check_word_invariants (text, log_attrs);

#if 0
  /* Debugging aid, disabled at compile time. */
  print_sentences (text, log_attrs);
#endif

  g_free (log_attrs);
}
Esempio n. 2
0
// Print the tokenizer output held in `out` (if any) with print_sentences()
// and release it with delete[]. Passing NULL is a no-op.
static void flush_tokenized(char *out)
{
	if (out == NULL) return;

	print_sentences(out);
	delete[] out;
}

// Read the input named by the global `input_file` line by line, tokenize
// the relevant text according to the global `option_input` format and
// print the result through print_sentences().
//
// The input is stdin when `input_file` is empty or "-", otherwise the
// named file opened for reading. Before any input is processed a fixed
// "preamble" of tagger directives is written to `ofp` (which also becomes
// the global `option_ofp`).
//
// Recognized values of `option_input`:
//   "itame"   - lines starting with .I (id), .T (title, only if
//               option_titles) and .A (abstract)
//   "xml"     - <PMID>, <ArticleTitle> (only if option_titles) and
//               <AbstractText> elements that open and close on one line
//   "medline" - PMID, TI (only if option_titles) and AB fields; lines
//               starting with whitespace continue the current TI/AB field
//   "text"    - every line is tokenized; with option_textid set, odd
//               lines are stored as ids and even lines are tokenized
//   "token"   - like "text" but uses mptok->tokenize_pre()
//
// Parameters:
//   ofp - stream that receives the preamble (stored in option_ofp).
//
// Returns 0 if the input file could not be opened, 1 once the input has
// been processed.
//
// NOTE(review): input_fp is a file-level global and is left open here, as
// in the original code; confirm whether the caller is expected to close it.
// NOTE(review): the strcpy() calls into input_pmid / input_loc / input_id
// are unbounded; the sizes of those globals are not visible from here.
static int run_tokenizer(FILE *ofp)
{
	int	line;
	char	*out = NULL;

	if (strlen(input_file) == 0 || strcmp(input_file, "-") == 0)
	{
		input_fp = stdin;
	} else
	{
		input_fp = fopen(input_file, "r");
		if (input_fp == NULL)
		{
			fprintf(stderr, "Could not open file %s\n", input_file);
			return 0;
		}
	}

	// Accumulates a multi-line medline field (TI/AB) until it is complete
	char *save_text = new char[MAX_BUFF + 1];
	save_text[0] = '\0';

	option_ofp = ofp;

	// The ``preamble''

	fprintf(option_ofp, "#%s/util/tagger\n", install_dir);

	if (option_silent) fprintf(option_ofp, "verbose 0\n");

	if (strlen(option_untag) > 0) fprintf(option_ofp, "untag %s\n", option_untag);

	fprintf(option_ofp, "ngrams %s/medpost%s.ngrams\n", install_dir, mptok->suf);
	fprintf(option_ofp, "lex 30 %s/medpost%s.lex\n", install_dir, mptok->suf);
	fprintf(option_ofp, "backoff\n");
	fprintf(option_ofp, "init 2\n");
	fprintf(option_ofp, "smooth\n");

	strcpy(input_pmid, "");
	strcpy(input_loc, "");
	strcpy(input_id, "");

	// Holds the current input line
	char	*text = new char[MAX_BUFF + 1];

	// Non-zero while a medline TI/AB field is being accumulated in save_text
	int	collect_text;

	collect_text = 0;

	// initialize

	mptok->init(install_dir);

	line = 0;
	while (input_fp && fgets(text, MAX_BUFF, input_fp))
	{

		line++;

		// Remove space (including newline) at the end of the string.
		// The cast keeps isspace() defined for bytes >= 0x80 where
		// plain char is signed.

		for (int i = (int) strlen(text) - 1; i >= 0 && isspace((unsigned char) text[i]); --i)
			text[i] = '\0';

#if 0
		fprintf(option_ofp, "\n%s\n\n", text);
#endif

		if (strcmp(option_input, "itame") == 0)
		{
			if (strncmp(text, ".I", 2) == 0)
			{
				strcpy(input_pmid, text + 2);
			} else if (option_titles && strncmp(text, ".T", 2) == 0)
			{
				strcpy(input_loc, "T");
				out = mptok->tokenize(text + 2);
			} else if (strncmp(text, ".A", 2) == 0)
			{
				strcpy(input_loc, "A");
				out = mptok->tokenize(text + 2);
			}
		} else if (strcmp(option_input, "xml") == 0)
		{
			char *s1;
			char *s2;

			// Each branch cuts the line at the closing tag and uses the
			// text that follows the opening tag (6 or 14 characters long)

			if ((s1 = strstr(text, "<PMID>")) && (s2 = strstr(text, "</PMID>")) && s2 > s1)
			{
				*s2 = '\0';
				strcpy(input_pmid, s1 + 6);
			} else if (option_titles && (s1 = strstr(text, "<ArticleTitle>")) && (s2 = strstr(text, "</ArticleTitle>")) && s2 > s1)
			{
				strcpy(input_loc, "T");
				*s2 = '\0';
				out = mptok->tokenize(s1 + 14);
			} else if ((s1 = strstr(text, "<AbstractText>")) && (s2 = strstr(text, "</AbstractText>")) && s2 > s1)
			{
				strcpy(input_loc, "A");
				*s2 = '\0';
				out = mptok->tokenize(s1 + 14);
			}
		} else if (strcmp(option_input, "medline") == 0)
		{
			if (collect_text)
			{
				if (isspace((unsigned char) *text))
				{
					// Continuation line: append, but never write past
					// the MAX_BUFF + 1 bytes allocated for save_text

					size_t room = (size_t) MAX_BUFF - strlen(save_text);

					if (strlen(text) > room)
						fprintf(stderr, "Line %d: collected text exceeds %d characters, truncating\n", line, (int) MAX_BUFF);

					strncat(save_text, text, room);
				} else
				{
					// The field is complete. Emit it now, before the
					// current line can overwrite input_loc / input_pmid
					// (presumably read by print_sentences; verify).

					flush_tokenized(mptok->tokenize(save_text));
					collect_text = 0;
				}
			}

			if (strncmp(text, "PMID", 4) == 0)
			{
				strcpy(input_pmid, skip(text + 4, "- "));
			} else if (option_titles && strncmp(text, "TI", 2) == 0)
			{
				strcpy(input_loc, "T");
				strcpy(save_text, skip(text + 2, "- "));
				collect_text = 1;
			} else if (strncmp(text, "AB", 2) == 0)
			{
				strcpy(input_loc, "A");
				strcpy(save_text, skip(text + 2, "- "));
				collect_text = 1;
			}

		} else if (strcmp(option_input, "text") == 0)
		{
			// If id is specified for text input
			// each line is preceded by an id

			if (option_textid && (line % 2))
			{
				strcpy(input_id, text);
			} else
			{
				out = mptok->tokenize(text);
			}
		} else if (strcmp(option_input, "token") == 0)
		{
			// If id is specified for text input
			// each line is preceded by an id

			if (option_textid && (line % 2))
			{
				strcpy(input_id, text);
			} else
			{
				out = mptok->tokenize_pre(text);
			}
		}

		flush_tokenized(out);
		out = NULL;

	}

	// A medline field that runs up to end of input has no following line
	// to terminate it, so emit it here instead of dropping it

	if (collect_text)
	{
		flush_tokenized(mptok->tokenize(save_text));
		collect_text = 0;
	}

	delete[] text;
	delete[] save_text;

	// The failure path above returns 0, so report success as non-zero.
	// Falling off the end of a non-void function is undefined in C++.

	return 1;
}
Esempio n. 3
0
/*
 * Print the statistics report.
 *
 * Calls print_signficant_words(n) and then print_sentences(search_file);
 * both helpers are defined elsewhere, and each argument is forwarded
 * unchanged.
 *
 * search_file: passed through to print_sentences().
 * n:           passed through to print_signficant_words().
 */
void print_statistics(const char *search_file, int n)
{
  print_signficant_words(n);

  print_sentences(search_file);
}