Exemple #1
0
static VALUE summarize(const VALUE self, volatile VALUE rb_str, volatile VALUE rb_dict_file, const VALUE rb_ratio, const VALUE rb_topics, const VALUE rb_sections) {
#ifdef HAVE_RUBY_ENCODING_H
    int enc = rb_enc_find_index("UTF-8");
#endif
    long int length = RSTRING_LEN(rb_str);
    char *text = StringValuePtr(rb_str);
    char *dictionary_file = StringValuePtr(rb_dict_file);
    int ratio = NUM2INT(rb_ratio);

    size_t result_len;
    OtsArticle *doc = ots_new_article();

    VALUE summary;
    VALUE topics;
    VALUE result;
    VALUE sections;

    if (!ots_load_xml_dictionary(doc, dictionary_file)) {
        ots_free_article(doc);
        rb_raise(rb_eRuntimeError, "Cannot load dictionary file");
        return Qnil;
    }

    ots_parse_stream(text, length, doc);
    ots_grade_doc(doc);
    ots_highlight_doc(doc, ratio);

    summary = rb_str_new2(ots_get_doc_text(doc, &result_len));
    topics = rb_str_new2((const char *)doc->title);
    sections = rb_ary_new();
    GList *li;
    for (li = (GList *) ots_get_doc_sections(doc); li != NULL; li = li->next) {
        VALUE section = rb_str_new2(li->data);
        rb_ary_push(sections, section);
    }

#ifdef HAVE_RUBY_ENCODING_H
    rb_enc_associate_index(summary, enc);
    rb_enc_associate_index(summary, enc);
#endif

    ots_free_article(doc);

    if (rb_topics == Qtrue) {
        result = rb_ary_new();
        rb_ary_push(result, summary);
        rb_ary_push(result, topics);
        return result;
    } else if (rb_sections == Qtrue) {
        return sections;
    } else {
        return summary;
    }
}
Exemple #2
0
int main(int argc, char **argv)
{
  char *dictionary_file = "en";	/* if not told otherwise, assume we're using english */
  const char *input_file = NULL;
  char *output_file = NULL;

  FILE *input_stream = stdin;	/*by default read from stdin */
  FILE *output_stream = stdout;	/*by default read from stdout */

  OtsArticle *Art;

  int sumPercent = 20;	 	/* if not told otherwise highlight 20% of the  document */

  int c,n_args=0;

  int html = FALSE;
  int keywords = FALSE;
  int about = FALSE;
  int version = FALSE;
	
  const char *const *args=NULL;

	GOptionContext *context = NULL;
	GError *error = NULL;
	
	const GOptionEntry options[] = {
		{"ratio"   , 'r', 0, G_OPTION_ARG_INT   , &sumPercent, "summarization % [default = 20%]", "<int>"},
		{"dic"     , 'd', 0, G_OPTION_ARG_STRING, &dictionary_file, "dictionary to use", "<string>"},
		{"out"     , 'o', 0, G_OPTION_ARG_STRING, &output_file, "output file [default = stdout]", "<string>"},
		{"html"    , 'h', 0, G_OPTION_ARG_NONE  , &html, "output as html", NULL},
		{"keywords", 'k', 0, G_OPTION_ARG_NONE  , &keywords, "only output keywords", NULL},
		{"about"   , 'a', 0, G_OPTION_ARG_NONE  , &about, "only output the summary", NULL},
		{"version" , 'v', 0, G_OPTION_ARG_NONE  , &version, "show version information", NULL},
		{ G_OPTION_REMAINING, '\0', 0, G_OPTION_ARG_FILENAME_ARRAY, &args, NULL, "[file.txt | stdin]" },
		{NULL}
  };

	context = g_option_context_new(" - Open Text Summarizer");
	g_option_context_add_main_entries(context, options, NULL);

        /* Parse command line */
        if (!g_option_context_parse (context, &argc, &argv, &error))
        {
                g_option_context_free (context);

                g_print ("%s\n", error->message);
                g_error_free (error);
                exit (1);
        }

	/* print version number */
  if (version)
    {
		printf("%s\n", PACKAGE_STRING);
		g_option_context_free(context);
      exit (0);
    }
  
  if(args==NULL) 
    {
       printf("\nInvalid number of arguments. Use --help to see options\n");
       exit(1);
    }
  if (args)
    while (args[n_args] != NULL)
      n_args++;

  if (n_args > 1)
    {
		g_option_context_free(context);
      return 1;
    }

	if (n_args == 1 && g_file_test (args[0], G_FILE_TEST_EXISTS) == TRUE)
    input_file = args[0];

  if (input_file)
    {
      input_stream = fopen (input_file, "r");
      if (!input_stream)
	{
			g_option_context_free(context);
	  perror ("Couldn't load input file");
	  return 1;
	}
    }

  if (output_file)
    {
      output_stream = fopen (output_file, "w");
      if (!output_stream)
	{
	  if (input_file)
	    fclose (input_stream);
			g_option_context_free(context);
	  perror ("Couldn't load output file");
	  return 1;
	}
    }

  Art = ots_new_article ();
	
  if (!ots_load_xml_dictionary (Art, dictionary_file))
    {
   
      ots_free_article (Art);
		if (input_file)
			fclose(input_stream);
		if (output_file)
			fclose(output_stream);
		g_option_context_free(context);
      perror ("Couldn't load dictionary");
      return 1;
    }


  ots_parse_file (input_stream, Art);	/* read article from stdin , put it in struct Article */
  ots_grade_doc (Art);					/* grade each sentence (how relevent is it to the text) */

/*
  int i;

  for (i=0;i<1000000;i++)
  {

    printf("\n word = %s ", ots_word_in_list(Art->ImpWords,i));

  }
*/

  ots_highlight_doc (Art, sumPercent);	/* highlight what we are going to print 0% - 100% of the words */


	if (html)
		ots_print_HTML(output_stream, Art);	/* print article in html form */
	else if (keywords)
		print_keywords(Art, input_file); 
	else if (about)
		print_about(output_stream, Art);
	else
		ots_print_doc(output_stream, Art);	/* print article in text */

  ots_free_article (Art);
 

  if (input_file)
    fclose (input_stream);

  if (output_file)
    fclose (output_stream);

  return 0;
}