Пример #1
0
gboolean
ots_load_xml_dictionary (OtsArticle * Doc,unsigned const char *name)
{

  xmlDocPtr doc=NULL;
  xmlNodePtr head=NULL;
  xmlNodePtr stem=NULL;
  xmlNodePtr pre=NULL;
  xmlNodePtr post=NULL;
  xmlNodePtr syno=NULL;	    		/* synonyms     */
  xmlNodePtr manual=NULL;	   	/* manual  */
  xmlNodePtr step1_pre=NULL;		/* step1  */
  xmlNodePtr step1_post=NULL;		/* step1  */
  
  xmlNodePtr parse=NULL;	   	/* parser rules */
  xmlNodePtr pbreak=NULL;	      
  xmlNodePtr pdbreak=NULL;		
  
  xmlNodePtr tc_words=NULL;	   	/* term count dictionary   */
  xmlNodePtr tf_words=NULL;	   	/* term frequency dictionary   */
  

  OtsStemRule * rule=Doc->stem;
	
  char *local_dict_name;
  
  local_dict_name = g_strdup_printf ("%s.xml", name);


	if (g_file_test(local_dict_name,G_FILE_TEST_EXISTS))
		  doc = xmlParseFile (local_dict_name); /* it warns to the screen so we cant use it; enable for web services only */
  if (doc == NULL) return (FALSE);

  head = xmlDocGetRootElement (doc);
  if (head == NULL)
    {
      fprintf (stderr, "empty document\n");
      xmlFreeDoc (doc);
      return (FALSE);
    }

  if (xmlStrcmp (head->name, (const xmlChar *) "dictionary"))
    {
      fprintf (stderr, "%s", head->name);
      xmlFreeDoc (doc);
      return (FALSE);
    }

  if (head != NULL)
    stem = head->xmlChildrenNode;
  while ((stem != NULL)
	 && (xmlStrcmp (stem->name, (const xmlChar *) "stemmer")))
    {
      stem = stem->next;
    }

  if (head != NULL)
    parse = head->xmlChildrenNode;
  while ((parse != NULL)
	 && (xmlStrcmp (parse->name, (const xmlChar *) "parser")))
    {
      parse = parse->next;
    }

  if (head != NULL)
    tc_words = head->xmlChildrenNode;
  while ((tc_words != NULL)
	 && (xmlStrcmp (tc_words->name, (const xmlChar *) "grader-tc")))
    {
      tc_words = tc_words->next;
    }


  if (head != NULL)
    tf_words = head->xmlChildrenNode;
  while ((tf_words != NULL)
	 && (xmlStrcmp (tf_words->name, (const xmlChar *) "grader-tf")))
    {
      tf_words = tf_words->next;
    }
    
    

  if (stem != NULL)
    pre = stem->xmlChildrenNode;
  while ((pre != NULL) && (xmlStrcmp (pre->name, (const xmlChar *) "pre")))
    {
      pre = pre->next;
    }

  if (stem != NULL)
    post = stem->xmlChildrenNode;
  while ((post != NULL) && (xmlStrcmp (post->name, (const xmlChar *) "post")))
    {
      post = post->next;
    }


  if (stem != NULL)
    syno = stem->xmlChildrenNode;
  while ((syno != NULL)
	 && (xmlStrcmp (syno->name, (const xmlChar *) "synonyms")))
    {
      syno = syno->next;
    }

  if (stem != NULL)
    manual = stem->xmlChildrenNode;
  while ((manual != NULL)
	 && (xmlStrcmp (manual->name, (const xmlChar *) "manual")))
    {
      manual = manual->next;
    }


  if (stem != NULL)
    step1_pre = stem->xmlChildrenNode;
  while ((step1_pre != NULL)
	 && (xmlStrcmp (step1_pre->name, (const xmlChar *) "step1_pre")))
    {
      step1_pre = step1_pre->next;
    }
    
    
    
  if (stem != NULL)
    step1_post = stem->xmlChildrenNode;
  while ((step1_post != NULL)
	 && (xmlStrcmp (step1_post->name, (const xmlChar *) "step1_post")))
    {
      step1_post = step1_post->next;
    }


  if (pre != NULL)
    pre = pre->xmlChildrenNode;	/*point to first word */
  while (pre != NULL)
    {
      if (0 == xmlStrcmp (pre->name, (const xmlChar *) "rule"))
	rule->RemovePre =
	  g_list_append (rule->RemovePre,
			 (xmlNodeListGetString
				 (doc, pre->xmlChildrenNode, 1)));
      pre = pre->next;
    }


  if (post != NULL)
    post = post->xmlChildrenNode;
  while (post != NULL)
    {
      if (0 == xmlStrcmp (post->name, (const xmlChar *) "rule"))
	rule->RemovePost =
	  g_list_append (rule->RemovePost,
			       (xmlNodeListGetString
				 (doc, post->xmlChildrenNode, 1)));
      post = post->next;
    }

  if (syno != NULL)
    syno = syno->xmlChildrenNode;
  while (syno != NULL)
    {
      if (0 == xmlStrcmp (syno->name, (const xmlChar *) "rule"))
	rule->synonyms =
	  g_list_append (rule->synonyms,
			     (xmlNodeListGetString
				 (doc, syno->xmlChildrenNode, 1)));
      syno = syno->next;
    }

  if (manual != NULL)
    manual = manual->xmlChildrenNode;
  while (manual != NULL)
    {
      if (0 == xmlStrcmp (manual->name, (const xmlChar *) "rule"))
	rule->manual =
	  g_list_append (rule->manual,
			      (xmlNodeListGetString
				 (doc, manual->xmlChildrenNode, 1)));
      manual = manual->next;
    }




 if (step1_pre != NULL)
    step1_pre = step1_pre->xmlChildrenNode;
  while (step1_pre != NULL)
    {
      if (0 == xmlStrcmp (step1_pre->name, (const xmlChar *) "rule"))
	rule->step1_pre =
	  g_list_append (rule->step1_pre,
			      (xmlNodeListGetString
				 (doc, step1_pre->xmlChildrenNode, 1)));
      step1_pre = step1_pre->next;
    }



 if (step1_post != NULL)
    step1_post = step1_post->xmlChildrenNode;
  while (step1_post != NULL)
    {
      if (0 == xmlStrcmp (step1_post->name, (const xmlChar *) "rule"))
	rule->step1_post =
	  g_list_append (rule->step1_post,
			        (xmlNodeListGetString
				 (doc, step1_post->xmlChildrenNode, 1)));
      step1_post = step1_post->next;
    }

 if (parse != NULL)
    pbreak = parse->xmlChildrenNode;
  while ((pbreak != NULL) && (xmlStrcmp (pbreak->name, (const xmlChar *) "linebreak")))
    {
      pbreak = pbreak->next;
    }



 if (parse != NULL)
    pdbreak = parse->xmlChildrenNode;
  while ((pdbreak != NULL) && (xmlStrcmp (pdbreak->name, (const xmlChar *) "linedontbreak")))
    {
      pdbreak = pdbreak->next;
    }
    
    
  /*Parser break*/
  if (pbreak != NULL)
    pbreak = pbreak->xmlChildrenNode;
  while (pbreak != NULL)
    {
      if (0 == xmlStrcmp (pbreak->name, (const xmlChar *) "rule"))
	rule->ParserBreak =
	  g_list_append (rule->ParserBreak,
			        (xmlNodeListGetString
				 (doc, pbreak->xmlChildrenNode, 1)));
      pbreak = pbreak->next;
    }

  /*Parser Don't break*/
  if (pdbreak != NULL)
    pdbreak = pdbreak->xmlChildrenNode;
  while (pdbreak != NULL)
    {
      if (0 == xmlStrcmp (pdbreak->name, (const xmlChar *) "rule"))
	rule->ParserDontBreak =
	  g_list_append (rule->ParserDontBreak,
			        (xmlNodeListGetString
				 (doc, pdbreak->xmlChildrenNode, 1)));
      pdbreak = pdbreak->next;
    }

  /*Term Count load dict*/

  if (tc_words != NULL)
    tc_words = tc_words->xmlChildrenNode;
  while (tc_words != NULL)
    {
      if (0 == xmlStrcmp (tc_words->name, (const xmlChar *) "word"))
	{
		xmlChar *key;
		key=xmlNodeListGetString(doc, tc_words->xmlChildrenNode,1);
	   Doc->dict = g_list_append (Doc->dict,(gpointer)ots_new_wordEntery(key));
      xmlFree(key);
   }
      tc_words = tc_words->next;
    }
      
  
  /*Term Frequency load dict*/
  
  if (tf_words != NULL)
    tf_words = tf_words->xmlChildrenNode;
  while (tf_words != NULL)
    {
      if (0 == xmlStrcmp (tf_words->name, (const xmlChar *) "word"))
	{
		xmlChar *key;
		xmlChar *idf_key;
		key=xmlNodeListGetString(doc, tf_words->xmlChildrenNode,1);
	   
	   idf_key=xmlGetProp(tf_words,"idf");
	   Doc->tf_terms = g_list_append (Doc->tf_terms,ots_new_OtsWordTF(key,atof(idf_key)));
      xmlFree(key);
      xmlFree(idf_key);
   }
      tf_words = tf_words->next;
    }
    
    
  xmlFreeDoc(doc);
  xmlCleanupParser ();
  g_free(local_dict_name);
  return (TRUE);
}
Пример #2
0
gboolean ots_load_xml_dictionary_buf(OtsArticle* Doc, const char* buffer) {
  char* head_name    = NULL;
  node_t* stem       = NULL;
  node_t* pre        = NULL;
  node_t* post       = NULL;
  node_t* syno       = NULL;        /* synonyms */
  node_t* manual     = NULL;        /* manual   */
  node_t* step1_pre  = NULL;        /* step1    */
  node_t* step1_post = NULL;        /* step1    */
  
  node_t* parse   = NULL;           /* parser rules */
  node_t* pbreak  = NULL;
  node_t* pdbreak = NULL;
  
  node_t* tc_words = NULL;          /* term count dictionary   */
  node_t* tf_words = NULL;          /* term frequency dictionary   */
  
  OtsStemRule* rule = Doc->stem;
  node_t* head = roxml_load_buf((char*)buffer);
  
  if (head == NULL) {
    fprintf(stderr, "empty document\n");
    roxml_release(RELEASE_ALL);
    return (FALSE);
  }
#if DEBUG > 1
  fprintf(stdout, "%d: %s\n", __LINE__, roxml_get_name(head, NULL, 0));
#endif /* DEBUG > 1 */
  if (g_strcmp0(roxml_get_name(head, head_name, 0), (const char*)"dictionary")) {
    fprintf(stderr, "%s\n", head_name);
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  
  stem = roxml_get_chld(head, NULL, 0);
  if(stem == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_get_xml_node(&stem, (const char*)"stemmer");
  
  parse = roxml_get_chld(head, NULL, 0);
  if(parse == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_get_xml_node(&parse, (const char*)"parser");
  
  tc_words = roxml_get_chld(head, NULL, 0);
  if(tc_words == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_get_xml_node(&tc_words, (const char*)"grader-tc");
  
  tf_words = roxml_get_chld(head, NULL,  0);
  if(tf_words == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_get_xml_node(&tf_words, (const char*)"grader-tf");
  
  pre = roxml_get_chld(stem, NULL,  0);
  if(pre == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_get_xml_node(&pre, (const char*)"pre");
  
  post = roxml_get_chld(stem, NULL,  0);
  if(post == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_get_xml_node(&post, (const char*)"post");
  
  syno = roxml_get_chld(stem, NULL, 0);
  if(syno == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_get_xml_node(&syno, (const char*)"synonyms");
  
  manual = roxml_get_chld(stem, NULL, 0);
  if(manual == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_get_xml_node(&manual, (const char*)"manual");
  
  step1_pre = roxml_get_chld(stem, NULL, 0);
  if(step1_pre == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_get_xml_node(&step1_pre, (const char*)"step1_pre");
  
  step1_post = roxml_get_chld(stem, NULL, 0);
  if(step1_post == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_get_xml_node(&step1_post, (const char*)"step1_post");
  
  pre = roxml_get_chld(pre, NULL, 0); /* point to first word */
  if(pre == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_append_xml_node_contents(&rule->RemovePre, &pre);
  
  post = roxml_get_chld(post, NULL, 0);
  if(post == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_append_xml_node_contents(&rule->RemovePost, &post);
  
  syno = roxml_get_chld(syno, NULL, 0);
  if(syno == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_append_xml_node_contents(&rule->synonyms, &syno);
  
  manual = roxml_get_chld(manual, NULL, 0);
  if(manual == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_append_xml_node_contents(&rule->manual, &manual);
  
  step1_pre = roxml_get_chld(step1_pre, NULL, 0);
  if(step1_pre == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_append_xml_node_contents(&rule->step1_pre, &step1_pre);
  
  step1_post = roxml_get_chld(step1_post, NULL, 0);
  if(step1_post == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_append_xml_node_contents(&rule->step1_post, &step1_post);
  
  pbreak = roxml_get_chld(parse, NULL, 0);
  if(pbreak == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_get_xml_node(&pbreak, (const char*)"linebreak");
  
  pdbreak = roxml_get_chld(parse, NULL, 0);
  if(pdbreak == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_get_xml_node(&pdbreak, (const char*)"linedontbreak");
  
  /*Parser break*/
  pbreak = roxml_get_chld(pbreak, NULL, 0);
  if(pbreak == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_append_xml_node_contents(&rule->ParserBreak, &pbreak);
  
  /*Parser Don't break*/
  pdbreak = roxml_get_chld(pdbreak, NULL, 0);
  if(pdbreak == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
  ots_append_xml_node_contents(&rule->ParserDontBreak, &pdbreak);
  
  /* Term Count load dict */
  tc_words = roxml_get_chld(tc_words, NULL, 0);
  if(tc_words == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
#if DEBUG > 1
  fprintf(stdout, "%d: %s\n", __LINE__, roxml_get_name(tc_words, NULL, 0));
  fprintf(stdout, "%d: %s\n", __LINE__, roxml_get_content(tc_words, NULL, 0, NULL));
#endif /* DEBUG > 1 */
  while (tc_words != NULL) {
    if (!g_strcmp0(roxml_get_name(tc_words, NULL, 0), (const char*)"word")) {
      Doc->dict = g_list_prepend(Doc->dict,
        (gpointer)ots_new_wordEntery(
          (unsigned const char*)roxml_get_content(tc_words, NULL, 0, NULL)));
    }
    tc_words = roxml_get_next_sibling(tc_words);
#if DEBUG > 2
    fprintf(stdout, "%s\n", roxml_get_content(tc_words, NULL, 0, NULL));
#endif /* DEBUG > 2 */
  }
  
  /*Term Frequency load dict*/
  tf_words = roxml_get_chld(tf_words, NULL, 0);
  if(tf_words == NULL) {
    roxml_release(RELEASE_ALL);
    roxml_close(head);
    return (FALSE);
  }
#if DEBUG > 1
  fprintf(stdout, "%d: %s\n", __LINE__, roxml_get_name(tf_words, NULL, 0));
  fprintf(stdout, "%d: %s\n", __LINE__, roxml_get_content(tf_words, NULL, 0, NULL));
#endif /* DEBUG > 1 */
  while (tf_words != NULL) {
    if (!g_strcmp0(roxml_get_name(tf_words, NULL, 0), (const char*)"word")) {
      const gchar* key = g_strdup(roxml_get_content(tf_words, NULL, 0, NULL));
      node_t* idf_attr = roxml_get_attr(tf_words, "idf", 0);
      if (idf_attr != NULL) {
        const char* idf_val = roxml_get_content(idf_attr, NULL, 0, NULL);
        if (idf_val != NULL) {
#if DEBUG > 3
          fprintf(stdout, "%s\n", idf_val);
#endif /* DEBUG > 3 */
          Doc->tf_terms = g_list_append(Doc->tf_terms,
          ots_new_OtsWordTF((const unsigned char*)key, atof(idf_val)));
        }
      }
      g_free((gpointer)key);
    }
    tf_words = roxml_get_next_sibling(tf_words);
#if DEBUG > 2
    fprintf(stdout, "%s\n", roxml_get_content(tf_words, NULL, 0, NULL));
#endif /* DEBUG > 2 */
  }
  roxml_release(RELEASE_ALL);
  roxml_close(head);
  return (TRUE);
}