示例#1
0
static int cmpaffix(const void *s1,const void *s2){
  int lc;
  if (((const DPS_AFFIX*)s1)->type < ((const DPS_AFFIX*)s2)->type) {
    return -1;
  }
  if (((const DPS_AFFIX*)s1)->type > ((const DPS_AFFIX*)s2)->type) {
    return 1;
  }
  lc = strcmp(((const DPS_AFFIX*)s1)->lang,((const DPS_AFFIX*)s2)->lang);
  if (lc == 0) {
    if ( (((const DPS_AFFIX*)s1)->replen == 0) && (((const DPS_AFFIX*)s2)->replen == 0) ) {
      return 0;
    }
    if (((const DPS_AFFIX*)s1)->replen == 0) {
      return -1;
    }
    if (((const DPS_AFFIX*)s2)->replen == 0) {
      return 1;
    }
    {
      dpsunicode_t u1[BUFSIZ], u2[BUFSIZ];
      DpsUniStrCpy(u1,((const DPS_AFFIX*)s1)->repl); 
      DpsUniStrCpy(u2,((const DPS_AFFIX*)s2)->repl); 
      if (((const DPS_AFFIX*)s1)->type == 'p') {
	*u1 &= 255; *u2 &= 255;
	return DpsUniStrCmp(u1, u2);
      } else {
	u1[((const DPS_AFFIX*)s1)->replen - 1] &= 255; u2[((const DPS_AFFIX*)s2)->replen -1] &= 255;
	return DpsUniStrBCmp(u1, u2);
      }
    }
  }
  return lc;
}
示例#2
0
/*
 * Compare two words for the spell dictionary.
 *
 * The first characters are compared by their low 8 bits only.  When those
 * are equal and non-zero, the rest of the words (from the second character
 * on) is compared with DpsUniStrCmp.  When the masked first character is 0
 * the words are reported as equal without looking any further.
 *
 * Returns a negative, zero or positive value in the usual comparator sense.
 */
static int cmpspellword(dpsunicode_t *w1, const dpsunicode_t *w2) {
    const dpsunicode_t lead1 = (*w1 & 255);
    const dpsunicode_t lead2 = (*w2 & 255);

    if (lead1 != lead2) {
        return (lead1 < lead2) ? -1 : 1;
    }
    return (lead1 == 0) ? 0 : DpsUniStrCmp(w1 + 1, w2 + 1);
}
示例#3
0
/*
 * Binary search for `word` in List->ChiWord[0 .. nwords-1].
 *
 * The search relies on the array being ordered consistently with
 * DpsUniStrCmp.  Returns a pointer to the matching entry, or NULL when the
 * list has no word array or the word is not present.
 */
static DPS_CHINAWORD * DpsChineseListFind(DPS_CHINALIST *List, const dpsunicode_t *word) {
     int lo = 0;
     int hi = List->nwords - 1;

     if (List->ChiWord == NULL) {
          return NULL;
     }

     while (lo <= hi) {
          int mid = (lo + hi) / 2;
          int order = DpsUniStrCmp(List->ChiWord[mid].word, word);

          if (order == 0) {
               return &List->ChiWord[mid];
          }
          if (order < 0) {
               lo = mid + 1;     /* entry sorts before the key: look right */
          } else {
               hi = mid - 1;     /* entry sorts after the key: look left */
          }
     }
     return NULL;
}
示例#4
0
static int cmpchinese(const void *s1,const void *s2){
     return(DpsUniStrCmp(((const DPS_CHINAWORD*)s1)->word, ((const DPS_CHINAWORD*)s2)->word));
}
示例#5
0
/*
 * Load a word list from `filename` into Conf->Spells.
 *
 * The file is read into memory in one go and processed line by line.  A line
 * has the form "word" or "word/FLAGS"; FLAGS is cut at the first character
 * that is not an ASCII letter.  Each word is converted from `charset` to the
 * internal "sys-int" charset, lower-cased and passed to DpsSpellAdd together
 * with its flags and `lang`.
 *
 * Parameters:
 *   Conf          - environment; Conf->Spells receives the words, and
 *                   Conf->CharsToEscape / Conf->Flags.use_accentext are read.
 *   lang          - language tag stored with every word; when it starts with
 *                   "de" (case-insensitive) and use_accentext is set, the
 *                   DpsUniGermanReplace variant of the word is added as well.
 *   charset       - name of the charset the file is encoded in.
 *   filename      - path of the dictionary file.
 *   skip_noflag   - non-zero: lines without a "/FLAGS" part are skipped.
 *   first_letters - when non-NULL and non-empty, only words whose first byte
 *                   (after converting back to `charset`) occurs in this
 *                   string are loaded.
 *
 * Returns DPS_OK on success, DPS_ERROR on allocation, charset or I/O failure,
 * or the non-OK status returned by DpsSpellAdd.
 */
__C_LINK int __DPSCALL DpsImportDictionary(DPS_ENV * Conf, const char *lang, const char *charset,
				   const char *filename, int skip_noflag, const char *first_letters){
	struct stat     sb;
	char *str, *data = NULL, *cur_n = NULL;
	char *lstr = NULL;
	dpsunicode_t *ustr = NULL;
	DPS_CHARSET *sys_int;
	DPS_CHARSET *dict_charset;
	DPS_CONV touni;
	DPS_CONV fromuni;
	int             fd;
	int             rc = DPS_ERROR;
	char            savebyte = '\0';

	/* Every failure below jumps to `done`, which releases whatever has
	   been allocated so far; all three buffers start out as NULL. */
	if ((lstr = (char*) DpsMalloc(2048)) == NULL) {
	  goto done;
	}
	if ((ustr = (dpsunicode_t*) DpsMalloc(8192)) == NULL) {
	  goto done;
	}

	dict_charset = DpsGetCharSet(charset);
	sys_int = DpsGetCharSet("sys-int");
	if ((dict_charset == NULL) || (sys_int == NULL)) {
	  goto done;
	}

	DpsConvInit(&touni, dict_charset, sys_int, Conf->CharsToEscape, 0);
	DpsConvInit(&fromuni, sys_int, dict_charset, Conf->CharsToEscape, 0);

	if (stat(filename, &sb)) {
	  fprintf(stderr, "Unable to stat dictionary file '%s': %s", filename, strerror(errno));
	  goto done;
	}
	/* NOTE(review): a return value of 0 is treated as failure here, as in
	   the original code; confirm against DpsOpen2's contract. */
	if ((fd = DpsOpen2(filename, O_RDONLY)) <= 0) {
	  fprintf(stderr, "Unable to open dictionary file '%s': %s", filename, strerror(errno));
	  goto done;
	}
	if ((data = (char*)DpsMalloc(sb.st_size + 1)) == NULL) {
	  fprintf(stderr, "Unable to alloc %ld bytes", (long)sb.st_size);
	  DpsClose(fd);
	  goto done;
	}
	if (read(fd, data, sb.st_size) != (ssize_t)sb.st_size) {
	  fprintf(stderr, "Unable to read dictionary file '%s': %s", filename, strerror(errno));
	  DpsClose(fd);
	  goto done;
	}
	DpsClose(fd);

	/* Isolate the first line: remember the byte that follows its '\n' and
	   overwrite it with a terminator; it is restored at loop_continue. */
	data[sb.st_size] = '\0';
	str = data;
	cur_n = strchr(str, '\n');
	if (cur_n != NULL) {
	  cur_n++;
	  savebyte = *cur_n;
	  *cur_n = '\0';
	}

	while(str != NULL) {
		char *s;
		const char *flag;
		int res;

		flag = NULL;

		/* Strip the end-of-line characters of the current line. */
		for (s = str; *s; s++) {
			if (*s == '\r' || *s == '\n') *s = '\0';
		}

		if((s = strchr(str, '/'))) {
			/* Split "word/FLAGS"; keep only the leading ASCII letters of FLAGS. */
			*s = 0;
			s++; flag = s;
			while(*s) {
				if(((*s >= 'A') && (*s <= 'Z')) || ((*s >= 'a') && (*s <= 'z'))) s++;
				else {
					*s = 0;
					break;
				}
			}
		} else {
			if(skip_noflag)	goto loop_continue;
			flag = "";
		}

		res = DpsConv(&touni, (char*)ustr, 8192, str, 1024);
		DpsUniStrToLower(ustr);

		/* Dont load words if first letter is not required */
		/* It allows to optimize loading at  search time   */
		if((first_letters != NULL) && *first_letters) {
			DpsConv(&fromuni, lstr, 2048, ((const char*)ustr), (size_t)res);
			if(!strchr(first_letters, lstr[0]))
				goto loop_continue;
		}

		res = DpsSpellAdd(&Conf->Spells, ustr, flag, lang);
		if (res != DPS_OK) {
		  rc = res;
		  goto done;
		}

		if (Conf->Flags.use_accentext) {
		  /* Also register the accent-stripped spelling when it differs. */
		  dpsunicode_t *af_uwrd = DpsUniAccentStrip(ustr);
		  if (DpsUniStrCmp(af_uwrd, ustr) != 0) {
		    res = DpsSpellAdd(&Conf->Spells, af_uwrd, flag, lang);
		    if (res != DPS_OK) {
		      DPS_FREE(af_uwrd);
		      rc = res;
		      goto done;
		    }
		  }
		  DPS_FREE(af_uwrd);

		  if (strncasecmp(lang, "de", 2) == 0) {
		    /* German: also register the DpsUniGermanReplace spelling. */
		    dpsunicode_t *de_uwrd = DpsUniGermanReplace(ustr);
		    if (DpsUniStrCmp(de_uwrd, ustr) != 0) {
		      res = DpsSpellAdd(&Conf->Spells, de_uwrd, flag, lang);
		      if (res != DPS_OK) {
			DPS_FREE(de_uwrd);
			rc = res;
			goto done;
		      }
		    }
		    DPS_FREE(de_uwrd);
		  }
		}

	loop_continue:
		/* Advance to the next line: restore the byte that was replaced
		   by a terminator, then isolate the following line the same way. */
		str = cur_n;
		if (str != NULL) {
		  *str = savebyte;
		  cur_n = strchr(str, '\n');
		  if (cur_n != NULL) {
		    cur_n++;
		    savebyte = *cur_n;
		    *cur_n = '\0';
		  }
		}
	}
	rc = DPS_OK;

done:
	DPS_FREE(data);
	DPS_FREE(lstr);
	DPS_FREE(ustr);
	return rc;
}
示例#6
0
/*
 * Build the "sea" section of Doc from the sentences found in `excerpt`.
 *
 * Steps, as implemented below:
 *   1. Reset any existing "sea" section of the document.
 *   2. Split excerpt->data into sentences with DpsUniStrTok_SEA and collect
 *      distinct sentences of at least Indexer->Flags.SEASentenceMinLength
 *      characters.  At most Indexer->Flags.SEASentences are kept; once the
 *      list is full, a longer sentence replaces the shortest stored one.
 *      A language map is built from the lower-cased copy of each sentence.
 *   3. With fewer than 4 sentences nothing more is done.
 *   4. Fill the nitems x nitems `links` matrix from DpsCheckLangMap6 hit
 *      counts (diagonal: sentence against Doc->lang_cs_map, or 0.0 when the
 *      document has no such map).
 *   5. For every sentence compute di = f(sum_i links[l][i] * Oi), clamped to
 *      [LOW_BORDER_EPS2, HI_BORDER_EPS2].
 *   6. Sort with SentCmp, re-sort the first TOP_SENTENCES entries with
 *      SentOrderCmp and feed them to DpsPrepareItem as section "sea".
 *
 * Returns DPS_ERROR when growing the sentence list fails, DPS_OK otherwise
 * (including the case where the links matrix cannot be allocated, which just
 * skips steps 4-6).
 */
int DpsSEAMake(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_DSTR *excerpt,  
	       const char *content_lang, size_t *indexed_size, size_t *indexed_limit, 
	       size_t max_word_len, size_t min_word_len, int crossec, int seasec
#ifdef HAVE_ASPELL
	       , int have_speller, AspellSpeller *speller
#endif
	       ) {
  DPS_SENTENCELIST List;
  DPS_MAPSTAT MapStat;
  DPS_TEXTITEM Item;
  DPS_VAR	*Sec;
  dpsunicode_t *sentence, *lowered, *lt, savec = 0;
  double *links, w;
  size_t l, sent_len, order, top;
  size_t min_len = 10000000, min_pos = 0;
  register size_t i, j;
#ifdef DEBUG
  char lcstr[4096];

#endif

  TRACE_IN(Indexer, "DpsSEAMake");

  if((Sec = DpsVarListFind(&Doc->Sections, "sea"))) { /* set SEA section to NULL */
    DPS_FREE(Sec->val);
    DPS_FREE(Sec->txt_val);
    Sec->curlen = 0;
  }
  
  bzero(&List, sizeof(List));
  bzero(&MapStat, sizeof(MapStat)); /* the DEBUG log may read it before the first DpsCheckLangMap6 call */
  order = 0;
  sentence = DpsUniStrTok_SEA((dpsunicode_t*)excerpt->data, &lt);
  while(sentence) {
    /* Temporarily terminate the sentence in place; restored before the next token. */
    if (lt != NULL) { savec = *lt; *lt = 0; }
#ifdef DEBUG
    DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)sentence, sizeof(dpsunicode_t) * (DpsUniLen(sentence) + 1));
    fprintf(stderr, "Sentence.%d: %s\n", (int)List.nitems, lcstr);
#endif
    if ((sent_len = DpsUniLen(sentence)) >= Indexer->Flags.SEASentenceMinLength) {
      /* j stays 1 only when the sentence is not already in the list. */
      j = 1;
      for (i = 0; i < List.nitems; i++) {
	if (DpsUniStrCmp(sentence, List.Sent[i].sentence) == 0) {
	  j = 0; break;
	}
      }
      if (j) {
	if ( List.nitems < Indexer->Flags.SEASentences ) {
	  /* Room left: append, growing the array 16 entries at a time. */
	  if (List.nitems == List.mitems) {
	    List.mitems += 16;
	    List.Sent = (DPS_SENTENCE*)DpsRealloc(List.Sent, List.mitems * sizeof(DPS_SENTENCE));
	    if (List.Sent == NULL) {
	      /* Put the excerpt back as it was before bailing out. */
	      /* NOTE(review): sentences stored so far are not released here;
	         whether DpsRealloc frees the old array on failure is not
	         visible from this function -- confirm. */
	      if (lt != NULL) *lt = savec;
	      TRACE_OUT(Indexer);
	      return DPS_ERROR;
	    }
	  }
	  List.Sent[List.nitems].sentence = DpsUniDup(sentence);
	  List.Sent[List.nitems].len = sent_len;
	  List.Sent[List.nitems].order = order++;
	  lowered = DpsUniDup(sentence);
	  DpsUniStrToLower(lowered);
	  bzero(&List.Sent[List.nitems].LangMap, sizeof(DPS_LANGMAP));
	  DpsBuildLangMap(&List.Sent[List.nitems].LangMap, (char*)lowered, sent_len * sizeof(dpsunicode_t), 0, 0);
	  if (sent_len < min_len) { min_len = sent_len; min_pos = List.nitems; }
	  List.nitems++;
	  DPS_FREE(lowered);
	} else if ((List.nitems > 0) && (sent_len > min_len)) {
	  /* List is full: overwrite the shortest stored sentence, then
	     rescan to find the new shortest one. */
	  DPS_FREE(List.Sent[min_pos].sentence);
	  List.Sent[min_pos].sentence = DpsUniDup(sentence);
	  List.Sent[min_pos].len = sent_len;
	  List.Sent[min_pos].order = order++;
	  lowered = DpsUniDup(sentence);
	  DpsUniStrToLower(lowered);
	  bzero(&List.Sent[min_pos].LangMap, sizeof(DPS_LANGMAP));
	  DpsBuildLangMap(&List.Sent[min_pos].LangMap, (char*)lowered, sent_len * sizeof(dpsunicode_t), 0, 0);
	  DPS_FREE(lowered);
	  min_len = List.Sent[0].len; min_pos = 0;
	  for(i = 1; i < List.nitems; i++) if (List.Sent[i].len < min_len) { min_len = List.Sent[i].len; min_pos = i; }
	}
      }
    }
#ifdef DEBUG
    fprintf(stderr, "Sent. len.:%d, Min.allowed: %d\n", (int)sent_len, (int)Indexer->Flags.SEASentenceMinLength);
#endif
    if (lt != NULL) *lt = savec;
    sentence = DpsUniStrTok_SEA(NULL, &lt);
  }
  DpsLog(Indexer, DPS_LOG_DEBUG, "SEA sentences: %d", (int)List.nitems);
  if (List.nitems < 4) {
    for (i = 0; i < List.nitems; i++) DPS_FREE(List.Sent[i].sentence);
    DPS_FREE(List.Sent); 
    TRACE_OUT(Indexer);
    return DPS_OK; 
  }

  /* links[i * List.nitems + j] holds the weight of the link i -> j. */
  links = (double*)DpsMalloc(sizeof(double) * List.nitems * List.nitems);

  if (links != NULL) {

    for (i = 0; i < List.nitems; i++) {
      DpsPrepareLangMap(&List.Sent[i].LangMap);
    }

    for (i = 0; i < List.nitems; i++) {
      List.Sent[i].Oi =  List.Sent[i].di = 0.5;
      /* Diagonal: sentence i against the document-level map, if any. */
      if (Doc->lang_cs_map == NULL) {
	links[i * List.nitems + i] = 0.0;
      } else {
	MapStat.map = &List.Sent[i].LangMap;
	DpsCheckLangMap6(Doc->lang_cs_map, &List.Sent[i].LangMap, &MapStat, DPS_LM_TOPCNT * DPS_LM_TOPCNT, 2 * DPS_LM_TOPCNT);
	links[i * List.nitems + i] = (double)MapStat.hits / (2.0 * DPS_LM_TOPCNT) / (List.nitems + 1);
      }
#ifdef DEBUG
      DpsLog(Indexer, DPS_LOG_INFO, "Link %u->%u: %f [hits:%d miss:%d]", (unsigned)i, (unsigned)i, links[i * List.nitems + i], (int)MapStat.hits, (int)MapStat.miss);
#endif
      /* Off-diagonal: sentence j against sentence i. */
      for (j = 0; j < List.nitems; j++) {
	if (j == i) continue;
	MapStat.map = &List.Sent[j].LangMap;
	DpsCheckLangMap6(&List.Sent[j].LangMap, &List.Sent[i].LangMap, &MapStat, DPS_LM_TOPCNT * DPS_LM_TOPCNT, 2 * DPS_LM_TOPCNT);

	links[i * List.nitems + j] = (double)MapStat.hits / (2.0 * DPS_LM_TOPCNT) / (List.nitems + 1);
#ifdef DEBUG
	DpsLog(Indexer, DPS_LOG_INFO, "Link %u->%u: %f [hits:%d miss:%d]", (unsigned)i, (unsigned)j, links[i * List.nitems + j], (int)MapStat.hits, (int)MapStat.miss);
#endif
      }
    }

    /* di = f(weighted sum of row l), clamped to the configured borders. */
    for (l = 0; l < List.nitems; l++) {
      w = 0.0;
      for (i = 0; i < List.nitems; i++) { 
	w += links[l * List.nitems + i] * List.Sent[i].Oi;
      }
      w = f(w);
      if (w < LOW_BORDER_EPS2) w = LOW_BORDER_EPS2;
      else if (w > HI_BORDER_EPS2) w = HI_BORDER_EPS2;
      List.Sent[l].di = w;
    }

    DpsSort(List.Sent, List.nitems, sizeof(DPS_SENTENCE), (qsort_cmp)SentCmp);

#ifdef DEBUG
    /* Dump the leading entries, never reading past the end of the list. */
    for (i = 0; (i < List.nitems) && (i < 5); i++) {
      DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[i].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[i].sentence) + 1));
      fprintf(stderr, "Sent.%d: %f %f -- %s\n", (int)i, List.Sent[i].di, List.Sent[i].Oi, lcstr);
    }
#endif

    /* Never hand out more entries than were collected. */
    top = (List.nitems < (size_t)TOP_SENTENCES) ? List.nitems : (size_t)TOP_SENTENCES;
    DpsSort(List.Sent, top, sizeof(DPS_SENTENCE), (qsort_cmp)SentOrderCmp);

    bzero(&Item, sizeof(Item));
    Item.section = seasec;
    Item.href = NULL;
    Item.section_name = "sea";
    for (i = 0; i < top; i++) {
      dpsunicode_t *UStr = DpsUniDup(List.Sent[i].sentence);
      DpsPrepareItem(Indexer, Doc, &Item, List.Sent[i].sentence, UStr, content_lang, indexed_size, indexed_limit,
		     max_word_len, min_word_len, crossec
#ifdef HAVE_ASPELL
		     , have_speller, speller, NULL
#endif
		     );
      DPS_FREE(UStr);
    }
  }
  DPS_FREE(links);
  for (i = 0; i < List.nitems; i++) DPS_FREE(List.Sent[i].sentence);
  DPS_FREE(List.Sent);

  TRACE_OUT(Indexer);
  return DPS_OK;
}
示例#7
0
/*
 * Collect the synonyms of wword->uword from List.
 *
 * List->Synonym is searched with bsearch() on the `p` word (cmpsyn), and
 * List->Back -- an array of pointers to synonym records -- on the `s` word
 * (cmpsynback).  Both arrays hold List->nsynonyms entries.  bsearch() may
 * land anywhere inside a run of equal keys, so each lookup walks backwards
 * and forwards from the hit while the key still matches.
 *
 * Pass 1: for the query word, add the `s` side of every forward match and
 *         the `p` side of every back-index match.
 * Pass 2: for each word found in pass 1, repeat the lookups; a record is
 *         skipped when the found word has a non-zero count that differs
 *         from the matched side's count.
 *
 * Matched records get their order/origin fields overwritten with
 * wword->order and DPS_WORD_ORIGIN_SYNONYM before being added.
 *
 * Returns a newly allocated DPS_WIDEWORDLIST, or NULL when the list is
 * empty, nothing matched, or the result list could not be allocated.
 */
DPS_WIDEWORDLIST * DpsSynonymListFind(const DPS_SYNONYMLIST * List,DPS_WIDEWORD * wword){
     DPS_SYNONYM syn,*res,*first,*last;
     DPS_SYNONYM *psyn, **pres, **pfirst, **plast;
     DPS_WIDEWORDLIST *Res = NULL;
     size_t nnorm,i,k;

     if(!List->nsynonyms)return NULL;

     syn.p.uword = wword->uword;

     res = bsearch(&syn, List->Synonym, List->nsynonyms, sizeof(DPS_SYNONYM), &cmpsyn);

     if(res){

          Res = (DPS_WIDEWORDLIST *)DpsMalloc(sizeof(*Res));
          if (Res == NULL) return NULL;
          DpsWideWordListInit(Res);

          /* Walk down from the hit by index, so that no pointer is ever
             formed before the start of the array. */
          for(k = (size_t)(res - List->Synonym) + 1; k-- > 0; ) {
               first = List->Synonym + k;
               if(DpsUniStrCmp(wword->uword, first->p.uword)) break;
               first->s.order = wword->order;
               first->s.origin = DPS_WORD_ORIGIN_SYNONYM;
               DpsWideWordListAdd(Res, &first->s, DPS_WWL_LOOSE);
          }
          /* Walk up from the entry after the hit. */
          for(last = res + 1; last < List->Synonym + List->nsynonyms; last++) {
               if(DpsUniStrCmp(wword->uword, last->p.uword)) break;
               last->s.order = wword->order;
               last->s.origin = DPS_WORD_ORIGIN_SYNONYM;
               DpsWideWordListAdd(Res, &last->s, DPS_WWL_LOOSE);
          }
     }

     /* The back index stores pointers, so the key is a pointer to a pointer. */
     syn.s.uword = wword->uword;
     psyn = &syn;
     pres = bsearch(&psyn, List->Back, List->nsynonyms, sizeof(DPS_SYNONYM*), &cmpsynback);

     if(pres) {

          if (Res == NULL) {
               Res = (DPS_WIDEWORDLIST *)DpsMalloc(sizeof(*Res));
               if (Res == NULL) return NULL;
               DpsWideWordListInit(Res);
          }

          for(k = (size_t)(pres - List->Back) + 1; k-- > 0; ) {
               pfirst = List->Back + k;
               if(DpsUniStrCmp(wword->uword, (*pfirst)->s.uword)) break;
               (*pfirst)->p.order = wword->order;
               (*pfirst)->p.origin = DPS_WORD_ORIGIN_SYNONYM;
               DpsWideWordListAdd(Res, &((*pfirst)->p), DPS_WWL_LOOSE);
          }
          for(plast = pres + 1; plast < List->Back + List->nsynonyms; plast++) {
               if(DpsUniStrCmp(wword->uword, (*plast)->s.uword)) break;
               (*plast)->p.order = wword->order;
               (*plast)->p.origin = DPS_WORD_ORIGIN_SYNONYM;
               DpsWideWordListAdd(Res, &((*plast)->p), DPS_WWL_LOOSE);
          }
     }

     if (Res == NULL) return NULL;

     /* Now find each of them in reverse order.  Only the words present at
        this point are expanded; Res->Word[i] is re-read on every use because
        DpsWideWordListAdd is called on Res inside the loops. */
     nnorm = Res->nwords;
     for(i = 0; i < nnorm; i++) {

          syn.p.uword = Res->Word[i].uword;

          res = bsearch(&syn, List->Synonym, List->nsynonyms, sizeof(DPS_SYNONYM), &cmpsyn);

          if(res){
               for(k = (size_t)(res - List->Synonym) + 1; k-- > 0; ) {
                    first = List->Synonym + k;
                    if(DpsUniStrCmp(Res->Word[i].uword, first->p.uword)) break;
                    if ((Res->Word[i].count != 0) && (first->p.count != Res->Word[i].count)) continue;
                    first->s.order = wword->order;
                    first->s.origin = DPS_WORD_ORIGIN_SYNONYM;
                    DpsWideWordListAdd(Res, &first->s, DPS_WWL_LOOSE);
               }
               for(last = res + 1; last < List->Synonym + List->nsynonyms; last++) {
                    if(DpsUniStrCmp(Res->Word[i].uword, last->p.uword)) break;
                    if ((Res->Word[i].count != 0) && (last->p.count != Res->Word[i].count)) continue;
                    last->s.order = wword->order;
                    last->s.origin = DPS_WORD_ORIGIN_SYNONYM;
                    DpsWideWordListAdd(Res, &last->s, DPS_WWL_LOOSE);
               }
          }

          syn.s.uword = Res->Word[i].uword;
          pres = bsearch(&psyn, List->Back, List->nsynonyms, sizeof(DPS_SYNONYM*), &cmpsynback);

          if(pres) {
               /* NOTE(review): unlike pass 1, these two loops add the `s`
                  side, i.e. the side that was just matched, not `p`.
                  Kept as in the original -- confirm this is intended. */
               for(k = (size_t)(pres - List->Back) + 1; k-- > 0; ) {
                    pfirst = List->Back + k;
                    if(DpsUniStrCmp(syn.s.uword, (*pfirst)->s.uword)) break;
                    if ((Res->Word[i].count != 0) && ((*pfirst)->s.count != Res->Word[i].count)) continue;
                    (*pfirst)->s.order = wword->order;
                    (*pfirst)->s.origin = DPS_WORD_ORIGIN_SYNONYM;
                    DpsWideWordListAdd(Res, &((*pfirst)->s), DPS_WWL_LOOSE);
               }
               for(plast = pres + 1; plast < List->Back + List->nsynonyms; plast++) {
                    if(DpsUniStrCmp(syn.s.uword, (*plast)->s.uword)) break;
                    if ((Res->Word[i].count != 0) && ((*plast)->s.count != Res->Word[i].count)) continue;
                    (*plast)->s.order = wword->order;
                    (*plast)->s.origin = DPS_WORD_ORIGIN_SYNONYM;
                    DpsWideWordListAdd(Res, &((*plast)->s), DPS_WWL_LOOSE);
               }
          }
     }
     return(Res);
}
示例#8
0
/*
 * Comparator for arrays of DPS_SYNONYM pointers: each argument points at an
 * array element (a pointer to a record), and records are ordered by their
 * `s` word.
 */
static int cmpsynback(const void * v1,const void * v2){
     const DPS_SYNONYM *left  = *(const DPS_SYNONYM * const *)v1;
     const DPS_SYNONYM *right = *(const DPS_SYNONYM * const *)v2;

     return DpsUniStrCmp(left->s.uword, right->s.uword);
}
示例#9
0
/* Comparator for arrays of DPS_SYNONYM records: orders them by their `p` word. */
static int cmpsyn(const void * v1,const void * v2){
     return DpsUniStrCmp(((const DPS_SYNONYM*)v1)->p.uword,
                         ((const DPS_SYNONYM*)v2)->p.uword);
}