static int cmpaffix(const void *s1,const void *s2){ int lc; if (((const DPS_AFFIX*)s1)->type < ((const DPS_AFFIX*)s2)->type) { return -1; } if (((const DPS_AFFIX*)s1)->type > ((const DPS_AFFIX*)s2)->type) { return 1; } lc = strcmp(((const DPS_AFFIX*)s1)->lang,((const DPS_AFFIX*)s2)->lang); if (lc == 0) { if ( (((const DPS_AFFIX*)s1)->replen == 0) && (((const DPS_AFFIX*)s2)->replen == 0) ) { return 0; } if (((const DPS_AFFIX*)s1)->replen == 0) { return -1; } if (((const DPS_AFFIX*)s2)->replen == 0) { return 1; } { dpsunicode_t u1[BUFSIZ], u2[BUFSIZ]; DpsUniStrCpy(u1,((const DPS_AFFIX*)s1)->repl); DpsUniStrCpy(u2,((const DPS_AFFIX*)s2)->repl); if (((const DPS_AFFIX*)s1)->type == 'p') { *u1 &= 255; *u2 &= 255; return DpsUniStrCmp(u1, u2); } else { u1[((const DPS_AFFIX*)s1)->replen - 1] &= 255; u2[((const DPS_AFFIX*)s2)->replen -1] &= 255; return DpsUniStrBCmp(u1, u2); } } } return lc; }
/*
 * Compare two words for the spell dictionary ordering.
 *
 * The first character of each word is compared by its low 8 bits only;
 * if those are equal and non-zero, the remainders (from the second
 * character on) are compared with DpsUniStrCmp().
 *
 * Returns -1 / 1 when the masked first characters differ, 0 when both
 * masked first characters are zero, otherwise the DpsUniStrCmp() result.
 */
static int cmpspellword(dpsunicode_t *w1, const dpsunicode_t *w2) {
  const dpsunicode_t lead1 = (*w1 & 255);
  const dpsunicode_t lead2 = (*w2 & 255);

  if (lead1 != lead2) {
    return (lead1 < lead2) ? -1 : 1;
  }
  if (lead1 == 0) {
    return 0;
  }
  return DpsUniStrCmp(w1 + 1, w2 + 1);
}
/*
 * Binary-search List->ChiWord for an entry whose word equals `word`
 * according to DpsUniStrCmp().
 *
 * The search relies on List->ChiWord being ordered by DpsUniStrCmp().
 * Returns a pointer to the matching entry, or NULL when the list has no
 * storage or no entry matches.
 */
static DPS_CHINAWORD * DpsChineseListFind(DPS_CHINALIST *List, const dpsunicode_t *word) {
  int lo = 0;
  int hi = List->nwords - 1;

  if (List->ChiWord == NULL) {
    return NULL;
  }

  while (lo <= hi) {
    const int mid = (lo + hi) / 2;
    DPS_CHINAWORD *candidate = &List->ChiWord[mid];
    const int order = DpsUniStrCmp(candidate->word, word);

    if (order == 0) {
      return candidate;
    }
    if (order < 0) {
      lo = mid + 1;   /* candidate sorts before the key: look right */
    } else {
      hi = mid - 1;   /* candidate sorts after the key: look left */
    }
  }
  return NULL;
}
static int cmpchinese(const void *s1,const void *s2){ return(DpsUniStrCmp(((const DPS_CHINAWORD*)s1)->word, ((const DPS_CHINAWORD*)s2)->word)); }
/*
 * Read a dictionary file and add its words to Conf->Spells.
 *
 * The whole file is read into memory and processed line by line.  Each
 * line has the form "word" or "word/FLAGS"; FLAGS is cut at the first
 * character that is not an ASCII letter.  Every word is converted from
 * `charset` to the internal "sys-int" charset, passed through
 * DpsUniStrToLower() and registered with DpsSpellAdd().  When
 * Conf->Flags.use_accentext is set, the DpsUniAccentStrip() variant and,
 * for languages starting with "de", the DpsUniGermanReplace() variant
 * are registered as well whenever they differ from the word itself.
 *
 * Parameters:
 *   Conf          - environment; Conf->Spells receives the words.
 *   lang          - language tag handed to DpsSpellAdd().
 *   charset       - name of the charset the file is encoded in.
 *   filename      - path of the file to load.
 *   skip_noflag   - if non-zero, lines without a "/FLAGS" part are skipped.
 *   first_letters - if non-empty, only words whose first byte (in the
 *                   dictionary charset, after lowercasing) occurs in this
 *                   string are loaded.
 *
 * Returns DPS_OK on success, DPS_ERROR on allocation / charset / I/O
 * failure, or the failing DpsSpellAdd() result.
 *
 * All heap buffers are released through the single `done` exit.  That
 * fixes two defects of the earlier per-branch cleanup: an uninitialized
 * pointer was passed to DPS_FREE() when the first allocation failed, and
 * both work buffers leaked when the file could not be opened.
 *
 * NOTE(review): the diagnostics below speak of a "synonyms file" although
 * this routine loads a dictionary -- looks like a copy/paste leftover;
 * the texts are left untouched here.
 */
__C_LINK int __DPSCALL DpsImportDictionary(DPS_ENV * Conf, const char *lang, const char *charset,
					   const char *filename, int skip_noflag, const char *first_letters) {
  struct stat sb;
  char *str, *data = NULL, *cur_n = NULL;
  char *lstr = NULL;
  dpsunicode_t *ustr = NULL;
  DPS_CHARSET *sys_int;
  DPS_CHARSET *dict_charset;
  DPS_CONV touni;
  DPS_CONV fromuni;
  int fd;
  int rc = DPS_ERROR;
  char savebyte = '\0';

  if ((lstr = (char*) DpsMalloc(2048)) == NULL) {
    return DPS_ERROR;                 /* nothing has been acquired yet */
  }
  if ((ustr = (dpsunicode_t*) DpsMalloc(8192)) == NULL) {
    goto done;
  }

  dict_charset = DpsGetCharSet(charset);
  sys_int = DpsGetCharSet("sys-int");
  if ((dict_charset == NULL) || (sys_int == NULL)) {
    goto done;
  }

  DpsConvInit(&touni, dict_charset, sys_int, Conf->CharsToEscape, 0);
  DpsConvInit(&fromuni, sys_int, dict_charset, Conf->CharsToEscape, 0);

  if (stat(filename, &sb)) {
    fprintf(stderr, "Unable to stat synonyms file '%s': %s", filename, strerror(errno));
    goto done;
  }

  /* NOTE(review): a descriptor value of 0 is treated as a failure here -- confirm against DpsOpen2(). */
  if ((fd = DpsOpen2(filename, O_RDONLY)) <= 0) {
    fprintf(stderr, "Unable to open synonyms file '%s': %s", filename, strerror(errno));
    goto done;
  }

  if ((data = (char*)DpsMalloc(sb.st_size + 1)) == NULL) {
    fprintf(stderr, "Unable to alloc %ld bytes", (long)sb.st_size);
    DpsClose(fd);
    goto done;
  }
  if (read(fd, data, sb.st_size) != (ssize_t)sb.st_size) {
    fprintf(stderr, "Unable to read synonym file '%s': %s", filename, strerror(errno));
    DpsClose(fd);
    goto done;
  }
  DpsClose(fd);
  data[sb.st_size] = '\0';

  /*
   * Lines are isolated in place: the byte following each '\n' is saved in
   * `savebyte` and temporarily overwritten with NUL, then restored when
   * the scan moves on to that line.
   */
  str = data;
  cur_n = strchr(str, '\n');
  if (cur_n != NULL) {
    cur_n++;
    savebyte = *cur_n;
    *cur_n = '\0';
  }

  while (str != NULL) {
    char *s;
    const char *flag = NULL;
    int res;

    /* Strip the line terminator(s). */
    for (s = str; *s; s++) {
      if (*s == '\r' || *s == '\n') *s = '\0';
    }

    if ((s = strchr(str, '/'))) {
      /* Split "word/FLAGS" and cut FLAGS at the first non-letter. */
      *s = 0;
      s++;
      flag = s;
      while (*s) {
	if (((*s >= 'A') && (*s <= 'Z')) || ((*s >= 'a') && (*s <= 'z'))) {
	  s++;
	} else {
	  *s = 0;
	  break;
	}
      }
    } else {
      if (skip_noflag) goto loop_continue;
      flag = "";
    }

    res = DpsConv(&touni, (char*)ustr, 8192, str, 1024);
    DpsUniStrToLower(ustr);

    /* Dont load words if first letter is not required */
    /* It allows to optimize loading at search time    */
    if (*first_letters) {
      DpsConv(&fromuni, lstr, 2048, ((const char*)ustr), (size_t)res);
      if (!strchr(first_letters, lstr[0])) goto loop_continue;
    }

    res = DpsSpellAdd(&Conf->Spells, ustr, flag, lang);
    if (res != DPS_OK) {
      rc = res;
      goto done;
    }

    if (Conf->Flags.use_accentext) {
      dpsunicode_t *af_uwrd = DpsUniAccentStrip(ustr);

      if (DpsUniStrCmp(af_uwrd, ustr) != 0) {
	res = DpsSpellAdd(&Conf->Spells, af_uwrd, flag, lang);
	if (res != DPS_OK) {
	  DPS_FREE(af_uwrd);
	  rc = res;
	  goto done;
	}
      }
      DPS_FREE(af_uwrd);

      if (strncasecmp(lang, "de", 2) == 0) {
	dpsunicode_t *de_uwrd = DpsUniGermanReplace(ustr);

	if (DpsUniStrCmp(de_uwrd, ustr) != 0) {
	  res = DpsSpellAdd(&Conf->Spells, de_uwrd, flag, lang);
	  if (res != DPS_OK) {
	    DPS_FREE(de_uwrd);
	    rc = res;
	    goto done;
	  }
	}
	DPS_FREE(de_uwrd);
      }
    }

  loop_continue:
    /* Advance to the next line, undoing the temporary NUL at its start. */
    str = cur_n;
    if (str != NULL) {
      *str = savebyte;
      cur_n = strchr(str, '\n');
      if (cur_n != NULL) {
	cur_n++;
	savebyte = *cur_n;
	*cur_n = '\0';
      }
    }
  }

  rc = DPS_OK;

 done:
  DPS_FREE(data);
  DPS_FREE(lstr);
  DPS_FREE(ustr);
  return rc;
}
int DpsSEAMake(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_DSTR *excerpt, const char *content_lang, size_t *indexed_size, size_t *indexed_limit, size_t max_word_len, size_t min_word_len, int crossec, int seasec #ifdef HAVE_ASPELL , int have_speller, AspellSpeller *speller #endif ) { DPS_SENTENCELIST List; DPS_MAPSTAT MapStat; DPS_TEXTITEM Item; DPS_VAR *Sec; dpsunicode_t *sentence, *lt, savec; double *links, *lang_cs, w; double delta, pdiv, cur_div; size_t l, sent_len, order; size_t min_len = 10000000, min_pos = 0; int it; register size_t i, j; #ifdef DEBUG char lcstr[4096]; #endif TRACE_IN(Indexer, "DpsSEAMake"); if((Sec = DpsVarListFind(&Doc->Sections, "sea"))) { /* set SEA section to NULL */ DPS_FREE(Sec->val); DPS_FREE(Sec->txt_val); Sec->curlen = 0; } bzero(&List, sizeof(List)); order = 0; sentence = DpsUniStrTok_SEA((dpsunicode_t*)excerpt->data, <); while(sentence) { if (lt != NULL) { savec = *lt; *lt = 0; } #ifdef DEBUG DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)sentence, sizeof(dpsunicode_t) * (DpsUniLen(sentence) + 1)); fprintf(stderr, "Sentence.%d: %s\n", List.nitems, lcstr); #endif if ((sent_len = DpsUniLen(sentence)) >= Indexer->Flags.SEASentenceMinLength) { j = 1; for (i = 0; i < List.nitems; i++) { if (DpsUniStrCmp(sentence, List.Sent[i].sentence) == 0) { j = 0; break; } } if (j) { if ( List.nitems < Indexer->Flags.SEASentences ) { if (List.nitems == List.mitems) { List.mitems += 16; List.Sent = (DPS_SENTENCE*)DpsRealloc(List.Sent, List.mitems * sizeof(DPS_SENTENCE)); if (List.Sent == NULL) { TRACE_OUT(Indexer); return DPS_ERROR;} } List.Sent[List.nitems].sentence = DpsUniDup(sentence); List.Sent[List.nitems].len = sent_len; List.Sent[List.nitems].order = order++; sentence = DpsUniDup(sentence); DpsUniStrToLower(sentence); bzero(&List.Sent[List.nitems].LangMap, sizeof(DPS_LANGMAP)); DpsBuildLangMap(&List.Sent[List.nitems].LangMap, (char*)sentence, sent_len * sizeof(dpsunicode_t), 0, 0); if (sent_len < min_len) { min_len = sent_len; min_pos 
= List.nitems; } List.nitems++; DPS_FREE(sentence); } else if (sent_len > min_len) { DPS_FREE(List.Sent[min_pos].sentence); List.Sent[min_pos].sentence = DpsUniDup(sentence); List.Sent[min_pos].len = sent_len; List.Sent[min_pos].order = order++; sentence = DpsUniDup(sentence); DpsUniStrToLower(sentence); bzero(&List.Sent[min_pos].LangMap, sizeof(DPS_LANGMAP)); DpsBuildLangMap(&List.Sent[min_pos].LangMap, (char*)sentence, sent_len * sizeof(dpsunicode_t), 0, 0); DPS_FREE(sentence); min_len = List.Sent[0].len; min_pos = 0; for(i = 1; i < List.nitems; i++) if (List.Sent[i].len < min_len) { min_len = List.Sent[i].len; min_pos = i; } } } } #ifdef DEBUG fprintf(stderr, "Sent. len.:%d, Min.allowed: %d\n", sent_len, Indexer->Flags.SEASentenceMinLength); #endif if (lt != NULL) *lt = savec; sentence = DpsUniStrTok_SEA(NULL, <); } DpsLog(Indexer, DPS_LOG_DEBUG, "SEA sentences: %d", List.nitems); if (List.nitems < 4) { for (i = 0; i < List.nitems; i++) DPS_FREE(List.Sent[i].sentence); DPS_FREE(List.Sent); TRACE_OUT(Indexer); return DPS_OK; } links = (double*)DpsMalloc(sizeof(double) * List.nitems * List.nitems); lang_cs = (double*)DpsMalloc(sizeof(double) * List.nitems); /* k ot links[i * List.nitems + j] */ if (links != NULL && lang_cs != NULL) { for (i = 0; i < List.nitems; i++) { DpsPrepareLangMap(&List.Sent[i].LangMap); } for (i = 0; i < List.nitems; i++) { List.Sent[i].Oi = List.Sent[i].di = 0.5; if (Doc->lang_cs_map == NULL) { links[i * List.nitems + i] = 0.0; } else { MapStat.map = &List.Sent[i].LangMap; DpsCheckLangMap6(Doc->lang_cs_map, &List.Sent[i].LangMap, &MapStat, DPS_LM_TOPCNT * DPS_LM_TOPCNT, 2 * DPS_LM_TOPCNT); links[i * List.nitems + i] = (double)MapStat.hits / (2.0 * DPS_LM_TOPCNT) / (List.nitems + 1); } #ifdef DEBUG DpsLog(Indexer, DPS_LOG_INFO, "Link %u->%u: %f [hits:%d miss:%d]", i, i, links[i * List.nitems + i], MapStat.hits, MapStat.miss); #endif for (j = 0; j < List.nitems; j++) { if (j == i) continue; MapStat.map = &List.Sent[j].LangMap; 
DpsCheckLangMap6(&List.Sent[j].LangMap, &List.Sent[i].LangMap, &MapStat, DPS_LM_TOPCNT * DPS_LM_TOPCNT, 2 * DPS_LM_TOPCNT); links[i * List.nitems + j] = (double)MapStat.hits / (2.0 * DPS_LM_TOPCNT) / (List.nitems + 1); #ifdef DEBUG DpsLog(Indexer, DPS_LOG_INFO, "Link %u->%u: %f [hits:%d miss:%d]", i, j, links[i * List.nitems + j], MapStat.hits, MapStat.miss); #endif } } for (l = 0; l < List.nitems; l++) { w = 0.0; for (i = 0; i < List.nitems; i++) { w += links[l * List.nitems + i] * List.Sent[i].Oi; } w = f(w); if (w < LOW_BORDER_EPS2) w = LOW_BORDER_EPS2; else if (w > HI_BORDER_EPS2) w = HI_BORDER_EPS2; List.Sent[l].di = w; } DpsSort(List.Sent, List.nitems, sizeof(DPS_SENTENCE), (qsort_cmp)SentCmp); #ifdef DEBUG DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[0].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[0].sentence) + 1)); fprintf(stderr, "Sent.0: %f %f -- %s\n", List.Sent[0].di, List.Sent[0].Oi, lcstr); DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[1].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[1].sentence) + 1)); fprintf(stderr, "Sent.1: %f %f -- %s\n", List.Sent[1].di, List.Sent[1].Oi, lcstr); DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[2].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[2].sentence) + 1)); fprintf(stderr, "Sent.2: %f %f -- %s\n", List.Sent[2].di, List.Sent[2].Oi, lcstr); DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[3].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[3].sentence) + 1)); fprintf(stderr, "Sent.3: %f %f -- %s\n", List.Sent[3].di, List.Sent[3].Oi, lcstr); DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[4].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[4].sentence) + 1)); fprintf(stderr, "Sent.4: %f %f -- %s\n", List.Sent[4].di, List.Sent[4].Oi, lcstr); #endif DpsSort(List.Sent, TOP_SENTENCES, sizeof(DPS_SENTENCE), (qsort_cmp)SentOrderCmp); bzero(&Item, sizeof(Item)); Item.section = seasec; Item.href = 
NULL; Item.section_name = "sea"; for (i = 0; i < TOP_SENTENCES; i++) { dpsunicode_t *UStr = DpsUniDup(List.Sent[i].sentence); DpsPrepareItem(Indexer, Doc, &Item, List.Sent[i].sentence, UStr, content_lang, indexed_size, indexed_limit, max_word_len, min_word_len, crossec #ifdef HAVE_ASPELL , have_speller, speller, NULL #endif ); DPS_FREE(UStr); } } DPS_FREE(lang_cs); DPS_FREE(links); for (i = 0; i < List.nitems; i++) DPS_FREE(List.Sent[i].sentence); DPS_FREE(List.Sent); TRACE_OUT(Indexer); return DPS_OK; }
/*
 * Collect the synonyms of `wword` from a synonym list.
 *
 * List->Synonym is searched by the `p` word (comparator cmpsyn) and
 * List->Back, an array of pointers to synonym records, by the `s` word
 * (comparator cmpsynback); both arrays hold List->nsynonyms entries and
 * must be sorted accordingly for bsearch() to work.
 *
 * Pass 1: every record whose `p` word equals wword->uword contributes its
 *         `s` word, and every record whose `s` word equals wword->uword
 *         contributes its `p` word.
 * Pass 2: the same lookups are repeated for each word found in pass 1.
 *         A record is skipped when the pass-1 word has a non-zero `count`
 *         that differs from the record's own count.
 *
 * Each contributed word gets wword->order and origin
 * DPS_WORD_ORIGIN_SYNONYM written into the list record itself before it
 * is added to the result with DpsWideWordListAdd(..., DPS_WWL_LOOSE).
 *
 * Returns a list allocated with DpsMalloc() (the caller presumably has to
 * release it -- confirm against the callers), or NULL when the list is
 * empty, nothing matches, or the allocation fails.
 *
 * The backward scans stop explicitly on the first array element instead
 * of decrementing the cursor past it: forming a pointer before the start
 * of an array is undefined behaviour in C.
 */
DPS_WIDEWORDLIST * DpsSynonymListFind(const DPS_SYNONYMLIST * List, DPS_WIDEWORD * wword) {
  DPS_SYNONYM syn, *res, *first, *last;
  DPS_SYNONYM *psyn, **pres, **pfirst, **plast;
  DPS_WIDEWORDLIST *Res = NULL;
  size_t nnorm, i;

  if (!List->nsynonyms) return NULL;

  /* ---- Pass 1, forward index: records whose p word is the query ---- */
  syn.p.uword = wword->uword;
  res = bsearch(&syn, List->Synonym, List->nsynonyms, sizeof(DPS_SYNONYM), &cmpsyn);

  if (res) {
    Res = (DPS_WIDEWORDLIST *)DpsMalloc(sizeof(*Res));
    if (Res == NULL) return NULL;
    DpsWideWordListInit(Res);

    /* bsearch() may land anywhere inside a run of equal keys: walk down, then up. */
    for (first = res; ; first--) {
      if (DpsUniStrCmp(wword->uword, first->p.uword)) break;
      first->s.order = wword->order;
      first->s.origin = DPS_WORD_ORIGIN_SYNONYM;
      DpsWideWordListAdd(Res, &first->s, DPS_WWL_LOOSE);
      if (first == List->Synonym) break;
    }
    for (last = res + 1; last < List->Synonym + List->nsynonyms; last++) {
      if (DpsUniStrCmp(wword->uword, last->p.uword)) break;
      last->s.order = wword->order;
      last->s.origin = DPS_WORD_ORIGIN_SYNONYM;
      DpsWideWordListAdd(Res, &last->s, DPS_WWL_LOOSE);
    }
  }

  /* ---- Pass 1, back index: records whose s word is the query ---- */
  syn.s.uword = wword->uword;
  psyn = &syn;
  pres = bsearch(&psyn, List->Back, List->nsynonyms, sizeof(DPS_SYNONYM*), &cmpsynback);

  if (pres) {
    if (Res == NULL) {
      Res = (DPS_WIDEWORDLIST *)DpsMalloc(sizeof(*Res));
      if (Res == NULL) return NULL;
      DpsWideWordListInit(Res);
    }

    for (pfirst = pres; ; pfirst--) {
      if (DpsUniStrCmp(wword->uword, (*pfirst)->s.uword)) break;
      (*pfirst)->p.order = wword->order;
      (*pfirst)->p.origin = DPS_WORD_ORIGIN_SYNONYM;
      DpsWideWordListAdd(Res, &((*pfirst)->p), DPS_WWL_LOOSE);
      if (pfirst == List->Back) break;
    }
    for (plast = pres + 1; plast < List->Back + List->nsynonyms; plast++) {
      if (DpsUniStrCmp(wword->uword, (*plast)->s.uword)) break;
      (*plast)->p.order = wword->order;
      (*plast)->p.origin = DPS_WORD_ORIGIN_SYNONYM;
      DpsWideWordListAdd(Res, &((*plast)->p), DPS_WWL_LOOSE);
    }
  }

  if (Res == NULL) return NULL;

  /* ---- Pass 2: now find each of them in reverse order ---- */
  /* Only the words present at this point are expanded (nnorm is fixed
     before the loop); Res->Word[i] is re-read on every use because
     DpsWideWordListAdd() may change the array. */
  if ((nnorm = Res->nwords) > 0) {
    for (i = 0; i < nnorm; i++) {

      syn.p.uword = Res->Word[i].uword;
      res = bsearch(&syn, List->Synonym, List->nsynonyms, sizeof(DPS_SYNONYM), &cmpsyn);

      if (res) {
	for (first = res; ; first--) {
	  if (DpsUniStrCmp(Res->Word[i].uword, first->p.uword)) break;
	  if ((Res->Word[i].count == 0) || (first->p.count == Res->Word[i].count)) {
	    first->s.order = wword->order;
	    first->s.origin = DPS_WORD_ORIGIN_SYNONYM;
	    DpsWideWordListAdd(Res, &first->s, DPS_WWL_LOOSE);
	  }
	  if (first == List->Synonym) break;
	}
	for (last = res + 1; last < List->Synonym + List->nsynonyms; last++) {
	  if (DpsUniStrCmp(Res->Word[i].uword, last->p.uword)) break;
	  if ((Res->Word[i].count != 0) && (last->p.count != Res->Word[i].count)) continue;
	  last->s.order = wword->order;
	  last->s.origin = DPS_WORD_ORIGIN_SYNONYM;
	  DpsWideWordListAdd(Res, &last->s, DPS_WWL_LOOSE);
	}
      }

      syn.s.uword = Res->Word[i].uword;
      pres = bsearch(&psyn, List->Back, List->nsynonyms, sizeof(DPS_SYNONYM*), &cmpsynback);

      if (pres) {
	/* NOTE(review): unlike pass 1, these two loops add the record's `s`
	   word (the one that just matched) rather than its `p` word --
	   confirm this is intended. */
	for (pfirst = pres; ; pfirst--) {
	  if (DpsUniStrCmp(syn.s.uword, (*pfirst)->s.uword)) break;
	  if ((Res->Word[i].count == 0) || ((*pfirst)->s.count == Res->Word[i].count)) {
	    (*pfirst)->s.order = wword->order;
	    (*pfirst)->s.origin = DPS_WORD_ORIGIN_SYNONYM;
	    DpsWideWordListAdd(Res, &((*pfirst)->s), DPS_WWL_LOOSE);
	  }
	  if (pfirst == List->Back) break;
	}
	for (plast = pres + 1; plast < List->Back + List->nsynonyms; plast++) {
	  if (DpsUniStrCmp(syn.s.uword, (*plast)->s.uword)) break;
	  if ((Res->Word[i].count != 0) && ((*plast)->s.count != Res->Word[i].count)) continue;
	  (*plast)->s.order = wword->order;
	  (*plast)->s.origin = DPS_WORD_ORIGIN_SYNONYM;
	  DpsWideWordListAdd(Res, &((*plast)->s), DPS_WWL_LOOSE);
	}
      }
    }
  }

  return Res;
}
/*
 * Ordering function for arrays of DPS_SYNONYM pointers (qsort()/bsearch()
 * signature): each argument points to a DPS_SYNONYM pointer, and the
 * records are ordered by their `s.uword` using DpsUniStrCmp().
 */
static int cmpsynback(const void * v1, const void * v2) {
  const DPS_SYNONYM *left = *(const DPS_SYNONYM * const *)v1;
  const DPS_SYNONYM *right = *(const DPS_SYNONYM * const *)v2;

  return DpsUniStrCmp(left->s.uword, right->s.uword);
}
/*
 * Ordering function for arrays of DPS_SYNONYM records (qsort()/bsearch()
 * signature): orders two records by their `p.uword` using DpsUniStrCmp().
 */
static int cmpsyn(const void * v1, const void * v2) {
  return DpsUniStrCmp(((const DPS_SYNONYM*)v1)->p.uword,
		      ((const DPS_SYNONYM*)v2)->p.uword);
}