/**
 * Replace SGML entities with their Unicode equivalents, in place.
 *
 * Two forms are recognised, and both must be terminated by ';':
 *   - decimal character references, "&#NNN;"
 *   - named entities made of ASCII letters, resolved through DpsSgmlToUni()
 *
 * A recognised entity is overwritten by a single code point and the rest of
 * the string (terminator included) is shifted left over the entity text.
 * Anything that is not recognised is left untouched.
 *
 * A reference that decodes to 0 ("&#0;", "&#;") is NOT substituted: storing
 * a 0 would plant a string terminator in the middle of the text.  This
 * matches the named-entity branch, which also ignores a zero result.
 *
 * @param ustr  NUL-terminated string; modified in place.
 */
void DpsSGMLUniUnescape(dpsunicode_t *ustr) {
  dpsunicode_t *s = ustr, *e, c;
  char sgml[DPS_MAX_SGML_LEN + 1];

  while (*s) {
    if (*s == '&') {
      int i = 0;

      if (*(s + 1) == '#') {
        /* Find the end of the digit run.  The scan is capped at
           DPS_MAX_SGML_LEN characters so the digits always fit in sgml[]. */
        for (e = s + 2; (e - s < DPS_MAX_SGML_LEN) && (*e <= '9') && (*e >= '0'); e++);

        if (*e == ';') {
          for (i = 2; s + i < e; i++) {
            sgml[i - 2] = (char)s[i];
          }
          sgml[i - 2] = '\0';

          c = (dpsunicode_t)atoi(sgml);
          if (c) {
            *s = c;
            dps_memmove(s + 1, e + 1, sizeof(dpsunicode_t) * (DpsUniLen(e + 1) + 1));
          }
        }
      } else {
        /* Collect the ASCII-letter entity name, with the same length cap. */
        for (e = s + 1;
             (e - s < DPS_MAX_SGML_LEN) && (((*e <= 'z') && (*e >= 'a')) || ((*e <= 'Z') && (*e >= 'A')));
             e++) {
          sgml[i] = (char)*e;
          i++;
        }
        sgml[i] = '\0';

        if ((*e == ';') && (c = DpsSgmlToUni(sgml))) {
          *s = c;
          dps_memmove(s + 1, e + 1, sizeof(dpsunicode_t) * (DpsUniLen(e + 1) + 1));
        }
      }
    }
    s++;
  }
}
/*
 * Backward Unicode string comparison.
 *
 * The two strings are compared character by character starting from their
 * last characters and moving towards the front.  The first differing pair
 * decides the result.  If one string runs out first (i.e. it is a suffix of
 * the other), the shorter string compares as smaller.
 *
 * Returns -1, 0 or 1.
 */
int DpsUniStrBCmp(const dpsunicode_t *s1, const dpsunicode_t *s2) {
  ssize_t i1 = DpsUniLen(s1) - 1;
  ssize_t i2 = DpsUniLen(s2) - 1;

  for (; i1 >= 0 && i2 >= 0; i1--, i2--) {
    if (s1[i1] != s2[i2]) {
      return (s1[i1] < s2[i2]) ? -1 : 1;
    }
  }

  /* At least one string is exhausted; the remaining index tells which. */
  if (i1 == i2) {
    return 0;
  }
  return (i1 < i2) ? -1 : 1;
}
/*
 * Append one word record to the list without copying the word string.
 *
 * The word pointer of `chinaword` is stored as is.  The word array grows in
 * steps of 1024 entries, and the 65536-entry `hash` table is created on
 * first use.  hash[c] holds the length of the longest word seen so far whose
 * first character, masked to 16 bits, equals c.
 *
 * If either allocation fails, the list counters are reset to 0 and the word
 * is not added.
 */
static void DpsChineseListAddBundle(DPS_CHINALIST *List, DPS_CHINAWORD * chinaword){
  DPS_CHINAWORD *slot;
  unsigned int bucket;
  size_t wlen;

  if (List->nwords + 1 > List->mwords) {
    List->mwords += 1024;
    List->ChiWord = (DPS_CHINAWORD *)DpsRealloc(List->ChiWord, (List->mwords)*sizeof(DPS_CHINAWORD));
    if (List->ChiWord == NULL) {
      List->mwords = List->nwords = 0;
      return;
    }
  }

  if (List->hash == NULL) {
    List->hash = (size_t *)DpsXmalloc(65536 * sizeof(size_t));
    if (List->hash == NULL) {
      List->mwords = List->nwords = 0;
      return;
    }
  }

  slot = &List->ChiWord[List->nwords];
  slot->word = chinaword->word;
  slot->freq = chinaword->freq;
  List->total += chinaword->freq;

  /* Remember the longest word length per leading character. */
  bucket = (unsigned int)(slot->word[0] & 0xffff);
  wlen = DpsUniLen(slot->word);
  if (List->hash[bucket] < wlen) {
    List->hash[bucket] = wlen;
  }

  List->nwords++;
}
dpsunicode_t *DpsUniGermanReplace(const dpsunicode_t *str) { size_t l = DpsUniLen(str); dpsunicode_t *german = DpsMalloc((2 * l + 1) * sizeof(dpsunicode_t)); if (german !=NULL) { dpsunicode_t *s = str, *d = german; while(*s) { switch(*s) { case 0x00DF: /* eszett, or scharfes s, small */ *d++ = 's'; *d++ = 's'; break; case 0x1E9E: /* eszett, or scharfes s, big */ *d++ = 'S'; *d++ = 'S'; break; case 0x00D6: *d++ = 'O'; *d++ = 'E'; break; case 0x00F6: *d++ = 'o'; *d++ = 'e'; break; case 0x00DC: *d++ = 'U'; *d++ = 'E'; break; case 0x00FC: *d++ = 'u'; *d++ = 'e'; break; case 0x00C4: *d++ = 'A'; *d++ = 'E'; break; case 0x00E4: *d++ = 'a'; *d++ = 'e'; break; default: *d++ = *s; } s++; } *d = 0; } return german; }
/*
 * Reversed string copy: write the characters of `src` into `dst` in reverse
 * order and terminate `dst`.  `dst` must have room for the length of `src`
 * plus the terminator.  Returns `dst`.
 */
dpsunicode_t *DpsUniStrRCpy(dpsunicode_t *dst, const dpsunicode_t *src) {
  const size_t n = DpsUniLen(src);
  size_t k;

  for (k = 0; k < n; k++) {
    dst[k] = src[n - 1 - k];
  }
  dst[n] = 0;

  return dst;
}
/*
 * Backward Unicode string comparison limited to `count` characters.
 *
 * Characters are compared from the ends of both strings towards the front.
 * The first differing pair decides the result.  If `count` pairs matched,
 * the strings are considered equal.  Otherwise the string that ran out first
 * compares as smaller.
 *
 * Returns -1, 0 or 1.
 */
int DpsUniStrBNCmp(const dpsunicode_t *s1, const dpsunicode_t *s2, size_t count) {
  ssize_t i1 = DpsUniLen(s1) - 1;
  ssize_t i2 = DpsUniLen(s2) - 1;
  ssize_t left = count;

  for (; i1 >= 0 && i2 >= 0 && left > 0; i1--, i2--, left--) {
    if (s1[i1] != s2[i2]) {
      return (s1[i1] < s2[i2]) ? -1 : 1;
    }
  }

  if (left == 0) {
    return 0;
  }
  if (i1 != i2) {
    return (i1 < i2) ? -1 : 1;
  }
  /* Same remaining position in both strings: fall back to the first characters. */
  if (*s1 != *s2) {
    return (*s1 < *s2) ? -1 : 1;
  }
  return 0;
}
/*
 * Duplicate at most `len` characters of `s` into a newly allocated,
 * always-terminated string.
 *
 * The source is scanned only up to `len` characters.  It therefore does not
 * need to be terminated beyond that point, and the cost depends on `len`
 * rather than on the full length of `s`.
 *
 * Returns a buffer obtained from DpsMalloc(), or NULL if the allocation
 * fails.
 */
dpsunicode_t *DpsUniNDup(const dpsunicode_t *s, size_t len) {
  dpsunicode_t *res;
  size_t size = 0;

  /* Bounded length: min(length of s, len) without reading past s[len - 1]. */
  while (size < len && s[size] != 0) {
    size++;
  }

  if ((res = (dpsunicode_t*)DpsMalloc((size + 1) * sizeof(*s))) == NULL) {
    return NULL;
  }
  dps_memmove(res, s, size * sizeof(*s));
  res[size] = 0;
  return res;
}
/*
 * Duplicate a NUL-terminated Unicode string.
 *
 * Returns a buffer obtained from DpsMalloc() holding a copy of `s` including
 * its terminator, or NULL if the allocation fails.
 */
dpsunicode_t *DpsUniDup(const dpsunicode_t *s) {
  const size_t bytes = (DpsUniLen(s) + 1) * sizeof(*s);
  dpsunicode_t *copy = (dpsunicode_t*)DpsMalloc(bytes);

  if (copy != NULL) {
    /* source and fresh allocation cannot overlap, so a plain copy is enough */
    dps_memcpy(copy, s, bytes);
  }
  return copy;
}
/*
 * String copy: copy `src`, including its terminator, into `dst`.
 *
 * The copy is done with dps_memmove() and this function returns whatever
 * dps_memmove() returns.
 */
dpsunicode_t *DpsUniStrCpy(dpsunicode_t *dst, const dpsunicode_t *src) {
  const size_t units = DpsUniLen(src) + 1;   /* characters plus terminator */

  return dps_memmove(dst, src, units * sizeof(dpsunicode_t));
}
/*
 * Bounded string copy: copy at most `len` characters from `src` to `dst`.
 *
 * When `src` (terminator included) fits within `len`, the terminator is
 * copied too.  Otherwise exactly `len` characters are copied and `dst` is
 * NOT terminated.  Unlike strncpy(), the rest of `dst` is not padded.
 *
 * Returns whatever dps_memmove() returns.
 */
dpsunicode_t *DpsUniStrNCpy(dpsunicode_t *dst, const dpsunicode_t *src, size_t len) {
  size_t units = DpsUniLen(src) + 1;

  if (units > len) {
    units = len;
  }
  return dps_memmove(dst, src, sizeof(dpsunicode_t) * units);
}
/*
 * Reversed duplicate: allocate a new string holding the characters of `s`
 * in reverse order.
 *
 * Returns a buffer obtained from DpsMalloc(), or NULL if the allocation
 * fails.
 */
dpsunicode_t *DpsUniRDup(const dpsunicode_t *s) {
  const size_t len = DpsUniLen(s);
  dpsunicode_t *rev = (dpsunicode_t*)DpsMalloc((len + 1) * sizeof(*s));
  size_t k;

  if (rev == NULL) {
    return NULL;
  }
  for (k = 0; k < len; k++) {
    rev[k] = s[len - 1 - k];
  }
  rev[len] = 0;

  return rev;
}
/*
 * Append `piece` to the growing string *out, preceded by a single space when
 * *out is not empty.  *cap is the capacity of *out in characters and is
 * enlarged as needed.
 *
 * Returns 0 on success.  Returns -1 if the buffer could not be enlarged; in
 * that case *out is NULL.
 */
static int DpsSegmentAppend(dpsunicode_t **out, size_t *cap, const dpsunicode_t *piece) {
  dpsunicode_t space[] = { 32, 0 };
  size_t need = 2 * (DpsUniLen(piece) + 1);
  size_t used = DpsUniLen(*out);

  if (used + need >= *cap) {
    *cap = used + need + 1;
    *out = (dpsunicode_t*)DpsRealloc(*out, *cap * sizeof(dpsunicode_t));
    if (*out == NULL) {
      return -1;
    }
  }
  if (**out) {
    DpsUniStrCat(*out, space);
  }
  DpsUniStrCat(*out, piece);
  return 0;
}

/*
 * Split `line` into tokens with DpsUniGetSepToken() and rebuild it as a
 * space-separated string.
 *
 * A token whose first character has a DpsUniCType() class above
 * DPS_UNI_BUKVA, or equal to 1 or 2, is copied as is.  Every other token is
 * passed through DpsSegmentProcess() and its segmented form is appended
 * instead.
 *
 * `line` itself is not modified; tokenisation runs on a private copy.
 *
 * Returns a newly allocated string the caller must free, or NULL on
 * allocation failure or when DpsSegmentProcess() fails.
 */
dpsunicode_t *DpsSegmentByFreq(DPS_CHINALIST *List, dpsunicode_t *line) {
  dpsunicode_t *out, *mid, *last, *sentence, *segmented_sentence, part;
  size_t i, l, line_len;
  int ctype, have_bukva_forte, fb_type, rc;

  line_len = DpsUniLen(line);           /* measured once, not on every loop pass */
  l = 2 * (line_len + 1);
  if (l < 2) return NULL;               /* size computation wrapped around */

  out = (dpsunicode_t*)DpsMalloc(l * sizeof(dpsunicode_t));
  if (out == NULL) return NULL;
  *out = '\0';

  mid = (dpsunicode_t*)DpsXmalloc(l * sizeof(dpsunicode_t));
  if (mid == NULL) {
    DPS_FREE(out);
    return NULL;
  }

  /* Working copy of the line: the tokeniser loop below temporarily writes
     terminators into it. */
  for (i = 0; i < line_len; i++) {
    mid[i] = line[i];
  }
  mid[line_len] = 0;                    /* explicit terminator; do not rely on the allocator zero-filling */

  for (sentence = DpsUniGetSepToken(mid, &last, &ctype, &have_bukva_forte, 0);
       sentence;
       sentence = DpsUniGetSepToken(NULL, &last, &ctype, &have_bukva_forte, 0)) {

    /* Cut the token out of the buffer; the saved character is put back below. */
    part = *last;
    *last = 0;

    fb_type = DpsUniCType(*sentence);
    if (fb_type > DPS_UNI_BUKVA || fb_type == 2 || fb_type == 1) {
      rc = DpsSegmentAppend(&out, &l, sentence);
    } else {
      segmented_sentence = DpsSegmentProcess(List, sentence);
      if (segmented_sentence == NULL) {
        DPS_FREE(out);                  /* previously leaked on this path */
        DPS_FREE(mid);
        return NULL;
      }
      rc = DpsSegmentAppend(&out, &l, segmented_sentence);
      DPS_FREE(segmented_sentence);     /* released on success and on failure alike */
    }

    if (rc != 0) {
      /* `out` is already NULL here (see DpsSegmentAppend). */
      DPS_FREE(mid);
      return NULL;
    }

    *last = part;
  }

  DPS_FREE(mid);
  return out;
}
/*
 * Segment `line` into dictionary words separated by spaces.
 *
 * Candidate segmentations ("paths") are kept in four parallel arrays indexed
 * by a path id:
 *   position[id]  number of characters of `line` the path has consumed
 *   value[id]     score of the path
 *   result[id]    text of the path (each word is preceded by a space)
 *   next[id]      id of the next path in a singly linked list starting at
 *                 `top`; -1 ends the list
 *
 * Each round removes the first path that has not consumed the whole line
 * (the "open" path).  It then extends that path by every dictionary word
 * starting at its position, from the longest candidate length down to one
 * character.  A single character that is not in the dictionary is added to
 * it with frequency 1.  The loop stops when the head path is complete and is
 * the only one left; its text is returned.
 *
 * When List->hash is NULL, a plain copy of `line` is returned.
 *
 * Returns a newly allocated string the caller must free, or NULL on
 * allocation failure.
 *
 * NOTE(review): the array-growing code assigns the DpsRealloc() result
 * straight back, as the original did.  This assumes DpsRealloc() releases
 * the old block when it fails -- confirm against its definition.
 */
static dpsunicode_t *DpsSegmentProcess(DPS_CHINALIST *List, dpsunicode_t *line) {
  int top, nextid = 0, len, maxid, i, current, father, needinsert, iindex;
  int *position = NULL, *next = NULL;
  unsigned int h;
  size_t need;
  double *value = NULL, p;
  dpsunicode_t **result = NULL;
  dpsunicode_t *otv = NULL, space[] = {32, 0};
  DPS_CHINAWORD *chinaword, chiw;

  if (List->hash == NULL) {
    return (dpsunicode_t*)DpsUniDup(line);
  }

  len = DpsUniLen(line);
  maxid = 2 * len + 1;

  position = (int*)DpsMalloc(maxid * sizeof(int));
  if (position == NULL) goto fail;
  next = (int*)DpsMalloc(maxid * sizeof(int));
  if (next == NULL) goto fail;
  value = (double*)DpsMalloc(maxid * sizeof(double));
  if (value == NULL) goto fail;
  result = (dpsunicode_t **)DpsMalloc(maxid * sizeof(dpsunicode_t *));
  if (result == NULL) goto fail;

  /* Path 0: nothing consumed yet, empty text. */
  top = 0;
  value[0] = 1.0 * List->total * len;
  position[0] = 0;
  next[0] = -1;
  result[0] = (dpsunicode_t*)DpsUniDup(&space[1]);
  if (result[0] == NULL) goto fail;
  nextid = 1;

  while ((top != -1) && (!((position[top] >= len) && (next[top] == -1)))) {

    /* find the first open path */
    current = top;
    father = top;
    while ((current != -1) && (position[current] >= len)) {
      father = current;
      current = next[current];
    }

    /* remove this path */
    if (current == top) {
      top = next[top];
    } else if (current != -1) {
      /* Guarded: with current == -1 there is nothing to unlink, and
         next[current] would read one element before the array. */
      next[father] = next[current];
    }

    if (current == -1) {
      /* no open path, finished, take the first path */
      next[top] = -1;
    } else {
      h = (unsigned int)(line[position[current]] & 0xffff);
      /* if the first character doesn't have word phrase in the dict. */
      if (List->hash[h] == 0) {
        List->hash[h] = 1;
      }
      /* Longest candidate length, clipped to what is left of the line. */
      i = List->hash[h];
      if (i + position[current] > len) {
        i = len - position[current];
      }

      otv = NULL;
      for (; i > 0; i--) {
        DPS_FREE(otv);
        otv = DpsUniNDup(&line[position[current]], (size_t)i);
        if (otv == NULL) goto fail;

        chinaword = DpsChineseListFind(List, otv);
        if (i == 1 && chinaword == NULL) {
          /* Unknown single character: register it with frequency 1 so the
             line can always be consumed.  otv already holds exactly that
             one character. */
          chiw.word = otv;
          chiw.freq = 1;
          DpsChineseListAdd(List, chinaword = &chiw);
        }

        if ((chinaword != NULL) && chinaword->freq) {
          /* Create path `nextid` = path `current` extended by this word. */
          p = (double)chinaword->freq / List->total;
          value[nextid] = value[current] / (-1.0 * log(p) / log(10.0));
          position[nextid] = position[current] + i;

          need = DpsUniLen(result[current]) + DpsUniLen(otv) + 2;
          result[nextid] = (dpsunicode_t*)DpsXmalloc(need * sizeof(dpsunicode_t));
          if (result[nextid] == NULL) goto fail;   /* nextid not yet counted: entries 0..nextid-1 get freed */
          DpsUniStrCpy(result[nextid], result[current]);
          DpsUniStrCat(result[nextid], space);
          DpsUniStrCat(result[nextid], otv);

          /* check to see whether there is duplicated path;
             if there is a duplicate path, remove the small value path */
          needinsert = 1;
          iindex = top;
          father = top;
          while (iindex != -1) {
            if (position[iindex] == position[nextid]) {
              if (0.85 * value[iindex] >= value[nextid]) {
                needinsert = 0;
              } else {
                if (top == iindex) {
                  /* The new path takes the place of the list head. */
                  next[nextid] = next[iindex];
                  top = nextid;
                  needinsert = 0;
                }
              }
              iindex = -1;
            } else {
              father = iindex;
              iindex = next[iindex];
            }
          }

          /* insert the new path into the list */
          if (needinsert == 1) {
            /* NOTE(review): iindex is always -1 when the scan above ends, so
               this ordering walk never iterates and the path is linked
               after `father`.  Kept as found. */
            while ((iindex != -1) && (value[iindex] > value[nextid])) {
              father = iindex;
              iindex = next[iindex];
            }
            if (top == iindex) {
              next[nextid] = top;
              top = nextid;
            } else {
              next[father] = nextid;
              next[nextid] = iindex;
            }
          }

          nextid++;
          if (nextid >= maxid) {
            maxid += 128;
            position = (int*)DpsRealloc(position, maxid * sizeof(int));
            next = (int*)DpsRealloc(next, maxid * sizeof(int));
            value = (double*)DpsRealloc(value, maxid * sizeof(double));
            result = (dpsunicode_t **)DpsRealloc(result, maxid * sizeof(dpsunicode_t *));
            if (position == NULL || next == NULL || value == NULL || result == NULL) goto fail;
          }
        }
      }
      DPS_FREE(otv);
      otv = NULL;
    }
  }

  if (top == -1) {
    /* No path is left (possible when every candidate word has a zero
       frequency).  result[top] would be out of bounds, so hand back the
       line unsegmented, as is done when there is no dictionary hash. */
    for (i = 0; i < nextid; i++) {
      DPS_FREE(result[i]);
    }
    DPS_FREE(result);
    DPS_FREE(position);
    DPS_FREE(next);
    DPS_FREE(value);
    return (dpsunicode_t*)DpsUniDup(line);
  }

  /* Keep the winning path's text, release everything else. */
  DPS_FREE(position);
  DPS_FREE(next);
  for (i = 0; i < nextid; i++) {
    if (i != top) {
      DPS_FREE(result[i]);
    }
  }
  otv = result[top];
  DPS_FREE(value);
  DPS_FREE(result);
  return otv;

fail:
  /* Allocation failure: release the candidate word, every path text built
     so far and the bookkeeping arrays. */
  DPS_FREE(otv);
  if (result != NULL) {
    for (i = 0; i < nextid; i++) {
      DPS_FREE(result[i]);
    }
    DPS_FREE(result);
  }
  DPS_FREE(position);
  DPS_FREE(next);
  DPS_FREE(value);
  return NULL;
}
/*
 * String append: copy `append`, terminator included, onto the end of `s`.
 * `s` must have room for the combined string.  Returns `s`.
 */
dpsunicode_t *DpsUniStrCat(dpsunicode_t *s, const dpsunicode_t *append) {
  dpsunicode_t *tail = s + DpsUniLen(s);   /* current terminator of s */

  DpsUniStrCpy(tail, append);
  return s;
}
/*
 * Append one item (a word or an operator/bracket command) to the query
 * stack held in Res->items.
 *
 * The steps are:
 *  - A word found in the stop list, or whose length is outside
 *    [min_word_len, max_word_len], gets DPS_WORD_ORIGIN_STOP added to its
 *    origin.
 *  - A word whose origin lacks DPS_WORD_ORIGIN_QUERY is skipped when an item
 *    with the same order and hash is already on the stack.
 *  - An AND/OR/NEAR/ANYWORD command directly after another such command is
 *    dropped.
 *  - An implicit operator is inserted first when a word follows a word or a
 *    closing bracket (DPS_STACK_OR), or when an opening bracket follows a
 *    closing bracket (state->add_cmd).
 *
 * `word` and `uword` are duplicated; the caller keeps ownership of its
 * arguments.
 *
 * Returns DPS_OK, or DPS_ERROR when the stack cannot be enlarged.
 */
int DpsAddStackItem(DPS_AGENT *query, DPS_RESULT *Res, DPS_PREPARE_STATE *state, char *word, dpsunicode_t *uword) {
  DPS_STACK_ITEM *item, *prev;
  int origin;
  int glue_or, glue_add;
  size_t i;
  size_t wlen = (uword == NULL) ? 0 : DpsUniLen(uword);
  dpshash32_t crcword = (word == NULL) ? 0 : DpsStrHash32(word);

#ifdef DEBUG_BOOL
  DpsLog(query, DPS_LOG_EXTRA, "0[%d].%x %c -- %s [%x] .secno:%d\n", state->order, state->origin, item_type(state->cmd), (word == NULL) ? "<NULL>" : word, crcword, state->secno[state->p_secno]);
#endif

  if ((uword != NULL) &&
      (DpsStopListFind(&query->Conf->StopWords, uword, state->qlang) ||
       (query->WordParam.min_word_len > wlen) ||
       (query->WordParam.max_word_len < wlen))) {
    origin = state->origin | DPS_WORD_ORIGIN_STOP;
  } else {
    origin = state->origin;
  }

  if (state->cmd == DPS_STACK_WORD && !(origin & DPS_WORD_ORIGIN_QUERY)) {
    for (i = 0; i < Res->nitems; i++) {
      if ((Res->items[i].order == state->order) && (Res->items[i].crcword == crcword)) return DPS_OK;
    }
  }

  /* One call appends at most two items (an implicit operator plus the item
     itself).  The test is written as an addition so that it cannot wrap
     around when mitems is an unsigned value smaller than 2. */
  if (Res->nitems + 2 >= Res->mitems) {
    Res->mitems += DPS_MAXSTACK;
    Res->items = (DPS_STACK_ITEM*)DpsRealloc(Res->items, Res->mitems * sizeof(DPS_STACK_ITEM));
    if (Res->items == NULL) {
      /* arguments cast to match the %d conversions */
      DpsLog(query, DPS_LOG_ERROR, "Can't alloc %d bytes for %d mitems", (int)(Res->mitems * sizeof(DPS_STACK_ITEM)), (int)Res->mitems);
      return DPS_ERROR;
    }
  }

  if (Res->nitems > 0) {
    prev = &Res->items[Res->nitems - 1];

    if (state->cmd == DPS_STACK_OR || state->cmd == DPS_STACK_AND || state->cmd == DPS_STACK_NEAR || state->cmd == DPS_STACK_ANYWORD) {
      if (prev->cmd == DPS_STACK_AND || prev->cmd == DPS_STACK_OR || prev->cmd == DPS_STACK_NEAR || prev->cmd == DPS_STACK_ANYWORD) {
        return DPS_OK;   /* operator directly after an operator: dropped */
      }
    }

    /* The two cases are mutually exclusive: they test different values of
       state->cmd. */
    glue_or = (state->cmd == DPS_STACK_WORD) &&
              ((prev->cmd == DPS_STACK_WORD) || (prev->cmd == DPS_STACK_RIGHT) || (prev->cmd == DPS_STACK_PHRASE_RIGHT));
    glue_add = (state->cmd == DPS_STACK_LEFT) &&
               ((prev->cmd == DPS_STACK_RIGHT) || (prev->cmd == DPS_STACK_PHRASE_RIGHT));

    if (glue_or || glue_add) {
      item = &Res->items[Res->nitems];
      if (glue_or) {
        item->cmd = DPS_STACK_OR;
      } else {
        item->cmd = state->add_cmd;
      }
      item->order = 0;
      item->origin = 0;
      item->count = 0;
      item->len = 0;
      item->crcword = 0;
      item->word = NULL;
      item->ulen = 0;
      item->uword = NULL;
      item->pbegin = NULL;
      item->order_origin = 0;
      item->secno = state->secno[state->p_secno];
      Res->nitems++;
      Res->ncmds++;
#ifdef DEBUG_BOOL
      DpsLog(query, DPS_LOG_EXTRA, "1[%d].%x %c -- %s", 0, 0, item_type(item->cmd), "<NULL>");
#endif
    }
  }

  /* The requested item itself. */
  item = &Res->items[Res->nitems];
  item->cmd = state->cmd;
  item->order = state->order;
  item->order_inquery = state->order_inquery;
  item->origin = origin;
  item->count = 0;
  item->len = (word == NULL) ? 0 : dps_strlen(word);
  item->crcword = crcword;
  item->word = (word == NULL) ? NULL : DpsStrdup(word);
  item->ulen = wlen;
  item->uword = (uword == NULL) ? NULL : DpsUniDup(uword);
  item->pbegin = NULL;
  item->order_origin = 0;
  item->wordnum = Res->nitems;
  item->secno = state->secno[state->p_secno];
  Res->nitems++;

  if (state->cmd != DPS_STACK_WORD) {
    Res->ncmds++;
  } else {
    Res->items[state->order].order_origin |= origin;
    if (state->order > Res->max_order) Res->max_order = state->order;
    /* Track the maximum of order_inquery itself; state->order was stored
       here before. */
    if (state->order_inquery > Res->max_order_inquery) Res->max_order_inquery = state->order_inquery;
  }

#ifdef DEBUG_BOOL
  DpsLog(query, DPS_LOG_EXTRA, "1[%d,%d].%x %c -- %s", state->order, state->order_inquery, state->origin, item_type(state->cmd), (word == NULL) ? "<NULL>" : word);
#endif
  return DPS_OK;
}
int DpsSEAMake(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_DSTR *excerpt, const char *content_lang, size_t *indexed_size, size_t *indexed_limit, size_t max_word_len, size_t min_word_len, int crossec, int seasec #ifdef HAVE_ASPELL , int have_speller, AspellSpeller *speller #endif ) { DPS_SENTENCELIST List; DPS_MAPSTAT MapStat; DPS_TEXTITEM Item; DPS_VAR *Sec; dpsunicode_t *sentence, *lt, savec; double *links, *lang_cs, w; double delta, pdiv, cur_div; size_t l, sent_len, order; size_t min_len = 10000000, min_pos = 0; int it; register size_t i, j; #ifdef DEBUG char lcstr[4096]; #endif TRACE_IN(Indexer, "DpsSEAMake"); if((Sec = DpsVarListFind(&Doc->Sections, "sea"))) { /* set SEA section to NULL */ DPS_FREE(Sec->val); DPS_FREE(Sec->txt_val); Sec->curlen = 0; } bzero(&List, sizeof(List)); order = 0; sentence = DpsUniStrTok_SEA((dpsunicode_t*)excerpt->data, <); while(sentence) { if (lt != NULL) { savec = *lt; *lt = 0; } #ifdef DEBUG DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)sentence, sizeof(dpsunicode_t) * (DpsUniLen(sentence) + 1)); fprintf(stderr, "Sentence.%d: %s\n", List.nitems, lcstr); #endif if ((sent_len = DpsUniLen(sentence)) >= Indexer->Flags.SEASentenceMinLength) { j = 1; for (i = 0; i < List.nitems; i++) { if (DpsUniStrCmp(sentence, List.Sent[i].sentence) == 0) { j = 0; break; } } if (j) { if ( List.nitems < Indexer->Flags.SEASentences ) { if (List.nitems == List.mitems) { List.mitems += 16; List.Sent = (DPS_SENTENCE*)DpsRealloc(List.Sent, List.mitems * sizeof(DPS_SENTENCE)); if (List.Sent == NULL) { TRACE_OUT(Indexer); return DPS_ERROR;} } List.Sent[List.nitems].sentence = DpsUniDup(sentence); List.Sent[List.nitems].len = sent_len; List.Sent[List.nitems].order = order++; sentence = DpsUniDup(sentence); DpsUniStrToLower(sentence); bzero(&List.Sent[List.nitems].LangMap, sizeof(DPS_LANGMAP)); DpsBuildLangMap(&List.Sent[List.nitems].LangMap, (char*)sentence, sent_len * sizeof(dpsunicode_t), 0, 0); if (sent_len < min_len) { min_len = sent_len; min_pos 
= List.nitems; } List.nitems++; DPS_FREE(sentence); } else if (sent_len > min_len) { DPS_FREE(List.Sent[min_pos].sentence); List.Sent[min_pos].sentence = DpsUniDup(sentence); List.Sent[min_pos].len = sent_len; List.Sent[min_pos].order = order++; sentence = DpsUniDup(sentence); DpsUniStrToLower(sentence); bzero(&List.Sent[min_pos].LangMap, sizeof(DPS_LANGMAP)); DpsBuildLangMap(&List.Sent[min_pos].LangMap, (char*)sentence, sent_len * sizeof(dpsunicode_t), 0, 0); DPS_FREE(sentence); min_len = List.Sent[0].len; min_pos = 0; for(i = 1; i < List.nitems; i++) if (List.Sent[i].len < min_len) { min_len = List.Sent[i].len; min_pos = i; } } } } #ifdef DEBUG fprintf(stderr, "Sent. len.:%d, Min.allowed: %d\n", sent_len, Indexer->Flags.SEASentenceMinLength); #endif if (lt != NULL) *lt = savec; sentence = DpsUniStrTok_SEA(NULL, <); } DpsLog(Indexer, DPS_LOG_DEBUG, "SEA sentences: %d", List.nitems); if (List.nitems < 4) { for (i = 0; i < List.nitems; i++) DPS_FREE(List.Sent[i].sentence); DPS_FREE(List.Sent); TRACE_OUT(Indexer); return DPS_OK; } links = (double*)DpsMalloc(sizeof(double) * List.nitems * List.nitems); lang_cs = (double*)DpsMalloc(sizeof(double) * List.nitems); /* k ot links[i * List.nitems + j] */ if (links != NULL && lang_cs != NULL) { for (i = 0; i < List.nitems; i++) { DpsPrepareLangMap(&List.Sent[i].LangMap); } for (i = 0; i < List.nitems; i++) { List.Sent[i].Oi = List.Sent[i].di = 0.5; if (Doc->lang_cs_map == NULL) { links[i * List.nitems + i] = 0.0; } else { MapStat.map = &List.Sent[i].LangMap; DpsCheckLangMap6(Doc->lang_cs_map, &List.Sent[i].LangMap, &MapStat, DPS_LM_TOPCNT * DPS_LM_TOPCNT, 2 * DPS_LM_TOPCNT); links[i * List.nitems + i] = (double)MapStat.hits / (2.0 * DPS_LM_TOPCNT) / (List.nitems + 1); } #ifdef DEBUG DpsLog(Indexer, DPS_LOG_INFO, "Link %u->%u: %f [hits:%d miss:%d]", i, i, links[i * List.nitems + i], MapStat.hits, MapStat.miss); #endif for (j = 0; j < List.nitems; j++) { if (j == i) continue; MapStat.map = &List.Sent[j].LangMap; 
DpsCheckLangMap6(&List.Sent[j].LangMap, &List.Sent[i].LangMap, &MapStat, DPS_LM_TOPCNT * DPS_LM_TOPCNT, 2 * DPS_LM_TOPCNT); links[i * List.nitems + j] = (double)MapStat.hits / (2.0 * DPS_LM_TOPCNT) / (List.nitems + 1); #ifdef DEBUG DpsLog(Indexer, DPS_LOG_INFO, "Link %u->%u: %f [hits:%d miss:%d]", i, j, links[i * List.nitems + j], MapStat.hits, MapStat.miss); #endif } } for (l = 0; l < List.nitems; l++) { w = 0.0; for (i = 0; i < List.nitems; i++) { w += links[l * List.nitems + i] * List.Sent[i].Oi; } w = f(w); if (w < LOW_BORDER_EPS2) w = LOW_BORDER_EPS2; else if (w > HI_BORDER_EPS2) w = HI_BORDER_EPS2; List.Sent[l].di = w; } DpsSort(List.Sent, List.nitems, sizeof(DPS_SENTENCE), (qsort_cmp)SentCmp); #ifdef DEBUG DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[0].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[0].sentence) + 1)); fprintf(stderr, "Sent.0: %f %f -- %s\n", List.Sent[0].di, List.Sent[0].Oi, lcstr); DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[1].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[1].sentence) + 1)); fprintf(stderr, "Sent.1: %f %f -- %s\n", List.Sent[1].di, List.Sent[1].Oi, lcstr); DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[2].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[2].sentence) + 1)); fprintf(stderr, "Sent.2: %f %f -- %s\n", List.Sent[2].di, List.Sent[2].Oi, lcstr); DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[3].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[3].sentence) + 1)); fprintf(stderr, "Sent.3: %f %f -- %s\n", List.Sent[3].di, List.Sent[3].Oi, lcstr); DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[4].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[4].sentence) + 1)); fprintf(stderr, "Sent.4: %f %f -- %s\n", List.Sent[4].di, List.Sent[4].Oi, lcstr); #endif DpsSort(List.Sent, TOP_SENTENCES, sizeof(DPS_SENTENCE), (qsort_cmp)SentOrderCmp); bzero(&Item, sizeof(Item)); Item.section = seasec; Item.href = 
NULL; Item.section_name = "sea"; for (i = 0; i < TOP_SENTENCES; i++) { dpsunicode_t *UStr = DpsUniDup(List.Sent[i].sentence); DpsPrepareItem(Indexer, Doc, &Item, List.Sent[i].sentence, UStr, content_lang, indexed_size, indexed_limit, max_word_len, min_word_len, crossec #ifdef HAVE_ASPELL , have_speller, speller, NULL #endif ); DPS_FREE(UStr); } } DPS_FREE(lang_cs); DPS_FREE(links); for (i = 0; i < List.nitems; i++) DPS_FREE(List.Sent[i].sentence); DPS_FREE(List.Sent); TRACE_OUT(Indexer); return DPS_OK; }