/*
 * Load a frequency dictionary file into List.
 *
 * The whole file is read into memory and walked line by line.  Empty lines
 * and lines starting with '#' are ignored; every other line is parsed as
 * "<frequency> <word>".  The word is converted from `charset` to the
 * "sys-int" charset and handed to DpsChineseListAdd().  Once all lines are
 * processed the list is sorted and entries that compare equal under
 * cmpchinese() are merged by summing their frequencies.
 *
 * A file name that does not start with '/' is resolved against the "EtcDir"
 * variable (falling back to DPS_CONF_DIR).
 *
 * Returns DPS_OK on success, DPS_ERROR when the charset is unknown or the
 * file cannot be stat'ed, opened, buffered or read in full.
 */
int DpsChineseListLoad(DPS_AGENT *Agent, DPS_CHINALIST *List, const char *charset, const char *fname) {
  struct stat sb;
  char *str, *data = NULL, *cur_n = NULL;
  DPS_CHINAWORD chinaword;
  char word[PATH_MAX];
  dpsunicode_t uword[256];
  DPS_CHARSET *sys_int, *fcs;
  DPS_CONV to_uni;
  int fd;
  char savebyte = '\0';

  sys_int = DpsGetCharSet("sys-int");
  if (!(fcs = DpsGetCharSet(charset))) {
    if (Agent->Conf->is_log_open) {
      DpsLog(Agent, DPS_LOG_ERROR, "Charset '%s' not found or not supported", charset);
    } else {
      fprintf(stderr, "Charset '%s' not found or not supported", charset);
    }
    return DPS_ERROR;
  }
  DpsConvInit(&to_uni, fcs, sys_int, Agent->Conf->CharsToEscape, DPS_RECODE_HTML);

  /* `word` doubles as the path buffer here; it is reused for parsing only
     after the file has been read and fname is no longer needed. */
  if (*fname != '/') {
    dps_snprintf(word, sizeof(word), "%s/%s", DpsVarListFindStr(&Agent->Conf->Vars, "EtcDir", DPS_CONF_DIR), fname);
    fname = word;
  }

  if (stat(fname, &sb)) {
    if (Agent->Conf->is_log_open) {
      DpsLog(Agent, DPS_LOG_ERROR, "Unable to stat FreqDic file '%s': %s", fname, strerror(errno));
    } else {
      fprintf(stderr, "Unable to stat FreqDic file '%s': %s", fname, strerror(errno));
    }
    return DPS_ERROR;
  }

  /* open() signals failure with -1; descriptor 0 is a valid result. */
  if ((fd = open(fname, O_RDONLY)) < 0) {
    if (Agent->Conf->is_log_open) {
      DpsLog(Agent, DPS_LOG_ERROR, "Unable to open FreqDic file '%s': %s", fname, strerror(errno));
    } else {
      fprintf(stderr, "Unable to open FreqDic file '%s': %s", fname, strerror(errno));
    }
    return DPS_ERROR;
  }

  if ((data = (char*)DpsMalloc(sb.st_size + 1)) == NULL) {
    if (Agent->Conf->is_log_open) {
      DpsLog(Agent, DPS_LOG_ERROR, "Unable to alloc %ld bytes", (long)sb.st_size);
    } else {
      fprintf(stderr, "Unable to alloc %ld bytes", (long)sb.st_size);
    }
    close(fd);
    return DPS_ERROR;
  }

  if (read(fd, data, sb.st_size) != (ssize_t)sb.st_size) {
    if (Agent->Conf->is_log_open) {
      DpsLog(Agent, DPS_LOG_ERROR, "Unable to read FreqDic file '%s': %s", fname, strerror(errno));
    } else {
      fprintf(stderr, "Unable to read FreqDic file '%s': %s", fname, strerror(errno));
    }
    DPS_FREE(data);
    close(fd);
    return DPS_ERROR;
  }
  close(fd);
  data[sb.st_size] = '\0';

  /* Isolate the first line: the byte following the newline is saved and
     temporarily replaced by a terminator; it is restored when the loop
     advances to that line. */
  str = data;
  cur_n = strchr(str, NL_INT);
  if (cur_n != NULL) {
    cur_n++;
    savebyte = *cur_n;
    *cur_n = '\0';
  }

  bzero((void*)&chinaword, sizeof(chinaword));
  chinaword.word = uword;

  while (str != NULL) {
    if (str[0] != '\0' && str[0] != '#') {
      /* Only add the entry when both fields were parsed; otherwise `word`
         and `freq` would still hold data from an earlier line (or the path). */
      if (sscanf(str, "%d %63s ", &chinaword.freq, word) == 2) {
        DpsConv(&to_uni, (char*)uword, sizeof(uword), word, sizeof(word));
        DpsChineseListAdd(List, &chinaword);
      }
    }

    /* Advance to the next line, undoing the temporary terminator. */
    str = cur_n;
    if (str != NULL) {
      *str = savebyte;
      cur_n = strchr(str, NL_INT);
      if (cur_n != NULL) {
        cur_n++;
        savebyte = *cur_n;
        *cur_n = '\0';
      }
    }
  }
  DPS_FREE(data);

  DpsChineseListSort(List);

  /* Merge adjacent equal entries, compacting the survivors to the front.
     Skipped for an empty list so nwords stays 0. */
  if (List->nwords > 0) {
    size_t i, j = 0;
    for (i = 1; i < List->nwords; i++) {
      if (cmpchinese(&List->ChiWord[j], &List->ChiWord[i]) == 0) {
        List->ChiWord[j].freq += List->ChiWord[i].freq;
        DPS_FREE(List->ChiWord[i].word);
      } else {
        j++;
        if (j != i) {
          List->ChiWord[j] = List->ChiWord[i];
          /* the word buffer now belongs to slot j only */
          List->ChiWord[i].word = NULL;
        }
      }
    }
    List->nwords = j + 1;
  }

  return DPS_OK;
}
static int DpsUniRegExec(const DPS_UNIREG_EXP *reg, const dpsunicode_t *string) { const dpsunicode_t *start = string; int match=0; #ifdef DEBUG_UNIREG DPS_CHARSET *k = DpsGetCharSet("koi8-r"); DPS_CHARSET *sy = DpsGetCharSet("sys-int"); DPS_CONV fromuni; char sstr[1024]; char rstr[1024]; DpsConvInit(&fromuni, sy, k, NULL, 0); #endif for(start=string;*start;start++){ const dpsunicode_t *tstart=start; size_t i; for(i=0;i<reg->ntokens;i++){ const dpsunicode_t *s; int inc=DPS_UNIREG_INC; #ifdef DEBUG_UNIREG DpsConv(&fromuni, sstr, 1024, (char*)tstart, 1024); DpsConv(&fromuni, rstr, 1024, (char*)reg->Token[i].str, 1024); printf("t:%d tstart='%s'\ttok='%s'\t", i, sstr, rstr); #endif switch(reg->Token[i].str[0]){ case '^': if(string!=tstart){ match=0; }else{ match=1; } break; case '[': match=0; for(s=reg->Token[i].str+1;*s;s++){ if(*s==']'){ }else if(*s=='^'){ inc=DPS_UNIREG_EXC; match=1; }else{ if((*tstart==*s)&&(inc==DPS_UNIREG_EXC)){ match=0; break; } if((*tstart==*s)&&(inc==DPS_UNIREG_INC)){ match=1; break; } } } tstart++; break; case '$': if(*tstart!=0){ match=0; }else{ match=1; } break; default: match=1; for(s=reg->Token[i].str;(*s)&&(*tstart);s++,tstart++){ if(*s=='.'){ /* Any char */ }else if((*s)!=(*tstart)){ match=0; break; } } if((*s)&&(!*tstart))match=0; break; } #ifdef DEBUG_UNIREG printf("match=%d\n",match); #endif if(!match)break; } if(match)break; } #ifdef DEBUG_UNIREG printf("return match=%d\n",match); #endif return match; }
/*
 * Import a spell dictionary file into Conf->Spells.
 *
 * The file is read whole and processed line by line.  Each line has the form
 * "word" or "word/FLAGS", where FLAGS is the run of ASCII letters following
 * the slash.  The word is converted from `charset` to the "sys-int" charset,
 * lower-cased and added with DpsSpellAdd() under language `lang`.
 *
 *   skip_noflag    non-zero: lines without a "/FLAGS" part are skipped.
 *   first_letters  if non-empty, only words whose first byte (after
 *                  conversion back to `charset`) occurs in this string are
 *                  loaded.
 *
 * When Conf->Flags.use_accentext is set, the accent-stripped form of the
 * word is added as well if it differs, and for languages starting with "de"
 * so is the result of DpsUniGermanReplace().
 *
 * Returns DPS_OK on success, DPS_ERROR on allocation, charset or file
 * errors, or the DpsSpellAdd() result if that call fails.
 */
__C_LINK int __DPSCALL DpsImportDictionary(DPS_ENV * Conf, const char *lang, const char *charset, const char *filename, int skip_noflag, const char *first_letters){
  struct stat sb;
  char *str, *data = NULL, *cur_n = NULL;
  char *lstr = NULL;
  dpsunicode_t *ustr = NULL;
  DPS_CHARSET *sys_int;
  DPS_CHARSET *dict_charset;
  DPS_CONV touni;
  DPS_CONV fromuni;
  int fd;
  int rc = DPS_ERROR;
  char savebyte = '\0';

  /* Nothing has been allocated yet, so a failure here has nothing to free. */
  if ((lstr = (char*) DpsMalloc(2048)) == NULL) {
    return DPS_ERROR;
  }
  if ((ustr = (dpsunicode_t*) DpsMalloc(8192)) == NULL) {
    goto cleanup;
  }

  dict_charset = DpsGetCharSet(charset);
  sys_int = DpsGetCharSet("sys-int");
  if ((dict_charset == NULL) || (sys_int == NULL)) {
    goto cleanup;
  }

  DpsConvInit(&touni, dict_charset, sys_int, Conf->CharsToEscape, 0);
  DpsConvInit(&fromuni, sys_int, dict_charset, Conf->CharsToEscape, 0);

  if (stat(filename, &sb)) {
    fprintf(stderr, "Unable to stat synonyms file '%s': %s", filename, strerror(errno));
    goto cleanup;
  }

  if ((fd = DpsOpen2(filename, O_RDONLY)) <= 0) {
    fprintf(stderr, "Unable to open synonyms file '%s': %s", filename, strerror(errno));
    goto cleanup;
  }

  if ((data = (char*)DpsMalloc(sb.st_size + 1)) == NULL) {
    fprintf(stderr, "Unable to alloc %ld bytes", (long)sb.st_size);
    DpsClose(fd);
    goto cleanup;
  }

  if (read(fd, data, sb.st_size) != (ssize_t)sb.st_size) {
    fprintf(stderr, "Unable to read synonym file '%s': %s", filename, strerror(errno));
    DpsClose(fd);
    goto cleanup;
  }
  DpsClose(fd);
  data[sb.st_size] = '\0';

  /* Isolate the first line: the byte after the newline is saved and
     replaced by a terminator, then restored when the loop reaches it. */
  str = data;
  cur_n = strchr(str, '\n');
  if (cur_n != NULL) {
    cur_n++;
    savebyte = *cur_n;
    *cur_n = '\0';
  }

  while (str != NULL) {
    char *s;
    const char *flag = NULL;
    int res;

    /* Blank out the line ending so the word stops before CR/LF. */
    for (s = str; *s; s++) {
      if (*s == '\r' || *s == '\n') *s = '\0';
    }

    /* Blank lines (and the empty remainder after the final newline) carry
       no word and must not create an empty dictionary entry. */
    if (str[0] == '\0') goto loop_continue;

    if ((s = strchr(str, '/'))) {
      /* Split "word/FLAGS" and cut FLAGS at the first non-letter. */
      *s = 0;
      s++;
      flag = s;
      while (*s) {
        if (((*s >= 'A') && (*s <= 'Z')) || ((*s >= 'a') && (*s <= 'z'))) {
          s++;
        } else {
          *s = 0;
          break;
        }
      }
    } else {
      if (skip_noflag) goto loop_continue;
      flag = "";
    }

    res = DpsConv(&touni, (char*)ustr, 8192, str, 1024);
    DpsUniStrToLower(ustr);

    /* Dont load words if first letter is not required */
    /* It allows to optimize loading at search time    */
    if (*first_letters) {
      DpsConv(&fromuni, lstr, 2048, ((const char*)ustr), (size_t)res);
      if (!strchr(first_letters, lstr[0])) goto loop_continue;
    }

    res = DpsSpellAdd(&Conf->Spells, ustr, flag, lang);
    if (res != DPS_OK) {
      rc = res;
      goto cleanup;
    }

    if (Conf->Flags.use_accentext) {
      dpsunicode_t *af_uwrd = DpsUniAccentStrip(ustr);
      if (DpsUniStrCmp(af_uwrd, ustr) != 0) {
        res = DpsSpellAdd(&Conf->Spells, af_uwrd, flag, lang);
        if (res != DPS_OK) {
          DPS_FREE(af_uwrd);
          rc = res;
          goto cleanup;
        }
      }
      DPS_FREE(af_uwrd);

      if (strncasecmp(lang, "de", 2) == 0) {
        dpsunicode_t *de_uwrd = DpsUniGermanReplace(ustr);
        if (DpsUniStrCmp(de_uwrd, ustr) != 0) {
          res = DpsSpellAdd(&Conf->Spells, de_uwrd, flag, lang);
          if (res != DPS_OK) {
            DPS_FREE(de_uwrd);
            rc = res;
            goto cleanup;
          }
        }
        DPS_FREE(de_uwrd);
      }
    }

  loop_continue:
    /* Advance to the next line, undoing the temporary terminator. */
    str = cur_n;
    if (str != NULL) {
      *str = savebyte;
      cur_n = strchr(str, '\n');
      if (cur_n != NULL) {
        cur_n++;
        savebyte = *cur_n;
        *cur_n = '\0';
      }
    }
  }
  rc = DPS_OK;

cleanup:
  /* Single exit for every path after the first allocation. */
  DPS_FREE(data);
  DPS_FREE(lstr);
  DPS_FREE(ustr);
  return rc;
}
/*
 * Load a synonyms file into Env->Synonyms.
 *
 * The file is read whole and processed line by line:
 *   - lines starting with '#', a blank, a tab, CR or LF, and empty lines,
 *     are skipped;
 *   - "Charset: <name>"   selects the charset of the following words;
 *   - "Language: <name>"  records the language (required before any words);
 *   - "Thesaurus: yes"    turns on per-line group keys, anything else turns
 *                         them off;
 *   - any other line is split with DpsGetArgs() into up to 255 words.  Each
 *     word is converted to the "sys-int" charset, lower-cased and NFC
 *     normalised, and for every pair of words on the line two entries are
 *     appended (direct and reverse order).  With thesaurus mode on, both
 *     entries carry the current line key as their count, otherwise 0.
 *
 * Returns DPS_OK on success and DPS_ERROR on failure.  Apart from the stat()
 * failure, which is reported on stderr, the error text is left in
 * Env->errstr where one is produced.
 */
__C_LINK int __DPSCALL DpsSynonymListLoad(DPS_ENV * Env,const char * filename){
  struct stat sb;
  char *str, *data = NULL, *cur_n = NULL;
  char lang[64]="";
  DPS_CHARSET *cs=NULL;
  DPS_CHARSET *sys_int=DpsGetCharSet("sys-int");
  DPS_CONV file_uni;
  DPS_WIDEWORD *ww = NULL;
  size_t nconv = 0;   /* leading ww[] entries that currently own a uword buffer */
  size_t key = 1;
  int flag_th = 0;
  int fd;
  int rc = DPS_ERROR;
  char savebyte = '\0';

  if (stat(filename, &sb)) {
    fprintf(stderr, "Unable to stat synonyms file '%s': %s", filename, strerror(errno));
    return DPS_ERROR;
  }

  if ((fd = DpsOpen2(filename, O_RDONLY)) <= 0) {
    dps_snprintf(Env->errstr,sizeof(Env->errstr)-1, "Unable to open synonyms file '%s': %s", filename, strerror(errno));
    return DPS_ERROR;
  }

  if ((data = (char*)DpsMalloc(sb.st_size + 1)) == NULL) {
    dps_snprintf(Env->errstr,sizeof(Env->errstr)-1, "Unable to alloc %ld bytes", (long)sb.st_size);
    goto cleanup;
  }

  if (read(fd, data, sb.st_size) != (ssize_t)sb.st_size) {
    dps_snprintf(Env->errstr,sizeof(Env->errstr)-1, "Unable to read synonym file '%s': %s", filename, strerror(errno));
    goto cleanup;
  }
  data[sb.st_size] = '\0';

  /* Isolate the first line: the byte after the newline is saved and
     replaced by a terminator, then restored when the loop reaches it. */
  str = data;
  cur_n = strchr(str, '\n');
  if (cur_n != NULL) {
    cur_n++;
    savebyte = *cur_n;
    *cur_n = '\0';
  }

  while (str != NULL) {
    /* The '\0' case covers an empty file and the empty remainder after the
       final newline, which must not be treated as a synonym line. */
    if (str[0]=='\0'||str[0]=='#'||str[0]==' '||str[0]=='\t'||str[0]=='\r'||str[0]=='\n') goto loop_continue;

    if (!strncasecmp(str,"Charset:",8)) {
      char * lasttok;
      char * charset;
      if ((charset = dps_strtok_r(str + 8, " \t\n\r", &lasttok))) {
        cs = DpsGetCharSet(charset);
        if (!cs) {
          dps_snprintf(Env->errstr, sizeof(Env->errstr), "Unknown charset '%s' in synonyms file '%s'", charset, filename);
          goto cleanup;
        }
        DpsConvInit(&file_uni, cs, sys_int, Env->CharsToEscape, 0);
      }
    } else if (!strncasecmp(str,"Language:",9)) {
      char * lasttok;
      char * l;
      if ((l = dps_strtok_r(str + 9, " \t\n\r", &lasttok))) {
        dps_strncpy(lang, l, sizeof(lang)-1);
      }
    } else if (!strncasecmp(str, "Thesaurus:", 10)) {
      char * lasttok;
      char *tok = dps_strtok_r(str + 10, " \t\n\r", &lasttok);
      /* A bare "Thesaurus:" yields no token; treat it as "not yes". */
      flag_th = (tok != NULL && strncasecmp(tok, "yes", 3) == 0) ? 1 : 0;
    } else {
      char *av[255];
      size_t ac, i, j;
      dpsunicode_t *t;

      if (!cs) {
        dps_snprintf(Env->errstr,sizeof(Env->errstr)-1,"No Charset command in synonyms file '%s'",filename);
        goto cleanup;
      }
      if (!lang[0]) {
        dps_snprintf(Env->errstr,sizeof(Env->errstr)-1,"No Language command in synonyms file '%s'",filename);
        goto cleanup;
      }

      ac = DpsGetArgs(str, av, 255);
      if (ac < 2) goto loop_continue;

      if ((ww = (DPS_WIDEWORD*)DpsRealloc(ww, ac * sizeof(DPS_WIDEWORD))) == NULL) goto cleanup;

      for (i = 0; i < ac; i++) {
        ww[i].word = av[i];
        ww[i].len = dps_strlen(av[i]);
        ww[i].uword = t = (dpsunicode_t*)DpsMalloc((3 * ww[i].len + 1) * sizeof(dpsunicode_t));
        if (ww[i].uword == NULL) goto cleanup;
        DpsConv(&file_uni, (char*)ww[i].uword, sizeof(dpsunicode_t) * (3 * ww[i].len + 1), av[i], ww[i].len + 1);
        DpsUniStrToLower(ww[i].uword);
        /* Keep the normalised copy and drop the conversion buffer. */
        ww[i].uword = DpsUniNormalizeNFC(NULL, ww[i].uword);
        DPS_FREE(t);
        nconv = i + 1;
      }

      for (i = 0; i < ac - 1; i++) {
        for (j = i + 1; j < ac; j++) {
          /* Two slots are filled per pair, so grow while fewer than two are free. */
          if ((Env->Synonyms.nsynonyms + 1) >= Env->Synonyms.msynonyms) {
            Env->Synonyms.msynonyms += 64;
            Env->Synonyms.Synonym = (DPS_SYNONYM*)DpsRealloc(Env->Synonyms.Synonym, sizeof(DPS_SYNONYM)*Env->Synonyms.msynonyms);
            if (Env->Synonyms.Synonym == NULL) {
              Env->Synonyms.msynonyms = Env->Synonyms.nsynonyms = 0;
              goto cleanup;
            }
          }

          bzero((void*)&Env->Synonyms.Synonym[Env->Synonyms.nsynonyms], sizeof(DPS_SYNONYM));
          /* Add direct order */
          Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].p.uword = DpsUniDup(ww[i].uword);
          Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].s.uword = DpsUniDup(ww[j].uword);
          Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].p.count =
            Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].s.count = (size_t)((flag_th) ? key : 0);
          Env->Synonyms.nsynonyms++;

          bzero((void*)&Env->Synonyms.Synonym[Env->Synonyms.nsynonyms], sizeof(DPS_SYNONYM));
          /* Add reverse order */
          Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].p.uword = DpsUniDup(ww[j].uword);
          Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].s.uword = DpsUniDup(ww[i].uword);
          Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].p.count =
            Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].s.count = (size_t)((flag_th) ? key : 0);
          Env->Synonyms.nsynonyms++;
        }
      }

      for (i = 0; i < ac; i++) {
        DPS_FREE(ww[i].uword);
      }
      nconv = 0;

      /* Next line key; 0 is skipped if the counter ever wraps, since 0 is
         the count stored when thesaurus mode is off. */
      do { key++; } while (key == 0);
    }

  loop_continue:
    /* Advance to the next line, undoing the temporary terminator. */
    str = cur_n;
    if (str != NULL) {
      *str = savebyte;
      cur_n = strchr(str, '\n');
      if (cur_n != NULL) {
        cur_n++;
        savebyte = *cur_n;
        *cur_n = '\0';
      }
    }
  }
  rc = DPS_OK;

cleanup:
  /* Single exit once the file is open: release any word buffers still held
     for a partially processed line, then the buffers and the descriptor. */
  if (ww != NULL) {
    size_t k;
    for (k = 0; k < nconv; k++) {
      DPS_FREE(ww[k].uword);
    }
  }
  DPS_FREE(ww);
  DPS_FREE(data);
  DpsClose(fd);
  return rc;
}