예제 #1
0
int DpsChineseListLoad(DPS_AGENT *Agent, DPS_CHINALIST *List, const char *charset, const char *fname) {
     struct stat     sb;
     char *str, *data = NULL, *cur_n = NULL;
     DPS_CHINAWORD chinaword;
     char word[PATH_MAX];
     dpsunicode_t uword[256];
     DPS_CHARSET *sys_int, *fcs;
     DPS_CONV to_uni;
     int             fd;
     char            savebyte;

     sys_int = DpsGetCharSet("sys-int");
     if (!(fcs = DpsGetCharSet(charset))) {
       if (Agent->Conf->is_log_open) DpsLog(Agent, DPS_LOG_ERROR, "Charset '%s' not found or not supported", charset);
       else fprintf(stderr, "Charset '%s' not found or not supported", charset);
       return DPS_ERROR;
     }
     DpsConvInit(&to_uni, fcs, sys_int, Agent->Conf->CharsToEscape, DPS_RECODE_HTML);

     if (*fname != '/') {
       dps_snprintf(word, sizeof(word), "%s/%s", DpsVarListFindStr(&Agent->Conf->Vars, "EtcDir", DPS_CONF_DIR), fname);
       fname = word;
     }

     if (stat(fname, &sb)) {
       if (Agent->Conf->is_log_open) 
	    DpsLog(Agent, DPS_LOG_ERROR, "Unable to stat FreqDic file '%s': %s", fname, strerror(errno));
       else fprintf(stderr, "Unable to stat FrecDic file '%s': %s", fname, strerror(errno));
       return DPS_ERROR;
     }
     if ((fd = open(fname, O_RDONLY)) <= 0) {
       if (Agent->Conf->is_log_open) 
	    DpsLog(Agent, DPS_LOG_ERROR, "Unable to open FreqDic file '%s': %s", fname, strerror(errno));
       else fprintf(stderr, "Unable to open FreqDic file '%s': %s", fname, strerror(errno));
       return DPS_ERROR;
     }
     if ((data = (char*)DpsMalloc(sb.st_size + 1)) == NULL) {
       if (Agent->Conf->is_log_open) 
	 DpsLog(Agent, DPS_LOG_ERROR, "Unable to alloc %d bytes", sb.st_size);
       else fprintf(stderr, "Unable to alloc %ld bytes", (long)sb.st_size);
       close(fd);
       return DPS_ERROR;
     }
     if (read(fd, data, sb.st_size) != (ssize_t)sb.st_size) {
       if (Agent->Conf->is_log_open) 
	 DpsLog(Agent, DPS_LOG_ERROR, "Unable to read FreqDic file '%s': %s", fname, strerror(errno));
       else fprintf(stderr, "Unable to read FreqDic file '%s': %s", fname, strerror(errno));
       DPS_FREE(data);
       close(fd);
       return DPS_ERROR;
     }
     data[sb.st_size] = '\0';
     str = data;
     cur_n = strchr(str, NL_INT);
     if (cur_n != NULL) {
       cur_n++;
       savebyte = *cur_n;
       *cur_n = '\0';
     }
     close(fd);

     bzero((void*)&chinaword, sizeof(chinaword));
     chinaword.word = uword;
     while(str != NULL) {
          if(!str[0]) goto loop_continue;
          if(str[0]=='#') goto loop_continue;
          sscanf(str, "%d %63s ", &chinaword.freq, word );
	  DpsConv(&to_uni, (char*)uword, sizeof(uword), word, sizeof(word));
          DpsChineseListAdd(List, &chinaword);
     loop_continue:
	  str = cur_n;
	  if (str != NULL) {
	    *str = savebyte;
	    cur_n = strchr(str, NL_INT);
	    if (cur_n != NULL) {
	      cur_n++;
	      savebyte = *cur_n;
	      *cur_n = '\0';
	    }
	  }
     }
     DPS_FREE(data);
     DpsChineseListSort(List);
     { register size_t i, j = 0;
       for (i = 1; i < List->nwords; i++) {
	 if (cmpchinese(&List->ChiWord[j], &List->ChiWord[i]) == 0) {
	   List->ChiWord[j].freq += List->ChiWord[i].freq;
	 } else { j++;
	 }
       }
       for (i = j + 1; i < List->nwords; i++) {
	 DPS_FREE(List->ChiWord[i].word);
       }
       List->nwords = j + 1;
     }
     return DPS_OK;
}
예제 #2
0
static int DpsUniRegExec(const DPS_UNIREG_EXP *reg, const dpsunicode_t *string) {
	const dpsunicode_t *start = string;
	int match=0;
#ifdef DEBUG_UNIREG
	DPS_CHARSET *k = DpsGetCharSet("koi8-r");
	DPS_CHARSET *sy = DpsGetCharSet("sys-int");
	DPS_CONV fromuni;
	char sstr[1024];
	char rstr[1024];
	
	DpsConvInit(&fromuni, sy, k, NULL, 0);
#endif
	
	for(start=string;*start;start++){
		const dpsunicode_t *tstart=start;
		size_t i;
		
		for(i=0;i<reg->ntokens;i++){
			const dpsunicode_t *s;
			int inc=DPS_UNIREG_INC;
#ifdef DEBUG_UNIREG

			DpsConv(&fromuni, sstr, 1024, (char*)tstart, 1024);
			DpsConv(&fromuni, rstr, 1024, (char*)reg->Token[i].str, 1024);
			printf("t:%d tstart='%s'\ttok='%s'\t", i, sstr, rstr);
#endif
			switch(reg->Token[i].str[0]){
				case '^':
					if(string!=tstart){
						match=0;
					}else{	
						match=1;
					}
					break;
				case '[':
					match=0;
					for(s=reg->Token[i].str+1;*s;s++){
						if(*s==']'){
						}else
						if(*s=='^'){
							inc=DPS_UNIREG_EXC;
							match=1;
						}else{
							if((*tstart==*s)&&(inc==DPS_UNIREG_EXC)){
								match=0;
								break;
							}
							if((*tstart==*s)&&(inc==DPS_UNIREG_INC)){
								match=1;
								break;
							}
						}
					}
					tstart++;
					break;
				case '$':
					if(*tstart!=0){
						match=0;
					}else{
						match=1;
					}
					break;
				default:
					match=1;
					for(s=reg->Token[i].str;(*s)&&(*tstart);s++,tstart++){
						if(*s=='.'){
							/* Any char */
						}else
						if((*s)!=(*tstart)){
							match=0;
							break;
						}
					}
					if((*s)&&(!*tstart))match=0;
					break;
			}
#ifdef DEBUG_UNIREG
			printf("match=%d\n",match);
#endif
			if(!match)break;
		}
		if(match)break;
	}

#ifdef DEBUG_UNIREG
	printf("return match=%d\n",match);
#endif
	return match;

}
예제 #3
0
__C_LINK int __DPSCALL DpsImportDictionary(DPS_ENV * Conf, const char *lang, const char *charset,
				   const char *filename, int skip_noflag, const char *first_letters){
        struct stat     sb;
	char *str, *data = NULL, *cur_n = NULL;	
	char *lstr;	
	dpsunicode_t *ustr;	
	DPS_CHARSET *sys_int;
	DPS_CHARSET *dict_charset;
	DPS_CONV touni;
	DPS_CONV fromuni;
	int             fd;
	char            savebyte;

	if ((lstr = (char*) DpsMalloc(2048)) == NULL) {
	  DPS_FREE(str);
	  return DPS_ERROR; 
	}
	if ((ustr = (dpsunicode_t*) DpsMalloc(8192)) == NULL) {
	  DPS_FREE(lstr);
	  return DPS_ERROR; 
	}

	dict_charset = DpsGetCharSet(charset);
	sys_int = DpsGetCharSet("sys-int");
	if ((dict_charset == NULL) || (sys_int == NULL)) {
	  DPS_FREE(lstr);
	  DPS_FREE(ustr);
	  return DPS_ERROR;
	}
	
	DpsConvInit(&touni, dict_charset, sys_int, Conf->CharsToEscape, 0);
	DpsConvInit(&fromuni, sys_int, dict_charset, Conf->CharsToEscape, 0);
	
	if (stat(filename, &sb)) {
	  fprintf(stderr, "Unable to stat synonyms file '%s': %s", filename, strerror(errno));
	  DPS_FREE(lstr);
	  DPS_FREE(ustr);
	  return DPS_ERROR;
	}
	if ((fd = DpsOpen2(filename, O_RDONLY)) <= 0) {
	  fprintf(stderr, "Unable to open synonyms file '%s': %s", filename, strerror(errno));
	  return DPS_ERROR;
	}
	if ((data = (char*)DpsMalloc(sb.st_size + 1)) == NULL) {
	  fprintf(stderr, "Unable to alloc %ld bytes", (long)sb.st_size);
	  DpsClose(fd);
	  DPS_FREE(lstr);
	  DPS_FREE(ustr);
	  return DPS_ERROR;
	}
	if (read(fd, data, sb.st_size) != (ssize_t)sb.st_size) {
	  fprintf(stderr, "Unable to read synonym file '%s': %s", filename, strerror(errno));
	  DPS_FREE(data);
	  DpsClose(fd);
	  DPS_FREE(lstr);
	  DPS_FREE(ustr);
	  return DPS_ERROR;
	}
	data[sb.st_size] = '\0';
	str = data;
	cur_n = strchr(str, '\n');
	if (cur_n != NULL) {
	  cur_n++;
	  savebyte = *cur_n;
	  *cur_n = '\0';
	}

	DpsClose(fd);
	while(str != NULL) {
		char *s;
		const char *flag;
		int res;
		
	        flag = NULL;
		s = str;
		while(*s){
			if(*s == '\r') *s = '\0';
			if(*s == '\n') *s = '\0';
			s++;
		}
		if((s=strchr(str,'/'))){
			*s=0;
			s++;flag=s;
			while(*s){
				if(((*s>='A')&&(*s<='Z'))||((*s>='a')&&(*s<='z')))s++;
				else{
					*s=0;
					break;
				}
			}
		}else{
			if(skip_noflag)	goto loop_continue;
			flag="";
		}

		res = DpsConv(&touni, (char*)ustr, 8192, str, 1024);
		DpsUniStrToLower(ustr);

		/* Dont load words if first letter is not required */
		/* It allows to optimize loading at  search time   */
		if(*first_letters) {
			DpsConv(&fromuni, lstr, 2048, ((const char*)ustr),(size_t)res);
			if(!strchr(first_letters,lstr[0]))
				goto loop_continue;
		}
		res = DpsSpellAdd(&Conf->Spells,ustr,flag,lang);
		if (res != DPS_OK) {
		  DPS_FREE(lstr);
		  DPS_FREE(ustr); DPS_FREE(data);
		  return res;
		}
		if (Conf->Flags.use_accentext) {
		  dpsunicode_t *af_uwrd = DpsUniAccentStrip(ustr);
		  if (DpsUniStrCmp(af_uwrd, ustr) != 0) {
		    res = DpsSpellAdd(&Conf->Spells, af_uwrd, flag, lang);
		    if (res != DPS_OK) {
		      DPS_FREE(lstr);
		      DPS_FREE(ustr); DPS_FREE(data); DPS_FREE(af_uwrd);
		      return res;
		    }
		  }
		  DPS_FREE(af_uwrd);
		  if (strncasecmp(lang, "de", 2) == 0) {
		    dpsunicode_t *de_uwrd = DpsUniGermanReplace(ustr);
		    if (DpsUniStrCmp(de_uwrd, ustr) != 0) {
		      res = DpsSpellAdd(&Conf->Spells, de_uwrd, flag, lang);
		      if (res != DPS_OK) {
			DPS_FREE(lstr);
			DPS_FREE(ustr); DPS_FREE(data); DPS_FREE(de_uwrd);
			return res;
		      }
		    }
		    DPS_FREE(de_uwrd);
		  }
		}
	loop_continue:
		str = cur_n;
		if (str != NULL) {
		  *str = savebyte;
		  cur_n = strchr(str, '\n');
		  if (cur_n != NULL) {
		    cur_n++;
		    savebyte = *cur_n;
		    *cur_n = '\0';
		  }
		}
	}
	DPS_FREE(data);
	DPS_FREE(lstr);
	DPS_FREE(ustr);
	return DPS_OK;
}
예제 #4
0
__C_LINK int __DPSCALL DpsSynonymListLoad(DPS_ENV * Env,const char * filename){
     struct stat     sb;
     char      *str, *data = NULL, *cur_n = NULL;
     char      lang[64]="";
     DPS_CHARSET    *cs=NULL;
     DPS_CHARSET    *sys_int=DpsGetCharSet("sys-int");
     DPS_CONV  file_uni;
     DPS_WIDEWORD    *ww = NULL;
     size_t key = 1;
     int flag_th = 0;
     int             fd;
     char            savebyte;
     
     if (stat(filename, &sb)) {
       fprintf(stderr, "Unable to stat synonyms file '%s': %s", filename, strerror(errno));
       return DPS_ERROR;
     }
     if ((fd = DpsOpen2(filename, O_RDONLY)) <= 0) {
       dps_snprintf(Env->errstr,sizeof(Env->errstr)-1, "Unable to open synonyms file '%s': %s", filename, strerror(errno));
       return DPS_ERROR;
     }
     if ((data = (char*)DpsMalloc(sb.st_size + 1)) == NULL) {
       dps_snprintf(Env->errstr,sizeof(Env->errstr)-1, "Unable to alloc %d bytes", sb.st_size);
       DpsClose(fd);
       return DPS_ERROR;
     }
     if (read(fd, data, sb.st_size) != (ssize_t)sb.st_size) {
       dps_snprintf(Env->errstr,sizeof(Env->errstr)-1, "Unable to read synonym file '%s': %s", filename, strerror(errno));
       DPS_FREE(data);
       DpsClose(fd);
       return DPS_ERROR;
     }
     data[sb.st_size] = '\0';
     str = data;
     cur_n = strchr(str, '\n');
     if (cur_n != NULL) {
       cur_n++;
       savebyte = *cur_n;
       *cur_n = '\0';
     }

     while(str != NULL) {
          if(str[0]=='#'||str[0]==' '||str[0]=='\t'||str[0]=='\r'||str[0]=='\n') goto loop_continue;
          
          if(!strncasecmp(str,"Charset:",8)){
               char * lasttok;
               char * charset;
               if((charset = dps_strtok_r(str + 8, " \t\n\r", &lasttok))) {
                    cs=DpsGetCharSet(charset);
                    if(!cs){
                         dps_snprintf(Env->errstr, sizeof(Env->errstr), "Unknown charset '%s' in synonyms file '%s'",
                                   charset, filename);
                         DPS_FREE(data);
			 DpsClose(fd);
                         return DPS_ERROR;
                    }
                    DpsConvInit(&file_uni, cs, sys_int, Env->CharsToEscape, 0);
               }
          }else
          if(!strncasecmp(str,"Language:",9)){
               char * lasttok;
               char * l;
               if((l = dps_strtok_r(str + 9, " \t\n\r", &lasttok))) {
                    dps_strncpy(lang, l, sizeof(lang)-1);
               }
          }else
          if(!strncasecmp(str, "Thesaurus:", 10)) {
               char * lasttok;
	       char *tok = dps_strtok_r(str + 10, " \t\n\r", &lasttok);
	       flag_th = (strncasecmp(tok, "yes", 3) == 0) ? 1 : 0;
          }else{
               char      *av[255];
               size_t         ac, i, j;
	       dpsunicode_t *t;

               if(!cs){
                    dps_snprintf(Env->errstr,sizeof(Env->errstr)-1,"No Charset command in synonyms file '%s'",filename);
                    DpsClose(fd); DPS_FREE(data);
                    return DPS_ERROR;
               }
               if(!lang[0]){
                    dps_snprintf(Env->errstr,sizeof(Env->errstr)-1,"No Language command in synonyms file '%s'",filename);
                    DpsClose(fd); DPS_FREE(data);
                    return DPS_ERROR;
               }

               ac = DpsGetArgs(str, av, 255);
               if (ac < 2) goto loop_continue;

               if ((ww = (DPS_WIDEWORD*)DpsRealloc(ww, ac * sizeof(DPS_WIDEWORD))) == NULL) return DPS_ERROR;

               for (i = 0; i < ac; i++) {
                 ww[i].word = av[i];
                 ww[i].len = dps_strlen(av[i]);
                 ww[i].uword = t = (dpsunicode_t*)DpsMalloc((3 * ww[i].len + 1) * sizeof(dpsunicode_t));
		 if (ww[i].uword == NULL) return DPS_ERROR;
                 DpsConv(&file_uni, (char*)ww[i].uword, sizeof(dpsunicode_t) * (3 * ww[i].len + 1), av[i], ww[i].len + 1);
                 DpsUniStrToLower(ww[i].uword);
		 ww[i].uword = DpsUniNormalizeNFC(NULL, ww[i].uword);
		 DPS_FREE(t);
               }

               for (i = 0; i < ac - 1; i++) {
                 for (j = i + 1; j < ac; j++) {

                   if((Env->Synonyms.nsynonyms + 1) >= Env->Synonyms.msynonyms){
                    Env->Synonyms.msynonyms += 64;
                    Env->Synonyms.Synonym = (DPS_SYNONYM*)DpsRealloc(Env->Synonyms.Synonym, 
                                                   sizeof(DPS_SYNONYM)*Env->Synonyms.msynonyms);
		    if (Env->Synonyms.Synonym == NULL) {
		      Env->Synonyms.msynonyms = Env->Synonyms.nsynonyms = 0;
		      return DPS_ERROR;
  		    }
                   }
               
                   bzero((void*)&Env->Synonyms.Synonym[Env->Synonyms.nsynonyms], sizeof(DPS_SYNONYM));
               
                   /* Add direct order */
                   Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].p.uword = DpsUniDup(ww[i].uword);
                   Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].s.uword = DpsUniDup(ww[j].uword);
		   Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].p.count = 
		     Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].s.count = (size_t)((flag_th) ? key : 0);
                   Env->Synonyms.nsynonyms++;
               
                   bzero((void*)&Env->Synonyms.Synonym[Env->Synonyms.nsynonyms], sizeof(DPS_SYNONYM));
               
                   /* Add reverse order */
                   Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].p.uword = DpsUniDup(ww[j].uword);
                   Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].s.uword = DpsUniDup(ww[i].uword);
		   Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].p.count = 
		     Env->Synonyms.Synonym[Env->Synonyms.nsynonyms].s.count = (size_t)((flag_th) ? key : 0);
                   Env->Synonyms.nsynonyms++;
                 }
               }

               for (i = 0; i < ac; i++) {
                 DPS_FREE(ww[i].uword);
               }
               do { key++; } while (key == 0);
          }
     loop_continue:
	  str = cur_n;
	  if (str != NULL) {
	    *str = savebyte;
	    cur_n = strchr(str, '\n');
	    if (cur_n != NULL) {
	      cur_n++;
	      savebyte = *cur_n;
	      *cur_n = '\0';
	    }
	  }
     }
     DPS_FREE(data);
     DPS_FREE(ww);
     DpsClose(fd);
     return DPS_OK;
}