Beispiel #1
0
/*
 * Sort a synonym list and rebuild its secondary pointer index.
 *
 * List->Synonym is sorted in place with cmpsyn.  List->Back is then
 * (re)allocated to hold one pointer per synonym (plus one spare slot),
 * filled with the addresses of the sorted entries and sorted with
 * cmpsynback.  Nothing happens when List->Synonym is NULL; when the
 * reallocation fails List->Back is left NULL and the function returns.
 */
__C_LINK void __DPSCALL DpsSynonymListSort(DPS_SYNONYMLIST * List){
  register size_t pos;

  /* A list without a synonym array has nothing to sort or index. */
  if (List->Synonym == NULL) return;

  if (List->nsynonyms > 1) {
    DpsSort(List->Synonym, List->nsynonyms, sizeof(DPS_SYNONYM), &cmpsyn);
  }

  List->Back = (DPS_SYNONYM**)DpsRealloc(List->Back, (List->nsynonyms + 1) * sizeof(DPS_SYNONYM*));
  if (List->Back == NULL) return;

  /* Point every Back slot at the matching (already sorted) synonym. */
  pos = 0;
  while (pos < List->nsynonyms) {
    List->Back[pos] = List->Synonym + pos;
    pos++;
  }

  if (List->nsynonyms > 1) {
    DpsSort(List->Back, List->nsynonyms, sizeof(DPS_SYNONYM*), &cmpsynback);
  }
}
/*
 * Dump the TSCII conversion tables.
 *
 * Builds one entry per byte value (c = byte, u = tab_tscii[byte]), sorts
 * the entries with cmpB and prints every entry whose u field is non-zero
 * together with the matching tab2_tscii and len_tscii values.
 */
int main() {
  B table[256];
  int code = 0;

  while (code < 256) {
    table[code].c = code;
    table[code].u = tab_tscii[code];
    code++;
  }
  DpsSort((void*)table, 256, sizeof(B), cmpB);

  for (code = 0; code < 256; code++) {
    /* Entries whose u field is zero are not printed. */
    if (!table[code].u) continue;
    printf("%02x %04x %04x %01d\n", table[code].c, table[code].u, tab2_tscii[table[code].c], len_tscii[table[code].c]);
  }

}
/*
 * Dump the Gujarati conversion table.
 *
 * Builds one entry per byte value (c = byte, u = tab_gujarati[byte]),
 * sorts the entries with cmpB and prints every entry whose u field is
 * non-zero as "u c(hex) c(decimal)".
 */
int main() {
  B table[256];
  int code = 0;

  while (code < 256) {
    table[code].c = code;
    table[code].u = tab_gujarati[code];
    code++;
  }
  DpsSort((void*)table, 256, sizeof(B), cmpB);

  for (code = 0; code < 256; code++) {
    /* Entries whose u field is zero are not printed. */
    if (!table[code].u) continue;
    printf("%04x %02x %03d\n", table[code].u, table[code].c, table[code].c);
  }

}
Beispiel #4
0
/*
 * Write the two-key ("nested") limit index for lim_name.
 *
 * The items of L are sorted with cmp_ind8, their url_ids are written to
 * "<vardir>/<DPS_TREEDIR>/<lim_name>.dat" and one DPS_UINT8_POS_LEN record
 * per run of equal (hi, lo) keys is written to the matching ".ind" file.
 * Each record stores the byte position and byte length of its run inside
 * the .dat file.
 *
 * Indexer  - agent used for logging and configuration lookup.
 * L        - list to index; it is released with ClearIndex8() before the
 *            files are written (and again on the error path).
 * lim_name - base name of the two output files.
 * db       - supplies vardir; when db->vardir is not set the "VarDir"
 *            configuration variable is used.
 *
 * Returns 0 on success, 1 when L has no items or on any failure.
 *
 * File descriptors start at -1 and are reset to -1 once closed, so the
 * error path closes only descriptors that are really open: a failed open
 * is never passed to DpsClose() and a descriptor is never closed twice.
 */
static int MakeNestedIndex(DPS_AGENT *Indexer, DPS_UINT8URLIDLIST *L, const char *lim_name, DPS_DB *db) {
     DPS_ENV   *Conf = Indexer->Conf;
     size_t    k, prev;
     urlid_t   *data=NULL;
     DPS_UINT8_POS_LEN *ind=NULL;
     size_t    mind=1000, nind=0, ndata;
     char fname[PATH_MAX];
     int  dat_fd=-1, ind_fd=-1;
     const char	*vardir = (db->vardir) ? db->vardir : DpsVarListFindStr(&Conf->Vars, "VarDir", DPS_VAR_DIR);
     
     if(!L->Item)return(1);
     
     if (L->nitems > 1) DpsSort(L->Item, L->nitems, sizeof(DPS_UINT8URLID), (qsort_cmp)cmp_ind8);
     
     data = (urlid_t*)DpsMalloc((L->nitems + 1) * sizeof(urlid_t));
     if(!data){
       /* %d expects an int, so the size_t byte count is converted explicitly */
       DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d bytes [%s:%d]", (int)((L->nitems + 1) * sizeof(urlid_t)), __FILE__, __LINE__);
       goto err1;
     }
     ind=(DPS_UINT8_POS_LEN*)DpsMalloc(mind*sizeof(DPS_UINT8_POS_LEN));
     if(!ind){
       DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d bytes [%s:%d]", (int)(mind * sizeof(DPS_UINT8_POS_LEN)), __FILE__, __LINE__);
       goto err1;
     }
     prev=0;
     for(k=0; k < L->nitems; k++) {
          data[k] = L->Item[k].url_id;
          /* A record is emitted when the key changes or on the very last item.
             NOTE(review): when the last item starts a new key it is counted in
             the previous key's run and gets no record of its own - confirm
             whether readers of the .ind file rely on this. */
          if((k == L->nitems-1) || (L->Item[k].hi != L->Item[prev].hi) || (L->Item[k].lo != L->Item[prev].lo)) {
               if(nind==mind){
                    /* grow the record array in steps of 1000 entries */
                    mind+=1000;
                    ind=(DPS_UINT8_POS_LEN*)DpsRealloc(ind,mind*sizeof(DPS_UINT8_POS_LEN));
                    if(!ind) {
		      DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d bytes [%s:%d]", (int)(mind * sizeof(DPS_UINT8_POS_LEN)), __FILE__, __LINE__);
		      goto err1;
                    }
               }
               /* Fill index */
               ind[nind].hi = L->Item[prev].hi;
               ind[nind].lo = L->Item[prev].lo;
               ind[nind].pos = prev * sizeof(*data);
               if (k == L->nitems - 1) ind[nind].len = (k - prev + 1) * sizeof(*data);
               else ind[nind].len = (k - prev) * sizeof(*data);
               DpsLog(Indexer, DPS_LOG_DEBUG, "%08X%08X - %d %d\n", ind[nind].hi, ind[nind].lo, (int)ind[nind].pos, ind[nind].len);
               nind++;
               
               prev=k;
          }
     }
     /* remember the item count: the list itself is released before the files are written */
     ndata = L->nitems;
     ClearIndex8(L);
     
     dps_snprintf(fname,sizeof(fname)-1,"%s%c%s%c%s.dat", vardir,DPSSLASH, DPS_TREEDIR,DPSSLASH, lim_name);
     if((dat_fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) {
       DpsLog(Indexer, DPS_LOG_ERROR, "Can't open '%s': %s [%s:%d]", fname, strerror(errno), __FILE__, __LINE__);
       goto err1;
     }
     DpsWriteLock(dat_fd);
     if((ndata * sizeof(*data)) != (size_t)write(dat_fd, data, ndata * sizeof(*data))) {
       DpsLog(Indexer, DPS_LOG_ERROR, "Can't write '%s': %s [%s:%d]", fname, strerror(errno), __FILE__, __LINE__);
       goto err1;
     }
     DpsUnLock(dat_fd);
     DpsClose(dat_fd);
     dat_fd = -1;   /* closed: the error path must not close it again */
     DPS_FREE(data);

     dps_snprintf(fname,sizeof(fname)-1,"%s%c%s%c%s.ind", vardir, DPSSLASH,DPS_TREEDIR, DPSSLASH, lim_name);
     if((ind_fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) {
       DpsLog(Indexer, DPS_LOG_ERROR, "Can't open '%s': %s [%s:%d]", fname, strerror(errno), __FILE__, __LINE__);
       goto err1;
     }
     DpsWriteLock(ind_fd);
     if((nind*sizeof(DPS_UINT8_POS_LEN)) != (size_t)write(ind_fd,ind,nind*sizeof(DPS_UINT8_POS_LEN))){
       DpsLog(Indexer, DPS_LOG_ERROR, "Can't write '%s': %s [%s:%d]", fname, strerror(errno), __FILE__, __LINE__);
          goto err1;
     }
     DpsUnLock(ind_fd);
     DpsClose(ind_fd);
     ind_fd = -1;
     DPS_FREE(ind);
     
     return(0);
     
err1:
     /* NOTE(review): on failures after the files are opened the list has
        already been cleared once above; this assumes ClearIndex8() may be
        called repeatedly - confirm. */
     ClearIndex8(L);
     DPS_FREE(data);
     DPS_FREE(ind);
     if(dat_fd >= 0) DpsClose(dat_fd);
     if(ind_fd >= 0) DpsClose(ind_fd);
     return(1);
}
Beispiel #5
0
/*
 * Write the single-key ("linear") limit index for lim_name.
 *
 * DpsLimit4() fills a DPS_UINT4URLIDLIST for the given field/type.  The
 * items are sorted with cmp_ind4, their url_ids are written to
 * "<vardir>/<DPS_TREEDIR>/<lim_name>.dat" and one DPS_UINT4_POS_LEN record
 * per run of equal val keys is written to the matching ".ind" file.  Each
 * record stores the byte position and byte length of its run inside the
 * .dat file.
 *
 * Indexer  - agent used for logging and configuration lookup.
 * field    - passed through to DpsLimit4().
 * lim_name - base name of the two output files.
 * type     - passed through to DpsLimit4().
 * db       - supplies vardir; when db->vardir is not set the "VarDir"
 *            configuration variable is used.
 *
 * Returns 0 on success, 1 when the list is empty or on any failure.
 *
 * Resource handling: the item list is released as soon as its contents
 * have been copied, and is then marked as released so that the shared
 * error path neither unmaps/detaches it nor unlinks L.shm_name a second
 * time.  File descriptors start at -1 and are reset to -1 once closed, so
 * the error path closes only descriptors that are really open.
 */
static int MakeLinearIndex(DPS_AGENT *Indexer, const char *field, const char *lim_name, int type, DPS_DB *db) {
    DPS_ENV *Conf = Indexer->Conf;
    DPS_UINT4URLIDLIST  L;
    size_t    k,prev;
    urlid_t   *data = NULL;
    DPS_UINT4_POS_LEN *ind=NULL;
    size_t    mind=1000,nind=0;
    char fname[PATH_MAX];
    int  dat_fd=-1, ind_fd=-1, rc;
    const char	*vardir = (db->vardir) ? db->vardir : DpsVarListFindStr(&Conf->Vars, "VarDir", DPS_VAR_DIR);

    bzero(&L, sizeof(DPS_UINT4URLIDLIST));

    rc = DpsLimit4(Indexer, &L, field, type, db);

    if(rc != DPS_OK) {
        DpsLog(Indexer, DPS_LOG_ERROR, "Error: %s [%s:%d]", DpsEnvErrMsg(Conf), __FILE__, __LINE__);
        goto err1;
    }

    if(!L.Item)return(1);

    if (L.nitems > 1) DpsSort(L.Item, L.nitems, sizeof(DPS_UINT4URLID), (qsort_cmp)cmp_ind4);

    data = (urlid_t*)DpsMalloc((L.nitems + 1) * sizeof(*data));
    if(!data) {
        fprintf(stderr,"Error1: %s\n",strerror(errno));
        goto err1;
    }
    ind=(DPS_UINT4_POS_LEN*)DpsMalloc(mind*sizeof(DPS_UINT4_POS_LEN));
    if(!ind) {
        fprintf(stderr,"Error2: %s\n",strerror(errno));
        goto err1;
    }
    prev=0;
    for(k=0; k<L.nitems; k++) {
        data[k]=L.Item[k].url_id;
        /* A record is emitted when the key changes or on the very last item.
           NOTE(review): when the last item starts a new key it is counted in
           the previous key's run and gets no record of its own - confirm
           whether readers of the .ind file rely on this. */
        if((k==L.nitems-1) || (L.Item[k].val!=L.Item[prev].val)) {
            if(nind==mind) {
                /* grow the record array in steps of 1000 entries */
                mind+=1000;
                ind=(DPS_UINT4_POS_LEN*)DpsRealloc(ind,mind*sizeof(DPS_UINT4_POS_LEN));
                if(!ind) {
                    fprintf(stderr,"Error3: %s\n",strerror(errno));
                    goto err1;
                }
            }
            /* Fill index */
            ind[nind].val=L.Item[prev].val;
            ind[nind].pos = prev * sizeof(*data);
            if (k == L.nitems - 1) ind[nind].len = (k - prev + 1) * sizeof(*data);
            else ind[nind].len = (k - prev) * sizeof(*data);
            DpsLog(Indexer, DPS_LOG_DEBUG, "%d - pos:%x len:%d\n", ind[nind].val, (int)ind[nind].pos, ind[nind].len);
            nind++;

            prev=k;
        }
    }
    /* The url_ids now live in data[]; release the source list early. */
    if (L.mapped) {
#ifdef HAVE_SYS_MMAN_H
        if (munmap(L.Item, (L.nitems + 1) * sizeof(DPS_UINT4URLID))) {
            fprintf(stderr, "Can't munmap '%s': %s\n", L.shm_name, strerror(errno));
        }
#elif defined(HAVE_SYS_SHM_H)
        if (shmdt(L.Item)) {
            fprintf(stderr, "Can't shmdt '%s': %s\n", L.shm_name, strerror(errno));
        }
#endif
        unlink(L.shm_name);
    } else {
        DPS_FREE(L.Item);
    }
    /* Mark the list as released: a later jump to err1 must not unmap/detach
       the region or unlink the shared-memory name again.  L.nitems is kept
       because it is still needed for the .dat write below. */
    L.Item = NULL;
    L.mapped = 0;

    dps_snprintf(fname,sizeof(fname),"%s%c%s%c%s.dat", vardir,DPSSLASH, DPS_TREEDIR, DPSSLASH, lim_name);
    if((dat_fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) {
        fprintf(stderr,"Can't open '%s': %s\n",fname,strerror(errno));
        goto err1;
    }
    DpsWriteLock(dat_fd);
    if((L.nitems * sizeof(*data)) != (size_t)write(dat_fd, data, L.nitems * sizeof(*data))) {
        fprintf(stderr,"Can't write '%s': %s\n",fname,strerror(errno));
        goto err1;
    }
    DpsUnLock(dat_fd);
    DpsClose(dat_fd);
    dat_fd = -1;   /* closed: the error path must not close it again */
    DPS_FREE(data);

    dps_snprintf(fname,sizeof(fname),"%s%c%s%c%s.ind", vardir,DPSSLASH, DPS_TREEDIR, DPSSLASH, lim_name);
    if((ind_fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) {
        fprintf(stderr,"Can't open '%s': %s\n",fname,strerror(errno));
        goto err1;
    }
    DpsWriteLock(ind_fd);
    if((nind*sizeof(DPS_UINT4_POS_LEN)) != (size_t)write(ind_fd,ind,nind*sizeof(DPS_UINT4_POS_LEN))) {
        fprintf(stderr,"Can't write '%s': %s\n",fname,strerror(errno));
        goto err1;
    }
    DpsUnLock(ind_fd);
    DpsClose(ind_fd);
    ind_fd = -1;
    DPS_FREE(ind);

    return(0);

err1:
    /* Only reached with a live list when the failure happened before the
       early release above; afterwards L.mapped is 0 and L.Item is NULL. */
    if (L.mapped) {
#ifdef HAVE_SYS_MMAN_H
        if (munmap(L.Item, (L.nitems + 1) * sizeof(DPS_UINT4URLID))) {
            fprintf(stderr, "Can't munmap '%s': %s\n", L.shm_name, strerror(errno));
        }
#elif defined(HAVE_SYS_SHM_H)
        if (shmdt(L.Item)) {
            fprintf(stderr, "Can't shmdt '%s': %s\n", L.shm_name, strerror(errno));
        }
#endif
        unlink(L.shm_name);
    } else {
        DPS_FREE(L.Item);
    }
    DPS_FREE(data);
    DPS_FREE(ind);
    if(dat_fd >= 0) DpsClose(dat_fd);
    if(ind_fd >= 0) DpsClose(ind_fd);
    return(1);
}
Beispiel #6
0
/*
 * Sort the words of a Chinese word list in place using cmpchinese.
 * Lists holding at most one word are left untouched.
 */
static void DpsChineseListSort(DPS_CHINALIST *List){
     if (List->nwords <= 1) {
          return;
     }
     DpsSort(List->ChiWord, List->nwords, sizeof(DPS_CHINAWORD), cmpchinese);
}
Beispiel #7
0
/*
 * Entry point of the log splitter.
 *
 * Command line (getopt string "blt:f:op:w:v:h?"):
 *   -f HEX   first log number to process (default 0)
 *   -t HEX   last log number to process (default: WrdFiles - 1)
 *   -w DIR   override the VarDir variable
 *   -v N     log level
 *   -b       run DpsBaseOptimize() before processing each log
 *   -o       run DpsBaseOptimize() after processing each log
 *   -p N     value handed to DPSSLEEP() after each log
 *   -h, -?   print usage and exit with status 1
 * An optional single positional argument names the configuration file
 * (default DPS_CONF_DIR "/cached.conf").
 *
 * The del log "del-split.log" is loaded, sorted and de-duplicated.  Then
 * every "<NNN>-split.log" in the requested range is read, unlinked and
 * truncated, sorted, filtered against the del log and handed to
 * DpsProcessBuf().
 *
 * Returns 0 on success; usage errors return 1, a configuration load
 * failure returns DPS_ERROR and fatal start-up I/O or allocation failures
 * exit with DPS_ERROR.
 *
 * The results of fstat() and read() are checked before they are used as
 * buffer sizes or record counts: a failed read() returns -1, which must
 * not be turned into a huge unsigned item count.
 */
int main(int argc,char **argv, char **envp) {
  int ch, sleeps = 1, optimize = 0, obi = 0;
  unsigned int from = 0, to = 0xFFF, p_to = 0;
	DPS_ENV * Env;
	const char * config_name = DPS_CONF_DIR "/cached.conf";

	DpsInit(argc, argv, envp); /* Initialize library */
	
	DpsInitMutexes();
	Env=DpsEnvInit(NULL);
	if (Env == NULL) exit(1);
	DpsSetLockProc(Env, DpsLockProc);

/*#ifndef HAVE_SETPROCTITLE*/
	ARGV = argv;
	ARGC = argc;
/*#endif*/
	while ((ch = getopt(argc, argv, "blt:f:op:w:v:h?")) != -1){
		switch (ch) {
			case 'f':
				sscanf(optarg, "%x", &from);
				break;	
			case 't': 
				sscanf(optarg, "%x", &p_to);
				break;
			case 'w':
			        DpsVarListReplaceStr(&Env->Vars, "VarDir", optarg);
				break;
                        case 'v': DpsSetLogLevel(NULL, atoi(optarg)); break;
                        case 'b': obi++; break;
                        case 'o': optimize++; break;
                        case 'p': sleeps = atoi(optarg); break;
			case 'h':
			case '?':
			default:
			  usage();
			  DpsEnvFree(Env);
			  DpsDeInit();
			  DpsDestroyMutexes();
				return 1;
				break;
		}
	}
	argc -= optind;
	argv += optind;

	if(argc > 1) {
		usage();
		DpsEnvFree(Env);
		DpsDeInit();
		DpsDestroyMutexes();
		return 1;
	} else if (argc == 1) {
	        config_name = argv[0];
	}
	{
		DPS_LOGDEL *del_buf=NULL;
		size_t del_count = 0, log, bytes, n = 0;
		ssize_t nread;
		int dd, log_fd;
		struct stat sb;
		char dname[PATH_MAX] = "";
		DPS_BASE_PARAM P;
		DPS_LOGWORD *log_buf = NULL;
		DPS_AGENT *Indexer = DpsAgentInit(NULL, Env, 0);

		log2stderr = 1;
		if (Indexer == NULL) {
		  fprintf(stderr, "Can't alloc Agent at %s:%d\n", __FILE__, __LINE__);
		  exit(DPS_ERROR);
		}
		
		if(DPS_OK != DpsEnvLoad(Indexer, config_name, (dps_uint8)0)){
		  fprintf(stderr, "%s\n", DpsEnvErrMsg(Env));
		  DpsEnvFree(Env);
		  DpsDeInit();
		  DpsDestroyMutexes();
		  return DPS_ERROR;
		}
		DpsOpenLog("splitter", Env, log2stderr);
		Indexer->flags = Env->flags = DPS_FLAG_UNOCON;
		DpsVarListAddLst(&Indexer->Vars, &Env->Vars, NULL, "*");

		bzero(&P, sizeof(P));
		P.subdir = DPS_TREEDIR;
		P.basename = "wrd";
		P.indname = "wrd";
		P.mode = DPS_WRITE_LOCK;
		P.NFiles = DpsVarListFindInt(&Indexer->Conf->Vars, "WrdFiles", 0x300);
		P.vardir = DpsStrdup(DpsVarListFindStr(&Indexer->Conf->Vars, "VarDir", DPS_VAR_DIR));
		P.A = Indexer;
		/* -t wins; otherwise process up to the last configured file */
		if (p_to != 0) to = p_to;
		else to = P.NFiles - 1;
#ifdef HAVE_ZLIB
		P.zlib_method = Z_DEFLATED;
		P.zlib_level = 9;
		P.zlib_windowBits = DPS_BASE_WRD_WINDOWBITS;
		P.zlib_memLevel = 9;
		P.zlib_strategy = DPS_BASE_WRD_STRATEGY;
#endif

		/* Open del log file */
		dps_snprintf(dname,sizeof(dname),"%s%c%s%cdel-split.log", P.vardir, DPSSLASH, DPS_SPLDIR, DPSSLASH);
		if((dd = DpsOpen2(dname, O_RDONLY | DPS_BINARY)) < 0) {
		  dps_strerror(NULL, 0, "Can't open del log '%s'", dname);
		  exit(DPS_ERROR);
		}

		DpsLog(Indexer, DPS_LOG_DEBUG, "VarDir: %s, WrdFiles: %d [%x]", P.vardir, P.NFiles, P.NFiles);

		/* Allocate del buffer */
		if (fstat(dd, &sb) != 0) {
		  /* without a valid size the buffer cannot be dimensioned */
		  dps_strerror(NULL, 0, "Can't stat del log '%s'", dname);
		  exit(DPS_ERROR);
		}
		if (sb.st_size != 0) {
		  del_buf=(DPS_LOGDEL*)DpsMalloc((size_t)sb.st_size + 1);
		  if (del_buf == NULL) {
		    fprintf(stderr, "Can't alloc %d bytes at %s:%d\n", (int)sb.st_size, __FILE__, __LINE__);
		    /* an allocation failure is an error, not a successful run */
		    exit(DPS_ERROR);
		  }
		  nread = read(dd, del_buf, (size_t)sb.st_size);
		  if (nread < 0) {
		    /* -1 must never reach the unsigned division below */
		    dps_strerror(NULL, 0, "Can't read del log '%s'", dname);
		    exit(DPS_ERROR);
		  }
		  del_count = (size_t)nread / sizeof(DPS_LOGDEL);
		}
		DpsClose(dd);

		/* Remove duplicates URLs in DEL log     */
		/* Keep only oldest records for each URL */
		if (del_count > 0) {
		  DpsLog(Indexer, DPS_LOG_DEBUG, "Sorting del_buf: %d items", (int)del_count);
		  if (del_count > 1) DpsSort(del_buf, (size_t)del_count, sizeof(DPS_LOGDEL), DpsCmpurldellog);
		  DpsLog(Indexer, DPS_LOG_DEBUG, "Removing DelLogDups");
		  del_count = DpsRemoveDelLogDups(del_buf, del_count);
		}

		DpsLog(Indexer, DPS_LOG_DEBUG, "Processing Bufs from %d [%x] to %d [%x]", from, from, to, to);

		for(log = from; log <= to; log++) {

		  /* Open log file */
		  /* log is a size_t; %03X expects an unsigned int */
		  dps_snprintf(dname, sizeof(dname), "%s%c%s%c%03X-split.log", P.vardir, DPSSLASH, DPS_SPLDIR, DPSSLASH, (unsigned int)log);
		  if((log_fd = DpsOpen2(dname, O_RDWR|DPS_BINARY)) < 0){
		    if (errno == ENOENT) {
		      /* a missing log is not skipped: the buffer is still processed with zero items */
		      dps_strerror(Indexer, DPS_LOG_DEBUG, "Can't open '%s'", dname);
		      n = 0;
/*		      continue;*/
		    } else {
		      dps_strerror(Indexer, DPS_LOG_ERROR, "Can't open '%s'", dname);
		      continue;
		    }
		  } else {
		    DpsWriteLock(log_fd); 
		    DpsLog(Indexer, DPS_LOG_DEBUG, "Processing Log: %x", (unsigned int)log);
		    if (fstat(log_fd, &sb) != 0) {
		      /* unknown size: treat the log as empty for this pass */
		      dps_strerror(Indexer, DPS_LOG_ERROR, "Can't stat '%s'", dname);
		      sb.st_size = 0;
		    }
		    log_buf = (sb.st_size > 0) ? (DPS_LOGWORD*)DpsMalloc((size_t)sb.st_size + 1) : NULL;
		    if (log_buf != NULL) {
		      unlink(dname);
		      nread = read(log_fd, log_buf, (size_t)sb.st_size);
		      if (nread < 0) {
		        /* a failed read yields no usable records */
		        dps_strerror(Indexer, DPS_LOG_ERROR, "Can't read '%s'", dname);
		        nread = 0;
		      }
		      bytes = (size_t)nread;
		      (void)ftruncate(log_fd, (off_t)0);
		      DpsUnLock(log_fd);
		      DpsClose(log_fd);
		      
		      n = bytes / sizeof(DPS_LOGWORD);
		      DpsLog(Indexer, DPS_LOG_DEBUG, "Sorting log_buf: %d items", (int)n);
		      if (n > 1) DpsSort(log_buf, n, sizeof(DPS_LOGWORD), (qsort_cmp)DpsCmplog);
		      DpsLog(Indexer, DPS_LOG_DEBUG, "Removing OldWords");
		      n = DpsRemoveOldWords(log_buf, n, del_buf, del_count);
		      if (n > 1) DpsSort(log_buf, n, sizeof(DPS_LOGWORD), (qsort_cmp)DpsCmplog_wrd);
		      
		    } else {
		      n = 0;
		      DpsUnLock(log_fd);
		      DpsClose(log_fd);
		    }
		  }

		  DpsLog(Indexer, DPS_LOG_DEBUG, "Processing Buf, optimize: %d", optimize);
		  if (obi) DpsBaseOptimize(&P, log);
		  DpsProcessBuf(Indexer, &P, log, log_buf, n, del_buf, del_count);
		  if (optimize) DpsBaseOptimize(&P, log);
		  DpsBaseClose(&P);
		  DPS_FREE(log_buf);

		  DpsLog(Indexer, DPS_LOG_DEBUG, "pas done: %d from %d to %d", (int)log, from, to);
		  DPSSLEEP(sleeps);
		}
		DPS_FREE(del_buf);
		DpsAgentFree(Indexer);
		DPS_FREE(P.vardir);
	}

	fprintf(stderr, "Splitting done.\n");
	
	DpsEnvFree(Env);
	DpsDeInit();
	DpsDestroyMutexes();

#ifdef EFENCE
	fprintf(stderr, "Memory leaks checking\n");
	DpsEfenceCheckLeaks();
#endif
#ifdef FILENCE
	fprintf(stderr, "FD leaks checking\n");
	DpsFilenceCheckLeaks(NULL);
#endif
	return 0;
}
Beispiel #8
0
/*
 * Build the "sea" section of a document from the sentences of an excerpt.
 *
 * Steps performed by the code below:
 *   1. Any existing "sea" section value in Doc->Sections is cleared.
 *   2. excerpt->data is split into sentences with DpsUniStrTok_SEA().
 *      Sentences shorter than Indexer->Flags.SEASentenceMinLength and
 *      exact duplicates are skipped.  At most
 *      Indexer->Flags.SEASentences sentences are kept; once that many
 *      are stored, a longer newcomer replaces the shortest stored one.
 *   3. With fewer than 4 sentences nothing more is done.
 *   4. A nitems x nitems matrix of link weights is filled from
 *      DpsCheckLangMap6() hit counts, each sentence gets a weight di
 *      (f() of the weighted sum, clamped to
 *      [LOW_BORDER_EPS2, HI_BORDER_EPS2]) and the list is sorted with
 *      SentCmp.
 *   5. The first TOP_SENTENCES entries are re-sorted with SentOrderCmp
 *      and passed to DpsPrepareItem() as section "sea".
 *
 * Parameters content_lang, indexed_size, indexed_limit, max_word_len,
 * min_word_len, crossec (and have_speller/speller with HAVE_ASPELL) are
 * only forwarded to DpsPrepareItem(); seasec becomes Item.section.
 *
 * Returns DPS_ERROR when growing the sentence array fails, DPS_OK in all
 * other cases - including when the links/lang_cs allocation fails, in
 * which case no "sea" items are produced.
 */
int DpsSEAMake(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_DSTR *excerpt,  
	       const char *content_lang, size_t *indexed_size, size_t *indexed_limit, 
	       size_t max_word_len, size_t min_word_len, int crossec, int seasec
#ifdef HAVE_ASPELL
	       , int have_speller, AspellSpeller *speller
#endif
	       ) {
  DPS_SENTENCELIST List;
  DPS_MAPSTAT MapStat;
  DPS_TEXTITEM Item;
  DPS_VAR	*Sec;
  dpsunicode_t *sentence, *lt, savec;
  double *links, *lang_cs, w;
  double delta, pdiv, cur_div;
  size_t l, sent_len, order;
  /* length and position of the shortest stored sentence (replacement candidate) */
  size_t min_len = 10000000, min_pos = 0;
  int  it;
  register size_t i, j;
#ifdef DEBUG
  char lcstr[4096];

#endif

  TRACE_IN(Indexer, "DpsSEAMake");

  if((Sec = DpsVarListFind(&Doc->Sections, "sea"))) { /* set SEA section to NULL */
    DPS_FREE(Sec->val);
    DPS_FREE(Sec->txt_val);
    Sec->curlen = 0;
  }
  
  bzero(&List, sizeof(List));
  order = 0;
  sentence = DpsUniStrTok_SEA((dpsunicode_t*)excerpt->data, &lt);
  while(sentence) {
    /* temporarily terminate the sentence at lt; the saved character is restored at the end of the iteration */
    if (lt != NULL) { savec = *lt; *lt = 0; }
#ifdef DEBUG
    DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)sentence, sizeof(dpsunicode_t) * (DpsUniLen(sentence) + 1));
    fprintf(stderr, "Sentence.%d: %s\n", List.nitems, lcstr);
#endif
    if ((sent_len = DpsUniLen(sentence)) >= Indexer->Flags.SEASentenceMinLength) {
      /* j acts as an "is new" flag: cleared when an identical sentence is already stored */
      j = 1;
      for (i = 0; i < List.nitems; i++) {
	if (DpsUniStrCmp(sentence, List.Sent[i].sentence) == 0) {
	  j = 0; break;
	}
      }
      if (j) {
	if ( List.nitems < Indexer->Flags.SEASentences ) {
	  /* still below the configured limit: append, growing the array 16 entries at a time */
	  if (List.nitems == List.mitems) {
	    List.mitems += 16;
	    List.Sent = (DPS_SENTENCE*)DpsRealloc(List.Sent, List.mitems * sizeof(DPS_SENTENCE));
	    /* NOTE(review): this return does not release the sentences stored so far and
	       does not restore *lt - confirm whether that matters to callers */
	    if (List.Sent == NULL) { TRACE_OUT(Indexer); return DPS_ERROR;}
	  }
	  List.Sent[List.nitems].sentence = DpsUniDup(sentence);
	  List.Sent[List.nitems].len = sent_len;
	  List.Sent[List.nitems].order = order++;
	  /* from here on 'sentence' is a private copy, lowercased for the language map and freed below */
	  sentence = DpsUniDup(sentence);
	  DpsUniStrToLower(sentence);
	  bzero(&List.Sent[List.nitems].LangMap, sizeof(DPS_LANGMAP));
	  DpsBuildLangMap(&List.Sent[List.nitems].LangMap, (char*)sentence, sent_len * sizeof(dpsunicode_t), 0, 0);
	  if (sent_len < min_len) { min_len = sent_len; min_pos = List.nitems; }
	  List.nitems++;
	  DPS_FREE(sentence);
	} else if (sent_len > min_len) {
	  /* list is full: overwrite the shortest stored sentence with this longer one */
	  DPS_FREE(List.Sent[min_pos].sentence);
	  List.Sent[min_pos].sentence = DpsUniDup(sentence);
	  List.Sent[min_pos].len = sent_len;
	  List.Sent[min_pos].order = order++;
	  sentence = DpsUniDup(sentence);
	  DpsUniStrToLower(sentence);
	  bzero(&List.Sent[min_pos].LangMap, sizeof(DPS_LANGMAP));
	  DpsBuildLangMap(&List.Sent[min_pos].LangMap, (char*)sentence, sent_len * sizeof(dpsunicode_t), 0, 0);
	  DPS_FREE(sentence);
	  /* rescan for the new shortest sentence */
	  min_len = List.Sent[0].len; min_pos = 0;
	  for(i = 1; i < List.nitems; i++) if (List.Sent[i].len < min_len) { min_len = List.Sent[i].len; min_pos = i; }
	}
      }
    }
#ifdef DEBUG
    fprintf(stderr, "Sent. len.:%d, Min.allowed: %d\n", sent_len, Indexer->Flags.SEASentenceMinLength);
#endif
    if (lt != NULL) *lt = savec;
    sentence = DpsUniStrTok_SEA(NULL, &lt);
  }
  DpsLog(Indexer, DPS_LOG_DEBUG, "SEA sentences: %d", List.nitems);
  /* fewer than 4 sentences: release everything and leave the section empty */
  if (List.nitems < 4) {
    for (i = 0; i < List.nitems; i++) DPS_FREE(List.Sent[i].sentence);
    DPS_FREE(List.Sent); 
    TRACE_OUT(Indexer);
    return DPS_OK; 
  }

  /* links is a flattened nitems x nitems matrix; lang_cs is allocated and freed but not otherwise used below */
  links = (double*)DpsMalloc(sizeof(double) * List.nitems * List.nitems);
  lang_cs = (double*)DpsMalloc(sizeof(double) * List.nitems);
/*
        k                 ot
  links[i * List.nitems + j] 
*/

  if (links != NULL && lang_cs != NULL) {

    for (i = 0; i < List.nitems; i++) {
      DpsPrepareLangMap(&List.Sent[i].LangMap);
    }

    for (i = 0; i < List.nitems; i++) {
      List.Sent[i].Oi =  List.Sent[i].di = 0.5;
      /* diagonal entry: comparison against the document's own language map, 0 when there is none */
      if (Doc->lang_cs_map == NULL) {
	  links[i * List.nitems + i] = 0.0;
      } else {
	MapStat.map = &List.Sent[i].LangMap;
	DpsCheckLangMap6(Doc->lang_cs_map, &List.Sent[i].LangMap, &MapStat, DPS_LM_TOPCNT * DPS_LM_TOPCNT, 2 * DPS_LM_TOPCNT);
	links[i * List.nitems + i] = (double)MapStat.hits / (2.0 * DPS_LM_TOPCNT) / (List.nitems + 1);
      }
#ifdef DEBUG
      DpsLog(Indexer, DPS_LOG_INFO, "Link %u->%u: %f [hits:%d miss:%d]", i, i, links[i * List.nitems + i], MapStat.hits, MapStat.miss);
#endif
      /* off-diagonal entries: comparison of sentence j's map against sentence i's map */
      for (j = 0; j < List.nitems; j++) {
	  if (j == i) continue;
	MapStat.map = &List.Sent[j].LangMap;
	DpsCheckLangMap6(&List.Sent[j].LangMap, &List.Sent[i].LangMap, &MapStat, DPS_LM_TOPCNT * DPS_LM_TOPCNT, 2 * DPS_LM_TOPCNT);

	links[i * List.nitems + j] = (double)MapStat.hits / (2.0 * DPS_LM_TOPCNT) / (List.nitems + 1);
#ifdef DEBUG
	DpsLog(Indexer, DPS_LOG_INFO, "Link %u->%u: %f [hits:%d miss:%d]", i, j, links[i * List.nitems + j], MapStat.hits, MapStat.miss);
#endif
      }
    }

    /* di of sentence l = f(sum over i of links[l][i] * Oi), clamped to the EPS2 borders */
    for (l = 0; l < List.nitems; l++) {
	w = 0.0;
	for (i = 0; i < List.nitems; i++) { 
	    w += links[l * List.nitems + i] * List.Sent[i].Oi;
	}
	w = f(w);
	if (w < LOW_BORDER_EPS2) w = LOW_BORDER_EPS2;
	else if (w > HI_BORDER_EPS2) w = HI_BORDER_EPS2;
	List.Sent[l].di = w;
    }

    DpsSort(List.Sent, List.nitems, sizeof(DPS_SENTENCE), (qsort_cmp)SentCmp);

#ifdef DEBUG
    /* NOTE(review): prints entries 0..4 although only nitems >= 4 is guaranteed at this point - confirm */
    DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[0].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[0].sentence) + 1));
    fprintf(stderr, "Sent.0: %f %f -- %s\n", List.Sent[0].di, List.Sent[0].Oi, lcstr);
    DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[1].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[1].sentence) + 1));
    fprintf(stderr, "Sent.1: %f %f -- %s\n", List.Sent[1].di, List.Sent[1].Oi, lcstr);
    DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[2].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[2].sentence) + 1));
    fprintf(stderr, "Sent.2: %f %f -- %s\n", List.Sent[2].di, List.Sent[2].Oi, lcstr);
    DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[3].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[3].sentence) + 1));
    fprintf(stderr, "Sent.3: %f %f -- %s\n", List.Sent[3].di, List.Sent[3].Oi, lcstr);
    DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[4].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[4].sentence) + 1));
    fprintf(stderr, "Sent.4: %f %f -- %s\n", List.Sent[4].di, List.Sent[4].Oi, lcstr);
#endif
    /* NOTE(review): assumes TOP_SENTENCES <= List.nitems (nitems is at least 4 here) - confirm the macro's value */
    DpsSort(List.Sent, TOP_SENTENCES, sizeof(DPS_SENTENCE), (qsort_cmp)SentOrderCmp);

    bzero(&Item, sizeof(Item));
    Item.section = seasec;
    Item.href = NULL;
    Item.section_name = "sea";
    for (i = 0; i < TOP_SENTENCES; i++) {
      /* DpsPrepareItem receives the stored sentence plus a scratch duplicate that is freed right after */
      dpsunicode_t *UStr = DpsUniDup(List.Sent[i].sentence);
      DpsPrepareItem(Indexer, Doc, &Item, List.Sent[i].sentence, UStr, content_lang, indexed_size, indexed_limit,
		     max_word_len, min_word_len, crossec
#ifdef HAVE_ASPELL
		     , have_speller, speller, NULL
#endif
		     );
      DPS_FREE(UStr);
    }
  }
  DPS_FREE(lang_cs);
  DPS_FREE(links);
  for (i = 0; i < List.nitems; i++) DPS_FREE(List.Sent[i].sentence);
  DPS_FREE(List.Sent);

  TRACE_OUT(Indexer);
  return DPS_OK;
}
Beispiel #9
0
/*
 * Defragment one or all files of a base.
 *
 * P     - base parameters; P->mode is forced to DPS_WRITE_LOCK and P->rec_id,
 *         P->Item and P->CurrentItemPos are overwritten while working.
 * sbase - number of the file to optimize, or -1 for all files
 *         0 .. P->NFiles - 1.
 *
 * For every selected file the index file (P->Ifd) is scanned to count the
 * records and sum the sizes of the live ones.  When the share of the data
 * file (P->Sfd) not covered by live records reaches the "OptimizeRatio"
 * variable (default 15, in percent), or when there is no live data but the
 * data file is not empty, the live records are moved towards the start of
 * the data file, the data file is truncated and the index file is rebuilt
 * from scratch.
 *
 * Returns DPS_OK on success, DPS_ERROR (or the DpsBaseSeek() result) on
 * failure.  A file for which inconsistencies were counted in error_cnt is
 * processed again.
 */
extern __C_LINK int __DPSCALL DpsBaseOptimize(DPS_BASE_PARAM *P, int sbase) {
  struct	stat sb;
  urlid_t base, base_from, base_to;
  long unsigned ActualSize, OriginalSize, i, nitems;
  off_t pos, posold, NewItemPos, SSize;
  dps_uint8 diff, gain;
  double dr = 0.0, cr = 0.0;
  ssize_t nread; size_t rsize;
  ssize_t wr;
  int OptimizeRatio, res, error_cnt;
  char buffer[BUFSIZ];
  DPS_BASEITEM *hTable;
  DPS_SORTBASEITEM *si = NULL;

  OptimizeRatio = DpsVarListFindInt(&P->A->Vars, "OptimizeRatio", 15);

  P->mode = DPS_WRITE_LOCK;
  /* -1 selects every file, otherwise the half-open range [sbase, sbase + 1) */
  if (sbase == -1) {
    base_from = 0; base_to = (urlid_t)P->NFiles;
  } else {
    base_from = sbase; base_to = sbase + 1;
  }

  for (base = base_from; base < base_to; base++) {

    error_cnt = 0;
    gain = (dps_uint8)0;
    /* presumably this rec_id makes DpsBaseOpen() pick file number 'base' - TODO confirm */
    P->rec_id = ((base & DPS_BASE_MASK) << DPS_BASE_BITS);
    if (DpsBaseOpen(P, DPS_WRITE_LOCK) != DPS_OK) {
      DpsLog(P->A, DPS_LOG_ERROR, "Can't open base %s/%s {%s:%d}", P->subdir, P->basename, __FILE__, __LINE__);
      DpsBaseClose(P);
      return DPS_ERROR;
    }
    if (lseek(P->Ifd, (off_t)0, SEEK_SET) == (off_t)-1) {
      DpsLog(P->A, DPS_LOG_ERROR, "Can't seek %s {%s:%d}", P->Ifilename, __FILE__, __LINE__);
      DpsBaseClose(P);
      return DPS_ERROR;
    }

    /* SSize = current size of the data file (fstat, falling back to seeking to its end) */
    if (fstat(P->Sfd, &sb) == 0) {
      SSize = sb.st_size;
    } else {
      if ((SSize = (off_t)lseek(P->Sfd, (off_t)0, SEEK_END)) == (off_t)-1) {
	DpsLog(P->A, DPS_LOG_ERROR, "Can't seek %s {%s:%d}", P->Sfilename, __FILE__, __LINE__);
	DpsBaseClose(P);
	return DPS_ERROR;
      }
    }

    /* Pass 1: count all index records; sum stored and original sizes of the live ones
       (non-zero rec_id, offset inside the data file, positive size) */
    nitems = 0;
    ActualSize = 0;
    OriginalSize = 0;
    while(read(P->Ifd, &P->Item, sizeof(DPS_BASEITEM)) == sizeof(DPS_BASEITEM)) {
      nitems++;
      if ((P->Item.rec_id != 0) && ((dps_uint8)P->Item.offset < (dps_uint8)SSize) && (P->Item.size > 0)) {
	ActualSize += (long unsigned)P->Item.size;
	OriginalSize += (long unsigned)(P->Item.orig_size ? P->Item.orig_size : P->Item.size);
      }
    }
    /* cut the index file to a whole number of records */
    if (ftruncate(P->Ifd, (off_t)(nitems * sizeof(DPS_BASEITEM))) != 0) {
	dps_strerror(P->A, DPS_LOG_EXTRA, "ftruncate error (pos:%ld) [%s:%d]", (off_t)(nitems * sizeof(DPS_BASEITEM)), __FILE__, __LINE__);
    }

    /* dr: percentage of the data file not covered by live records;
       cr: stored size as a percentage of the original size */
    dr = (nitems) ? fabs(100.0 * ((long unsigned)SSize - ActualSize) / ((double)SSize + 1.0)) : 0.0;
    cr = (nitems) ? fabs(100.0 * ActualSize / (OriginalSize + 1)) : 0.0;

    DpsLog(P->A, DPS_LOG_EXTRA, "Optimize: %s/%s base 0x%X, %ld recs defrag: %.2f%% Ratio: %.2f%% Data: %ld File: %ld", 
	   P->subdir, P->basename, P->FileNo, nitems, dr, cr,  ActualSize, (long)SSize);

    if ((dr >= (double)OptimizeRatio) || (ActualSize == 0 && SSize != 0)) {

      si = (DPS_SORTBASEITEM*)DpsMalloc((nitems + 1) * sizeof(DPS_SORTBASEITEM));

      if (si == NULL) {
	DpsLog(P->A, DPS_LOG_ERROR, "Can't alloc si (%d bytes) at {%s:%d}", (nitems + 1) * sizeof(DPS_SORTBASEITEM), __FILE__, __LINE__);
	DpsBaseClose(P);
	return DPS_ERROR;
      }
      if (lseek(P->Ifd, (off_t)0, SEEK_SET) == (off_t)-1) {
	DpsLog(P->A, DPS_LOG_ERROR, "Can't seek %s {%s:%d}", P->Ifilename, __FILE__, __LINE__);
	DpsBaseClose(P);
	DPS_FREE(si);
	return DPS_ERROR;
      }

      /* Pass 2: load the live records into si[].  i advances only for a record that passes
         the checks, so a rejected record is overwritten by the next read. */
      for (i = 0; (i < nitems) && (read(P->Ifd, &si[i].Item, sizeof(DPS_BASEITEM)) == sizeof(DPS_BASEITEM)); ) {
	if(si[i].Item.rec_id != 0 && ((dps_uint8)si[i].Item.offset < (dps_uint8)SSize) && (si[i].Item.size > 0) && (si[i].Item.size < ActualSize) ) {
	  i++;
	}
      }

      /* from here on nitems is the number of live records */
      if (i < nitems) nitems = i;
      /* NOTE(review): the compaction below relies on ascending data-file offsets;
         presumably cmpsi orders by Item.offset - confirm */
      if (nitems > 1) DpsSort((void*)si, (size_t)nitems, sizeof(DPS_SORTBASEITEM), cmpsi);

      gain = (dps_uint8)0;
      pos = (off_t)0;
      posold = (off_t)0;
      if (nitems > 0) {
	if ((long unsigned)si[0].Item.offset < (long unsigned)SSize) {
	  posold = (off_t)si[0].Item.offset;
	} else {
	  si[0].Item.offset = (off_t)0;
	  si[0].Item.size = 0;
	}
      }
      /* a record may not extend past the start of the next one */
      if (nitems > 1) {
	if (si[0].Item.size > (rsize = (size_t)(si[1].Item.offset - si[0].Item.offset))) {
	  DpsLog(P->A, DPS_LOG_ERROR, "si[0] size adjusted by offset: %ld -> %ld", (long)si[0].Item.size, (long)rsize);
	  si[0].Item.size = rsize;
	  error_cnt++;
	}
      }
      /* move the first record to offset 0 when it does not start there already */
      if ((diff = (dps_uint8)posold) > 0) {
	/* chunked copy inside one file: the for-init and for-increment seek to the read
	   position (posold), the body seeks to the write position (pos) before writing;
	   the result of write() is not checked */
	for(
	    lseek(P->Sfd, posold, SEEK_SET), rsize = 0;
	    (rsize < si[0].Item.size) && ((nread = read(P->Sfd, buffer, 
							(rsize + BUFSIZ < si[0].Item.size) ? BUFSIZ : (si[0].Item.size - rsize) )) > 0);
	    lseek(P->Sfd, posold, SEEK_SET)
	    ) {
	  lseek(P->Sfd, pos, SEEK_SET);
	  (void)write(P->Sfd, buffer, (size_t)nread);
	  rsize += (size_t)nread;
	  posold += (off_t)nread;
	  pos += (off_t)nread;
	}
	si[0].Item.offset = 0;
	/* a short read shrinks the record to what was actually copied */
	if (rsize != si[0].Item.size) {
	  DpsLog(P->A, DPS_LOG_ERROR, "si[0] size adjusted by size: %ld -> %ld", (long)si[0].Item.size, (long)rsize);
	  si[0].Item.size = rsize;
	  error_cnt++;
	}
	gain += diff;
      }
      
      /* place every following record directly behind its predecessor */
      if (nitems > 0)
      for (i = 0; i < nitems - 1; i++) {
	if ((long unsigned)si[i + 1].Item.offset > (long unsigned)SSize) {
	  /* record i + 1 lies beyond the data file: drop its data by giving it size 0 */
	  DpsLog(P->A, DPS_LOG_ERROR, "si[%ld] too long offset: %ld > %ld, removing", i , (long)si[i + 1].Item.offset, (long)SSize);
	  si[i + 1].Item.size = 0;
	  si[i + 1].Item.offset = si[i].Item.offset + si[i].Item.size;
	  error_cnt++;
	} else {
	  /* pos = end of record i (write target), posold = current start of record i + 1 */
	  pos = (off_t)(si[i].Item.offset + si[i].Item.size);
	  posold = (off_t)si[i + 1].Item.offset;
	  if (i < nitems - 2) {
	    if (si[i + 1].Item.size > (rsize = (size_t)(si[i + 2].Item.offset - si[i + 1].Item.offset))) {
	      DpsLog(P->A, DPS_LOG_ERROR, "si[%ld] size adjusted by offset: %ld -> %ld", i + 1, (long)si[i + 1].Item.size, (long)rsize );
	      si[i + 1].Item.size = rsize;
	      error_cnt++;
	    }
	  }
	  /* only move when there is a gap between the two records */
	  if ((diff = (dps_uint8)posold - (dps_uint8)pos) > 0) {
	    for(
		lseek(P->Sfd, posold, SEEK_SET), rsize = 0;
		(rsize < si[i + 1].Item.size) && ((nread = read(P->Sfd, buffer,
					      (rsize + BUFSIZ < si[i + 1].Item.size) ? BUFSIZ : (si[i + 1].Item.size - rsize) )) > 0);
		lseek(P->Sfd, posold, SEEK_SET)
		) {
	      lseek(P->Sfd, pos, SEEK_SET);
	      (void)write(P->Sfd, buffer, (size_t)nread);
	      rsize += (size_t)nread;
	      posold += (off_t)nread;
	      pos += (off_t)nread;
	    }
	    if (rsize != si[i + 1].Item.size) {
	      DpsLog(P->A, DPS_LOG_ERROR, "si[%ld] size adjusted by size: %ld -> %ld", i + 1, (long)si[i + 1].Item.size, (long)rsize);
	      si[i + 1].Item.size = rsize;
	      error_cnt++;
	    }
	    si[i + 1].Item.offset = si[i].Item.offset + si[i].Item.size;
	    gain += diff;
	  }
	}
      }
      /* cut the data file right behind the last record and account for the freed tail */
      posold = SSize;
      pos = (nitems) ? (off_t)(si[nitems - 1].Item.offset + si[nitems - 1].Item.size) : (off_t)0;
      if (ftruncate(P->Sfd, (off_t)(pos)) != 0) {
	dps_strerror(P->A, DPS_LOG_ERROR, "ftruncate error (pos:%ld) [%s:%d]", pos, __FILE__, __LINE__);
      }
      SSize = pos;

      if (posold > pos) {
	gain += ((dps_uint8)posold - (dps_uint8)pos);
      }

      /* the index file is always rebuilt; the former condition is kept commented out */
      /*if (gain != 0 || OptimizeRatio == 0 || error_cnt > 0)*/ {

	/* posold = old index file size, used for the gain computed after the rebuild */
	posold = lseek(P->Ifd, (off_t)0, SEEK_END);
	(void)ftruncate(P->Ifd, (off_t)0);
	lseek(P->Ifd, (off_t)0, SEEK_SET);

	/* start the new index with DPS_HASH_PRIME slots (presumably zero-filled by DpsXmalloc - confirm) */
	if ((hTable = (DPS_BASEITEM *)DpsXmalloc(sizeof(DPS_BASEITEM) * DPS_HASH_PRIME)) == NULL) {
	  DpsLog(P->A, DPS_LOG_ERROR, "Memory alloc error hTable: %d bytes", sizeof(DPS_BASEITEM) * DPS_HASH_PRIME);
	  DpsBaseClose(P);
	  DPS_FREE(si);
	  return DPS_ERROR;
	}
	if ( (wr = write(P->Ifd, hTable, sizeof(DPS_BASEITEM) * DPS_HASH_PRIME)) != sizeof(DPS_BASEITEM) * DPS_HASH_PRIME) {
	  dps_strerror(P->A, DPS_LOG_ERROR, "[%s:%d] Can't set new index for file %s\nwritten %d bytes of %d",
		 __FILE__, __LINE__, P->Ifilename, wr, sizeof(DPS_BASEITEM) * DPS_HASH_PRIME);
	  DPS_FREE(hTable);
	  DpsBaseClose(P);
	  DPS_FREE(si);
	  return DPS_ERROR;
	}
	DPS_FREE(hTable);

	/* re-insert every surviving record into the fresh index */
	for (i = 0; i < nitems; i++) {
	  if (si[i].Item.rec_id == 0 || si[i].Item.size == 0) continue;
	  if ((long)si[i].Item.offset > (long)SSize) {
	    DpsLog(P->A, DPS_LOG_ERROR, "si[%ld] too long offset: %ld > %ld, removing", i , (long)si[i].Item.offset, (long)SSize);
	    error_cnt++;
	    continue;
	  }
	  P->rec_id = si[i].Item.rec_id;
	  /* DpsBaseSeek() is used for the slot it leaves in P->Item / P->CurrentItemPos */
	  if ((res = DpsBaseSeek(P, DPS_WRITE_LOCK)) != DPS_OK) {
	    DpsBaseClose(P);
	    DPS_FREE(si);
	    return res;
	  }
	  if (P->Item.rec_id != P->rec_id) {
	    /* slot is taken by another record: link it to a new slot at the end of the index file */
	    if (P->mishash && P->Item.rec_id != 0) {
	      if ((P->Item.next = (dps_uint8)(NewItemPos = lseek(P->Ifd, (off_t)0, SEEK_END))) == (dps_uint8)-1) {
		DpsBaseClose(P);
		DPS_FREE(si);
		return DPS_ERROR;
	      }
	      if (lseek(P->Ifd, (off_t)P->CurrentItemPos, SEEK_SET) == (off_t)-1) {
		DpsBaseClose(P);
		DPS_FREE(si);
		return DPS_ERROR;
	      }
	      if (write(P->Ifd, &P->Item, sizeof(DPS_BASEITEM)) != sizeof(DPS_BASEITEM)) {
		DpsBaseClose(P);
		DPS_FREE(si);
		return DPS_ERROR;
	      }
	      P->CurrentItemPos = (dps_uint8)NewItemPos;
	    }
	  }
	  /* write the record, with its chain link cleared, into the chosen slot */
	  P->Item = si[i].Item;
	  P->Item.next = (off_t)0;
	  if (lseek(P->Ifd, (off_t)P->CurrentItemPos, SEEK_SET) == (off_t)-1) {
	    DpsLog(P->A, DPS_LOG_ERROR, "Can't seek %s {%s:%d}", P->Ifilename, __FILE__, __LINE__);
	    DpsBaseClose(P);
	    DPS_FREE(si);
	    return DPS_ERROR;
	  }
	  if (write(P->Ifd, &P->Item, sizeof(DPS_BASEITEM)) != sizeof(DPS_BASEITEM)) {
	    DpsLog(P->A, DPS_LOG_ERROR, "Can't write index for file %s {%s:%d}", P->Ifilename, __FILE__, __LINE__);
	    DpsBaseClose(P);
	    DPS_FREE(si);
	    return DPS_ERROR;
	  }
	}
	/* add the difference between the old and the new index file size to the gain */
	pos = lseek(P->Ifd, (off_t)0, SEEK_END);
	gain += ((dps_uint8)posold - (dps_uint8)pos);

	DpsLog(P->A, DPS_LOG_DEBUG, "Optimize: %s/%s base 0x%X cleaned, %ld bytes freed", P->subdir, P->basename, base, gain);
      }

      DPS_FREE(si);
    }

    /* inconsistencies were found and repaired: step back so the loop processes the same file again */
    if (error_cnt) base--;
    DpsBaseClose(P);
  }
  return DPS_OK;
}