예제 #1
0
/*
 * proceedSTOP: build res's result array from the entries of x interleaved
 * with entries of stop.
 *
 * A new array of (x->count + stop->count + 1) elements is allocated and
 * stored in res->pbegin; res->pcur is left pointing one past the last
 * element written.  Both inputs are walked front to back through their own
 * pcur/plast cursors, which this function resets first.
 *
 * Entries of stop whose url_id is below the current x entry's url_id are
 * dropped; the remaining ones are copied while they compare <= the current
 * x entry (per DpsCmpUrlid).  Every entry of x is copied.
 * NOTE(review): this presumably relies on both inputs being ordered
 * consistently with DpsCmpUrlid -- confirm against the callers.
 *
 * Returns DPS_OK, or DPS_ERROR when the allocation fails.
 */
static int proceedSTOP(DPS_AGENT *query, DPS_STACK_ITEM *res, DPS_STACK_ITEM *x, DPS_STACK_ITEM *stop) {
  DPS_URL_CRD_DB *merged = (DPS_URL_CRD_DB*)DpsMalloc((x->count + stop->count + 1) * sizeof(DPS_URL_CRD_DB));

  res->pcur = merged;
  res->pbegin = merged;
  if (merged == NULL) {
    DpsLog(query, DPS_LOG_ERROR, "Can't alloc %d bytes for %d results", (x->count + stop->count + 1) * sizeof(DPS_URL_CRD_DB),
	   (x->count + stop->count + 1));
    return DPS_ERROR;
  }

  /* Rewind both input cursors and compute their end markers. */
  x->pcur = x->pbegin;
  x->plast = x->pbegin + x->count;
  stop->pcur = stop->pbegin;
  stop->plast = stop->pbegin + stop->count;

  if (stop->pcur < stop->plast) {
    for (; x->pcur < x->plast; ) {
      /* Drop stop entries that lie strictly before the current x url_id. */
      for (; stop->pcur < stop->plast && stop->pcur->url_id < x->pcur->url_id; stop->pcur++) {
      }
      /* Emit stop entries that do not compare after the current x entry. */
      for (; stop->pcur < stop->plast && DpsCmpUrlid(stop->pcur, x->pcur) <= 0; res->pcur++, stop->pcur++) {
	*res->pcur = *stop->pcur;
      }
      if (!(stop->pcur < stop->plast)) break;
      /* Emit x entries that do not compare after the current stop entry. */
      for (; x->pcur < x->plast && DpsCmpUrlid(x->pcur, stop->pcur) <= 0; res->pcur++, x->pcur++) {
	*res->pcur = *x->pcur;
      }
    }
  }

  /* Whatever is left of x goes to the output unchanged. */
  for (; x->pcur < x->plast; res->pcur++, x->pcur++) {
    *res->pcur = *x->pcur;
  }

  return DPS_OK;
}
예제 #2
0
/*
 * allocateMoreSlots enlarges the slot table (allocationList) by one page:
 * a buffer bytesPerPage larger is obtained, the current table is copied
 * into it, the added tail is zeroed, and the bookkeeping counters
 * (allocationListSize, slotCount, unUsedSlots) are advanced before the
 * previous table is released.
 *
 * The table is made accessible first, and noAllocationListProtection /
 * internalUse are raised for the duration of the call.
 *
 * NOTE(review): the result of DpsMalloc() is used without a NULL check.
 */
static void
allocateMoreSlots(void)
{
    void * const	previousList = allocationList;
    const size_t	grownSize = allocationListSize + bytesPerPage;
    char *		grownList;

    Page_AllowAccess(allocationList, allocationListSize);
    noAllocationListProtection = 1;
    internalUse = 1;

    grownList = (char *)DpsMalloc(grownSize);
    dps_memcpy(grownList, allocationList, allocationListSize);
    memset(grownList + allocationListSize, 0, bytesPerPage);

    allocationList = (Slot *)grownList;
    allocationListSize = grownSize;
    slotCount += slotsPerPage;
    unUsedSlots += slotsPerPage;

    DpsFree(previousList);

    /*
     * The allocation list is deliberately left accessible here: per the
     * original author's note, control returns to memalign(), which still
     * needs to touch it.
     */
    noAllocationListProtection = 0;
    internalUse = 0;
}
예제 #3
0
dpsunicode_t *DpsUniGermanReplace(const dpsunicode_t *str) {
  size_t l = DpsUniLen(str);
  dpsunicode_t *german = DpsMalloc((2 * l + 1) * sizeof(dpsunicode_t));
  if (german !=NULL) {
    dpsunicode_t *s = str, *d = german;
    while(*s) {
      switch(*s) {
      case 0x00DF: /* eszett, or scharfes s, small */
	*d++ = 's'; *d++ = 's'; break;
      case 0x1E9E: /* eszett, or scharfes s, big */
	*d++ = 'S'; *d++ = 'S'; break;
      case 0x00D6: *d++ = 'O'; *d++ = 'E'; break;
      case 0x00F6: *d++ = 'o'; *d++ = 'e'; break;

      case 0x00DC: *d++ = 'U'; *d++ = 'E'; break;
      case 0x00FC: *d++ = 'u'; *d++ = 'e'; break;

      case 0x00C4: *d++ = 'A'; *d++ = 'E'; break;
      case 0x00E4: *d++ = 'a'; *d++ = 'e'; break;

      default: *d++ = *s;
      }
      s++;
    }
    *d = 0;
  }
  return german;
}
예제 #4
0
/*
 * proceedOR: concatenate-merge the entries of x1 and x2 into a newly
 * allocated array stored in res->pbegin; res->pcur ends up one past the
 * last element written.  Every entry of both inputs is copied.
 *
 * The merge repeatedly copies from one list while its current entry
 * compares <= the other list's current entry (per DpsCmpUrlid), then the
 * two lists trade roles.  Once either list is exhausted, the remainder of
 * the other is appended.
 * NOTE(review): the output order presumably assumes both inputs are
 * already ordered per DpsCmpUrlid -- confirm against the callers.
 *
 * Returns DPS_OK, or DPS_ERROR when the allocation fails.
 */
static int proceedOR(DPS_AGENT *query, DPS_STACK_ITEM *res, DPS_STACK_ITEM *x1, DPS_STACK_ITEM *x2) {
  DPS_STACK_ITEM *lead = x1;
  DPS_STACK_ITEM *peer = x2;
  DPS_URL_CRD_DB *merged = (DPS_URL_CRD_DB*)DpsMalloc((x1->count + x2->count + 1) * sizeof(DPS_URL_CRD_DB));

  res->pcur = merged;
  res->pbegin = merged;
  if (merged == NULL) {
    DpsLog(query, DPS_LOG_ERROR, "Can't alloc %d bytes for %d results", (x1->count + x2->count + 1) * sizeof(DPS_URL_CRD_DB),
	   (x1->count + x2->count + 1));
    return DPS_ERROR;
  }

  /* Rewind both input cursors and compute their end markers. */
  x1->pcur = x1->pbegin;
  x1->plast = x1->pbegin + x1->count;
  x2->pcur = x2->pbegin;
  x2->plast = x2->pbegin + x2->count;

  while (lead->pcur < lead->plast && peer->pcur < peer->plast) {
    DPS_STACK_ITEM *swap;

    /* Copy from the leading list while it does not compare after the peer. */
    for (; (lead->pcur < lead->plast) && (DpsCmpUrlid(lead->pcur, peer->pcur) <= 0); res->pcur++, lead->pcur++) {
      *res->pcur = *lead->pcur;
    }

    /* Trade roles so the other list gets its turn. */
    swap = lead;
    lead = peer;
    peer = swap;
  }

  /* At most one of the two lists still has entries; append them. */
  for (; lead->pcur < lead->plast; res->pcur++, lead->pcur++) {
    *res->pcur = *lead->pcur;
  }
  for (; peer->pcur < peer->plast; res->pcur++, peer->pcur++) {
    *res->pcur = *peer->pcur;
  }
  return DPS_OK;
}
예제 #5
0
/*
 * DpsBaseRead: read the record identified by P->rec_id into the caller's
 * buffer.
 *
 *   P    base descriptor; DpsBaseSeek(P, DPS_READ_LOCK) is called first and
 *        is expected to fill P->Item and make P->Sfd usable.
 *   buf  destination buffer.
 *   len  capacity of buf in bytes; must be at least the record's original
 *        (uncompressed) size, or its stored size when no original size is
 *        recorded.
 *
 * When built with HAVE_ZLIB and the record is stored deflated
 * (P->zlib_method == Z_DEFLATED and P->Item.orig_size != 0) the stored
 * bytes are read into a temporary buffer and inflated into buf; otherwise
 * the stored bytes are read straight into buf.
 *
 * Returns DPS_OK on success, the DpsBaseSeek() result if the seek fails,
 * and DPS_ERROR when the record is not found, does not fit in buf, cannot
 * be read, or cannot be decompressed.
 */
__C_LINK int __DPSCALL DpsBaseRead(DPS_BASE_PARAM *P, void *buf, size_t len) {
  int res = DPS_OK;
#ifdef HAVE_ZLIB
  z_stream zstream;
  Byte *CDoc = NULL;
#endif

  if ((res = DpsBaseSeek(P, DPS_READ_LOCK)) != DPS_OK) return res;

  if (P->Item.rec_id == P->rec_id) {
    if (lseek(P->Sfd, (off_t)P->Item.offset, SEEK_SET) == (off_t)-1) {
      DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s.%d] %ld lseek error, rec_id: %x",
	     P->subdir, P->basename, P->FileNo, P->Item.offset, P->rec_id);
      return DPS_ERROR;
    }
    /* Refuse to read a record that would not fit in the caller's buffer. */
    if ((P->Item.orig_size ? P->Item.orig_size : P->Item.size) > len) {
      DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s] size %d->%d error, rec_id: %x",
	     P->subdir, P->basename, (P->Item.orig_size ? P->Item.orig_size : P->Item.size), len, P->rec_id);
      return DPS_ERROR;
    }
#ifdef HAVE_ZLIB
    bzero(&zstream, sizeof(zstream));

    if ((P->zlib_method == Z_DEFLATED) && (P->Item.orig_size != 0)) {
      int zrc;

      zstream.avail_in = (uInt)P->Item.size;
      zstream.avail_out = (uInt)len;
      zstream.next_out = (Byte *) buf;
      CDoc = zstream.next_in = (Byte *) DpsMalloc(P->Item.size + 1);
      if (CDoc == NULL) {
	return DPS_ERROR;
      }
      zstream.zalloc = Z_NULL;
      zstream.zfree = Z_NULL;
      zstream.opaque = Z_NULL;
      if (read(P->Sfd, CDoc, P->Item.size) != (ssize_t)P->Item.size) {
	DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s] %d read error, rec_id: %x -- %d",  P->subdir, P->basename, P->Item.size, P->rec_id, __LINE__);
	DPS_FREE(CDoc);
	return DPS_ERROR;
      }
      /* A stream that failed to initialise must not be inflated or ended. */
      if (inflateInit2(&zstream, P->zlib_windowBits) != Z_OK) {
	DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s] inflateInit2 error, rec_id: %x -- %d",  P->subdir, P->basename, P->rec_id, __LINE__);
	DPS_FREE(CDoc);
	return DPS_ERROR;
      }
      /*
       * The whole compressed record is supplied at once and the output
       * buffer was checked above, so a complete record must finish with
       * Z_STREAM_END; anything else means buf holds partial or no data.
       */
      zrc = inflate(&zstream, Z_FINISH);
      inflateEnd(&zstream);
      DPS_FREE(CDoc);
      if (zrc != Z_STREAM_END) {
	DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s] inflate error %d, rec_id: %x -- %d",  P->subdir, P->basename, zrc, P->rec_id, __LINE__);
	return DPS_ERROR;
      }
    } else

#endif
    /* Uncompressed record (or zlib not compiled in): read it directly. */
    if (read(P->Sfd, buf, P->Item.size) != (ssize_t)P->Item.size) {
      DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s] %d read error, rec_id: %x -- %d",  P->subdir, P->basename, P->Item.size, P->rec_id, __LINE__);
      return DPS_ERROR;
    }

  } else {
    DpsLog(P->A, DPS_LOG_DEBUG, "%s:[%s/%s] Not found rec_id: %x",  P->vardir, P->subdir, P->basename, P->rec_id);
    return DPS_ERROR;
  }
#ifdef DEBUG_SEARCH
  DpsLog(P->A, DPS_LOG_DEBUG, "[%s/%s] Retrieved rec_id: %x Size: %d", P->subdir, P->basename, P->rec_id, P->Item.size);
#endif
  return DPS_OK;
}
예제 #6
0
/*
 * DpsUniNDup: duplicate at most len characters of the 0-terminated string s
 * into a new buffer obtained from DpsMalloc().  The copy is always
 * 0-terminated.  Returns NULL when the allocation fails.
 */
dpsunicode_t *DpsUniNDup(const dpsunicode_t *s, size_t len) {
	const size_t avail = DpsUniLen(s);
	const size_t ncopy = (avail > len) ? len : avail;
	dpsunicode_t *copy = (dpsunicode_t*)DpsMalloc((ncopy + 1) * sizeof(*s));

	if (copy != NULL) {
		dps_memmove(copy, s, ncopy * sizeof(*s));
		copy[ncopy] = 0;
	}
	return copy;
}
예제 #7
0
/*
 * DpsUniDup: duplicate the 0-terminated string s, terminator included,
 * into a new buffer obtained from DpsMalloc().  Returns NULL when the
 * allocation fails.
 */
dpsunicode_t *DpsUniDup(const dpsunicode_t *s) {
	const size_t nbytes = (DpsUniLen(s) + 1) * sizeof(*s);
	dpsunicode_t *copy = (dpsunicode_t*)DpsMalloc(nbytes);

	if (copy != NULL) {
		/* Source and fresh destination cannot overlap, so a plain copy is enough. */
		dps_memcpy(copy, s, nbytes);
	}
	return copy;
}
예제 #8
0
/*
 * DpsURLInit: return a zero-filled DPS_URL.
 *
 * With url == NULL a new structure is allocated and its freeme flag is set
 * to 1.  With a caller-supplied structure the contents are zeroed in place
 * and the freeme flag it had on entry is kept.
 *
 * Returns the initialised structure, or NULL when the allocation fails.
 */
DPS_URL * __DPSCALL DpsURLInit(DPS_URL *url) {
  int keep_flag;

  if (url != NULL) {
    /* Remember the ownership flag before it gets wiped. */
    keep_flag = url->freeme;
  } else {
    url = (DPS_URL*)DpsMalloc(sizeof(DPS_URL));
    if (url == NULL) return NULL;
    keep_flag = 1;
  }

  bzero((void*)url, sizeof(DPS_URL));
  url->freeme = keep_flag;
  return url;
}
예제 #9
0
/*
 * DpsBoolStackInit: prepare a boolean-expression stack.
 *
 * With s == NULL a new DPS_BOOLSTACK is allocated and marked freeme = 1;
 * otherwise the caller's structure is zeroed in place (which also clears
 * its freeme flag).  Two arrays of DPS_MAXSTACK entries are then allocated:
 * cstack (ints) and astack (DPS_STACK_ITEMs), and both capacity fields are
 * set to DPS_MAXSTACK.
 *
 * Returns the stack, or NULL when any allocation fails.  On failure the
 * arrays already obtained are released, and the structure itself is
 * released only if this call allocated it.
 */
DPS_BOOLSTACK *DpsBoolStackInit(DPS_BOOLSTACK *s) {
	const int allocated_here = (s == NULL);

	if (allocated_here) {
		s = (DPS_BOOLSTACK*)DpsMalloc(sizeof(DPS_BOOLSTACK));
		if (s == NULL) return NULL;
	}
	bzero((void*)s, sizeof(*s));
	if (allocated_here) {
		s->freeme = 1;
	}

	s->ncstack = 0;
	s->nastack = 0;
	s->mcstack = s->mastack = DPS_MAXSTACK;

	s->cstack = (int*)DpsMalloc(DPS_MAXSTACK * sizeof(int));
	if (s->cstack != NULL) {
		s->astack = (DPS_STACK_ITEM*)DpsMalloc(DPS_MAXSTACK * sizeof(DPS_STACK_ITEM));
		if (s->astack != NULL) {
			return s;
		}
		/* Second array failed: give back the first one. */
		DPS_FREE(s->cstack);
	}

	if (s->freeme) {
		DPS_FREE(s);
	}
	return NULL;
}
예제 #10
0
static void initialize(void) {
	size_t	size = MEMORY_CREATION_SIZE;
	size_t	slack;
	Slot *	slot;

	FE_Print(version);

	lock();

	/*
	 * Get the run-time configuration of the virtual memory page size.
 	 */
	bytesPerPage = Page_Size();

	/*
	 * Figure out how many Slot structures to allocate at one time.
	 */
	slotCount = slotsPerPage = bytesPerPage / sizeof(Slot);
	allocationListSize = bytesPerPage;

	if ( allocationListSize > size )
		size = allocationListSize;

	if ( (slack = size % bytesPerPage) != 0 )
		size += bytesPerPage - slack;

	/*
	 * Allocate memory, and break it up into two malloc buffers. The
	 * first buffer will be used for Slot structures, the second will
	 * be marked free.
	 */
	slot = allocationList = (Slot *)DpsMalloc(size);
	memset((char *)allocationList, 0, allocationListSize);

	/*
	 * Account for the two slot structures that we've used.
	 */
	unUsedSlots = slotCount;

	release();
#ifdef HAVE_PTHREAD
	if (!semEnabled) {
		semEnabled = 1;
		if (sem_init(&FE_sem, 0, 1) < 0) {
		  semEnabled = 0;
		}
	}
#endif
}
예제 #11
0
dpsunicode_t *DpsUniRDup(const dpsunicode_t *s) {
	dpsunicode_t *res;
	size_t size, len;
	
	size = ((len = DpsUniLen(s)) + 1) * sizeof(*s);
	if((res=(dpsunicode_t*)DpsMalloc(size)) == NULL)
		return(NULL);
	{
	  register size_t z;
	  size = len - 1;
	  for (z = 0; z < len; z++) res[z] = s[size - z];
	  res[len] = 0;
	}
	return res;
}
예제 #12
0
/*
 * DoStore: deflate a document and write it into the "store"/"doc" base
 * under the key rec_id.
 *
 *   Agent    agent whose configuration and variables are consulted.
 *   rec_id   record key; also selects the database (rec_id modulo the
 *            number of configured databases).
 *   Doc      document bytes to compress.
 *   DocSize  number of bytes in Doc.
 *   Client   label used only in the "Stored" log message.
 *
 * NOTE(review): only part of this function is visible here; the closing
 * braces, the release of CDoc and the final return are outside this view.
 */
static int DoStore(DPS_AGENT *Agent, urlid_t rec_id, Byte *Doc, size_t DocSize, char *Client) {
  z_stream zstream;
  DPS_BASE_PARAM P;
  int rc = DPS_OK;
  Byte *CDoc = NULL;
  /* With DPS_FLAG_UNOCON set the database list of Agent->Conf is used, otherwise the agent's own list. */
  size_t dbnum = ((size_t)rec_id) % ((Agent->flags & DPS_FLAG_UNOCON) ? Agent->Conf->dbl.nitems : Agent->dbl.nitems);
  DPS_DB *db = (Agent->flags & DPS_FLAG_UNOCON) ? &Agent->Conf->dbl.db[dbnum] : &Agent->dbl.db[dbnum];

            zstream.zalloc = Z_NULL;
            zstream.zfree = Z_NULL;
            zstream.opaque = Z_NULL;
            zstream.next_in = Doc;
          
            /* Compression level 9, windowBits 15, memLevel 9, default strategy. */
            if (deflateInit2(&zstream, 9, Z_DEFLATED, 15, 9, Z_DEFAULT_STRATEGY) == Z_OK) {
          
              zstream.avail_in = DocSize;
              zstream.avail_out = 2 * DocSize;
              /* The output buffer is one byte larger than the avail_out reported to zlib. */
              CDoc = zstream.next_out = (Byte *) DpsMalloc(2 * DocSize + 1);
              if (zstream.next_out == NULL) {
                /* NOTE(review): returns without calling deflateEnd() on the initialised stream. */
                return DPS_ERROR;
              }
              /* NOTE(review): the result of deflate() is not checked. */
              deflate(&zstream, Z_FINISH);
              deflateEnd(&zstream);


/* store operations */

              bzero(&P, sizeof(P));
              P.subdir = "store";
              P.basename = "doc";
              P.indname = "doc";
              P.rec_id = rec_id;
	      P.mode = DPS_WRITE_LOCK;
	      /* Per-database settings win; the agent variables supply the fallback. */
	      P.NFiles = (db->StoredFiles) ? db->StoredFiles : DpsVarListFindInt(&Agent->Vars, "StoredFiles", 0x100);
	      P.vardir = (db->vardir) ? db->vardir : DpsVarListFindStr(&Agent->Vars, "VarDir", DPS_VAR_DIR);
	      P.A = Agent;
	      /* total_out is the number of compressed bytes zlib produced. */
	      if (DpsBaseWrite(&P, CDoc, zstream.total_out) != DPS_OK) {
		DpsLog(Agent, DPS_LOG_ERROR, "store/doc write error: %s", strerror(errno));
		rc = DPS_ERROR;
              }

	      DpsBaseClose(&P);
	      /* Ratio is compressed size as a percentage of the original size. */
	      if (rc == DPS_OK) DpsLog(Agent, DPS_LOG_EXTRA, "[%s] Stored rec_id: %x Size: %d Ratio: %5.2f%%", Client,
				       rec_id, DocSize, 100.0 * zstream.total_out / DocSize);

	      /* NOTE(review): P is handed to DpsBaseOptimize() after DpsBaseClose(&P) -- confirm this is supported. */
	      if (Agent->Flags.OptimizeAtUpdate) {
		DpsBaseOptimize(&P, ((int)rec_id) >> DPS_BASE_BITS);
	      }
예제 #13
0
/*
 * DpsEnvInit: zero a DPS_ENV and load its built-in defaults.
 *
 * With Conf == NULL a new structure is allocated and marked freeme = 1;
 * otherwise the caller's structure is zeroed in place.  Both character
 * sets default to "latin1" and CharsToEscape to a copy of "\"&<>".
 * When built with MECAB a mecab instance is created as well.
 *
 * Returns the environment, or NULL when the structure cannot be allocated.
 */
DPS_ENV *DpsEnvInit(DPS_ENV *Conf){
#ifdef MECAB
  const char *mecab_argv[8] = {"mecab", "-F", "%m ", "-B", " ", "-E", " ", NULL};
#endif
	const int allocated_here = (Conf == NULL);

	if (allocated_here) {
		Conf = (DPS_ENV *)DpsMalloc(sizeof(DPS_ENV));
		if (Conf == NULL) {
			return NULL;
		}
	}
	bzero((void*)Conf, sizeof(*Conf));
	if (allocated_here) {
		Conf->freeme = 1;
	}

	/* Behaviour switches and tuning defaults. */
	Conf->Flags.OptimizeAtUpdate     = 1;
	Conf->Flags.do_excerpt           = 1;
	Conf->Flags.PopRankNeoIterations = 3;
	Conf->Flags.GuesserBytes         = 512;
	Conf->Flags.robots_period        = 604800;   /* one week */
	Conf->Flags.URLInfoSQL           = 1;
	Conf->Flags.SRVInfoSQL           = 1;
	Conf->Flags.CheckInsertSQL       = 1;
	Conf->Flags.mark_for_index       = 1;
	Conf->Flags.MaxSiteLevel         = 2;
	Conf->Flags.SEASentences         = 32;
	Conf->Flags.SEASentenceMinLength = 64;
	Conf->Flags.PagesInGroup         = 1;
	Conf->Flags.SubDocCnt            = 5;
	Conf->Flags.MaxCrawlDelay        = 300;
	Conf->Flags.rel_nofollow         = 1;
	Conf->Flags.bind_addr.sin_family = AF_INET;
	Conf->Flags.use_meta             = 1;

	/* Word handling defaults. */
	Conf->WordParam.min_word_len     = 1;
	Conf->WordParam.max_word_len     = 32;
	Conf->WordParam.correct_factor   = 1;
	Conf->WordParam.incorrect_factor = 1;

	Conf->url_number = 0x7FFFFFFF;

	/* Character sets and escaping. */
	Conf->lcs = DpsGetCharSet("latin1");
	Conf->bcs = DpsGetCharSet("latin1");
	Conf->CharsToEscape = DpsStrdup("\"&<>");
#ifdef MECAB
	/* Seven arguments are passed; the eighth array entry is the NULL terminator. */
	Conf->mecab = mecab_new(7, (char**)mecab_argv);
#endif

	return(Conf);
}
예제 #14
0
/*
 * DpsSearchdSendPacket: send a packet header, optionally followed by
 * hdr->len bytes of payload, over fd.
 *
 * Without payload (data == NULL) only the header is sent.  With payload,
 * header and data are first joined in one temporary buffer so that they go
 * out in a single DpsSend() call.
 *
 * Returns the DpsSend() result, or 0 when the temporary buffer cannot be
 * allocated (nothing is sent in that case).
 */
ssize_t DpsSearchdSendPacket(int fd,const DPS_SEARCHD_PACKET_HEADER *hdr,const void *data){
	char *packet;
	ssize_t sent;

	if (data == NULL) {
		return DpsSend(fd, hdr, sizeof(*hdr), 0);
	}

	packet = (char*)DpsMalloc(sizeof(*hdr) + hdr->len);
	if (packet == NULL) {
		return 0;
	}

	dps_memcpy(packet, hdr, sizeof(*hdr));
	dps_memcpy(packet + sizeof(*hdr), data, hdr->len);
	sent = DpsSend(fd, packet, sizeof(*hdr) + hdr->len, 0);

	DPS_FREE(packet);
	return sent;
}
예제 #15
0
/*
 * allocateMoreSlots enlarges the slot table (allocationList) by one page:
 * a buffer bytesPerPage larger is obtained, the current table is copied
 * into it, the added tail is zeroed, and the bookkeeping counters
 * (allocationListSize, slotCount, unUsedSlots) are advanced before the
 * previous table is released.
 *
 * NOTE(review): the result of DpsMalloc() is used without a NULL check.
 */
static void allocateMoreSlots(void) {
	void * const	previousList = allocationList;
	const size_t	grownSize = allocationListSize + bytesPerPage;
	char *		grownList;

	grownList = (char *)DpsMalloc(grownSize);
	dps_memmove(grownList, allocationList, allocationListSize);
	memset(grownList + allocationListSize, 0, bytesPerPage);

	allocationList = (Slot *)grownList;
	allocationListSize = grownSize;
	slotCount += slotsPerPage;
	unUsedSlots += slotsPerPage;

	DpsFree(previousList);
}
예제 #16
0
/*
 * DpsUniRegComp: split pattern into tokens with DpsUniRegTok() and store a
 * private, 0-terminated copy of each token in reg->Token[].
 *
 *   reg      expression to fill; its token list is reset on entry.
 *   pattern  pattern text to tokenise.
 *
 * Returns DPS_OK on success and DPS_ERROR when memory runs out.
 * If growing the token array fails, reg->ntokens is reset to 0.  If the
 * copy of a single token cannot be allocated, reg->Token / reg->ntokens
 * still describe the tokens completed so far, so the caller can release
 * them.
 *
 * NOTE(review): the array is grown with reg->Token = DpsRealloc(reg->Token,
 * ...); whether the old array and the token strings it holds are released
 * when that fails depends on DpsRealloc(), which is not visible here.
 */
static int DpsUniRegComp(DPS_UNIREG_EXP *reg, const dpsunicode_t *pattern) {
	const dpsunicode_t *tok, *lt;

	reg->ntokens=0;
	reg->Token=NULL;

	tok=DpsUniRegTok(pattern,&lt);
	while(tok){
		size_t len;
		dpsunicode_t *copy;

		reg->Token=(DPS_UNIREG_TOK*)DpsRealloc(reg->Token,sizeof(*reg->Token)*(reg->ntokens+1));
		if (reg->Token == NULL) {
		  reg->ntokens = 0;
		  return DPS_ERROR;
		}
		len=lt-tok;
		copy = (dpsunicode_t*)DpsMalloc((len+1)*sizeof(dpsunicode_t));
		if (copy == NULL) {
		  /* Do not count the half-built token; earlier ones stay valid. */
		  return DPS_ERROR;
		}
		dps_memmove(copy, tok, len * sizeof(dpsunicode_t));
		copy[len]=0;
		reg->Token[reg->ntokens].str = copy;

		reg->ntokens++;
		tok=DpsUniRegTok(NULL,&lt);
	}
	return DPS_OK;
}
예제 #17
0
int main(int argc,char **argv, char **envp) {
  int ch, sleeps = 1, optimize = 0, obi = 0;
  unsigned int from = 0, to = 0xFFF, p_to = 0;
	DPS_ENV * Env;
	const char * config_name = DPS_CONF_DIR "/cached.conf";

	DpsInit(argc, argv, envp); /* Initialize library */
	
	DpsInitMutexes();
	Env=DpsEnvInit(NULL);
	if (Env == NULL) exit(1);
	DpsSetLockProc(Env, DpsLockProc);

/*#ifndef HAVE_SETPROCTITLE*/
	ARGV = argv;
	ARGC = argc;
/*#endif*/
	while ((ch = getopt(argc, argv, "blt:f:op:w:v:h?")) != -1){
		switch (ch) {
			case 'f':
				sscanf(optarg, "%x", &from);
				break;	
			case 't': 
				sscanf(optarg, "%x", &p_to);
				break;
			case 'w':
			        DpsVarListReplaceStr(&Env->Vars, "VarDir", optarg);
				break;
                        case 'v': DpsSetLogLevel(NULL, atoi(optarg)); break;
                        case 'b': obi++; break;
                        case 'o': optimize++; break;
                        case 'p': sleeps = atoi(optarg); break;
			case 'h':
			case '?':
			default:
			  usage();
			  DpsEnvFree(Env);
			  DpsDeInit();
			  DpsDestroyMutexes();
				return 1;
				break;
		}
	}
	argc -= optind;
	argv += optind;

	if(argc > 1) {
		usage();
		DpsEnvFree(Env);
		DpsDeInit();
		DpsDestroyMutexes();
		return 1;
	} else if (argc == 1) {
	        config_name = argv[0];
	}
	{
		DPS_LOGDEL *del_buf=NULL;
		size_t del_count = 0, log, bytes, n = 0;
		int dd, log_fd;
		struct stat sb;
		char dname[PATH_MAX] = "";
		DPS_BASE_PARAM P;
		DPS_LOGWORD *log_buf = NULL;
		DPS_AGENT *Indexer = DpsAgentInit(NULL, Env, 0);

		log2stderr = 1;
		if (Indexer == NULL) {
		  fprintf(stderr, "Can't alloc Agent at %s:%d\n", __FILE__, __LINE__);
		  exit(DPS_ERROR);
		}
		
		if(DPS_OK != DpsEnvLoad(Indexer, config_name, (dps_uint8)0)){
		  fprintf(stderr, "%s\n", DpsEnvErrMsg(Env));
		  DpsEnvFree(Env);
		  DpsDeInit();
		  DpsDestroyMutexes();
		  return DPS_ERROR;
		}
		DpsOpenLog("splitter", Env, log2stderr);
		Indexer->flags = Env->flags = DPS_FLAG_UNOCON;
		DpsVarListAddLst(&Indexer->Vars, &Env->Vars, NULL, "*");

		bzero(&P, sizeof(P));
		P.subdir = DPS_TREEDIR;
		P.basename = "wrd";
		P.indname = "wrd";
		P.mode = DPS_WRITE_LOCK;
		P.NFiles = DpsVarListFindInt(&Indexer->Conf->Vars, "WrdFiles", 0x300);
		P.vardir = DpsStrdup(DpsVarListFindStr(&Indexer->Conf->Vars, "VarDir", DPS_VAR_DIR));
		P.A = Indexer;
		if (p_to != 0) to = p_to;
		else to = P.NFiles - 1;
#ifdef HAVE_ZLIB
		P.zlib_method = Z_DEFLATED;
		P.zlib_level = 9;
		P.zlib_windowBits = DPS_BASE_WRD_WINDOWBITS;
		P.zlib_memLevel = 9;
		P.zlib_strategy = DPS_BASE_WRD_STRATEGY;
#endif

		/* Open del log file */
		dps_snprintf(dname,sizeof(dname),"%s%c%s%cdel-split.log", P.vardir, DPSSLASH, DPS_SPLDIR, DPSSLASH);
		if((dd = DpsOpen2(dname, O_RDONLY | DPS_BINARY)) < 0) {
		  dps_strerror(NULL, 0, "Can't open del log '%s'", dname);
		  exit(DPS_ERROR);
		}

		DpsLog(Indexer, DPS_LOG_DEBUG, "VarDir: %s, WrdFiles: %d [%x]", P.vardir, P.NFiles, P.NFiles);

		/* Allocate del buffer */
		fstat(dd, &sb);
		if (sb.st_size != 0) {
		  del_buf=(DPS_LOGDEL*)DpsMalloc((size_t)sb.st_size + 1);
		  if (del_buf == NULL) {
		    fprintf(stderr, "Can't alloc %d bytes at %s:%d\n", (int)sb.st_size, __FILE__, __LINE__);
		    exit(0);
		  }
		  del_count=read(dd,del_buf,(size_t)sb.st_size)/sizeof(DPS_LOGDEL);
		}
		DpsClose(dd);

		/* Remove duplicates URLs in DEL log     */
		/* Keep only oldest records for each URL */
		if (del_count > 0) {
		  DpsLog(Indexer, DPS_LOG_DEBUG, "Sorting del_buf: %d items", del_count);
		  if (del_count > 1) DpsSort(del_buf, (size_t)del_count, sizeof(DPS_LOGDEL), DpsCmpurldellog);
		    DpsLog(Indexer, DPS_LOG_DEBUG, "Removing DelLogDups");
		  del_count = DpsRemoveDelLogDups(del_buf, del_count);
		}

		DpsLog(Indexer, DPS_LOG_DEBUG, "Processing Bufs from %d [%x] to %d [%x]", from, from, to, to);

		for(log = from; log <= to; log++) {

		  /* Open log file */
		  dps_snprintf(dname, sizeof(dname), "%s%c%s%c%03X-split.log", P.vardir, DPSSLASH, DPS_SPLDIR, DPSSLASH, log);
		  if((log_fd = DpsOpen2(dname, O_RDWR|DPS_BINARY)) < 0){
		    if (errno == ENOENT) {
		      dps_strerror(Indexer, DPS_LOG_DEBUG, "Can't open '%s'", dname);
		      n = 0;
/*		      continue;*/
		    } else {
		      dps_strerror(Indexer, DPS_LOG_ERROR, "Can't open '%s'", dname);
		      continue;
		    }
		  } else {
		    DpsWriteLock(log_fd); 
		    DpsLog(Indexer, DPS_LOG_DEBUG, "Processing Log: %x", log);
		    fstat(log_fd, &sb);
		    log_buf = (sb.st_size > 0) ? (DPS_LOGWORD*)DpsMalloc((size_t)sb.st_size + 1) : NULL;
		    if (log_buf != NULL) {
		      unlink(dname);
		      bytes = read(log_fd,log_buf,(size_t)sb.st_size);
		      (void)ftruncate(log_fd, (off_t)0);
		      DpsUnLock(log_fd);
		      DpsClose(log_fd);
		      
		      n = bytes / sizeof(DPS_LOGWORD);
		      DpsLog(Indexer, DPS_LOG_DEBUG, "Sorting log_buf: %d items", n);
		      if (n > 1) DpsSort(log_buf, n, sizeof(DPS_LOGWORD), (qsort_cmp)DpsCmplog);
		      DpsLog(Indexer, DPS_LOG_DEBUG, "Removing OldWords");
		      n = DpsRemoveOldWords(log_buf, n, del_buf, del_count);
		      if (n > 1) DpsSort(log_buf, n, sizeof(DPS_LOGWORD), (qsort_cmp)DpsCmplog_wrd);
		      
		    } else {
		      n = 0;
		      DpsUnLock(log_fd);
		      DpsClose(log_fd);
		    }
		  }

		  DpsLog(Indexer, DPS_LOG_DEBUG, "Processing Buf, optimize: %d", optimize);
		  if (obi) DpsBaseOptimize(&P, log);
		  DpsProcessBuf(Indexer, &P, log, log_buf, n, del_buf, del_count);
		  if (optimize) DpsBaseOptimize(&P, log);
		  DpsBaseClose(&P);
		  DPS_FREE(log_buf);

		  DpsLog(Indexer, DPS_LOG_DEBUG, "pas done: %d from %d to %d", log, from, to);
		  DPSSLEEP(sleeps);
		}
		DPS_FREE(del_buf);
		DpsAgentFree(Indexer);
		DPS_FREE(P.vardir);
	}

	fprintf(stderr, "Splitting done.\n");
	
	DpsEnvFree(Env);
	DpsDeInit();
	DpsDestroyMutexes();

#ifdef EFENCE
	fprintf(stderr, "Memory leaks checking\n");
	DpsEfenceCheckLeaks();
#endif
#ifdef FILENCE
	fprintf(stderr, "FD leaks checking\n");
	DpsFilenceCheckLeaks(NULL);
#endif
	return 0;
}
예제 #18
0
int _DpsURLParse(DPS_URL *url, const char *str, const char *filename, int line) {
#else
int DpsURLParse(DPS_URL *url, const char *str) {
#endif
	/*
	 * Parse the URL text str into the fields of url.
	 *
	 * All string fields of url are released first and then rebuilt from
	 * str: schema, specific, hostinfo, auth, hostname, port, default_port,
	 * path, filename, directory, query_string, plus len and domain_level.
	 * Schema, hostname and the host part of hostinfo are lower-cased.
	 *
	 * Returns DPS_OK on success, DPS_URL_BAD for schemas that are not
	 * handled (mailto, javascript, feed, unknown host-less schemas) or a
	 * hostname holding one of , ' " ; and DPS_ERROR when a string needed
	 * for further parsing cannot be allocated.
	 */
	char *schema,*anchor,*file,*query;
	char *s;
/*	size_t len = dps_strlen(str);*/
#ifdef WITH_PARANOIA
	void * paran = DpsViolationEnter(paran);
#endif

#ifdef DEBUG_URL
	fprintf(stderr, " -- %s:%d Parser url: %s\n", filename, line, str);
#endif

	DPS_FREE(url->schema);
	DPS_FREE(url->specific);
	DPS_FREE(url->hostinfo);
	DPS_FREE(url->hostname);
	DPS_FREE(url->anchor);
	DPS_FREE(url->auth);
	url->port=0;
	url->default_port=0;
	DPS_FREE(url->path);
	DPS_FREE(url->directory);
	DPS_FREE(url->filename);
	DPS_FREE(url->query_string);

/*	if(len >= DPS_URLSIZE)return(DPS_URL_LONG);  FIXME: Change this checking for configured parameter, not DPS_URLSIZE */
	s = (char*)DpsStrdup(str);
	if (s == NULL) goto nomem;

	url->len = dps_strlen(str);

	/* Find a possible schema end, then check that it really is a    */
	/* schema: it must consist of letters only (digits are accepted  */
	/* too, e.g. for oracle8://).  The check is needed because the   */
	/* ':' might belong to an anchor, as in "mod/index.html#a:1".    */

	if((schema=strchr(s,':'))){
		const char * ch;
		for(ch=s;ch<schema;ch++){
			/* <ctype.h> functions need an unsigned char value. */
			if(!isalnum((unsigned char)*ch)){
				/* Bad character       */
				/* so it is not schema */
				schema=0;break;
			}
		}
	}

	if(schema){
		/* Have scheme - absolute path */
		*schema=0;
		url->schema = (char*)DpsStrdup(s);
		url->specific = (char*)DpsStrdup(schema + 1);
		*schema=':';
		/* Both strings are dereferenced below. */
		if (url->schema == NULL || url->specific == NULL) goto nomem;

		if(!strcasecmp(url->schema,"http"))url->default_port=80;
		else
		if(!strcasecmp(url->schema,"https"))url->default_port=443;
		else
		if(!strcasecmp(url->schema,"nntp"))url->default_port=119;
		else
		if(!strcasecmp(url->schema,"news"))url->default_port=119;
		else
		if(!strcasecmp(url->schema,"ftp"))url->default_port=21;

		if(!strncmp(url->specific,"//",2)){
			char	*ss,*hostname;

			/* Have hostinfo */
			if((ss=strchr(url->specific+2,'/'))){
				/* Have hostname with path */
				*ss=0;
				url->hostinfo = (char*)DpsStrdup(url->specific + 2);
				*ss='/';
				url->path = (char*)DpsStrdup(ss);
			}else{
				/* Hostname without path */
				if ((ss = strchr(url->specific + 2, '?'))) {
					/* Have hostname with parameters */
					*ss = 0;
					url->hostinfo = (char*)DpsStrdup(url->specific + 2);
					*ss='?';
					url->path = (char*)DpsStrdup("/");
				}else {
					url->hostinfo = (char*)DpsStrdup(url->specific + 2);
					url->path = (char*)DpsStrdup("/");
				}
			}
			if (url->hostinfo == NULL) goto nomem;

			if((hostname=strchr(url->hostinfo,'@'))){
				/* Username and password is given  */
				/* Store auth string user:password */
				*hostname=0;
				url->auth = (char*)DpsStrdup(url->hostinfo);
				*hostname='@';
				hostname++;
			}else{
				hostname = url->hostinfo;
			}
			/*
			FIXME:
			for(h=hostname;*h;h++){
				if( *h>='A' && *h<='Z')
				*h=(*h)-'A'+'a';
			}
			*/

			if((ss=strchr(hostname,':'))){
				*ss=0;
				url->hostname = (char*)DpsStrdup(hostname);
				*ss=':';
				url->port=atoi(ss+1);
			}else{
				url->hostname = (char*)DpsStrdup(hostname);
				url->port=0;
			}
		}else{
			/* Have not host but have schema                   */
			/* This is possible for:                           */
			/* file:  mailto:  htdb: news:                     */
			/* As far as we do not need mailto: just ignore it */

			if(!strcasecmp(url->schema,"mailto")
			   || !strcasecmp(url->schema,"javascript")
			   || !strcasecmp(url->schema,"feed")
			   ) {
				DPS_FREE(s);
#ifdef WITH_PARANOIA
				DpsViolationExit(-1, paran);
#endif
				return(DPS_URL_BAD);
			} else
			if(!strcasecmp(url->schema,"file"))
				url->path = (char*)DpsStrdup(url->specific);
			else
			if(!strcasecmp(url->schema,"exec"))
				url->path = (char*)DpsStrdup(url->specific);
			else
			if(!strcasecmp(url->schema,"cgi"))
				url->path = (char*)DpsStrdup(url->specific);
			else
			if(!strcasecmp(url->schema,"htdb"))
				url->path = (char*)DpsStrdup(url->specific);
			else
			if(!strcasecmp(url->schema,"news")){
				/* Now we will use localhost as NNTP    */
				/* server as it is not indicated in URL */
				url->hostname = (char*)DpsStrdup("localhost");
				/* Room for the leading '/' and the terminator. */
				url->path = (char*)DpsMalloc(dps_strlen(url->specific) + 2);
				if (url->path == NULL) goto nomem;
				sprintf(url->path,"/%s",url->specific);
				url->default_port=119;
			}else{
				/* Unknown strange schema */
				DPS_FREE(s);
#ifdef WITH_PARANOIA
				DpsViolationExit(-1, paran);
#endif
				return(DPS_URL_BAD);
			}
		}
	}else{
		url->path = (char*)DpsStrdup(s);
	}

	/* Everything below reads and edits url->path in place. */
	if (url->path == NULL) goto nomem;

	/* Cut an anchor if exist */
	if((anchor=strchr(url->path,'#')))*anchor=0;


	/* If path is not full just copy it to filename    */
	/* i.e. neither  /usr/local/ nor  c:/windows/temp/ */
	/* path[1] is only inspected when path[0] is not the terminator. */

	if((url->path[0]!='/') && (url->path[0]!='?') && ((url->path[0]=='\0') || (url->path[1]!=':'))) {
		/* Relative path */
		if(!strncmp(url->path,"./",2))
			url->filename = (char*)DpsStrdup(url->path + 2);
		else
			url->filename = (char*)DpsStrdup(url->path);
		url->path[0] = 0;
	}

	/* truncate path to query_string */
	/* and store query_string        */

	if((query=strrchr(url->path,'?'))){
		url->query_string = (char*)DpsStrdup(query);
		*(query) = 0;
	}

	DpsURLNormalizePath(url->path);

	/* Now find right '/' sign and copy the rest to filename */

	if((file=strrchr(url->path,'/'))&&(strcmp(file,"/"))){
		url->filename = (char*)DpsStrdup(file + 1);
		*(file+1)=0;
	}

	/* Now find right '/' sign and copy the rest to directory */

	if ((file = strrchr(url->path,'/'))) {
		char *p_save = file;
		/* Step back to the previous '/', never moving before the start of path. */
		if (file > url->path) {
			for(file--; (file > url->path) && (*file != '/'); file--);
			file++;
		}
		if (*file) {
			*p_save = '\0';
			url->directory = (char*)DpsStrdup(file);
			*p_save = '/';
		}
	}

	DPS_FREE(s);
	/* From here on s is only a scratch cursor over url's own strings. */
	if (url->hostname != NULL) {
		DpsRTrim(url->hostname, ".");
		url->domain_level = 1;
		for (s = url->hostname; *s; s++) {
			*s = dps_tolower(*s);
			if (*s == '.') url->domain_level++;
			if (strchr(",'\";", (int)*s)) {
#ifdef WITH_PARANOIA
				DpsViolationExit(-1, paran);
#endif
				return DPS_URL_BAD;
			}
		}
	}
	if (url->hostinfo != NULL) {
		DpsRTrim(url->hostinfo, ".");
		/* Lower-case only the host part, leaving any user:password@ prefix as is. */
		s = strchr(url->hostinfo, '@');
		for (s = (s == NULL) ? url->hostinfo : s + 1; *s; s++) *s = dps_tolower(*s);
	}
	if (url->schema != NULL) for (s = url->schema; *s; s++) *s = dps_tolower(*s);

/*	fprintf(stderr, "url: .path: %s port:%d\n", url->path, url->port);*/

#ifdef WITH_PARANOIA
	DpsViolationExit(-1, paran);
#endif
	return DPS_OK;

nomem:
	/* Out of memory while s was still the working copy of str (or NULL). */
	DPS_FREE(s);
#ifdef WITH_PARANOIA
	DpsViolationExit(-1, paran);
#endif
	return DPS_ERROR;
}
예제 #19
0
/*
 * Load a word-frequency dictionary from a text file into List.
 *
 * Agent   - agent whose configuration supplies the logging state, the
 *           "EtcDir" variable and CharsToEscape
 * List    - list receiving the words; it is sorted and de-duplicated
 *           before the function returns
 * charset - name of the character set the file is encoded in
 * fname   - dictionary file; a name not starting with '/' is looked up
 *           under EtcDir
 *
 * Every line that is neither empty nor starts with '#' is parsed as
 * "<frequency> <word>"; lines that do not yield both fields are skipped.
 * Entries that compare equal (cmpchinese) after sorting are merged by
 * summing their frequencies.
 *
 * Returns DPS_OK on success, DPS_ERROR when the charset is unknown or the
 * file cannot be stat'ed, opened, buffered or read completely.
 */
int DpsChineseListLoad(DPS_AGENT *Agent, DPS_CHINALIST *List, const char *charset, const char *fname) {
     struct stat     sb;
     char *str, *data = NULL, *cur_n = NULL;
     DPS_CHINAWORD chinaword;
     char word[PATH_MAX];
     dpsunicode_t uword[256];
     DPS_CHARSET *sys_int, *fcs;
     DPS_CONV to_uni;
     int             fd;
     char            savebyte = '\0';

     sys_int = DpsGetCharSet("sys-int");
     if (!(fcs = DpsGetCharSet(charset))) {
       if (Agent->Conf->is_log_open) DpsLog(Agent, DPS_LOG_ERROR, "Charset '%s' not found or not supported", charset);
       else fprintf(stderr, "Charset '%s' not found or not supported", charset);
       return DPS_ERROR;
     }
     DpsConvInit(&to_uni, fcs, sys_int, Agent->Conf->CharsToEscape, DPS_RECODE_HTML);

     /* Relative names are resolved against EtcDir; 'word' doubles as the path buffer here */
     if (*fname != '/') {
       dps_snprintf(word, sizeof(word), "%s/%s", DpsVarListFindStr(&Agent->Conf->Vars, "EtcDir", DPS_CONF_DIR), fname);
       fname = word;
     }

     if (stat(fname, &sb)) {
       if (Agent->Conf->is_log_open)
	 DpsLog(Agent, DPS_LOG_ERROR, "Unable to stat FreqDic file '%s': %s", fname, strerror(errno));
       else fprintf(stderr, "Unable to stat FreqDic file '%s': %s", fname, strerror(errno));
       return DPS_ERROR;
     }
     /* open() reports failure with a negative value; descriptor 0 is valid */
     if ((fd = open(fname, O_RDONLY)) < 0) {
       if (Agent->Conf->is_log_open)
	 DpsLog(Agent, DPS_LOG_ERROR, "Unable to open FreqDic file '%s': %s", fname, strerror(errno));
       else fprintf(stderr, "Unable to open FreqDic file '%s': %s", fname, strerror(errno));
       return DPS_ERROR;
     }
     if ((data = (char*)DpsMalloc(sb.st_size + 1)) == NULL) {
       if (Agent->Conf->is_log_open)
	 DpsLog(Agent, DPS_LOG_ERROR, "Unable to alloc %d bytes", (int)sb.st_size);
       else fprintf(stderr, "Unable to alloc %ld bytes", (long)sb.st_size);
       close(fd);
       return DPS_ERROR;
     }
     if (read(fd, data, sb.st_size) != (ssize_t)sb.st_size) {
       if (Agent->Conf->is_log_open)
	 DpsLog(Agent, DPS_LOG_ERROR, "Unable to read FreqDic file '%s': %s", fname, strerror(errno));
       else fprintf(stderr, "Unable to read FreqDic file '%s': %s", fname, strerror(errno));
       DPS_FREE(data);
       close(fd);
       return DPS_ERROR;
     }
     close(fd);
     data[sb.st_size] = '\0';

     /*
      * Walk the buffer line by line: the byte following each newline is
      * saved and temporarily replaced by NUL so that 'str' is a C string
      * holding exactly one line, then restored when moving on.
      */
     str = data;
     cur_n = strchr(str, NL_INT);
     if (cur_n != NULL) {
       cur_n++;
       savebyte = *cur_n;
       *cur_n = '\0';
     }

     bzero((void*)&chinaword, sizeof(chinaword));
     chinaword.word = uword;
     while (str != NULL) {
       /* Skip blank lines, comments and lines without both fields, so that
	  a stale 'word' / 'freq' from an earlier line is never added again */
       if (str[0] != '\0' && str[0] != '#' && sscanf(str, "%d %63s ", &chinaword.freq, word) == 2) {
	 DpsConv(&to_uni, (char*)uword, sizeof(uword), word, sizeof(word));
	 DpsChineseListAdd(List, &chinaword);
       }
       str = cur_n;
       if (str != NULL) {
	 *str = savebyte;
	 cur_n = strchr(str, NL_INT);
	 if (cur_n != NULL) {
	   cur_n++;
	   savebyte = *cur_n;
	   *cur_n = '\0';
	 }
       }
     }
     DPS_FREE(data);
     DpsChineseListSort(List);

     /*
      * Merge equal neighbours of the sorted array in place.  Unique entries
      * are moved down to slot j so that no word is lost once a duplicate
      * has been seen; the words of merged duplicates are released.
      * An empty list is left untouched.
      */
     if (List->nwords > 0) {
       size_t i, j = 0;
       for (i = 1; i < List->nwords; i++) {
	 if (cmpchinese(&List->ChiWord[j], &List->ChiWord[i]) == 0) {
	   List->ChiWord[j].freq += List->ChiWord[i].freq;
	   DPS_FREE(List->ChiWord[i].word);
	 } else {
	   j++;
	   if (j != i) {
	     List->ChiWord[j] = List->ChiWord[i];
	     /* ownership moved to slot j; do not leave a second owner behind */
	     List->ChiWord[i].word = NULL;
	   }
	 }
       }
       List->nwords = j + 1;
     }
     return DPS_OK;
}
예제 #20
0
 /*
  * Build the absolute URL for newURL, taking every component that newURL
  * does not provide (schema, hostname, auth, path, filename) from curURL,
  * and store the freshly allocated string in *str.
  *
  * Indexer          - agent used for logging and for the alias configuration
  * curURL           - URL of the document the link was found in
  * newURL           - parsed link; its charset_id is set from curURL when it
  *                    carries no hostinfo
  * str              - receives the allocated result; left untouched when the
  *                    path buffer cannot be allocated, NULL when the result
  *                    itself cannot be allocated
  * ReverseAliasFlag - when non-zero, the "ReverseAliasProg" program (if
  *                    configured) and the ReverseAliases match list are
  *                    applied to the result
  *
  * "mailto", "javascript" and "feed" URLs are rebuilt as schema:specific,
  * "htdb" as schema:path, everything else as schema://[auth@]host/path.
  * For ftp:// URLs containing ";type=" the string is cut at the first
  * ";type".
  */
 void RelLink(DPS_AGENT *Indexer, DPS_URL *curURL, DPS_URL *newURL, char **str, int ReverseAliasFlag) {
	const char	*schema = newURL->schema ? newURL->schema : curURL->schema;
	const char	*hostname = newURL->hostname ? newURL->hostname : curURL->hostname;
	const char	*auth = newURL->auth ? newURL->auth : curURL->auth;
	const char	*path = (newURL->path && newURL->path[0]) ? newURL->path : curURL->path;
	const char	*fname = ((newURL->filename && newURL->filename[0]) || (newURL->path && newURL->path[0])) ?
	  newURL->filename : curURL->filename;
	const char     *query_string = newURL->query_string;
	char		*pathfile = (char*)DpsMalloc(dps_strlen(DPS_NULL2EMPTY(path)) + dps_strlen(DPS_NULL2EMPTY(fname)) +
						     dps_strlen(DPS_NULL2EMPTY(query_string)) + 5);
	int             cascade;
	DPS_MATCH	*Alias;
	char		*alias = NULL;
	size_t		aliassize, nparts = 10;
	DPS_MATCH_PART	Parts[10];

	if (newURL->hostinfo == NULL) newURL->charset_id = curURL->charset_id;

	if (pathfile == NULL) return;

	/* pathfile = "/" path filename query_string, then normalized */
	pathfile[0] = '/';
	dps_strcpy(pathfile + 1, DPS_NULL2EMPTY(path));
	dps_strcat(pathfile, DPS_NULL2EMPTY(fname));
	dps_strcat(pathfile, DPS_NULL2EMPTY(query_string));

	DpsURLNormalizePath(pathfile);

	if (!strcasecmp(DPS_NULL2EMPTY(schema), "mailto")
	    || !strcasecmp(DPS_NULL2EMPTY(schema), "javascript")
	    || !strcasecmp(DPS_NULL2EMPTY(schema), "feed")
	    ) {
		/* schema:specific */
		*str = (char*)DpsMalloc(dps_strlen(DPS_NULL2EMPTY(schema)) + dps_strlen(DPS_NULL2EMPTY(newURL->specific)) + 4);
		if (*str == NULL) {
		  DPS_FREE(pathfile);	/* was leaked on this early return */
		  return;
		}
		dps_strcpy(*str, DPS_NULL2EMPTY(schema)); dps_strcat(*str, ":"); dps_strcat(*str, DPS_NULL2EMPTY(newURL->specific));
	} else if (!strcasecmp(DPS_NULL2EMPTY(schema), "htdb")) {
		/* schema:pathfile */
		*str = (char*)DpsMalloc(dps_strlen(DPS_NULL2EMPTY(schema)) + dps_strlen(pathfile) + 4);
		if (*str == NULL) {
		  DPS_FREE(pathfile);
		  return;
		}
		dps_strcpy(*str, DPS_NULL2EMPTY(schema)); dps_strcat(*str, ":"); dps_strcat(*str, pathfile);
	} else {
		/* schema://[auth@]hostname pathfile */
		*str = (char*)DpsMalloc(dps_strlen(DPS_NULL2EMPTY(schema)) + dps_strlen(pathfile) + dps_strlen(DPS_NULL2EMPTY(hostname)) + dps_strlen(DPS_NULL2EMPTY(auth)) + 8);
		if (*str == NULL) {
		  DPS_FREE(pathfile);
		  return;
		}
		dps_strcpy(*str, DPS_NULL2EMPTY(schema)); dps_strcat(*str, "://");
		if (auth) {
		  dps_strcat(*str, auth); dps_strcat(*str, "@");
		}
		dps_strcat(*str, DPS_NULL2EMPTY(hostname)); dps_strcat(*str, pathfile);
	}

	if(!strncmp(*str, "ftp://", 6) && (strstr(*str, ";type=")))
		*(strstr(*str, ";type")) = '\0';
	DPS_FREE(pathfile);

	if (ReverseAliasFlag) {
	  const char *alias_prog = DpsVarListFindStr(&Indexer->Vars, "ReverseAliasProg", NULL);

	  if (alias_prog) {
	    int		result;
	    aliassize = 256 + 2 * dps_strlen(*str);
	    alias = (char*)DpsRealloc(alias, aliassize);
	    if (alias == NULL) {
	      DpsLog(Indexer, DPS_LOG_ERROR, "No memory (%d bytes). %s line %d", (int)aliassize, __FILE__, __LINE__);
	      goto ret;
	    }
	    alias[0] = '\0';
	    result = DpsAliasProg(Indexer, alias_prog, *str, alias, aliassize - 1);
	    DpsLog(Indexer, DPS_LOG_EXTRA, "ReverseAliasProg result: '%s'", alias);
	    if(result != DPS_OK) goto ret;
	    DPS_FREE(*str);
	    *str = (char*)DpsStrdup(alias);
	    /* never hand a NULL string to the match list below */
	    if (*str == NULL) goto ret;
	  }

	  /* Apply reverse aliases repeatedly; the cascade is capped at 1024 rounds */
	  for(cascade = 0; ((Alias=DpsMatchListFind(&Indexer->Conf->ReverseAliases,*str,nparts,Parts))) && (cascade < 1024); cascade++) {
		aliassize = dps_strlen(Alias->arg) + dps_strlen(Alias->pattern) + dps_strlen(*str) + 128;
		alias = (char*)DpsRealloc(alias, aliassize);
		if (alias == NULL) {
		  DpsLog(Indexer, DPS_LOG_ERROR, "No memory (%d bytes). %s line %d", (int)aliassize, __FILE__, __LINE__);
		  goto ret;
		}
		DpsMatchApply(alias,aliassize,*str,Alias->arg,Alias,nparts,Parts);
		if(alias[0]){
		  DpsLog(Indexer,DPS_LOG_DEBUG,"ReverseAlias%d: pattern:%s, arg:%s -> '%s'", cascade, Alias->pattern, Alias->arg, alias);
		  DPS_FREE(*str);
		  *str = (char*)DpsStrdup(alias);
		  if (*str == NULL) break;
		} else break;
		if (Alias->last) break;
	  }
	}

ret:
	DPS_FREE(alias);

}
예제 #21
0
/*
 * Split 'line' into tokens and return a newly allocated string in which the
 * tokens are separated by single spaces.
 *
 * List - frequency dictionary passed on to DpsSegmentProcess
 * line - 0-terminated input string; it is not modified (a private copy is
 *        tokenized)
 *
 * Tokens whose first character has a DpsUniCType() class of 1, 2 or above
 * DPS_UNI_BUKVA are copied as they are; every other token is replaced by
 * the result of DpsSegmentProcess().
 *
 * Returns the allocated result (caller frees), or NULL when an allocation
 * or the segmentation of a token fails.
 */
dpsunicode_t *DpsSegmentByFreq(DPS_CHINALIST *List, dpsunicode_t *line) {
  dpsunicode_t *out, *mid, *last, *sentence, *segmented_sentence, *piece, part;
  size_t i, j, l, a, line_len;
  int ctype, have_bukva_forte, fb_type;
  dpsunicode_t space[] = { 32, 0 };

  line_len = DpsUniLen(line);	/* measured once instead of on every loop pass */
  l = 2 * (line_len + 1);
  if (l < 2) return NULL;
  out = (dpsunicode_t*)DpsMalloc(l * sizeof(dpsunicode_t));
  if (out == NULL) return NULL;
  *out = '\0';
  mid = (dpsunicode_t*)DpsXmalloc(l * sizeof(dpsunicode_t));
  if (mid == NULL) { DPS_FREE(out); return NULL; }
  *mid = '\0';

  /* Working copy of the input.  No terminator is stored explicitly:
     presumably DpsXmalloc returns zero-filled memory -- TODO confirm. */
  for (i = 0; i < line_len; i++) {
    mid[i] = line[i];
  }

  for (sentence = DpsUniGetSepToken(mid, &last, &ctype, &have_bukva_forte, 0);
       sentence;
       sentence = DpsUniGetSepToken(NULL, &last, &ctype, &have_bukva_forte, 0)) {
    /* Terminate the token in place; the overwritten unit is restored below */
    part = *last;
    *last = 0;
    fb_type = DpsUniCType(*sentence);

    if (fb_type > DPS_UNI_BUKVA || fb_type == 2 || fb_type == 1) {
      segmented_sentence = NULL;
      piece = sentence;
    } else {
      segmented_sentence = DpsSegmentProcess(List, sentence);
      if (segmented_sentence == NULL) {
	DPS_FREE(out);	/* the partial result was leaked here before */
	DPS_FREE(mid);
	return NULL;
      }
      piece = segmented_sentence;
    }

    /* Grow the output when the piece, a separator and the terminator may not fit */
    a = 2 * (DpsUniLen(piece) + 1);
    j = DpsUniLen(out);
    if (j + a >= l) {
      l = j + a + 1;
      out = (dpsunicode_t*)DpsRealloc(out, l * sizeof(dpsunicode_t));
      if (out == NULL) {
	DPS_FREE(segmented_sentence);
	DPS_FREE(mid);
	return NULL;
      }
    }
    if (*out) DpsUniStrCat(out, space);
    DpsUniStrCat(out, piece);
    DPS_FREE(segmented_sentence);

    *last = part;
  }

  DPS_FREE(mid);

  return out;
}
예제 #22
0
/*
 * Ask searchd for the clones of Doc and append them to Res.
 *
 * Indexer - agent used for logging, tracing and the error string
 * Doc     - document whose "DP_ID" section ("0" when absent) is sent
 * Res     - result set; one DPS_DOCUMENT is appended to Res->Doc for every
 *           line of the DOCINFO reply unless the reply is "nocloneinfo"
 * db      - database whose 'searchd' connection is used
 *
 * Returns DPS_OK on success and DPS_ERROR on an incomplete header, an error
 * or unknown reply from searchd, or an allocation failure.
 */
int DpsCloneListSearchd(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_RESULT *Res, DPS_DB *db) {
	DPS_SEARCHD_PACKET_HEADER hdr;
	ssize_t	nsent,nrecv;
	char *msg = NULL, *dinfo = NULL;
	char *tok, *lt;
	char buf[128];
	int done = 0;
	int	rc = DPS_OK;

	TRACE_IN(Indexer, "DpsCloneListSearchd");

	/* Request: command CLONES with the document id as payload */
	dps_snprintf(buf, sizeof(buf), "%s", DpsVarListFindStr(&Doc->Sections, "DP_ID", "0"));
	hdr.cmd = DPS_SEARCHD_CMD_CLONES;
	hdr.len = dps_strlen(buf);
	nsent = DpsSearchdSendPacket(db->searchd, &hdr, buf);

	while(!done){
		nrecv = DpsRecvall(db->searchd, &hdr, sizeof(hdr), 360);

		if(nrecv != sizeof(hdr)){
			DpsLog(Indexer, DPS_LOG_ERROR, "Received incomplete header from searchd (%d bytes)", (int)nrecv);
			TRACE_OUT(Indexer);
			return(DPS_ERROR);
		}else{
#ifdef DEBUG_SDP
			DpsLog(Indexer, DPS_LOG_DEBUG, "Received header cmd=%d len=%d\n", hdr.cmd, hdr.len);
#endif
		}
		switch(hdr.cmd){
			case DPS_SEARCHD_CMD_ERROR:
				msg = (char*)DpsMalloc(hdr.len + 1);
				if (msg == NULL) {
				  rc = DPS_ERROR;	/* do not report an aborted exchange as success */
				  done = 1;
				  break;
				}
				nrecv = DpsRecvall(db->searchd, msg, hdr.len, 360);
				msg[(nrecv >= 0) ? nrecv : 0] = '\0';
				/* NOTE(review): unbounded sprintf of remote text into errstr;
				   bound it once the size of errstr is confirmed */
				sprintf(Indexer->Conf->errstr, "Searchd error: '%s'", msg);
				rc = DPS_ERROR;
				DPS_FREE(msg);
				done = 1;
				break;
			case DPS_SEARCHD_CMD_DOCINFO:
				dinfo = (char*)DpsMalloc(hdr.len + 1);
				if (dinfo == NULL) {
				  rc = DPS_ERROR;
				  done = 1;
				  break;
				}
				nrecv = DpsRecvall(db->searchd, dinfo, hdr.len, 360);
				dinfo[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
				DpsLog(Indexer, DPS_LOG_DEBUG, "Received DOCINFO size=%d buf=%s\n", hdr.len, dinfo);
#endif
				if (strcasecmp(dinfo, "nocloneinfo") != 0) {

				  /* One clone description per CR/LF separated line */
				  tok = dps_strtok_r(dinfo, "\r\n", &lt, NULL);

				  while(tok){
					DPS_DOCUMENT *D;
					size_t nd = Res->num_rows;

					/* room for the new document plus one spare slot, as before */
					Res->Doc = (DPS_DOCUMENT*)DpsRealloc(Res->Doc, (nd + 2) * sizeof(DPS_DOCUMENT));
					if (Res->Doc == NULL) {
					  sprintf(Indexer->Conf->errstr, "Realloc error");
					  /* keep the row count consistent with the lost array */
					  Res->num_rows = 0;
					  rc = DPS_ERROR;
					  break;
					}
					Res->num_rows = nd + 1;
					D = &Res->Doc[nd];
					DpsDocInit(D);
					DpsDocFromTextBuf(D, tok);
					tok = dps_strtok_r(NULL, "\r\n", &lt, NULL);
				  }
				}
				DPS_FREE(dinfo);
				done = 1;
				break;
			default:
				sprintf(Indexer->Conf->errstr, "Unknown searchd response: cmd=%d len=%d", hdr.cmd, hdr.len);
				rc = DPS_ERROR;
				done = 1;
				break;
		}
	}
	TRACE_OUT(Indexer);
	return rc;
}
예제 #23
0
/*
 * Create the limit index files for every configured "Limit-<name>" variable.
 *
 * Indexer - agent providing the configuration, logging and process title
 * db      - database the limit values are read from
 *
 * Only the variables stored in the bucket Root['l'] are scanned.  The value
 * of each "Limit-<name>" variable selects the index kind: the built-in kinds
 * (category, tag, link, time, hostname, language, content, siteid) use fixed
 * fields and file names; any other value is treated as a user-defined limit
 * driven by the "Req-<name>" request and the optional "dbaddr-<name>"
 * variable.
 *
 * Returns DPS_OK, or DPS_ERROR when the scratch buffer for a user-defined
 * limit cannot be allocated.
 */
__C_LINK int __DPSCALL DpsCacheMakeIndexes(DPS_AGENT *Indexer, DPS_DB *db) {
  DPS_UINT8URLIDLIST  L8;
  DPS_UINT4URLIDLIST  L4;
  DPS_VARLIST *v = &Indexer->Conf->Vars;
  size_t i, r;
  char *ind, *nm, *lfname;

  bzero(&L4, sizeof(DPS_UINT4URLIDLIST));
  bzero(&L8, sizeof(DPS_UINT8URLIDLIST));

  r = (size_t) 'l';
  for (i = 0; i < v->Root[r].nvars; i++) {
    if (strncasecmp("Limit-", v->Root[r].Var[i].name, 6) != 0) continue;

    ind = v->Root[r].Var[i].val;	/* index kind */
    lfname = v->Root[r].Var[i].name;	/* full "Limit-<name>" */
    nm = lfname + 6;			/* bare <name> */

    /* The process title is updated so the current step shows up in "ps" output */
    if (!strcasecmp(ind, "category")) {

      dps_setproctitle("[%d] Category index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating category index");
      if (DPS_OK == DpsLimit8(Indexer, &L8, "Category",  DPS_IFIELD_TYPE_HEX8STR, db)) {
	MakeNestedIndex(Indexer, &L8, DPS_LIMFNAME_CAT, db);
      }

    } else if (!strcasecmp(ind, "tag")) {

      dps_setproctitle("[%d] Tag index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating tag index");
      if (DPS_OK == DpsLimit4(Indexer, &L4, "Tag",  DPS_IFIELD_TYPE_STRCRC32, db)) {
	MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_TAG, db);
      }

    } else if (!strcasecmp(ind, "link")) {

      dps_setproctitle("[%d] Link index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating link index");
      if (DPS_OK == DpsLimit4(Indexer, &L4, "link",  DPS_IFIELD_TYPE_INT, db)) {
	MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_LINK, db);
      }

    } else if (!strcasecmp(ind, "time")) {

      dps_setproctitle("[%d] Time index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating time index");
      if (DPS_OK == DpsLimit4(Indexer, &L4, "last_mod_time",  DPS_IFIELD_TYPE_HOUR, db)) {
	MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_TIME, db);
      }

    } else if (!strcasecmp(ind, "hostname")) {

      dps_setproctitle("[%d] Hostname index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating hostname index");
      if (DPS_OK == DpsLimit4(Indexer, &L4, "url",  DPS_IFIELD_TYPE_HOSTNAME, db)) {
	MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_HOST, db);
      }

    } else if (!strcasecmp(ind, "language")) {

      dps_setproctitle("[%d] Language index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating language index");
      if (DPS_OK == DpsLimit4(Indexer, &L4, "Content-Language",  DPS_IFIELD_TYPE_STR2CRC32, db)) {
	MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_LANG, db);
      }

    } else if (!strcasecmp(ind, "content")) {

      dps_setproctitle("[%d] Content-Type index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating Content-Type index");
      if (DPS_OK == DpsLimit4(Indexer, &L4, "Content-Type",  DPS_IFIELD_TYPE_STRCRC32, db)) {
	MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_CTYPE, db);
      }

    } else if (!strcasecmp(ind, "siteid")) {

      dps_setproctitle("[%d] Site_id index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating Site_id index");
      if (DPS_OK == DpsLimit4(Indexer, &L4, "site_id",  DPS_IFIELD_TYPE_INT, db)) {
	MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_SITE, db);
      }

    } else {
      /* User-defined limit: the values come from the "Req-<name>" request */
      char *buf, *req, *dbaddr;
      DPS_DB ldb, *pdb = &ldb;
      size_t buf_len = dps_strlen(nm) + 16;
      if ((buf = (char*) DpsMalloc(buf_len * sizeof(char))) == NULL) {
	DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d chars at %s:%d", (int)buf_len, __FILE__, __LINE__);
	return DPS_ERROR;
      }
      dps_setproctitle("[%d] %s index creation", Indexer->handle, nm);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating %s index", nm);
      dps_snprintf(buf, buf_len, "Req-%s", nm);
      req = DpsVarListFindStr(&Indexer->Conf->Vars, buf, NULL);
      if (req != NULL) {
	dps_snprintf(buf, buf_len, "dbaddr-%s", nm);
	dbaddr = DpsVarListFindStr(&Indexer->Conf->Vars, buf, NULL);
	if (dbaddr != NULL) {
	  /* NOTE(review): ldb is passed uninitialised and is never released
	     here -- confirm that DpsDBSetAddr fully sets it up */
	  DpsDBSetAddr(pdb, dbaddr, DPS_OPEN_MODE_READ);
	} else {
	  pdb = db;
	}
	/* NOTE(review): "nex8str" looks like it may be meant as "hex8str";
	   left unchanged because the accepted spelling cannot be verified here */
	if (!strcasecmp(ind, "nex8str")) {
	  if (DPS_OK == DpsSQLLimit8(Indexer, &L8, req, DPS_IFIELD_TYPE_HEX8STR, pdb)) {
	    MakeNestedIndex(Indexer, &L8, lfname, pdb);
	  }
	} else {
	  int field_type = DPS_IFIELD_TYPE_INT;
	  if (!strcasecmp(ind, "strcrc32")) field_type = DPS_IFIELD_TYPE_STRCRC32;
	  else if (!strcasecmp(ind, "hour")) field_type = DPS_IFIELD_TYPE_HOUR;
	  else if (!strcasecmp(ind, "hostname")) field_type = DPS_IFIELD_TYPE_HOSTNAME;
	  else if (!strcasecmp(ind, "char2")) field_type = DPS_IFIELD_TYPE_STR2CRC32;
	  else if (!strcasecmp(ind, "int")) field_type = DPS_IFIELD_TYPE_INT;
	  /* NOTE(review): this branch uses db while the 8-byte branch uses pdb,
	     so "dbaddr-<name>" is ignored here -- confirm whether intended */
	  if (DPS_OK == DpsSQLLimit4(Indexer, &L4, req,  field_type, db)) {
	    MakeLinearIndex(Indexer, &L4, lfname, db);
	  }
	}
      }
      /* the scratch buffer was leaked once per user-defined limit before */
      DPS_FREE(buf);

    }
    dps_setproctitle("[%d] Indexes done.", Indexer->handle);
    DpsLog(Indexer, DPS_LOG_EXTRA, "Done");
  }
  return DPS_OK;
}
예제 #24
0
/*
 * Perform a URL action through searchd.  Only DPS_URL_ACTION_DOCCOUNT is
 * supported: the document count reported by searchd is added to A->doccount.
 *
 * A   - agent used for logging, tracing, the error string and the counter
 * D   - not used by this function
 * cmd - requested action; anything but DPS_URL_ACTION_DOCCOUNT is rejected
 * db  - DPS_DB whose 'searchd' connection is used
 *
 * Returns DPS_ERROR for an unsupported action, an incomplete header, an
 * unknown or too short reply, or an allocation failure; DPS_OK otherwise.
 */
int __DPSCALL DpsSearchdURLAction(DPS_AGENT *A, DPS_DOCUMENT *D, int cmd, void *db) {
	DPS_DB		*searchd = db;

	DPS_SEARCHD_PACKET_HEADER hdr;
	char *buf;
	ssize_t nsent, nrecv;
	int done = 0;
	char *msg = NULL;
	char *dinfo = NULL;
	int	rc=DPS_OK;

	TRACE_IN(A, "DpsSearchdURLAction");

	if (cmd != DPS_URL_ACTION_DOCCOUNT) {
	  DpsLog(A, DPS_LOG_ERROR, "searchd: unsupported URL action");
	  TRACE_OUT(A);
	  return DPS_ERROR;
	}

	/* Request: command URLACTION with the action code as an int payload */
	hdr.cmd = DPS_SEARCHD_CMD_URLACTION;
	hdr.len = sizeof(int);

	if ((buf = (char*)DpsMalloc(hdr.len + 1)) == NULL) {
	  DpsLog(A, DPS_LOG_ERROR, "Out of memory");
	  TRACE_OUT(A);
	  return DPS_ERROR;
	}

	*((int*)buf) = cmd;

	nsent = DpsSearchdSendPacket(searchd->searchd, &hdr, buf);

	DPS_FREE(buf);

	while(!done) {

		nrecv = DpsRecvall(searchd->searchd, &hdr, sizeof(hdr), 360);

		if(nrecv != sizeof(hdr)){
			DpsLog(A, DPS_LOG_ERROR, "Received incomplete header from searchd (%d bytes)", (int)nrecv);
			TRACE_OUT(A);
			return(DPS_ERROR);
		}else{
#ifdef DEBUG_SDP
			DpsLog(A, DPS_LOG_ERROR, "Received header cmd=%d len=%d\n", hdr.cmd, hdr.len);
#endif
		}
		switch(hdr.cmd){
			case DPS_SEARCHD_CMD_ERROR:
				msg = (char*)DpsMalloc(hdr.len + 1);
				if (msg == NULL) {
				  rc = DPS_ERROR;	/* do not report an aborted exchange as success */
				  done=1;
				  break;
				}
				nrecv = DpsRecvall(searchd->searchd, msg, hdr.len, 360);
				msg[(nrecv >= 0) ? nrecv : 0] = '\0';
				sprintf(A->Conf->errstr, "Searchd error: '%s'", msg);
				/* NOTE(review): an error reply still yields DPS_OK here, unlike the
				   other searchd client functions -- kept, confirm it is intended */
				rc=DPS_OK;
				DPS_FREE(msg);
				done=1;
				break;
			case DPS_SEARCHD_CMD_MESSAGE:
				/* Informational text: read and dropped, then keep waiting */
				msg=(char*)DpsMalloc(hdr.len+1);
				if (msg == NULL) {
				  rc = DPS_ERROR;
				  done=1;
				  break;
				}
				nrecv = DpsRecvall(searchd->searchd, msg, hdr.len, 360);
				msg[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
				DpsLog(A, DPS_LOG_ERROR, "Message from searchd: '%s'\n",msg);
#endif
				DPS_FREE(msg);
				break;
			case DPS_SEARCHD_CMD_DOCCOUNT:
				dinfo=(char*)DpsMalloc(hdr.len+1);
				if (dinfo == NULL) {
				  rc = DPS_ERROR;
				  done=1;
				  break;
				}
				nrecv = DpsRecvall(searchd->searchd, dinfo, hdr.len, 360);
				dinfo[(nrecv >= 0) ? nrecv : 0] = '\0';

				/* Only decode the counter when a whole int actually arrived;
				   reading it unconditionally could run past the buffer */
				if (nrecv >= (ssize_t)sizeof(int)) {
				  int count;
				  dps_memcpy(&count, dinfo, sizeof(count));
				  A->doccount += count;
				} else {
				  sprintf(A->Conf->errstr, "Searchd DOCCOUNT reply too short (%d bytes)", (int)nrecv);
				  rc = DPS_ERROR;
				}
#ifdef DEBUG_SDP
				DpsLog(A, DPS_LOG_DEBUG, "Received DOCCOUNT size=%d doccount=%d(+%s)\n", hdr.len, A->doccount, dinfo);
#endif
				DPS_FREE(dinfo);
				done=1;
				break;
			default:
				sprintf(A->Conf->errstr, "Unknown searchd response: cmd=%d len=%d", hdr.cmd, hdr.len);
				rc=DPS_ERROR;
				done = 1;
				break;
		}
	}
	TRACE_OUT(A);
	return rc;
}
예제 #25
0
/* Release the first n candidate strings stored in result, then the array itself. */
static void DpsSegmentFreePaths(dpsunicode_t **result, int n) {
  int k;
  if (result == NULL) return;
  for (k = 0; k < n; k++) {
    DPS_FREE(result[k]);
  }
  DPS_FREE(result);
}

/*
 * Segment 'line' into dictionary words and return a newly allocated string
 * in which every word is preceded by a space.
 *
 * List - frequency dictionary; List->hash[c] holds the longest word length
 *        tried for first character c (a zero entry is set to 1 on use) and
 *        unknown single characters are added to the list with frequency 1
 * line - 0-terminated text to segment
 *
 * The function keeps a linked list of candidate paths in four parallel
 * arrays: position[] (how far into 'line' the path reaches), value[] (its
 * score), result[] (its text so far) and next[] (list link, -1 ends the
 * list); 'top' is the list head.  The first path that has not yet reached
 * the end of the line is removed and extended by every dictionary word
 * starting at its position, until the head path is complete and alone.
 *
 * When List->hash is NULL, or when no candidate path survives, a plain copy
 * of 'line' is returned.  Returns NULL on allocation failure.
 */
static dpsunicode_t *DpsSegmentProcess(DPS_CHINALIST *List, dpsunicode_t *line) {
  int top, nextid, *position, *next, len, maxid, i, current, father, needinsert, iindex;
  unsigned int h;
  double *value, p;
  dpsunicode_t **result;
  dpsunicode_t *otv, space[] = {32, 0};
  DPS_CHINAWORD *chinaword, chiw;

  if (List->hash != NULL) {

    len = DpsUniLen(line);
    maxid = 2 * len + 1;
    position = (int*)DpsMalloc(maxid * sizeof(int));
    if (position == NULL) return NULL;
    next = (int*)DpsMalloc(maxid * sizeof(int));
    if (next == NULL) {
      DPS_FREE(position);
      return NULL;
    }
    value = (double*)DpsMalloc(maxid * sizeof(double));
    if (value == NULL) {
      DPS_FREE(position); DPS_FREE(next);
      return NULL;
    }
    result = (dpsunicode_t **)DpsMalloc(maxid * sizeof(dpsunicode_t *));
    if (result == NULL) {
      DPS_FREE(position); DPS_FREE(next); DPS_FREE(value);
      return NULL;
    }

    /* Seed the list with one empty path at position 0 */
    top = 0;
    value[0] = 1.0 * List->total * len;
    position[0] = 0;
    next[0] = -1;
    result[0] = (dpsunicode_t*)DpsUniDup(&space[1]);
    if (result[0] == NULL) {
      DPS_FREE(position); DPS_FREE(next); DPS_FREE(value); DPS_FREE(result);
      return NULL;
    }
    nextid = 1;

    while ((top != -1) && (!((position[top] >= len) && (next[top] == -1)))) {

      /* find the first open path */
      current = top;
      father = top;
      while ((current != -1) && (position[current] >= len)) {
	father = current;
	current = next[current];
      }
      /* remove this path; nothing to unlink when every path is complete
	 (the old code read next[-1] in that case) */
      if (current == top) {
	top = next[top];
      } else if (current != -1) {
	next[father] = next[current];
      }

      if (current == -1) {
	/* no open path, finished, take the first path */
	next[top] = -1;
      } else {
	otv = &line[position[current]];
	h = (unsigned int)(otv[0] & 0xffff);

	/* if the first character doesn't have a word phrase in the dict. */
	if (List->hash[h] == 0) {
	  List->hash[h] = 1;
	}

	/* longest candidate length, clipped to what is left of the line */
	i = List->hash[h];
	if (i + position[current] > len) {
	  i = len - position[current];
	}
	otv = NULL;
	for (; i > 0; i--) {
	  DPS_FREE(otv);
	  otv = DpsUniNDup(&line[position[current]], (size_t)i);
	  chinaword = DpsChineseListFind(List, otv);

	  /* an unknown single character becomes a word of frequency 1 */
	  if (i == 1 && chinaword == NULL) {
	    DPS_FREE(otv);
	    otv = DpsUniNDup(&line[position[current]], 1);
	    chiw.word = otv;
	    chiw.freq = 1;
	    DpsChineseListAdd(List, chinaword = &chiw);
	  }

	  if ((chinaword != NULL) && chinaword->freq) {
	    /* score and text of the path extended by this word */
	    p = (double)chinaword->freq / List->total;
	    value[nextid] = value[current] / (-1.0 * log(p) / log(10.0));
	    position[nextid] = position[current] + i;
	    h = DpsUniLen(result[current]) + DpsUniLen(otv) + 2;
	    result[nextid] = (dpsunicode_t*)DpsXmalloc((size_t)h * sizeof(dpsunicode_t));
	    if (result[nextid] == NULL) {
	      /* release the candidate strings and the current word as well */
	      DPS_FREE(otv);
	      DPS_FREE(position); DPS_FREE(next); DPS_FREE(value);
	      DpsSegmentFreePaths(result, nextid);
	      return NULL;
	    }
	    DpsUniStrCpy(result[nextid], result[current]);
	    DpsUniStrCat(result[nextid], space);
	    DpsUniStrCat(result[nextid], otv);
	    /*
	     * check to see whether there is a duplicated path;
	     * if there is a duplicate path, remove the small value path
	     */
	    needinsert = 1;
	    iindex = top;
	    father = top;
	    while (iindex != -1) {
	      if (position[iindex] == position[nextid]) {
		if (0.85 * value[iindex] >= value[nextid]) {
		  needinsert = 0;
		} else {
		  if (top == iindex) {
		    next[nextid] = next[iindex];
		    top = nextid;
		    needinsert = 0;
		  }
		}
		iindex = -1;
	      } else {
		father = iindex;
		iindex = next[iindex];
	      }
	    }
	    /* insert the new path into the list */
	    /* NOTE(review): iindex is always -1 at this point, so the ordering
	       loop below never iterates and the new path is linked after
	       'father' with next = -1 (or becomes the head when the list is
	       empty).  Kept exactly as it was. */
	    if (needinsert == 1) {
	      while ((iindex != -1) && (value[iindex] > value[nextid])) {
		father = iindex;
		iindex = next[iindex];
	      }
	      if (top == iindex) {
		next[nextid] = top;
		top = nextid;
	      } else {
		next[father] = nextid;
		next[nextid] = iindex;
	      }
	    }
	    nextid++;
	    if (nextid >= maxid) {
	      maxid +=128;
	      position = (int*)DpsRealloc(position, maxid * sizeof(int));
	      next = (int*)DpsRealloc(next, maxid * sizeof(int));
	      value = (double*)DpsRealloc(value, maxid * sizeof(double));
	      result = (dpsunicode_t **)DpsRealloc(result, maxid * sizeof(dpsunicode_t *));
	      if (position == NULL || next == NULL || value == NULL || result == NULL) {
		/* on failure nothing is returned, so every candidate goes */
		DPS_FREE(otv);
		DPS_FREE(position); DPS_FREE(next); DPS_FREE(value);
		DpsSegmentFreePaths(result, nextid);
		return NULL;
	      }
	    }
	  }

	}

	DPS_FREE(otv);
      }
    }

    DPS_FREE(position); DPS_FREE(next); DPS_FREE(value);
    if (top < 0) {
      /* no candidate survived (the old code read result[-1] here):
	 fall back to the unsegmented text */
      DpsSegmentFreePaths(result, nextid);
      return (dpsunicode_t*)DpsUniDup(line);
    }
    /* keep the winning path, release all the others */
    for (i = 0; i < nextid; i++) {
      if (i != top) {
	DPS_FREE(result[i]);
      }
    }
    otv = result[top];
    DPS_FREE(result);
    return otv;

  } else {
    return (dpsunicode_t*)DpsUniDup(line);
  }
}
예제 #26
0
/*
 * Read the reply to a word query from searchd and store it in Res.
 *
 * query - agent used for tracing, the error string and the "q" variable
 * Res   - result set to fill: total_found, grand_total, num_rows, Suggest,
 *         PerSite, WWList and CoordList (Coords, Data and, with
 *         WITH_REL_TRACK, Track)
 * cl    - database whose 'searchd' connection is read
 *
 * Packets are processed until a WORDS packet, an error packet or an unknown
 * packet arrives.  The buffers received for WORDS / DATA / TRACKDATA are
 * handed over to Res->CoordList.
 *
 * Returns DPS_OK on success and DPS_ERROR on an incomplete header, an error
 * or unknown packet, or an allocation failure.
 */
int DpsSearchdGetWordResponse(DPS_AGENT *query,DPS_RESULT *Res,DPS_DB *cl) {
	DPS_URL_CRD_DB *wrd = NULL;
	DPS_URLDATA *udt = NULL;
#ifdef WITH_REL_TRACK
	DPS_URLTRACK *trk = NULL;
#endif
	DPS_SEARCHD_PACKET_HEADER hdr;
	ssize_t	nrecv;
	char	*msg;
	int	done=0, rc = DPS_OK;
	char *wbuf, *p;
	DPS_WIDEWORDLIST_EX *wwl;
	DPS_WIDEWORD ww;
	size_t i;

	TRACE_IN(query, "DpsSearchdGetWordResponse");

	Res->total_found=0;

	while(!done){
	  nrecv = DpsRecvall(cl->searchd, &hdr, sizeof(hdr), 360);
	  if(nrecv!=sizeof(hdr)){
	    sprintf(query->Conf->errstr,"Received incomplete header from searchd (%d bytes,errno:%d)",(int)nrecv, errno);
	    /* the buffers received so far are not handed over on this path */
	    DPS_FREE(wrd);
	    DPS_FREE(udt);
#ifdef WITH_REL_TRACK
	    DPS_FREE(trk);
#endif
	    TRACE_OUT(query);
	    return DPS_ERROR;
	  }
#ifdef DEBUG_SDP
	  DpsLog(query, DPS_LOG_ERROR, "Received header cmd=%d len=%d\n",hdr.cmd,hdr.len);
#endif
		switch(hdr.cmd){
			case DPS_SEARCHD_CMD_ERROR:
				msg=(char*)DpsMalloc(hdr.len+1);
				if (msg == NULL) {
				  rc = DPS_ERROR;	/* do not report an aborted exchange as success */
				  done = 1;
				  break;
				}
				nrecv = DpsRecvall(cl->searchd, msg, hdr.len, 360);
				if (nrecv >= 0) {
				    msg[nrecv]='\0';
				    sprintf(query->Conf->errstr,"Searchd error: '%s',received:%d", msg, (int)nrecv);
				}
				rc = DPS_ERROR;
				DPS_FREE(msg);
				done=1;
				break;
			case DPS_SEARCHD_CMD_MESSAGE:
				msg=(char*)DpsMalloc(hdr.len+1);
				if (msg == NULL) {
				  rc = DPS_ERROR;
				  done = 1;
				  break;
				}
				nrecv = DpsRecvall(cl->searchd, msg, hdr.len, 360);
				msg[(nrecv >= 0) ? nrecv : 0] = '\0';
				/* "Total_found<sep><total> <grand total>": msg + 12 is only
				   inside the buffer when something follows the keyword, and the
				   second number is optional as far as parsing is concerned */
				if (strncmp(msg, "Total_found", 11) == 0 && msg[11] != '\0') {
				  char *grand = strchr(msg + 12, (int)' ');
				  Res->total_found = (size_t)DPS_ATOI(msg + 12);
				  if (grand != NULL) {
				    Res->grand_total = (size_t)DPS_ATOI(grand + 1);
				  }
				}
#ifdef DEBUG_SDP
				DpsLog(query, DPS_LOG_ERROR, "Message from searchd: '%s'\n",msg);
#endif
				DPS_FREE(msg);
				break;
			case DPS_SEARCHD_CMD_WORDS:
				/* The WORDS packet ends the reply */
				DPS_FREE(wrd);
				wrd=(DPS_URL_CRD_DB*)DpsMalloc(hdr.len + 1);
				if (wrd == NULL) {
				  rc = DPS_ERROR;
				  done = 1;
				  break;
				}
				nrecv = DpsRecvall(cl->searchd, wrd, hdr.len, 360);
				Res->num_rows = (nrecv >= 0) ? (size_t)nrecv / sizeof(*wrd) : 0;
#ifdef DEBUG_SDP
				DpsLog(query, DPS_LOG_ERROR, "Received words size=%d nwrd=%d\n",hdr.len, Res->num_rows);
#endif
				done=1;
				break;
		        case DPS_SEARCHD_CMD_SUGGEST:
			        DPS_FREE(Res->Suggest);
				Res->Suggest = (char*)DpsMalloc(hdr.len + 1);
				if (Res->Suggest == NULL) {
				  rc = DPS_ERROR;
				  done = 1; break;
				}
				nrecv = DpsRecvall(cl->searchd, Res->Suggest, hdr.len, 360);
				Res->Suggest[(nrecv >=0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
				DpsLog(query, DPS_LOG_ERROR, "Received Suggest size=%d\n", hdr.len);
#endif
				break;

		        case DPS_SEARCHD_CMD_PERSITE:
			        Res->PerSite = (size_t*)DpsMalloc(hdr.len + 1);
				if (Res->PerSite == NULL) {
				  rc = DPS_ERROR;
				  done = 1;
				  break;
				}
				nrecv = DpsRecvall(cl->searchd, Res->PerSite, hdr.len, 360);
#ifdef DEBUG_SDP
				DpsLog(query, DPS_LOG_ERROR, "Received PerSite size=%d nwrd=%d\n", nrecv, Res->num_rows);
#endif
				break;
		        case DPS_SEARCHD_CMD_DATA:
				DPS_FREE(udt);	/* a repeated packet replaces the earlier buffer */
			        udt = (DPS_URLDATA*)DpsMalloc(hdr.len + 1);
				if (udt == NULL) {
				  rc = DPS_ERROR;
				  done = 1;
				  break;
				}
				nrecv = DpsRecvall(cl->searchd, udt, hdr.len, 360);
#ifdef DEBUG_SDP
				DpsLog(query, DPS_LOG_ERROR, "Received URLDATA size=%d nwrd=%d\n", nrecv, Res->num_rows);
#endif
				break;

#ifdef WITH_REL_TRACK
		        case DPS_SEARCHD_CMD_TRACKDATA:
				DPS_FREE(trk);
			        trk = (DPS_URLTRACK*)DpsMalloc(hdr.len + 1);
				if (trk == NULL) {
				  rc = DPS_ERROR;
				  done = 1;
				  break;
				}
				nrecv = DpsRecvall(cl->searchd, trk, hdr.len, 360);
#ifdef DEBUG_SDP
				DpsLog(query, DPS_LOG_ERROR, "Received TRACKDATA size=%d nwrd=%d\n", nrecv, Res->num_rows);
#endif
				break;
#endif

		        case DPS_SEARCHD_CMD_WITHOFFSET:
				break;
		        case DPS_SEARCHD_CMD_QLC:
				/* Replacement query text for the "q" variable */
			        if ((p = (char *)DpsXmalloc(hdr.len + 1)) != NULL) {
				  /* a negative result is an error, not data */
				  if (DpsRecvall(cl->searchd, p, hdr.len, 360) > 0)  {
				    DpsVarListReplaceStr(&query->Vars, "q", p);
				  }
				} else {
				  rc = DPS_ERROR;
				  done = 1;
				}
				DPS_FREE(p);
				break;
		        case DPS_SEARCHD_CMD_WWL:
				/*
				 * Serialized word list: a DPS_WIDEWORDLIST_EX header followed,
				 * per word, by a DPS_WIDEWORD_EX record, the word bytes with
				 * terminator, padding to dpsunicode_t alignment, and the
				 * unicode word with terminator.
				 * NOTE(review): the per-word lengths are trusted as received.
				 */
				Res->PerSite = NULL;
			        if ((wbuf = p = (char *)DpsXmalloc(hdr.len + 1)) != NULL) {
				  nrecv = DpsRecvall(cl->searchd, wbuf, hdr.len, 360);
				  if (nrecv >= (ssize_t)sizeof(DPS_WIDEWORDLIST_EX)) {
				    wwl = (DPS_WIDEWORDLIST_EX *)p;
				    p += sizeof(DPS_WIDEWORDLIST_EX);
#ifdef DEBUG_SDP
				    DpsLog(query, DPS_LOG_ERROR, "wbuf :%x, wwl: %x, p: %x hdr.len:%d\n", wbuf, wwl, p, hdr.len);
				    DpsLog(query, DPS_LOG_ERROR, "Received WWL nwords=%d nuniq=%d\n", wwl->nwords, wwl->nuniq);
#endif
				    for(i = 0; i < wwl->nwords; i++) {
				      dps_memcpy((char*)&ww, p, sizeof(DPS_WIDEWORD_EX));
				      p += sizeof(DPS_WIDEWORD_EX);
				      ww.word = p;
#ifdef DEBUG_SDP
				      DpsLog(query, DPS_LOG_ERROR, "Word {%d}: %s\n", ww.len+1, ww.word);
#endif
				      p += ww.len + 1;
				      p += sizeof(dpsunicode_t) - ((SDPALIGN)p % sizeof(dpsunicode_t));
				      ww.uword = (dpsunicode_t*)p;
				      p += sizeof(dpsunicode_t) * (ww.ulen + 1);
				      DpsWideWordListAdd(&Res->WWList, &ww, DPS_WWL_STRICT);
				    }
				    Res->WWList.nuniq = wwl->nuniq;
				  }
				  /* released on every path; it leaked when nothing was received */
				  DPS_FREE(wbuf);
				} else {
				  rc = DPS_ERROR;
				  done = 1;
				}
				break;
			default:
				sprintf(query->Conf->errstr,"Unknown searchd response: cmd=%d len=%d",hdr.cmd,hdr.len);
				rc = DPS_ERROR;
				done=1;
				break;
		}
	}
	/* Hand the received buffers over to the result */
	Res->CoordList.Coords = wrd;
	Res->CoordList.Data = udt;
#ifdef WITH_REL_TRACK
	Res->CoordList.Track = trk;
#endif
	TRACE_OUT(query);
	return rc;
}
예제 #27
0
/*
 * Build the on-disk "linear" limit index for one field.
 *
 * DpsLimit4() fills a list of (val, url_id) pairs for `field`; the list is
 * sorted with cmp_ind4 and written as two files under <vardir>/<DPS_TREEDIR>/:
 *   <lim_name>.dat - the url_id column, in list order;
 *   <lim_name>.ind - one DPS_UINT4_POS_LEN record per run of equal values,
 *                    holding the value and the byte offset/length of the
 *                    run inside the .dat file.
 *
 * Indexer   agent used for logging and configuration lookup.
 * field     field name handed to DpsLimit4().
 * lim_name  base name of the two output files.
 * type      limit type handed to DpsLimit4().
 * db        database handle; db->vardir overrides the "VarDir" variable.
 *
 * Returns 0 on success, 1 on any failure or when DpsLimit4() yields no items.
 */
static int MakeLinearIndex(DPS_AGENT *Indexer, const char *field, const char *lim_name, int type, DPS_DB *db) {
    DPS_ENV *Conf = Indexer->Conf;
    DPS_UINT4URLIDLIST  L;
    size_t    k,prev;
    urlid_t   *data = NULL;
    DPS_UINT4_POS_LEN *ind=NULL;
    size_t    mind=1000,nind=0;
    char fname[PATH_MAX];
    /* -1 marks "not open": 0 is a valid descriptor and cannot serve as the sentinel. */
    int  dat_fd = -1, ind_fd = -1, rc;
    const char	*vardir = (db->vardir) ? db->vardir : DpsVarListFindStr(&Conf->Vars, "VarDir", DPS_VAR_DIR);

    bzero(&L, sizeof(DPS_UINT4URLIDLIST));

    rc = DpsLimit4(Indexer, &L, field, type, db);

    if(rc != DPS_OK) {
        DpsLog(Indexer, DPS_LOG_ERROR, "Error: %s [%s:%d]", DpsEnvErrMsg(Conf), __FILE__, __LINE__);
        goto err1;
    }

    if(!L.Item)return(1);

    if (L.nitems > 1) DpsSort(L.Item, L.nitems, sizeof(DPS_UINT4URLID), (qsort_cmp)cmp_ind4);

    data = (urlid_t*)DpsMalloc((L.nitems + 1) * sizeof(*data));
    if(!data) {
        fprintf(stderr,"Error1: %s\n",strerror(errno));
        goto err1;
    }
    ind=(DPS_UINT4_POS_LEN*)DpsMalloc(mind*sizeof(DPS_UINT4_POS_LEN));
    if(!ind) {
        fprintf(stderr,"Error2: %s\n",strerror(errno));
        goto err1;
    }
    /*
     * NOTE(review): a record is emitted when the value changes or at the last
     * item.  If the last item itself starts a new value, it is counted into
     * the preceding run (len covers k - prev + 1 items) instead of getting a
     * record of its own.  Left unchanged here because it affects the on-disk
     * output; confirm against the reader before altering.
     */
    prev=0;
    for(k=0; k<L.nitems; k++) {
        data[k]=L.Item[k].url_id;
        if((k==L.nitems-1) || (L.Item[k].val!=L.Item[prev].val)) {
            if(nind==mind) {
                mind+=1000;
                ind=(DPS_UINT4_POS_LEN*)DpsRealloc(ind,mind*sizeof(DPS_UINT4_POS_LEN));
                if(!ind) {
                    fprintf(stderr,"Error3: %s\n",strerror(errno));
                    goto err1;
                }
            }
            /* Fill index */
            ind[nind].val=L.Item[prev].val;
            ind[nind].pos = prev * sizeof(*data);
            if (k == L.nitems - 1) ind[nind].len = (k - prev + 1) * sizeof(*data);
            else ind[nind].len = (k - prev) * sizeof(*data);
            DpsLog(Indexer, DPS_LOG_DEBUG, "%d - pos:%x len:%d\n", ind[nind].val, (int)ind[nind].pos, ind[nind].len);
            nind++;

            prev=k;
        }
    }

    /* The source list is no longer needed once data[] and ind[] are filled. */
    if (L.mapped) {
#ifdef HAVE_SYS_MMAN_H
        if (munmap(L.Item, (L.nitems + 1) * sizeof(DPS_UINT4URLID))) {
            fprintf(stderr, "Can't munmap '%s': %s\n", L.shm_name, strerror(errno));
        }
#elif defined(HAVE_SYS_SHM_H)
        if (shmdt(L.Item)) {
            fprintf(stderr, "Can't shmdt '%s': %s\n", L.shm_name, strerror(errno));
        }
#endif
        unlink(L.shm_name);
    } else {
        DPS_FREE(L.Item);
    }
    /* Mark the list as released so the err1 path below does not unmap/unlink it again. */
    L.Item = NULL;
    L.mapped = 0;

    dps_snprintf(fname,sizeof(fname),"%s%c%s%c%s.dat", vardir,DPSSLASH, DPS_TREEDIR, DPSSLASH, lim_name);
    if((dat_fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) {
        fprintf(stderr,"Can't open '%s': %s\n",fname,strerror(errno));
        goto err1;
    }
    DpsWriteLock(dat_fd);
    if((L.nitems * sizeof(*data)) != (size_t)write(dat_fd, data, L.nitems * sizeof(*data))) {
        fprintf(stderr,"Can't write '%s': %s\n",fname,strerror(errno));
        goto err1;
    }
    DpsUnLock(dat_fd);
    DpsClose(dat_fd);
    dat_fd = -1;	/* closed: a later failure must not close this descriptor again */
    DPS_FREE(data);

    dps_snprintf(fname,sizeof(fname),"%s%c%s%c%s.ind", vardir,DPSSLASH, DPS_TREEDIR, DPSSLASH, lim_name);
    if((ind_fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) {
        fprintf(stderr,"Can't open '%s': %s\n",fname,strerror(errno));
        goto err1;
    }
    DpsWriteLock(ind_fd);
    if((nind*sizeof(DPS_UINT4_POS_LEN)) != (size_t)write(ind_fd,ind,nind*sizeof(DPS_UINT4_POS_LEN))) {
        fprintf(stderr,"Can't write '%s': %s\n",fname,strerror(errno));
        goto err1;
    }
    DpsUnLock(ind_fd);
    DpsClose(ind_fd);
    ind_fd = -1;
    DPS_FREE(ind);

    return(0);

err1:
    /* Release whatever is still held; everything released earlier has been reset. */
    if (L.mapped) {
#ifdef HAVE_SYS_MMAN_H
        if (munmap(L.Item, (L.nitems + 1) * sizeof(DPS_UINT4URLID))) {
            fprintf(stderr, "Can't munmap '%s': %s\n", L.shm_name, strerror(errno));
        }
#elif defined(HAVE_SYS_SHM_H)
        if (shmdt(L.Item)) {
            fprintf(stderr, "Can't shmdt '%s': %s\n", L.shm_name, strerror(errno));
        }
#endif
        unlink(L.shm_name);
    } else {
        DPS_FREE(L.Item);
    }
    DPS_FREE(data);
    DPS_FREE(ind);
    if (dat_fd >= 0) DpsClose(dat_fd);
    if (ind_fd >= 0) DpsClose(ind_fd);
    return(1);
}
예제 #28
0
/*
 * Compose the query string for a remote searchd and send it as a word request.
 *
 * The request is the agent's QUERY_STRING followed by a fixed set of
 * "&name=value" pairs taken from query->Vars (with defaults), plus the
 * optional URL-escaped DateFormat and empty values and the searchd label.
 * The response is not read here (see the note near the end of the function).
 *
 * query    agent whose Vars/Flags supply the request parameters.
 * Res      not used by this function.
 * searchd  connection descriptor handed to DpsSearchdSendWordRequest().
 *
 * Returns the result of DpsSearchdSendWordRequest(), or DPS_ERROR when a
 * buffer cannot be allocated (query->Conf->errstr is set in that case).
 */
int __DPSCALL DpsFindWordsSearchd(DPS_AGENT *query, DPS_RESULT *Res, DPS_DB *searchd) {
	size_t		maxlen = 1024;
	char		*request = NULL, *edf = NULL, *e_empty = NULL;
	const char *df = DpsVarListFindStr(&query->Vars, "DateFormat", NULL);
	const char *empty = DpsVarListFindStr(&query->Vars, "empty", NULL);
	const char *qs = DpsVarListFindStr(&query->Vars, "QUERY_STRING", "");
	const char *tmplt = DpsVarListFindStr(&query->Vars, "tmplt", "");
	int		res = DPS_OK;

	TRACE_IN(query, "DpsFindWordsSearchd");

	if (df) {
		/* Destination is sized at ten bytes per source byte plus the terminator. */
		edf = (char*)DpsMalloc(dps_strlen(df) * 10 + 1);
		if (edf == NULL) goto nomem;
		DpsEscapeURL(edf, df);
		maxlen += dps_strlen(edf);
	}
	if (empty) {
		e_empty = (char*)DpsMalloc(dps_strlen(empty) * 10 + 1);
		if (e_empty == NULL) goto nomem;
		DpsEscapeURL(e_empty, empty);
		maxlen += dps_strlen(e_empty);
	}
	/* The label is appended verbatim, so count it in the size estimate too. */
	if (searchd->label) maxlen += dps_strlen(searchd->label);

	maxlen += dps_strlen(qs) + dps_strlen(tmplt) + 64;

	if (NULL == (request = (char*)DpsMalloc(maxlen))) goto nomem;

	dps_snprintf(request, maxlen, "%s&BrowserCharset=%s&IP=%s&g-lc=%s&ExcerptSize=%s&ExcerptPadding=%s&DoExcerpt=%s&tmplt=%s%s%s%s%s%s%s&sp=%s&sy=%s&s=%s",
		  qs,
		  DpsVarListFindStr(&query->Vars, "BrowserCharset", "iso-8859-1"),
		  DpsVarListFindStr(&query->Vars, "IP", "localhost"),
		  DpsVarListFindStr(&query->Vars, "g-lc", "en"),
		  DpsVarListFindStr(&query->Vars, "ExcerptSize", "256"),
		  DpsVarListFindStr(&query->Vars, "ExcerptPadding", "40"),
		  (query->Flags.do_excerpt) ? "yes" : "no",
		  tmplt,
		  (edf) ? "&DateFormat=" : "", (edf) ? edf : "",
		  (e_empty) ? "&empty=" : "", (e_empty) ? e_empty : "",
		  (searchd->label) ? "&label=" : "", (searchd->label) ? searchd->label : "",
		  DpsVarListFindStr(&query->Vars, "sp", "1"),
		  DpsVarListFindStr(&query->Vars, "sy", "1"),
		  DpsVarListFindStr(&query->Vars, "s", "RP")
		  );

	request[maxlen-1]='\0';
	res = DpsSearchdSendWordRequest(query, searchd, request);

/*	res = DpsSearchdGetWordResponse(query, Res, searchd);   called later from DpsFind */

	goto done;

nomem:
	sprintf(query->Conf->errstr,"Can't allocate memory");
	res = DPS_ERROR;

done:
	/* Single exit: every buffer allocated above is released on all paths. */
	DPS_FREE(edf);
	DPS_FREE(e_empty);
	DPS_FREE(request);
	TRACE_OUT(query);
	return res;
}
예제 #29
0
/*
 * Write the on-disk "nested" limit index for an already populated list of
 * (hi, lo, url_id) items.
 *
 * The list is sorted with cmp_ind8 and written as two files under
 * <vardir>/<DPS_TREEDIR>/:
 *   <lim_name>.dat - the url_id column, in list order;
 *   <lim_name>.ind - one DPS_UINT8_POS_LEN record per run of equal (hi, lo)
 *                    keys, holding the key and the byte offset/length of the
 *                    run inside the .dat file.
 *
 * Indexer   agent used for logging and configuration lookup.
 * L         item list; it is passed to ClearIndex8() before the function
 *           returns, except on the early return for an empty list.
 * lim_name  base name of the two output files.
 * db        database handle; db->vardir overrides the "VarDir" variable.
 *
 * Returns 0 on success, 1 on any failure or when L has no items.
 */
static int MakeNestedIndex(DPS_AGENT *Indexer, DPS_UINT8URLIDLIST *L, const char *lim_name, DPS_DB *db) {
     DPS_ENV   *Conf = Indexer->Conf;
     size_t    k, prev;
     urlid_t   *data=NULL;
     DPS_UINT8_POS_LEN *ind=NULL;
     size_t    mind=1000, nind=0, ndata;
     char fname[PATH_MAX];
     /* -1 marks "not open": 0 is a valid descriptor and cannot serve as the sentinel. */
     int  dat_fd = -1, ind_fd = -1;
     int  cleared = 0;	/* set once ClearIndex8(L) has run, so it is not repeated at err1 */
     const char	*vardir = (db->vardir) ? db->vardir : DpsVarListFindStr(&Conf->Vars, "VarDir", DPS_VAR_DIR);
     
     if(!L->Item)return(1);
     
     if (L->nitems > 1) DpsSort(L->Item, L->nitems, sizeof(DPS_UINT8URLID), (qsort_cmp)cmp_ind8);
     
     data = (urlid_t*)DpsMalloc((L->nitems + 1) * sizeof(urlid_t));
     if(!data){
       DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d bytes [%s:%d]", (int)((L->nitems + 1) * sizeof(urlid_t)), __FILE__, __LINE__);
       goto err1;
     }
     ind=(DPS_UINT8_POS_LEN*)DpsMalloc(mind*sizeof(DPS_UINT8_POS_LEN));
     if(!ind){
       DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d bytes [%s:%d]", (int)(mind * sizeof(DPS_UINT8_POS_LEN)), __FILE__, __LINE__);
       goto err1;
     }
     /*
      * NOTE(review): a record is emitted when the key changes or at the last
      * item.  If the last item itself starts a new key, it is counted into the
      * preceding run (len covers k - prev + 1 items) instead of getting a
      * record of its own.  Left unchanged here because it affects the on-disk
      * output; confirm against the reader before altering.
      */
     prev=0;
     for(k=0; k < L->nitems; k++) {
          data[k] = L->Item[k].url_id;
          if((k == L->nitems-1) || (L->Item[k].hi != L->Item[prev].hi) || (L->Item[k].lo != L->Item[prev].lo)) {
               if(nind==mind){
                    mind+=1000;
                    ind=(DPS_UINT8_POS_LEN*)DpsRealloc(ind,mind*sizeof(DPS_UINT8_POS_LEN));
                    if(!ind) {
		      DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d bytes [%s:%d]", (int)(mind * sizeof(DPS_UINT8_POS_LEN)), __FILE__, __LINE__);
		      goto err1;
                    }
               }
               /* Fill index */
               ind[nind].hi = L->Item[prev].hi;
               ind[nind].lo = L->Item[prev].lo;
               ind[nind].pos = prev * sizeof(*data);
               if (k == L->nitems - 1) ind[nind].len = (k - prev + 1) * sizeof(*data);
               else ind[nind].len = (k - prev) * sizeof(*data);
               DpsLog(Indexer, DPS_LOG_DEBUG, "%08X%08X - %d %d\n", ind[nind].hi, ind[nind].lo, (int)ind[nind].pos, ind[nind].len);
               nind++;
               
               prev=k;
          }
     }
     /* Remember the item count before the list is cleared; it sizes the .dat write. */
     ndata = L->nitems;
     ClearIndex8(L);
     cleared = 1;
     
     dps_snprintf(fname,sizeof(fname)-1,"%s%c%s%c%s.dat", vardir,DPSSLASH, DPS_TREEDIR,DPSSLASH, lim_name);
     if((dat_fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) {
       DpsLog(Indexer, DPS_LOG_ERROR, "Can't open '%s': %s [%s:%d]", fname, strerror(errno), __FILE__, __LINE__);
       goto err1;
     }
     DpsWriteLock(dat_fd);
     if((ndata * sizeof(*data)) != (size_t)write(dat_fd, data, ndata * sizeof(*data))) {
       DpsLog(Indexer, DPS_LOG_ERROR, "Can't write '%s': %s [%s:%d]", fname, strerror(errno), __FILE__, __LINE__);
       goto err1;
     }
     DpsUnLock(dat_fd);
     DpsClose(dat_fd);
     dat_fd = -1;	/* closed: a later failure must not close this descriptor again */
     DPS_FREE(data);

     dps_snprintf(fname,sizeof(fname)-1,"%s%c%s%c%s.ind", vardir, DPSSLASH,DPS_TREEDIR, DPSSLASH, lim_name);
     if((ind_fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) {
       DpsLog(Indexer, DPS_LOG_ERROR, "Can't open '%s': %s [%s:%d]", fname, strerror(errno), __FILE__, __LINE__);
       goto err1;
     }
     DpsWriteLock(ind_fd);
     if((nind*sizeof(DPS_UINT8_POS_LEN)) != (size_t)write(ind_fd,ind,nind*sizeof(DPS_UINT8_POS_LEN))){
       DpsLog(Indexer, DPS_LOG_ERROR, "Can't write '%s': %s [%s:%d]", fname, strerror(errno), __FILE__, __LINE__);
          goto err1;
     }
     DpsUnLock(ind_fd);
     DpsClose(ind_fd);
     ind_fd = -1;
     DPS_FREE(ind);
     
     return(0);
     
err1:
     /* Release whatever is still held; anything released earlier is skipped. */
     if (!cleared) ClearIndex8(L);
     DPS_FREE(data);
     DPS_FREE(ind);
     if (dat_fd >= 0) DpsClose(dat_fd);
     if (ind_fd >= 0) DpsClose(ind_fd);
     return(1);
}
예제 #30
0
/*
 * Ask a remote searchd for category information and load the reply into C.
 *
 * The request is a DPS_SEARCHD_CMD_CATINFO packet whose payload is the
 * integer `cmd` followed by the NUL-terminated C->addr.  Replies are then
 * read in a loop until a terminating packet arrives:
 *   CMD_ERROR    - the text is stored in A->Conf->errstr, result DPS_ERROR;
 *   CMD_MESSAGE  - the text is read and discarded, the loop continues;
 *   CMD_CATINFO  - C->ncategories is reset and every "\r\n"-separated line
 *                  is passed to DpsCatFromTextBuf();
 *   anything else is reported in A->Conf->errstr, result DPS_ERROR.
 *
 * A    agent used for logging and for the error string.
 * C    category object: C->addr is sent, the parsed reply is stored in it.
 * cmd  category sub-command placed at the start of the request payload.
 * db   the DPS_DB describing the searchd connection (passed as void *).
 *
 * Returns DPS_OK on success; DPS_ERROR on allocation failure, an incomplete
 * packet header, a searchd error reply or an unknown reply command.
 */
int __DPSCALL DpsSearchdCatAction(DPS_AGENT *A, DPS_CATEGORY *C, int cmd, void *db) {
	DPS_DB		*searchd = db;
	DPS_SEARCHD_PACKET_HEADER hdr;
	char *buf;
	ssize_t nrecv;
	int done = 0;
	int rc=DPS_OK;
	char *msg = NULL;
	char *dinfo = NULL;

	TRACE_IN(A, "DpsSearchdCatAction");

	hdr.cmd = DPS_SEARCHD_CMD_CATINFO;
	hdr.len = sizeof(int) + dps_strlen(C->addr) + 1;
	
	if ((buf = (char*)DpsMalloc(hdr.len + 1)) == NULL) {
	  DpsLog(A, DPS_LOG_ERROR, "Out of memory");
	  TRACE_OUT(A);
	  return DPS_ERROR;
	}

	*((int*)buf) = cmd;
	dps_strcpy(buf + sizeof(int), C->addr);

	/* The send result is not inspected, as before.
	   NOTE(review): consider failing fast here instead of waiting on the receive below. */
	(void)DpsSearchdSendPacket(searchd->searchd, &hdr, buf);

	DPS_FREE(buf);

	while(!done) {
		char * tok, * lt;
		nrecv = DpsRecvall(searchd->searchd, &hdr, sizeof(hdr), 360);
		
		if(nrecv != sizeof(hdr)){
			DpsLog(A, DPS_LOG_ERROR, "Received incomplete header from searchd (%d bytes)", (int)nrecv);
			TRACE_OUT(A);
			return(DPS_ERROR);
		}else{
#ifdef DEBUG_SDP
		  DpsLog(A, DPS_LOG_ERROR, "Received header cmd=%d len=%d\n", hdr.cmd, hdr.len);
#endif
		}
		switch(hdr.cmd){
			case DPS_SEARCHD_CMD_ERROR:
				msg = (char*)DpsMalloc(hdr.len + 1);
				if (msg == NULL) {
				  /* Do not report success when the reply could not even be read. */
				  DpsLog(A, DPS_LOG_ERROR, "Out of memory");
				  rc = DPS_ERROR;
				  done=1;
				  break;
				}
				nrecv = DpsRecvall(searchd->searchd, msg, hdr.len, 360);
				msg[(nrecv >= 0) ? nrecv : 0 ] = '\0';
				/* The text comes from the network and its length is chosen by the peer,
				   so the copy into the error buffer has to be bounded.
				   NOTE(review): assumes errstr is a char array, not a pointer - confirm. */
				dps_snprintf(A->Conf->errstr, sizeof(A->Conf->errstr), "Searchd error: '%s'", msg);
				rc=DPS_ERROR;
				DPS_FREE(msg);
				done=1;
				break;
			case DPS_SEARCHD_CMD_MESSAGE:
				msg=(char*)DpsMalloc(hdr.len+1);
				if (msg == NULL) {
				  DpsLog(A, DPS_LOG_ERROR, "Out of memory");
				  rc = DPS_ERROR;
				  done=1;
				  break;
				}
				nrecv = DpsRecvall(searchd->searchd, msg, hdr.len, 360);
				msg[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
				DpsLog(A, DPS_LOG_ERROR, "Message from searchd: '%s'\n",msg);
#endif
				DPS_FREE(msg);
				break;
			case DPS_SEARCHD_CMD_CATINFO:
			        dinfo=(char*)DpsMalloc(hdr.len+1);
				if (dinfo == NULL) {
				  DpsLog(A, DPS_LOG_ERROR, "Out of memory");
				  rc = DPS_ERROR;
				  done=1;
				  break;
				}
				nrecv = DpsRecvall(searchd->searchd, dinfo, hdr.len, 360);
				dinfo[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
				DpsLog(A, DPS_LOG_ERROR, "Received CATINFO size=%d buf=%s\n",hdr.len,dinfo);
#endif				

				/* Replace any previous content: one category per CR/LF separated line. */
				C->ncategories = 0;
				tok = dps_strtok_r(dinfo, "\r\n", &lt, NULL);
				
				while(tok){
					DpsCatFromTextBuf(C, tok);
					
					tok = dps_strtok_r(NULL, "\r\n", &lt, NULL);
				}
				DPS_FREE(dinfo);
				done=1;
				break;
			default:
				sprintf(A->Conf->errstr, "Unknown searchd response: cmd=%d len=%d", hdr.cmd, hdr.len);
				rc=DPS_ERROR;
				done = 1;
				break;
		}
	}
	TRACE_OUT(A);
	return rc;
}