/*
 * proceedSTOP - fill res with every entry of x plus selected entries of stop.
 *
 *   query  agent handle; used here only to log an allocation failure.
 *   res    output item.  res->pbegin receives a freshly DpsMalloc'ed array of
 *          x->count + stop->count + 1 elements; on return res->pcur points one
 *          past the last element written.  res->count / res->plast are not
 *          touched here.
 *   x      operand whose entries are all copied to res.
 *   stop   operand whose entries are copied only while they compare <= the
 *          current x entry; entries with a smaller url_id are skipped.
 *
 * The pcur/plast cursors of both inputs are reset and then advanced.
 * NOTE(review): the merge presumably expects both arrays to be ordered
 * consistently with DpsCmpUrlid - confirm against the callers.
 *
 * Returns DPS_OK, or DPS_ERROR when the result array cannot be allocated.
 */
static int proceedSTOP(DPS_AGENT *query, DPS_STACK_ITEM *res, DPS_STACK_ITEM *x, DPS_STACK_ITEM *stop) {

  res->pbegin = res->pcur = (DPS_URL_CRD_DB*)DpsMalloc((x->count + stop->count + 1) * sizeof(DPS_URL_CRD_DB));
  if (res->pbegin == NULL) {
    /* The format uses %d, so the size_t-valued expressions are narrowed
       explicitly instead of being passed through the varargs as-is. */
    DpsLog(query, DPS_LOG_ERROR, "Can't alloc %d bytes for %d results",
           (int)((x->count + stop->count + 1) * sizeof(DPS_URL_CRD_DB)),
           (int)(x->count + stop->count + 1));
    return DPS_ERROR;
  }

  x->pcur = x->pbegin;
  x->plast = x->pbegin + x->count;
  stop->pcur = stop->pbegin;
  stop->plast = stop->pbegin + stop->count;

  if (stop->pcur < stop->plast) {
    while (x->pcur < x->plast) {
      /* Drop stop entries that belong to URLs preceding the current x entry. */
      while (stop->pcur < stop->plast && stop->pcur->url_id < x->pcur->url_id) stop->pcur++;
      /* Emit stop entries that do not sort after the current x entry. */
      while (stop->pcur < stop->plast && DpsCmpUrlid(stop->pcur, x->pcur) <= 0) {
        *res->pcur = *stop->pcur;
        res->pcur++;
        stop->pcur++;
      }
      if (stop->pcur >= stop->plast) break;
      /* Emit x entries that do not sort after the current stop entry. */
      while (x->pcur < x->plast && DpsCmpUrlid(x->pcur, stop->pcur) <= 0) {
        *res->pcur = *x->pcur;
        res->pcur++;
        x->pcur++;
      }
    }
  }
  /* Whatever is left in x is copied unconditionally. */
  while (x->pcur < x->plast) {
    *res->pcur = *x->pcur;
    res->pcur++;
    x->pcur++;
  }
  return DPS_OK;
}
/* * allocateMoreSlots is called when there are only enough slot structures * left to support the allocation of a single malloc buffer. */ static void allocateMoreSlots(void) { size_t newSize = allocationListSize + bytesPerPage; void * newAllocation; void * oldAllocation = allocationList; Page_AllowAccess(allocationList, allocationListSize); noAllocationListProtection = 1; internalUse = 1; newAllocation = DpsMalloc(newSize); dps_memcpy(newAllocation, allocationList, allocationListSize); /* was: dps_memmove */ memset(&(((char *)newAllocation)[allocationListSize]), 0, bytesPerPage); allocationList = (Slot *)newAllocation; allocationListSize = newSize; slotCount += slotsPerPage; unUsedSlots += slotsPerPage; /* DpsSort(allocationList, slotCount, sizeof(Slot), (qsort_cmp)cmp_Slot);*/ DpsFree(oldAllocation); /* * Keep access to the allocation list open at this point, because * I am returning to memalign(), which needs that access. */ noAllocationListProtection = 0; internalUse = 0; }
dpsunicode_t *DpsUniGermanReplace(const dpsunicode_t *str) { size_t l = DpsUniLen(str); dpsunicode_t *german = DpsMalloc((2 * l + 1) * sizeof(dpsunicode_t)); if (german !=NULL) { dpsunicode_t *s = str, *d = german; while(*s) { switch(*s) { case 0x00DF: /* eszett, or scharfes s, small */ *d++ = 's'; *d++ = 's'; break; case 0x1E9E: /* eszett, or scharfes s, big */ *d++ = 'S'; *d++ = 'S'; break; case 0x00D6: *d++ = 'O'; *d++ = 'E'; break; case 0x00F6: *d++ = 'o'; *d++ = 'e'; break; case 0x00DC: *d++ = 'U'; *d++ = 'E'; break; case 0x00FC: *d++ = 'u'; *d++ = 'e'; break; case 0x00C4: *d++ = 'A'; *d++ = 'E'; break; case 0x00E4: *d++ = 'a'; *d++ = 'e'; break; default: *d++ = *s; } s++; } *d = 0; } return german; }
/*
 * proceedOR - merge the entries of x1 and x2 into res.
 *
 *   query   agent handle; used here only to log an allocation failure.
 *   res     output item.  res->pbegin receives a freshly DpsMalloc'ed array of
 *           x1->count + x2->count + 1 elements; on return res->pcur points one
 *           past the last element written.  res->count / res->plast are not
 *           touched here.
 *   x1, x2  operands; every entry of both is copied to res.  Their pcur/plast
 *           cursors are reset and then advanced to the end.
 *
 * NOTE(review): the output is ordered only if both inputs are already
 * ordered consistently with DpsCmpUrlid - confirm against the callers.
 *
 * Returns DPS_OK, or DPS_ERROR when the result array cannot be allocated.
 */
static int proceedOR(DPS_AGENT *query, DPS_STACK_ITEM *res, DPS_STACK_ITEM *x1, DPS_STACK_ITEM *x2) {

  res->pbegin = res->pcur = (DPS_URL_CRD_DB*)DpsMalloc((x1->count + x2->count + 1) * sizeof(DPS_URL_CRD_DB));
  if (res->pbegin == NULL) {
    /* The format uses %d, so the size_t-valued expressions are narrowed
       explicitly instead of being passed through the varargs as-is. */
    DpsLog(query, DPS_LOG_ERROR, "Can't alloc %d bytes for %d results",
           (int)((x1->count + x2->count + 1) * sizeof(DPS_URL_CRD_DB)),
           (int)(x1->count + x2->count + 1));
    return DPS_ERROR;
  }

  x1->pcur = x1->pbegin;
  x1->plast = x1->pbegin + x1->count;
  x2->pcur = x2->pbegin;
  x2->plast = x2->pbegin + x2->count;

  while (x1->pcur < x1->plast && x2->pcur < x2->plast) {
    /* Drain the run of x1 entries that do not sort after the head of x2 ... */
    while ((x1->pcur < x1->plast) && (DpsCmpUrlid(x1->pcur, x2->pcur) <= 0)) {
      *res->pcur = *x1->pcur;
      res->pcur++;
      x1->pcur++;
    }
    /* ... then swap the roles so the other list is drained next. */
    {
      register DPS_STACK_ITEM *t = x1;
      x1 = x2;
      x2 = t;
    }
  }
  /* At most one of the two tails is non-empty; copy whichever remains. */
  while (x1->pcur < x1->plast) {
    *res->pcur = *x1->pcur;
    res->pcur++;
    x1->pcur++;
  }
  while (x2->pcur < x2->plast) {
    *res->pcur = *x2->pcur;
    res->pcur++;
    x2->pcur++;
  }
  return DPS_OK;
}
/*
 * DpsBaseRead - read the record P->rec_id from the base into buf.
 *
 *   P    base descriptor; DpsBaseSeek() is called on it with DPS_READ_LOCK
 *        and is expected to fill P->Item for the requested record.
 *   buf  destination buffer.
 *   len  capacity of buf in bytes.
 *
 * When built with HAVE_ZLIB, and the base uses Z_DEFLATED and the item has a
 * non-zero orig_size, the stored bytes are read into a temporary buffer and
 * inflated into buf; otherwise P->Item.size bytes are read straight into buf.
 *
 * Returns DPS_OK on success; the DpsBaseSeek() result if the seek fails;
 * DPS_ERROR if the record is not found, does not fit into len, or on
 * lseek/read/allocation/inflateInit2 failure.
 */
__C_LINK int __DPSCALL DpsBaseRead(DPS_BASE_PARAM *P, void *buf, size_t len) {
  int res = DPS_OK;
#ifdef HAVE_ZLIB
  z_stream zstream;
  Byte *CDoc = NULL;
#endif

  if ((res = DpsBaseSeek(P, DPS_READ_LOCK)) != DPS_OK) return res;

  if (P->Item.rec_id == P->rec_id) {
    if (lseek(P->Sfd, (off_t)P->Item.offset, SEEK_SET) == (off_t)-1) {
      DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s.%d] %ld lseek error, rec_id: %x",
             P->subdir, P->basename, P->FileNo, (long)P->Item.offset, P->rec_id);
      return DPS_ERROR;
    }
    /* The size that will land in buf is orig_size when it is set, size otherwise. */
    if ((P->Item.orig_size ? P->Item.orig_size : P->Item.size) > len) {
      /* Arguments are narrowed to int to match the %d conversions. */
      DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s] size %d->%d error, rec_id: %x",
             P->subdir, P->basename,
             (int)(P->Item.orig_size ? P->Item.orig_size : P->Item.size), (int)len, P->rec_id);
      return DPS_ERROR;
    }
#ifdef HAVE_ZLIB
    bzero(&zstream, sizeof(zstream));
    if ((P->zlib_method == Z_DEFLATED) && (P->Item.orig_size != 0)) {
      zstream.avail_in = (uInt)P->Item.size;
      zstream.avail_out = (uInt)len;
      zstream.next_out = (Byte *) buf;
      CDoc = zstream.next_in = (Byte *) DpsMalloc(P->Item.size + 1);
      if (CDoc == NULL) {
        return DPS_ERROR;
      }
      zstream.zalloc = Z_NULL;
      zstream.zfree = Z_NULL;
      zstream.opaque = Z_NULL;
      if (read(P->Sfd, CDoc, P->Item.size) != (ssize_t)P->Item.size) {
        DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s] %d read error, rec_id: %x -- %d",
               P->subdir, P->basename, P->Item.size, P->rec_id, __LINE__);
        DPS_FREE(CDoc);
        return DPS_ERROR;
      }
      /* Without a successfully initialised stream inflate() cannot produce
         anything, so report the failure instead of returning DPS_OK with an
         untouched buffer. */
      if (inflateInit2(&zstream, P->zlib_windowBits) != Z_OK) {
        DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s] inflateInit2 error, rec_id: %x",
               P->subdir, P->basename, P->rec_id);
        DPS_FREE(CDoc);
        return DPS_ERROR;
      }
      /* NOTE(review): the inflate() result is still not examined, as before. */
      inflate(&zstream, Z_FINISH);
      inflateEnd(&zstream);
      DPS_FREE(CDoc);
    } else
#endif
    if (read(P->Sfd, buf, P->Item.size) != (ssize_t)P->Item.size) {
      DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s] %d read error, rec_id: %x -- %d",
             P->subdir, P->basename, P->Item.size, P->rec_id, __LINE__);
      return DPS_ERROR;
    }
  } else {
    DpsLog(P->A, DPS_LOG_DEBUG, "%s:[%s/%s] Not found rec_id: %x",
           P->vardir, P->subdir, P->basename, P->rec_id);
    return DPS_ERROR;
  }
#ifdef DEBUG_SEARCH
  DpsLog(P->A, DPS_LOG_DEBUG, "[%s/%s] Retrieved rec_id: %x Size: %d",
         P->subdir, P->basename, P->rec_id, P->Item.size);
#endif
  return DPS_OK;
}
/*
 * DpsUniNDup - duplicate at most len characters of s.
 *
 *   s    source string; scanning stops at its zero terminator.
 *   len  maximum number of characters to copy.
 *
 * The length scan is bounded by len, so s does not have to be
 * zero-terminated when it holds at least len characters.
 *
 * Returns a zero-terminated copy obtained from DpsMalloc, or NULL if the
 * allocation fails.  This function never frees the returned buffer.
 */
dpsunicode_t *DpsUniNDup(const dpsunicode_t *s, size_t len) {
  dpsunicode_t *res;
  size_t size = 0;

  /* Never look past the first len characters of the source. */
  while (size < len && s[size] != 0) size++;

  if ((res = (dpsunicode_t*)DpsMalloc((size + 1) * sizeof(*s))) == NULL) return(NULL);
  dps_memmove(res, s, size * sizeof(*s));
  res[size] = 0;
  return res;
}
/*
 * DpsUniDup - duplicate the zero-terminated string s.
 *
 * Returns a copy (terminator included) obtained from DpsMalloc, or NULL if
 * the allocation fails.
 */
dpsunicode_t *DpsUniDup(const dpsunicode_t *s) {
  const size_t nbytes = (DpsUniLen(s) + 1) * sizeof(*s);
  dpsunicode_t *copy = (dpsunicode_t*)DpsMalloc(nbytes);

  if (copy != NULL) {
    /* nbytes already accounts for the terminator. */
    dps_memcpy(copy, s, nbytes);
  }
  return copy;
}
/*
 * DpsURLInit - zero-initialise a DPS_URL.
 *
 *   url  structure to reset, or NULL to have one allocated with DpsMalloc.
 *
 * A caller-supplied structure keeps its freeme value; a structure allocated
 * here gets freeme = 1.  Every other byte of the structure is cleared.
 *
 * Returns the initialised structure, or NULL if the allocation fails.
 */
DPS_URL * __DPSCALL DpsURLInit(DPS_URL *url) {
  int freeme_value;

  if (url != NULL) {
    freeme_value = url->freeme;
  } else {
    url = (DPS_URL*)DpsMalloc(sizeof(DPS_URL));
    if (url == NULL) return NULL;
    freeme_value = 1;
  }

  bzero((void*)url, sizeof(DPS_URL));
  url->freeme = freeme_value;
  return url;
}
/*
 * DpsBoolStackInit - reset a boolean stack and allocate its two arrays.
 *
 *   s  stack to initialise, or NULL to have one allocated with DpsMalloc
 *      (in which case freeme is set to 1).
 *
 * The structure is zeroed, both capacities are set to DPS_MAXSTACK and the
 * cstack / astack arrays are allocated with that many elements.
 *
 * Returns the stack, or NULL if any allocation fails.  On failure the
 * arrays allocated so far are released, and the structure itself is released
 * only when its freeme flag is set.
 */
DPS_BOOLSTACK *DpsBoolStackInit(DPS_BOOLSTACK *s) {
  const int own_struct = (s == NULL);

  if (own_struct) {
    s = (DPS_BOOLSTACK*)DpsMalloc(sizeof(DPS_BOOLSTACK));
    if (s == NULL) return NULL;
  }
  bzero((void*)s, sizeof(*s));
  if (own_struct) {
    s->freeme = 1;
  }

  s->ncstack = 0;
  s->nastack = 0;
  s->mcstack = s->mastack = DPS_MAXSTACK;

  s->cstack = (int*)DpsMalloc(DPS_MAXSTACK * sizeof(int));
  if (s->cstack == NULL) goto fail;

  s->astack = (DPS_STACK_ITEM*)DpsMalloc(DPS_MAXSTACK * sizeof(DPS_STACK_ITEM));
  if (s->astack == NULL) {
    DPS_FREE(s->cstack);
    goto fail;
  }
  return s;

fail:
  if (s->freeme) DPS_FREE(s);
  return NULL;
}
static void initialize(void) { size_t size = MEMORY_CREATION_SIZE; size_t slack; Slot * slot; FE_Print(version); lock(); /* * Get the run-time configuration of the virtual memory page size. */ bytesPerPage = Page_Size(); /* * Figure out how many Slot structures to allocate at one time. */ slotCount = slotsPerPage = bytesPerPage / sizeof(Slot); allocationListSize = bytesPerPage; if ( allocationListSize > size ) size = allocationListSize; if ( (slack = size % bytesPerPage) != 0 ) size += bytesPerPage - slack; /* * Allocate memory, and break it up into two malloc buffers. The * first buffer will be used for Slot structures, the second will * be marked free. */ slot = allocationList = (Slot *)DpsMalloc(size); memset((char *)allocationList, 0, allocationListSize); /* * Account for the two slot structures that we've used. */ unUsedSlots = slotCount; release(); #ifdef HAVE_PTHREAD if (!semEnabled) { semEnabled = 1; if (sem_init(&FE_sem, 0, 1) < 0) { semEnabled = 0; } } #endif }
dpsunicode_t *DpsUniRDup(const dpsunicode_t *s) { dpsunicode_t *res; size_t size, len; size = ((len = DpsUniLen(s)) + 1) * sizeof(*s); if((res=(dpsunicode_t*)DpsMalloc(size)) == NULL) return(NULL); { register size_t z; size = len - 1; for (z = 0; z < len; z++) res[z] = s[size - z]; res[len] = 0; } return res; }
static int DoStore(DPS_AGENT *Agent, urlid_t rec_id, Byte *Doc, size_t DocSize, char *Client) { z_stream zstream; DPS_BASE_PARAM P; int rc = DPS_OK; Byte *CDoc = NULL; size_t dbnum = ((size_t)rec_id) % ((Agent->flags & DPS_FLAG_UNOCON) ? Agent->Conf->dbl.nitems : Agent->dbl.nitems); DPS_DB *db = (Agent->flags & DPS_FLAG_UNOCON) ? &Agent->Conf->dbl.db[dbnum] : &Agent->dbl.db[dbnum]; zstream.zalloc = Z_NULL; zstream.zfree = Z_NULL; zstream.opaque = Z_NULL; zstream.next_in = Doc; if (deflateInit2(&zstream, 9, Z_DEFLATED, 15, 9, Z_DEFAULT_STRATEGY) == Z_OK) { zstream.avail_in = DocSize; zstream.avail_out = 2 * DocSize; CDoc = zstream.next_out = (Byte *) DpsMalloc(2 * DocSize + 1); if (zstream.next_out == NULL) { return DPS_ERROR; } deflate(&zstream, Z_FINISH); deflateEnd(&zstream); /* store operations */ bzero(&P, sizeof(P)); P.subdir = "store"; P.basename = "doc"; P.indname = "doc"; P.rec_id = rec_id; P.mode = DPS_WRITE_LOCK; P.NFiles = (db->StoredFiles) ? db->StoredFiles : DpsVarListFindInt(&Agent->Vars, "StoredFiles", 0x100); P.vardir = (db->vardir) ? db->vardir : DpsVarListFindStr(&Agent->Vars, "VarDir", DPS_VAR_DIR); P.A = Agent; if (DpsBaseWrite(&P, CDoc, zstream.total_out) != DPS_OK) { DpsLog(Agent, DPS_LOG_ERROR, "store/doc write error: %s", strerror(errno)); rc = DPS_ERROR; } DpsBaseClose(&P); if (rc == DPS_OK) DpsLog(Agent, DPS_LOG_EXTRA, "[%s] Stored rec_id: %x Size: %d Ratio: %5.2f%%", Client, rec_id, DocSize, 100.0 * zstream.total_out / DocSize); if (Agent->Flags.OptimizeAtUpdate) { DpsBaseOptimize(&P, ((int)rec_id) >> DPS_BASE_BITS); }
/*
 * DpsEnvInit - zero a DPS_ENV and load its built-in defaults.
 *
 *   Conf  environment to initialise, or NULL to have one allocated with
 *         DpsMalloc (in which case freeme is set to 1).
 *
 * Returns the initialised environment, or NULL if the structure cannot be
 * allocated.
 */
DPS_ENV *DpsEnvInit(DPS_ENV *Conf){
#ifdef MECAB
  const char *mecab_argv[8] = {"mecab", "-F", "%m ", "-B", " ", "-E", " ", NULL};
#endif
  const int own_struct = (Conf == NULL);

  if (own_struct) {
    Conf = (DPS_ENV *)DpsMalloc(sizeof(DPS_ENV));
    if (Conf == NULL) return NULL;
  }
  bzero((void*)Conf, sizeof(*Conf));
  if (own_struct) {
    Conf->freeme = 1;
  }

  /* Switches that are on by default. */
  Conf->Flags.OptimizeAtUpdate = 1;
  Conf->Flags.do_excerpt       = 1;
  Conf->Flags.URLInfoSQL       = 1;
  Conf->Flags.SRVInfoSQL       = 1;
  Conf->Flags.CheckInsertSQL   = 1;
  Conf->Flags.mark_for_index   = 1;
  Conf->Flags.rel_nofollow     = 1;
  Conf->Flags.use_meta         = 1;

  /* Numeric defaults. */
  Conf->Flags.PopRankNeoIterations = 3;
  Conf->Flags.GuesserBytes         = 512;
  Conf->Flags.robots_period        = 7 * 24 * 60 * 60; /* one week */
  Conf->Flags.MaxSiteLevel         = 2;
  Conf->Flags.SEASentences         = 32;
  Conf->Flags.SEASentenceMinLength = 64;
  Conf->Flags.PagesInGroup         = 1;
  Conf->Flags.SubDocCnt            = 5;
  Conf->Flags.MaxCrawlDelay        = 300;
  Conf->Flags.bind_addr.sin_family = AF_INET;

  /* Word handling. */
  Conf->WordParam.min_word_len     = 1;
  Conf->WordParam.max_word_len     = 32;
  Conf->WordParam.correct_factor   = 1;
  Conf->WordParam.incorrect_factor = 1;

  Conf->url_number = 0x7FFFFFFF;

  /* Both charset slots start out as latin1. */
  Conf->lcs = DpsGetCharSet("latin1");
  Conf->bcs = DpsGetCharSet("latin1");
  Conf->CharsToEscape = DpsStrdup("\"&<>");

#ifdef MECAB
  Conf->mecab = mecab_new(7, (char**)mecab_argv);
#endif

  return(Conf);
}
/*
 * DpsSearchdSendPacket - send a packet header, optionally followed by its
 * payload, with a single DpsSend() call.
 *
 *   fd    descriptor passed to DpsSend().
 *   hdr   packet header; hdr->len is the payload size in bytes.
 *   data  payload, or NULL to send the header alone.
 *
 * When a payload is given, header and payload are first joined in a
 * temporary buffer.  Returns whatever DpsSend() returns, or 0 when the
 * temporary buffer cannot be allocated.
 */
ssize_t DpsSearchdSendPacket(int fd,const DPS_SEARCHD_PACKET_HEADER *hdr,const void *data){
  ssize_t nsent;
  char *packet;

  if (data == NULL) {
    nsent = DpsSend(fd, hdr, sizeof(*hdr), 0);
    return nsent;
  }

  nsent = 0;
  packet = (char*)DpsMalloc(sizeof(*hdr) + hdr->len);
  if (packet != NULL) {
    char *payload = packet + sizeof(*hdr);

    dps_memcpy(packet, hdr, sizeof(*hdr));
    dps_memcpy(payload, data, hdr->len);
    nsent = DpsSend(fd, packet, sizeof(*hdr) + hdr->len, 0);
  }
  DPS_FREE(packet);
  return nsent;
}
/* * allocateMoreSlots is called when there are only enough slot structures * left to support the allocation of a single malloc buffer. */ static void allocateMoreSlots(void) { size_t newSize = allocationListSize + bytesPerPage; void * newAllocation; void * oldAllocation = allocationList; newAllocation = DpsMalloc(newSize); dps_memmove(newAllocation, allocationList, allocationListSize); memset(&(((char *)newAllocation)[allocationListSize]), 0, bytesPerPage); allocationList = (Slot *)newAllocation; allocationListSize = newSize; slotCount += slotsPerPage; unUsedSlots += slotsPerPage; /* DpsSort(allocationList, slotCount, sizeof(Slot), (qsort_cmp)cmp_Slot);*/ DpsFree(oldAllocation); }
/*
 * DpsUniRegComp - split pattern into tokens and store copies of them in reg.
 *
 *   reg      expression to fill; ntokens and Token are reset first.
 *   pattern  zero-terminated pattern, tokenised with DpsUniRegTok().
 *
 * Each token is copied into its own zero-terminated DpsMalloc'ed string in
 * reg->Token[i].str.
 *
 * Returns DPS_OK, or DPS_ERROR if an allocation fails.  When growing the
 * Token array fails, ntokens is reset to 0.  When a token string cannot be
 * allocated, the tokens stored so far are left in reg (ntokens counts only
 * complete tokens).
 */
static int DpsUniRegComp(DPS_UNIREG_EXP *reg, const dpsunicode_t *pattern) {
  const dpsunicode_t *tok, *lt;

  reg->ntokens = 0;
  reg->Token = NULL;

  /* lt receives the tokenizer position after the returned token. */
  tok = DpsUniRegTok(pattern, &lt);
  while (tok) {
    size_t len;
    dpsunicode_t *copy;

    reg->Token = (DPS_UNIREG_TOK*)DpsRealloc(reg->Token, sizeof(*reg->Token) * (reg->ntokens + 1));
    if (reg->Token == NULL) {
      reg->ntokens = 0;
      return DPS_ERROR;
    }

    len = lt - tok;
    copy = (dpsunicode_t*)DpsMalloc((len + 1) * sizeof(dpsunicode_t));
    reg->Token[reg->ntokens].str = copy;
    if (copy == NULL) {
      /* Do not count the half-built token; earlier ones stay valid. */
      return DPS_ERROR;
    }
    dps_memmove(copy, tok, len * sizeof(dpsunicode_t));
    copy[len] = 0;
    reg->ntokens++;

    tok = DpsUniRegTok(NULL, &lt);
  }
  return DPS_OK;
}
int main(int argc,char **argv, char **envp) { int ch, sleeps = 1, optimize = 0, obi = 0; unsigned int from = 0, to = 0xFFF, p_to = 0; DPS_ENV * Env; const char * config_name = DPS_CONF_DIR "/cached.conf"; DpsInit(argc, argv, envp); /* Initialize library */ DpsInitMutexes(); Env=DpsEnvInit(NULL); if (Env == NULL) exit(1); DpsSetLockProc(Env, DpsLockProc); /*#ifndef HAVE_SETPROCTITLE*/ ARGV = argv; ARGC = argc; /*#endif*/ while ((ch = getopt(argc, argv, "blt:f:op:w:v:h?")) != -1){ switch (ch) { case 'f': sscanf(optarg, "%x", &from); break; case 't': sscanf(optarg, "%x", &p_to); break; case 'w': DpsVarListReplaceStr(&Env->Vars, "VarDir", optarg); break; case 'v': DpsSetLogLevel(NULL, atoi(optarg)); break; case 'b': obi++; break; case 'o': optimize++; break; case 'p': sleeps = atoi(optarg); break; case 'h': case '?': default: usage(); DpsEnvFree(Env); DpsDeInit(); DpsDestroyMutexes(); return 1; break; } } argc -= optind; argv += optind; if(argc > 1) { usage(); DpsEnvFree(Env); DpsDeInit(); DpsDestroyMutexes(); return 1; } else if (argc == 1) { config_name = argv[0]; } { DPS_LOGDEL *del_buf=NULL; size_t del_count = 0, log, bytes, n = 0; int dd, log_fd; struct stat sb; char dname[PATH_MAX] = ""; DPS_BASE_PARAM P; DPS_LOGWORD *log_buf = NULL; DPS_AGENT *Indexer = DpsAgentInit(NULL, Env, 0); log2stderr = 1; if (Indexer == NULL) { fprintf(stderr, "Can't alloc Agent at %s:%d\n", __FILE__, __LINE__); exit(DPS_ERROR); } if(DPS_OK != DpsEnvLoad(Indexer, config_name, (dps_uint8)0)){ fprintf(stderr, "%s\n", DpsEnvErrMsg(Env)); DpsEnvFree(Env); DpsDeInit(); DpsDestroyMutexes(); return DPS_ERROR; } DpsOpenLog("splitter", Env, log2stderr); Indexer->flags = Env->flags = DPS_FLAG_UNOCON; DpsVarListAddLst(&Indexer->Vars, &Env->Vars, NULL, "*"); bzero(&P, sizeof(P)); P.subdir = DPS_TREEDIR; P.basename = "wrd"; P.indname = "wrd"; P.mode = DPS_WRITE_LOCK; P.NFiles = DpsVarListFindInt(&Indexer->Conf->Vars, "WrdFiles", 0x300); P.vardir = DpsStrdup(DpsVarListFindStr(&Indexer->Conf->Vars, 
"VarDir", DPS_VAR_DIR)); P.A = Indexer; if (p_to != 0) to = p_to; else to = P.NFiles - 1; #ifdef HAVE_ZLIB P.zlib_method = Z_DEFLATED; P.zlib_level = 9; P.zlib_windowBits = DPS_BASE_WRD_WINDOWBITS; P.zlib_memLevel = 9; P.zlib_strategy = DPS_BASE_WRD_STRATEGY; #endif /* Open del log file */ dps_snprintf(dname,sizeof(dname),"%s%c%s%cdel-split.log", P.vardir, DPSSLASH, DPS_SPLDIR, DPSSLASH); if((dd = DpsOpen2(dname, O_RDONLY | DPS_BINARY)) < 0) { dps_strerror(NULL, 0, "Can't open del log '%s'", dname); exit(DPS_ERROR); } DpsLog(Indexer, DPS_LOG_DEBUG, "VarDir: %s, WrdFiles: %d [%x]", P.vardir, P.NFiles, P.NFiles); /* Allocate del buffer */ fstat(dd, &sb); if (sb.st_size != 0) { del_buf=(DPS_LOGDEL*)DpsMalloc((size_t)sb.st_size + 1); if (del_buf == NULL) { fprintf(stderr, "Can't alloc %d bytes at %s:%d\n", (int)sb.st_size, __FILE__, __LINE__); exit(0); } del_count=read(dd,del_buf,(size_t)sb.st_size)/sizeof(DPS_LOGDEL); } DpsClose(dd); /* Remove duplicates URLs in DEL log */ /* Keep only oldest records for each URL */ if (del_count > 0) { DpsLog(Indexer, DPS_LOG_DEBUG, "Sorting del_buf: %d items", del_count); if (del_count > 1) DpsSort(del_buf, (size_t)del_count, sizeof(DPS_LOGDEL), DpsCmpurldellog); DpsLog(Indexer, DPS_LOG_DEBUG, "Removing DelLogDups"); del_count = DpsRemoveDelLogDups(del_buf, del_count); } DpsLog(Indexer, DPS_LOG_DEBUG, "Processing Bufs from %d [%x] to %d [%x]", from, from, to, to); for(log = from; log <= to; log++) { /* Open log file */ dps_snprintf(dname, sizeof(dname), "%s%c%s%c%03X-split.log", P.vardir, DPSSLASH, DPS_SPLDIR, DPSSLASH, log); if((log_fd = DpsOpen2(dname, O_RDWR|DPS_BINARY)) < 0){ if (errno == ENOENT) { dps_strerror(Indexer, DPS_LOG_DEBUG, "Can't open '%s'", dname); n = 0; /* continue;*/ } else { dps_strerror(Indexer, DPS_LOG_ERROR, "Can't open '%s'", dname); continue; } } else { DpsWriteLock(log_fd); DpsLog(Indexer, DPS_LOG_DEBUG, "Processing Log: %x", log); fstat(log_fd, &sb); log_buf = (sb.st_size > 0) ? 
(DPS_LOGWORD*)DpsMalloc((size_t)sb.st_size + 1) : NULL; if (log_buf != NULL) { unlink(dname); bytes = read(log_fd,log_buf,(size_t)sb.st_size); (void)ftruncate(log_fd, (off_t)0); DpsUnLock(log_fd); DpsClose(log_fd); n = bytes / sizeof(DPS_LOGWORD); DpsLog(Indexer, DPS_LOG_DEBUG, "Sorting log_buf: %d items", n); if (n > 1) DpsSort(log_buf, n, sizeof(DPS_LOGWORD), (qsort_cmp)DpsCmplog); DpsLog(Indexer, DPS_LOG_DEBUG, "Removing OldWords"); n = DpsRemoveOldWords(log_buf, n, del_buf, del_count); if (n > 1) DpsSort(log_buf, n, sizeof(DPS_LOGWORD), (qsort_cmp)DpsCmplog_wrd); } else { n = 0; DpsUnLock(log_fd); DpsClose(log_fd); } } DpsLog(Indexer, DPS_LOG_DEBUG, "Processing Buf, optimize: %d", optimize); if (obi) DpsBaseOptimize(&P, log); DpsProcessBuf(Indexer, &P, log, log_buf, n, del_buf, del_count); if (optimize) DpsBaseOptimize(&P, log); DpsBaseClose(&P); DPS_FREE(log_buf); DpsLog(Indexer, DPS_LOG_DEBUG, "pas done: %d from %d to %d", log, from, to); DPSSLEEP(sleeps); } DPS_FREE(del_buf); DpsAgentFree(Indexer); DPS_FREE(P.vardir); } fprintf(stderr, "Splitting done.\n"); DpsEnvFree(Env); DpsDeInit(); DpsDestroyMutexes(); #ifdef EFENCE fprintf(stderr, "Memory leaks checking\n"); DpsEfenceCheckLeaks(); #endif #ifdef FILENCE fprintf(stderr, "FD leaks checking\n"); DpsFilenceCheckLeaks(NULL); #endif return 0; }
int _DpsURLParse(DPS_URL *url, const char *str, const char *filename, int line) { #else int DpsURLParse(DPS_URL *url, const char *str) { #endif char *schema,*anchor,*file,*query; char *s; /* size_t len = dps_strlen(str);*/ #ifdef WITH_PARANOIA void * paran = DpsViolationEnter(paran); #endif #ifdef DEBUG_URL fprintf(stderr, " -- %s:%d Parser url: %s\n", filename, line, str); #endif DPS_FREE(url->schema); DPS_FREE(url->specific); DPS_FREE(url->hostinfo); DPS_FREE(url->hostname); DPS_FREE(url->anchor); DPS_FREE(url->auth); url->port=0; url->default_port=0; DPS_FREE(url->path); DPS_FREE(url->directory); DPS_FREE(url->filename); DPS_FREE(url->query_string); /* if(len >= DPS_URLSIZE)return(DPS_URL_LONG); FIXME: Chage this cheking for configured parameter, not DPS_URLSIZE */ s = (char*)DpsStrdup(str); if (s == NULL) { #ifdef WITH_PARANOIA DpsViolationExit(-1, paran); #endif return DPS_ERROR; } url->len = dps_strlen(str); /* Find possible schema end than */ /* Check that it is really schema */ /* It must consist of alphas only */ /* We will take in account digits */ /* also for oracle8:// for example */ /* We must check it because */ /* It might be anchor also */ /* For example: */ /* "mod/index.html#a:1" */ if((schema=strchr(s,':'))){ const char * ch; for(ch=s;ch<schema;ch++){ if(!isalnum(*ch)){ /* Bad character */ /* so it is not schema */ schema=0;break; } } } if(schema){ /* Have scheme - absolute path */ *schema=0; url->schema = (char*)DpsStrdup(s); url->specific = (char*)DpsStrdup(schema + 1); *schema=':'; if(!strcasecmp(url->schema,"http"))url->default_port=80; else if(!strcasecmp(url->schema,"https"))url->default_port=443; else if(!strcasecmp(url->schema,"nntp"))url->default_port=119; else if(!strcasecmp(url->schema,"news"))url->default_port=119; else if(!strcasecmp(url->schema,"ftp"))url->default_port=21; if(!strncmp(url->specific,"//",2)){ char *ss,*hostname; /* Have hostinfo */ if((ss=strchr(url->specific+2,'/'))){ /* Have hostname with path */ *ss=0; 
url->hostinfo = (char*)DpsStrdup(url->specific + 2); *ss='/'; url->path = (char*)DpsStrdup(ss); }else{ /* Hostname without path */ if ((ss = strchr(url->specific + 2, '?'))) { /* Have hostname with parameters */ *ss = 0; url->hostinfo = (char*)DpsStrdup(url->specific + 2); *ss='?'; url->path = (char*)DpsStrdup("/"); }else { url->hostinfo = (char*)DpsStrdup(url->specific + 2); url->path = (char*)DpsStrdup("/"); } } if((hostname=strchr(url->hostinfo,'@'))){ /* Username and password is given */ /* Store auth string user:password */ *hostname=0; url->auth = (char*)DpsStrdup(url->hostinfo); *hostname='@'; hostname++; }else{ hostname = url->hostinfo; } /* FIXME: for(h=hostname;*h;h++){ if( *h>='A' && *h<='Z') *h=(*h)-'A'+'a'; } */ if((ss=strchr(hostname,':'))){ *ss=0; url->hostname = (char*)DpsStrdup(hostname); *ss=':'; url->port=atoi(ss+1); }else{ url->hostname = (char*)DpsStrdup(hostname); url->port=0; } }else{ /* Have not host but have schema */ /* This is possible for: */ /* file: mailto: htdb: news: */ /* As far as we do not need mailto: just ignore it */ if(!strcasecmp(url->schema,"mailto") || !strcasecmp(url->schema,"javascript") || !strcasecmp(url->schema,"feed") ) { DPS_FREE(s); #ifdef WITH_PARANOIA DpsViolationExit(-1, paran); #endif return(DPS_URL_BAD); } else if(!strcasecmp(url->schema,"file")) url->path = (char*)DpsStrdup(url->specific); else if(!strcasecmp(url->schema,"exec")) url->path = (char*)DpsStrdup(url->specific); else if(!strcasecmp(url->schema,"cgi")) url->path = (char*)DpsStrdup(url->specific); else if(!strcasecmp(url->schema,"htdb")) url->path = (char*)DpsStrdup(url->specific); else if(!strcasecmp(url->schema,"news")){ /* Now we will use localhost as NNTP */ /* server as it is not indicated in URL */ url->hostname = (char*)DpsStrdup("localhost"); url->path = (char*)DpsMalloc(dps_strlen(url->specific) + 2); if (url->path == NULL) { DPS_FREE(s); #ifdef WITH_PARANOIA DpsViolationExit(-1, paran); #endif return DPS_ERROR; } 
sprintf(url->path,"/%s",url->specific); url->default_port=119; }else{ /* Unknown strange schema */ DPS_FREE(s); #ifdef WITH_PARANOIA DpsViolationExit(-1, paran); #endif return(DPS_URL_BAD); } } }else{ url->path = (char*)DpsStrdup(s); } /* Cat an anchor if exist */ if((anchor=strstr(url->path,"#")))*anchor=0; /* If path is not full just copy it to filename */ /* i.e. neither /usr/local/ nor c:/windows/temp/ */ if((url->path != NULL) && (url->path[0]!='/') && (url->path[0]!='?') && (url->path[1]!=':')) { /* Relative path */ if(!strncmp(url->path,"./",2)) url->filename = (char*)DpsStrdup(url->path + 2); else url->filename = (char*)DpsStrdup(url->path); url->path[0] = 0; } /* truncate path to query_string */ /* and store query_string */ if((query=strrchr(url->path,'?'))){ url->query_string = (char*)DpsStrdup(query); *(query) = 0; } DpsURLNormalizePath(url->path); /* Now find right '/' sign and copy the rest to filename */ if((file=strrchr(url->path,'/'))&&(strcmp(file,"/"))){ url->filename = (char*)DpsStrdup(file + 1); *(file+1)=0; } /* Now find right '/' sign and copy the rest to directory */ if ((file = strrchr(url->path,'/'))) { char *p_save = file; for(file--; (file > url->path) && (*file != '/'); file--); file++; if (*file) { *p_save = '\0'; url->directory = (char*)DpsStrdup(file); *p_save = '/'; } } DPS_FREE(s); if (url->hostname != NULL) { DpsRTrim(url->hostname, "."); url->domain_level = 1; for (s = url->hostname; *s; s++) { *s = dps_tolower(*s); if (*s == '.') url->domain_level++; if (strchr(",'\";", (int)*s)) { #ifdef WITH_PARANOIA DpsViolationExit(-1, paran); #endif return DPS_URL_BAD; } } } if (url->hostinfo != NULL) { DpsRTrim(url->hostinfo, "."); s = strchr(url->hostinfo, '@'); for (s = (s == NULL) ? 
url->hostinfo : s + 1; *s; s++) *s = dps_tolower(*s); } if (url->schema != NULL) for (s = url->schema; *s; s++) *s = dps_tolower(*s); /* fprintf(stderr, "url: .path: %s port:%d\n", url->path, url->port);*/ #ifdef WITH_PARANOIA DpsViolationExit(-1, paran); #endif return DPS_OK; }
/*
 * DpsChineseListLoad - load a frequency dictionary file into List.
 *
 *   Agent    agent handle; supplies configuration and logging.
 *   List     word list that receives the entries.
 *   charset  name of the character set the file is written in.
 *   fname    file name; unless it starts with '/', it is taken relative to
 *            the configured EtcDir.
 *
 * The whole file is read into memory and processed line by line.  Empty
 * lines and lines starting with '#' are skipped; every other line is
 * expected to hold "<frequency> <word>".  Lines that do not yield both
 * fields are skipped.  After loading, the list is sorted and entries that
 * compare equal under cmpchinese() are merged by summing their frequencies.
 *
 * Returns DPS_OK, or DPS_ERROR if the charset is unknown or the file cannot
 * be stat'ed, opened, allocated for, or read completely.
 */
int DpsChineseListLoad(DPS_AGENT *Agent, DPS_CHINALIST *List, const char *charset, const char *fname) {
  struct stat sb;
  char *str, *data = NULL, *cur_n = NULL;
  DPS_CHINAWORD chinaword;
  char word[PATH_MAX];
  dpsunicode_t uword[256];
  DPS_CHARSET *sys_int, *fcs;
  DPS_CONV to_uni;
  int fd;
  char savebyte = '\0';

  sys_int = DpsGetCharSet("sys-int");
  if (!(fcs = DpsGetCharSet(charset))) {
    if (Agent->Conf->is_log_open) DpsLog(Agent, DPS_LOG_ERROR, "Charset '%s' not found or not supported", charset);
    else fprintf(stderr, "Charset '%s' not found or not supported", charset);
    return DPS_ERROR;
  }
  DpsConvInit(&to_uni, fcs, sys_int, Agent->Conf->CharsToEscape, DPS_RECODE_HTML);

  if (*fname != '/') {
    dps_snprintf(word, sizeof(word), "%s/%s", DpsVarListFindStr(&Agent->Conf->Vars, "EtcDir", DPS_CONF_DIR), fname);
    fname = word;
  }

  if (stat(fname, &sb)) {
    if (Agent->Conf->is_log_open) DpsLog(Agent, DPS_LOG_ERROR, "Unable to stat FreqDic file '%s': %s", fname, strerror(errno));
    else fprintf(stderr, "Unable to stat FrecDic file '%s': %s", fname, strerror(errno));
    return DPS_ERROR;
  }
  /* Only a negative descriptor signals failure; 0 is a valid descriptor. */
  if ((fd = open(fname, O_RDONLY)) < 0) {
    if (Agent->Conf->is_log_open) DpsLog(Agent, DPS_LOG_ERROR, "Unable to open FreqDic file '%s': %s", fname, strerror(errno));
    else fprintf(stderr, "Unable to open FreqDic file '%s': %s", fname, strerror(errno));
    return DPS_ERROR;
  }
  if ((data = (char*)DpsMalloc(sb.st_size + 1)) == NULL) {
    if (Agent->Conf->is_log_open) DpsLog(Agent, DPS_LOG_ERROR, "Unable to alloc %d bytes", (int)sb.st_size);
    else fprintf(stderr, "Unable to alloc %ld bytes", (long)sb.st_size);
    close(fd);
    return DPS_ERROR;
  }
  if (read(fd, data, sb.st_size) != (ssize_t)sb.st_size) {
    if (Agent->Conf->is_log_open) DpsLog(Agent, DPS_LOG_ERROR, "Unable to read FreqDic file '%s': %s", fname, strerror(errno));
    else fprintf(stderr, "Unable to read FreqDic file '%s': %s", fname, strerror(errno));
    DPS_FREE(data);
    close(fd);
    return DPS_ERROR;
  }
  data[sb.st_size] = '\0';

  /* Lines are isolated in place: the byte following each newline is saved, */
  /* replaced by a terminator, and restored when the next line is started.  */
  str = data;
  cur_n = strchr(str, NL_INT);
  if (cur_n != NULL) {
    cur_n++;
    savebyte = *cur_n;
    *cur_n = '\0';
  }
  close(fd);

  bzero((void*)&chinaword, sizeof(chinaword));
  chinaword.word = uword;

  while(str != NULL) {
    if(!str[0]) goto loop_continue;
    if(str[0]=='#') goto loop_continue;

    /* Skip lines that do not provide both a frequency and a word, so that */
    /* stale buffer contents are never added to the list.                  */
    if (sscanf(str, "%d %63s ", &chinaword.freq, word ) != 2) goto loop_continue;
    DpsConv(&to_uni, (char*)uword, sizeof(uword), word, sizeof(word));
    DpsChineseListAdd(List, &chinaword);

  loop_continue:
    str = cur_n;
    if (str != NULL) {
      *str = savebyte;
      cur_n = strchr(str, NL_INT);
      if (cur_n != NULL) {
        cur_n++;
        savebyte = *cur_n;
        *cur_n = '\0';
      }
    }
  }
  DPS_FREE(data);

  DpsChineseListSort(List);

  /* Merge duplicates.  j is the last slot of the compacted prefix; each  */
  /* new distinct entry is moved down next to it so that the first j + 1  */
  /* slots hold exactly one entry per distinct word.                      */
  if (List->nwords > 1) {
    register size_t i, j = 0;

    for (i = 1; i < List->nwords; i++) {
      if (cmpchinese(&List->ChiWord[j], &List->ChiWord[i]) == 0) {
        List->ChiWord[j].freq += List->ChiWord[i].freq;
        DPS_FREE(List->ChiWord[i].word);
        List->ChiWord[i].word = NULL;
      } else {
        j++;
        if (j != i) {
          List->ChiWord[j] = List->ChiWord[i];
          /* Ownership of the word moved to slot j. */
          List->ChiWord[i].word = NULL;
        }
      }
    }
    List->nwords = j + 1;
  }

  return DPS_OK;
}
/*
 * RelLink - build the absolute URL text for newURL, resolved against curURL.
 *
 *   Indexer           agent handle; supplies variables, reverse aliases, logging.
 *   curURL            URL of the current document; supplies every component
 *                     that newURL lacks.
 *   newURL            parsed link.  Its charset_id is set from curURL when it
 *                     has no hostinfo.
 *   str               receives a newly allocated string with the result.
 *                     On an early allocation failure *str is left NULL or
 *                     untouched.
 *   ReverseAliasFlag  when non-zero, the ReverseAliasProg program (if
 *                     configured) and the ReverseAliases match list are
 *                     applied to the result.
 *
 * For ftp:// results a trailing ";type=..." part is cut off.
 */
void RelLink(DPS_AGENT *Indexer, DPS_URL *curURL, DPS_URL *newURL, char **str, int ReverseAliasFlag) {
  const char *schema = newURL->schema ? newURL->schema : curURL->schema;
  const char *hostname = newURL->hostname ? newURL->hostname : curURL->hostname;
  const char *auth = newURL->auth ? newURL->auth : curURL->auth;
  const char *path = (newURL->path && newURL->path[0]) ? newURL->path : curURL->path;
  const char *fname = ((newURL->filename && newURL->filename[0]) || (newURL->path && newURL->path[0])) ?
    newURL->filename : curURL->filename;
  const char *query_string = newURL->query_string;
  char *pathfile = (char*)DpsMalloc(dps_strlen(DPS_NULL2EMPTY(path)) + dps_strlen(DPS_NULL2EMPTY(fname)) +
                                    dps_strlen(DPS_NULL2EMPTY(query_string)) + 5);
  int cascade;
  DPS_MATCH *Alias;
  char *alias = NULL;
  size_t aliassize, nparts = 10;
  DPS_MATCH_PART Parts[10];

  if (newURL->hostinfo == NULL) newURL->charset_id = curURL->charset_id;

  if (pathfile == NULL) return;

  /* pathfile = "/" path fname query_string, then normalised */
  pathfile[0] = '/';
  dps_strcpy(pathfile + 1, DPS_NULL2EMPTY(path));
  dps_strcat(pathfile, DPS_NULL2EMPTY(fname));
  dps_strcat(pathfile, DPS_NULL2EMPTY(query_string));

  DpsURLNormalizePath(pathfile);

  if (!strcasecmp(DPS_NULL2EMPTY(schema), "mailto")
      || !strcasecmp(DPS_NULL2EMPTY(schema), "javascript")
      || !strcasecmp(DPS_NULL2EMPTY(schema), "feed")
      ) {
    /* schema ":" specific */
    *str = (char*)DpsMalloc(dps_strlen(DPS_NULL2EMPTY(schema)) + dps_strlen(DPS_NULL2EMPTY(newURL->specific)) + 4);
    if (*str == NULL) {
      DPS_FREE(pathfile);   /* do not leak the scratch buffer on failure */
      return;
    }
    dps_strcpy(*str, DPS_NULL2EMPTY(schema));
    dps_strcat(*str, ":");
    dps_strcat(*str, DPS_NULL2EMPTY(newURL->specific));
  } else if(!strcasecmp(DPS_NULL2EMPTY(schema), "htdb")) {
    /* schema ":" pathfile */
    *str = (char*)DpsMalloc(dps_strlen(DPS_NULL2EMPTY(schema)) + dps_strlen(pathfile) + 4);
    if (*str == NULL) {
      DPS_FREE(pathfile);
      return;
    }
    dps_strcpy(*str, DPS_NULL2EMPTY(schema));
    dps_strcat(*str, ":");
    dps_strcat(*str, pathfile);
  }else{
    /* schema "://" [auth "@"] hostname pathfile */
    *str = (char*)DpsMalloc(dps_strlen(DPS_NULL2EMPTY(schema)) + dps_strlen(pathfile) +
                            dps_strlen(DPS_NULL2EMPTY(hostname)) + dps_strlen(DPS_NULL2EMPTY(auth)) + 8);
    if (*str == NULL) {
      DPS_FREE(pathfile);
      return;
    }
    dps_strcpy(*str, DPS_NULL2EMPTY(schema));
    dps_strcat(*str, "://");
    if (auth) {
      dps_strcat(*str, auth);
      dps_strcat(*str,"@");
    }
    dps_strcat(*str, DPS_NULL2EMPTY(hostname));
    dps_strcat(*str, pathfile);
  }

  if(!strncmp(*str, "ftp://", 6) && (strstr(*str, ";type=")))
    *(strstr(*str, ";type")) = '\0';
  DPS_FREE(pathfile);

  if (ReverseAliasFlag) {
    const char *alias_prog = DpsVarListFindStr(&Indexer->Vars, "ReverseAliasProg", NULL);

    if (alias_prog) {
      int result;
      aliassize = 256 + 2 * dps_strlen(*str);
      alias = (char*)DpsRealloc(alias, aliassize);
      if (alias == NULL) {
        DpsLog(Indexer, DPS_LOG_ERROR, "No memory (%d bytes). %s line %d", (int)aliassize, __FILE__, __LINE__);
        goto ret;
      }
      alias[0] = '\0';
      result = DpsAliasProg(Indexer, alias_prog, *str, alias, aliassize - 1);
      DpsLog(Indexer, DPS_LOG_EXTRA, "ReverseAliasProg result: '%s'", alias);
      if(result != DPS_OK) goto ret;
      DPS_FREE(*str);
      *str = (char*)DpsStrdup(alias);
      /* Nothing further can be matched against a missing string. */
      if (*str == NULL) goto ret;
    }

    /* Apply matching reverse aliases repeatedly, with a hard cap on the cascade depth. */
    for(cascade = 0; ((Alias=DpsMatchListFind(&Indexer->Conf->ReverseAliases,*str,nparts,Parts))) && (cascade < 1024); cascade++) {
      aliassize = dps_strlen(Alias->arg) + dps_strlen(Alias->pattern) + dps_strlen(*str) + 128;
      alias = (char*)DpsRealloc(alias, aliassize);
      if (alias == NULL) {
        DpsLog(Indexer, DPS_LOG_ERROR, "No memory (%d bytes). %s line %d", (int)aliassize, __FILE__, __LINE__);
        goto ret;
      }
      DpsMatchApply(alias,aliassize,*str,Alias->arg,Alias,nparts,Parts);
      if(alias[0]){
        DpsLog(Indexer,DPS_LOG_DEBUG,"ReverseAlias%d: pattern:%s, arg:%s -> '%s'", cascade, Alias->pattern, Alias->arg, alias);
        DPS_FREE(*str);
        *str = (char*)DpsStrdup(alias);
        if (*str == NULL) goto ret;
      } else break;
      if (Alias->last) break;
    }
  }

ret:
  DPS_FREE(alias);
}
/*
 * DpsSegmentByFreq - build a space-separated, segmented copy of line.
 *
 *   List  frequency dictionary handed to DpsSegmentProcess().
 *   line  zero-terminated input text; it is only read.
 *
 * The text is copied to a scratch buffer and split with DpsUniGetSepToken().
 * Tokens whose first character has a DpsUniCType() value above
 * DPS_UNI_BUKVA, or equal to 1 or 2, are appended unchanged; all other
 * tokens are first passed through DpsSegmentProcess().  Pieces are joined
 * with a single space.
 *
 * Returns a zero-terminated buffer from DpsMalloc/DpsRealloc, or NULL if an
 * allocation fails or DpsSegmentProcess() returns NULL.
 */
dpsunicode_t *DpsSegmentByFreq(DPS_CHINALIST *List, dpsunicode_t *line) {
  dpsunicode_t *out, *mid, *last, *sentence, *segmented_sentence, part;
  size_t i, j, l, a;
  const size_t line_len = DpsUniLen(line);  /* measured once, not per iteration */
  int ctype, have_bukva_forte, fb_type;
  dpsunicode_t space[] = { 32, 0 };

  l = 2 * (line_len + 1);
  if (l < 2) return NULL;
  out = (dpsunicode_t*)DpsMalloc(l * sizeof(dpsunicode_t));
  if (out == NULL) return NULL;
  *out = '\0';
  mid = (dpsunicode_t*)DpsXmalloc(l * sizeof(dpsunicode_t));
  if (mid == NULL) { DPS_FREE(out); return NULL; }
  *mid = '\0';

  /* Work on a private copy: the tokenizer loop below writes terminators into it. */
  for (i = j = 0; i < line_len; i++) {
    mid[j++] = line[i];
  }
  mid[j] = 0;  /* explicit terminator; j < l always holds here */

  for (sentence = DpsUniGetSepToken(mid, &last, &ctype, &have_bukva_forte, 0);
       sentence;
       sentence = DpsUniGetSepToken(NULL, &last, &ctype, &have_bukva_forte, 0)) {
    /* Terminate the token in place; the overwritten character is restored below. */
    part = *last;
    *last = 0;
    fb_type = DpsUniCType(*sentence);

    if (fb_type > DPS_UNI_BUKVA || fb_type == 2 || fb_type == 1) {
      a = 2 * (DpsUniLen(sentence) + 1);
      j = DpsUniLen(out);
      if (j + a >= l) {
        l = j + a + 1;
        out = (dpsunicode_t*)DpsRealloc(out, l * sizeof(dpsunicode_t));
        if (out == NULL) {
          DPS_FREE(mid);
          return NULL;
        }
      }
      if (*out) DpsUniStrCat(out, space);
      DpsUniStrCat(out, sentence);
    } else {
      if ((segmented_sentence = DpsSegmentProcess(List, sentence)) != NULL) {
        a = 2 * (DpsUniLen(segmented_sentence) + 1);
        j = DpsUniLen(out);
        if (j + a >= l) {
          l = j + a + 1;
          out = (dpsunicode_t*)DpsRealloc(out, l * sizeof(dpsunicode_t));
          if (out == NULL) {
            DPS_FREE(mid);
            return NULL;
          }
        }
        if (*out) DpsUniStrCat(out, space);
        DpsUniStrCat(out, segmented_sentence);
        DPS_FREE(segmented_sentence);
      } else {
        /* Both work buffers are released, not only the scratch copy. */
        DPS_FREE(mid);
        DPS_FREE(out);
        return NULL;
      }
    }
    *last = part;
  }
  DPS_FREE(mid);
  return out;
}
/*
 * Ask searchd for the clones of `Doc` and append them to `Res`.
 *
 * The value of the document's "DP_ID" section ("0" when absent) is sent in a
 * DPS_SEARCHD_CMD_CLONES packet, then one reply is read:
 *   - DPS_SEARCHD_CMD_ERROR:   the text is stored in Indexer->Conf->errstr
 *                              and DPS_ERROR is returned;
 *   - DPS_SEARCHD_CMD_DOCINFO: unless the payload is "nocloneinfo", every
 *                              CR/LF separated line is parsed with
 *                              DpsDocFromTextBuf() into a new Res->Doc entry;
 *   - anything else:           DPS_ERROR with a message in errstr.
 *
 * Returns DPS_OK or DPS_ERROR.
 */
int DpsCloneListSearchd(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_RESULT *Res, DPS_DB *db) {
  DPS_SEARCHD_PACKET_HEADER hdr;
  ssize_t nrecv;
  char *msg = NULL, *dinfo = NULL;
  char *tok, *lt;
  char buf[128];
  int done = 0;
  int rc = DPS_OK;

  TRACE_IN(Indexer, "DpsCloneListSearchd");

  dps_snprintf(buf, sizeof(buf), "%s", DpsVarListFindStr(&Doc->Sections, "DP_ID", "0"));
  hdr.cmd = DPS_SEARCHD_CMD_CLONES;
  hdr.len = dps_strlen(buf);
  /* NOTE(review): the send result is not checked, as before; a failed send
     surfaces as an incomplete header below. */
  (void)DpsSearchdSendPacket(db->searchd, &hdr, buf);

  while (!done) {
    nrecv = DpsRecvall(db->searchd, &hdr, sizeof(hdr), 360);
    if (nrecv != sizeof(hdr)) {
      DpsLog(Indexer, DPS_LOG_ERROR, "Received incomplete header from searchd (%d bytes)", (int)nrecv);
      TRACE_OUT(Indexer);
      return(DPS_ERROR);
    }
#ifdef DEBUG_SDP
    DpsLog(Indexer, DPS_LOG_DEBUG, "Received header cmd=%d len=%d\n", hdr.cmd, hdr.len);
#endif

    switch (hdr.cmd) {
    case DPS_SEARCHD_CMD_ERROR:
      msg = (char*)DpsMalloc(hdr.len + 1);
      if (msg == NULL) {
        /* Do not report success when the reply could not even be read. */
        sprintf(Indexer->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(db->searchd, msg, hdr.len, 360);
      msg[(nrecv >= 0) ? nrecv : 0] = '\0';
      /* msg comes from the network: bound the copy so an oversized error
         text cannot overrun errstr.  Assumes errstr is a char array member,
         so that sizeof yields its capacity -- TODO confirm. */
      dps_snprintf(Indexer->Conf->errstr, sizeof(Indexer->Conf->errstr), "Searchd error: '%s'", msg);
      Indexer->Conf->errstr[sizeof(Indexer->Conf->errstr) - 1] = '\0';
      rc = DPS_ERROR;
      DPS_FREE(msg);
      done = 1;
      break;

    case DPS_SEARCHD_CMD_DOCINFO:
      dinfo = (char*)DpsMalloc(hdr.len + 1);
      if (dinfo == NULL) {
        sprintf(Indexer->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(db->searchd, dinfo, hdr.len, 360);
      dinfo[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
      DpsLog(Indexer, DPS_LOG_DEBUG, "Received DOCINFO size=%d buf=%s\n", hdr.len, dinfo);
#endif
      if (strcasecmp(dinfo, "nocloneinfo") != 0) {
        /* One clone document per CR/LF separated line; `lt` carries the
           tokenizer state between calls. */
        tok = dps_strtok_r(dinfo, "\r\n", &lt, NULL);
        while (tok) {
          DPS_DOCUMENT *D;
          size_t nd = Res->num_rows;

          /* Same size as before: room for the new entry plus one spare. */
          Res->Doc = (DPS_DOCUMENT*)DpsRealloc(Res->Doc, (nd + 2) * sizeof(DPS_DOCUMENT));
          if (Res->Doc == NULL) {
            /* The array is gone: keep the row count consistent with it. */
            Res->num_rows = 0;
            sprintf(Indexer->Conf->errstr, "Realloc error");
            rc = DPS_ERROR;
            break;
          }
          Res->num_rows = nd + 1;
          D = &Res->Doc[nd];
          DpsDocInit(D);
          DpsDocFromTextBuf(D, tok);
          tok = dps_strtok_r(NULL, "\r\n", &lt, NULL);
        }
      }
      DPS_FREE(dinfo);
      done = 1;
      break;

    default:
      sprintf(Indexer->Conf->errstr, "Unknown searchd response: cmd=%d len=%d", hdr.cmd, hdr.len);
      rc = DPS_ERROR;
      done = 1;
      break;
    }
  }

  TRACE_OUT(Indexer);
  return rc;
}
/*
 * Build the limit index files for every configuration variable whose name
 * starts with "Limit-" (case-insensitive).
 *
 * Only the variable bucket at index 'l' of Indexer->Conf->Vars is scanned
 * (presumably variables are bucketed by first letter -- TODO confirm).  The
 * variable's value selects the kind of index:
 *   category                               -> nested (8-byte) index;
 *   tag, link, time, hostname, language,
 *   content, siteid                        -> linear (4-byte) index;
 *   anything else                          -> user-defined limit, built from
 *       the SQL request stored in "Req-<name>", optionally against the
 *       database named by "dbaddr-<name>".  Nothing is built when there is
 *       no "Req-<name>".
 *
 * Returns DPS_OK, or DPS_ERROR when the scratch buffer for a user-defined
 * limit cannot be allocated.  Failures of the individual limit builders are
 * not propagated (unchanged behaviour).
 */
__C_LINK int __DPSCALL DpsCacheMakeIndexes(DPS_AGENT *Indexer, DPS_DB *db) {
  DPS_UINT8URLIDLIST L8;
  DPS_UINT4URLIDLIST L4;
  DPS_VARLIST *v = &Indexer->Conf->Vars;
  size_t i, r;
  char *ind, *nm, *lfname;

  bzero(&L4, sizeof(DPS_UINT4URLIDLIST));
  bzero(&L8, sizeof(DPS_UINT8URLIDLIST));

  r = (size_t) 'l';
  for (i = 0; i < v->Root[r].nvars; i++) {
    if (strncasecmp("Limit-", v->Root[r].Var[i].name, 6)) continue;

    ind = v->Root[r].Var[i].val;
    lfname = v->Root[r].Var[i].name;
    nm = lfname + 6;                 /* limit name without the "Limit-" prefix */

    /* The dps_setproctitle() calls make the current step visible in "ps"
       output on xBSD. */
    if (!strcasecmp(ind, "category")) {
      dps_setproctitle("[%d] Category index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating category index");
      if (DPS_OK == DpsLimit8(Indexer, &L8, "Category", DPS_IFIELD_TYPE_HEX8STR, db)) {
        MakeNestedIndex(Indexer, &L8, DPS_LIMFNAME_CAT, db);
      }
    } else if (!strcasecmp(ind, "tag")) {
      dps_setproctitle("[%d] Tag index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating tag index");
      if (DPS_OK == DpsLimit4(Indexer, &L4, "Tag", DPS_IFIELD_TYPE_STRCRC32, db)) {
        MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_TAG, db);
      }
    } else if (!strcasecmp(ind, "link")) {
      dps_setproctitle("[%d] Link index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating link index");
      if (DPS_OK == DpsLimit4(Indexer, &L4, "link", DPS_IFIELD_TYPE_INT, db)) {
        MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_LINK, db);
      }
    } else if (!strcasecmp(ind, "time")) {
      dps_setproctitle("[%d] Time index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating time index");
      if (DPS_OK == DpsLimit4(Indexer, &L4, "last_mod_time", DPS_IFIELD_TYPE_HOUR, db)) {
        MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_TIME, db);
      }
    } else if (!strcasecmp(ind, "hostname")) {
      dps_setproctitle("[%d] Hostname index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating hostname index");
      if (DPS_OK == DpsLimit4(Indexer, &L4, "url", DPS_IFIELD_TYPE_HOSTNAME, db)) {
        MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_HOST, db);
      }
    } else if (!strcasecmp(ind, "language")) {
      dps_setproctitle("[%d] Language index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating language index");
      if (DPS_OK == DpsLimit4(Indexer, &L4, "Content-Language", DPS_IFIELD_TYPE_STR2CRC32, db)) {
        MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_LANG, db);
      }
    } else if (!strcasecmp(ind, "content")) {
      dps_setproctitle("[%d] Content-Type index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating Content-Type index");
      if (DPS_OK == DpsLimit4(Indexer, &L4, "Content-Type", DPS_IFIELD_TYPE_STRCRC32, db)) {
        MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_CTYPE, db);
      }
    } else if (!strcasecmp(ind, "siteid")) {
      dps_setproctitle("[%d] Site_id index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating Site_id index");
      if (DPS_OK == DpsLimit4(Indexer, &L4, "site_id", DPS_IFIELD_TYPE_INT, db)) {
        MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_SITE, db);
      }
    } else {
      /* User-defined limit driven by an SQL request. */
      char *buf, *req, *dbaddr;
      DPS_DB ldb, *pdb = &ldb;
      size_t buf_len = dps_strlen(nm) + 16;   /* fits "dbaddr-" + name + NUL */

      if ((buf = (char*) DpsMalloc(buf_len * sizeof(char))) == NULL) {
        DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d chars at %s:%d", (int)buf_len, __FILE__, __LINE__);
        return DPS_ERROR;
      }

      dps_setproctitle("[%d] %s index creation", Indexer->handle, nm);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating %s index", nm);

      dps_snprintf(buf, buf_len, "Req-%s", nm);
      req = DpsVarListFindStr(&Indexer->Conf->Vars, buf, NULL);
      if (req != NULL) {
        dps_snprintf(buf, buf_len, "dbaddr-%s", nm);
        dbaddr = DpsVarListFindStr(&Indexer->Conf->Vars, buf, NULL);
        if (dbaddr != NULL) {
          /* Start from a zeroed descriptor instead of stack garbage.
             NOTE(review): nothing releases what DpsDBSetAddr() stores in
             ldb; the matching cleanup routine is not visible here. */
          bzero(&ldb, sizeof(ldb));
          DpsDBSetAddr(pdb, dbaddr, DPS_OPEN_MODE_READ);
        } else {
          pdb = db;
        }

        /* "nex8str" is the spelling this code has always tested; it looks
           like a typo for "hex8str" (compare DPS_IFIELD_TYPE_HEX8STR), so
           both spellings are accepted. */
        if (!strcasecmp(ind, "hex8str") || !strcasecmp(ind, "nex8str")) {
          if (DPS_OK == DpsSQLLimit8(Indexer, &L8, req, DPS_IFIELD_TYPE_HEX8STR, pdb)) {
            MakeNestedIndex(Indexer, &L8, lfname, pdb);
          }
        } else {
          int field_type = DPS_IFIELD_TYPE_INT;

          if (!strcasecmp(ind, "strcrc32")) field_type = DPS_IFIELD_TYPE_STRCRC32;
          else if (!strcasecmp(ind, "hour")) field_type = DPS_IFIELD_TYPE_HOUR;
          else if (!strcasecmp(ind, "hostname")) field_type = DPS_IFIELD_TYPE_HOSTNAME;
          else if (!strcasecmp(ind, "char2")) field_type = DPS_IFIELD_TYPE_STR2CRC32;
          else if (!strcasecmp(ind, "int")) field_type = DPS_IFIELD_TYPE_INT;

          /* NOTE(review): this branch queries `db`, not `pdb`, so a
             configured "dbaddr-<name>" is ignored for 4-byte limits while
             the 8-byte branch above honours it.  Left unchanged; confirm
             which database is intended. */
          if (DPS_OK == DpsSQLLimit4(Indexer, &L4, req, field_type, db)) {
            MakeLinearIndex(Indexer, &L4, lfname, db);
          }
        }
      }

      DPS_FREE(buf);                 /* was leaked once per user-defined limit */
    }

    dps_setproctitle("[%d] Indexes done.", Indexer->handle);
    DpsLog(Indexer, DPS_LOG_EXTRA, "Done");
  }

  return DPS_OK;
}
/*
 * Forward a URL action to searchd.  Only DPS_URL_ACTION_DOCCOUNT is
 * supported; any other `cmd` is rejected with DPS_ERROR.
 *
 * `db` must point to the DPS_DB describing the searchd connection.  `D` is
 * not used.  The command is sent in a DPS_SEARCHD_CMD_URLACTION packet and
 * replies are read until one of these arrives:
 *   - DPS_SEARCHD_CMD_ERROR:    text copied to A->Conf->errstr, DPS_ERROR;
 *   - DPS_SEARCHD_CMD_DOCCOUNT: the int payload is added to A->doccount;
 *   - an unknown command:       DPS_ERROR with a message in errstr.
 * DPS_SEARCHD_CMD_MESSAGE packets are read and discarded.
 *
 * Returns DPS_OK or DPS_ERROR.
 */
int __DPSCALL DpsSearchdURLAction(DPS_AGENT *A, DPS_DOCUMENT *D, int cmd, void *db) {
  DPS_DB *searchd = db;
  DPS_SEARCHD_PACKET_HEADER hdr;
  char *buf;
  ssize_t nrecv;
  int done = 0;
  char *msg = NULL;
  char *dinfo = NULL;
  int rc = DPS_OK;

  TRACE_IN(A, "DpsSearchdURLAction");

  if (cmd != DPS_URL_ACTION_DOCCOUNT) {
    DpsLog(A, DPS_LOG_ERROR, "searchd: unsupported URL action");
    TRACE_OUT(A);
    return DPS_ERROR;
  }

  hdr.cmd = DPS_SEARCHD_CMD_URLACTION;
  hdr.len = sizeof(int);
  if ((buf = (char*)DpsMalloc(hdr.len + 1)) == NULL) {
    DpsLog(A, DPS_LOG_ERROR, "Out of memory");
    TRACE_OUT(A);
    return DPS_ERROR;
  }
  *((int*)buf) = cmd;
  /* NOTE(review): the send result is not checked, as before; a failed send
     surfaces as an incomplete header below. */
  (void)DpsSearchdSendPacket(searchd->searchd, &hdr, buf);
  DPS_FREE(buf);

  while (!done) {
    nrecv = DpsRecvall(searchd->searchd, &hdr, sizeof(hdr), 360);
    if (nrecv != sizeof(hdr)) {
      DpsLog(A, DPS_LOG_ERROR, "Received incomplete header from searchd (%d bytes)", (int)nrecv);
      TRACE_OUT(A);
      return(DPS_ERROR);
    }
#ifdef DEBUG_SDP
    DpsLog(A, DPS_LOG_ERROR, "Received header cmd=%d len=%d\n", hdr.cmd, hdr.len);
#endif

    switch (hdr.cmd) {
    case DPS_SEARCHD_CMD_ERROR:
      msg = (char*)DpsMalloc(hdr.len + 1);
      if (msg == NULL) {
        sprintf(A->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(searchd->searchd, msg, hdr.len, 360);
      msg[(nrecv >= 0) ? nrecv : 0] = '\0';
      /* msg comes from the network: bound the copy so an oversized error
         text cannot overrun errstr.  Assumes errstr is a char array member,
         so that sizeof yields its capacity -- TODO confirm. */
      dps_snprintf(A->Conf->errstr, sizeof(A->Conf->errstr), "Searchd error: '%s'", msg);
      A->Conf->errstr[sizeof(A->Conf->errstr) - 1] = '\0';
      /* An error reply is a failure; this used to return DPS_OK. */
      rc = DPS_ERROR;
      DPS_FREE(msg);
      done = 1;
      break;

    case DPS_SEARCHD_CMD_MESSAGE:
      msg = (char*)DpsMalloc(hdr.len + 1);
      if (msg == NULL) {
        sprintf(A->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(searchd->searchd, msg, hdr.len, 360);
      msg[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
      DpsLog(A, DPS_LOG_ERROR, "Message from searchd: '%s'\n", msg);
#endif
      DPS_FREE(msg);
      break;

    case DPS_SEARCHD_CMD_DOCCOUNT:
      dinfo = (char*)DpsMalloc(hdr.len + 1);
      if (dinfo == NULL) {
        sprintf(A->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(searchd->searchd, dinfo, hdr.len, 360);
      dinfo[(nrecv >= 0) ? nrecv : 0] = '\0';
      if (nrecv >= (ssize_t)sizeof(int)) {
        A->doccount += *((int *)dinfo);
      } else {
        /* A short or failed read would add uninitialised bytes to the
           counter; report it instead. */
        sprintf(A->Conf->errstr, "Incomplete DOCCOUNT reply from searchd (%d bytes)", (int)nrecv);
        rc = DPS_ERROR;
      }
#ifdef DEBUG_SDP
      DpsLog(A, DPS_LOG_DEBUG, "Received DOCCOUNT size=%d doccount=%d(+%s)\n", hdr.len, A->doccount, dinfo);
#endif
      DPS_FREE(dinfo);
      done = 1;
      break;

    default:
      sprintf(A->Conf->errstr, "Unknown searchd response: cmd=%d len=%d", hdr.cmd, hdr.len);
      rc = DPS_ERROR;
      done = 1;
      break;
    }
  }

  TRACE_OUT(A);
  return rc;
}
/*
 * Release the work arrays of DpsSegmentProcess(): the first `count` strings
 * in result[] (when result is not NULL) and then the four arrays.
 */
static void DpsSegmentFreePaths(int *position, int *next, double *value, dpsunicode_t **result, int count) {
  int n;

  if (result != NULL) {
    for (n = 0; n < count; n++) {
      DPS_FREE(result[n]);
    }
  }
  DPS_FREE(position);
  DPS_FREE(next);
  DPS_FREE(value);
  DPS_FREE(result);
}

/*
 * Segment `line` into dictionary words using the frequencies in `List`.
 *
 * When List->hash is NULL the line is returned as a plain copy.  Otherwise
 * candidate segmentations ("paths") are kept in parallel arrays indexed by
 * path id:
 *   position[id]  number of characters of `line` the path has consumed,
 *   value[id]     score of the path,
 *   result[id]    text built so far (each word preceded by a space),
 *   next[id]      id of the next path in a singly linked list, -1 at the end,
 * with `top` the id of the first path in the list.  Each round takes the
 * first path that has not consumed the whole line, unlinks it, and extends
 * it by every prefix of the remaining text (longest first, bounded by
 * List->hash[first character]) that DpsChineseListFind() knows with a
 * non-zero frequency.  A single character that is not in the dictionary is
 * added to it with frequency 1.  The loop ends when the first path covers
 * the whole line and is the only one left.
 *
 * Returns a newly allocated string owned by the caller, or NULL on
 * allocation failure.
 */
static dpsunicode_t *DpsSegmentProcess(DPS_CHINALIST *List, dpsunicode_t *line) {
  int top, nextid, *position, *next, len, maxid, i, current, father, needinsert, iindex;
  unsigned int h;
  double *value, p;
  dpsunicode_t **result;
  dpsunicode_t *otv, space[] = {32, 0};
  DPS_CHINAWORD *chinaword, chiw;

  if (List->hash == NULL) {
    return (dpsunicode_t*)DpsUniDup(line);
  }

  len = DpsUniLen(line);
  maxid = 2 * len + 1;

  position = (int*)DpsMalloc(maxid * sizeof(int));
  if (position == NULL) return NULL;
  next = (int*)DpsMalloc(maxid * sizeof(int));
  if (next == NULL) {
    DPS_FREE(position);
    return NULL;
  }
  value = (double*)DpsMalloc(maxid * sizeof(double));
  if (value == NULL) {
    DPS_FREE(position);
    DPS_FREE(next);
    return NULL;
  }
  result = (dpsunicode_t **)DpsMalloc(maxid * sizeof(dpsunicode_t *));
  if (result == NULL) {
    DPS_FREE(position);
    DPS_FREE(next);
    DPS_FREE(value);
    return NULL;
  }

  /* Seed the list with one empty path. */
  top = 0;
  value[0] = 1.0 * List->total * len;
  position[0] = 0;
  next[0] = -1;
  result[0] = (dpsunicode_t*)DpsUniDup(&space[1]);
  if (result[0] == NULL) {
    /* Every later step reads result[]; give up cleanly. */
    DpsSegmentFreePaths(position, next, value, result, 0);
    return NULL;
  }
  nextid = 1;

  while ((top != -1) && (!((position[top] >= len) && (next[top] == -1)))) {

    /* Find the first open path, i.e. one that has not consumed the line. */
    current = top;
    father = top;
    while ((current != -1) && (position[current] >= len)) {
      father = current;
      current = next[current];
    }

    if (current == -1) {
      /* No open path is left: finished, keep only the first path.  The
         unlink step is skipped here; it used to read next[-1]. */
      next[top] = -1;
      continue;
    }

    /* Unlink the path that is about to be extended. */
    if (current == top) {
      top = next[top];
    } else {
      next[father] = next[current];
    }

    otv = &line[position[current]];
    h = (unsigned int)(otv[0] & 0xffff);
    /* A first character with no phrase in the dictionary still gets one
       single-character candidate. */
    if (List->hash[h] == 0) {
      List->hash[h] = 1;
    }
    i = List->hash[h];
    if (i + position[current] > len) {
      i = len - position[current];
    }

    otv = NULL;
    for (; i > 0; i--) {
      DPS_FREE(otv);
      otv = DpsUniNDup(&line[position[current]], (size_t)i);
      chinaword = DpsChineseListFind(List, otv);

      if (i == 1 && chinaword == NULL) {
        /* Unknown single character: register it with frequency 1. */
        DPS_FREE(otv);
        otv = DpsUniNDup(&line[position[current]], 1);
        chiw.word = otv;
        chiw.freq = 1;
        DpsChineseListAdd(List, chinaword = &chiw);
      }

      if ((chinaword != NULL) && chinaword->freq) {
        /* New path = current path + this word. */
        p = (double)chinaword->freq / List->total;
        value[nextid] = value[current] / (-1.0 * log(p) / log(10.0));
        position[nextid] = position[current] + i;
        h = DpsUniLen(result[current]) + DpsUniLen(otv) + 2;
        result[nextid] = (dpsunicode_t*)DpsXmalloc((size_t)h * sizeof(dpsunicode_t));
        if (result[nextid] == NULL) {
          /* The strings built so far and the candidate used to leak here. */
          DPS_FREE(otv);
          DpsSegmentFreePaths(position, next, value, result, nextid);
          return NULL;
        }
        DpsUniStrCpy(result[nextid], result[current]);
        DpsUniStrCat(result[nextid], space);
        DpsUniStrCat(result[nextid], otv);

        /* Look for a listed path that ends at the same position.  If it
           scores clearly better the new path is dropped; if it is the head
           of the list the new path replaces it. */
        needinsert = 1;
        iindex = top;
        father = top;
        while (iindex != -1) {
          if (position[iindex] == position[nextid]) {
            if (0.85 * value[iindex] >= value[nextid]) {
              needinsert = 0;
            } else {
              if (top == iindex) {
                next[nextid] = next[iindex];
                top = nextid;
                needinsert = 0;
              }
            }
            iindex = -1;
          } else {
            father = iindex;
            iindex = next[iindex];
          }
        }

        /* Link the new path into the list.  NOTE(review): iindex is always
           -1 when the scan above ends, so the ordering loop below never
           iterates and the path is linked right after `father`.  Kept
           exactly as it was. */
        if (needinsert == 1) {
          while ((iindex != -1) && (value[iindex] > value[nextid])) {
            father = iindex;
            iindex = next[iindex];
          }
          if (top == iindex) {
            next[nextid] = top;
            top = nextid;
          } else {
            next[father] = nextid;
            next[nextid] = iindex;
          }
        }

        nextid++;
        if (nextid >= maxid) {
          maxid += 128;
          position = (int*)DpsRealloc(position, maxid * sizeof(int));
          next = (int*)DpsRealloc(next, maxid * sizeof(int));
          value = (double*)DpsRealloc(value, maxid * sizeof(double));
          result = (dpsunicode_t **)DpsRealloc(result, maxid * sizeof(dpsunicode_t *));
          if (position == NULL || next == NULL || value == NULL || result == NULL) {
            /* Free every path string (result[top] used to be skipped and
               leaked) and the pending candidate. */
            DPS_FREE(otv);
            DpsSegmentFreePaths(position, next, value, result, nextid);
            return NULL;
          }
        }
      }
    }
    DPS_FREE(otv);
  }

  if (top < 0) {
    /* No path survived (every candidate had zero frequency).  This used to
       read result[-1]; fall back to the unsegmented text. */
    DpsSegmentFreePaths(position, next, value, result, nextid);
    return (dpsunicode_t*)DpsUniDup(line);
  }

  /* Detach the winning string so the cleanup below does not free it. */
  otv = result[top];
  result[top] = NULL;
  DpsSegmentFreePaths(position, next, value, result, nextid);
  return otv;
}
/*
 * Read searchd's reply to a word request and store it in `Res`.
 *
 * Packets are read from cl->searchd until a WORDS packet, an ERROR packet,
 * an unknown command or an allocation failure ends the exchange:
 *   ERROR      text copied to query->Conf->errstr, result DPS_ERROR;
 *   MESSAGE    a "Total_found <n> <m>" message sets Res->total_found and
 *              Res->grand_total; other messages are ignored;
 *   WORDS      coordinate array, Res->num_rows is derived from its size;
 *   SUGGEST    replaces Res->Suggest;
 *   PERSITE    stored in Res->PerSite;
 *   DATA       URL data array;
 *   TRACKDATA  tracking array (only with WITH_REL_TRACK);
 *   WITHOFFSET ignored;
 *   QLC        replaces the "q" variable of the query;
 *   WWL        serialized wide-word list, added to Res->WWList.
 * On a normal exit the received arrays are handed over to Res->CoordList.
 *
 * Returns DPS_OK or DPS_ERROR.
 */
int DpsSearchdGetWordResponse(DPS_AGENT *query,DPS_RESULT *Res,DPS_DB *cl) {
  DPS_URL_CRD_DB *wrd = NULL;
  DPS_URLDATA *udt = NULL;
#ifdef WITH_REL_TRACK
  DPS_URLTRACK *trk = NULL;
#endif
  DPS_SEARCHD_PACKET_HEADER hdr;
  ssize_t nrecv;
  char *msg;
  int done = 0, rc = DPS_OK;
  char *wbuf, *p;
  DPS_WIDEWORDLIST_EX *wwl;
  DPS_WIDEWORD ww;
  size_t i;

  TRACE_IN(query, "DpsSearchdGetWordResponse");

  Res->total_found = 0;

  while (!done) {
    nrecv = DpsRecvall(cl->searchd, &hdr, sizeof(hdr), 360);
    if (nrecv != sizeof(hdr)) {
      sprintf(query->Conf->errstr, "Received incomplete header from searchd (%d bytes,errno:%d)", (int)nrecv, errno);
      /* Buffers received so far are not handed to Res on this path, so
         release them instead of leaking. */
      DPS_FREE(wrd);
      DPS_FREE(udt);
#ifdef WITH_REL_TRACK
      DPS_FREE(trk);
#endif
      TRACE_OUT(query);
      return DPS_ERROR;
    }
#ifdef DEBUG_SDP
    DpsLog(query, DPS_LOG_ERROR, "Received header cmd=%d len=%d\n", hdr.cmd, hdr.len);
#endif

    switch (hdr.cmd) {
    case DPS_SEARCHD_CMD_ERROR:
      msg = (char*)DpsMalloc(hdr.len + 1);
      if (msg == NULL) {
        sprintf(query->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(cl->searchd, msg, hdr.len, 360);
      if (nrecv >= 0) {
        msg[nrecv] = '\0';
        /* msg comes from the network: bound the copy so an oversized error
           text cannot overrun errstr.  Assumes errstr is a char array
           member, so that sizeof yields its capacity -- TODO confirm. */
        dps_snprintf(query->Conf->errstr, sizeof(query->Conf->errstr), "Searchd error: '%s',received:%d", msg, (int)nrecv);
        query->Conf->errstr[sizeof(query->Conf->errstr) - 1] = '\0';
      }
      rc = DPS_ERROR;
      DPS_FREE(msg);
      done = 1;
      break;

    case DPS_SEARCHD_CMD_MESSAGE:
      msg = (char*)DpsMalloc(hdr.len + 1);
      if (msg == NULL) {
        sprintf(query->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(cl->searchd, msg, hdr.len, 360);
      msg[(nrecv >= 0) ? nrecv : 0] = '\0';
      /* Expected form: "Total_found <found> <grand total>".  Only look past
         the keyword when something follows it, and only read the second
         number when its separating space is present. */
      if (strncmp(msg, "Total_found", 11) == 0 && msg[11] != '\0') {
        char *sep;

        Res->total_found = (size_t)DPS_ATOI(msg + 12);
        sep = strchr(msg + 12, (int)' ');
        if (sep != NULL) {
          Res->grand_total = (size_t)DPS_ATOI(sep + 1);
        }
      }
#ifdef DEBUG_SDP
      DpsLog(query, DPS_LOG_ERROR, "Message from searchd: '%s'\n", msg);
#endif
      DPS_FREE(msg);
      break;

    case DPS_SEARCHD_CMD_WORDS:
      DPS_FREE(wrd);
      wrd = (DPS_URL_CRD_DB*)DpsMalloc(hdr.len + 1);
      if (wrd == NULL) {
        sprintf(query->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(cl->searchd, wrd, hdr.len, 360);
      Res->num_rows = (nrecv >= 0) ? (size_t)nrecv / sizeof(*wrd) : 0;
#ifdef DEBUG_SDP
      DpsLog(query, DPS_LOG_ERROR, "Received words size=%d nwrd=%d\n", hdr.len, Res->num_rows);
#endif
      done = 1;
      break;

    case DPS_SEARCHD_CMD_SUGGEST:
      DPS_FREE(Res->Suggest);
      Res->Suggest = (char*)DpsMalloc(hdr.len + 1);
      if (Res->Suggest == NULL) {
        sprintf(query->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(cl->searchd, Res->Suggest, hdr.len, 360);
      Res->Suggest[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
      DpsLog(query, DPS_LOG_ERROR, "Received Suggest size=%d\n", hdr.len);
#endif
      break;

    case DPS_SEARCHD_CMD_PERSITE:
      /* NOTE(review): a previous Res->PerSite is overwritten without being
         freed, as before; its ownership is not visible from here. */
      Res->PerSite = (size_t*)DpsMalloc(hdr.len + 1);
      if (Res->PerSite == NULL) {
        sprintf(query->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(cl->searchd, Res->PerSite, hdr.len, 360);
#ifdef DEBUG_SDP
      DpsLog(query, DPS_LOG_ERROR, "Received PerSite size=%d nwrd=%d\n", nrecv, Res->num_rows);
#endif
      break;

    case DPS_SEARCHD_CMD_DATA:
      DPS_FREE(udt);                 /* a repeated packet must not leak the first */
      udt = (DPS_URLDATA*)DpsMalloc(hdr.len + 1);
      if (udt == NULL) {
        sprintf(query->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(cl->searchd, udt, hdr.len, 360);
#ifdef DEBUG_SDP
      DpsLog(query, DPS_LOG_ERROR, "Received URLDATA size=%d nwrd=%d\n", nrecv, Res->num_rows);
#endif
      break;

#ifdef WITH_REL_TRACK
    case DPS_SEARCHD_CMD_TRACKDATA:
      DPS_FREE(trk);                 /* a repeated packet must not leak the first */
      trk = (DPS_URLTRACK*)DpsMalloc(hdr.len + 1);
      if (trk == NULL) {
        sprintf(query->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(cl->searchd, trk, hdr.len, 360);
#ifdef DEBUG_SDP
      DpsLog(query, DPS_LOG_ERROR, "Received TRACKDATA size=%d nwrd=%d\n", nrecv, Res->num_rows);
#endif
      break;
#endif

    case DPS_SEARCHD_CMD_WITHOFFSET:
      break;

    case DPS_SEARCHD_CMD_QLC:
      if ((p = (char *)DpsXmalloc(hdr.len + 1)) != NULL) {
        nrecv = DpsRecvall(cl->searchd, p, hdr.len, 360);
        /* A failed receive (negative result) used to count as success. */
        if (nrecv > 0) {
          p[nrecv] = '\0';
          DpsVarListReplaceStr(&query->Vars, "q", p);
        }
      }
      DPS_FREE(p);
      break;

    case DPS_SEARCHD_CMD_WWL:
      Res->PerSite = NULL;
      if ((wbuf = (char *)DpsXmalloc(hdr.len + 1)) != NULL) {
        nrecv = DpsRecvall(cl->searchd, wbuf, hdr.len, 360);
        if (nrecv >= (ssize_t)sizeof(DPS_WIDEWORDLIST_EX)) {
          /* Bytes of the payload not consumed yet.  Every step is checked
             against it so a malformed or truncated packet cannot walk the
             parser off the end of wbuf. */
          size_t left = (size_t)nrecv - sizeof(DPS_WIDEWORDLIST_EX);
          size_t need;

          wwl = (DPS_WIDEWORDLIST_EX *)wbuf;
          p = wbuf + sizeof(DPS_WIDEWORDLIST_EX);
#ifdef DEBUG_SDP
          DpsLog(query, DPS_LOG_ERROR, "wbuf :%x, wwl: %x, p: %x hdr.len:%d\n", wbuf, wwl, p, hdr.len);
          DpsLog(query, DPS_LOG_ERROR, "Received WWL nwords=%d nuniq=%d\n", wwl->nwords, wwl->nuniq);
#endif
          for (i = 0; i < wwl->nwords; i++) {
            /* Fixed-size record header. */
            if (left < sizeof(DPS_WIDEWORD_EX)) break;
            dps_memcpy((char*)&ww, p, sizeof(DPS_WIDEWORD_EX));
            p += sizeof(DPS_WIDEWORD_EX);
            left -= sizeof(DPS_WIDEWORD_EX);

            /* Byte string: ww.len characters plus the terminator. */
            need = (size_t)ww.len + 1;
            if (need == 0 || left < need) break;
            ww.word = p;
#ifdef DEBUG_SDP
            DpsLog(query, DPS_LOG_ERROR, "Word {%d}: %s\n", ww.len+1, ww.word);
#endif
            p += need;
            left -= need;

            /* Padding before the unicode string; an already aligned
               pointer still advances by one full element. */
            need = sizeof(dpsunicode_t) - ((SDPALIGN)p % sizeof(dpsunicode_t));
            if (left < need) break;
            p += need;
            left -= need;

            /* Unicode string: ww.ulen elements plus the terminator. */
            if ((size_t)ww.ulen >= left / sizeof(dpsunicode_t)) break;
            ww.uword = (dpsunicode_t*)p;
            need = sizeof(dpsunicode_t) * ((size_t)ww.ulen + 1);
            p += need;
            left -= need;

            DpsWideWordListAdd(&Res->WWList, &ww, DPS_WWL_STRICT);
          }
          Res->WWList.nuniq = wwl->nuniq;
        }
        DPS_FREE(wbuf);              /* also freed when nothing usable arrived */
      }
      break;

    default:
      sprintf(query->Conf->errstr, "Unknown searchd response: cmd=%d len=%d", hdr.cmd, hdr.len);
      rc = DPS_ERROR;
      done = 1;
      break;
    }
  }

  Res->CoordList.Coords = wrd;
  Res->CoordList.Data = udt;
#ifdef WITH_REL_TRACK
  Res->CoordList.Track = trk;
#endif

  TRACE_OUT(query);
  return rc;
}
static int MakeLinearIndex(DPS_AGENT *Indexer, const char *field, const char *lim_name, int type, DPS_DB *db) { DPS_ENV *Conf = Indexer->Conf; DPS_UINT4URLIDLIST L; size_t k,prev; urlid_t *data = NULL; DPS_UINT4_POS_LEN *ind=NULL; size_t mind=1000,nind=0; char fname[PATH_MAX]; int dat_fd=0, ind_fd=0, rc; const char *vardir = (db->vardir) ? db->vardir : DpsVarListFindStr(&Conf->Vars, "VarDir", DPS_VAR_DIR); bzero(&L, sizeof(DPS_UINT4URLIDLIST)); rc = DpsLimit4(Indexer, &L, field, type, db); if(rc != DPS_OK) { DpsLog(Indexer, DPS_LOG_ERROR, "Error: %s [%s:%d]", DpsEnvErrMsg(Conf), __FILE__, __LINE__); goto err1; } if(!L.Item)return(1); if (L.nitems > 1) DpsSort(L.Item, L.nitems, sizeof(DPS_UINT4URLID), (qsort_cmp)cmp_ind4); data = (urlid_t*)DpsMalloc((L.nitems + 1) * sizeof(*data)); if(!data) { fprintf(stderr,"Error1: %s\n",strerror(errno)); goto err1; } ind=(DPS_UINT4_POS_LEN*)DpsMalloc(mind*sizeof(DPS_UINT4_POS_LEN)); if(!ind) { fprintf(stderr,"Error2: %s\n",strerror(errno)); goto err1; } prev=0; for(k=0; k<L.nitems; k++) { data[k]=L.Item[k].url_id; if((k==L.nitems-1) || (L.Item[k].val!=L.Item[prev].val)) { if(nind==mind) { mind+=1000; ind=(DPS_UINT4_POS_LEN*)DpsRealloc(ind,mind*sizeof(DPS_UINT4_POS_LEN)); if(!ind) { fprintf(stderr,"Error3: %s\n",strerror(errno)); goto err1; } } /* Fill index */ ind[nind].val=L.Item[prev].val; ind[nind].pos = prev * sizeof(*data); if (k == L.nitems - 1) ind[nind].len = (k - prev + 1) * sizeof(*data); else ind[nind].len = (k - prev) * sizeof(*data); DpsLog(Indexer, DPS_LOG_DEBUG, "%d - pos:%x len:%d\n", ind[nind].val, (int)ind[nind].pos, ind[nind].len); nind++; prev=k; } } if (L.mapped) { #ifdef HAVE_SYS_MMAN_H if (munmap(L.Item, (L.nitems + 1) * sizeof(DPS_UINT4URLID))) { fprintf(stderr, "Can't shmdt '%s': %s\n", L.shm_name, strerror(errno)); } #elif defined(HAVE_SYS_SHM_H) if (shmdt(L.Item)) { fprintf(stderr, "Can't shmdt '%s': %s\n", L.shm_name, strerror(errno)); } #endif unlink(L.shm_name); } else { DPS_FREE(L.Item); } 
dps_snprintf(fname,sizeof(fname),"%s%c%s%c%s.dat", vardir,DPSSLASH, DPS_TREEDIR, DPSSLASH, lim_name); if((dat_fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) { fprintf(stderr,"Can't open '%s': %s\n",fname,strerror(errno)); goto err1; } DpsWriteLock(dat_fd); if((L.nitems * sizeof(*data)) != (size_t)write(dat_fd, data, L.nitems * sizeof(*data))) { fprintf(stderr,"Can't write '%s': %s\n",fname,strerror(errno)); goto err1; } DpsUnLock(dat_fd); DpsClose(dat_fd); DPS_FREE(data); dps_snprintf(fname,sizeof(fname),"%s%c%s%c%s.ind", vardir,DPSSLASH, DPS_TREEDIR, DPSSLASH, lim_name); if((ind_fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) { fprintf(stderr,"Can't open '%s': %s\n",fname,strerror(errno)); goto err1; } DpsWriteLock(ind_fd); if((nind*sizeof(DPS_UINT4_POS_LEN)) != (size_t)write(ind_fd,ind,nind*sizeof(DPS_UINT4_POS_LEN))) { fprintf(stderr,"Can't write '%s': %s\n",fname,strerror(errno)); goto err1; } DpsUnLock(ind_fd); DpsClose(ind_fd); DPS_FREE(ind); return(0); err1: if (L.mapped) { #ifdef HAVE_SYS_MMAN_H if (munmap(L.Item, (L.nitems + 1) * sizeof(DPS_UINT4URLID))) { fprintf(stderr, "Can't shmdt '%s': %s\n", L.shm_name, strerror(errno)); } #elif defined(HAVE_SYS_SHM_H) if (shmdt(L.Item)) { fprintf(stderr, "Can't shmdt '%s': %s\n", L.shm_name, strerror(errno)); } #endif unlink(L.shm_name); } else { DPS_FREE(L.Item); } DPS_FREE(data); DPS_FREE(ind); if(dat_fd) DpsClose(dat_fd); if(ind_fd) DpsClose(ind_fd); return(1); }
/*
 * Build the word-search request for searchd from the query variables and
 * send it with DpsSearchdSendWordRequest().
 *
 * The request is the original QUERY_STRING followed by BrowserCharset, IP,
 * g-lc, ExcerptSize, ExcerptPadding, DoExcerpt, tmplt, the optional
 * URL-escaped DateFormat and empty values, the optional label of `searchd`,
 * and sp, sy and s.  Missing variables use the defaults coded below.
 *
 * `Res` is not used here; the reply is read separately.
 *
 * Returns the result of DpsSearchdSendWordRequest(), or DPS_ERROR (with a
 * message in query->Conf->errstr) when memory cannot be allocated.
 */
int __DPSCALL DpsFindWordsSearchd(DPS_AGENT *query, DPS_RESULT *Res, DPS_DB *searchd) {
  size_t maxlen = 1024;   /* room for the fixed text of the request, plus slack */
  char *request = NULL, *edf = NULL, *e_empty = NULL;
  const char *df = DpsVarListFindStr(&query->Vars, "DateFormat", NULL);
  const char *empty = DpsVarListFindStr(&query->Vars, "empty", NULL);
  const char *qs = DpsVarListFindStr(&query->Vars, "QUERY_STRING", "");
  const char *tmplt = DpsVarListFindStr(&query->Vars, "tmplt", "");
  const char *charset = DpsVarListFindStr(&query->Vars, "BrowserCharset", "iso-8859-1");
  const char *ip = DpsVarListFindStr(&query->Vars, "IP", "localhost");
  const char *glc = DpsVarListFindStr(&query->Vars, "g-lc", "en");
  const char *esize = DpsVarListFindStr(&query->Vars, "ExcerptSize", "256");
  const char *epad = DpsVarListFindStr(&query->Vars, "ExcerptPadding", "40");
  const char *sp = DpsVarListFindStr(&query->Vars, "sp", "1");
  const char *sy = DpsVarListFindStr(&query->Vars, "sy", "1");
  const char *s = DpsVarListFindStr(&query->Vars, "s", "RP");
  int res = DPS_ERROR;

  TRACE_IN(query, "DpsFindWordsSearchd");

  if (df) {
    /* Same worst-case estimate for the escaped form as before. */
    edf = (char*)DpsMalloc(dps_strlen(df) * 10 + 1);
    if (edf == NULL) goto oom;
    DpsEscapeURL(edf, df);
    maxlen += dps_strlen(edf);
  }
  if (empty) {
    e_empty = (char*)DpsMalloc(dps_strlen(empty) * 10 + 1);
    if (e_empty == NULL) goto oom;
    DpsEscapeURL(e_empty, empty);
    maxlen += dps_strlen(e_empty);
  }

  /* Count every variable part, so that a long label or parameter value can
     no longer truncate the request. */
  maxlen += dps_strlen(qs) + dps_strlen(tmplt) + 64;
  maxlen += dps_strlen(charset) + dps_strlen(ip) + dps_strlen(glc);
  maxlen += dps_strlen(esize) + dps_strlen(epad);
  maxlen += dps_strlen(sp) + dps_strlen(sy) + dps_strlen(s);
  if (searchd->label) maxlen += dps_strlen(searchd->label);

  request = (char*)DpsMalloc(maxlen);
  if (request == NULL) goto oom;

  dps_snprintf(request, maxlen,
               "%s&BrowserCharset=%s&IP=%s&g-lc=%s&ExcerptSize=%s&ExcerptPadding=%s&DoExcerpt=%s&tmplt=%s%s%s%s%s%s%s&sp=%s&sy=%s&s=%s",
               qs,
               charset,
               ip,
               glc,
               esize,
               epad,
               (query->Flags.do_excerpt) ? "yes" : "no",
               tmplt,
               (edf) ? "&DateFormat=" : "",
               (edf) ? edf : "",
               (e_empty) ? "&empty=" : "",
               (e_empty) ? e_empty : "",
               (searchd->label) ? "&label=" : "",
               (searchd->label) ? searchd->label : "",
               sp,
               sy,
               s);
  request[maxlen - 1] = '\0';

  /* The reply is read later by the caller (from DpsFind, according to the
     original comment). */
  res = DpsSearchdSendWordRequest(query, searchd, request);
  goto done;

oom:
  sprintf(query->Conf->errstr, "Can't allocate memory");
  res = DPS_ERROR;

done:
  /* Single exit: edf and e_empty used to leak on the allocation-failure
     paths. */
  DPS_FREE(request);
  DPS_FREE(edf);
  DPS_FREE(e_empty);
  TRACE_OUT(query);
  return res;
}
/*
 * Create (or truncate) `fname` and write `nbytes` bytes from `buf` to it
 * under a write lock.  Returns 0 on success and 1 on failure, after logging
 * the problem.  The descriptor is always closed.
 */
static int DpsNestedIndexWrite(DPS_AGENT *Indexer, const char *fname, const void *buf, size_t nbytes) {
  int fd, failed;

  if ((fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) {
    DpsLog(Indexer, DPS_LOG_ERROR, "Can't open '%s': %s [%s:%d]", fname, strerror(errno), __FILE__, __LINE__);
    return 1;
  }
  DpsWriteLock(fd);
  /* A short write is treated as a failure, as before. */
  failed = (nbytes != (size_t)write(fd, buf, nbytes));
  if (failed) {
    DpsLog(Indexer, DPS_LOG_ERROR, "Can't write '%s': %s [%s:%d]", fname, strerror(errno), __FILE__, __LINE__);
  }
  DpsUnLock(fd);
  DpsClose(fd);
  return failed ? 1 : 0;
}

/*
 * Write the nested (8-byte key) limit index held in `L` as two files under
 * <vardir>/DPS_TREEDIR/:
 *   <lim_name>.dat  the url ids, sorted with cmp_ind8;
 *   <lim_name>.ind  one DPS_UINT8_POS_LEN record per distinct (hi, lo) key,
 *                   giving the byte offset and byte length of that key's
 *                   ids inside the .dat file.
 * <vardir> is db->vardir when set, else the "VarDir" configuration variable.
 *
 * `L` is cleared with ClearIndex8() before returning, except when it has no
 * items at all.  Returns 0 on success and 1 on failure or when `L` is empty.
 */
static int MakeNestedIndex(DPS_AGENT *Indexer, DPS_UINT8URLIDLIST *L, const char *lim_name, DPS_DB *db) {
  DPS_ENV *Conf = Indexer->Conf;
  size_t k, first;
  urlid_t *data = NULL;
  DPS_UINT8_POS_LEN *ind = NULL;
  size_t mind = 1000, nind = 0, ndata;
  char fname[PATH_MAX];
  int cleared = 0;
  const char *vardir = (db->vardir) ? db->vardir : DpsVarListFindStr(&Conf->Vars, "VarDir", DPS_VAR_DIR);

  if (!L->Item) return(1);

  if (L->nitems > 1) DpsSort(L->Item, L->nitems, sizeof(DPS_UINT8URLID), (qsort_cmp)cmp_ind8);

  data = (urlid_t*)DpsMalloc((L->nitems + 1) * sizeof(urlid_t));
  if (!data) {
    DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d bytes [%s:%d]", (int)((L->nitems + 1) * sizeof(urlid_t)), __FILE__, __LINE__);
    goto err1;
  }
  ind = (DPS_UINT8_POS_LEN*)DpsMalloc(mind * sizeof(DPS_UINT8_POS_LEN));
  if (!ind) {
    DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d bytes [%s:%d]", (int)(mind * sizeof(DPS_UINT8_POS_LEN)), __FILE__, __LINE__);
    goto err1;
  }

  for (k = 0; k < L->nitems; k++) {
    data[k] = L->Item[k].url_id;
  }

  /* Emit one index record per run of equal (hi, lo) keys.  `first` is the
     start of the current run; a run ends at the first item with another key
     or at the end of the list.  The previous loop folded a final item with
     a key of its own into the preceding run, so that key got no record of
     its own. */
  first = 0;
  for (k = 1; k <= L->nitems; k++) {
    if (k < L->nitems
        && L->Item[k].hi == L->Item[first].hi
        && L->Item[k].lo == L->Item[first].lo) continue;

    if (nind == mind) {
      mind += 1000;
      ind = (DPS_UINT8_POS_LEN*)DpsRealloc(ind, mind * sizeof(DPS_UINT8_POS_LEN));
      if (!ind) {
        DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d bytes [%s:%d]", (int)(mind * sizeof(DPS_UINT8_POS_LEN)), __FILE__, __LINE__);
        goto err1;
      }
    }
    ind[nind].hi = L->Item[first].hi;
    ind[nind].lo = L->Item[first].lo;
    ind[nind].pos = first * sizeof(*data);
    ind[nind].len = (k - first) * sizeof(*data);
    DpsLog(Indexer, DPS_LOG_DEBUG, "%08X%08X - %d %d\n", ind[nind].hi, ind[nind].lo, (int)ind[nind].pos, (int)ind[nind].len);
    nind++;
    first = k;
  }

  /* The list is not needed any more.  Remember that it has been cleared so
     the error path does not clear it a second time. */
  ndata = L->nitems;
  ClearIndex8(L);
  cleared = 1;

  dps_snprintf(fname, sizeof(fname) - 1, "%s%c%s%c%s.dat", vardir, DPSSLASH, DPS_TREEDIR, DPSSLASH, lim_name);
  if (DpsNestedIndexWrite(Indexer, fname, data, ndata * sizeof(*data))) goto err1;
  DPS_FREE(data);

  dps_snprintf(fname, sizeof(fname) - 1, "%s%c%s%c%s.ind", vardir, DPSSLASH, DPS_TREEDIR, DPSSLASH, lim_name);
  if (DpsNestedIndexWrite(Indexer, fname, ind, nind * sizeof(DPS_UINT8_POS_LEN))) goto err1;
  DPS_FREE(ind);

  return(0);

err1:
  if (!cleared) {
    ClearIndex8(L);
  }
  DPS_FREE(data);
  DPS_FREE(ind);
  return(1);
}
/*
 * Run a category action on searchd and load the answer into `C`.
 *
 * `db` must point to the DPS_DB describing the searchd connection.  The
 * request is a DPS_SEARCHD_CMD_CATINFO packet holding `cmd` followed by the
 * NUL-terminated C->addr.  Replies are read until one of these arrives:
 *   - DPS_SEARCHD_CMD_ERROR:   text copied to A->Conf->errstr, DPS_ERROR;
 *   - DPS_SEARCHD_CMD_CATINFO: C->ncategories is reset and every CR/LF
 *                              separated line is parsed with
 *                              DpsCatFromTextBuf();
 *   - an unknown command:      DPS_ERROR with a message in errstr.
 * DPS_SEARCHD_CMD_MESSAGE packets are read and discarded.
 *
 * Returns DPS_OK or DPS_ERROR.
 */
int __DPSCALL DpsSearchdCatAction(DPS_AGENT *A, DPS_CATEGORY *C, int cmd, void *db) {
  DPS_DB *searchd = db;
  DPS_SEARCHD_PACKET_HEADER hdr;
  char *buf;
  ssize_t nrecv;
  int done = 0;
  int rc = DPS_OK;
  char *msg = NULL;
  char *dinfo = NULL;

  TRACE_IN(A, "DpsSearchdCatAction");

  hdr.cmd = DPS_SEARCHD_CMD_CATINFO;
  hdr.len = sizeof(int) + dps_strlen(C->addr) + 1;
  if ((buf = (char*)DpsMalloc(hdr.len + 1)) == NULL) {
    DpsLog(A, DPS_LOG_ERROR, "Out of memory");
    TRACE_OUT(A);
    return DPS_ERROR;
  }
  *((int*)buf) = cmd;
  dps_strcpy(buf + sizeof(int), C->addr);
  /* NOTE(review): the send result is not checked, as before; a failed send
     surfaces as an incomplete header below. */
  (void)DpsSearchdSendPacket(searchd->searchd, &hdr, buf);
  DPS_FREE(buf);

  while (!done) {
    char *tok, *lt;

    nrecv = DpsRecvall(searchd->searchd, &hdr, sizeof(hdr), 360);
    if (nrecv != sizeof(hdr)) {
      DpsLog(A, DPS_LOG_ERROR, "Received incomplete header from searchd (%d bytes)", (int)nrecv);
      TRACE_OUT(A);
      return(DPS_ERROR);
    }
#ifdef DEBUG_SDP
    DpsLog(A, DPS_LOG_ERROR, "Received header cmd=%d len=%d\n", hdr.cmd, hdr.len);
#endif

    switch (hdr.cmd) {
    case DPS_SEARCHD_CMD_ERROR:
      msg = (char*)DpsMalloc(hdr.len + 1);
      if (msg == NULL) {
        sprintf(A->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(searchd->searchd, msg, hdr.len, 360);
      msg[(nrecv >= 0) ? nrecv : 0] = '\0';
      /* msg comes from the network: bound the copy so an oversized error
         text cannot overrun errstr.  Assumes errstr is a char array member,
         so that sizeof yields its capacity -- TODO confirm. */
      dps_snprintf(A->Conf->errstr, sizeof(A->Conf->errstr), "Searchd error: '%s'", msg);
      A->Conf->errstr[sizeof(A->Conf->errstr) - 1] = '\0';
      rc = DPS_ERROR;
      DPS_FREE(msg);
      done = 1;
      break;

    case DPS_SEARCHD_CMD_MESSAGE:
      msg = (char*)DpsMalloc(hdr.len + 1);
      if (msg == NULL) {
        sprintf(A->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(searchd->searchd, msg, hdr.len, 360);
      msg[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
      DpsLog(A, DPS_LOG_ERROR, "Message from searchd: '%s'\n", msg);
#endif
      DPS_FREE(msg);
      break;

    case DPS_SEARCHD_CMD_CATINFO:
      dinfo = (char*)DpsMalloc(hdr.len + 1);
      if (dinfo == NULL) {
        sprintf(A->Conf->errstr, "Can't allocate memory");
        rc = DPS_ERROR;
        done = 1;
        break;
      }
      nrecv = DpsRecvall(searchd->searchd, dinfo, hdr.len, 360);
      dinfo[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
      DpsLog(A, DPS_LOG_ERROR, "Received CATINFO size=%d buf=%s\n", hdr.len, dinfo);
#endif
      C->ncategories = 0;
      /* One category per CR/LF separated line; `lt` carries the tokenizer
         state between calls. */
      tok = dps_strtok_r(dinfo, "\r\n", &lt, NULL);
      while (tok) {
        DpsCatFromTextBuf(C, tok);
        tok = dps_strtok_r(NULL, "\r\n", &lt, NULL);
      }
      DPS_FREE(dinfo);
      done = 1;
      break;

    default:
      sprintf(A->Conf->errstr, "Unknown searchd response: cmd=%d len=%d", hdr.cmd, hdr.len);
      rc = DPS_ERROR;
      done = 1;
      break;
    }
  }

  TRACE_OUT(A);
  return rc;
}