/*
 * Create a Unix-domain stream socket and connect it to the path that
 * DpsRelVarName() derives from unix_socket.
 *
 * A            agent used for configuration lookup and logging.
 * unix_socket  socket name as configured; expanded into a local path buffer.
 *
 * Returns the connected descriptor, or DPS_NET_CANT_CONNECT after logging
 * the reason.  The descriptor is closed again when connect() fails, so no
 * socket is leaked on that path.
 */
static int open_socket(DPS_AGENT *A, char *unix_socket) {
  char unix_path[128];
  struct sockaddr_un unix_addr;
  int sockfd;

  /* Results of 105 and above are rejected as too long for the socket address. */
  if (DpsRelVarName(A->Conf, unix_path, sizeof(unix_path), unix_socket) >= 105) {
    DpsLog(A, DPS_LOG_ERROR, "Unix socket name '%s' is too large", unix_path);
    return(DPS_NET_CANT_CONNECT);
  }

  if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
    DpsLog(A, DPS_LOG_ERROR, "unix socket() error %d", errno);
    return(DPS_NET_CANT_CONNECT);
  }
  DpsSockOpt(A, sockfd);

  bzero((void*)&unix_addr, sizeof(unix_addr));
  unix_addr.sun_family = AF_UNIX;
  dps_strncpy(unix_addr.sun_path, unix_path, sizeof(unix_addr.sun_path));

  if (connect(sockfd, (struct sockaddr *)&unix_addr, sizeof (unix_addr))) {
    /* Log first: close() may overwrite errno, which the logger presumably reports. */
    dps_strerror(A, DPS_LOG_ERROR, "unix socket '%s' connect() error", unix_path);
    close(sockfd);
    return(DPS_NET_CANT_CONNECT);
  }
  return sockfd;
}
/*
 * Apply the default option set to a socket: a 300 second send timeout and
 * send/receive low-water marks of 1, each only on platforms where the
 * surrounding preprocessor conditions enable it.
 *
 * A failing setsockopt() is reported and otherwise ignored.  The report goes
 * through DpsLog() when A is non-NULL and to stderr when A is NULL.
 */
void DpsSockOpt(DPS_AGENT *A, int dps_socket) {
  const int lowat = 1;
  struct timeval so_tval;

  so_tval.tv_usec = 0;
  so_tval.tv_sec = 300;

#if !defined(sgi) && !defined(__sgi) && !defined(__irix__) && !defined(sun) && !defined(__sun) /* && !defined(__FreeBSD__)*/
  if (setsockopt(dps_socket, SOL_SOCKET, SO_SNDTIMEO, (char *)&so_tval, sizeof(so_tval)) != 0) {
    if (A == NULL) {
      fprintf(stderr, "%s [%d] setsockopt error: %d (%s)\n", __FILE__, __LINE__, errno, strerror(errno));
    } else {
      DpsLog(A, DPS_LOG_EXTRA, "%s [%d] setsockopt error: %d (%s)\n", __FILE__, __LINE__, errno, strerror(errno));
    }
  }
#endif

#if defined(SO_SNDLOWAT) && !defined(__linux__)
  if (setsockopt(dps_socket, SOL_SOCKET, SO_SNDLOWAT, (char *)&lowat, sizeof(lowat)) != 0) {
    if (A == NULL) {
      fprintf(stderr, "%s [%d] setsockopt error: %d (%s)\n", __FILE__, __LINE__, errno, strerror(errno));
    } else {
      DpsLog(A, DPS_LOG_EXTRA, "%s [%d] setsockopt error: %d (%s)\n", __FILE__, __LINE__, errno, strerror(errno));
    }
  }
#endif

#if defined(SO_RCVLOWAT) && (!defined(__linux__) || LINUX_VERSION_CODE >= 0x20400)
  if (setsockopt(dps_socket, SOL_SOCKET, SO_RCVLOWAT, (char *)&lowat, sizeof(lowat)) != 0) {
    if (A == NULL) {
      fprintf(stderr, "%s [%d] setsockopt error: %d (%s)\n", __FILE__, __LINE__, errno, strerror(errno));
    } else {
      DpsLog(A, DPS_LOG_EXTRA, "%s [%d] setsockopt error: %d (%s)\n", __FILE__, __LINE__, errno, strerror(errno));
    }
  }
#endif
}
/*
 * Build the limit indexes requested in the configuration.
 *
 * Walks the variables stored in bucket 'l' of Indexer->Conf->Vars.  For each
 * one whose name starts with "Limit-" (case-insensitive) the value selects
 * which index to build: "category" builds a nested index, while "tag",
 * "link", "time", "hostname", "language", "content" and "siteid" each build
 * a linear index.  Any other value builds nothing.  The process title and
 * the log are updated before each build and after each "Limit-" variable.
 *
 * The results of the index builders are not inspected; the function always
 * returns DPS_OK.
 */
__C_LINK int __DPSCALL DpsCacheMakeIndexes(DPS_AGENT *Indexer, DPS_DB *db) {
  DPS_VARLIST *vars = &Indexer->Conf->Vars;
  const size_t bucket = (size_t) 'l';
  size_t n;
  char *kind;

  for (n = 0; n < vars->Root[bucket].nvars; n++) {

    if (strncasecmp("Limit-", vars->Root[bucket].Var[n].name, 6) != 0) continue;

    kind = vars->Root[bucket].Var[n].val;

    /* The process title is set so the current stage shows up in "ps" output on xBSD */
    if (strcasecmp(kind, "category") == 0) {
      dps_setproctitle("[%d] Category index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating category index");
      MakeNestedIndex(Indexer, "Category", DPS_LIMFNAME_CAT, DPS_IFIELD_TYPE_HEX8STR, db);

    } else if (strcasecmp(kind, "tag") == 0) {
      dps_setproctitle("[%d] Tag index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating tag index");
      MakeLinearIndex(Indexer, "Tag", DPS_LIMFNAME_TAG, DPS_IFIELD_TYPE_STRCRC32, db);

    } else if (strcasecmp(kind, "link") == 0) {
      dps_setproctitle("[%d] Link index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating link index");
      MakeLinearIndex(Indexer, "link", DPS_LIMFNAME_LINK, DPS_IFIELD_TYPE_INT, db);

    } else if (strcasecmp(kind, "time") == 0) {
      dps_setproctitle("[%d] Time index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating time index");
      MakeLinearIndex(Indexer, "last_mod_time", DPS_LIMFNAME_TIME, DPS_IFIELD_TYPE_HOUR, db);

    } else if (strcasecmp(kind, "hostname") == 0) {
      dps_setproctitle("[%d] Hostname index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating hostname index");
      MakeLinearIndex(Indexer, "url", DPS_LIMFNAME_HOST, DPS_IFIELD_TYPE_HOSTNAME, db);

    } else if (strcasecmp(kind, "language") == 0) {
      dps_setproctitle("[%d] Language index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating language index");
      MakeLinearIndex(Indexer, "Content-Language", DPS_LIMFNAME_LANG, DPS_IFIELD_TYPE_STR2CRC32, db);

    } else if (strcasecmp(kind, "content") == 0) {
      dps_setproctitle("[%d] Content-Type index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating Content-Type index");
      MakeLinearIndex(Indexer, "Content-Type", DPS_LIMFNAME_CTYPE, DPS_IFIELD_TYPE_STRCRC32, db);

    } else if (strcasecmp(kind, "siteid") == 0) {
      dps_setproctitle("[%d] Site_id index creation", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Creating Site_id index");
      MakeLinearIndex(Indexer, "site_id", DPS_LIMFNAME_SITE, DPS_IFIELD_TYPE_INT, db);
    }

    dps_setproctitle("[%d] Indexes done.", Indexer->handle);
    DpsLog(Indexer, DPS_LOG_EXTRA, "Done");
  }

  return DPS_OK;
}
static int DoStore(DPS_AGENT *Agent, urlid_t rec_id, Byte *Doc, size_t DocSize, char *Client) { z_stream zstream; DPS_BASE_PARAM P; int rc = DPS_OK; Byte *CDoc = NULL; size_t dbnum = ((size_t)rec_id) % ((Agent->flags & DPS_FLAG_UNOCON) ? Agent->Conf->dbl.nitems : Agent->dbl.nitems); DPS_DB *db = (Agent->flags & DPS_FLAG_UNOCON) ? &Agent->Conf->dbl.db[dbnum] : &Agent->dbl.db[dbnum]; zstream.zalloc = Z_NULL; zstream.zfree = Z_NULL; zstream.opaque = Z_NULL; zstream.next_in = Doc; if (deflateInit2(&zstream, 9, Z_DEFLATED, 15, 9, Z_DEFAULT_STRATEGY) == Z_OK) { zstream.avail_in = DocSize; zstream.avail_out = 2 * DocSize; CDoc = zstream.next_out = (Byte *) DpsMalloc(2 * DocSize + 1); if (zstream.next_out == NULL) { return DPS_ERROR; } deflate(&zstream, Z_FINISH); deflateEnd(&zstream); /* store operations */ bzero(&P, sizeof(P)); P.subdir = "store"; P.basename = "doc"; P.indname = "doc"; P.rec_id = rec_id; P.mode = DPS_WRITE_LOCK; P.NFiles = (db->StoredFiles) ? db->StoredFiles : DpsVarListFindInt(&Agent->Vars, "StoredFiles", 0x100); P.vardir = (db->vardir) ? db->vardir : DpsVarListFindStr(&Agent->Vars, "VarDir", DPS_VAR_DIR); P.A = Agent; if (DpsBaseWrite(&P, CDoc, zstream.total_out) != DPS_OK) { DpsLog(Agent, DPS_LOG_ERROR, "store/doc write error: %s", strerror(errno)); rc = DPS_ERROR; } DpsBaseClose(&P); if (rc == DPS_OK) DpsLog(Agent, DPS_LOG_EXTRA, "[%s] Stored rec_id: %x Size: %d Ratio: %5.2f%%", Client, rec_id, DocSize, 100.0 * zstream.total_out / DocSize); if (Agent->Flags.OptimizeAtUpdate) { DpsBaseOptimize(&P, ((int)rec_id) >> DPS_BASE_BITS); }
/*
 * Delete expired rows from the "cookies" table of every database in the
 * agent's database list.
 *
 * Does nothing when A->Flags.robots_period is 0.
 * NOTE(review): gating cookie cleanup on robots_period looks copied from a
 * robots cleanup routine; confirm it is intended.
 *
 * The loop stops at the first database whose query fails; the database
 * error string is logged.  When the build has no SQL support no query is
 * issued and every database is treated as successful.
 */
void DpsCookiesClean(DPS_AGENT *A) {
  char buf[256];
  DPS_DB *db;
  size_t i, dbfrom = 0, dbto;
  int res = DPS_OK; /* stays DPS_OK when HAVE_SQL is not defined */

  if (A->Flags.robots_period == 0) return;

  dps_snprintf(buf, sizeof(buf), "DELETE FROM cookies WHERE expires < %d", A->now);

  if (A->flags & DPS_FLAG_UNOCON) DPS_GETLOCK(A, DPS_LOCK_CONF);
  dbto = DPS_DBL_TO(A);
  if (A->flags & DPS_FLAG_UNOCON) DPS_RELEASELOCK(A, DPS_LOCK_CONF);

  for (i = dbfrom; i < dbto; i++) {
    db = DPS_DBL_DB(A, i);
    if (A->flags & DPS_FLAG_UNOCON) DPS_GETLOCK(A, DPS_LOCK_DB);
#ifdef HAVE_SQL
    res = DpsSQLAsyncQuery(db, NULL, buf);
#endif
    if (res != DPS_OK) {
      /* errstr is data, not a format: never pass it as the format argument */
      DpsLog(A, DPS_LOG_ERROR, "%s", db->errstr);
    }
    if (A->flags & DPS_FLAG_UNOCON) DPS_RELEASELOCK(A, DPS_LOCK_DB);
    if (res != DPS_OK) break;
  }
}
/*
 * Merge the coordinate list of `stop` into the coordinate list of `x`,
 * writing the result to a newly allocated array in `res`.
 *
 * Both inputs are walked from pbegin to pbegin + count.  Stop entries whose
 * url_id is lower than the current x entry are skipped without being copied;
 * the remaining entries are interleaved according to DpsCmpUrlid().  Once
 * the stop list is exhausted the rest of x is copied verbatim; stop entries
 * left after x is exhausted are not copied.
 *
 * On return res->pbegin is the start of the output and res->pcur is one past
 * the last entry written.  Returns DPS_OK, or DPS_ERROR if the output array
 * cannot be allocated.
 */
static int proceedSTOP(DPS_AGENT *query, DPS_STACK_ITEM *res, DPS_STACK_ITEM *x, DPS_STACK_ITEM *stop) {

  res->pcur = (DPS_URL_CRD_DB*)DpsMalloc((x->count + stop->count + 1) * sizeof(DPS_URL_CRD_DB));
  res->pbegin = res->pcur;
  if (res->pbegin == NULL) {
    DpsLog(query, DPS_LOG_ERROR, "Can't alloc %d bytes for %d results",
           (x->count + stop->count + 1) * sizeof(DPS_URL_CRD_DB), (x->count + stop->count + 1));
    return DPS_ERROR;
  }

  x->pcur = x->pbegin;
  x->plast = x->pbegin + x->count;
  stop->pcur = stop->pbegin;
  stop->plast = stop->pbegin + stop->count;

  if (stop->pcur < stop->plast) {
    while (x->pcur < x->plast) {

      /* drop stop entries that belong to documents before the current one */
      for (; stop->pcur < stop->plast && stop->pcur->url_id < x->pcur->url_id; stop->pcur++) {
      }

      /* stop entries that sort at or before the current x entry go first */
      for (; stop->pcur < stop->plast && DpsCmpUrlid(stop->pcur, x->pcur) <= 0; res->pcur++, stop->pcur++) {
        *res->pcur = *stop->pcur;
      }

      if (stop->pcur >= stop->plast) break;

      /* then x entries that sort at or before the next stop entry */
      for (; x->pcur < x->plast && DpsCmpUrlid(x->pcur, stop->pcur) <= 0; res->pcur++, x->pcur++) {
        *res->pcur = *x->pcur;
      }
    }
  }

  /* whatever is left of x is copied unchanged */
  for (; x->pcur < x->plast; res->pcur++, x->pcur++) {
    *res->pcur = *x->pcur;
  }

  return DPS_OK;
}
/*
 * Merge the coordinate lists of x1 and x2 into a newly allocated array in
 * `res`, ordered by DpsCmpUrlid().
 *
 * The merge alternates between the two inputs: entries are taken from the
 * current list while they compare <= the head of the other list, then the
 * roles are swapped.  When one list runs out the remainder of the other is
 * appended.
 *
 * On return res->pbegin is the start of the output and res->pcur is one past
 * the last entry written.  Returns DPS_OK, or DPS_ERROR if the output array
 * cannot be allocated.
 */
static int proceedOR(DPS_AGENT *query, DPS_STACK_ITEM *res, DPS_STACK_ITEM *x1, DPS_STACK_ITEM *x2) {
  DPS_STACK_ITEM *cur, *other, *swap;

  res->pcur = (DPS_URL_CRD_DB*)DpsMalloc((x1->count + x2->count + 1) * sizeof(DPS_URL_CRD_DB));
  res->pbegin = res->pcur;
  if (res->pbegin == NULL) {
    DpsLog(query, DPS_LOG_ERROR, "Can't alloc %d bytes for %d results",
           (x1->count + x2->count + 1) * sizeof(DPS_URL_CRD_DB), (x1->count + x2->count + 1));
    return DPS_ERROR;
  }

  x1->pcur = x1->pbegin;
  x1->plast = x1->pbegin + x1->count;
  x2->pcur = x2->pbegin;
  x2->plast = x2->pbegin + x2->count;

  cur = x1;
  other = x2;
  while (cur->pcur < cur->plast && other->pcur < other->plast) {
    for (; (cur->pcur < cur->plast) && (DpsCmpUrlid(cur->pcur, other->pcur) <= 0); res->pcur++, cur->pcur++) {
      *res->pcur = *cur->pcur;
    }
    /* the other list now holds the smaller head: switch roles */
    swap = cur;
    cur = other;
    other = swap;
  }

  /* at most one of the two lists still has entries */
  for (; cur->pcur < cur->plast; res->pcur++, cur->pcur++) {
    *res->pcur = *cur->pcur;
  }
  for (; other->pcur < other->plast; res->pcur++, other->pcur++) {
    *res->pcur = *other->pcur;
  }

  return DPS_OK;
}
/*
 * Read the record P->rec_id from the base described by P into buf.
 *
 * P    base parameters; DpsBaseSeek() is called with DPS_READ_LOCK to
 *      locate the record and fill P->Item.
 * buf  destination buffer.
 * len  capacity of buf in bytes.
 *
 * With zlib support, when P->zlib_method is Z_DEFLATED and the record has a
 * non-zero orig_size, the stored bytes are inflated into buf; otherwise the
 * stored bytes are read directly.
 *
 * Returns DPS_OK on success; the DpsBaseSeek() result if the seek fails;
 * DPS_ERROR if the record is not found, is larger than len, cannot be read,
 * or (zlib case) cannot be inflated completely.  Inflate failures used to be
 * ignored, which handed undefined buffer contents back as success.
 */
__C_LINK int __DPSCALL DpsBaseRead(DPS_BASE_PARAM *P, void *buf, size_t len) {
  int res = DPS_OK;
#ifdef HAVE_ZLIB
  z_stream zstream;
  Byte *CDoc = NULL;
  int zrc;
#endif

  if ((res = DpsBaseSeek(P, DPS_READ_LOCK)) != DPS_OK) return res;

  if (P->Item.rec_id != P->rec_id) {
    DpsLog(P->A, DPS_LOG_DEBUG, "%s:[%s/%s] Not found rec_id: %x", P->vardir, P->subdir, P->basename, P->rec_id);
    return DPS_ERROR;
  }

  if (lseek(P->Sfd, (off_t)P->Item.offset, SEEK_SET) == (off_t)-1) {
    DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s.%d] %ld lseek error, rec_id: %x",
           P->subdir, P->basename, P->FileNo, P->Item.offset, P->rec_id);
    return DPS_ERROR;
  }

  /* orig_size, when set, is the size the caller's buffer must hold */
  if ((P->Item.orig_size ? P->Item.orig_size : P->Item.size) > len) {
    DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s] size %d->%d error, rec_id: %x", P->subdir, P->basename,
           (P->Item.orig_size ? P->Item.orig_size : P->Item.size), len, P->rec_id);
    return DPS_ERROR;
  }

#ifdef HAVE_ZLIB
  bzero(&zstream, sizeof(zstream));
  if ((P->zlib_method == Z_DEFLATED) && (P->Item.orig_size != 0)) {
    zstream.avail_in = (uInt)P->Item.size;
    zstream.avail_out = (uInt)len;
    zstream.next_out = (Byte *) buf;
    CDoc = zstream.next_in = (Byte *) DpsMalloc(P->Item.size + 1);
    if (CDoc == NULL) {
      return DPS_ERROR;
    }
    zstream.zalloc = Z_NULL;
    zstream.zfree = Z_NULL;
    zstream.opaque = Z_NULL;
    if (read(P->Sfd, CDoc, P->Item.size) != (ssize_t)P->Item.size) {
      DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s] %d read error, rec_id: %x -- %d",
             P->subdir, P->basename, P->Item.size, P->rec_id, __LINE__);
      DPS_FREE(CDoc);
      return DPS_ERROR;
    }
    zrc = inflateInit2(&zstream, P->zlib_windowBits);
    if (zrc != Z_OK) {
      DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s] inflateInit2 error %d, rec_id: %x",
             P->subdir, P->basename, zrc, P->rec_id);
      DPS_FREE(CDoc);
      return DPS_ERROR;
    }
    /* With Z_FINISH a complete record must end in Z_STREAM_END. */
    zrc = inflate(&zstream, Z_FINISH);
    inflateEnd(&zstream);
    DPS_FREE(CDoc);
    if (zrc != Z_STREAM_END) {
      DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s] inflate error %d, rec_id: %x",
             P->subdir, P->basename, zrc, P->rec_id);
      return DPS_ERROR;
    }
  } else
#endif
  if (read(P->Sfd, buf, P->Item.size) != (ssize_t)P->Item.size) {
    DpsLog(P->A, DPS_LOG_ERROR, "[%s/%s] %d read error, rec_id: %x -- %d",
           P->subdir, P->basename, P->Item.size, P->rec_id, __LINE__);
    return DPS_ERROR;
  }

#ifdef DEBUG_SEARCH
  DpsLog(P->A, DPS_LOG_DEBUG, "[%s/%s] Retrieved rec_id: %x Size: %d", P->subdir, P->basename, P->rec_id, P->Item.size);
#endif
  return DPS_OK;
}
/*
 * Mark the record P->rec_id as deleted in the base described by P.
 *
 * DpsBaseSeek() is called with DPS_WRITE_LOCK to locate the record.  If it
 * is found, its index entry is rewritten in place at P->CurrentItemPos with
 * rec_id set to 0.  A record that is not found is only logged at debug
 * level and is not an error.
 *
 * Returns DPS_OK when the entry was cleared or was not present, the
 * DpsBaseSeek() result if the seek fails, and DPS_ERROR if the index file
 * cannot be positioned or written.
 */
__C_LINK int __DPSCALL DpsBaseDelete(DPS_BASE_PARAM *P) {
  int rc;

  rc = DpsBaseSeek(P, DPS_WRITE_LOCK);
  if (rc != DPS_OK) return rc;

  if (P->Item.rec_id != P->rec_id) {
    DpsLog(P->A, DPS_LOG_DEBUG, "[%s/%s] rec_id: %x not found for delete", P->subdir, P->basename, P->rec_id);
    return DPS_OK;
  }

  P->Item.rec_id = 0;

  if (lseek(P->Ifd, (off_t)P->CurrentItemPos, SEEK_SET) == (off_t)-1) {
    DpsLog(P->A, DPS_LOG_ERROR, "Can't seek file %s {%s:%d}", P->Ifilename, __FILE__, __LINE__);
    return DPS_ERROR;
  }

  if (write(P->Ifd, &P->Item, sizeof(DPS_BASEITEM)) != sizeof(DPS_BASEITEM)) {
    DpsLog(P->A, DPS_LOG_ERROR, "Can't write hash chain for file %s (%s:%d)", P->Ifilename, __FILE__, __LINE__);
    return DPS_ERROR;
  }

#ifdef DEBUG_SEARCH
  DpsLog(P->A, DPS_LOG_DEBUG, "[%s/%s] Deleted rec_id: %x", P->subdir, P->basename, P->rec_id);
#endif

  return DPS_OK;
}
/*
 * Ask searchd to perform a category action and load the reply into C.
 *
 * A    agent used for logging and for the error string in A->Conf->errstr.
 * C    category; C->addr is sent, and the returned category lines are
 *      parsed into C with DpsCatFromTextBuf().
 * cmd  action code, sent as the first int of the request payload.
 * db   DPS_DB whose `searchd` connection is used.
 *
 * The request is a CATINFO packet.  Replies are read until one of them ends
 * the exchange: ERROR (message stored in errstr), CATINFO (one category per
 * CR/LF separated line) or an unknown command.  MESSAGE packets are read
 * and discarded.
 *
 * Returns DPS_OK on success and DPS_ERROR on an incomplete header, a
 * searchd error, an unknown reply or an allocation failure.
 *
 * NOTE(review): the ERROR text comes from the peer and is written with
 * sprintf(); the capacity of errstr is not visible here - confirm it cannot
 * be exceeded.
 */
int __DPSCALL DpsSearchdCatAction(DPS_AGENT *A, DPS_CATEGORY *C, int cmd, void *db) {
  DPS_DB *searchd = db;
  DPS_SEARCHD_PACKET_HEADER hdr;
  char *buf;
  ssize_t nsent, nrecv;
  int done = 0;
  int rc = DPS_OK;
  char *msg = NULL;
  char *dinfo = NULL;

  TRACE_IN(A, "DpsSearchdCatAction");

  hdr.cmd = DPS_SEARCHD_CMD_CATINFO;
  hdr.len = sizeof(int) + dps_strlen(C->addr) + 1;

  if ((buf = (char*)DpsMalloc(hdr.len + 1)) == NULL) {
    DpsLog(A, DPS_LOG_ERROR, "Out of memory");
    TRACE_OUT(A);
    return DPS_ERROR;
  }

  /* payload layout: action code followed by the NUL-terminated address */
  *((int*)buf) = cmd;
  dps_strcpy(buf + sizeof(int), C->addr);
  nsent = DpsSearchdSendPacket(searchd->searchd, &hdr, buf);
  (void)nsent; /* a failed send surfaces as an incomplete header below */
  DPS_FREE(buf);

  while (!done) {
    char *tok, *saveptr;

    nrecv = DpsRecvall(searchd->searchd, &hdr, sizeof(hdr), 360);
    if (nrecv != sizeof(hdr)) {
      DpsLog(A, DPS_LOG_ERROR, "Received incomplete header from searchd (%d bytes)", (int)nrecv);
      TRACE_OUT(A);
      return(DPS_ERROR);
    }
#ifdef DEBUG_SDP
    DpsLog(A, DPS_LOG_ERROR, "Received header cmd=%d len=%d\n", hdr.cmd, hdr.len);
#endif

    switch (hdr.cmd) {

      case DPS_SEARCHD_CMD_ERROR:
        rc = DPS_ERROR;
        done = 1;
        msg = (char*)DpsMalloc(hdr.len + 1);
        if (msg == NULL) break;
        nrecv = DpsRecvall(searchd->searchd, msg, hdr.len, 360);
        msg[(nrecv >= 0) ? nrecv : 0] = '\0';
        sprintf(A->Conf->errstr, "Searchd error: '%s'", msg);
        DPS_FREE(msg);
        break;

      case DPS_SEARCHD_CMD_MESSAGE:
        msg = (char*)DpsMalloc(hdr.len + 1);
        if (msg == NULL) {
          rc = DPS_ERROR; /* do not report success when the reply could not be read */
          done = 1;
          break;
        }
        nrecv = DpsRecvall(searchd->searchd, msg, hdr.len, 360);
        msg[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
        DpsLog(A, DPS_LOG_ERROR, "Message from searchd: '%s'\n", msg);
#endif
        DPS_FREE(msg);
        break;

      case DPS_SEARCHD_CMD_CATINFO:
        done = 1;
        dinfo = (char*)DpsMalloc(hdr.len + 1);
        if (dinfo == NULL) {
          rc = DPS_ERROR;
          break;
        }
        nrecv = DpsRecvall(searchd->searchd, dinfo, hdr.len, 360);
        dinfo[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
        DpsLog(A, DPS_LOG_ERROR, "Received CATINFO size=%d buf=%s\n", hdr.len, dinfo);
#endif
        C->ncategories = 0;
        for (tok = dps_strtok_r(dinfo, "\r\n", &saveptr, NULL);
             tok != NULL;
             tok = dps_strtok_r(NULL, "\r\n", &saveptr, NULL)) {
          DpsCatFromTextBuf(C, tok);
        }
        DPS_FREE(dinfo);
        break;

      default:
        sprintf(A->Conf->errstr, "Unknown searchd response: cmd=%d len=%d", hdr.cmd, hdr.len);
        rc = DPS_ERROR;
        done = 1;
        break;
    }
  }

  TRACE_OUT(A);
  return rc;
}
/*
 * Read searchd's reply to a word query and store it in Res.
 *
 * query  agent; receives the error string (query->Conf->errstr) and, for a
 *        QLC packet, the replacement "q" variable.
 * Res    result being filled: total_found/grand_total, num_rows, Suggest,
 *        PerSite, WWList and CoordList.
 * cl     DPS_DB whose `searchd` connection is read.
 *
 * Packets are read until WORDS, ERROR, an unknown command or an allocation
 * failure ends the exchange.  On a normal exit the coordinate, URL-data and
 * (optionally) track buffers are handed over to Res->CoordList.
 *
 * Returns DPS_OK on success; DPS_ERROR on an incomplete header (buffers
 * collected so far are released), a searchd error, an unknown command or an
 * allocation failure for a packet body.
 *
 * NOTE(review): the WWL payload is parsed using the counts and lengths it
 * carries, without checking them against hdr.len; the ERROR text is written
 * to errstr with sprintf().  Both rely on a well-behaved peer.
 */
int DpsSearchdGetWordResponse(DPS_AGENT *query, DPS_RESULT *Res, DPS_DB *cl) {
  DPS_URL_CRD_DB *wrd = NULL;
  DPS_URLDATA *udt = NULL;
#ifdef WITH_REL_TRACK
  DPS_URLTRACK *trk = NULL;
#endif
  DPS_SEARCHD_PACKET_HEADER hdr;
  ssize_t nrecv;
  char *msg;
  int done = 0, rc = DPS_OK;
  char *wbuf, *p;
  DPS_WIDEWORDLIST_EX *wwl;
  DPS_WIDEWORD ww;
  size_t i;

  TRACE_IN(query, "DpsSearchdGetWordResponse");

  Res->total_found = 0;

  while (!done) {
    nrecv = DpsRecvall(cl->searchd, &hdr, sizeof(hdr), 360);
    if (nrecv != sizeof(hdr)) {
      sprintf(query->Conf->errstr, "Received incomplete header from searchd (%d bytes,errno:%d)", (int)nrecv, errno);
      /* nothing has been handed to Res yet: release what was collected */
      DPS_FREE(wrd);
      DPS_FREE(udt);
#ifdef WITH_REL_TRACK
      DPS_FREE(trk);
#endif
      TRACE_OUT(query);
      return DPS_ERROR;
    }
#ifdef DEBUG_SDP
    DpsLog(query, DPS_LOG_ERROR, "Received header cmd=%d len=%d\n", hdr.cmd, hdr.len);
#endif

    switch (hdr.cmd) {

      case DPS_SEARCHD_CMD_ERROR:
        rc = DPS_ERROR;
        done = 1;
        msg = (char*)DpsMalloc(hdr.len + 1);
        if (msg == NULL) break;
        nrecv = DpsRecvall(cl->searchd, msg, hdr.len, 360);
        if (nrecv >= 0) {
          msg[nrecv] = '\0';
          sprintf(query->Conf->errstr, "Searchd error: '%s',received:%d", msg, (int)nrecv);
        }
        DPS_FREE(msg);
        break;

      case DPS_SEARCHD_CMD_MESSAGE:
        msg = (char*)DpsMalloc(hdr.len + 1);
        if (msg == NULL) { rc = DPS_ERROR; done = 1; break; }
        nrecv = DpsRecvall(cl->searchd, msg, hdr.len, 360);
        msg[(nrecv >= 0) ? nrecv : 0] = '\0';
        /* "Total_found <found> <grand total>": only parse what was really
           received, and tolerate a missing second number */
        if (nrecv >= 12 && strncmp(msg, "Total_found", 11) == 0) {
          char *sep = strchr(msg + 12, (int)' ');
          Res->total_found = (size_t)DPS_ATOI(msg + 12);
          if (sep != NULL) {
            sep++;
            Res->grand_total = (size_t)DPS_ATOI(sep);
          }
        }
#ifdef DEBUG_SDP
        DpsLog(query, DPS_LOG_ERROR, "Message from searchd: '%s'\n", msg);
#endif
        DPS_FREE(msg);
        break;

      case DPS_SEARCHD_CMD_WORDS:
        DPS_FREE(wrd);
        wrd = (DPS_URL_CRD_DB*)DpsMalloc(hdr.len + 1);
        if (wrd == NULL) { rc = DPS_ERROR; done = 1; break; }
        nrecv = DpsRecvall(cl->searchd, wrd, hdr.len, 360);
        Res->num_rows = (nrecv >= 0) ? (size_t)nrecv / sizeof(*wrd) : 0;
#ifdef DEBUG_SDP
        DpsLog(query, DPS_LOG_ERROR, "Received words size=%d nwrd=%d\n", hdr.len, Res->num_rows);
#endif
        done = 1;
        break;

      case DPS_SEARCHD_CMD_SUGGEST:
        DPS_FREE(Res->Suggest);
        Res->Suggest = (char*)DpsMalloc(hdr.len + 1);
        if (Res->Suggest == NULL) { rc = DPS_ERROR; done = 1; break; }
        nrecv = DpsRecvall(cl->searchd, Res->Suggest, hdr.len, 360);
        Res->Suggest[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
        DpsLog(query, DPS_LOG_ERROR, "Received Suggest size=%d\n", hdr.len);
#endif
        break;

      case DPS_SEARCHD_CMD_PERSITE:
        Res->PerSite = (size_t*)DpsMalloc(hdr.len + 1);
        if (Res->PerSite == NULL) { rc = DPS_ERROR; done = 1; break; }
        nrecv = DpsRecvall(cl->searchd, Res->PerSite, hdr.len, 360);
#ifdef DEBUG_SDP
        DpsLog(query, DPS_LOG_ERROR, "Received PerSite size=%d nwrd=%d\n", nrecv, Res->num_rows);
#endif
        break;

      case DPS_SEARCHD_CMD_DATA:
        DPS_FREE(udt); /* a repeated DATA packet replaces the previous one */
        udt = (DPS_URLDATA*)DpsMalloc(hdr.len + 1);
        if (udt == NULL) { rc = DPS_ERROR; done = 1; break; }
        nrecv = DpsRecvall(cl->searchd, udt, hdr.len, 360);
#ifdef DEBUG_SDP
        DpsLog(query, DPS_LOG_ERROR, "Received URLDATA size=%d nwrd=%d\n", nrecv, Res->num_rows);
#endif
        break;

#ifdef WITH_REL_TRACK
      case DPS_SEARCHD_CMD_TRACKDATA:
        DPS_FREE(trk);
        trk = (DPS_URLTRACK*)DpsMalloc(hdr.len + 1);
        if (trk == NULL) { rc = DPS_ERROR; done = 1; break; }
        nrecv = DpsRecvall(cl->searchd, trk, hdr.len, 360);
#ifdef DEBUG_SDP
        DpsLog(query, DPS_LOG_ERROR, "Received TRACKDATA size=%d nwrd=%d\n", nrecv, Res->num_rows);
#endif
        break;
#endif

      case DPS_SEARCHD_CMD_WITHOFFSET:
        break;

      case DPS_SEARCHD_CMD_QLC:
        if ((p = (char *)DpsXmalloc(hdr.len + 1)) != NULL) {
          if (DpsRecvall(cl->searchd, p, hdr.len, 360)) {
            DpsVarListReplaceStr(&query->Vars, "q", p);
          }
        }
        DPS_FREE(p);
        break;

      case DPS_SEARCHD_CMD_WWL:
        /* NOTE(review): PerSite is reset here without being freed - confirm ownership */
        Res->PerSite = NULL;
        if ((wbuf = p = (char *)DpsXmalloc(hdr.len + 1)) != NULL) {
          if (DpsRecvall(cl->searchd, wbuf, hdr.len, 360)) {
            wwl = (DPS_WIDEWORDLIST_EX *)p;
            p += sizeof(DPS_WIDEWORDLIST_EX);
#ifdef DEBUG_SDP
            DpsLog(query, DPS_LOG_ERROR, "wbuf :%x, wwl: %x, p: %x hdr.len:%d\n", wbuf, wwl, p, hdr.len);
            DpsLog(query, DPS_LOG_ERROR, "Received WWL nwords=%d nuniq=%d\n", wwl->nwords, wwl->nuniq);
#endif
            for (i = 0; i < wwl->nwords; i++) {
              /* fixed-size part, then the word, then the aligned wide word */
              dps_memcpy((char*)&ww, p, sizeof(DPS_WIDEWORD_EX));
              p += sizeof(DPS_WIDEWORD_EX);
              ww.word = p;
#ifdef DEBUG_SDP
              DpsLog(query, DPS_LOG_ERROR, "Word {%d}: %s\n", ww.len+1, ww.word);
#endif
              p += ww.len + 1;
              p += sizeof(dpsunicode_t) - ((SDPALIGN)p % sizeof(dpsunicode_t));
              ww.uword = (dpsunicode_t*)p;
              p += sizeof(dpsunicode_t) * (ww.ulen + 1);
              DpsWideWordListAdd(&Res->WWList, &ww, DPS_WWL_STRICT);
            }
            Res->WWList.nuniq = wwl->nuniq;
          }
        }
        /* released on every path, including a receive that returned 0 */
        DPS_FREE(wbuf);
        break;

      default:
        sprintf(query->Conf->errstr, "Unknown searchd response: cmd=%d len=%d", hdr.cmd, hdr.len);
        rc = DPS_ERROR;
        done = 1;
        break;
    }
  }

  Res->CoordList.Coords = wrd;
  Res->CoordList.Data = udt;
#ifdef WITH_REL_TRACK
  Res->CoordList.Track = trk;
#endif
  TRACE_OUT(query);
  return rc;
}
/*
 * Fetch document details from searchd for the rows already present in Res.
 *
 * query  agent used for logging and for the error string.
 * cl     DPS_DB whose `searchd` connection is used.
 * Res    result set; every Res->Doc[i] is serialised with DpsDocToTextBuf()
 *        and sent, and matching replies are merged back into it.
 * clnum  not used by this function.
 *
 * Before serialising, the "Score" section of each document is flagged with
 * section = 1.  The request is one DOCINFO packet holding one CR/LF
 * terminated line per document.  In the reply each line is parsed, its
 * "DP_ID" is looked up among the rows of Res, and the first matching row is
 * updated from that line.
 *
 * Returns DPS_OK on success or when there is nothing to send; DPS_ERROR on a
 * serialisation or allocation failure, an incomplete header, a searchd
 * error or an unknown reply.  The request/reply buffer is released on every
 * exit path.
 */
int __DPSCALL DpsResAddDocInfoSearchd(DPS_AGENT * query, DPS_DB *cl, DPS_RESULT * Res, size_t clnum) {
  DPS_SEARCHD_PACKET_HEADER hdr;
  char *msg = NULL;
  size_t i;
  int done = 0;
  ssize_t nsent, nrecv;
  char *dinfo = NULL;
  int rc = DPS_OK;
  char *textbuf;
  size_t dlen = 0;

  TRACE_IN(query, "DpsResAddDocInfoSearchd");

  if (!Res->num_rows) {
    TRACE_OUT(query);
    return(DPS_OK);
  }

  for (i = 0; i < Res->num_rows; i++) {
    size_t ulen;
    size_t olen;
    size_t nsec, r;
    DPS_DOCUMENT *D = &Res->Doc[i];

    r = (size_t) 's';
    for (nsec = 0; nsec < D->Sections.Root[r].nvars; nsec++) {
      if (strcasecmp(D->Sections.Root[r].Var[nsec].name, "Score") == 0) D->Sections.Root[r].Var[nsec].section = 1;
    }

#ifdef WITH_MULTIDBADDR
    if (D->dbnum != cl->dbnum) continue;
#endif

    textbuf = DpsDocToTextBuf(D, 1, 0);
    if (textbuf == NULL) {
      DPS_FREE(dinfo); /* drop the lines collected so far */
      TRACE_OUT(query);
      return DPS_ERROR;
    }

    /* append "<textbuf>\r\n" to the request */
    ulen = dps_strlen(textbuf) + 2;
    olen = dlen;
    dlen = dlen + ulen;
    dinfo = (char*)DpsRealloc(dinfo, dlen + 1);
    if (dinfo == NULL) {
      DpsFree(textbuf);
      TRACE_OUT(query);
      return DPS_ERROR;
    }
    dinfo[olen] = '\0';
    sprintf(dinfo + olen, "%s\r\n", textbuf);
    DpsFree(textbuf);
  }

  if (dinfo == NULL) {
    TRACE_OUT(query);
    return DPS_OK;
  }

  hdr.cmd = DPS_SEARCHD_CMD_DOCINFO;
  hdr.len = dps_strlen(dinfo);
  nsent = DpsSearchdSendPacket(cl->searchd, &hdr, dinfo);
  (void)nsent; /* a failed send surfaces as an incomplete header below */
#ifdef DEBUG_SDP
  DpsLog(query, DPS_LOG_ERROR, "Sent DOCINFO size=%d buf=%s\n", hdr.len, dinfo);
#endif

  while (!done) {
    char *tok, *saveptr;

    nrecv = DpsRecvall(cl->searchd, &hdr, sizeof(hdr), 360);
    if (nrecv != sizeof(hdr)) {
      DpsLog(query, DPS_LOG_ERROR, "Received incomplete header from searchd (%d bytes, errno:%d)", (int)nrecv, errno);
      DPS_FREE(dinfo);
      TRACE_OUT(query);
      return(DPS_ERROR);
    }
#ifdef DEBUG_SDP
    DpsLog(query, DPS_LOG_ERROR, "Received header cmd=%d len=%d\n", hdr.cmd, hdr.len);
#endif

    switch (hdr.cmd) {

      case DPS_SEARCHD_CMD_ERROR:
        rc = DPS_ERROR;
        done = 1;
        msg = (char*)DpsMalloc(hdr.len + 1);
        if (msg == NULL) break;
        nrecv = DpsRecvall(cl->searchd, msg, hdr.len, 360);
        msg[(nrecv >= 0) ? nrecv : 0] = '\0';
        sprintf(query->Conf->errstr, "Searchd error: '%s'", msg);
        DPS_FREE(msg);
        break;

      case DPS_SEARCHD_CMD_MESSAGE:
        msg = (char*)DpsMalloc(hdr.len + 1);
        if (msg == NULL) { rc = DPS_ERROR; done = 1; break; }
        nrecv = DpsRecvall(cl->searchd, msg, hdr.len, 360);
        msg[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
        DpsLog(query, DPS_LOG_ERROR, "Message from searchd: '%s'\n", msg);
#endif
        DPS_FREE(msg);
        break;

      case DPS_SEARCHD_CMD_DOCINFO:
        done = 1;
        /* the request buffer is reused for the reply */
        dinfo = (char*)DpsRealloc(dinfo, hdr.len + 1);
        if (dinfo == NULL) { rc = DPS_ERROR; break; }
        nrecv = DpsRecvall(cl->searchd, dinfo, hdr.len, 360);
        dinfo[(nrecv > 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
        DpsLog(query, DPS_LOG_ERROR, "Received DOCINFO size=%d buf=%s\n", hdr.len, dinfo);
#endif
        tok = dps_strtok_r(dinfo, "\r\n", &saveptr, NULL);
        while (tok) {
          urlid_t Doc_url_id, Res_Doc_url_id;
          DPS_DOCUMENT Doc;

          /* parse into a scratch document only to learn the line's DP_ID */
          DpsDocInit(&Doc);
          DpsDocFromTextBuf(&Doc, tok);
          Doc_url_id = (urlid_t)DpsVarListFindInt(&Doc.Sections, "DP_ID", 0);

          for (i = 0; i < Res->num_rows; i++) {
#ifdef WITH_MULTIDBADDR
            if (Res->Doc[i].dbnum != cl->dbnum) continue;
#endif
            Res_Doc_url_id = (urlid_t)DpsVarListFindInt(&Res->Doc[i].Sections, "DP_ID", 0);
            if (Res_Doc_url_id == Doc_url_id) {
              DpsDocFromTextBuf(&Res->Doc[i], tok);
              break;
            }
          }
          tok = dps_strtok_r(NULL, "\r\n", &saveptr, NULL);
          DpsDocFree(&Doc);
        }
        break;

      default:
        sprintf(query->Conf->errstr, "Unknown searchd response: cmd=%d len=%d", hdr.cmd, hdr.len);
        rc = DPS_ERROR;
        done = 1;
        break;
    }
  }

  /* single release point: covers the ERROR/unknown paths as well as DOCINFO */
  DPS_FREE(dinfo);
  TRACE_OUT(query);
  return rc;
}
int main(int argc, char ** argv, char **envp) { const char *env, *bcharset, *lcharset, *conf_dir; char template_name[PATH_MAX+6]=""; char *template_filename = NULL; char *query_string = NULL; char self[1024]=""; char *url = NULL; const char *ResultContentType; int res,httpd=0; size_t catcolumns = 0; int page_size,page_number; DPS_ENV *Env; DPS_AGENT *Agent; DPS_VARLIST query_vars; /* Output Content-type if under HTTPD */ /* Some servers do not pass QUERY_STRING */ /* if the query was empty, so check */ /* REQUEST_METHOD too to be safe */ httpd=(getenv("QUERY_STRING")||getenv("REQUEST_METHOD")); if (!(conf_dir=getenv("DPS_ETC_DIR"))) conf_dir=DPS_CONF_DIR; DpsInit(argc, argv, envp); Env=DpsEnvInit(NULL); if (Env == NULL) { if(httpd){ printf("Content-Type: text/plain\r\n\r\n"); } printf("Can't alloc Env\n"); exit(0); } DpsVarListInit(&query_vars); Agent = DpsAgentInit(NULL, Env, 0); if (Agent == NULL) { if(httpd){ printf("Content-Type: text/plain\r\n\r\n"); } printf("Can't alloc Agent\n"); exit(0); } DpsVarListAddEnviron(&Env->Vars,"ENV"); /* Detect self and template name */ if((env = getenv("DPSEARCH_TEMPLATE"))) dps_strncpy(template_name, env, sizeof(template_name) - 1); else if((env = getenv("PATH_INFO")) && env[0]) dps_strncpy(template_name, env + 1, sizeof(template_name) - 1); if((env=getenv("DPSEARCH_SELF"))) dps_strncpy(self,env,sizeof(self)-1); if((env=getenv("QUERY_STRING"))){ query_string = (char*)DpsRealloc(query_string, dps_strlen(env) + 2); if (query_string == NULL) { if(httpd){ printf("Content-Type: text/plain\r\n\r\n"); } printf("Can't alloc query_string\n"); exit(0); } dps_strncpy(query_string, env, dps_strlen(env) + 1); /* Hack for Russian Apache from apache.lexa.ru */ /* QUERY_STRING is already converted to server */ /* character set. We must print original query */ /* string instead however. Under usual apache */ /* we'll use QUERY_STRING. 
Note that query_vars */ /* list will contain not unescaped values, so */ /* we don't have to escape them when displaying */ env = getenv("CHARSET_SAVED_QUERY_STRING"); DpsParseQStringUnescaped(&query_vars,env?env:query_string); /* Unescape and save variables from QUERY_STRING */ /* Env->Vars will have unescaped values however */ DpsParseQueryString(Agent,&Env->Vars,query_string); template_filename = (char*)DpsStrdup(DpsVarListFindStr(&Env->Vars, "tmplt", "")); if((env=getenv("REDIRECT_STATUS"))){ /* Check Apache internal redirect */ /* via "AddHandler" and "Action" */ if(!self[0]){ dps_strncpy(self,(env=getenv("REDIRECT_URL"))?env:"filler.cgi",sizeof(self)-1); } if(!template_name[0]){ dps_strncpy(template_name,(env=getenv("PATH_TRANSLATED"))?env:"",sizeof(template_name)-1); } if (*template_filename == '\0') { DPS_FREE(template_filename); template_filename = (char*)DpsStrdup("filler.htm"); } }else{ /* CGI executed without Apache internal redirect */ /* Detect $Self variable with OS independant SLASHES */ if(!self[0]){ dps_strncpy(self,(env=getenv("SCRIPT_NAME"))?env:"filler.cgi",sizeof(self)-1); } if(!template_name[0]){ char *s,*e; /*This is with OS specific SLASHES */ env=((env=getenv("SCRIPT_FILENAME"))?env:"filler.cgi"); if(strcmp(conf_dir,".")){ /* Take from the config directory */ dps_snprintf(template_name, sizeof(template_name)-1, "%s/%s", conf_dir,(s=strrchr(env,DPSSLASH))?(s+1):(self)); }else{ /* Take from the current directory */ dps_strncpy(template_name,env,sizeof(template_name)-1); } /* Find right slash if it presents */ s=((s=strrchr(template_name,DPSSLASH))?s:template_name); if (*template_filename == '\0') { /* Find .cgi substring */ if ((e = strstr(s, ".cgi")) != NULL) { /* Replace ".cgi" with ".htm" */ e[1]='h';e[2]='t';e[3]='m'; } else { dps_strcat(s, ".htm"); } e = strrchr(s, '/'); DPS_FREE(template_filename); template_filename = (char*)DpsStrdup(e + 1); } else { dps_strncpy(s + 1, template_filename, sizeof(template_name) - (s - template_name) - 
2); } } } }else{ /* Executed from command line */ /* or under server which does not */ /* pass an empty QUERY_STRING var */ if(argv[1]) { query_string = (char*)DpsRealloc(query_string, dps_strlen(argv[1]) + 10); if (query_string == NULL) { if(httpd){ printf("Content-Type: text/plain\r\n\r\n"); } printf("Can't realloc query_string\n"); exit(0); } sprintf(query_string, "q=%s", argv[1]); } else { query_string = (char*)DpsRealloc(query_string, 1024); if (query_string == NULL) { if(httpd){ printf("Content-Type: text/plain\r\n\r\n"); } printf("Can't realloc query_string\n"); exit(0); } sprintf(query_string, "q="); } /* Hack for Russian Apache from apache.lexa.ru */ /* QUERY_STRING is already converted to server */ /* character set. We must print original query */ /* string instead however. Under usual apache */ /* we'll use QUERY_STRING. Note that query_vars */ /* list will contain not unescaped values, so */ /* we don't have to escape them when displaying */ env = getenv("CHARSET_SAVED_QUERY_STRING"); DpsParseQStringUnescaped(&query_vars,env?env:query_string); /* Unescape and save variables from QUERY_STRING */ /* Env->Vars will have unescaped values however */ DpsParseQueryString(Agent,&Env->Vars,query_string); DPS_FREE(template_filename); template_filename = (char*)DpsStrdup(DpsVarListFindStr(&Env->Vars, "tmplt", "")); if (*template_filename == '\0') { DPS_FREE(template_filename); template_filename = (char*)DpsStrdup("filler.htm"); } /*// Get template name from command line variable &tmplt */ if(!template_name[0]) dps_snprintf(template_name,sizeof(template_name),"%s/%s", conf_dir, template_filename); } DpsVarListReplaceStr(&Agent->Conf->Vars, "tmplt", template_filename); DPS_FREE(template_filename); Agent->tmpl.Env_Vars = &Env->Vars; DpsURLNormalizePath(template_name); if (strncmp(template_name, conf_dir, dps_strlen(conf_dir)) || (res = DpsTemplateLoad(Agent, Env, &Agent->tmpl, template_name))) { if (strcmp(template_name, "filler.htm")) { /* trying load default 
template */ fprintf(stderr, "Can't load template: '%s' %s\n", template_name, Env->errstr); DPS_FREE(template_filename); template_filename = (char*)DpsStrdup("filler.htm"); dps_snprintf(template_name, sizeof(template_name), "%s/%s", conf_dir, template_filename); if ((res = DpsTemplateLoad(Agent, Env, &Agent->tmpl, template_name))) { if(httpd)printf("Content-Type: text/plain\r\n\r\n"); printf("%s\n",Env->errstr); DpsVarListFree(&query_vars); DpsEnvFree(Env); DPS_FREE(query_string); DpsAgentFree(Agent); return(0); } } else { if(httpd)printf("Content-Type: text/plain\r\n\r\n"); printf("%s\n",Env->errstr); DpsVarListFree(&query_vars); DpsEnvFree(Env); DPS_FREE(query_string); DpsAgentFree(Agent); return(0); } } /* set locale if specified */ if ((url = DpsVarListFindStr(&Env->Vars, "Locale", NULL)) != NULL) { setlocale(LC_ALL, url); /*#ifdef HAVE_ASPELL*/ { char *p; if ((p = strchr(url, '.')) != NULL) { *p = '\0'; DpsVarListReplaceStr(&Env->Vars, "g-lc", url); *p = '.'; } } /*#endif*/ url = NULL; } /* Call again to load search Limits if need */ DpsParseQueryString(Agent, &Env->Vars, query_string); Agent->Flags = Env->Flags; Agent->flags |= DPS_FLAG_UNOCON; Env->flags |= DPS_FLAG_UNOCON; DpsSetLogLevel(NULL, DpsVarListFindInt(&Env->Vars, "LogLevel", 0)); DpsOpenLog("filler.cgi", Env, !strcasecmp(DpsVarListFindStr(&Env->Vars, "Log2stderr", (!httpd) ? 
"yes" : "no"), "yes")); DpsLog(Agent,DPS_LOG_ERROR,"filler.cgi started with '%s'",template_name); DpsLog(Agent, DPS_LOG_DEBUG, "VarDir: '%s'", DpsVarListFindStr(&Agent->Conf->Vars, "VarDir", DPS_VAR_DIR)); DpsLog(Agent, DPS_LOG_DEBUG, "Affixes: %d, Spells: %d, Synonyms: %d, Acronyms: %d, Stopwords: %d", Env->Affixes.naffixes,Env->Spells.nspell, Env->Synonyms.nsynonyms, Env->Acronyms.nacronyms, Env->StopWords.nstopwords); DpsLog(Agent, DPS_LOG_DEBUG, "Chinese dictionary with %d entries", Env->Chi.nwords); DpsLog(Agent, DPS_LOG_DEBUG, "Korean dictionary with %d entries", Env->Korean.nwords); DpsLog(Agent, DPS_LOG_DEBUG, "Thai dictionary with %d entries", Env->Thai.nwords); DpsVarListAddLst(&Agent->Vars, &Env->Vars, NULL, "*"); Agent->tmpl.Env_Vars = &Agent->Vars; /* DpsVarListAddEnviron(&Agent->Vars, "ENV");*/ /****************************************************************************************************************************************/ /* This is for query tracking */ DpsVarListAddStr(&Agent->Vars, "QUERY_STRING", query_string); DpsVarListAddStr(&Agent->Vars, "self", self); env = getenv("HTTP_X_FORWARDER_FOR"); if (env) { DpsVarListAddStr(&Agent->Vars, "IP", env); } else { env = getenv("REMOTE_ADDR"); DpsVarListAddStr(&Agent->Vars, "IP", env ? 
env : "localhost"); } bcharset = DpsVarListFindStr(&Agent->Vars, "BrowserCharset", "iso-8859-1"); Env->bcs=DpsGetCharSet(bcharset); lcharset = DpsVarListFindStr(&Agent->Vars, "LocalCharset", "iso-8859-1"); Env->lcs=DpsGetCharSet(lcharset); ResultContentType = DpsVarListFindStr(&Agent->Vars, "ResultContentType", "text/html"); if(httpd){ if(!Env->bcs){ printf("Content-Type: text/plain\r\n\r\n"); printf("Unknown BrowserCharset '%s' in template '%s'\n",bcharset,template_name); exit(0); }else if(!Env->lcs){ printf("Content-Type: text/plain\r\n\r\n"); printf("Unknown LocalCharset '%s' in template '%s'\n",lcharset,template_name); exit(0); }else{ printf("Content-type: %s; charset=%s\r\n\r\n", ResultContentType, bcharset); } }else{ if(!Env->bcs){ printf("Unknown BrowserCharset '%s' in template '%s'\n",bcharset,template_name); exit(0); } if(!Env->lcs){ printf("Unknown LocalCharset '%s' in template '%s'\n",lcharset,template_name); exit(0); } } /* These parameters taken from "variable section of template"*/ res = DpsVarListFindInt(&Agent->Vars, "ps", DPS_DEFAULT_PS); page_size = dps_min(res, MAX_PS); page_number = DpsVarListFindInt(&Agent->Vars, "p", 0); if (page_number == 0) { page_number = DpsVarListFindInt(&Agent->Vars, "np", 0); DpsVarListReplaceInt(&Agent->Vars, "p", page_number + 1); } else page_number--; res = DpsVarListFindInt(&Agent->Vars, "np", 0) * page_size; DpsVarListAddInt(&Agent->Vars, "pn", res); catcolumns = (size_t)atoi(DpsVarListFindStr(&Agent->Vars, "CatColumns", "")); DpsTemplatePrint(Agent, (DPS_OUTPUTFUNCTION)&fprintf, stdout, NULL, 0, &Agent->tmpl, "top"); DpsTemplatePrint(Agent, (DPS_OUTPUTFUNCTION)&fprintf, stdout, NULL, 0, &Agent->tmpl, "restop"); DpsTemplatePrint(Agent, (DPS_OUTPUTFUNCTION)&fprintf, stdout, NULL, 0, &Agent->tmpl, "res"); DpsTemplatePrint(Agent, (DPS_OUTPUTFUNCTION)&fprintf, stdout, NULL, 0, &Agent->tmpl, "resbot"); DpsTemplatePrint(Agent, (DPS_OUTPUTFUNCTION)&fprintf, stdout, NULL, 0, &Agent->tmpl, "bottom"); 
DpsVarListFree(&query_vars); DpsAgentFree(Agent); DpsEnvFree(Env); DPS_FREE(query_string); DPS_FREE(url); if (httpd) fflush(NULL); else fclose(stdout); #ifdef EFENCE fprintf(stderr, "Memory leaks checking\n"); DpsEfenceCheckLeaks(); #endif #ifdef FILENCE fprintf(stderr, "FD leaks checking\n"); DpsFilenceCheckLeaks(NULL); #endif return DPS_OK; }
static int MakeLinearIndex(DPS_AGENT *Indexer, const char *field, const char *lim_name, int type, DPS_DB *db) { DPS_ENV *Conf = Indexer->Conf; DPS_UINT4URLIDLIST L; size_t k,prev; urlid_t *data = NULL; DPS_UINT4_POS_LEN *ind=NULL; size_t mind=1000,nind=0; char fname[PATH_MAX]; int dat_fd=0, ind_fd=0, rc; const char *vardir = (db->vardir) ? db->vardir : DpsVarListFindStr(&Conf->Vars, "VarDir", DPS_VAR_DIR); bzero(&L, sizeof(DPS_UINT4URLIDLIST)); rc = DpsLimit4(Indexer, &L, field, type, db); if(rc != DPS_OK) { DpsLog(Indexer, DPS_LOG_ERROR, "Error: %s [%s:%d]", DpsEnvErrMsg(Conf), __FILE__, __LINE__); goto err1; } if(!L.Item)return(1); if (L.nitems > 1) DpsSort(L.Item, L.nitems, sizeof(DPS_UINT4URLID), (qsort_cmp)cmp_ind4); data = (urlid_t*)DpsMalloc((L.nitems + 1) * sizeof(*data)); if(!data) { fprintf(stderr,"Error1: %s\n",strerror(errno)); goto err1; } ind=(DPS_UINT4_POS_LEN*)DpsMalloc(mind*sizeof(DPS_UINT4_POS_LEN)); if(!ind) { fprintf(stderr,"Error2: %s\n",strerror(errno)); goto err1; } prev=0; for(k=0; k<L.nitems; k++) { data[k]=L.Item[k].url_id; if((k==L.nitems-1) || (L.Item[k].val!=L.Item[prev].val)) { if(nind==mind) { mind+=1000; ind=(DPS_UINT4_POS_LEN*)DpsRealloc(ind,mind*sizeof(DPS_UINT4_POS_LEN)); if(!ind) { fprintf(stderr,"Error3: %s\n",strerror(errno)); goto err1; } } /* Fill index */ ind[nind].val=L.Item[prev].val; ind[nind].pos = prev * sizeof(*data); if (k == L.nitems - 1) ind[nind].len = (k - prev + 1) * sizeof(*data); else ind[nind].len = (k - prev) * sizeof(*data); DpsLog(Indexer, DPS_LOG_DEBUG, "%d - pos:%x len:%d\n", ind[nind].val, (int)ind[nind].pos, ind[nind].len); nind++; prev=k; } } if (L.mapped) { #ifdef HAVE_SYS_MMAN_H if (munmap(L.Item, (L.nitems + 1) * sizeof(DPS_UINT4URLID))) { fprintf(stderr, "Can't shmdt '%s': %s\n", L.shm_name, strerror(errno)); } #elif defined(HAVE_SYS_SHM_H) if (shmdt(L.Item)) { fprintf(stderr, "Can't shmdt '%s': %s\n", L.shm_name, strerror(errno)); } #endif unlink(L.shm_name); } else { DPS_FREE(L.Item); } 
dps_snprintf(fname,sizeof(fname),"%s%c%s%c%s.dat", vardir,DPSSLASH, DPS_TREEDIR, DPSSLASH, lim_name); if((dat_fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) { fprintf(stderr,"Can't open '%s': %s\n",fname,strerror(errno)); goto err1; } DpsWriteLock(dat_fd); if((L.nitems * sizeof(*data)) != (size_t)write(dat_fd, data, L.nitems * sizeof(*data))) { fprintf(stderr,"Can't write '%s': %s\n",fname,strerror(errno)); goto err1; } DpsUnLock(dat_fd); DpsClose(dat_fd); DPS_FREE(data); dps_snprintf(fname,sizeof(fname),"%s%c%s%c%s.ind", vardir,DPSSLASH, DPS_TREEDIR, DPSSLASH, lim_name); if((ind_fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) { fprintf(stderr,"Can't open '%s': %s\n",fname,strerror(errno)); goto err1; } DpsWriteLock(ind_fd); if((nind*sizeof(DPS_UINT4_POS_LEN)) != (size_t)write(ind_fd,ind,nind*sizeof(DPS_UINT4_POS_LEN))) { fprintf(stderr,"Can't write '%s': %s\n",fname,strerror(errno)); goto err1; } DpsUnLock(ind_fd); DpsClose(ind_fd); DPS_FREE(ind); return(0); err1: if (L.mapped) { #ifdef HAVE_SYS_MMAN_H if (munmap(L.Item, (L.nitems + 1) * sizeof(DPS_UINT4URLID))) { fprintf(stderr, "Can't shmdt '%s': %s\n", L.shm_name, strerror(errno)); } #elif defined(HAVE_SYS_SHM_H) if (shmdt(L.Item)) { fprintf(stderr, "Can't shmdt '%s': %s\n", L.shm_name, strerror(errno)); } #endif unlink(L.shm_name); } else { DPS_FREE(L.Item); } DPS_FREE(data); DPS_FREE(ind); if(dat_fd) DpsClose(dat_fd); if(ind_fd) DpsClose(ind_fd); return(1); }
int main(int argc,char **argv, char **envp) { int ch, sleeps = 1, optimize = 0, obi = 0; unsigned int from = 0, to = 0xFFF, p_to = 0; DPS_ENV * Env; const char * config_name = DPS_CONF_DIR "/cached.conf"; DpsInit(argc, argv, envp); /* Initialize library */ DpsInitMutexes(); Env=DpsEnvInit(NULL); if (Env == NULL) exit(1); DpsSetLockProc(Env, DpsLockProc); /*#ifndef HAVE_SETPROCTITLE*/ ARGV = argv; ARGC = argc; /*#endif*/ while ((ch = getopt(argc, argv, "blt:f:op:w:v:h?")) != -1){ switch (ch) { case 'f': sscanf(optarg, "%x", &from); break; case 't': sscanf(optarg, "%x", &p_to); break; case 'w': DpsVarListReplaceStr(&Env->Vars, "VarDir", optarg); break; case 'v': DpsSetLogLevel(NULL, atoi(optarg)); break; case 'b': obi++; break; case 'o': optimize++; break; case 'p': sleeps = atoi(optarg); break; case 'h': case '?': default: usage(); DpsEnvFree(Env); DpsDeInit(); DpsDestroyMutexes(); return 1; break; } } argc -= optind; argv += optind; if(argc > 1) { usage(); DpsEnvFree(Env); DpsDeInit(); DpsDestroyMutexes(); return 1; } else if (argc == 1) { config_name = argv[0]; } { DPS_LOGDEL *del_buf=NULL; size_t del_count = 0, log, bytes, n = 0; int dd, log_fd; struct stat sb; char dname[PATH_MAX] = ""; DPS_BASE_PARAM P; DPS_LOGWORD *log_buf = NULL; DPS_AGENT *Indexer = DpsAgentInit(NULL, Env, 0); log2stderr = 1; if (Indexer == NULL) { fprintf(stderr, "Can't alloc Agent at %s:%d\n", __FILE__, __LINE__); exit(DPS_ERROR); } if(DPS_OK != DpsEnvLoad(Indexer, config_name, (dps_uint8)0)){ fprintf(stderr, "%s\n", DpsEnvErrMsg(Env)); DpsEnvFree(Env); DpsDeInit(); DpsDestroyMutexes(); return DPS_ERROR; } DpsOpenLog("splitter", Env, log2stderr); Indexer->flags = Env->flags = DPS_FLAG_UNOCON; DpsVarListAddLst(&Indexer->Vars, &Env->Vars, NULL, "*"); bzero(&P, sizeof(P)); P.subdir = DPS_TREEDIR; P.basename = "wrd"; P.indname = "wrd"; P.mode = DPS_WRITE_LOCK; P.NFiles = DpsVarListFindInt(&Indexer->Conf->Vars, "WrdFiles", 0x300); P.vardir = DpsStrdup(DpsVarListFindStr(&Indexer->Conf->Vars, 
"VarDir", DPS_VAR_DIR)); P.A = Indexer; if (p_to != 0) to = p_to; else to = P.NFiles - 1; #ifdef HAVE_ZLIB P.zlib_method = Z_DEFLATED; P.zlib_level = 9; P.zlib_windowBits = DPS_BASE_WRD_WINDOWBITS; P.zlib_memLevel = 9; P.zlib_strategy = DPS_BASE_WRD_STRATEGY; #endif /* Open del log file */ dps_snprintf(dname,sizeof(dname),"%s%c%s%cdel-split.log", P.vardir, DPSSLASH, DPS_SPLDIR, DPSSLASH); if((dd = DpsOpen2(dname, O_RDONLY | DPS_BINARY)) < 0) { dps_strerror(NULL, 0, "Can't open del log '%s'", dname); exit(DPS_ERROR); } DpsLog(Indexer, DPS_LOG_DEBUG, "VarDir: %s, WrdFiles: %d [%x]", P.vardir, P.NFiles, P.NFiles); /* Allocate del buffer */ fstat(dd, &sb); if (sb.st_size != 0) { del_buf=(DPS_LOGDEL*)DpsMalloc((size_t)sb.st_size + 1); if (del_buf == NULL) { fprintf(stderr, "Can't alloc %d bytes at %s:%d\n", (int)sb.st_size, __FILE__, __LINE__); exit(0); } del_count=read(dd,del_buf,(size_t)sb.st_size)/sizeof(DPS_LOGDEL); } DpsClose(dd); /* Remove duplicates URLs in DEL log */ /* Keep only oldest records for each URL */ if (del_count > 0) { DpsLog(Indexer, DPS_LOG_DEBUG, "Sorting del_buf: %d items", del_count); if (del_count > 1) DpsSort(del_buf, (size_t)del_count, sizeof(DPS_LOGDEL), DpsCmpurldellog); DpsLog(Indexer, DPS_LOG_DEBUG, "Removing DelLogDups"); del_count = DpsRemoveDelLogDups(del_buf, del_count); } DpsLog(Indexer, DPS_LOG_DEBUG, "Processing Bufs from %d [%x] to %d [%x]", from, from, to, to); for(log = from; log <= to; log++) { /* Open log file */ dps_snprintf(dname, sizeof(dname), "%s%c%s%c%03X-split.log", P.vardir, DPSSLASH, DPS_SPLDIR, DPSSLASH, log); if((log_fd = DpsOpen2(dname, O_RDWR|DPS_BINARY)) < 0){ if (errno == ENOENT) { dps_strerror(Indexer, DPS_LOG_DEBUG, "Can't open '%s'", dname); n = 0; /* continue;*/ } else { dps_strerror(Indexer, DPS_LOG_ERROR, "Can't open '%s'", dname); continue; } } else { DpsWriteLock(log_fd); DpsLog(Indexer, DPS_LOG_DEBUG, "Processing Log: %x", log); fstat(log_fd, &sb); log_buf = (sb.st_size > 0) ? 
(DPS_LOGWORD*)DpsMalloc((size_t)sb.st_size + 1) : NULL; if (log_buf != NULL) { unlink(dname); bytes = read(log_fd,log_buf,(size_t)sb.st_size); (void)ftruncate(log_fd, (off_t)0); DpsUnLock(log_fd); DpsClose(log_fd); n = bytes / sizeof(DPS_LOGWORD); DpsLog(Indexer, DPS_LOG_DEBUG, "Sorting log_buf: %d items", n); if (n > 1) DpsSort(log_buf, n, sizeof(DPS_LOGWORD), (qsort_cmp)DpsCmplog); DpsLog(Indexer, DPS_LOG_DEBUG, "Removing OldWords"); n = DpsRemoveOldWords(log_buf, n, del_buf, del_count); if (n > 1) DpsSort(log_buf, n, sizeof(DPS_LOGWORD), (qsort_cmp)DpsCmplog_wrd); } else { n = 0; DpsUnLock(log_fd); DpsClose(log_fd); } } DpsLog(Indexer, DPS_LOG_DEBUG, "Processing Buf, optimize: %d", optimize); if (obi) DpsBaseOptimize(&P, log); DpsProcessBuf(Indexer, &P, log, log_buf, n, del_buf, del_count); if (optimize) DpsBaseOptimize(&P, log); DpsBaseClose(&P); DPS_FREE(log_buf); DpsLog(Indexer, DPS_LOG_DEBUG, "pas done: %d from %d to %d", log, from, to); DPSSLEEP(sleeps); } DPS_FREE(del_buf); DpsAgentFree(Indexer); DPS_FREE(P.vardir); } fprintf(stderr, "Splitting done.\n"); DpsEnvFree(Env); DpsDeInit(); DpsDestroyMutexes(); #ifdef EFENCE fprintf(stderr, "Memory leaks checking\n"); DpsEfenceCheckLeaks(); #endif #ifdef FILENCE fprintf(stderr, "FD leaks checking\n"); DpsFilenceCheckLeaks(NULL); #endif return 0; }
/*
 * Compute one operation and store result.
 *
 * Pops the operand(s) of boolean command `com` from stack `s` with POPARG(),
 * combines their coordinate lists and pushes the outcome with PUSHARG().
 *
 *   DPS_STACK_PHRASE_LEFT  one operand; when it spans more than one query
 *                          word (order_from != order_to) only coordinates
 *                          that form the phrase in order are kept in place.
 *   DPS_STACK_OR           two operands, merged by proceedOR().
 *   DPS_STACK_NEAR         two operands; a document matches when a pair of
 *                          word positions lies within 16 of each other.
 *   DPS_STACK_ANYWORD      two operands; a document matches when a pair of
 *                          word positions differs by exactly 2, in operand
 *                          order (flag1 tracks operand swaps).
 *   DPS_STACK_AND          two operands; documents present in both (or, when
 *                          one operand carries DPS_STACK_WORD_NOT, documents
 *                          of the other operand absent from it).
 *   DPS_STACK_NOT          toggles DPS_STACK_WORD_NOT on the operand and
 *                          pushes it back.
 *
 * When one of two operands is missing, the other one is passed through.
 *
 * Returns the PUSHARG() result (DPS_OK initially) or DPS_ERROR when an
 * allocation or a proceedOR()/proceedSTOP() call fails.
 */
static int perform(DPS_AGENT *query, DPS_RESULT *Res, DPS_BOOLSTACK *s, int com) {
  DPS_STACK_ITEM res, *x1, *x2;
  int rc = DPS_OK, found, flag1;

  bzero(&res, sizeof(res));

  switch(com){

    case DPS_STACK_PHRASE_LEFT:
      x1 = POPARG(s);
      if (x1 == NULL) {
        bzero(&res, sizeof(res));
      } else {
        res = *x1;
        /* FIXME: add checking ? */
        if (res.order_from != res.order_to) {
          DPS_URL_CRD_DB *w;
          dps_uint4 *pos_real, *order_ideal, *order_real, *gap_ahead/*, *gap_back*/;
          urlid_t curlid;
          size_t nwords = res.order_to - res.order_from + 1, nonstop_words;
          size_t p_cmp, p_ins;

          res.plast = res.pbegin + res.count;
          w = res.pcur = res.pchecked = res.pbegin;

          /* One allocation carved into per-word arrays of nwords entries each. */
          if ((pos_real = (dps_uint4*)DpsMalloc(5 * nwords * sizeof(dps_uint4) + 1)) == NULL) {
            DpsLog(query, DPS_LOG_ERROR, "Can't alloc %d bytes %s:%d", (int)(5 * nwords * sizeof(dps_uint4) + 1), __FILE__, __LINE__);
            return DPS_ERROR;
          }
          order_real = pos_real + nwords;
          gap_ahead = order_real + nwords;
          order_ideal = gap_ahead + nwords;
          /* gap_back = order_ideal + nwords;*/

          /* Expected order of the non-stop words and the position gap to the next one. */
          nonstop_words = 0;
          {
            register size_t tt;
            for (tt = res.order_from; tt <= res.order_to; tt++) {
              if ((Res->items[tt].order_origin & DPS_WORD_ORIGIN_STOP) == 0) {
                order_ideal[nonstop_words] = tt;
                gap_ahead[nonstop_words] = (tt == res.order_to) ? 0 : 1;
                {
                  register size_t zz;
                  for (zz = tt + 1; zz <= res.order_to; zz++) {
                    if (Res->items[zz].order_origin & DPS_WORD_ORIGIN_STOP) gap_ahead[nonstop_words]++;
                    else break;
                  }
                }
                nonstop_words++;
              }
            }
          }
#ifdef DEBUG_BOOL
          DpsLog(query, DPS_LOG_EXTRA, "nonstopwords: %d nwords:%d", nonstop_words, nwords);
          {
            register size_t tt;
            for (tt = 0; tt < nonstop_words; tt++) {
              DpsLog(query, DPS_LOG_EXTRA, "%d:order_ideal:%d gap_ahead:%d", tt, order_ideal[tt], gap_ahead[tt]);
            }
          }
#endif

          if (nonstop_words != 0) {
            while (res.pcur < res.plast) {
              register size_t tt;

              curlid = res.pcur->url_id;
              found = 0;
              p_ins = 0;
              p_cmp = nwords - nonstop_words;
              res.pchecked = res.pcur;
              /******* ? *****/
              for (tt = 0; tt < nwords - nonstop_words; tt++) pos_real[tt] = 0;

              /* Prime the circular window with the first nonstop_words positions of this document. */
              for (tt = 0; (tt < nonstop_words) && (res.pcur < res.plast) && (res.pcur->url_id == curlid) ; tt++) {
                order_real[p_ins] = Res->WWList.Word[DPS_WRDNUM(res.pcur->coord)].order;
                pos_real[p_ins] = DPS_WRDPOS(res.pcur->coord);
                while((res.pcur < res.plast) && (pos_real[p_ins] == DPS_WRDPOS(res.pcur->coord))) res.pcur++;
                p_ins++;
                p_cmp++;
                p_ins %= nwords;
                p_cmp %= nwords;
              }

              if (tt == nonstop_words) {
                /* [[[[[[ */
                found = 1;
                for (tt = 0; tt < nonstop_words; tt++) {
                  if (order_real[(p_cmp + tt) % nwords] != order_ideal[tt]) {
                    found = 0;
                    break;
                  }
                  if (gap_ahead[tt] && (tt + gap_ahead[tt] < nwords)
                      && (pos_real[(p_cmp + tt) % nwords] + gap_ahead[tt] != pos_real[(p_cmp + tt + 1) % nwords])) {
                    found = 0;
                    break;
                  }
                }
                if (found) {
                  while((res.pchecked < res.pcur) /*&& (res.pchecked->url_id == curlid)*/) {
                    *w = *res.pchecked;
                    w++;
                    res.pchecked++;
                  }
                  res.pcur = res.pchecked;
                } else {
                  res.pchecked = res.pcur;
                }
                /* ]]]]]] */
              }

              /* Slide the window over the remaining positions of this document. */
              while (/*(found == 0) &&*/ (res.pcur < res.plast) && (res.pcur->url_id == curlid)) {
                order_real[p_ins] = Res->WWList.Word[DPS_WRDNUM(res.pcur->coord)].order;
                pos_real[p_ins] = DPS_WRDPOS(res.pcur->coord);
                while((res.pcur < res.plast) && (pos_real[p_ins] == DPS_WRDPOS(res.pcur->coord))) res.pcur++;
                p_ins++;
                p_cmp++;
                p_ins %= nwords;
                p_cmp %= nwords;
                /* [[[[[[ */
                found = 1;
                for (tt = 0; tt < nonstop_words; tt++) {
                  if (order_real[(p_cmp + tt) % nwords] != order_ideal[tt]) {
                    found = 0;
                    break;
                  }
                  if (gap_ahead[tt] && (tt + gap_ahead[tt] < nwords)
                      && (pos_real[(p_cmp + tt) % nwords] + gap_ahead[tt] != pos_real[(p_cmp + tt + 1) % nwords])) {
                    found = 0;
                    break;
                  }
                }
                if (found) {
                  while((res.pchecked < res.pcur) /*&& (res.pchecked->url_id == curlid)*/) {
                    *w = *res.pchecked;
                    w++;
                    res.pchecked++;
                  }
                  res.pcur = res.pchecked;
                } else {
                  res.pchecked = res.pcur;
                }
                /* ]]]]]] */
              }
            }
          }
          res.count = w - res.pbegin;
          DPS_FREE(pos_real);
        }
      }
#ifdef DEBUG_BOOL
      DpsLog(query, DPS_LOG_EXTRA, "Perform <{%d}:%d:%d> ->{%d}", x1 ? x1->count : 0, x1 ? x1->order_from : 0, x1 ? x1->order_to : 0, res.count);
#endif
      rc = PUSHARG(s, &res);
      break;

    case DPS_STACK_OR:
      x1 = POPARG(s);
      x2 = POPARG(s);
      if (x2 == NULL || x1 == NULL) {
        if (x1 != NULL) { res = *x1; x1 = NULL; }
        if (x2 != NULL) { res = *x2; x2 = NULL; }
      } else {
#ifdef DEBUG_BOOL
        /* printBoolRes(query, x1);*/
        DpsLog(query, DPS_LOG_EXTRA, "^^^");
        /* printBoolRes(query, x2);*/
#endif
        res.order_from = (x1->order_from <= x2->order_from) ? x1->order_from : x2->order_from;
        res.order_to = (x1->order_to >= x2->order_to) ? x1->order_to : x2->order_to;
        if (DPS_OK != proceedOR(query, &res, x1, x2)) return DPS_ERROR;
        DpsStackItemFree(x1);
        DpsStackItemFree(x2);
        res.count = res.pcur - res.pbegin;
        {
          register size_t tt;
          int x1origin=0, x2origin=0;

          /* The values found by these two scans are overwritten just below;
             the scans only matter for the DEBUG_BOOL trace. */
          for (tt = x1->order_from; tt <= x1->order_to; tt++) {
#ifdef DEBUG_BOOL
            DpsLog(query, DPS_LOG_EXTRA, "\t\t\t\tx1order_origin[%d].%x ", tt, Res->items[tt].origin /*order_origin*/);
#endif
            if (Res->items[tt].origin /*order_origin*/ & DPS_WORD_ORIGIN_STOP) {
              x1origin = Res->items[tt].origin /*order_origin*/;
              break;
            }
          }
          for (tt = x2->order_from; tt <= x2->order_to; tt++) {
#ifdef DEBUG_BOOL
            DpsLog(query, DPS_LOG_EXTRA, "\t\t\t\tx2order_origin[%d].%x ", tt, Res->items[tt].origin /*order_origin*/);
#endif
            if (Res->items[tt].origin /*order_origin*/ & DPS_WORD_ORIGIN_STOP) {
              x2origin = Res->items[tt].origin /*order_origin*/;
              break;
            }
          }
          x1origin = x1->origin;
          x2origin = x2->origin;
#ifdef DEBUG_BOOL
          DpsLog(query, DPS_LOG_EXTRA, "\t\t\t\tx1origin.%x x2origin.%x", x1origin, x2origin);
#endif
          if ((x1origin & (DPS_WORD_ORIGIN_STOP | DPS_WORD_ORIGIN_QUERY)) == (DPS_WORD_ORIGIN_STOP|DPS_WORD_ORIGIN_QUERY)) {
            res.origin = x1origin;
          } else if ((x2origin & (DPS_WORD_ORIGIN_STOP | DPS_WORD_ORIGIN_QUERY)) == (DPS_WORD_ORIGIN_STOP|DPS_WORD_ORIGIN_QUERY)) {
            res.origin = x2origin;
          } else if (((x1origin & DPS_WORD_ORIGIN_STOP) && (x2origin & DPS_WORD_ORIGIN_STOP))
                     || ( (res.count == 0) && ((x1origin & DPS_WORD_ORIGIN_STOP) || (x2origin & DPS_WORD_ORIGIN_STOP))))
            res.origin = DPS_WORD_ORIGIN_STOP;
          /* NOTE(review): both operands of && test x1origin; the second one was
             possibly meant to be x2origin -- confirm before changing. */
          if ((x1origin & DPS_WORD_ORIGIN_ACRONYM) && (x1origin & DPS_WORD_ORIGIN_ACRONYM)) res.origin |= DPS_WORD_ORIGIN_ACRONYM;
        }
      }
#ifdef DEBUG_BOOL
      DpsLog(query, DPS_LOG_EXTRA, "Perform {%d}.%x | {%d}.%x -> {%d}.%x", (x1) ? x1->count:-1, (x1)?x1->origin:-1, (x2)?x2->count : -1, (x2) ? x2->origin : -1, res.count, res.origin);
      /* printBoolRes(query, &res);*/
      DpsLog(query, DPS_LOG_EXTRA, "===");
#endif
      rc = PUSHARG(s, &res);
      break;

    case DPS_STACK_NEAR:
      x1 = POPARG(s);
      x2 = POPARG(s);
      if (x2 == NULL || x1 == NULL) {
        if (x1 != NULL) { res = *x1; x1 = NULL; }
        if (x2 != NULL) { res = *x2; x2 = NULL; }
      } else {
        res.order_from = (x1->order_from <= x2->order_from) ? x1->order_from : x2->order_from;
        res.order_to = (x1->order_to >= x2->order_to) ? x1->order_to : x2->order_to;
        if ((x1->origin & DPS_WORD_ORIGIN_STOP) && (x2->origin & DPS_WORD_ORIGIN_STOP) ) {
          if (DPS_OK != proceedOR(query, &res, x1, x2)) return DPS_ERROR;
          res.origin = DPS_WORD_ORIGIN_STOP;
        } else if (x2->origin & DPS_WORD_ORIGIN_STOP) {
          if (DPS_OK != proceedSTOP(query, &res, x1, x2)) return DPS_ERROR;
        } else if (x1->origin & DPS_WORD_ORIGIN_STOP ) {
          if (DPS_OK != proceedSTOP(query, &res, x2, x1)) return DPS_ERROR;
        } else if (!((x1->cmd & DPS_STACK_WORD_NOT) && (x2->cmd & DPS_STACK_WORD_NOT))) {
#ifdef DEBUG_BOOL
          /* printBoolRes(query, x1);*/
          DpsLog(query, DPS_LOG_EXTRA, "^^^");
          DpsLog(query, DPS_LOG_DEBUG, "x1.NOT: %d x2.NOT: %d", x1->cmd & DPS_STACK_WORD_NOT, x2->cmd & DPS_STACK_WORD_NOT);
          /* printBoolRes(query, x2);*/
#endif
          res.pbegin = res.pcur = (DPS_URL_CRD_DB*)DpsMalloc((x1->count + x2->count + 1) * sizeof(DPS_URL_CRD_DB));
          if (res.pbegin == NULL) return DPS_ERROR;
          x1->pcur = x1->pbegin;
          x1->plast = x1->pbegin + x1->count;
          x2->pcur = x2->pbegin;
          x2->plast = x2->pbegin + x2->count;
          if (x1->cmd & DPS_STACK_WORD_NOT) {
            register DPS_STACK_ITEM *t = x1;
            x1 = x2;
            x2 = t;
          }
          while (x1->pcur < x1->plast && x2->pcur < x2->plast) {
            while ((x2->pcur < x2->plast) && (x2->pcur->url_id < x1->pcur->url_id)) x2->pcur++;
            if (x2->pcur >= x2->plast) break;
            if (x2->pcur->url_id == x1->pcur->url_id) {
              dps_uint4 pos1 = DPS_WRDPOS(x1->pcur->coord);
              dps_uint4 pos2 = DPS_WRDPOS(x2->pcur->coord);
              register urlid_t curlid = x1->pcur->url_id;

              if (pos1 > pos2) {
                found = ((pos2 + 16) >= pos1);
              } else {
                found = ((pos1 + 16) >= pos2);
              }
              x1->pchecked = x1->pcur;
              x2->pchecked = x2->pcur;
              while ((!found) && (x1->pchecked < x1->plast) && (x2->pchecked < x2->plast)
                     /*&& (x1->pchecked->url_id == x2->pchecked->url_id)*/ ) {
                if (x1->pchecked->coord <= x2->pchecked->coord) {
                  x1->pchecked++;
                  /* never look at an entry past the end of the list */
                  if ((x1->pchecked >= x1->plast) || (x1->pchecked->url_id != curlid)) break;
                  pos1 = DPS_WRDPOS(x1->pchecked->coord);
                } else {
                  x2->pchecked++;
                  if ((x2->pchecked >= x2->plast) || (x2->pchecked->url_id != curlid)) break;
                  pos2 = DPS_WRDPOS(x2->pchecked->coord);
                }
                if (pos1 > pos2) {
                  found = ((pos2 + 16) >= pos1);
                } else {
                  found = ((pos1 + 16) >= pos2);
                }
              }
              if (x2->cmd & DPS_STACK_WORD_NOT || x1->cmd & DPS_STACK_WORD_NOT) found = !found;
              if (found) {
                /* Merge both operands' entries for this document in coord order. */
                while ((x1->pcur < x1->plast) && (x2->pcur < x2->plast) /*&& (x1->pcur->url_id == x2->pcur->url_id)*/) {
                  if (x1->pcur->coord <= x2->pcur->coord) {
                    *res.pcur = *x1->pcur;
                    res.pcur++;
                    x1->pcur++;
                    if ((x1->pcur >= x1->plast) || (x1->pcur->url_id != curlid)) break;
                  } else {
                    *res.pcur = *x2->pcur;
                    res.pcur++;
                    x2->pcur++;
                    if ((x2->pcur >= x2->plast) || (x2->pcur->url_id != curlid)) break;
                  }
                }
                while ((x1->pcur < x1->plast) && (x1->pcur->url_id == curlid)) {
                  *res.pcur = *x1->pcur;
                  res.pcur++;
                  x1->pcur++;
                }
                while ((x2->pcur < x2->plast) && (x2->pcur->url_id == curlid)) {
                  *res.pcur = *x2->pcur;
                  res.pcur++;
                  x2->pcur++;
                }
              } else {
                /* No match: skip the rest of this document in both operands. */
                x1->pcur = x1->pchecked;
                x2->pcur = x2->pchecked;
                while ((x1->pcur < x1->plast) && (x1->pcur->url_id == curlid)) x1->pcur++;
                while ((x2->pcur < x2->plast) && (x2->pcur->url_id == curlid)) x2->pcur++;
              }
            } else {
              register DPS_STACK_ITEM *t = x1;
              x1 = x2;
              x2 = t;
            }
          }
        }
      }
      DpsStackItemFree(x1);
      DpsStackItemFree(x2);
      res.count = res.pcur - res.pbegin;
#ifdef DEBUG_BOOL
      DpsLog(query, DPS_LOG_EXTRA,"Perform {%d}.%x NEAR {%d}.%x - > %d.%d", (x1)?x1->count:-1, (x1)?x1->origin:-1, (x2) ? x2->count : -1, (x2) ? x2->origin: - 1, res.count, res.origin);
      /* printBoolRes(query, &res);*/
      DpsLog(query, DPS_LOG_EXTRA, "===");
#endif
      rc = PUSHARG(s, &res);
      break;

    case DPS_STACK_ANYWORD:
      x1 = POPARG(s);
      x2 = POPARG(s);
      flag1 = 0;
      if (x2 == NULL || x1 == NULL) {
        if (x1 != NULL) { res = *x1; x1 = NULL; }
        if (x2 != NULL) { res = *x2; x2 = NULL; }
      } else {
        res.order_from = (x1->order_from <= x2->order_from) ? x1->order_from : x2->order_from;
        res.order_to = (x1->order_to >= x2->order_to) ? x1->order_to : x2->order_to;
        if ((x1->origin & DPS_WORD_ORIGIN_STOP) && (x2->origin & DPS_WORD_ORIGIN_STOP) ) {
          if (DPS_OK != proceedOR(query, &res, x1, x2)) return DPS_ERROR;
          res.origin = DPS_WORD_ORIGIN_STOP;
        } else if (x2->origin & DPS_WORD_ORIGIN_STOP) {
          if (DPS_OK != proceedSTOP(query, &res, x1, x2)) return DPS_ERROR;
        } else if (x1->origin & DPS_WORD_ORIGIN_STOP ) {
          if (DPS_OK != proceedSTOP(query, &res, x2, x1)) return DPS_ERROR;
        } else if (!((x1->cmd & DPS_STACK_WORD_NOT) && (x2->cmd & DPS_STACK_WORD_NOT))) {
          res.pbegin = res.pcur = (DPS_URL_CRD_DB*)DpsMalloc((x1->count + x2->count + 1) * sizeof(DPS_URL_CRD_DB));
          if (res.pbegin == NULL) return DPS_ERROR;
          x1->pcur = x1->pbegin;
          x1->plast = x1->pbegin + x1->count;
          x2->pcur = x2->pbegin;
          x2->plast = x2->pbegin + x2->count;
          if (x1->cmd & DPS_STACK_WORD_NOT) {
            register DPS_STACK_ITEM *t = x1;
            x1 = x2;
            x2 = t;
            flag1 = !flag1;
          }
          while (x1->pcur < x1->plast && x2->pcur < x2->plast) {
            while ((x2->pcur < x2->plast) && (x2->pcur->url_id < x1->pcur->url_id)) x2->pcur++;
            if (x2->pcur >= x2->plast) break;
            if (x2->pcur->url_id == x1->pcur->url_id) {
              dps_int4 pos1 = (dps_int4)DPS_WRDPOS(x1->pcur->coord);
              dps_int4 pos2 = (dps_int4)DPS_WRDPOS(x2->pcur->coord);
              register urlid_t curlid = x1->pcur->url_id;

              found = ((flag1) ? ((pos1 + 2) == pos2) : ((pos2 + 2) == pos1));
              x1->pchecked = x1->pcur;
              x2->pchecked = x2->pcur;
              while ((!found) && (x1->pchecked < x1->plast) && (x2->pchecked < x2->plast)
                     && (x1->pchecked->url_id == x2->pchecked->url_id)) {
                if (x1->pchecked->coord <= x2->pchecked->coord) {
                  x1->pchecked++;
                  /* stop at the end of the list or of this document, as NEAR does,
                     so a position of another document is never compared */
                  if ((x1->pchecked >= x1->plast) || (x1->pchecked->url_id != curlid)) break;
                  pos1 = (dps_int4)DPS_WRDPOS(x1->pchecked->coord);
                } else {
                  x2->pchecked++;
                  if ((x2->pchecked >= x2->plast) || (x2->pchecked->url_id != curlid)) break;
                  /* pos2 follows the x2 cursor (it used to be read through x1) */
                  pos2 = (dps_int4)DPS_WRDPOS(x2->pchecked->coord);
                }
                found = ((flag1) ? ((pos1 + 2) == pos2) : ((pos2 + 2) == pos1));
              }
              if (x2->cmd & DPS_STACK_WORD_NOT || x1->cmd & DPS_STACK_WORD_NOT) found = !found;
              if (found) {
                while ((x1->pcur < x1->plast) && (x2->pcur < x2->plast) && (x1->pcur->url_id == x2->pcur->url_id)) {
                  if (x1->pcur->coord <= x2->pcur->coord) {
                    *res.pcur = *x1->pcur;
                    res.pcur++;
                    x1->pcur++;
                  } else {
                    *res.pcur = *x2->pcur;
                    res.pcur++;
                    x2->pcur++;
                  }
                }
                while ((x1->pcur < x1->plast) && (x1->pcur->url_id == curlid)) {
                  *res.pcur = *x1->pcur;
                  res.pcur++;
                  x1->pcur++;
                }
                while ((x2->pcur < x2->plast) && (x2->pcur->url_id == curlid)) {
                  *res.pcur = *x2->pcur;
                  res.pcur++;
                  x2->pcur++;
                }
              } else {
                x1->pcur = x1->pchecked;
                x2->pcur = x2->pchecked;
                while ((x1->pcur < x1->plast) && (x1->pcur->url_id == curlid)) x1->pcur++;
                while ((x2->pcur < x2->plast) && (x2->pcur->url_id == curlid)) x2->pcur++;
              }
            } else {
              register DPS_STACK_ITEM *t = x1;
              x1 = x2;
              x2 = t;
              flag1 = !flag1;
            }
          }
        }
      }
      DpsStackItemFree(x1);
      DpsStackItemFree(x2);
      res.count = res.pcur - res.pbegin;
#ifdef DEBUG_BOOL
      DpsLog(query, DPS_LOG_EXTRA, "Perform {%d} ANYWORD {%d} - > %d", (x1) ? x1->count : -1, (x2) ? x2->count : -1, res.count);
#endif
      rc = PUSHARG(s, &res);
      break;

    case DPS_STACK_AND:
      x1 = POPARG(s);
      x2 = POPARG(s);
      if (x2 == NULL || x1 == NULL) {
        if (x1 != NULL) { res = *x1; x1 = NULL; }
        if (x2 != NULL) { res = *x2; x2 = NULL; }
      } else {
        res.order_from = (x1->order_from <= x2->order_from) ? x1->order_from : x2->order_from;
        res.order_to = (x1->order_to >= x2->order_to) ? x1->order_to : x2->order_to;
        if ((x1->origin & DPS_WORD_ORIGIN_STOP) && (x2->origin & DPS_WORD_ORIGIN_STOP) ) {
          if (DPS_OK != proceedOR(query, &res, x1, x2)) return DPS_ERROR;
          res.origin = DPS_WORD_ORIGIN_STOP;
        } else if (x2->origin & DPS_WORD_ORIGIN_STOP) {
          if (DPS_OK != proceedSTOP(query, &res, x1, x2)) return DPS_ERROR;
        } else if (x1->origin & DPS_WORD_ORIGIN_STOP ) {
          if (DPS_OK != proceedSTOP(query, &res, x2, x1)) return DPS_ERROR;
        } else if (!((x1->cmd & DPS_STACK_WORD_NOT) && (x2->cmd & DPS_STACK_WORD_NOT))) {
          res.pbegin = res.pcur = (DPS_URL_CRD_DB*)DpsMalloc((x1->count + x2->count + 1) * sizeof(DPS_URL_CRD_DB));
          if (res.pbegin == NULL) return DPS_ERROR;
          x1->pcur = x1->pbegin;
          x1->plast = x1->pbegin + x1->count;
          x2->pcur = x2->pbegin;
          x2->plast = x2->pbegin + x2->count;
          /* If one operand is negated, make sure it is x2. */
          if (x1->cmd & DPS_STACK_WORD_NOT) {
            register DPS_STACK_ITEM *t = x1;
            x1 = x2;
            x2 = t;
          }
          if (x2->cmd & DPS_STACK_WORD_NOT) {
            /* x1 AND NOT x2: keep x1 entries whose url_id does not occur in x2. */
            while (x1->pcur < x1->plast && x2->pcur < x2->plast) {
              while ((x1->pcur < x1->plast) && (x1->pcur->url_id < x2->pcur->url_id)) {
                *res.pcur = *x1->pcur;
                res.pcur++;
                x1->pcur++;
              }
              /* an exhausted cursor must not be dereferenced below */
              if (x1->pcur >= x1->plast) break;
              while ((x2->pcur < x2->plast) && (x2->pcur->url_id < x1->pcur->url_id)) x2->pcur++;
              if (x2->pcur >= x2->plast) break;
              if (x2->pcur->url_id == x1->pcur->url_id) {
                register urlid_t curlid = x1->pcur->url_id;
                while ((x1->pcur < x1->plast) && (x1->pcur->url_id == curlid)) x1->pcur++;
                while ((x2->pcur < x2->plast) && (x2->pcur->url_id == curlid)) x2->pcur++;
              }
            }
            while (x1->pcur < x1->plast) {
              *res.pcur = *x1->pcur;
              res.pcur++;
              x1->pcur++;
            }
          } else {
#if 0
            {
              DPS_URL_CRD_DB *w;
              for (w = x1->pcur; w < x1->plast; w++) {
                fprintf(stderr, "x1.url_id:%d .coord:%d\n", w->url_id, w->coord);
              }
              for (w = x2->pcur; w < x2->plast; w++) {
                fprintf(stderr, "x2.url_id:%d .coord:%d\n", w->url_id, w->coord);
              }
            }
#endif
            while (x1->pcur < x1->plast && x2->pcur < x2->plast) {
              while ((x2->pcur < x2->plast) && (x2->pcur->url_id < x1->pcur->url_id)) x2->pcur++;
              if (x2->pcur >= x2->plast) break;
              if (x2->pcur->url_id == x1->pcur->url_id) {
                register urlid_t curlid = x1->pcur->url_id;
                while ((x1->pcur < x1->plast) && (x2->pcur < x2->plast) && (x1->pcur->url_id == x2->pcur->url_id)) {
                  if (x1->pcur->coord <= x2->pcur->coord) {
                    *res.pcur = *x1->pcur;
                    res.pcur++;
                    x1->pcur++;
                  } else {
                    *res.pcur = *x2->pcur;
                    res.pcur++;
                    x2->pcur++;
                  }
                }
                while ((x1->pcur < x1->plast) && (x1->pcur->url_id == curlid)) {
                  *res.pcur = *x1->pcur;
                  res.pcur++;
                  x1->pcur++;
                }
                while ((x2->pcur < x2->plast) && (x2->pcur->url_id == curlid)) {
                  *res.pcur = *x2->pcur;
                  res.pcur++;
                  x2->pcur++;
                }
              } else {
                register DPS_STACK_ITEM *t = x1;
                x1 = x2;
                x2 = t;
              }
            }
          }
        }
      }
      DpsStackItemFree(x1);
      DpsStackItemFree(x2);
      res.count = res.pcur - res.pbegin;
#ifdef DEBUG_BOOL
#if 0
      {
        DPS_URL_CRD_DB *w = res.pbegin;
        size_t q;
        for (q = 0; q < res.count; q++) {
          fprintf(stderr, "res.url_id:%d .coord:%d\n", w[q].url_id, w[q].coord);
        }
      }
#endif
      DpsLog(query, DPS_LOG_EXTRA, "Perform {%d}.%x & {%d}.%x - > {%d}.%x", (x1) ? x1->count : -1, (x1) ? x1->origin : -1, (x2) ? x2->count : -1 , (x2) ? x2->origin : -1, res.count, res.origin);
      /* printBoolRes(query, &res);*/
      DpsLog(query, DPS_LOG_EXTRA, "===");
#endif
      rc = PUSHARG(s, &res);
      break;

    case DPS_STACK_NOT:
      x1 = POPARG(s);
      /* res = x1 ? 0 : 1; */
      if (x1 != NULL) {
        x1->cmd ^= DPS_STACK_WORD_NOT;
        rc = PUSHARG(s, x1);
      }
#ifdef DEBUG_BOOL
      DpsLog(query, DPS_LOG_EXTRA, "Perform ~ {%d}", (x1) ? x1->count : -1);
#endif
      break;

    default:
      /* any other command leaves the stack untouched */
      break;
  }
  return rc;
}
/*
 * Debug helper: log every (url_id, coord) pair held by a stack item.
 *
 * One DPS_LOG_EXTRA line is written per entry of res->pbegin, for the first
 * res->count entries, in storage order.
 */
static void printBoolRes(DPS_AGENT *query, DPS_STACK_ITEM *res) {
  size_t idx = 0;

  while (idx < res->count) {
    DpsLog(query, DPS_LOG_EXTRA, "url_id: %03x coord: %08x",
           res->pbegin[idx].url_id, res->pbegin[idx].coord);
    idx++;
  }
}
int DpsSEAMake(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_DSTR *excerpt, const char *content_lang, size_t *indexed_size, size_t *indexed_limit, size_t max_word_len, size_t min_word_len, int crossec, int seasec #ifdef HAVE_ASPELL , int have_speller, AspellSpeller *speller #endif ) { DPS_SENTENCELIST List; DPS_MAPSTAT MapStat; DPS_TEXTITEM Item; DPS_VAR *Sec; dpsunicode_t *sentence, *lt, savec; double *links, *lang_cs, w; double delta, pdiv, cur_div; size_t l, sent_len, order; size_t min_len = 10000000, min_pos = 0; int it; register size_t i, j; #ifdef DEBUG char lcstr[4096]; #endif TRACE_IN(Indexer, "DpsSEAMake"); if((Sec = DpsVarListFind(&Doc->Sections, "sea"))) { /* set SEA section to NULL */ DPS_FREE(Sec->val); DPS_FREE(Sec->txt_val); Sec->curlen = 0; } bzero(&List, sizeof(List)); order = 0; sentence = DpsUniStrTok_SEA((dpsunicode_t*)excerpt->data, <); while(sentence) { if (lt != NULL) { savec = *lt; *lt = 0; } #ifdef DEBUG DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)sentence, sizeof(dpsunicode_t) * (DpsUniLen(sentence) + 1)); fprintf(stderr, "Sentence.%d: %s\n", List.nitems, lcstr); #endif if ((sent_len = DpsUniLen(sentence)) >= Indexer->Flags.SEASentenceMinLength) { j = 1; for (i = 0; i < List.nitems; i++) { if (DpsUniStrCmp(sentence, List.Sent[i].sentence) == 0) { j = 0; break; } } if (j) { if ( List.nitems < Indexer->Flags.SEASentences ) { if (List.nitems == List.mitems) { List.mitems += 16; List.Sent = (DPS_SENTENCE*)DpsRealloc(List.Sent, List.mitems * sizeof(DPS_SENTENCE)); if (List.Sent == NULL) { TRACE_OUT(Indexer); return DPS_ERROR;} } List.Sent[List.nitems].sentence = DpsUniDup(sentence); List.Sent[List.nitems].len = sent_len; List.Sent[List.nitems].order = order++; sentence = DpsUniDup(sentence); DpsUniStrToLower(sentence); bzero(&List.Sent[List.nitems].LangMap, sizeof(DPS_LANGMAP)); DpsBuildLangMap(&List.Sent[List.nitems].LangMap, (char*)sentence, sent_len * sizeof(dpsunicode_t), 0, 0); if (sent_len < min_len) { min_len = sent_len; min_pos 
= List.nitems; } List.nitems++; DPS_FREE(sentence); } else if (sent_len > min_len) { DPS_FREE(List.Sent[min_pos].sentence); List.Sent[min_pos].sentence = DpsUniDup(sentence); List.Sent[min_pos].len = sent_len; List.Sent[min_pos].order = order++; sentence = DpsUniDup(sentence); DpsUniStrToLower(sentence); bzero(&List.Sent[min_pos].LangMap, sizeof(DPS_LANGMAP)); DpsBuildLangMap(&List.Sent[min_pos].LangMap, (char*)sentence, sent_len * sizeof(dpsunicode_t), 0, 0); DPS_FREE(sentence); min_len = List.Sent[0].len; min_pos = 0; for(i = 1; i < List.nitems; i++) if (List.Sent[i].len < min_len) { min_len = List.Sent[i].len; min_pos = i; } } } } #ifdef DEBUG fprintf(stderr, "Sent. len.:%d, Min.allowed: %d\n", sent_len, Indexer->Flags.SEASentenceMinLength); #endif if (lt != NULL) *lt = savec; sentence = DpsUniStrTok_SEA(NULL, <); } DpsLog(Indexer, DPS_LOG_DEBUG, "SEA sentences: %d", List.nitems); if (List.nitems < 4) { for (i = 0; i < List.nitems; i++) DPS_FREE(List.Sent[i].sentence); DPS_FREE(List.Sent); TRACE_OUT(Indexer); return DPS_OK; } links = (double*)DpsMalloc(sizeof(double) * List.nitems * List.nitems); lang_cs = (double*)DpsMalloc(sizeof(double) * List.nitems); /* k ot links[i * List.nitems + j] */ if (links != NULL && lang_cs != NULL) { for (i = 0; i < List.nitems; i++) { DpsPrepareLangMap(&List.Sent[i].LangMap); } for (i = 0; i < List.nitems; i++) { List.Sent[i].Oi = List.Sent[i].di = 0.5; if (Doc->lang_cs_map == NULL) { links[i * List.nitems + i] = 0.0; } else { MapStat.map = &List.Sent[i].LangMap; DpsCheckLangMap6(Doc->lang_cs_map, &List.Sent[i].LangMap, &MapStat, DPS_LM_TOPCNT * DPS_LM_TOPCNT, 2 * DPS_LM_TOPCNT); links[i * List.nitems + i] = (double)MapStat.hits / (2.0 * DPS_LM_TOPCNT) / (List.nitems + 1); } #ifdef DEBUG DpsLog(Indexer, DPS_LOG_INFO, "Link %u->%u: %f [hits:%d miss:%d]", i, i, links[i * List.nitems + i], MapStat.hits, MapStat.miss); #endif for (j = 0; j < List.nitems; j++) { if (j == i) continue; MapStat.map = &List.Sent[j].LangMap; 
DpsCheckLangMap6(&List.Sent[j].LangMap, &List.Sent[i].LangMap, &MapStat, DPS_LM_TOPCNT * DPS_LM_TOPCNT, 2 * DPS_LM_TOPCNT); links[i * List.nitems + j] = (double)MapStat.hits / (2.0 * DPS_LM_TOPCNT) / (List.nitems + 1); #ifdef DEBUG DpsLog(Indexer, DPS_LOG_INFO, "Link %u->%u: %f [hits:%d miss:%d]", i, j, links[i * List.nitems + j], MapStat.hits, MapStat.miss); #endif } } for (l = 0; l < List.nitems; l++) { w = 0.0; for (i = 0; i < List.nitems; i++) { w += links[l * List.nitems + i] * List.Sent[i].Oi; } w = f(w); if (w < LOW_BORDER_EPS2) w = LOW_BORDER_EPS2; else if (w > HI_BORDER_EPS2) w = HI_BORDER_EPS2; List.Sent[l].di = w; } DpsSort(List.Sent, List.nitems, sizeof(DPS_SENTENCE), (qsort_cmp)SentCmp); #ifdef DEBUG DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[0].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[0].sentence) + 1)); fprintf(stderr, "Sent.0: %f %f -- %s\n", List.Sent[0].di, List.Sent[0].Oi, lcstr); DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[1].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[1].sentence) + 1)); fprintf(stderr, "Sent.1: %f %f -- %s\n", List.Sent[1].di, List.Sent[1].Oi, lcstr); DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[2].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[2].sentence) + 1)); fprintf(stderr, "Sent.2: %f %f -- %s\n", List.Sent[2].di, List.Sent[2].Oi, lcstr); DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[3].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[3].sentence) + 1)); fprintf(stderr, "Sent.3: %f %f -- %s\n", List.Sent[3].di, List.Sent[3].Oi, lcstr); DpsConv(&Indexer->uni_lc, lcstr, sizeof(lcstr), (char*)List.Sent[4].sentence, sizeof(dpsunicode_t) * (DpsUniLen(List.Sent[4].sentence) + 1)); fprintf(stderr, "Sent.4: %f %f -- %s\n", List.Sent[4].di, List.Sent[4].Oi, lcstr); #endif DpsSort(List.Sent, TOP_SENTENCES, sizeof(DPS_SENTENCE), (qsort_cmp)SentOrderCmp); bzero(&Item, sizeof(Item)); Item.section = seasec; Item.href = 
NULL; Item.section_name = "sea"; for (i = 0; i < TOP_SENTENCES; i++) { dpsunicode_t *UStr = DpsUniDup(List.Sent[i].sentence); DpsPrepareItem(Indexer, Doc, &Item, List.Sent[i].sentence, UStr, content_lang, indexed_size, indexed_limit, max_word_len, min_word_len, crossec #ifdef HAVE_ASPELL , have_speller, speller, NULL #endif ); DPS_FREE(UStr); } } DPS_FREE(lang_cs); DPS_FREE(links); for (i = 0; i < List.nitems; i++) DPS_FREE(List.Sent[i].sentence); DPS_FREE(List.Sent); TRACE_OUT(Indexer); return DPS_OK; }
/*
 * Compact the storage files of a base in place and rebuild their index files.
 *
 *   P     - base descriptor; P->Ifd is the index file and P->Sfd the storage
 *           file of the base selected through P->rec_id.
 *   sbase - number of the single base file to process, or -1 to process all
 *           P->NFiles files.
 *
 * For every selected file the index is scanned to measure how much of the
 * storage file is referenced by live records.  When the unreferenced share
 * reaches the "OptimizeRatio" variable (default 15, in percent), or when the
 * storage file is non-empty but nothing is referenced, the live records are
 * moved towards the start of the storage file, the storage file is truncated
 * and the index file is rewritten from scratch.
 *
 * Returns DPS_OK on success; DPS_ERROR (or the DpsBaseSeek() result) on the
 * first open/seek/alloc/write failure.
 */
extern __C_LINK int __DPSCALL DpsBaseOptimize(DPS_BASE_PARAM *P, int sbase) {
  struct stat sb;
  urlid_t base, base_from, base_to;
  long unsigned ActualSize, OriginalSize, i, nitems;
  off_t pos, posold, NewItemPos, SSize;
  dps_uint8 diff, gain;
  double dr = 0.0, cr = 0.0;
  ssize_t nread;
  size_t rsize;
  ssize_t wr;
  int OptimizeRatio, res, error_cnt;
  char buffer[BUFSIZ];
  DPS_BASEITEM *hTable;
  DPS_SORTBASEITEM *si = NULL;

  OptimizeRatio = DpsVarListFindInt(&P->A->Vars, "OptimizeRatio", 15);
  P->mode = DPS_WRITE_LOCK;

  /* Select the range of base files: all of them, or just sbase. */
  if (sbase == -1) {
    base_from = 0;
    base_to = (urlid_t)P->NFiles;
  } else {
    base_from = sbase;
    base_to = sbase + 1;
  }

  for (base = base_from; base < base_to; base++) {
    error_cnt = 0;
    gain = (dps_uint8)0;

    /* Build a rec_id from the base number so that DpsBaseOpen() picks this base's files
       (presumably it derives the file number from rec_id; confirm in DpsBaseOpen). */
    P->rec_id = ((base & DPS_BASE_MASK) << DPS_BASE_BITS);
    if (DpsBaseOpen(P, DPS_WRITE_LOCK) != DPS_OK) {
      DpsLog(P->A, DPS_LOG_ERROR, "Can't open base %s/%s {%s:%d}", P->subdir, P->basename, __FILE__, __LINE__);
      DpsBaseClose(P);
      return DPS_ERROR;
    }
    if (lseek(P->Ifd, (off_t)0, SEEK_SET) == (off_t)-1) {
      DpsLog(P->A, DPS_LOG_ERROR, "Can't seek %s {%s:%d}", P->Ifilename, __FILE__, __LINE__);
      DpsBaseClose(P);
      return DPS_ERROR;
    }

    /* SSize = current size of the storage file; fall back to lseek() if fstat() fails. */
    if (fstat(P->Sfd, &sb) == 0) {
      SSize = sb.st_size;
    } else {
      if ((SSize = (off_t)lseek(P->Sfd, (off_t)0, SEEK_END)) == (off_t)-1) {
        DpsLog(P->A, DPS_LOG_ERROR, "Can't seek %s {%s:%d}", P->Sfilename, __FILE__, __LINE__);
        DpsBaseClose(P);
        return DPS_ERROR;
      }
    }

    /* Pass 1: count index records and total the stored / original sizes of the live ones
       (non-zero rec_id, offset inside the storage file, non-zero size). */
    nitems = 0;
    ActualSize = 0;
    OriginalSize = 0;
    while(read(P->Ifd, &P->Item, sizeof(DPS_BASEITEM)) == sizeof(DPS_BASEITEM)) {
      nitems++;
      if ((P->Item.rec_id != 0) && ((dps_uint8)P->Item.offset < (dps_uint8)SSize) && (P->Item.size > 0)) {
        ActualSize += (long unsigned)P->Item.size;
        OriginalSize += (long unsigned)(P->Item.orig_size ? P->Item.orig_size : P->Item.size);
      }
    }
    /* Cut off a trailing partial record, if any; a failure here is only logged. */
    if (ftruncate(P->Ifd, (off_t)(nitems * sizeof(DPS_BASEITEM))) != 0) {
      dps_strerror(P->A, DPS_LOG_EXTRA, "ftruncate error (pos:%ld) [%s:%d]", (off_t)(nitems * sizeof(DPS_BASEITEM)), __FILE__, __LINE__);
    }

    /* dr: percentage of the storage file not covered by live data;
       cr: stored size as a percentage of the original size. */
    dr = (nitems) ? fabs(100.0 * ((long unsigned)SSize - ActualSize) / ((double)SSize + 1.0)) : 0.0;
    cr = (nitems) ? fabs(100.0 * ActualSize / (OriginalSize + 1)) : 0.0;

    DpsLog(P->A, DPS_LOG_EXTRA, "Optimize: %s/%s base 0x%X, %ld recs defrag: %.2f%% Ratio: %.2f%% Data: %ld File: %ld", P->subdir, P->basename, P->FileNo, nitems, dr, cr, ActualSize, (long)SSize);

    if ((dr >= (double)OptimizeRatio) || (ActualSize == 0 && SSize != 0)) {

      si = (DPS_SORTBASEITEM*)DpsMalloc((nitems + 1) * sizeof(DPS_SORTBASEITEM));
      if (si == NULL) {
        DpsLog(P->A, DPS_LOG_ERROR, "Can't alloc si (%d bytes) at {%s:%d}", (nitems + 1) * sizeof(DPS_SORTBASEITEM), __FILE__, __LINE__);
        DpsBaseClose(P);
        return DPS_ERROR;
      }

      if (lseek(P->Ifd, (off_t)0, SEEK_SET) == (off_t)-1) {
        DpsLog(P->A, DPS_LOG_ERROR, "Can't seek %s {%s:%d}", P->Ifilename, __FILE__, __LINE__);
        DpsBaseClose(P);
        DPS_FREE(si);
        return DPS_ERROR;
      }

      /* Pass 2: load the live records into si[].  i advances only for a record that is kept,
         so a rejected record is overwritten by the next read.
         NOTE(review): the last test requires size < ActualSize, so a file whose only live
         record accounts for all of ActualSize keeps nothing -- confirm this is intended. */
      for (i = 0; (i < nitems) && (read(P->Ifd, &si[i].Item, sizeof(DPS_BASEITEM)) == sizeof(DPS_BASEITEM)); ) {
        if(si[i].Item.rec_id != 0 && ((dps_uint8)si[i].Item.offset < (dps_uint8)SSize) && (si[i].Item.size > 0) && (si[i].Item.size < ActualSize) ) {
          i++;
        }
      }
      if (i < nitems) nitems = i;

      /* cmpsi is defined elsewhere; the code below relies on si[] being ordered by
         ascending storage offset -- presumably that is what cmpsi does (confirm). */
      if (nitems > 1) DpsSort((void*)si, (size_t)nitems, sizeof(DPS_SORTBASEITEM), cmpsi);

      gain = (dps_uint8)0;
      pos = (off_t)0;
      posold = (off_t)0;
      if (nitems > 0) {
        if ((long unsigned)si[0].Item.offset < (long unsigned)SSize) {
          posold = (off_t)si[0].Item.offset;
        } else {
          si[0].Item.offset = (off_t)0;
          si[0].Item.size = 0;
        }
      }
      /* A record may not run into the next one: clip its size to the distance between offsets. */
      if (nitems > 1) {
        if (si[0].Item.size > (rsize = (size_t)(si[1].Item.offset - si[0].Item.offset))) {
          DpsLog(P->A, DPS_LOG_ERROR, "si[0] size adjusted by offset: %ld -> %ld", (long)si[0].Item.size, (long)rsize);
          si[0].Item.size = rsize;
          error_cnt++;
        }
      }
      /* Move the first record to offset 0 in chunks of at most BUFSIZ bytes:
         seek to the old position, read, seek to the new position, write.
         The write() result is deliberately ignored here. */
      if ((diff = (dps_uint8)posold) > 0) {
        for( lseek(P->Sfd, posold, SEEK_SET), rsize = 0;
             (rsize < si[0].Item.size) && ((nread = read(P->Sfd, buffer, (rsize + BUFSIZ < si[0].Item.size) ? BUFSIZ : (si[0].Item.size - rsize) )) > 0);
             lseek(P->Sfd, posold, SEEK_SET) ) {
          lseek(P->Sfd, pos, SEEK_SET);
          (void)write(P->Sfd, buffer, (size_t)nread);
          rsize += (size_t)nread;
          posold += (off_t)nread;
          pos += (off_t)nread;
        }
        si[0].Item.offset = 0;
        /* A short read shrinks the record to what was actually moved. */
        if (rsize != si[0].Item.size) {
          DpsLog(P->A, DPS_LOG_ERROR, "si[0] size adjusted by size: %ld -> %ld", (long)si[0].Item.size, (long)rsize);
          si[0].Item.size = rsize;
          error_cnt++;
        }
        gain += diff;
      }

      /* Move every following record so that it starts right after its predecessor. */
      if (nitems > 0) for (i = 0; i < nitems - 1; i++) {
        if ((long unsigned)si[i + 1].Item.offset > (long unsigned)SSize) {
          /* Offset beyond the end of the storage file: turn the record into an empty one. */
          DpsLog(P->A, DPS_LOG_ERROR, "si[%ld] too long offset: %ld > %ld, removing", i , (long)si[i + 1].Item.offset, (long)SSize);
          si[i + 1].Item.size = 0;
          si[i + 1].Item.offset = si[i].Item.offset + si[i].Item.size;
          error_cnt++;
        } else {
          pos = (off_t)(si[i].Item.offset + si[i].Item.size);  /* destination: end of the previous record */
          posold = (off_t)si[i + 1].Item.offset;               /* source: current position of the record */
          if (i < nitems - 2) {
            if (si[i + 1].Item.size > (rsize = (size_t)(si[i + 2].Item.offset - si[i + 1].Item.offset))) {
              DpsLog(P->A, DPS_LOG_ERROR, "si[%ld] size adjusted by offset: %ld -> %ld", i + 1, (long)si[i + 1].Item.size, (long)rsize );
              si[i + 1].Item.size = rsize;
              error_cnt++;
            }
          }
          /* diff is unsigned, so this test is true whenever posold != pos. */
          if ((diff = (dps_uint8)posold - (dps_uint8)pos) > 0) {
            for( lseek(P->Sfd, posold, SEEK_SET), rsize = 0;
                 (rsize < si[i + 1].Item.size) && ((nread = read(P->Sfd, buffer, (rsize + BUFSIZ < si[i + 1].Item.size) ? BUFSIZ : (si[i + 1].Item.size - rsize) )) > 0);
                 lseek(P->Sfd, posold, SEEK_SET) ) {
              lseek(P->Sfd, pos, SEEK_SET);
              (void)write(P->Sfd, buffer, (size_t)nread);
              rsize += (size_t)nread;
              posold += (off_t)nread;
              pos += (off_t)nread;
            }
            if (rsize != si[i + 1].Item.size) {
              DpsLog(P->A, DPS_LOG_ERROR, "si[%ld] size adjusted by size: %ld -> %ld", i + 1, (long)si[i + 1].Item.size, (long)rsize);
              si[i + 1].Item.size = rsize;
              error_cnt++;
            }
            si[i + 1].Item.offset = si[i].Item.offset + si[i].Item.size;
            gain += diff;
          }
        }
      }

      /* Truncate the storage file right after the last record and account for the freed tail. */
      posold = SSize;
      pos = (nitems) ? (off_t)(si[nitems - 1].Item.offset + si[nitems - 1].Item.size) : (off_t)0;
      if (ftruncate(P->Sfd, (off_t)(pos)) != 0) {
        dps_strerror(P->A, DPS_LOG_ERROR, "ftruncate error (pos:%ld) [%s:%d]", pos, __FILE__, __LINE__);
      }
      SSize = pos;
      if (posold > pos) {
        gain += ((dps_uint8)posold - (dps_uint8)pos);
      }

      /* The index rebuild below runs unconditionally; the old guard is kept for reference. */
      /*if (gain != 0 || OptimizeRatio == 0 || error_cnt > 0)*/ {

        /* Remember the old index size, then empty the index file. */
        posold = lseek(P->Ifd, (off_t)0, SEEK_END);
        (void)ftruncate(P->Ifd, (off_t)0);
        lseek(P->Ifd, (off_t)0, SEEK_SET);

        /* Write DPS_HASH_PRIME blank records as the new index table
           (assumes DpsXmalloc() returns zero-filled memory -- confirm). */
        if ((hTable = (DPS_BASEITEM *)DpsXmalloc(sizeof(DPS_BASEITEM) * DPS_HASH_PRIME)) == NULL) {
          DpsLog(P->A, DPS_LOG_ERROR, "Memory alloc error hTable: %d bytes", sizeof(DPS_BASEITEM) * DPS_HASH_PRIME);
          DpsBaseClose(P);
          DPS_FREE(si);
          return DPS_ERROR;
        }
        if ( (wr = write(P->Ifd, hTable, sizeof(DPS_BASEITEM) * DPS_HASH_PRIME)) != sizeof(DPS_BASEITEM) * DPS_HASH_PRIME) {
          dps_strerror(P->A, DPS_LOG_ERROR, "[%s:%d] Can't set new index for file %s\nwritten %d bytes of %d", __FILE__, __LINE__, P->Ifilename, wr, sizeof(DPS_BASEITEM) * DPS_HASH_PRIME);
          DPS_FREE(hTable);
          DpsBaseClose(P);
          DPS_FREE(si);
          return DPS_ERROR;
        }
        DPS_FREE(hTable);

        /* Re-insert every surviving record into the fresh index. */
        for (i = 0; i < nitems; i++) {
          if (si[i].Item.rec_id == 0 || si[i].Item.size == 0) continue;
          if ((long)si[i].Item.offset > (long)SSize) {
            DpsLog(P->A, DPS_LOG_ERROR, "si[%ld] too long offset: %ld > %ld, removing", i , (long)si[i].Item.offset, (long)SSize);
            error_cnt++;
            continue;
          }
          P->rec_id = si[i].Item.rec_id;
          /* DpsBaseSeek() is expected to leave the record found for rec_id in P->Item and its
             index position in P->CurrentItemPos (inferred from the uses below -- confirm). */
          if ((res = DpsBaseSeek(P, DPS_WRITE_LOCK)) != DPS_OK) {
            DpsBaseClose(P);
            DPS_FREE(si);
            return res;
          }
          if (P->Item.rec_id != P->rec_id) {
            if (P->mishash && P->Item.rec_id != 0) {
              /* The position is taken by another record: link that record to a new slot
                 at the end of the index file and write the new record there. */
              if ((P->Item.next = (dps_uint8)(NewItemPos = lseek(P->Ifd, (off_t)0, SEEK_END))) == (dps_uint8)-1) {
                DpsBaseClose(P);
                DPS_FREE(si);
                return DPS_ERROR;
              }
              if (lseek(P->Ifd, (off_t)P->CurrentItemPos, SEEK_SET) == (off_t)-1) {
                DpsBaseClose(P);
                DPS_FREE(si);
                return DPS_ERROR;
              }
              if (write(P->Ifd, &P->Item, sizeof(DPS_BASEITEM)) != sizeof(DPS_BASEITEM)) {
                DpsBaseClose(P);
                DPS_FREE(si);
                return DPS_ERROR;
              }
              P->CurrentItemPos = (dps_uint8)NewItemPos;
            }
          }
          /* Store the compacted record with a cleared chain link. */
          P->Item = si[i].Item;
          P->Item.next = (off_t)0;
          if (lseek(P->Ifd, (off_t)P->CurrentItemPos, SEEK_SET) == (off_t)-1) {
            DpsLog(P->A, DPS_LOG_ERROR, "Can't seek %s {%s:%d}", P->Ifilename, __FILE__, __LINE__);
            DpsBaseClose(P);
            DPS_FREE(si);
            return DPS_ERROR;
          }
          if (write(P->Ifd, &P->Item, sizeof(DPS_BASEITEM)) != sizeof(DPS_BASEITEM)) {
            DpsLog(P->A, DPS_LOG_ERROR, "Can't write index for file %s {%s:%d}", P->Ifilename, __FILE__, __LINE__);
            DpsBaseClose(P);
            DPS_FREE(si);
            return DPS_ERROR;
          }
        }
        /* Add the change in index file size to the reported gain. */
        pos = lseek(P->Ifd, (off_t)0, SEEK_END);
        gain += ((dps_uint8)posold - (dps_uint8)pos);
        DpsLog(P->A, DPS_LOG_DEBUG, "Optimize: %s/%s base 0x%X cleaned, %ld bytes freed", P->subdir, P->basename, base, gain);
      }

      DPS_FREE(si);

    }
    /* If anything had to be corrected, step back so the loop increment visits the same base again.
       NOTE(review): this repeats for as long as a pass keeps reporting corrections. */
    if (error_cnt) base--;
    DpsBaseClose(P);
  }
  return DPS_OK;
}
/*
 * Perform a URL action against a remote searchd.
 *
 * Only DPS_URL_ACTION_DOCCOUNT is supported: the command is sent to searchd
 * and packets are read until a DOCCOUNT, an ERROR or an unknown packet
 * arrives.  The received count is added to A->doccount.
 *
 *   A   - agent; A->doccount and A->Conf->errstr are updated
 *   D   - unused
 *   cmd - DPS_URL_ACTION_* code
 *   db  - DPS_DB describing the searchd connection (db->searchd)
 *
 * Returns DPS_OK or DPS_ERROR.  An ERROR packet from searchd stores its text
 * in A->Conf->errstr but still yields DPS_OK, as before.
 * NOTE(review): DpsCloneListSearchd() returns DPS_ERROR in the same
 * situation -- confirm which behaviour callers expect.
 */
int __DPSCALL DpsSearchdURLAction(DPS_AGENT *A, DPS_DOCUMENT *D, int cmd, void *db) {
  DPS_DB *searchd = db;
  DPS_SEARCHD_PACKET_HEADER hdr;
  char *buf;
  ssize_t nrecv;
  int done = 0;
  char *msg = NULL;
  char *dinfo = NULL;
  int rc = DPS_OK;

  TRACE_IN(A, "DpsSearchdURLAction");

  if (cmd != DPS_URL_ACTION_DOCCOUNT) {
    DpsLog(A, DPS_LOG_ERROR, "searchd: unsupported URL action");
    TRACE_OUT(A);
    return DPS_ERROR;
  }

  /* The request payload is just the action code. */
  hdr.cmd = DPS_SEARCHD_CMD_URLACTION;
  hdr.len = sizeof(int);

  if ((buf = (char*)DpsMalloc(hdr.len + 1)) == NULL) {
    DpsLog(A, DPS_LOG_ERROR, "Out of memory");
    TRACE_OUT(A);
    return DPS_ERROR;
  }

  *((int*)buf) = cmd;
  /* A failed send shows up below as an incomplete header. */
  (void)DpsSearchdSendPacket(searchd->searchd, &hdr, buf);
  DPS_FREE(buf);

  while (!done) {
    nrecv = DpsRecvall(searchd->searchd, &hdr, sizeof(hdr), 360);
    if (nrecv != sizeof(hdr)) {
      DpsLog(A, DPS_LOG_ERROR, "Received incomplete header from searchd (%d bytes)", (int)nrecv);
      TRACE_OUT(A);
      return(DPS_ERROR);
    }
#ifdef DEBUG_SDP
    DpsLog(A, DPS_LOG_ERROR, "Received header cmd=%d len=%d\n", hdr.cmd, hdr.len);
#endif

    switch (hdr.cmd) {

      case DPS_SEARCHD_CMD_ERROR:
        msg = (char*)DpsMalloc(hdr.len + 1);
        if (msg == NULL) { done = 1; break; }
        nrecv = DpsRecvall(searchd->searchd, msg, hdr.len, 360);
        msg[(nrecv >= 0) ? nrecv : 0] = '\0';
        /* The text comes from the network and may be arbitrarily long: bound the copy.
           Assumes errstr is a fixed-size array member of the config struct -- confirm. */
        dps_snprintf(A->Conf->errstr, sizeof(A->Conf->errstr), "Searchd error: '%s'", msg);
        rc = DPS_OK;
        DPS_FREE(msg);
        done = 1;
        break;

      case DPS_SEARCHD_CMD_MESSAGE:
        /* Informational packet: read and drop it, then keep waiting for the answer. */
        msg = (char*)DpsMalloc(hdr.len + 1);
        if (msg == NULL) { done = 1; break; }
        nrecv = DpsRecvall(searchd->searchd, msg, hdr.len, 360);
        msg[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
        DpsLog(A, DPS_LOG_ERROR, "Message from searchd: '%s'\n", msg);
#endif
        DPS_FREE(msg);
        break;

      case DPS_SEARCHD_CMD_DOCCOUNT:
        dinfo = (char*)DpsMalloc(hdr.len + 1);
        if (dinfo == NULL) { done = 1; break; }
        nrecv = DpsRecvall(searchd->searchd, dinfo, hdr.len, 360);
        dinfo[(nrecv >= 0) ? nrecv : 0] = '\0';
        /* Only interpret the payload as an int when a whole int has arrived. */
        if (nrecv >= (ssize_t)sizeof(int)) {
          A->doccount += *((int *)dinfo);
        } else {
          DpsLog(A, DPS_LOG_ERROR, "Received truncated DOCCOUNT from searchd (%d bytes)", (int)nrecv);
          rc = DPS_ERROR;
        }
#ifdef DEBUG_SDP
        DpsLog(A, DPS_LOG_DEBUG, "Received DOCCOUNT size=%d doccount=%d(+%s)\n", hdr.len, A->doccount, dinfo);
#endif
        DPS_FREE(dinfo);
        done = 1;
        break;

      default:
        dps_snprintf(A->Conf->errstr, sizeof(A->Conf->errstr), "Unknown searchd response: cmd=%d len=%d", hdr.cmd, hdr.len);
        rc = DPS_ERROR;
        done = 1;
        break;
    }
  }
  TRACE_OUT(A);
  return rc;
}
/*
 * Ask a remote searchd for the clones of a document.
 *
 * The document's "DP_ID" section value is sent with DPS_SEARCHD_CMD_CLONES.
 * A DOCINFO reply other than "nocloneinfo" is split on CR/LF and every line
 * is parsed with DpsDocFromTextBuf() into a new entry appended to Res->Doc
 * (Res->num_rows is advanced for each entry).
 *
 *   Indexer - agent; Indexer->Conf->errstr receives error text
 *   Doc     - document whose clones are requested
 *   Res     - result that receives the clone documents
 *   db      - DPS_DB describing the searchd connection (db->searchd)
 *
 * Returns DPS_OK, or DPS_ERROR on an incomplete header, an ERROR or unknown
 * packet, or when Res->Doc cannot be grown.
 */
int DpsCloneListSearchd(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_RESULT *Res, DPS_DB *db) {
  DPS_SEARCHD_PACKET_HEADER hdr;
  ssize_t nrecv;
  char *msg = NULL, *dinfo = NULL;
  char *tok, *lt;
  char buf[128];
  int done = 0;
  int rc = DPS_OK;

  TRACE_IN(Indexer, "DpsCloneListSearchd");

  dps_snprintf(buf, sizeof(buf), "%s", DpsVarListFindStr(&Doc->Sections, "DP_ID", "0"));
  hdr.cmd = DPS_SEARCHD_CMD_CLONES;
  hdr.len = dps_strlen(buf);
  /* A failed send shows up below as an incomplete header. */
  (void)DpsSearchdSendPacket(db->searchd, &hdr, buf);

  while (!done) {
    nrecv = DpsRecvall(db->searchd, &hdr, sizeof(hdr), 360);
    if (nrecv != sizeof(hdr)) {
      DpsLog(Indexer, DPS_LOG_ERROR, "Received incomplete header from searchd (%d bytes)", (int)nrecv);
      TRACE_OUT(Indexer);
      return(DPS_ERROR);
    }
#ifdef DEBUG_SDP
    DpsLog(Indexer, DPS_LOG_DEBUG, "Received header cmd=%d len=%d\n", hdr.cmd, hdr.len);
#endif

    switch (hdr.cmd) {

      case DPS_SEARCHD_CMD_ERROR:
        msg = (char*)DpsMalloc(hdr.len + 1);
        if (msg == NULL) { done = 1; break; }
        nrecv = DpsRecvall(db->searchd, msg, hdr.len, 360);
        msg[(nrecv >= 0) ? nrecv : 0] = '\0';
        /* The text comes from the network and may be arbitrarily long: bound the copy.
           Assumes errstr is a fixed-size array member of the config struct -- confirm. */
        dps_snprintf(Indexer->Conf->errstr, sizeof(Indexer->Conf->errstr), "Searchd error: '%s'", msg);
        rc = DPS_ERROR;
        DPS_FREE(msg);
        done = 1;
        break;

      case DPS_SEARCHD_CMD_DOCINFO:
        dinfo = (char*)DpsMalloc(hdr.len + 1);
        if (dinfo == NULL) { done = 1; break; }
        nrecv = DpsRecvall(db->searchd, dinfo, hdr.len, 360);
        dinfo[(nrecv >= 0) ? nrecv : 0] = '\0';
#ifdef DEBUG_SDP
        DpsLog(Indexer, DPS_LOG_DEBUG, "Received DOCINFO size=%d buf=%s\n", hdr.len, dinfo);
#endif
        if (strcasecmp(dinfo, "nocloneinfo") != 0) {
          /* One clone per line. */
          tok = dps_strtok_r(dinfo, "\r\n", &lt, NULL);
          while (tok) {
            DPS_DOCUMENT *D;
            size_t nd = Res->num_rows;

            /* Same allocation size as before (one spare slot past the new row), but the
               row counter is only advanced once the array has really been grown. */
            Res->Doc = (DPS_DOCUMENT*)DpsRealloc(Res->Doc, (nd + 2) * sizeof(DPS_DOCUMENT));
            if (Res->Doc == NULL) {
              Res->num_rows = 0; /* no array left, so no rows may be advertised */
              dps_snprintf(Indexer->Conf->errstr, sizeof(Indexer->Conf->errstr), "Realloc error");
              rc = DPS_ERROR;
              break;
            }
            Res->num_rows = nd + 1;
            D = &Res->Doc[nd];
            DpsDocInit(D);
            DpsDocFromTextBuf(D, tok);
            tok = dps_strtok_r(NULL, "\r\n", &lt, NULL);
          }
        }
        DPS_FREE(dinfo);
        done = 1;
        break;

      default:
        dps_snprintf(Indexer->Conf->errstr, sizeof(Indexer->Conf->errstr), "Unknown searchd response: cmd=%d len=%d", hdr.cmd, hdr.len);
        rc = DPS_ERROR;
        done = 1;
        break;
    }
  }
  TRACE_OUT(Indexer);
  return rc;
}
/*
 * Append an operator-only item (no word attached) to the query stack and
 * count it as a command.  The caller guarantees there is room for it.
 */
static void DpsStackPushBareCmd(DPS_AGENT *query, DPS_RESULT *Res, DPS_PREPARE_STATE *state, int cmd) {
  DPS_STACK_ITEM *item = &Res->items[Res->nitems];

  (void)query; /* only used by the DEBUG_BOOL trace */

  item->cmd = cmd;
  item->order = 0;
  item->origin = 0;
  item->count = 0;
  item->len = 0;
  item->crcword = 0;
  item->word = NULL;
  item->ulen = 0;
  item->uword = NULL;
  item->pbegin = NULL;
  item->order_origin = 0;
  item->secno = state->secno[state->p_secno];
  Res->nitems++;
  Res->ncmds++;
#ifdef DEBUG_BOOL
  DpsLog(query, DPS_LOG_EXTRA, "1[%d].%x %c -- %s", 0, 0, item_type(cmd), "<NULL>");
#endif
}

/*
 * Append the item described by `state` (a word or an operator) to the
 * boolean query stack Res->items.
 *
 *   query - agent (stop-word list, word length limits, logging)
 *   Res   - result whose stack is extended
 *   state - current parser state: command, order, origin, section number
 *   word  - word in the local charset, or NULL for an operator
 *   uword - same word in Unicode, or NULL
 *
 * Behaviour visible in this function:
 *  - a word found in the stop list, or shorter/longer than the configured
 *    limits, is flagged DPS_WORD_ORIGIN_STOP;
 *  - a word without DPS_WORD_ORIGIN_QUERY that is already on the stack with
 *    the same order and hash is not added again;
 *  - a binary operator directly after another binary operator is dropped;
 *  - an implicit OR is inserted between a word and a preceding word or
 *    closing bracket, and state->add_cmd between a closing and an opening
 *    bracket.
 *
 * Returns DPS_OK, or DPS_ERROR when the stack cannot be grown.
 */
int DpsAddStackItem(DPS_AGENT *query, DPS_RESULT *Res, DPS_PREPARE_STATE *state, char *word, dpsunicode_t *uword) {
  int origin;
  size_t i;
  size_t wlen = (uword == NULL) ? 0 : DpsUniLen(uword);
  dpshash32_t crcword = (word == NULL) ? 0 : DpsStrHash32(word);

#ifdef DEBUG_BOOL
  DpsLog(query, DPS_LOG_EXTRA, "0[%d].%x %c -- %s [%x] .secno:%d\n", state->order, state->origin, item_type(state->cmd), (word == NULL) ? "<NULL>" : word, crcword, state->secno[state->p_secno]);
#endif

  if ((uword != NULL) &&
      ( DpsStopListFind(&query->Conf->StopWords, uword, state->qlang) ||
        (query->WordParam.min_word_len > wlen) ||
        (query->WordParam.max_word_len < wlen)) ) {
    origin = state->origin | DPS_WORD_ORIGIN_STOP;
  } else {
    origin = state->origin;
  }

  /* Word forms not typed by the user are added only once per position. */
  if (state->cmd == DPS_STACK_WORD && !(origin & DPS_WORD_ORIGIN_QUERY)) {
    for (i = 0; i < Res->nitems; i++) {
      if ((Res->items[i].order == state->order) && (Res->items[i].crcword == crcword)) return DPS_OK;
    }
  }

  /* Keep room for this item plus one implicit operator.  Written as an addition so the
     test cannot wrap around when mitems is smaller than 2. */
  if (Res->nitems + 2 >= Res->mitems) {
    Res->mitems += DPS_MAXSTACK;
    Res->items = (DPS_STACK_ITEM*)DpsRealloc(Res->items, Res->mitems * sizeof(DPS_STACK_ITEM));
    if (Res->items == NULL) {
      DpsLog(query, DPS_LOG_ERROR, "Can't alloc %d bytes for %d mitems", (int)(Res->mitems * sizeof(DPS_STACK_ITEM)), (int)Res->mitems);
      return DPS_ERROR;
    }
  }

  if (Res->nitems > 0) {
    /* Two binary operators in a row: ignore the second one. */
    if (state->cmd == DPS_STACK_OR || state->cmd == DPS_STACK_AND || state->cmd == DPS_STACK_NEAR || state->cmd == DPS_STACK_ANYWORD) {
      if (Res->items[Res->nitems-1].cmd == DPS_STACK_AND || Res->items[Res->nitems-1].cmd == DPS_STACK_OR ||
          Res->items[Res->nitems-1].cmd == DPS_STACK_NEAR || Res->items[Res->nitems-1].cmd == DPS_STACK_ANYWORD) {
        return DPS_OK;
      }
    }
    /* Word directly after a word or a closing bracket: join them with an implicit OR. */
    if ((state->cmd == DPS_STACK_WORD) &&
        ( (Res->items[Res->nitems-1].cmd == DPS_STACK_WORD) ||
          (Res->items[Res->nitems-1].cmd == DPS_STACK_RIGHT) ||
          (Res->items[Res->nitems-1].cmd == DPS_STACK_PHRASE_RIGHT) )) {
      DpsStackPushBareCmd(query, Res, state, DPS_STACK_OR);
    }
    /* Opening bracket directly after a closing one: join them with the state's default operator. */
    if ((state->cmd == DPS_STACK_LEFT) &&
        ( (Res->items[Res->nitems-1].cmd == DPS_STACK_RIGHT) ||
          (Res->items[Res->nitems-1].cmd == DPS_STACK_PHRASE_RIGHT) )) {
      DpsStackPushBareCmd(query, Res, state, state->add_cmd);
    }
  }

  Res->items[Res->nitems].cmd = state->cmd;
  Res->items[Res->nitems].order = state->order;
  Res->items[Res->nitems].order_inquery = state->order_inquery;
  Res->items[Res->nitems].origin = origin;
  Res->items[Res->nitems].count = 0;
  Res->items[Res->nitems].len = (word == NULL) ? 0 : dps_strlen(word);
  Res->items[Res->nitems].crcword = crcword;
  Res->items[Res->nitems].word = (word == NULL) ? NULL : DpsStrdup(word);
  Res->items[Res->nitems].ulen = wlen;
  Res->items[Res->nitems].uword = (uword == NULL) ? NULL : DpsUniDup(uword);
  Res->items[Res->nitems].pbegin = NULL;
  Res->items[Res->nitems].order_origin = 0;
  Res->items[Res->nitems].wordnum = Res->nitems;
  Res->items[Res->nitems].secno = state->secno[state->p_secno];
  Res->nitems++;

  if (state->cmd != DPS_STACK_WORD) {
    Res->ncmds++;
  } else {
    /* The origin flags of all forms of a word are accumulated in the item indexed by its order. */
    Res->items[state->order].order_origin |= origin;
    if (state->order > Res->max_order) Res->max_order = state->order;
    /* Track the maximum of order_inquery itself; the value stored here used to be state->order. */
    if (state->order_inquery > Res->max_order_inquery) Res->max_order_inquery = state->order_inquery;
  }

#ifdef DEBUG_BOOL
  DpsLog(query, DPS_LOG_EXTRA, "1[%d,%d].%x %c -- %s", state->order, state->order_inquery, state->origin, item_type(state->cmd), (word == NULL) ? "<NULL>" : word);
#endif
  return DPS_OK;
}
/*
 * Build the absolute URL string for a link found in a document.
 *
 * Parts missing from newURL (schema, host name, auth, path, file name) are
 * taken from curURL, the path is normalized with DpsURLNormalizePath() and
 * the result is returned in a newly allocated string:
 *   - "schema:specific" for mailto, javascript and feed links,
 *   - "schema:/path"    for htdb links,
 *   - "schema://[auth@]host/path" otherwise.
 * A ";type..." suffix is cut off ftp:// URLs.  When ReverseAliasFlag is set,
 * the "ReverseAliasProg" program (if configured) and then the ReverseAliases
 * match list (at most 1024 cascaded rewrites) are applied to the string.
 *
 *   Indexer          - agent (variables, reverse aliases, logging)
 *   curURL           - URL of the document containing the link
 *   newURL           - parsed link; charset_id is inherited from curURL when
 *                      the link carries no host info
 *   str              - receives the result; *str is NULL when it could not
 *                      be allocated.  The previous value of *str is
 *                      overwritten, not freed.
 *   ReverseAliasFlag - non-zero to apply reverse aliases
 */
void RelLink(DPS_AGENT *Indexer, DPS_URL *curURL, DPS_URL *newURL, char **str, int ReverseAliasFlag) {
  const char *schema = newURL->schema ? newURL->schema : curURL->schema;
  const char *hostname = newURL->hostname ? newURL->hostname : curURL->hostname;
  const char *auth = newURL->auth ? newURL->auth : curURL->auth;
  const char *path = (newURL->path && newURL->path[0]) ? newURL->path : curURL->path;
  /* The current document's file name is reused only when the link has neither a file name nor a path. */
  const char *fname = ((newURL->filename && newURL->filename[0]) || (newURL->path && newURL->path[0])) ?
    newURL->filename : curURL->filename;
  const char *query_string = newURL->query_string;
  char *pathfile = (char*)DpsMalloc(dps_strlen(DPS_NULL2EMPTY(path)) + dps_strlen(DPS_NULL2EMPTY(fname)) +
                                    dps_strlen(DPS_NULL2EMPTY(query_string)) + 5);
  int cascade;
  DPS_MATCH *Alias;
  char *alias = NULL;
  char *ftp_type;
  size_t aliassize, nparts = 10;
  DPS_MATCH_PART Parts[10];

  if (newURL->hostinfo == NULL) newURL->charset_id = curURL->charset_id;

  if (pathfile == NULL) return;

  /* pathfile = "/" + path + file name + query string */
  pathfile[0] = '/';
  dps_strcpy(pathfile + 1, DPS_NULL2EMPTY(path));
  dps_strcat(pathfile, DPS_NULL2EMPTY(fname));
  dps_strcat(pathfile, DPS_NULL2EMPTY(query_string));

  DpsURLNormalizePath(pathfile);

  /* Every failure below leaves through `ret`, which releases pathfile and alias. */
  if (!strcasecmp(DPS_NULL2EMPTY(schema), "mailto")
      || !strcasecmp(DPS_NULL2EMPTY(schema), "javascript")
      || !strcasecmp(DPS_NULL2EMPTY(schema), "feed")
      ) {
    *str = (char*)DpsMalloc(dps_strlen(DPS_NULL2EMPTY(schema)) + dps_strlen(DPS_NULL2EMPTY(newURL->specific)) + 4);
    if (*str == NULL) goto ret;
    dps_strcpy(*str, DPS_NULL2EMPTY(schema));
    dps_strcat(*str, ":");
    dps_strcat(*str, DPS_NULL2EMPTY(newURL->specific));
  } else if (/*!strcasecmp(DPS_NULL2EMPTY(schema), "file") ||*/ !strcasecmp(DPS_NULL2EMPTY(schema), "htdb")) {
    *str = (char*)DpsMalloc(dps_strlen(DPS_NULL2EMPTY(schema)) + dps_strlen(pathfile) + 4);
    if (*str == NULL) goto ret;
    dps_strcpy(*str, DPS_NULL2EMPTY(schema));
    dps_strcat(*str, ":");
    dps_strcat(*str, pathfile);
  } else {
    *str = (char*)DpsMalloc(dps_strlen(DPS_NULL2EMPTY(schema)) + dps_strlen(pathfile) +
                            dps_strlen(DPS_NULL2EMPTY(hostname)) + dps_strlen(DPS_NULL2EMPTY(auth)) + 8);
    if (*str == NULL) goto ret;
    dps_strcpy(*str, DPS_NULL2EMPTY(schema));
    dps_strcat(*str, "://");
    if (auth) {
      dps_strcat(*str, auth);
      dps_strcat(*str, "@");
    }
    dps_strcat(*str, DPS_NULL2EMPTY(hostname));
    dps_strcat(*str, pathfile);
  }

  /* For ftp URLs carrying a ";type=" parameter, cut the string at the first ";type". */
  if (!strncmp(*str, "ftp://", 6) && (strstr(*str, ";type=") != NULL)) {
    ftp_type = strstr(*str, ";type");
    if (ftp_type != NULL) *ftp_type = '\0';
  }

  if (ReverseAliasFlag) {
    const char *alias_prog = DpsVarListFindStr(&Indexer->Vars, "ReverseAliasProg", NULL);

    if (alias_prog) {
      int result;
      aliassize = 256 + 2 * dps_strlen(*str);
      alias = (char*)DpsRealloc(alias, aliassize);
      if (alias == NULL) {
        DpsLog(Indexer, DPS_LOG_ERROR, "No memory (%d bytes). %s line %d", (int)aliassize, __FILE__, __LINE__);
        goto ret;
      }
      alias[0] = '\0';
      result = DpsAliasProg(Indexer, alias_prog, *str, alias, aliassize - 1);
      DpsLog(Indexer, DPS_LOG_EXTRA, "ReverseAliasProg result: '%s'", alias);
      if (result != DPS_OK) goto ret;
      DPS_FREE(*str);
      *str = (char*)DpsStrdup(alias);
      if (*str == NULL) goto ret; /* nothing left to match against */
    }

    for (cascade = 0; ((Alias = DpsMatchListFind(&Indexer->Conf->ReverseAliases, *str, nparts, Parts))) && (cascade < 1024); cascade++) {
      aliassize = dps_strlen(Alias->arg) + dps_strlen(Alias->pattern) + dps_strlen(*str) + 128;
      alias = (char*)DpsRealloc(alias, aliassize);
      if (alias == NULL) {
        DpsLog(Indexer, DPS_LOG_ERROR, "No memory (%d bytes). %s line %d", (int)aliassize, __FILE__, __LINE__);
        goto ret;
      }
      DpsMatchApply(alias, aliassize, *str, Alias->arg, Alias, nparts, Parts);
      if (alias[0]) {
        DpsLog(Indexer, DPS_LOG_DEBUG, "ReverseAlias%d: pattern:%s, arg:%s -> '%s'", cascade, Alias->pattern, Alias->arg, alias);
        DPS_FREE(*str);
        *str = (char*)DpsStrdup(alias);
        if (*str == NULL) goto ret; /* the loop condition would dereference it */
      } else break;
      if (Alias->last) break;
    }
  }

ret:
  DPS_FREE(pathfile);
  DPS_FREE(alias);
}
/*
 * Create the limit index files for every "Limit-<name>" configuration
 * variable.
 *
 * The variable's value selects the kind of limit.  The built-in kinds
 * (category, tag, link, time, hostname, language, content, siteid) read a
 * fixed field through DpsLimit8()/DpsLimit4().  Any other value is treated
 * as the field type of a user-defined limit: the SQL request is taken from
 * the "Req-<name>" variable and an optional database address from
 * "dbaddr-<name>".
 *
 *   Indexer - agent (configuration variables, logging)
 *   db      - database the built-in limits are read from
 *
 * Returns DPS_OK, or DPS_ERROR when the variable-name buffer for a
 * user-defined limit cannot be allocated.  A failure to read an individual
 * limit skips that index without failing the call.
 */
__C_LINK int __DPSCALL DpsCacheMakeIndexes(DPS_AGENT *Indexer, DPS_DB *db) {
  DPS_UINT8URLIDLIST L8;
  DPS_UINT4URLIDLIST L4;
  DPS_VARLIST *v = &Indexer->Conf->Vars;
  size_t i, r;
  char *ind, *nm, *lfname;

  bzero(&L4, sizeof(DPS_UINT4URLIDLIST));
  bzero(&L8, sizeof(DPS_UINT8URLIDLIST));

  /* Variables are bucketed by the character code in Root[]; all "Limit-..." names sit in the 'l' bucket. */
  r = (size_t) 'l';
  for (i = 0; i < v->Root[r].nvars; i++) {
    if (!strncasecmp("Limit-", v->Root[r].Var[i].name, 6)) {
      ind = v->Root[r].Var[i].val;
      lfname = v->Root[r].Var[i].name;
      nm = lfname + 6;

      /* The dps_setproctitle() calls make the current step visible in "ps" output on xBSD. */
      if (!strcasecmp(ind, "category")) {
        dps_setproctitle("[%d] Category index creation", Indexer->handle);
        DpsLog(Indexer, DPS_LOG_EXTRA, "Creating category index");
        if (DPS_OK == DpsLimit8(Indexer, &L8, "Category", DPS_IFIELD_TYPE_HEX8STR, db)) {
          MakeNestedIndex(Indexer, &L8, DPS_LIMFNAME_CAT, db);
        }
      } else if (!strcasecmp(ind, "tag")) {
        dps_setproctitle("[%d] Tag index creation", Indexer->handle);
        DpsLog(Indexer, DPS_LOG_EXTRA, "Creating tag index");
        if (DPS_OK == DpsLimit4(Indexer, &L4, "Tag", DPS_IFIELD_TYPE_STRCRC32, db)) {
          MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_TAG, db);
        }
      } else if (!strcasecmp(ind, "link")) {
        dps_setproctitle("[%d] Link index creation", Indexer->handle);
        DpsLog(Indexer, DPS_LOG_EXTRA, "Creating link index");
        if (DPS_OK == DpsLimit4(Indexer, &L4, "link", DPS_IFIELD_TYPE_INT, db)) {
          MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_LINK, db);
        }
      } else if (!strcasecmp(ind, "time")) {
        dps_setproctitle("[%d] Time index creation", Indexer->handle);
        DpsLog(Indexer, DPS_LOG_EXTRA, "Creating time index");
        if (DPS_OK == DpsLimit4(Indexer, &L4, "last_mod_time", DPS_IFIELD_TYPE_HOUR, db)) {
          MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_TIME, db);
        }
      } else if (!strcasecmp(ind, "hostname")) {
        dps_setproctitle("[%d] Hostname index creation", Indexer->handle);
        DpsLog(Indexer, DPS_LOG_EXTRA, "Creating hostname index");
        if (DPS_OK == DpsLimit4(Indexer, &L4, "url", DPS_IFIELD_TYPE_HOSTNAME, db)) {
          MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_HOST, db);
        }
      } else if (!strcasecmp(ind, "language")) {
        dps_setproctitle("[%d] Language index creation", Indexer->handle);
        DpsLog(Indexer, DPS_LOG_EXTRA, "Creating language index");
        if (DPS_OK == DpsLimit4(Indexer, &L4, "Content-Language", DPS_IFIELD_TYPE_STR2CRC32, db)) {
          MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_LANG, db);
        }
      } else if (!strcasecmp(ind, "content")) {
        dps_setproctitle("[%d] Content-Type index creation", Indexer->handle);
        DpsLog(Indexer, DPS_LOG_EXTRA, "Creating Content-Type index");
        if (DPS_OK == DpsLimit4(Indexer, &L4, "Content-Type", DPS_IFIELD_TYPE_STRCRC32, db)) {
          MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_CTYPE, db);
        }
      } else if (!strcasecmp(ind, "siteid")) {
        dps_setproctitle("[%d] Site_id index creation", Indexer->handle);
        DpsLog(Indexer, DPS_LOG_EXTRA, "Creating Site_id index");
        if (DPS_OK == DpsLimit4(Indexer, &L4, "site_id", DPS_IFIELD_TYPE_INT, db)) {
          MakeLinearIndex(Indexer, &L4, DPS_LIMFNAME_SITE, db);
        }
      } else {
        /* User-defined limit. */
        char *buf, *req, *dbaddr;
        DPS_DB ldb, *pdb = &ldb;
        size_t buf_len = dps_strlen(nm) + 16; /* room for the "Req-" / "dbaddr-" prefix and the terminator */

        if ((buf = (char*) DpsMalloc(buf_len * sizeof(char))) == NULL) {
          DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d chars at %s:%d", (int)buf_len, __FILE__, __LINE__);
          return DPS_ERROR;
        }

        dps_setproctitle("[%d] %s index creation", Indexer->handle, nm);
        DpsLog(Indexer, DPS_LOG_EXTRA, "Creating %s index", nm);

        dps_snprintf(buf, buf_len, "Req-%s", nm);
        req = DpsVarListFindStr(&Indexer->Conf->Vars, buf, NULL);

        if (req != NULL) {
          dps_snprintf(buf, buf_len, "dbaddr-%s", nm);
          dbaddr = DpsVarListFindStr(&Indexer->Conf->Vars, buf, NULL);

          /* NOTE(review): ldb is handed to DpsDBSetAddr() without visible initialisation and is
             never released here -- confirm against the DPS_DB API. */
          if (dbaddr != NULL) {
            DpsDBSetAddr(pdb, dbaddr, DPS_OPEN_MODE_READ);
          } else {
            pdb = db;
          }

          /* "nex8str" is the spelling this code has always tested; it looks like a typo of
             "hex8str" (the field type is DPS_IFIELD_TYPE_HEX8STR), so both are accepted. */
          if (!strcasecmp(ind, "nex8str") || !strcasecmp(ind, "hex8str")) {
            if (DPS_OK == DpsSQLLimit8(Indexer, &L8, req, DPS_IFIELD_TYPE_HEX8STR, pdb)) {
              MakeNestedIndex(Indexer, &L8, lfname, pdb);
            }
          } else {
            int field_type = DPS_IFIELD_TYPE_INT;
            if (!strcasecmp(ind, "strcrc32")) field_type = DPS_IFIELD_TYPE_STRCRC32;
            else if (!strcasecmp(ind, "hour")) field_type = DPS_IFIELD_TYPE_HOUR;
            else if (!strcasecmp(ind, "hostname")) field_type = DPS_IFIELD_TYPE_HOSTNAME;
            else if (!strcasecmp(ind, "char2")) field_type = DPS_IFIELD_TYPE_STR2CRC32;
            else if (!strcasecmp(ind, "int")) field_type = DPS_IFIELD_TYPE_INT;
            /* NOTE(review): this branch uses db, not pdb, so a "dbaddr-<name>" setting is
               ignored for 4-byte limits -- confirm whether that is intended. */
            if (DPS_OK == DpsSQLLimit4(Indexer, &L4, req, field_type, db)) {
              MakeLinearIndex(Indexer, &L4, lfname, db);
            }
          }
        }
        /* The name buffer was previously leaked once per user-defined limit. */
        DPS_FREE(buf);
      }

      dps_setproctitle("[%d] Indexes done.", Indexer->handle);
      DpsLog(Indexer, DPS_LOG_EXTRA, "Done");
    }
  }
  return DPS_OK;
}
/*
 * Load a word frequency dictionary ("FreqDic") file into List.
 *
 * Agent   - agent whose configuration supplies the EtcDir variable, the
 *           CharsToEscape setting and the logging state.
 * List    - list the parsed words are added to; it is sorted and entries
 *           that compare equal under cmpchinese() are merged afterwards.
 * charset - name of the character set the file is written in.
 * fname   - dictionary file name; a name that does not start with '/' is
 *           looked up under the EtcDir variable (default DPS_CONF_DIR).
 *
 * Every line that is neither empty nor starts with '#' is parsed as
 * "<frequency> <word>"; the word is converted from `charset` to the
 * "sys-int" charset before being added to List.
 *
 * Returns DPS_OK on success, DPS_ERROR when the charset is unknown or the
 * file cannot be stat'ed, opened, read, or buffered in memory.
 */
int DpsChineseListLoad(DPS_AGENT *Agent, DPS_CHINALIST *List, const char *charset, const char *fname) {
  struct stat sb;
  char *str, *data = NULL, *cur_n = NULL;
  DPS_CHINAWORD chinaword;
  char word[PATH_MAX];
  dpsunicode_t uword[256];
  DPS_CHARSET *sys_int, *fcs;
  DPS_CONV to_uni;
  int fd;
  char savebyte = '\0';

  sys_int = DpsGetCharSet("sys-int");
  if (!(fcs = DpsGetCharSet(charset))) {
    if (Agent->Conf->is_log_open) DpsLog(Agent, DPS_LOG_ERROR, "Charset '%s' not found or not supported", charset);
    else fprintf(stderr, "Charset '%s' not found or not supported", charset);
    return DPS_ERROR;
  }
  DpsConvInit(&to_uni, fcs, sys_int, Agent->Conf->CharsToEscape, DPS_RECODE_HTML);

  /* `word` doubles as the path buffer here; fname is not used any more once parsing starts. */
  if (*fname != '/') {
    dps_snprintf(word, sizeof(word), "%s/%s", DpsVarListFindStr(&Agent->Conf->Vars, "EtcDir", DPS_CONF_DIR), fname);
    fname = word;
  }

  if (stat(fname, &sb)) {
    if (Agent->Conf->is_log_open) DpsLog(Agent, DPS_LOG_ERROR, "Unable to stat FreqDic file '%s': %s", fname, strerror(errno));
    else fprintf(stderr, "Unable to stat FrecDic file '%s': %s", fname, strerror(errno));
    return DPS_ERROR;
  }

  /* open() reports failure with a negative value; descriptor 0 is a valid result. */
  if ((fd = open(fname, O_RDONLY)) < 0) {
    if (Agent->Conf->is_log_open) DpsLog(Agent, DPS_LOG_ERROR, "Unable to open FreqDic file '%s': %s", fname, strerror(errno));
    else fprintf(stderr, "Unable to open FreqDic file '%s': %s", fname, strerror(errno));
    return DPS_ERROR;
  }

  if ((data = (char*)DpsMalloc(sb.st_size + 1)) == NULL) {
    /* The cast keeps the argument type in step with the %d conversion. */
    if (Agent->Conf->is_log_open) DpsLog(Agent, DPS_LOG_ERROR, "Unable to alloc %d bytes", (int)sb.st_size);
    else fprintf(stderr, "Unable to alloc %ld bytes", (long)sb.st_size);
    close(fd);
    return DPS_ERROR;
  }

  if (read(fd, data, sb.st_size) != (ssize_t)sb.st_size) {
    if (Agent->Conf->is_log_open) DpsLog(Agent, DPS_LOG_ERROR, "Unable to read FreqDic file '%s': %s", fname, strerror(errno));
    else fprintf(stderr, "Unable to read FreqDic file '%s': %s", fname, strerror(errno));
    DPS_FREE(data);
    close(fd);
    return DPS_ERROR;
  }
  data[sb.st_size] = '\0';
  close(fd);

  bzero((void*)&chinaword, sizeof(chinaword));
  chinaword.word = uword;

  /*
   * Walk the buffer line by line.  The byte following each newline is
   * temporarily replaced with '\0' so the current line can be handled as a
   * C string, and is restored before moving on to the next line.
   */
  str = data;
  while (str != NULL) {
    cur_n = strchr(str, NL_INT);
    if (cur_n != NULL) {
      cur_n++;
      savebyte = *cur_n;
      *cur_n = '\0';
    }

    if (str[0] != '\0' && str[0] != '#') {
      /* Only add the entry when both fields were parsed; otherwise freq/word
         would still hold the values of the previous line. */
      if (sscanf(str, "%d %63s ", &chinaword.freq, word) == 2) {
        DpsConv(&to_uni, (char*)uword, sizeof(uword), word, sizeof(word));
        DpsChineseListAdd(List, &chinaword);
      }
    }

    if (cur_n != NULL) *cur_n = savebyte;
    str = cur_n;
  }
  DPS_FREE(data);

  DpsChineseListSort(List);

  /*
   * Merge entries that compare equal: their frequencies are summed into the
   * first occurrence.  Unique entries are swapped forward so that slots
   * 0..j hold the merged list and every slot after j holds an already
   * merged duplicate whose word can be released.
   * Nothing to do for lists with fewer than two entries (an empty list must
   * keep nwords == 0).
   */
  if (List->nwords > 1) {
    size_t i, j = 0;

    for (i = 1; i < List->nwords; i++) {
      if (cmpchinese(&List->ChiWord[j], &List->ChiWord[i]) == 0) {
        List->ChiWord[j].freq += List->ChiWord[i].freq;
      } else {
        j++;
        if (j != i) {
          DPS_CHINAWORD tmp = List->ChiWord[j];
          List->ChiWord[j] = List->ChiWord[i];
          List->ChiWord[i] = tmp;
        }
      }
    }
    for (i = j + 1; i < List->nwords; i++) {
      DPS_FREE(List->ChiWord[i].word);
    }
    List->nwords = j + 1;
  }

  return DPS_OK;
}
/*
 * Scan every store file of the base described by P and delete the records
 * that checkrec() does not recognise.
 *
 * P        - base parameters; P->NFiles store files are visited, P->A is
 *            used for logging and is passed on to checkrec().
 * checkrec - callback invoked with each non-zero rec_id found in a store
 *            index; a return value of 0 marks the record for deletion.
 *
 * For each store the index file (P->Ifd) is read from its beginning as a
 * sequence of DPS_BASEITEM records; the ids to remove are collected first
 * and deleted with DpsBaseDelete() once the scan of that store is finished.
 *
 * Returns DPS_OK on completion, and also when the scan is cut short because
 * one of the have_sigterm / have_sigint / have_sigalrm flags is set.
 * Returns DPS_ERROR on allocation failure or when the index file cannot be
 * rewound.
 */
__C_LINK int __DPSCALL DpsBaseCheckup(DPS_BASE_PARAM *P, int (*checkrec) (DPS_AGENT *A, const urlid_t rec_id)) {
  urlid_t *todel = (urlid_t*)DpsMalloc(128 * sizeof(urlid_t));
  size_t ndel = 0, mdel = 128, totaldel = 0;
  size_t z;
  urlid_t i;

  if (todel == NULL) return DPS_ERROR;

  for (i = 0; i < (urlid_t)P->NFiles; i++) {

    if (have_sigterm || have_sigint || have_sigalrm) {
      DpsLog(P->A, DPS_LOG_EXTRA, "%s signal received. Exiting chackup",
             (have_sigterm) ? "SIGTERM" : (have_sigint) ? "SIGINT" : "SIGALRM");
      DpsBaseClose(P);
      DPS_FREE(todel);
      return DPS_OK;
    }

    /* Select store number i. */
    P->rec_id = i << DPS_BASE_BITS;
    if (DpsBaseOpen(P, DPS_READ_LOCK) != DPS_OK) {
      DpsBaseClose(P);
      continue;
    }

    if (lseek(P->Ifd, (off_t)0, SEEK_SET) == (off_t)-1) {
      DpsLog(P->A, DPS_LOG_ERROR, "Can't seeek for file %s", P->Ifilename);
      DpsBaseClose(P);
      DPS_FREE(todel);
      return DPS_ERROR;
    }

    /* Collect the ids of all records the callback rejects. */
    while (read(P->Ifd, &P->Item, sizeof(DPS_BASEITEM)) == sizeof(DPS_BASEITEM)) {
      if (P->Item.rec_id == 0) continue;
      if (checkrec(P->A, P->Item.rec_id) != 0) continue;

      if (ndel >= mdel) {
        mdel += 128;
        /* NOTE(review): the old block is lost here if DpsRealloc() does not
           release it on failure - confirm against its implementation. */
        todel = (urlid_t*)DpsRealloc(todel, mdel * sizeof(urlid_t));
        if (todel == NULL) {
          DpsBaseClose(P);
          DpsLog(P->A, DPS_LOG_ERROR, "Can't realloc %d bytes %s:%d",
                 (int)(mdel * sizeof(urlid_t)), __FILE__, __LINE__);
          DPS_FREE(todel);
          return DPS_ERROR;
        }
      }
      todel[ndel++] = P->Item.rec_id;
    }
    DpsBaseClose(P);

    /* Deletion happens after the scan, once the store has been closed. */
    for (z = 0; z < ndel; z++) {
      DpsLog(P->A, DPS_LOG_DEBUG, "Base %s/%s store %03X: deleting url_id: %X",
             P->subdir, P->basename, (unsigned)i, (unsigned)todel[z]);
      P->rec_id = todel[z];
      DpsBaseDelete(P);
    }
    DpsBaseClose(P);

    DpsLog(P->A, DPS_LOG_INFO, "Base %s/%s store %03X, %d lost records deleted",
           P->subdir, P->basename, (unsigned)i, (int)ndel);
    totaldel += ndel;
    ndel = 0;
  }

  DPS_FREE(todel);
  DpsLog(P->A, DPS_LOG_EXTRA, "Total lost record(s) deleted: %d\n", (int)totaldel);
  return DPS_OK;
}
/*
 * Main function to calculate items sequence.
 *
 * Evaluates the boolean query stored in Res->items (Res->nitems entries)
 * using a command/argument stack and stores the resulting coordinate list
 * in Res->CoordList.
 *
 * query - agent, used for logging and passed to perform().
 * Res   - search result; its items are consumed (their pbegin/plast are
 *         reset once pushed), and Res->WWList is filled from the items on
 *         the first call, i.e. while it still holds no words.
 *
 * Returns DPS_OK on success, DPS_STACK_ERR when the stack cannot be
 * created, a push fails, an allocation fails, or perform() reports an error.
 */
int DpsCalcBoolItems(DPS_AGENT *query, DPS_RESULT *Res) {
  DPS_BOOLSTACK *s = DpsBoolStackInit(NULL);
  DPS_STACK_ITEM *items = Res->items, *res;
  DPS_WIDEWORD Word;
  size_t nitems = Res->nitems;
  size_t i, j;
  int first_time = (Res->WWList.nwords == 0);

  if (s == NULL) return DPS_STACK_ERR;

  if (nitems == 0) {
    /* Empty query: hand over whatever the first slot holds.  The slot is
       only touched when the items array exists at all. */
    if (items != NULL) {
      Res->CoordList.Coords = items[0].pbegin;
      items[0].pbegin = NULL;
      items[0].count = 0;
    }
    DpsBoolStackFree(s);
    return DPS_OK;
  }

  for (i = 0; i < nitems; i++) {
    if (items[i].cmd != DPS_STACK_WORD) continue;

    if ((items[i].pbegin == NULL) && ((items[i].origin & DPS_WORD_ORIGIN_STOP) == 0)) {
      /* A non-stop word without coordinates: look for an earlier item with
         the same crcword and clone its coordinates. */
      for (j = 0; j < i; j++) {
        if (items[j].crcword == items[i].crcword) break;
      }
      if (j < i) {
        size_t z;

        items[i].count = items[j].count;
        items[i].pbegin = (DPS_URL_CRD_DB*)DpsMalloc((items[j].count + 1) * sizeof(DPS_URL_CRD_DB));
        if (items[i].pbegin == NULL) {
          DpsLog(query, DPS_LOG_ERROR, "Can't alloc %d bytes %s:%d",
                 (int)((items[j].count + 1) * sizeof(DPS_URL_CRD_DB)), __FILE__, __LINE__);
          DpsBoolStackFree(s);
          return DPS_STACK_ERR;
        }
        for (z = 0; z < items[i].count; z++) {
          items[i].pbegin[z] = items[j].pbegin[z];
          /* Replace the low byte of the coordinate with this item's word number. */
          items[i].pbegin[z].coord &= 0xFFFFFF00;
          items[i].pbegin[z].coord += (items[i].wordnum & 0xFF);
        }
      }
    }

    if (first_time) {
      Word.order = items[i].order;
      Word.count = items[i].count;
      Word.crcword = items[i].crcword;
      Word.word = items[i].word;
      Word.uword = items[i].uword;
      Word.origin = items[i].origin;
      DpsWideWordListAdd(&Res->WWList, &Word, DPS_WWL_LOOSE);
    }
  }

#ifdef DEBUG_BOOL
  DpsLog(query, DPS_LOG_EXTRA, "--------");
  for (i = 0; i < nitems; i++) {
    DpsLog(query, DPS_LOG_EXTRA, "[%d].%d %c : %d -- %s", (int)i, items[i].wordnum, item_type(items[i].cmd), items[i].count, (items[i].word == NULL) ? "<NULL>" : items[i].word);
  }
  DpsLog(query, DPS_LOG_EXTRA, "--------");
#endif

  for (i = 0; i < nitems; i++) {
    int c;
#ifdef DEBUG_BOOL
    DpsLog(query, DPS_LOG_EXTRA, ".[%d].%d %c : %d -- %s, (order_origin:%d)", (int)i, items[i].wordnum, item_type(items[i].cmd), items[i].count, (items[i].word == NULL) ? "<NULL>" : items[i].word, items[i].order_origin);
#endif
    switch (c = items[i].cmd) {

      case DPS_STACK_RIGHT:
        /* Perform till LEFT bracket */
        while ((TOPCMD(s) != DPS_STACK_LEFT) && (TOPCMD(s) != DPS_STACK_BOT)) {
          if (DPS_OK != perform(query, Res, s, POPCMD(s))) {
            DpsBoolStackFree(s);
            return DPS_STACK_ERR;
          }
        }
        /* Pop LEFT bracket itself */
        if (TOPCMD(s) == DPS_STACK_LEFT) {
          POPCMD(s);
        }
        break;

      case DPS_STACK_OR:
      case DPS_STACK_AND:
      case DPS_STACK_NEAR:
      case DPS_STACK_ANYWORD:
        /* Perform every stacked command whose value is not lower than the
           incoming one before the incoming one is pushed. */
        if (s->nastack > 1) {
          while (c <= TOPCMD(s)) {
            if (DPS_OK != perform(query, Res, s, POPCMD(s))) {
              DpsBoolStackFree(s);
              return DPS_STACK_ERR;
            }
          }
        }
        /* IMPORTANT! No break here! That's OK - the fall through is intentional. */
        /* fallthrough */
      case DPS_STACK_LEFT:
      case DPS_STACK_PHRASE_LEFT:
      case DPS_STACK_NOT:
        if (PUSHCMD(s,c) != DPS_OK) {
          DpsBoolStackFree(s);
          return DPS_STACK_ERR;
        }
        break;

      case DPS_STACK_PHRASE_RIGHT:
        /* Perform till the opening phrase quote */
        while ((TOPCMD(s) != DPS_STACK_PHRASE_LEFT) && (TOPCMD(s) != DPS_STACK_BOT)) {
          if (DPS_OK != perform(query, Res, s, POPCMD(s))) {
            DpsBoolStackFree(s);
            return DPS_STACK_ERR;
          }
        }
        /* NOTE(review): unlike the other call sites the result of this
           perform() is not checked - confirm whether that is deliberate. */
        if (TOPCMD(s) == DPS_STACK_PHRASE_LEFT) {
          perform(query, Res, s, POPCMD(s));
        }
#ifdef DEBUG_BOOL
        DpsLog(query, DPS_LOG_EXTRA, "[%d] %c", (int)i, item_type(items[i].cmd));
#endif
        break;

      case DPS_STACK_WORD:
        items[i].order_from = items[i].order_to = items[i].order;
        /* fallthrough */
      default:
        if (DPS_OK != PUSHARG(s, &items[i])) {
          DpsBoolStackFree(s);
          return DPS_STACK_ERR;
        }
        /* The item's coordinate pointers are cleared once it has been pushed. */
        items[i].pbegin = items[i].plast = NULL;
        break;
    }
  }

  /* Perform whatever commands are still stacked. */
  while (TOPCMD(s) != DPS_STACK_BOT) {
    if (DPS_OK != perform(query, Res, s, POPCMD(s))) {
      DpsBoolStackFree(s);
      return DPS_STACK_ERR;
    }
  }

  res = POPARG(s);
  if (res != NULL) {
    Res->CoordList.Coords = res->pbegin;
    Res->CoordList.ncoords = res->count;
    /* The coordinates now belong to Res->CoordList; detach them before the item is freed. */
    res->pbegin = res->plast = NULL;
    DpsStackItemFree(res);
  }
#ifdef DEBUG_BOOL
  DpsLog(query, DPS_LOG_EXTRA, "result: %x", res);
#endif
  DpsBoolStackFree(s);
  return DPS_OK;
}
/*
 * Encode `num` url ids from `data` and write them to `f` with the CARRY_*
 * / WORD_ENCODE macros.
 *
 * The ids are processed in blocks of at most ELEMS_PER_BLOCK elements; each
 * element is stored as the difference to the element before it (the first
 * one relative to 0).
 *
 * Returns DPS_OK on success, DPS_ERROR when a work buffer cannot be
 * allocated.
 *
 * NOTE(review): the encoding macros are defined elsewhere and may refer to
 * the local variables of this function by name - keep the names as they are.
 */
int DpsCarryLimitWrite(DPS_AGENT *Indexer, FILE *f, urlid_t *data, size_t num) {
  unsigned char *bits;
  urlid_t curr = 0;
  urlid_t *a;
  size_t i, n, z;
  int j;
  size_t avail, elems;
  unsigned size, max_bits;
  unsigned char *table, *base;
  unsigned global_max, global_sum, global_n;

  /* bits[i] receives the minimal number of bits needed to code a[i] */
  bits = (unsigned char*)malloc(ELEMS_PER_BLOCK * sizeof(unsigned char));
  if (bits == NULL) {
    DpsLog(Indexer, DPS_LOG_ERROR, "Out of memory [%s:%d]", __FILE__, __LINE__);
    return DPS_ERROR;
  }

  /* a[] holds the gaps of the block currently being encoded */
  a = (urlid_t*) malloc(ELEMS_PER_BLOCK * sizeof(urlid_t));
  if (a == NULL) {
    DpsLog(Indexer, DPS_LOG_ERROR, "Out of memory [%s:%d]", __FILE__, __LINE__);
    DPS_FREE(bits);
    return DPS_ERROR;
  }

  CARRY_ENCODE_START(f);

  size = TRANS_TABLE_STARTER;
  global_max = global_sum = global_n = 0;

  z = 0;
  while (z < num) {

    /* Fill the next block with gaps between consecutive ids. */
    n = 0;
    while ((n < ELEMS_PER_BLOCK) && (z < num)) {
      a[n] = data[z] - curr;
      curr = data[z];
      n++;
      z++;
    }

    max_bits = CalcMinBits(a, bits, n, &global_max, &global_sum, &global_n);
    CARRY_BLOCK_ENCODE_START(n, max_bits);

    i = 0;
    while (i < n) {
      avail = GET_AVAILABLE_BITS;
      table = GET_TRANS_TABLE(avail);
      base = table + (size << 2);    /* row in trans table */

      /* 1. Modeling: find j = the first column of the row that fits */
      for (j = 0; j < 4; j++) {
        size = base[j];
        if (size > avail) {
          /* does not fit into the current word: use the next word for the
             data and restart the column search (j becomes 0 after j++) */
          avail = 32;
          j = -1;
          continue;
        }
        elems = elems_coded(avail, size, bits, i, n - 1);
        if (elems) break;
      }

      /* 2. Coding: code elements using row "base" and column "j" */
      WORD_ENCODE(j+1,2);            /* the column itself */
      for ( ; elems ; elems--, i++)  /* the d-gaps */
        WORD_ENCODE(a[i],size);
    }
  }

  CARRY_ENCODE_END;

  DPS_FREE(a);
  DPS_FREE(bits);
  return DPS_OK;
}
/*
 * Write the nested (64-bit key) limit index `lim_name` to disk.
 *
 * Indexer  - agent, used for logging and for the VarDir setting.
 * L        - list of (hi, lo, url_id) items; it is sorted with cmp_ind8 and
 *            cleared with ClearIndex8() before the function returns (except
 *            when L->Item is NULL on entry).
 * lim_name - base name of the files to create.
 * db       - database whose vardir, when set, overrides the VarDir variable.
 *
 * Two files are created under <vardir>/DPS_TREEDIR/:
 *   <lim_name>.dat - the url ids of all items, in sorted order;
 *   <lim_name>.ind - one DPS_UINT8_POS_LEN entry per run of items sharing
 *                    the same (hi, lo) key, holding the byte offset and byte
 *                    length of that run inside the .dat file.
 *
 * Returns 0 on success and 1 on failure (no items, out of memory, or a file
 * that cannot be opened or written).
 */
static int MakeNestedIndex(DPS_AGENT *Indexer, DPS_UINT8URLIDLIST *L, const char *lim_name, DPS_DB *db) {
  DPS_ENV *Conf = Indexer->Conf;
  size_t k, prev;
  urlid_t *data = NULL;
  DPS_UINT8_POS_LEN *ind = NULL;
  size_t mind = 1000, nind = 0, ndata;
  char fname[PATH_MAX];
  /* -1 marks "not open": the error path must neither close a descriptor
     that was never opened nor one that has already been closed. */
  int dat_fd = -1, ind_fd = -1;
  const char *vardir = (db->vardir) ? db->vardir : DpsVarListFindStr(&Conf->Vars, "VarDir", DPS_VAR_DIR);

  if (!L->Item) return(1);

  if (L->nitems > 1) DpsSort(L->Item, L->nitems, sizeof(DPS_UINT8URLID), (qsort_cmp)cmp_ind8);

  data = (urlid_t*)DpsMalloc((L->nitems + 1) * sizeof(urlid_t));
  if (!data) {
    DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d bytes [%s:%d]", (int)((L->nitems + 1) * sizeof(urlid_t)), __FILE__, __LINE__);
    goto err1;
  }
  ind = (DPS_UINT8_POS_LEN*)DpsMalloc(mind * sizeof(DPS_UINT8_POS_LEN));
  if (!ind) {
    DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d bytes [%s:%d]", (int)(mind * sizeof(DPS_UINT8_POS_LEN)), __FILE__, __LINE__);
    goto err1;
  }

  for (k = 0; k < L->nitems; k++) {
    data[k] = L->Item[k].url_id;
  }

  /*
   * Emit one index entry per run of equal keys.  A run [prev, k) ends when
   * the key changes or the list is exhausted, so a final item carrying a
   * key of its own gets its own entry instead of being folded into the
   * preceding run.
   */
  prev = 0;
  for (k = 1; k <= L->nitems; k++) {
    if ((k < L->nitems) && (L->Item[k].hi == L->Item[prev].hi) && (L->Item[k].lo == L->Item[prev].lo)) {
      continue;
    }
    if (nind == mind) {
      mind += 1000;
      /* NOTE(review): the old block is lost here if DpsRealloc() does not
         release it on failure - confirm against its implementation. */
      ind = (DPS_UINT8_POS_LEN*)DpsRealloc(ind, mind * sizeof(DPS_UINT8_POS_LEN));
      if (!ind) {
        DpsLog(Indexer, DPS_LOG_ERROR, "Can't alloc %d bytes [%s:%d]", (int)(mind * sizeof(DPS_UINT8_POS_LEN)), __FILE__, __LINE__);
        goto err1;
      }
    }
    /* Fill index */
    ind[nind].hi = L->Item[prev].hi;
    ind[nind].lo = L->Item[prev].lo;
    ind[nind].pos = prev * sizeof(*data);
    ind[nind].len = (k - prev) * sizeof(*data);
    DpsLog(Indexer, DPS_LOG_DEBUG, "%08X%08X - %d %d\n", ind[nind].hi, ind[nind].lo, (int)ind[nind].pos, (int)ind[nind].len);
    nind++;
    prev = k;
  }

  ndata = L->nitems;
  ClearIndex8(L);

  /* Data file: the plain array of url ids. */
  dps_snprintf(fname, sizeof(fname)-1, "%s%c%s%c%s.dat", vardir, DPSSLASH, DPS_TREEDIR, DPSSLASH, lim_name);
  if ((dat_fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) {
    DpsLog(Indexer, DPS_LOG_ERROR, "Can't open '%s': %s [%s:%d]", fname, strerror(errno), __FILE__, __LINE__);
    goto err1;
  }
  DpsWriteLock(dat_fd);
  if ((ndata * sizeof(*data)) != (size_t)write(dat_fd, data, ndata * sizeof(*data))) {
    DpsLog(Indexer, DPS_LOG_ERROR, "Can't write '%s': %s [%s:%d]", fname, strerror(errno), __FILE__, __LINE__);
    goto err1;
  }
  DpsUnLock(dat_fd);
  DpsClose(dat_fd);
  dat_fd = -1;
  DPS_FREE(data);

  /* Index file: the key -> (offset, length) table. */
  dps_snprintf(fname, sizeof(fname)-1, "%s%c%s%c%s.ind", vardir, DPSSLASH, DPS_TREEDIR, DPSSLASH, lim_name);
  if ((ind_fd = DpsOpen3(fname, O_CREAT | O_WRONLY | O_TRUNC | DPS_BINARY, DPS_IWRITE)) < 0) {
    DpsLog(Indexer, DPS_LOG_ERROR, "Can't open '%s': %s [%s:%d]", fname, strerror(errno), __FILE__, __LINE__);
    goto err1;
  }
  DpsWriteLock(ind_fd);
  if ((nind * sizeof(DPS_UINT8_POS_LEN)) != (size_t)write(ind_fd, ind, nind * sizeof(DPS_UINT8_POS_LEN))) {
    DpsLog(Indexer, DPS_LOG_ERROR, "Can't write '%s': %s [%s:%d]", fname, strerror(errno), __FILE__, __LINE__);
    goto err1;
  }
  DpsUnLock(ind_fd);
  DpsClose(ind_fd);
  ind_fd = -1;
  DPS_FREE(ind);

  return(0);

err1:
  ClearIndex8(L);
  DPS_FREE(data);
  DPS_FREE(ind);
  if (dat_fd >= 0) DpsClose(dat_fd);
  if (ind_fd >= 0) DpsClose(ind_fd);
  return(1);
}
/*
 * Split the raw HTTP response held in Doc->Buf into headers and content and
 * store the header information in Doc->Sections.
 *
 * Indexer - agent, used for logging, for its `now` time stamp and passed
 *           on to DpsParseHTTPHeader().
 * Doc     - document whose Buf.buf / Buf.size hold the response.
 *
 * Effects visible in this function:
 *   - Doc->Buf.content is reset to NULL and, when the blank line ending the
 *     header block is found inside the buffer, set to the first byte after
 *     it; the first byte of that separator is overwritten with '\0';
 *   - the "ResponseSize" section is set to Buf.size and "Content-Length" is
 *     removed before parsing;
 *   - the status line sets "ResponseLine" and "Status" (the larger of the
 *     previous and the new status is kept);
 *   - every header, together with the lines that follow it and contain no
 *     ':', is handed to DpsParseHTTPHeader();
 *   - a "Last-Modified" value more than four hours ahead of Indexer->now is
 *     dropped;
 *   - when content was located, "Content-Length" is finally set to
 *     (Buf.buf - Buf.content + Buf.size) plus whatever "Content-Length"
 *     holds at that point.
 *
 * The function returns early, leaving the remaining steps undone, when
 * there is no buffer, the header copy cannot be allocated, or the first
 * line does not start with "HTTP/".
 */
void DpsParseHTTPResponse(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc) {
  char *token, *lt, *headers, savec;
  int status, oldstatus;
  DPS_DSTR header;
  time_t now, last_mod_time;

  Doc->Buf.content = NULL;
  oldstatus = DpsVarListFindInt(&Doc->Sections, "Status", 0);
  DpsVarListReplaceInt(&Doc->Sections, "ResponseSize", (int)Doc->Buf.size);
  DpsVarListDel(&Doc->Sections, "Content-Length");
  /* "Last-Modified" is deliberately kept here: if it's not deleted it equals
     its first appearance in db. */

  if (Doc->Buf.buf == NULL) return;

  /* Cut HTTP response header first */
  for (token = Doc->Buf.buf; *token; token++) {
    if (!strncmp(token, "\r\n\r\n", 4)) {
      if (token <= Doc->Buf.buf + Doc->Buf.size - 4) {
        *token = '\0';
        Doc->Buf.content = token + 4;
      }
      break;
    } else if (!strncmp(token, "\n\n", 2)) {
      if (token <= Doc->Buf.buf + Doc->Buf.size - 2) {
        *token = '\0';
        Doc->Buf.content = token + 2;
      }
      break;
    }
  }

  /* No separator accepted above: fall back to the position the scan stopped
     at, provided it lies at least four bytes before the end of the buffer.
     NOTE(review): presumably meant for responses with an embedded NUL - confirm. */
  if (!Doc->Buf.content) {
    if (token <= Doc->Buf.buf + Doc->Buf.size - 4) {
      if (token[2] == CR_CHAR) Doc->Buf.content = token + 4;
      else Doc->Buf.content = token + 2;
    }
  }

  /* Copy headers not to break them */
  headers = (char*)DpsStrdup(Doc->Buf.buf);
  if (headers == NULL) return;

  /* Now lets parse response header lines */
  token = dps_strtok_r(headers, "\r\n", &lt, &savec);
  if (!token) {
    DpsFree(headers);
    return;
  }

  if (!strncmp(token, "HTTP/", 5)) {
    /* The status code is expected after the 8 characters of "HTTP/x.y";
       a shorter line must not be read past its terminator. */
    status = (dps_strlen(token) > 8) ? atoi(token + 8) : 0;
    DpsVarListReplaceStr(&Doc->Sections, "ResponseLine", token);
    DpsVarListReplaceInt(&Doc->Sections, "Status", (oldstatus > status) ? oldstatus : status);
  } else {
    DpsFree(headers);
    return;
  }

  /* Collect each header: a line containing ':' starts a new header, any
     other line is appended to the one being collected. */
  token = dps_strtok_r(NULL, "\r\n", &lt, &savec);
  DpsDSTRInit(&header, 128);
  while (token) {
    if (strchr(token, ':')) {
      if (header.data_size) {
        DpsParseHTTPHeader(Indexer, Doc, &header);
        DpsDSTRFree(&header);
        DpsDSTRInit(&header, 128);
      }
    }
    DpsDSTRAppendStr(&header, token);
    token = dps_strtok_r(NULL, "\r\n", &lt, &savec);
  }
  if (header.data_size) {
    DpsParseHTTPHeader(Indexer, Doc, &header);
  }
  DpsDSTRFree(&header);
  DPS_FREE(headers);

  now = Indexer->now;
  last_mod_time = DpsHttpDate2Time_t(DpsVarListFindStr(&Doc->Sections, "Last-Modified", ""));
  if (last_mod_time > now + 3600 * 4) {
    /* we have a document with Last-Modified time in the future */
    DpsLog(Indexer, DPS_LOG_EXTRA, "Last-Modified date is deep in future (%d>%d), dropping it.", (int)last_mod_time, (int)now);
    DpsVarListDel(&Doc->Sections, "Last-Modified");
  }

  /* Bad response, return */
  if (!Doc->Buf.content) {
    return;
  }

  DpsVarListReplaceInt(&Doc->Sections, "Content-Length",
                       Doc->Buf.buf - Doc->Buf.content + (int)Doc->Buf.size + DpsVarListFindInt(&Doc->Sections, "Content-Length", 0));
}