/*
 * Parse one cookie definition of the form "name=value; attr=value; ..." and
 * hand the result to DpsCookiesAdd().
 *
 *   Indexer     - agent passed through to DpsCookiesAdd().
 *   CurURL      - URL the cookie belongs to; may be NULL.  Its hostname/path
 *                 are used when the cookie carries no usable Domain/Path.
 *                 DpsCookiesAdd() receives 1 as its eighth argument when
 *                 CurURL is NULL and 0 otherwise.
 *   cookie_str  - cookie text.  NOTE: it is tokenised in place; every '='
 *                 that separates a name from its value is overwritten with
 *                 '\0', so the caller's buffer must be writable.
 *   insert_flag - forwarded unchanged as the last argument of DpsCookiesAdd().
 *
 * Recognised attributes (case-insensitive): path, domain, expires, secure.
 * "secure" is accepted both in its standard valueless form ("; Secure") and
 * in the "secure=..." form.  A Domain attribute is honoured only when it
 * starts with '.', in which case the leading dot is skipped; otherwise the
 * hostname of CurURL (or "localhost") is used.
 *
 * Always returns DPS_OK; nothing is added when no "name=value" pair is found.
 */
int DpsCookiesAddStr(DPS_AGENT *Indexer, DPS_URL *CurURL, const char *cookie_str, int insert_flag) {
  char *part, *lpart;
  char *name = NULL;
  char *value = NULL;
  char *domain = NULL, *orig_domain = NULL;
  char *path = NULL;
  dps_uint4 expire = 0, need_free_domain = 1, need_free_path = 1;
  char secure = 'n', savec;

  for (part = dps_strtok_r(cookie_str, ";", &lpart, &savec); part; part = dps_strtok_r(NULL, ";", &lpart, &savec)) {
    char *arg;

    part = DpsTrim(part, " ");
    arg = strchr(part, '=');

    if (arg == NULL) {
      /* Valueless attribute.  "Secure" is normally sent this way; it only
         counts as an attribute once the leading name=value pair was seen. */
      if (name && !strcasecmp(part, "secure")) {
        secure = 'y';
      }
      continue;
    }

    *arg++ = '\0';
    if (!name) {
      /* The first pair is the cookie itself.  The value is duplicated,
         presumably because the tokenizer restores the delimiter it replaced
         (via savec) on the next call -- TODO confirm against dps_strtok_r. */
      name = part;
      DpsFree(value);
      value = DpsStrdup(arg);
    } else if (!strcasecmp(part, "path")) {
      DpsFree(path);
      path = DpsStrdup(arg);
    } else if (!strcasecmp(part, "domain")) {
      DpsFree(orig_domain);
      orig_domain = domain = DpsStrdup(arg);
    } else if (!strcasecmp(part, "secure")) {
      secure = 'y';
    } else if (!strcasecmp(part, "expires")) {
      expire = (dps_uint4)DpsHttpDate2Time_t(arg);
    }
  }

  if (name && value) {
    if (domain && domain[0] == '.') {
      /* Skip the leading dot; orig_domain keeps the pointer to free. */
      domain++;
    } else {
      /* No Domain attribute, or one without a leading dot: drop it and fall
         back to the host the cookie came from. */
      if (domain) DpsFree(orig_domain);
      domain = (CurURL && CurURL->hostname) ? CurURL->hostname : "localhost";
      need_free_domain = 0;
    }
    if (!path) {
      path = (CurURL && CurURL->path) ? CurURL->path : "/";
      need_free_path = 0;
    }
    /* Forward the caller's insert_flag instead of a hard-coded 1. */
    DpsCookiesAdd(Indexer, domain, path, name, value, secure, expire, (CurURL == NULL) ? 1 : 0, insert_flag);
  }

  DpsFree(value);
  if (need_free_path) DpsFree(path);
  if (need_free_domain) DpsFree(orig_domain);
  return DPS_OK;
}
/*
 * Process one HTTP response header line held in header->data ("Name: value").
 *
 * The line is split in place at the first ':' and the value is trimmed of
 * spaces, tabs and colons.
 *   - Content-Type / Content-Encoding values are lower-cased in place.
 *   - Set-Cookie (only when Doc->Spider.use_cookies is set) is parsed into
 *     name, value, path, domain, secure and expires, registered with
 *     DpsCookiesAdd(), and the function returns without storing the header.
 *     "Secure" is accepted in its standard valueless form as well as
 *     "secure=...".  A Domain attribute is honoured only when it starts with
 *     '.'; otherwise the hostname of Doc->CurURL (or "localhost") is used.
 *   - Every other header is stored in Doc->Sections under its own name
 *     ("<NULL>" when the line has no ':'), and, when a section named
 *     "header.<Name>" exists and a value is present, the value is also
 *     appended to Doc->TextList.
 */
static void DpsParseHTTPHeader(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_DSTR *header) {
  char *val, *header_name;
  char secname[128];
  DPS_VAR *Sec;
  DPS_TEXTITEM Item;

  if ((val = strchr(header_name = header->data, ':'))) {
    *val++ = '\0';
    val = DpsTrim(val, " \t:");

    if (!strcasecmp(header_name, "Content-Type") || !strcasecmp(header_name, "Content-Encoding")) {
      char *v;
      for (v = val; *v; v++) *v = dps_tolower(*v);
    } else if (Doc->Spider.use_cookies && !strcasecmp(header_name, "Set-Cookie")) {
      char *part, *lpart;
      char *name = NULL;
      char *value = NULL;
      const char *domain = NULL;
      const char *path = NULL;
      dps_uint4 expire = 0;
      char secure = 'n';

      for (part = dps_strtok_r(val, ";", &lpart); part; part = dps_strtok_r(NULL, ";", &lpart)) {
        char *arg;

        part = DpsTrim(part, " ");
        arg = strchr(part, '=');

        if (arg == NULL) {
          /* Valueless attribute; "Secure" is normally sent this way. */
          if (name && !strcasecmp(part, "secure")) {
            secure = 'y';
          }
          continue;
        }

        *arg++ = '\0';
        if (!name) {
          /* The first pair is the cookie itself. */
          name = part;
          value = arg;
        } else if (!strcasecmp(part, "path")) {
          path = arg;
        } else if (!strcasecmp(part, "domain")) {
          domain = arg;
        } else if (!strcasecmp(part, "secure")) {
          secure = 'y';
        } else if (!strcasecmp(part, "expires")) {
          expire = (dps_uint4)DpsHttpDate2Time_t(arg);
        }
      }

      if (name && value) {
        if (domain && domain[0] == '.') {
          domain++;
        } else {
          domain = Doc->CurURL.hostname ? Doc->CurURL.hostname : "localhost";
        }
        if (!path) {
          path = Doc->CurURL.path ? Doc->CurURL.path : "/";
        }
        DpsCookiesAdd(Indexer, domain, path, name, value, secure, expire, 1);
      }
      return;
    }
  }

  DpsVarListReplaceStr(&Doc->Sections, header_name, val ? val : "<NULL>");

  dps_snprintf(secname, sizeof(secname), "header.%s", header_name);
  secname[sizeof(secname) - 1] = '\0';

  if ((Sec = DpsVarListFind(&Doc->Sections, secname)) && val) {
    /* Zero the item first so that fields not assigned below are not handed
       to DpsTextListAdd() with indeterminate contents. */
    bzero((void*)&Item, sizeof(Item));
    Item.href = NULL;
    Item.str = val;
    Item.section = Sec->section;
    Item.section_name = secname;
    Item.len = 0;
    DpsTextListAdd(&Doc->TextList, &Item);
  }
}
/*
 * Process one HTTP response header line held in header->data ("Name: value").
 *
 * The line is split in place at the first ':' and the value is trimmed of
 * spaces, tabs and colons.
 *   - Content-Type / Content-Encoding values are lower-cased in place.
 *   - X-Robots-Tag (only when Doc->Spider.use_robots is set) is split into
 *     directives on spaces, commas, CR, LF and tabs; NONE, NOINDEX, NOFOLLOW
 *     and NOARCHIVE adjust Doc->Spider / Doc->Sections, the remaining known
 *     directives leave the server settings unchanged.  The header is then
 *     stored like any other.
 *   - Set-Cookie (only when Doc->Spider.use_cookies is set) is passed to
 *     DpsCookiesAddStr() and the function returns without storing the header.
 *   - Every header that reaches the end is stored in Doc->Sections under its
 *     own name ("<NULL>" when the line has no ':'), and, when a section named
 *     "header.<Name>" exists and a value is present, the value is also
 *     appended to Doc->TextList.
 */
static void DpsParseHTTPHeader(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_DSTR *header) {
  char *val, *header_name;
  char secname[128], savec;
  DPS_VAR *Sec;
  DPS_TEXTITEM Item;

  if ((val = strchr(header_name = header->data, ':'))) {
    *val++ = '\0';
    val = DpsTrim(val, " \t:");

    if (!strcasecmp(header_name, "Content-Type") || !strcasecmp(header_name, "Content-Encoding")) {
      register char *v;
      for (v = val; *v; v++) *v = (char)dps_tolower((int)*v);

    } else if (Doc->Spider.use_robots && !strcasecmp(header_name, "X-Robots-Tag")) {
      /* One delimiter set for every tokenizer call: directives may be
         separated by commas with or without surrounding whitespace. */
      static const char robots_delim[] = " ,\r\n\t";
      char *last;
      char *rtok;

      rtok = dps_strtok_r(val, robots_delim, &last, &savec);
      while (rtok) {
        if (!strcasecmp(rtok, "ALL")) {
          /* Leave server parameters unchanged */
        } else if (!strcasecmp(rtok, "NONE")) {
          Doc->Spider.follow = DPS_FOLLOW_NO;
          Doc->Spider.index = 0;
          if (DpsNeedLog(DPS_LOG_DEBUG)) {
            DpsVarListReplaceInt(&Doc->Sections, "Index", 0);
            DpsVarListReplaceInt(&Doc->Sections, "Follow", DPS_FOLLOW_NO);
          }
        } else if (!strcasecmp(rtok, "NOINDEX")) {
          Doc->Spider.index = 0;
          if (DpsNeedLog(DPS_LOG_DEBUG)) DpsVarListReplaceInt(&Doc->Sections, "Index", 0);
        } else if (!strcasecmp(rtok, "NOFOLLOW")) {
          Doc->Spider.follow = DPS_FOLLOW_NO;
          if (DpsNeedLog(DPS_LOG_DEBUG)) DpsVarListReplaceInt(&Doc->Sections, "Follow", DPS_FOLLOW_NO);
        } else if (!strcasecmp(rtok, "NOARCHIVE")) {
          DpsVarListReplaceStr(&Doc->Sections, "Z", "");
        } else if (!strcasecmp(rtok, "INDEX")) {
          /* Leave server value unchanged */
          if (DpsNeedLog(DPS_LOG_DEBUG)) DpsVarListReplaceInt(&Doc->Sections, "Index", Doc->Spider.index);
        } else if (!strcasecmp(rtok, "FOLLOW")) {
          /* Leave server value unchanged */
          if (DpsNeedLog(DPS_LOG_DEBUG)) DpsVarListReplaceInt(&Doc->Sections, "Follow", Doc->Spider.follow);
        }
        rtok = dps_strtok_r(NULL, robots_delim, &last, &savec);
      }

    } else if (Doc->Spider.use_cookies && !strcasecmp(header_name, "Set-Cookie")) {
      DpsCookiesAddStr(Indexer, &Doc->CurURL, val, 1);
      return;
    }
  }

  DpsVarListReplaceStr(&Doc->Sections, header_name, val ? val : "<NULL>");

  dps_snprintf(secname, sizeof(secname), "header.%s", header_name);
  secname[sizeof(secname) - 1] = '\0';

  if ((Sec = DpsVarListFind(&Doc->Sections, secname)) && val) {
    bzero((void*)&Item, sizeof(Item));
    Item.href = NULL;
    Item.str = val;
    Item.section = Sec->section;
    Item.section_name = secname;
    Item.strict = Sec->strict;
    Item.len = 0;
    (void)DpsTextListAdd(&Doc->TextList, &Item);
  }
}
/*
 * Build the "Cookie" request header for Doc.
 *
 *   Indexer  - agent whose in-memory cookie list (Indexer->Cookies) is
 *              searched and extended.
 *   Server   - server configuration; may be NULL.  When it carries an
 *              "AuthPing" variable, an authorisation request is issued first.
 *   Doc      - document being fetched; its CurURL is matched against the
 *              cookies and its RequestHeaders receive the result.
 *   hostinfo - host part the cookies are looked up for.
 *
 * Step 1: every cached cookie whose domain equals the tail of hostinfo, whose
 * path is a prefix of Doc->CurURL.path and which is either not secure or
 * requested over "https" is appended as "name=value", joined with "; ".
 * Finding such a cookie with from_config != 1 clears have_no_cookies (taken
 * from the Doc section "have_no_cookies", default 1).
 *
 * Step 2 (only while have_no_cookies is still set):
 *   a) If Server has "AuthPing", it is base64-decoded and must read
 *      "GET url [args]" or "POST url [args]".  A HEAD request for the site
 *      root is made first, then the ping itself; the response headers of both
 *      are processed (the original comments say only the cookies are wanted).
 *   b) The SQL table "cookies" is queried for hostinfo and then for each
 *      suffix obtained by dropping the leading label.  Rows are added to the
 *      cache and, if they match the URL, to the header.  When a domain has no
 *      rows, an empty placeholder cookie is cached -- presumably so the next
 *      lookup finds an entry and skips the query (TODO confirm).
 *
 * The whole body is compiled only when HAVE_SQL is defined.
 */
void DpsCookiesFind(DPS_AGENT *Indexer, DPS_SERVER *Server, DPS_DOCUMENT *Doc, const char *hostinfo) {
#ifdef HAVE_SQL
  DPS_DSTR cookie;
  DPS_COOKIES *Cookies = &Indexer->Cookies;
  DPS_COOKIE *Coo;
  size_t i, blen = dps_strlen(hostinfo), slen;
  int have_no_cookies = DpsVarListFindInt(&Doc->Sections, "have_no_cookies", 1);
#ifdef WITH_PARANOIA
  void *paran = DpsViolationEnter(paran);
#endif

  TRACE_IN(Indexer, "DpsCookiesFind");

  DpsDSTRInit(&cookie, 1024);

  /* Step 1: collect matching cookies from the in-memory cache. */
  for (i = 0; i < Cookies->ncookies; i++) {
    Coo = &Cookies->Cookie[i];
    slen = dps_strlen(Coo->domain);
    if (slen > blen) continue;
    if (Coo->secure == 'y' && strcasecmp(Doc->CurURL.schema, "https")) continue;
    if (strncasecmp(Coo->path, Doc->CurURL.path, dps_strlen(Coo->path))) continue;
    if (strcasecmp(Coo->domain, hostinfo + (blen - slen))) continue;
    if (Coo->from_config != 1) have_no_cookies = 0;
    /* Empty name and value: a placeholder entry, nothing to send. */
    if (Coo->name[0] == '\0' && Coo->value[0] == '\0') continue;
    if (cookie.data_size) DpsDSTRAppend(&cookie, "; ", 2);
    DpsDSTRAppendStr(&cookie, Coo->name);
    DpsDSTRAppend(&cookie, "=", 1);
    DpsDSTRAppendStr(&cookie, Coo->value);
  }

  if (have_no_cookies) {
    char buf[2*PATH_MAX];
    dpshash32_t url_id;
    DPS_DB *db;
    DPS_SQLRES Res;
    size_t rows;
    int rc;

    /* Step 2a: optional authorisation ping. */
    if (Server != NULL) {
      char *PingData = DpsVarListFindStr(&Server->Vars, "AuthPing", NULL);

      if (PingData != NULL) {
        char *AuthPing = DpsStrdup(DpsTrim(PingData, " \t\r\n"));
        int method = DPS_METHOD_GET;

        dps_base64_decode(AuthPing, PingData, dps_strlen(PingData));
        if (!strncasecmp(AuthPing, "GET", 3)) {
          method = DPS_METHOD_GET;
          PingData = DpsTrim(AuthPing + 3, " \t\r\n");
        } else if (!strncasecmp(AuthPing, "POST", 4)) {
          method = DPS_METHOD_POST;
          PingData = DpsTrim(AuthPing + 4, " \t\r\n");
        } else {
          DpsLog(Indexer, DPS_LOG_ERROR, "AuthPing should be GET or POST: %s", AuthPing);
          PingData = NULL;
        }

        if (PingData != NULL) {
          size_t size = dps_strlen(PingData);
          {
            /* PingURL has room for the URL, '?', the arguments and the NUL;
               PingBody is size + 1 so it is never a zero-length array. */
            char PingURL[size + 2];
            char PingBody[size + 1];
            DPS_DOCUMENT *rDoc;

            /* Start from empty strings: sscanf() below leaves a buffer
               untouched when the corresponding token is missing. */
            PingURL[0] = '\0';
            PingBody[0] = '\0';

            rDoc = DpsDocInit(NULL);
            DpsSpiderParamInit(&rDoc->Spider);
            DpsVarList2Doc(rDoc, Server);
            rDoc->Buf.max_size = (size_t)DpsVarListFindInt(&Indexer->Vars, "MaxDocSize", DPS_MAXDOCSIZE);
            rDoc->Buf.allocated_size = DPS_NET_BUF_SIZE;
            if ((rDoc->Buf.buf = (char*)DpsMalloc(rDoc->Buf.allocated_size + 1)) == NULL) {
              /* Release everything acquired so far before bailing out. */
              DpsDocFree(rDoc);
              DpsFree(AuthPing);
              DpsDSTRFree(&cookie);
              TRACE_OUT(Indexer);
              return;
            }
            rDoc->Buf.buf[0] = '\0';
            rDoc->subdoc = Indexer->Flags.SubDocLevel + 1;

#if 1
            dps_snprintf(buf, sizeof(buf), "%s://%s/", DPS_NULL2EMPTY(Doc->CurURL.schema), DPS_NULL2EMPTY(Doc->CurURL.hostinfo));
            DpsVarListReplaceStr(&rDoc->Sections, "URL", buf);
            DpsURLParse(&rDoc->CurURL, buf);
            DpsLog(Indexer, DPS_LOG_INFO, "HOME: %s", buf);
            rDoc->method = DPS_METHOD_HEAD;
            if (Doc != NULL) {
              DpsVarListReplaceLst(&rDoc->RequestHeaders, &Doc->RequestHeaders, NULL, "*");
            }
            DpsVarListReplaceStr(&rDoc->Sections, "have_no_cookies", "0");
            DpsDocAddDocExtraHeaders(Indexer, Server, rDoc);
            DpsDocAddConfExtraHeaders(Indexer->Conf, rDoc);
            DpsVarListReplaceLst(&rDoc->Sections, &Server->Vars, NULL, "*");
            DpsDocAddServExtraHeaders(Server, rDoc);
            DpsVarListLog(Indexer, &rDoc->RequestHeaders, DPS_LOG_DEBUG, "HOME.Request");
            if (Doc == NULL || Indexer->Flags.cmd == DPS_IND_FILTER) {
              DpsDocLookupConn(Indexer, rDoc);
            } else {
              /* Borrow the connection of Doc; it is detached again below. */
              DPS_FREE(rDoc->connp.connp);
              rDoc->connp = Doc->connp;
            }
            /* Just get headers from the home as we need only Cookies from it */
            (void)DpsGetURL(Indexer, rDoc, NULL);
            DpsDocProcessResponseHeaders(Indexer, rDoc);
            DpsVarListLog(Indexer, &rDoc->Sections, DPS_LOG_DEBUG, "HOME.Response");
#endif

            (void)sscanf(PingData, "%s %s", PingURL, PingBody);
            /* Decide on the configured ping method: rDoc->method still holds
               the HEAD value of the preceding home request at this point. */
            if (method == DPS_METHOD_GET) {
              if (PingBody[0] != '\0') {
                dps_strcat(PingURL, "?");
                dps_strcat(PingURL, PingBody);
              }
            } else {
              DpsVarListReplaceStr(&rDoc->Sections, "body", PingBody);
            }
            DpsVarListReplaceStr(&rDoc->Sections, "URL", PingURL);
            DpsURLParse(&rDoc->CurURL, PingURL);
            DpsLog(Indexer, DPS_LOG_INFO, "AUTH.PING: %s", PingURL);
            rDoc->method = method;
            DpsVarListFree(&rDoc->RequestHeaders);
            DpsVarListReplaceStr(&rDoc->Sections, "have_no_cookies", "0");
            DpsDocAddDocExtraHeaders(Indexer, Server, rDoc);
            DpsDocAddConfExtraHeaders(Indexer->Conf, rDoc);
            DpsVarListReplaceLst(&rDoc->Sections, &Server->Vars, NULL, "*");
            DpsDocAddServExtraHeaders(Server, rDoc);
            if (method == DPS_METHOD_POST) {
              dps_snprintf(buf, sizeof(buf), "application/x-www-form-urlencoded; charset=%s",
                           DpsVarListFindStr(&Indexer->Conf->Vars, "LocalCharset", "iso-8859-1"));
              DpsVarListReplaceStr(&rDoc->RequestHeaders, "Content-Type", buf);
              /* "%d" takes an int, not a size_t. */
              dps_snprintf(buf, sizeof(buf), "%d", (int)dps_strlen(PingBody));
              DpsVarListReplaceStr(&rDoc->RequestHeaders, "Content-Length", buf);
            }
            DpsVarListLog(Indexer, &rDoc->RequestHeaders, DPS_LOG_DEBUG, "AUTHPING.Request");
            /* Just get it as we need only Cookies from the headers */
            (void)DpsGetURL(Indexer, rDoc, NULL);
            DpsDocProcessResponseHeaders(Indexer, rDoc);
            DpsVarListDel(&rDoc->Sections, "body");
            DpsVarListLog(Indexer, &rDoc->Sections, DPS_LOG_DEBUG, "AUTHPING.Response");
            /* Detach the connection data before freeing rDoc, presumably so
               that a connection borrowed from Doc is not released with it. */
            if (Doc != NULL) bzero(&rDoc->connp, sizeof(rDoc->connp));
            DpsDocFree(rDoc);
          }
        }
        DpsFree(AuthPing);
      }
    }

    /* Step 2b: load stored cookies for hostinfo and each parent domain. */
    while (hostinfo != NULL) {
      url_id = DpsStrHash32(hostinfo);
      DpsSQLResInit(&Res);
      /* NOTE(review): hostinfo is pasted into the SQL text unescaped; a host
         string containing a quote would alter the query.  Escape it with the
         project's SQL escaping helper once confirmed which one applies. */
      dps_snprintf(buf, sizeof(buf), "SELECT name,value,path,secure FROM cookies WHERE domain='%s'", hostinfo);
      if (Indexer->flags & DPS_FLAG_UNOCON) {
        DPS_GETLOCK(Indexer, DPS_LOCK_DB);
        db = Indexer->Conf->dbl.db[url_id % Indexer->Conf->dbl.nitems];
      } else {
        db = Indexer->dbl.db[url_id % Indexer->dbl.nitems];
      }
      if (DPS_OK == (rc = DpsSQLQuery(db, &Res, buf))) {
        rows = DpsSQLNumRows(&Res);
        for (i = 0; i < rows; i++) {
          /* Columns: 0 = name, 1 = value, 2 = path, 3 = secure. */
          DpsCookiesAdd(Indexer, hostinfo, DpsSQLValue(&Res, i, 2), DpsSQLValue(&Res, i, 0),
                        DpsSQLValue(&Res, i, 1), *DpsSQLValue(&Res, i, 3), 0, 0, 0);
          if (*DpsSQLValue(&Res, i, 3) == 'y' && strcasecmp(Doc->CurURL.schema, "https")) continue;
          if (strncasecmp(DpsSQLValue(&Res, i, 2), Doc->CurURL.path, dps_strlen(DpsSQLValue(&Res, i, 2)))) continue;
          if (cookie.data_size) DpsDSTRAppend(&cookie, "; ", 2);
          DpsDSTRAppendStr(&cookie, DpsSQLValue(&Res, i, 0));
          DpsDSTRAppend(&cookie, "=", 1);
          DpsDSTRAppendStr(&cookie, DpsSQLValue(&Res, i, 1));
        }
        if (rows == 0) {
          /* Nothing stored for this domain: cache an empty placeholder. */
          DpsCookiesAdd(Indexer, hostinfo, "/", "", "", 'n', 0, 0, 0);
        }
      }
      DpsSQLFree(&Res);
      if (Indexer->flags & DPS_FLAG_UNOCON) {
        DPS_RELEASELOCK(Indexer, DPS_LOCK_DB);
      }
      /* Move on to the parent domain: drop everything up to the next dot. */
      hostinfo = strchr(hostinfo, '.');
      if (hostinfo != NULL) hostinfo++;
    }
  }

  if (cookie.data_size) {
    DpsVarListReplaceStr(&Doc->RequestHeaders, "Cookie", cookie.data);
  }
  DpsDSTRFree(&cookie);
#endif
  TRACE_OUT(Indexer);
  return;
}