/*
 * Split the raw HTTP response stored in Doc->Buf into header and body and
 * record the header fields in Doc->Sections.
 *
 * Indexer  agent handle; Indexer->now is the reference time for the
 *          Last-Modified sanity check and the handle is used for logging.
 * Doc      document whose Buf.buf holds the NUL-terminated response.
 *
 * Effects visible in this function:
 *   - Doc->Buf.content is reset to NULL and, when a header/body separator
 *     is found, set to the first byte after it (the separator's first byte
 *     in Doc->Buf.buf is overwritten with '\0').
 *   - "ResponseSize" is set to Doc->Buf.size; "Content-Length" is deleted
 *     and, if a body was located, re-set at the end.
 *   - "ResponseLine" and "Status" are set from the status line; "Status"
 *     keeps the larger of the previous and the newly parsed value.
 *   - Every header (continuation lines are glued to the preceding header)
 *     is handed to DpsParseHTTPHeader().
 *   - A "Last-Modified" more than four hours ahead of Indexer->now is dropped.
 *
 * Returns early, without touching the sections further, when the buffer is
 * NULL, the header copy cannot be allocated, or the first line does not
 * start with "HTTP/".
 */
void DpsParseHTTPResponse(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc) {
  char *token, *rest, *headers, savec;
  int status, oldstatus;
  DPS_DSTR header;
  time_t now, last_mod_time;

  Doc->Buf.content = NULL;
  oldstatus = DpsVarListFindInt(&Doc->Sections, "Status", 0);
  DpsVarListReplaceInt(&Doc->Sections, "ResponseSize", (int)Doc->Buf.size);
  DpsVarListDel(&Doc->Sections, "Content-Length");
  /* "Last-Modified" is deliberately NOT deleted here. Original author's note:
     if it's not deleted, Last-Modified equals the first appearance in db. */

  if (Doc->Buf.buf == NULL) return;

  /* Cut HTTP response header first: look for CRLF CRLF or LF LF. The
     separator is only accepted when it lies completely inside Buf.size. */
  for (token = Doc->Buf.buf; *token; token++) {
    if (!strncmp(token, "\r\n\r\n", 4)) {
      if (token <= Doc->Buf.buf + Doc->Buf.size - 4) {
        *token = '\0';
        Doc->Buf.content = token + 4;
      }
      break;
    } else if (!strncmp(token, "\n\n", 2)) {
      if (token <= Doc->Buf.buf + Doc->Buf.size - 2) {
        *token = '\0';
        Doc->Buf.content = token + 2;
      }
      break;
    }
  }

  /* No separator was recorded. token is where the scan stopped; when that is
     at least 4 bytes before the end of the buffer, guess the body start from
     the byte two positions further on. */
  if (!Doc->Buf.content) {
    if (token <= Doc->Buf.buf + Doc->Buf.size - 4) {
      if (token[2] == CR_CHAR) Doc->Buf.content = token + 4;
      else Doc->Buf.content = token + 2;
    }
  }

  /* Copy headers not to break them: the tokenizer below writes into its input. */
  headers = (char*)DpsStrdup(Doc->Buf.buf);
  if (headers == NULL) return; /* out of memory: nothing can be parsed */

  /* Now lets parse response header lines */
  token = dps_strtok_r(headers, "\r\n", &rest, &savec);
  if (!token) {
    DpsFree(headers);
    return;
  }

  if (!strncmp(token, "HTTP/", 5)) {
    /* The status code is expected after the 8-character "HTTP/x.y" prefix.
       A shorter line would make token + 8 point past the terminator, so
       treat it as status 0 instead of reading out of bounds. */
    status = (strlen(token) >= 8) ? atoi(token + 8) : 0;
    DpsVarListReplaceStr(&Doc->Sections, "ResponseLine", token);
    DpsVarListReplaceInt(&Doc->Sections, "Status", (oldstatus > status) ? oldstatus : status);
  } else {
    DpsFree(headers);
    return;
  }

  /* Collect header lines. A line containing ':' starts a new header; any
     other line is appended to the header being collected. */
  token = dps_strtok_r(NULL, "\r\n", &rest, &savec);
  DpsDSTRInit(&header, 128);
  while (token) {
    if (strchr(token, ':')) {
      if (header.data_size) {
        DpsParseHTTPHeader(Indexer, Doc, &header);
        DpsDSTRFree(&header);
        DpsDSTRInit(&header, 128);
      }
    }
    DpsDSTRAppendStr(&header, token);
    token = dps_strtok_r(NULL, "\r\n", &rest, &savec);
  }
  if (header.data_size) {
    DpsParseHTTPHeader(Indexer, Doc, &header);
  }
  DpsDSTRFree(&header);
  DPS_FREE(headers);

  now = Indexer->now;
  last_mod_time = DpsHttpDate2Time_t(DpsVarListFindStr(&Doc->Sections, "Last-Modified", ""));
  if (last_mod_time > now + 3600 * 4) {
    /* we have a document with Last-Modified time in the future */
    /* time_t is not int: cast explicitly so the arguments match the format. */
    DpsLog(Indexer, DPS_LOG_EXTRA, "Last-Modified date is deep in future (%ld>%ld), dropping it.",
           (long)last_mod_time, (long)now);
    DpsVarListDel(&Doc->Sections, "Last-Modified");
  }

  /* Bad response, return */
  if (!Doc->Buf.content) {
    return;
  }

  /* Number of body bytes present in the buffer (total size minus the header
     part) plus whatever "Content-Length" holds in Sections at this point. */
  DpsVarListReplaceInt(&Doc->Sections, "Content-Length",
                       Doc->Buf.buf - Doc->Buf.content + (int)Doc->Buf.size
                       + DpsVarListFindInt(&Doc->Sections, "Content-Length", 0));
}
/*
 * Split the raw HTTP response stored in Doc->Buf into header and body and
 * record the header fields in Doc->Sections.
 *
 * NOTE(review): this SOURCE contains another definition of
 * DpsParseHTTPResponse(); this one uses the 3-argument dps_strtok_r() form.
 * Confirm which of the two is meant to be compiled.
 *
 * Indexer  agent handle, passed through to DpsParseHTTPHeader().
 * Doc      document whose Buf.buf holds the NUL-terminated response.
 *
 * Effects visible in this function:
 *   - Doc->Buf.content is reset to NULL and then set to the first body byte
 *     (the first byte of the separator in Doc->Buf.buf becomes '\0').
 *   - "ResponseSize" is set to Doc->Buf.size; "Content-Length" and
 *     "Last-Modified" are deleted before parsing.
 *   - "ResponseLine" and "Status" are set from the status line; "Status"
 *     keeps the larger of the previous and the newly parsed value.
 *   - Every header (continuation lines are glued to the preceding header)
 *     is handed to DpsParseHTTPHeader().
 *   - "Content-Length" is finally inserted via DpsVarListInsInt() as the
 *     number of body bytes present in the buffer.
 *
 * Returns early when the buffer is NULL, no body start can be determined,
 * the header copy cannot be allocated, or the first line does not start
 * with "HTTP/".
 */
void DpsParseHTTPResponse(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc) {
  char *token, *rest, *headers;
  int oldstatus;
  DPS_DSTR header;

  Doc->Buf.content = NULL;
  oldstatus = DpsVarListFindInt(&Doc->Sections, "Status", 0);
  DpsVarListReplaceInt(&Doc->Sections, "ResponseSize", (int)Doc->Buf.size);
  DpsVarListDel(&Doc->Sections, "Content-Length");
  DpsVarListDel(&Doc->Sections, "Last-Modified");

  if (Doc->Buf.buf == NULL) return;

  /* Cut HTTP response header first: look for CRLF CRLF or LF LF. */
  for (token = Doc->Buf.buf; *token; token++) {
    if (!strncmp(token, "\r\n\r\n", 4)) {
      *token = '\0';
      Doc->Buf.content = token + 4;
      break;
    } else if (!strncmp(token, "\n\n", 2)) {
      *token = '\0';
      Doc->Buf.content = token + 2;
      break;
    }
  }

  /* No separator found. token is where the scan stopped; when that is more
     than 4 bytes before the end of the buffer, guess the body start from the
     byte two positions further on, otherwise give up. */
  if (!Doc->Buf.content) {
    if (token < Doc->Buf.buf + Doc->Buf.size - 4) {
      if (token[2] == '\r') Doc->Buf.content = token + 4;
      else Doc->Buf.content = token + 2;
    } else {
      return;
    }
  }

  /* Copy headers not to break them: the tokenizer below writes into its input. */
  headers = (char*)DpsStrdup(Doc->Buf.buf);
  if (headers == NULL) return; /* out of memory: nothing can be parsed */

  /* Now lets parse response header lines */
  token = dps_strtok_r(headers, "\r\n", &rest);
  if (!token) {
    DpsFree(headers); /* the copy was previously leaked on this path */
    return;
  }

  if (!strncmp(token, "HTTP/", 5)) {
    /* The status code is expected after the 8-character "HTTP/x.y" prefix.
       A shorter line would make token + 8 point past the terminator, so
       treat it as status 0 instead of reading out of bounds. */
    int status = (strlen(token) >= 8) ? atoi(token + 8) : 0;
    DpsVarListReplaceStr(&Doc->Sections, "ResponseLine", token);
    DpsVarListReplaceInt(&Doc->Sections, "Status", (oldstatus > status) ? oldstatus : status);
  } else {
    DpsFree(headers);
    return;
  }

  /* Collect header lines. A line containing ':' starts a new header; any
     other line is appended to the header being collected. */
  token = dps_strtok_r(NULL, "\r\n", &rest);
  DpsDSTRInit(&header, 128);
  while (token) {
    if (strchr(token, ':')) {
      if (header.data_size) {
        DpsParseHTTPHeader(Indexer, Doc, &header);
        DpsDSTRFree(&header);
        DpsDSTRInit(&header, 128);
      }
    }
    DpsDSTRAppendStr(&header, token);
    token = dps_strtok_r(NULL, "\r\n", &rest);
  }
  if (header.data_size) {
    DpsParseHTTPHeader(Indexer, Doc, &header);
  }
  DpsDSTRFree(&header);
  DPS_FREE(headers);

  /* Body bytes present in the buffer: total size minus the header part. */
  DpsVarListInsInt(&Doc->Sections, "Content-Length",
                   Doc->Buf.buf - Doc->Buf.content + (int)Doc->Buf.size);
}
/*
 * Build the "Cookie" request header for Doc from the cookies known for
 * hostinfo.
 *
 * Indexer   agent handle; owns the in-memory cookie list (Indexer->Cookies)
 *           and the database handles.
 * Server    server configuration; may be NULL. When non-NULL and it has an
 *           "AuthPing" variable, an authorisation ping is performed (below).
 * Doc       document being fetched; its CurURL is matched against cookie
 *           path/secure attributes and its RequestHeaders receive "Cookie".
 * hostinfo  host name of the request; cookies whose domain is a
 *           case-insensitive suffix of it are considered.
 *
 * Steps (only compiled with HAVE_SQL):
 *   1. Scan Indexer->Cookies and concatenate all matching "name=value"
 *      pairs, separated by "; ".
 *   2. If the "have_no_cookies" section is still non-zero (no matching entry
 *      that did not come from the configuration was seen):
 *        a. when Server has "AuthPing": base64-decode it, expect
 *           "GET <url> <params>" or "POST <url> <body>", issue a HEAD request
 *           to the site root and then the ping itself, processing the
 *           response headers of both;
 *        b. query the cookies table for hostinfo and for each parent domain
 *           (obtained by stripping leading labels), add the rows to the
 *           in-memory list and to the header being built. A domain with no
 *           rows gets an empty placeholder entry.
 *   3. Store the collected string as the "Cookie" request header.
 *
 * NOTE(review): TRACE_IN is inside the HAVE_SQL section while TRACE_OUT is
 * outside it; confirm the pair is meant to be unbalanced without HAVE_SQL.
 */
void DpsCookiesFind(DPS_AGENT *Indexer, DPS_SERVER *Server, DPS_DOCUMENT *Doc, const char *hostinfo) {
#ifdef HAVE_SQL
  DPS_DSTR cookie;
  DPS_COOKIES *Cookies = &Indexer->Cookies;
  DPS_COOKIE *Coo;
  size_t i, blen = dps_strlen(hostinfo), slen;
  int have_no_cookies = DpsVarListFindInt(&Doc->Sections, "have_no_cookies", 1);
  /* Guard against unset URL parts the same way the HOME request below does. */
  const char *schema = DPS_NULL2EMPTY(Doc->CurURL.schema);
  const char *urlpath = DPS_NULL2EMPTY(Doc->CurURL.path);
#ifdef WITH_PARANOIA
  void *paran = DpsViolationEnter(paran);
#endif

  TRACE_IN(Indexer, "DpsCookiesFind");

  DpsDSTRInit(&cookie, 1024);

  /* Step 1: cookies already held in memory. */
  for (i = 0; i < Cookies->ncookies; i++) {
    Coo = &Cookies->Cookie[i];
    slen = dps_strlen(Coo->domain);
    if (slen > blen) continue;
    if (Coo->secure == 'y' && strcasecmp(schema, "https")) continue;
    if (strncasecmp(Coo->path, urlpath, dps_strlen(Coo->path))) continue;
    if (strcasecmp(Coo->domain, hostinfo + (blen - slen))) continue;
    if (Coo->from_config != 1) have_no_cookies = 0;
    /* An entry with empty name and value only marks the domain as known. */
    if (Coo->name[0] == '\0' && Coo->value[0] == '\0') continue;
    if (cookie.data_size) DpsDSTRAppend(&cookie, "; ", 2);
    DpsDSTRAppendStr(&cookie, Coo->name);
    DpsDSTRAppend(&cookie, "=", 1);
    DpsDSTRAppendStr(&cookie, Coo->value);
  }

  if (have_no_cookies) {
    char buf[2*PATH_MAX];
    dpshash32_t url_id;
    DPS_DB *db;
    DPS_SQLRES Res;
    size_t rows;

    /* Step 2a: optional authorisation ping. */
    if (Server != NULL) {
      char *PingData = DpsVarListFindStr(&Server->Vars, "AuthPing", NULL);
      if (PingData != NULL) {
        char *AuthPing = DpsStrdup(DpsTrim(PingData, " \t\r\n"));
        int method = DPS_METHOD_GET;

        dps_base64_decode(AuthPing, PingData, dps_strlen(PingData));
        if (!strncasecmp(AuthPing, "GET", 3)) {
          method = DPS_METHOD_GET;
          PingData = DpsTrim(AuthPing + 3, " \t\r\n");
        } else if (!strncasecmp(AuthPing, "POST", 4)) {
          method = DPS_METHOD_POST;
          PingData = DpsTrim(AuthPing + 4, " \t\r\n");
        } else {
          DpsLog(Indexer, DPS_LOG_ERROR, "AuthPing should be GET or POST: %s", AuthPing);
          PingData = NULL;
        }

        if (PingData != NULL) {
          size_t size = dps_strlen(PingData);
          {
            /* PingURL must hold the URL token plus "?" plus the parameters;
               both tokens together are shorter than PingData, so size + 2
               suffices. PingBody gets size + 1 so it is never zero-length. */
            char PingURL[size + 2];
            char PingBody[size + 1];
            DPS_DOCUMENT *rDoc;

            rDoc = DpsDocInit(NULL);
            DpsSpiderParamInit(&rDoc->Spider);
            DpsVarList2Doc(rDoc, Server);
            rDoc->Buf.max_size = (size_t)DpsVarListFindInt(&Indexer->Vars, "MaxDocSize", DPS_MAXDOCSIZE);
            rDoc->Buf.allocated_size = DPS_NET_BUF_SIZE;
            if ((rDoc->Buf.buf = (char*)DpsMalloc(rDoc->Buf.allocated_size + 1)) == NULL) {
              DpsDocFree(rDoc);
              /* Release what this function owns before bailing out. */
              DpsFree(AuthPing);
              DpsDSTRFree(&cookie);
              TRACE_OUT(Indexer);
              return;
            }
            rDoc->Buf.buf[0] = '\0';
            rDoc->subdoc = Indexer->Flags.SubDocLevel + 1;

#if 1
            /* First request the site root with HEAD; only its response
               headers are wanted. */
            dps_snprintf(buf, sizeof(buf), "%s://%s/", DPS_NULL2EMPTY(Doc->CurURL.schema), DPS_NULL2EMPTY(Doc->CurURL.hostinfo));
            DpsVarListReplaceStr(&rDoc->Sections, "URL", buf);
            DpsURLParse(&rDoc->CurURL, buf);
            DpsLog(Indexer, DPS_LOG_INFO, "HOME: %s", buf);
            rDoc->method = DPS_METHOD_HEAD;
            /* DpsVarListFree(&rDoc->RequestHeaders);*/
            /* Doc was already dereferenced on entry, so the NULL tests on it
               below are purely defensive. */
            if (Doc != NULL) {
              DpsVarListReplaceLst(&rDoc->RequestHeaders, &Doc->RequestHeaders, NULL, "*");
            }
            DpsVarListReplaceStr(&rDoc->Sections, "have_no_cookies", "0");
            DpsDocAddDocExtraHeaders(Indexer, Server, rDoc);
            DpsDocAddConfExtraHeaders(Indexer->Conf, rDoc);
            DpsVarListReplaceLst(&rDoc->Sections, &Server->Vars, NULL, "*");
            DpsDocAddServExtraHeaders(Server, rDoc);
            DpsVarListLog(Indexer, &rDoc->RequestHeaders, DPS_LOG_DEBUG, "HOME.Request");
            if (Doc == NULL || Indexer->Flags.cmd == DPS_IND_FILTER) {
              DpsDocLookupConn(Indexer, rDoc);
            } else {
              /* Borrow the connection data of the document being fetched. */
              DPS_FREE(rDoc->connp.connp);
              rDoc->connp = Doc->connp;
            }
            (void)DpsGetURL(Indexer, rDoc, NULL); /* Just get headers from the home as we need only Cookies from it */
            DpsDocProcessResponseHeaders(Indexer, rDoc);
            DpsVarListLog(Indexer, &rDoc->Sections, DPS_LOG_DEBUG, "HOME.Response");
#endif

            /* sscanf leaves a buffer untouched when its token is missing, so
               start from empty strings. */
            PingURL[0] = '\0';
            PingBody[0] = '\0';
            sscanf(PingData, "%s %s", PingURL, PingBody);
            /* Decide on the method parsed from AuthPing. rDoc->method still
               holds the HEAD value assigned for the HOME request above, so
               testing it would never select the GET branch. */
            if (method == DPS_METHOD_GET) {
              if (PingBody[0] != '\0') {
                dps_strcat(PingURL, "?");
                dps_strcat(PingURL, PingBody);
              }
            } else {
              DpsVarListReplaceStr(&rDoc->Sections, "body", PingBody);
            }
            DpsVarListReplaceStr(&rDoc->Sections, "URL", PingURL);
            DpsURLParse(&rDoc->CurURL, PingURL);
            DpsLog(Indexer, DPS_LOG_INFO, "AUTH.PING: %s", PingURL);
            rDoc->method = method;
            DpsVarListFree(&rDoc->RequestHeaders);
            DpsVarListReplaceStr(&rDoc->Sections, "have_no_cookies", "0");
            DpsDocAddDocExtraHeaders(Indexer, Server, rDoc);
            DpsDocAddConfExtraHeaders(Indexer->Conf, rDoc);
            DpsVarListReplaceLst(&rDoc->Sections, &Server->Vars, NULL, "*");
            DpsDocAddServExtraHeaders(Server, rDoc);
            if (method == DPS_METHOD_POST) {
              dps_snprintf(buf, sizeof(buf), "application/x-www-form-urlencoded; charset=%s", DpsVarListFindStr(&Indexer->Conf->Vars, "LocalCharset", "iso-8859-1"));
              DpsVarListReplaceStr(&rDoc->RequestHeaders, "Content-Type", buf);
              /* dps_strlen() yields a size (see its use for size_t locals
                 above); cast so the argument matches "%d". */
              dps_snprintf(buf, sizeof(buf), "%d", (int)dps_strlen(PingBody));
              DpsVarListReplaceStr(&rDoc->RequestHeaders, "Content-Length", buf);
            }
            DpsVarListLog(Indexer, &rDoc->RequestHeaders, DPS_LOG_DEBUG, "AUTHPING.Request");
#if 0
            if (Doc == NULL || Indexer->Flags.cmd == DPS_IND_FILTER) {
              DpsDocLookupConn(Indexer, rDoc);
            } else {
              DPS_FREE(rDoc->connp.connp);
              rDoc->connp = Doc->connp;
            }
#endif
            (void)DpsGetURL(Indexer, rDoc, NULL); /* Just get it as we need only Cookies from the headers */
            DpsDocProcessResponseHeaders(Indexer, rDoc);
            DpsVarListDel(&rDoc->Sections, "body");
            DpsVarListLog(Indexer, &rDoc->Sections, DPS_LOG_DEBUG, "AUTHPING.Response");
            /* Clear connp before DpsDocFree(), presumably so the data
               borrowed from Doc is not released with rDoc.
               NOTE(review): this also runs when rDoc did its own
               DpsDocLookupConn() (cmd == DPS_IND_FILTER) - confirm nothing
               is leaked in that case. */
            if (Doc != NULL) bzero(&rDoc->connp, sizeof(rDoc->connp));
            DpsDocFree(rDoc);
          }
        }
        DpsFree(AuthPing);
      }
    }

    /* Step 2b: look the host and each of its parent domains up in the
       database. */
    while (hostinfo != NULL) {
      url_id = DpsStrHash32(hostinfo);
      DpsSQLResInit(&Res);
      /* NOTE(review): hostinfo is pasted into the SQL text unescaped. If it
         can carry a quote character this is injectable - escape it with the
         project's SQL escaping helper or verify callers sanitise it. */
      dps_snprintf(buf, sizeof(buf), "SELECT name,value,path,secure FROM cookies WHERE domain='%s'", hostinfo);
      if (Indexer->flags & DPS_FLAG_UNOCON) {
        DPS_GETLOCK(Indexer, DPS_LOCK_DB);
        db = Indexer->Conf->dbl.db[url_id % Indexer->Conf->dbl.nitems];
      } else {
        db = Indexer->dbl.db[url_id % Indexer->dbl.nitems];
      }
      if (DPS_OK == DpsSQLQuery(db, &Res, buf)) {
        rows = DpsSQLNumRows(&Res);
        for (i = 0; i < rows; i++) {
          /* Column order follows the SELECT: name, value, path, secure. */
          const char *c_name = DpsSQLValue(&Res, i, 0);
          const char *c_value = DpsSQLValue(&Res, i, 1);
          const char *c_path = DpsSQLValue(&Res, i, 2);
          const char c_secure = *DpsSQLValue(&Res, i, 3);

          DpsCookiesAdd(Indexer, hostinfo, c_path, c_name, c_value, c_secure, 0, 0, 0);
          if (c_secure == 'y' && strcasecmp(schema, "https")) continue;
          if (strncasecmp(c_path, urlpath, dps_strlen(c_path))) continue;
          if (cookie.data_size) DpsDSTRAppend(&cookie, "; ", 2);
          DpsDSTRAppendStr(&cookie, c_name);
          DpsDSTRAppend(&cookie, "=", 1);
          DpsDSTRAppendStr(&cookie, c_value);
        }
        if (rows == 0) {
          /* Add an empty placeholder entry for this domain; the in-memory
             scan above skips such entries when building the header. */
          DpsCookiesAdd(Indexer, hostinfo, "/", "", "", 'n', 0, 0, 0);
        }
      }
      DpsSQLFree(&Res);
      if (Indexer->flags & DPS_FLAG_UNOCON) {
        DPS_RELEASELOCK(Indexer, DPS_LOCK_DB);
      }
      /* Move on to the parent domain: drop everything up to the next dot. */
      hostinfo = strchr(hostinfo, '.');
      if (hostinfo != NULL) hostinfo++;
    }
  }

  /* Step 3: publish the collected cookies. */
  if (cookie.data_size) {
    DpsVarListReplaceStr(&Doc->RequestHeaders, "Cookie", cookie.data);
  }
  DpsDSTRFree(&cookie);
#endif
  TRACE_OUT(Indexer);
  return;
}