Esempio n. 1
0
/*
 * Parse a cookie string of the form "name=value; attr=value; attr; ..." and
 * register the resulting cookie through DpsCookiesAdd().
 *
 *   Indexer     - agent handle, handed through to DpsCookiesAdd().
 *   CurURL      - URL the cookie belongs to; may be NULL.  Its hostname and
 *                 path are the fallbacks when the cookie has no usable
 *                 Domain / Path attribute ("localhost" and "/" are used when
 *                 CurURL cannot supply them either).
 *   cookie_str  - cookie text to parse; NULL is treated as "nothing to add".
 *   insert_flag - not used by this function.
 *
 * The first "key=value" pair is taken as the cookie name and value.  After
 * that, the attributes path, domain, secure and expires are recognised
 * (case-insensitively); anything else is ignored.  A Domain attribute is only
 * honoured when it starts with '.', and the leading dot is stripped.
 *
 * Always returns DPS_OK; nothing is added unless both a name and a value were
 * found.
 *
 * NOTE(review): the '=' of every pair is overwritten with NUL in place, so the
 * buffer behind cookie_str appears to be modified despite the const
 * qualifier -- confirm callers never pass read-only storage.
 */
int DpsCookiesAddStr(DPS_AGENT *Indexer, DPS_URL *CurURL, const char *cookie_str, int insert_flag) {
  char *part, *lpart;
  char *name = NULL;
  char *value = NULL;
  char *domain = NULL, *orig_domain = NULL;
  char *path = NULL;
  dps_uint4 expire = 0, need_free_domain = 1, need_free_path = 1;
  char secure = 'n', savec;

  /* Without this guard the tokenizer would be started with a NULL string and
     an uninitialised continuation pointer. */
  if (cookie_str == NULL) return DPS_OK;

  for (part = dps_strtok_r(cookie_str, ";" , &lpart, &savec) ; part;
       part = dps_strtok_r(NULL, ";", &lpart, &savec)) {
    char *arg;

    part = DpsTrim(part, " ");
    arg = strchr(part, '=');

    if (arg == NULL) {
      /* Attribute without a value.  "Secure" is defined as a bare flag, so it
         has to be recognised here; it only counts once the leading name=value
         pair has been seen. */
      if (name && !strcasecmp(part, "secure")) {
        secure = 'y';
      }
      continue;
    }

    *arg++ = '\0';   /* split "key=value" into two strings */

    if (!name) {
      /* The first pair is the cookie itself.  The value is duplicated while
         the name keeps pointing into the input. */
      name = part;
      DpsFree(value);
      value = DpsStrdup(arg);
    } else if (!strcasecmp(part, "path")) {
      DpsFree(path);
      path = DpsStrdup(arg);
    } else if (!strcasecmp(part, "domain")) {
      DpsFree(orig_domain);
      orig_domain = domain = DpsStrdup(arg);
    } else if (!strcasecmp(part, "secure")) {
      /* "secure=<anything>" is still accepted, as before. */
      secure = 'y';
    } else if (!strcasecmp(part, "expires")) {
      expire = (dps_uint4)DpsHttpDate2Time_t(arg);
    }
  }

  if (name && value) {
    if (domain && domain[0] == '.') {
      /* Skip the leading dot; orig_domain keeps the pointer to free. */
      domain++;
    } else {
      /* Missing or non-dotted Domain: drop it and fall back to the URL host,
         which this function does not own. */
      if (domain) DpsFree(orig_domain);
      domain = (CurURL && CurURL->hostname) ? CurURL->hostname : "localhost";
      need_free_domain = 0;
    }
    if (!path) {
      /* Borrowed pointer, must not be freed below. */
      path = (CurURL && CurURL->path) ? CurURL->path : "/";
      need_free_path = 0;
    }

    DpsCookiesAdd(Indexer, domain, path, name, value, secure, expire, (CurURL==NULL)?1:0, 1);

  }

  /* Release only what was allocated here. */
  DpsFree(value);
  if (need_free_path) DpsFree(path);
  if (need_free_domain) DpsFree(orig_domain);

  return DPS_OK;
}
Esempio n. 2
0
/*
 * Process one HTTP response header line held in header->data.
 *
 *   Indexer - agent handle, passed to DpsCookiesAdd() for Set-Cookie headers.
 *   Doc     - document whose Sections / TextList receive the header and whose
 *             CurURL supplies cookie domain/path fallbacks.
 *   header  - the header line; its data is split in place at the first ':'.
 *
 * Behaviour:
 *   - Content-Type and Content-Encoding values are lower-cased in place.
 *   - When Doc->Spider.use_cookies is set, a Set-Cookie header is parsed and
 *     handed to DpsCookiesAdd(); the function then returns without storing the
 *     header in Doc->Sections.
 *   - Every other header is stored in Doc->Sections under its name (value
 *     "<NULL>" when the line has no ':'), and, if a section named
 *     "header.<name>" exists, its value is also appended to Doc->TextList.
 */
static void DpsParseHTTPHeader(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_DSTR *header) {
  char *val, *header_name;
  char	secname[128];
  DPS_VAR	*Sec;
  DPS_TEXTITEM Item;

  header_name = header->data;
  val = strchr(header_name, ':');

  if (val != NULL) {
    /* Terminate the name and strip blanks, tabs and stray colons from the value. */
    *val++ = '\0';
    val = DpsTrim(val, " \t:");

    if (!strcasecmp(header_name, "Content-Type") || !strcasecmp(header_name, "Content-Encoding")) {
      char *v;
      for (v = val; *v; v++) {
        *v = dps_tolower(*v);
      }
    } else if (Doc->Spider.use_cookies && !strcasecmp(header_name, "Set-Cookie")) {
      char *part, *lpart;
      char *name = NULL;
      char *value = NULL;
      const char *domain = NULL;
      const char *path = NULL;
      dps_uint4 expire = 0;
      char secure = 'n';

      for (part = dps_strtok_r(val, ";" , &lpart) ; part;
           part = dps_strtok_r(NULL, ";", &lpart)) {
        char *arg;

        part = DpsTrim(part, " ");
        arg = strchr(part, '=');

        if (arg == NULL) {
          /* Attribute without a value.  "Secure" is defined as a bare flag,
             so it must be recognised here; it only counts once the leading
             name=value pair has been seen. */
          if (name && !strcasecmp(part, "secure")) {
            secure = 'y';
          }
          continue;
        }

        *arg++ = '\0';   /* split "key=value" into two strings */

        if (!name) {
          /* The first pair is the cookie name and value. */
          name = part;
          value = arg;
        } else if (!strcasecmp(part, "path")) {
          path = arg;
        } else if (!strcasecmp(part, "domain")) {
          domain = arg;
        } else if (!strcasecmp(part, "secure")) {
          /* "secure=<anything>" is still accepted, as before. */
          secure = 'y';
        } else if (!strcasecmp(part, "expires")) {
          expire = (dps_uint4)DpsHttpDate2Time_t(arg);
        }
      }

      if (name && value) {
        if (domain && domain[0] == '.') {
          /* A Domain attribute is honoured only with a leading dot, which is
             then skipped. */
          domain++;
        } else {
          domain = Doc->CurURL.hostname ? Doc->CurURL.hostname : "localhost";
        }
        if (!path) {
          path = Doc->CurURL.path ? Doc->CurURL.path : "/";
        }
        DpsCookiesAdd(Indexer, domain, path, name, value, secure, expire, 1);
      }
      /* Set-Cookie is consumed here and not recorded as a section. */
      return;
    }
  }

  DpsVarListReplaceStr(&Doc->Sections, header_name, val ? val : "<NULL>");

  dps_snprintf(secname, sizeof(secname), "header.%s", header_name);
  secname[sizeof(secname)-1] = '\0';   /* force termination if the name was truncated */
  Sec = DpsVarListFind(&Doc->Sections, secname);
  if (Sec && val) {
    Item.href = NULL;
    Item.str = val;
    Item.section = Sec->section;
    Item.section_name = secname;
    Item.len = 0;
    DpsTextListAdd(&Doc->TextList, &Item);
  }
}
Esempio n. 3
0
/*
 * Split the raw HTTP response in Doc->Buf into headers and content, and record
 * the headers in Doc->Sections.
 *
 *   Indexer - agent handle; supplies the current time (Indexer->now) and is
 *             passed to the per-header parser and the logger.
 *   Doc     - document whose Buf.buf holds the response.  On return
 *             Buf.content points at the body (or stays NULL when no body start
 *             could be determined) and the buffer has been NUL-terminated at
 *             the end of the header block.
 *
 * Sections written: "ResponseSize", "ResponseLine", "Status" (the larger of
 * the previous and the new status), one entry per header via
 * DpsParseHTTPHeader(), and finally "Content-Length".  A "Last-Modified" more
 * than four hours ahead of Indexer->now is dropped.
 *
 * Returns early, leaving the remaining sections untouched, when the buffer is
 * missing, the header copy cannot be allocated, or the first line does not
 * start with "HTTP/".
 */
void DpsParseHTTPResponse(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc) {
    char *token, *lt, *headers, savec;
    int status, oldstatus;
    DPS_DSTR header;
    time_t now, last_mod_time;

    Doc->Buf.content = NULL;
    oldstatus = DpsVarListFindInt(&Doc->Sections, "Status", 0);
    DpsVarListReplaceInt(&Doc->Sections, "ResponseSize", (int)Doc->Buf.size);
    DpsVarListDel(&Doc->Sections, "Content-Length");
    /* "Last-Modified" is deliberately not deleted here. */

    if (Doc->Buf.buf == NULL) return;

    /* Cut the HTTP response header off first: stop at the first blank line,
       written either as CRLF CRLF or as LF LF. */
    for (token = Doc->Buf.buf; *token; token++) {
        if (!strncmp(token, "\r\n\r\n", 4)) {
            if (token <= Doc->Buf.buf + Doc->Buf.size - 4) {
                *token = '\0';
                Doc->Buf.content = token + 4;
            }
            break;
        } else if (!strncmp(token, "\n\n", 2)) {
            if (token <= Doc->Buf.buf + Doc->Buf.size - 2) {
                *token = '\0';
                Doc->Buf.content = token + 2;
            }
            break;
        }
    }

    /* No separator was accepted above.  If the stop position still lies at
       least four bytes before the end of the buffer, guess where the content
       starts from the byte two positions further on.
       NOTE(review): this looks like handling for a NUL already present at the
       header boundary -- confirm. */
    if (!Doc->Buf.content) {
        if (token <= Doc->Buf.buf + Doc->Buf.size - 4) {
            if (token[2] == CR_CHAR) Doc->Buf.content = token + 4;
            else Doc->Buf.content = token + 2;
        }
    }

    /* Work on a copy so that tokenizing does not damage the original headers. */
    headers = (char*)DpsStrdup(Doc->Buf.buf);
    if (headers == NULL) {
        /* Allocation failed: there is nothing that can be parsed. */
        return;
    }

    /* Now parse the response header lines. */
    token = dps_strtok_r(headers, "\r\n", &lt, &savec);

    if (!token) {
        DpsFree(headers);
        return;
    }

    if (!strncmp(token, "HTTP/", 5)) {
        /* The status code is read from offset 8, i.e. just behind "HTTP/x.y".
           Only look there when the line really is that long; a shorter line
           would otherwise be read past its terminator. */
        status = (strlen(token) > 8) ? atoi(token + 8) : 0;
        DpsVarListReplaceStr(&Doc->Sections, "ResponseLine", token);
        DpsVarListReplaceInt(&Doc->Sections, "Status", (oldstatus > status) ? oldstatus : status);
    } else {
        DpsFree(headers);
        return;
    }

    token = dps_strtok_r(NULL, "\r\n", &lt, &savec);
    DpsDSTRInit(&header, 128);

    while (token) {
        /* A line containing ':' starts a new header, so flush the one that
           has been collected so far.  Lines without ':' are appended to the
           header being collected. */
        if (strchr(token, ':')) {
            if (header.data_size) {
                DpsParseHTTPHeader(Indexer, Doc, &header);
                DpsDSTRFree(&header);
                DpsDSTRInit(&header, 128);
            }
        }
        DpsDSTRAppendStr(&header, token);

        token = dps_strtok_r(NULL, "\r\n", &lt, &savec);
    }
    if (header.data_size) {
        DpsParseHTTPHeader(Indexer, Doc, &header);
    }
    DpsDSTRFree(&header);
    DPS_FREE(headers);

    {
        now = Indexer->now;
        last_mod_time = DpsHttpDate2Time_t(DpsVarListFindStr(&Doc->Sections, "Last-Modified", ""));
        if (last_mod_time > now + 3600 * 4) { /* Last-Modified lies more than four hours in the future */
            /* time_t is not guaranteed to be int-sized, so print it as long. */
            DpsLog(Indexer, DPS_LOG_EXTRA, "Last-Modified date is deep in future (%ld>%ld), dropping it.",
                   (long)last_mod_time, (long)now);
            DpsVarListDel(&Doc->Sections, "Last-Modified");
        }
    }

    /* Bad response: no content start was found, nothing more to record. */
    if (!Doc->Buf.content) {
        return;
    }

    /* Buf.size minus the header part gives the number of content bytes; it is
       added to whatever "Content-Length" is present in Doc->Sections at this
       point (0 if none). */
    DpsVarListReplaceInt(&Doc->Sections, "Content-Length",
                         Doc->Buf.buf - Doc->Buf.content + (int)Doc->Buf.size
                         + DpsVarListFindInt(&Doc->Sections, "Content-Length", 0));
}