Ejemplo n.º 1
0
/*
 * Case-insensitive wildcard comparison of str against the pattern wexp.
 *
 * Pattern syntax:
 *   '*'  matches any run of characters, including an empty one;
 *   '?'  matches any single character;
 *   any other character must equal the string character after both
 *   have been passed through dps_tolower().
 *
 * Returns:
 *    0  str matches wexp;
 *    1  a literal character differs, or str still has characters left
 *       once the pattern is used up;
 *   -1  str ran out before the pattern was satisfied.
 *
 * When str is exhausted, any remaining '*' and '?' in the pattern are
 * skipped, so trailing '?' wildcards are optional.
 */
int DpsWildCaseCmp(const char *str, const char *wexp) {
    const char *s = str;
    const char *w = wexp;

    while (*s != '\0' && *w != '\0') {
        if (*w == '*') {
            /* Collapse a run of consecutive stars into one. */
            do {
                w++;
            } while (*w == '*');

            /* A trailing star swallows whatever is left of the string. */
            if (*w == '\0') {
                return 0;
            }

            /* Try the rest of the pattern at every remaining position;
               a result of 1 means "no match here, try one further". */
            for (; *s != '\0'; s++) {
                int rc = DpsWildCaseCmp(s, w);
                if (rc != 1) {
                    return rc;
                }
            }
            return -1;
        }

        if (*w != '?' && dps_tolower(*s) != dps_tolower(*w)) {
            return 1;
        }

        s++;
        w++;
    }

    /* Pattern consumed but the string is not. */
    if (*s != '\0') {
        return 1;
    }

    /* String consumed: leftover wildcards are allowed to match nothing. */
    while (*w == '*' || *w == '?') {
        w++;
    }
    return (*w == '\0') ? 0 : -1;
}
Ejemplo n.º 2
0
/*
 * Case-insensitive wildcard comparison of str against the pattern wexp.
 *
 * '*' matches any run of characters (possibly empty), '?' matches exactly
 * one character, and every other pattern character is compared with the
 * string character after both have gone through dps_tolower().
 *
 * Returns:
 *    0  str matches wexp;
 *    1  a literal character differs, or str has characters left after
 *       the pattern is used up;
 *   -1  str ended while the pattern still required more input.
 */
int DpsWildCaseCmp(const char *str, const char *wexp) {
    const char *s = str;
    const char *w = wexp;

    while (*w != '\0') {
        if (*w == '*') {
            /* Collapse a run of consecutive stars into one. */
            do {
                w++;
            } while (*w == '*');

            /* A trailing star matches the whole remainder. */
            if (*w == '\0') {
                return 0;
            }

            /* Try the rest of the pattern at each remaining position;
               only a result of 1 lets the search continue. */
            for (; *s != '\0'; s++) {
                int rc = DpsWildCaseCmp(s, w);
                if (rc != 1) {
                    return rc;
                }
            }
            return -1;
        }

        /* Any non-star pattern character needs a string character. */
        if (*s == '\0') {
            return -1;
        }

        if (*w != '?' && dps_tolower(*s) != dps_tolower(*w)) {
            return 1;
        }

        s++;
        w++;
    }

    /* Pattern finished: it is a match only if the string finished too. */
    return (*s != '\0');
}
Ejemplo n.º 3
0
/*
 * Process one HTTP response header line held in header->data.
 *
 * The line is split in place at the first ':' into a name and a value
 * (the buffer is modified).  Then:
 *   - Content-Type / Content-Encoding values are lower-cased;
 *   - Set-Cookie, when Doc->Spider.use_cookies is set, is parsed and handed
 *     to DpsCookiesAdd(); the function then returns without storing the
 *     header in Doc->Sections;
 *   - every other header (and the two lower-cased ones) is stored in
 *     Doc->Sections under its own name and, if a "header.<name>" section
 *     exists, also appended to Doc->TextList.
 *
 * A line without ':' is stored under its full text with the value "<NULL>".
 *
 * Indexer  agent passed through to DpsCookiesAdd()
 * Doc      document whose Sections / TextList / CurURL are used
 * header   dynamic string containing the raw header line
 */
static void DpsParseHTTPHeader(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_DSTR *header) {
  char *val, *header_name;
  char	secname[128];
  DPS_VAR	*Sec;
  DPS_TEXTITEM Item;

  if ((val = strchr(header_name = header->data, ':'))) {
    /* Terminate the name and trim blanks / extra colons off the value. */
    *val++='\0';
    val = DpsTrim(val," \t:");

    if (!strcasecmp(header_name, "Content-Type") || !strcasecmp(header_name, "Content-Encoding")) {
      char *v;
      for(v=val ; *v ; v++)
        *v = dps_tolower(*v);
    } else if (Doc->Spider.use_cookies && !strcasecmp(header_name, "Set-Cookie")) {
      char *part, *lpart;
      char *name = NULL;
      char *value = NULL;
      const char *domain = NULL;
      const char *path = NULL;
      dps_uint4 expire = 0;
      char secure = 'n';

      /* The first "a=b" pair is the cookie itself; later parts are attributes. */
      for (part = dps_strtok_r(val, ";" , &lpart) ; part;
           part = dps_strtok_r(NULL, ";", &lpart)) {
        char *arg;
        part = DpsTrim(part, " ");
        if ((arg = strchr(part, '='))) {
          *arg++ = '\0';
          if (!name) {
            name = part;
            value = arg;
          } else if (!strcasecmp(part, "path")) {
            path = arg;
          } else if (!strcasecmp(part, "domain")) {
            domain = arg;
          } else if (!strcasecmp(part, "secure")) {
            secure = 'y';
          } else if (!strcasecmp(part, "expires")) {
            expire = (dps_uint4)DpsHttpDate2Time_t(arg);
          }
        } else if (name && !strcasecmp(part, "secure")) {
          /* "Secure" is normally sent as a bare attribute with no "=value",
             so it has to be recognised outside the '=' branch as well. */
          secure = 'y';
        }
      }
      if (name && value) {
        if (domain && domain[0] == '.') {
          /* Drop the leading dot of an explicit domain attribute. */
          domain++;
        } else {
          /* No domain, or one without a leading dot: use the current host. */
          domain = Doc->CurURL.hostname ? Doc->CurURL.hostname : "localhost";
        }
        if (!path) {
          path = Doc->CurURL.path ? Doc->CurURL.path : "/";
        }
        DpsCookiesAdd(Indexer, domain, path, name, value, secure, expire, 1);
      }
      /* Cookies are not recorded as document sections. */
      return;
    }
  }

  DpsVarListReplaceStr(&Doc->Sections, header_name, val ? val : "<NULL>");

  dps_snprintf(secname,sizeof(secname),"header.%s", header_name);
  secname[sizeof(secname)-1]='\0';
  if((Sec = DpsVarListFind(&Doc->Sections, secname)) && val ) {
    /* Zero the item first so no field that is not assigned below reaches
       DpsTextListAdd() with an indeterminate value. */
    memset(&Item, 0, sizeof(Item));
    Item.href = NULL;
    Item.str = val;
    Item.section = Sec->section;
    Item.section_name = secname;
    Item.len = 0;
    DpsTextListAdd(&Doc->TextList, &Item);
  }
}
Ejemplo n.º 4
0
/*
 * Split the URL string str into the fields of *url.
 *
 * All string fields of url are freed first, then refilled from str:
 * schema, specific (everything after "schema:"), hostinfo, auth, hostname,
 * port, default_port, path, directory, filename and query_string.
 * url->len is set to the length of str.  A "#fragment" is cut off the path
 * and is not stored (url->anchor is only freed here).  hostname, the host
 * part of hostinfo, and schema are lower-cased; trailing dots are trimmed
 * from hostname and hostinfo; url->domain_level is the number of dots in
 * hostname plus one.
 *
 * Returns:
 *   DPS_OK       on success;
 *   DPS_ERROR    when a required allocation fails;
 *   DPS_URL_BAD  for mailto:/javascript:/feed:, for an unknown schema
 *                without "//", or when hostname contains one of , ' " ;
 *
 * NOTE(review): several DpsStrdup() results (schema, specific, hostinfo,
 * hostname) are still used without a NULL check; only the copy of str, the
 * news: path buffer and url->path are verified.
 */
int _DpsURLParse(DPS_URL *url, const char *str, const char *filename, int line) {
#else
int DpsURLParse(DPS_URL *url, const char *str) {
#endif
	char *schema,*anchor,*file,*query;
	char *s;
#ifdef WITH_PARANOIA
	void * paran = DpsViolationEnter(paran);
#endif

#ifdef DEBUG_URL
	fprintf(stderr, " -- %s:%d Parser url: %s\n", filename, line, str);
#endif
	
	/* Reset every field so a reused DPS_URL carries nothing over. */
	DPS_FREE(url->schema);
	DPS_FREE(url->specific);
	DPS_FREE(url->hostinfo);
	DPS_FREE(url->hostname);
	DPS_FREE(url->anchor);
	DPS_FREE(url->auth);
	url->port=0;
	url->default_port=0;
	DPS_FREE(url->path);
	DPS_FREE(url->directory);
	DPS_FREE(url->filename);
	DPS_FREE(url->query_string);

	/* FIXME: no length limit is enforced here; a check against a
	   configured maximum (not DPS_URLSIZE) is still to be added. */

	/* Work on a private copy: the parser temporarily writes NULs into it. */
	s = (char*)DpsStrdup(str);
	if (s == NULL) {
#ifdef WITH_PARANOIA
	  DpsViolationExit(-1, paran);
#endif
	  return DPS_ERROR;
	}

	url->len = dps_strlen(str);
	
	/* Find the possible end of the schema and check */
	/* that it really is one: it must consist of     */
	/* letters only; digits are accepted as well,    */
	/* e.g. for oracle8://.  The check is needed     */
	/* because the ':' may belong to an anchor, as   */
	/* in "mod/index.html#a:1".                      */

	if((schema=strchr(s,':'))){
		const char * ch;
		for(ch=s;ch<schema;ch++){
			/* <ctype.h> classifiers require an unsigned char value;  */
			/* passing a negative plain char is undefined behaviour.  */
			if(!isalnum((unsigned char)*ch)){
				/* Bad character       */
				/* so it is not schema */
				schema=0;break;
			}
		}
	}

	if(schema){
		/* Have scheme - absolute path */
		*schema=0;
		url->schema = (char*)DpsStrdup(s);
		url->specific = (char*)DpsStrdup(schema + 1);
		*schema=':';
		if(!strcasecmp(url->schema,"http"))url->default_port=80;
		else
		if(!strcasecmp(url->schema,"https"))url->default_port=443;
		else
		if(!strcasecmp(url->schema,"nntp"))url->default_port=119;
		else
		if(!strcasecmp(url->schema,"news"))url->default_port=119;
		else
		if(!strcasecmp(url->schema,"ftp"))url->default_port=21;

		if(!strncmp(url->specific,"//",2)){
			char	*ss,*hostname;
			
			/* Have hostinfo */
			if((ss=strchr(url->specific+2,'/'))){
				/* Have hostname with path */
				*ss=0;
				url->hostinfo = (char*)DpsStrdup(url->specific + 2);
				*ss='/';
				url->path = (char*)DpsStrdup(ss);
			}else{
				/* Hostname without path */
				if ((ss = strchr(url->specific + 2, '?'))) {
					/* Have hostname with parameters */
					/* NOTE(review): the text after '?' is not copied into  */
					/* path, so query_string stays unset for                */
					/* "schema://host?args" - confirm this is intended.     */
					*ss = 0;
					url->hostinfo = (char*)DpsStrdup(url->specific + 2);
					*ss='?';
					url->path = (char*)DpsStrdup("/");
				}else {
					url->hostinfo = (char*)DpsStrdup(url->specific + 2);
					url->path = (char*)DpsStrdup("/");
				}
			}
			if((hostname=strchr(url->hostinfo,'@'))){
				/* Username and password is given  */
				/* Store auth string user:password */
				*hostname=0;
				url->auth = (char*)DpsStrdup(url->hostinfo);
				*hostname='@';
				hostname++;
			}else{
				hostname = url->hostinfo;
			}
	
			/* Split an optional ":port" off the host name. */
			if((ss=strchr(hostname,':'))){
				*ss=0;
				url->hostname = (char*)DpsStrdup(hostname);
				*ss=':';
				url->port=atoi(ss+1);
			}else{
				url->hostname = (char*)DpsStrdup(hostname);
				url->port=0;
			}
		}else{
			/* Have not host but have schema                   */
			/* This is possible for:                           */
			/* file:  mailto:  htdb: news:                     */
			/* As far as we do not need mailto: just ignore it */
			
			if(!strcasecmp(url->schema,"mailto") 
			   || !strcasecmp(url->schema,"javascript")
			   || !strcasecmp(url->schema,"feed")
			   ) {
				DPS_FREE(s);
#ifdef WITH_PARANOIA
				DpsViolationExit(-1, paran);
#endif
				return(DPS_URL_BAD);
			} else
			if(!strcasecmp(url->schema,"file"))
				url->path = (char*)DpsStrdup(url->specific);
			else
			if(!strcasecmp(url->schema,"exec"))
				url->path = (char*)DpsStrdup(url->specific);
			else
			if(!strcasecmp(url->schema,"cgi"))
				url->path = (char*)DpsStrdup(url->specific);
			else
			if(!strcasecmp(url->schema,"htdb"))
				url->path = (char*)DpsStrdup(url->specific);
			else
			if(!strcasecmp(url->schema,"news")){
				/* Now we will use localhost as NNTP    */
				/* server as it is not indicated in URL */
				url->hostname = (char*)DpsStrdup("localhost");
				/* +2: one byte for the leading '/', one for the NUL. */
				url->path = (char*)DpsMalloc(dps_strlen(url->specific) + 2);
				if (url->path == NULL) {
				  DPS_FREE(s);
#ifdef WITH_PARANOIA
				  DpsViolationExit(-1, paran);
#endif
				  return DPS_ERROR;
				}
				sprintf(url->path,"/%s",url->specific);
				url->default_port=119;
			}else{
				/* Unknown strange schema */
				DPS_FREE(s);
#ifdef WITH_PARANOIA
				DpsViolationExit(-1, paran);
#endif
				return(DPS_URL_BAD);
			}
		}
	}else{
		/* No schema: the whole string is a (possibly relative) path. */
		url->path = (char*)DpsStrdup(s);
	}

	/* Everything below dereferences url->path, so a failed allocation */
	/* has to be caught here, before the first use.                    */
	if (url->path == NULL) {
	  DPS_FREE(s);
#ifdef WITH_PARANOIA
	  DpsViolationExit(-1, paran);
#endif
	  return DPS_ERROR;
	}

	/* Cut the anchor if it exists */
	if((anchor=strstr(url->path,"#")))*anchor=0;


	/* If path is not full just copy it to filename    */
	/* i.e. neither  /usr/local/ nor  c:/windows/temp/ */
	/* An empty path is treated as relative without    */
	/* looking at path[1], which would lie past the    */
	/* terminating NUL.                                */

	if((url->path[0]!='/') && (url->path[0]!='?') && ((url->path[0]=='\0') || (url->path[1]!=':'))) { 
		/* Relative path */
		if(!strncmp(url->path,"./",2))
			url->filename = (char*)DpsStrdup(url->path + 2);
		else
			url->filename = (char*)DpsStrdup(url->path);
		url->path[0] = 0;
	}

	/* truncate path to query_string */
	/* and store query_string        */

	if((query=strrchr(url->path,'?'))){
		url->query_string = (char*)DpsStrdup(query);
		*(query) = 0;
	}
	
	DpsURLNormalizePath(url->path);
	
	/* Now find right '/' sign and copy the rest to filename */

	if((file=strrchr(url->path,'/'))&&(strcmp(file,"/"))){
		url->filename = (char*)DpsStrdup(file + 1);
		*(file+1)=0;
	}

	/* Now find right '/' sign and copy the last path component */
	/* (the text between the two right-most slashes) to directory */

	if ((file = strrchr(url->path,'/'))) {
	  char *p_save = file;
	  /* Only step backwards when the slash is not the first byte, so   */
	  /* that no pointer before the start of the buffer is ever formed. */
	  if (file > url->path) {
	    for(file--; (file > url->path) && (*file != '/'); file--);
	    file++;
	  }
	  if (*file) {
	    *p_save = '\0';
	    url->directory = (char*)DpsStrdup(file);
	    *p_save = '/';
	  }
	}

	/* The working copy is no longer needed; s is reused as a cursor below. */
	DPS_FREE(s);
	if (url->hostname != NULL) {
	  DpsRTrim(url->hostname, ".");
	  url->domain_level = 1;
	  for (s = url->hostname; *s; s++) {
	    *s = dps_tolower(*s);
	    if (*s == '.') url->domain_level++;
	    /* Reject host names containing , ' " or ; */
	    if (strchr(",'\";", (int)*s)) {
#ifdef WITH_PARANOIA
	      DpsViolationExit(-1, paran);
#endif
	      return DPS_URL_BAD;
	    }
	  }
	}
	if (url->hostinfo != NULL) {
	  DpsRTrim(url->hostinfo, ".");
	  /* Lower-case only the host part, leaving "user:password@" untouched. */
	  s = strchr(url->hostinfo, '@');
	  for (s = (s == NULL) ? url->hostinfo : s + 1; *s; s++) *s = dps_tolower(*s);
	}
	if (url->schema != NULL) for (s = url->schema; *s; s++) *s = dps_tolower(*s);

#ifdef WITH_PARANOIA
	DpsViolationExit(-1, paran);
#endif
	return DPS_OK;
}
Ejemplo n.º 5
0
/*
 * Process one HTTP response header line held in header->data.
 *
 * The line is split in place at the first ':' into a name and a value
 * (the buffer is modified).  Then:
 *   - Content-Type / Content-Encoding values are lower-cased;
 *   - X-Robots-Tag, when Doc->Spider.use_robots is set, is tokenised and
 *     each directive (ALL, NONE, NOINDEX, NOFOLLOW, NOARCHIVE, INDEX,
 *     FOLLOW; case-insensitive) updates Doc->Spider / Doc->Sections;
 *   - Set-Cookie, when Doc->Spider.use_cookies is set, is handed to
 *     DpsCookiesAddStr(); the function then returns without storing the
 *     header in Doc->Sections.
 * Every header except a consumed Set-Cookie is stored in Doc->Sections
 * under its own name and, if a "header.<name>" section exists, also
 * appended to Doc->TextList.
 *
 * A line without ':' is stored under its full text with the value "<NULL>".
 *
 * Indexer  agent passed through to DpsCookiesAddStr()
 * Doc      document whose Spider / Sections / TextList / CurURL are used
 * header   dynamic string containing the raw header line
 */
static void DpsParseHTTPHeader(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_DSTR *header) {
  char *val, *header_name;
  char	secname[128], savec;
  DPS_VAR	*Sec;
  DPS_TEXTITEM Item;

  if ((val = strchr(header_name = header->data, ':'))) {
    /* Terminate the name and trim blanks / extra colons off the value. */
    *val++='\0';
    val = DpsTrim(val," \t:");

    if (!strcasecmp(header_name, "Content-Type") || !strcasecmp(header_name, "Content-Encoding")) {
      register char *v;
      for(v=val ; *v ; v++)
        *v = (char)dps_tolower((int)*v);
    } else if (Doc->Spider.use_robots && !strcasecmp(header_name, "X-Robots-Tag")) {
      /* Directives are comma- and/or blank-separated.  The same delimiter
         set must be used for every call: without the comma on the
         continuation calls, "noindex,nofollow,noarchive" would yield a
         single unrecognised token after the first directive. */
      const char *robots_delims = " ,\r\n\t";
      char *lt;
      char *rtok;

      rtok = dps_strtok_r(val, robots_delims, &lt, &savec);
      while(rtok){
        if(!strcasecmp(rtok, "ALL")){
          /* Leave the server parameters unchanged */
        }else if(!strcasecmp(rtok, "NONE")){
          Doc->Spider.follow = DPS_FOLLOW_NO;
          Doc->Spider.index = 0;
          if (DpsNeedLog(DPS_LOG_DEBUG)) {
            DpsVarListReplaceInt(&Doc->Sections, "Index", 0);
            DpsVarListReplaceInt(&Doc->Sections, "Follow", DPS_FOLLOW_NO);
          }
        }else if(!strcasecmp(rtok, "NOINDEX")) {
          Doc->Spider.index = 0;
          if (DpsNeedLog(DPS_LOG_DEBUG)) DpsVarListReplaceInt(&Doc->Sections, "Index", 0);
        }else if(!strcasecmp(rtok, "NOFOLLOW")) {
          Doc->Spider.follow = DPS_FOLLOW_NO;
          if (DpsNeedLog(DPS_LOG_DEBUG)) DpsVarListReplaceInt(&Doc->Sections, "Follow", DPS_FOLLOW_NO);
        }else if(!strcasecmp(rtok, "NOARCHIVE")) {
          DpsVarListReplaceStr(&Doc->Sections, "Z", "");
        }else if(!strcasecmp(rtok, "INDEX")) {
          /* Server value is left unchanged; only mirrored when debug logging. */
          if (DpsNeedLog(DPS_LOG_DEBUG)) DpsVarListReplaceInt(&Doc->Sections, "Index", Doc->Spider.index);
        }else if(!strcasecmp(rtok, "FOLLOW")) {
          /* Server value is left unchanged; only mirrored when debug logging. */
          if (DpsNeedLog(DPS_LOG_DEBUG)) DpsVarListReplaceInt(&Doc->Sections, "Follow", Doc->Spider.follow);
        }
        rtok = dps_strtok_r(NULL, robots_delims, &lt, &savec);
      }

    } else if (Doc->Spider.use_cookies && !strcasecmp(header_name, "Set-Cookie")) {

      DpsCookiesAddStr(Indexer, &Doc->CurURL, val, 1);

      /* Cookies are not recorded as document sections. */
      return;
    }
  }

  DpsVarListReplaceStr(&Doc->Sections, header_name, val ? val : "<NULL>");

  dps_snprintf(secname,sizeof(secname),"header.%s", header_name);
  secname[sizeof(secname)-1]='\0';
  if((Sec = DpsVarListFind(&Doc->Sections, secname)) && val ) {
    /* Start from an all-zero item so fields not assigned below are defined. */
    bzero((void*)&Item, sizeof(Item));
    Item.href = NULL;
    Item.str = val;
    Item.section = Sec->section;
    Item.section_name = secname;
    Item.strict = Sec->strict;
    Item.len = 0;
    (void)DpsTextListAdd(&Doc->TextList, &Item);
  }
}