/*
 * Case-insensitive wildcard comparison of `str` against the pattern `wexp`.
 *
 * Pattern syntax: '*' matches any run of characters (including none),
 * '?' matches any single character, every other character is compared
 * through dps_tolower().
 *
 * Returns:
 *    0  the string matches the pattern;
 *    1  a character differs, or the string has characters left over after
 *       the pattern is exhausted;
 *   -1  the string ended while the pattern still requires something.
 *
 * Once the string is exhausted, any trailing '*' and '?' in the pattern are
 * skipped, so a trailing '?' does not require a character in this variant.
 */
int DpsWildCaseCmp(const char *str, const char *wexp) {
  const char *s = str;
  const char *w = wexp;

  while (*s != '\0' && *w != '\0') {
    if (*w == '*') {
      /* Collapse a run of stars into one. */
      do {
        w++;
      } while (*w == '*');

      /* A trailing star swallows the rest of the string. */
      if (*w == '\0') return 0;

      /* Try the remainder of the pattern at every remaining position;
         only a plain mismatch (1) lets the search move on. */
      for (; *s != '\0'; s++) {
        int rc = DpsWildCaseCmp(s, w);
        if (rc != 1) return rc;
      }
      return -1;
    }

    if (*w != '?' && dps_tolower(*s) != dps_tolower(*w)) return 1;

    s++;
    w++;
  }

  /* Pattern ran out first: leftover text is a mismatch. */
  if (*s != '\0') return 1;

  /* Text ran out: remaining wildcards are allowed to match nothing. */
  while (*w == '*' || *w == '?') w++;

  return (*w == '\0') ? 0 : -1;
}
/*
 * Case-insensitive wildcard comparison of `str` against the pattern `wexp`.
 *
 * Pattern syntax: '*' matches any run of characters (including none),
 * '?' matches exactly one character, every other character is compared
 * through dps_tolower().
 *
 * Returns:
 *    0  the string matches the pattern;
 *    1  a character differs, or the string has characters left over after
 *       the pattern is exhausted;
 *   -1  the string ended before the pattern was satisfied.
 */
int DpsWildCaseCmp(const char *str, const char *wexp) {
  const char *text = str;
  const char *pat;

  for (pat = wexp; *pat != '\0'; pat++, text++) {
    switch (*pat) {
      case '*':
        /* Collapse consecutive stars. */
        while (*++pat == '*')
          ;
        /* A trailing star matches whatever is left. */
        if (*pat == '\0') return 0;
        /* Retry the rest of the pattern at each remaining position;
           anything other than a plain mismatch (1) is final. */
        while (*text != '\0') {
          int rc = DpsWildCaseCmp(text++, pat);
          if (rc != 1) return rc;
        }
        return -1;

      case '?':
        /* Needs one character, whatever it is. */
        if (*text == '\0') return -1;
        break;

      default:
        if (*text == '\0') return -1;
        if (dps_tolower(*text) != dps_tolower(*pat)) return 1;
        break;
    }
  }

  /* Pattern consumed: match only if the text is consumed too. */
  return (*text != '\0');
}
/*
 * Process one HTTP response header line held in header->data.
 *
 * The line is split in place at the first ':' into a name and a value
 * (the value is trimmed of spaces, tabs and colons with DpsTrim()).
 *
 *  - Content-Type / Content-Encoding values are lowercased in place.
 *  - Set-Cookie (only when Doc->Spider.use_cookies is set) is parsed into
 *    name/value plus the path, domain, secure and expires attributes and
 *    handed to DpsCookiesAdd(); nothing is stored in Doc->Sections for it.
 *  - Every other header (and the two lowercased ones) is stored in
 *    Doc->Sections under its own name; if a "header.<name>" section is
 *    configured, the value is also appended to Doc->TextList.
 *
 * header->data is modified (NUL bytes are written into it).
 */
static void DpsParseHTTPHeader(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_DSTR *header) {
  char *val, *header_name;
  char secname[128];
  DPS_VAR *Sec;

  header_name = header->data;
  val = strchr(header_name, ':');

  if (val != NULL) {
    *val++ = '\0';
    val = DpsTrim(val, " \t:");

    if (!strcasecmp(header_name, "Content-Type") || !strcasecmp(header_name, "Content-Encoding")) {
      char *v;
      for (v = val; *v; v++) *v = dps_tolower(*v);

    } else if (Doc->Spider.use_cookies && !strcasecmp(header_name, "Set-Cookie")) {
      char *part, *lpart;
      char *name = NULL;
      char *value = NULL;
      const char *domain = NULL;
      const char *path = NULL;
      dps_uint4 expire = 0;
      char secure = 'n';

      for (part = dps_strtok_r(val, ";", &lpart); part; part = dps_strtok_r(NULL, ";", &lpart)) {
        char *arg;

        part = DpsTrim(part, " ");
        arg = strchr(part, '=');

        if (arg == NULL) {
          /* Valueless attribute. "Secure" is normally sent this way, so it
             has to be recognised here, not only in the name=value form. */
          if (name && !strcasecmp(part, "secure")) secure = 'y';
          continue;
        }

        *arg++ = '\0';
        if (!name) {
          /* The first name=value pair is the cookie itself. */
          name = part;
          value = arg;
        } else if (!strcasecmp(part, "path")) {
          path = arg;
        } else if (!strcasecmp(part, "domain")) {
          domain = arg;
        } else if (!strcasecmp(part, "secure")) {
          secure = 'y';
        } else if (!strcasecmp(part, "expires")) {
          expire = (dps_uint4)DpsHttpDate2Time_t(arg);
        }
      }

      if (name && value) {
        /* Only a dot-prefixed Domain attribute is honoured (without the
           dot); anything else falls back to the host being fetched. */
        if (domain && domain[0] == '.') {
          domain++;
        } else {
          domain = Doc->CurURL.hostname ? Doc->CurURL.hostname : "localhost";
        }
        if (!path) {
          path = Doc->CurURL.path ? Doc->CurURL.path : "/";
        }
        DpsCookiesAdd(Indexer, domain, path, name, value, secure, expire, 1);
      }
      return;
    }
  }

  /* A line without ':' is stored whole as the name with a placeholder value. */
  DpsVarListReplaceStr(&Doc->Sections, header_name, val ? val : "<NULL>");

  dps_snprintf(secname, sizeof(secname), "header.%s", header_name);
  secname[sizeof(secname) - 1] = '\0';

  if ((Sec = DpsVarListFind(&Doc->Sections, secname)) && val) {
    /* Zero-initialised so that any member not assigned below has a
       defined value when the item is handed to DpsTextListAdd(). */
    DPS_TEXTITEM Item = {0};

    Item.href = NULL;
    Item.str = val;
    Item.section = Sec->section;
    Item.section_name = secname;
    Item.len = 0;
    DpsTextListAdd(&Doc->TextList, &Item);
  }
}
int _DpsURLParse(DPS_URL *url, const char *str, const char *filename, int line) { #else int DpsURLParse(DPS_URL *url, const char *str) { #endif char *schema,*anchor,*file,*query; char *s; /* size_t len = dps_strlen(str);*/ #ifdef WITH_PARANOIA void * paran = DpsViolationEnter(paran); #endif #ifdef DEBUG_URL fprintf(stderr, " -- %s:%d Parser url: %s\n", filename, line, str); #endif DPS_FREE(url->schema); DPS_FREE(url->specific); DPS_FREE(url->hostinfo); DPS_FREE(url->hostname); DPS_FREE(url->anchor); DPS_FREE(url->auth); url->port=0; url->default_port=0; DPS_FREE(url->path); DPS_FREE(url->directory); DPS_FREE(url->filename); DPS_FREE(url->query_string); /* if(len >= DPS_URLSIZE)return(DPS_URL_LONG); FIXME: Chage this cheking for configured parameter, not DPS_URLSIZE */ s = (char*)DpsStrdup(str); if (s == NULL) { #ifdef WITH_PARANOIA DpsViolationExit(-1, paran); #endif return DPS_ERROR; } url->len = dps_strlen(str); /* Find possible schema end than */ /* Check that it is really schema */ /* It must consist of alphas only */ /* We will take in account digits */ /* also for oracle8:// for example */ /* We must check it because */ /* It might be anchor also */ /* For example: */ /* "mod/index.html#a:1" */ if((schema=strchr(s,':'))){ const char * ch; for(ch=s;ch<schema;ch++){ if(!isalnum(*ch)){ /* Bad character */ /* so it is not schema */ schema=0;break; } } } if(schema){ /* Have scheme - absolute path */ *schema=0; url->schema = (char*)DpsStrdup(s); url->specific = (char*)DpsStrdup(schema + 1); *schema=':'; if(!strcasecmp(url->schema,"http"))url->default_port=80; else if(!strcasecmp(url->schema,"https"))url->default_port=443; else if(!strcasecmp(url->schema,"nntp"))url->default_port=119; else if(!strcasecmp(url->schema,"news"))url->default_port=119; else if(!strcasecmp(url->schema,"ftp"))url->default_port=21; if(!strncmp(url->specific,"//",2)){ char *ss,*hostname; /* Have hostinfo */ if((ss=strchr(url->specific+2,'/'))){ /* Have hostname with path */ *ss=0; 
url->hostinfo = (char*)DpsStrdup(url->specific + 2); *ss='/'; url->path = (char*)DpsStrdup(ss); }else{ /* Hostname without path */ if ((ss = strchr(url->specific + 2, '?'))) { /* Have hostname with parameters */ *ss = 0; url->hostinfo = (char*)DpsStrdup(url->specific + 2); *ss='?'; url->path = (char*)DpsStrdup("/"); }else { url->hostinfo = (char*)DpsStrdup(url->specific + 2); url->path = (char*)DpsStrdup("/"); } } if((hostname=strchr(url->hostinfo,'@'))){ /* Username and password is given */ /* Store auth string user:password */ *hostname=0; url->auth = (char*)DpsStrdup(url->hostinfo); *hostname='@'; hostname++; }else{ hostname = url->hostinfo; } /* FIXME: for(h=hostname;*h;h++){ if( *h>='A' && *h<='Z') *h=(*h)-'A'+'a'; } */ if((ss=strchr(hostname,':'))){ *ss=0; url->hostname = (char*)DpsStrdup(hostname); *ss=':'; url->port=atoi(ss+1); }else{ url->hostname = (char*)DpsStrdup(hostname); url->port=0; } }else{ /* Have not host but have schema */ /* This is possible for: */ /* file: mailto: htdb: news: */ /* As far as we do not need mailto: just ignore it */ if(!strcasecmp(url->schema,"mailto") || !strcasecmp(url->schema,"javascript") || !strcasecmp(url->schema,"feed") ) { DPS_FREE(s); #ifdef WITH_PARANOIA DpsViolationExit(-1, paran); #endif return(DPS_URL_BAD); } else if(!strcasecmp(url->schema,"file")) url->path = (char*)DpsStrdup(url->specific); else if(!strcasecmp(url->schema,"exec")) url->path = (char*)DpsStrdup(url->specific); else if(!strcasecmp(url->schema,"cgi")) url->path = (char*)DpsStrdup(url->specific); else if(!strcasecmp(url->schema,"htdb")) url->path = (char*)DpsStrdup(url->specific); else if(!strcasecmp(url->schema,"news")){ /* Now we will use localhost as NNTP */ /* server as it is not indicated in URL */ url->hostname = (char*)DpsStrdup("localhost"); url->path = (char*)DpsMalloc(dps_strlen(url->specific) + 2); if (url->path == NULL) { DPS_FREE(s); #ifdef WITH_PARANOIA DpsViolationExit(-1, paran); #endif return DPS_ERROR; } 
sprintf(url->path,"/%s",url->specific); url->default_port=119; }else{ /* Unknown strange schema */ DPS_FREE(s); #ifdef WITH_PARANOIA DpsViolationExit(-1, paran); #endif return(DPS_URL_BAD); } } }else{ url->path = (char*)DpsStrdup(s); } /* Cat an anchor if exist */ if((anchor=strstr(url->path,"#")))*anchor=0; /* If path is not full just copy it to filename */ /* i.e. neither /usr/local/ nor c:/windows/temp/ */ if((url->path != NULL) && (url->path[0]!='/') && (url->path[0]!='?') && (url->path[1]!=':')) { /* Relative path */ if(!strncmp(url->path,"./",2)) url->filename = (char*)DpsStrdup(url->path + 2); else url->filename = (char*)DpsStrdup(url->path); url->path[0] = 0; } /* truncate path to query_string */ /* and store query_string */ if((query=strrchr(url->path,'?'))){ url->query_string = (char*)DpsStrdup(query); *(query) = 0; } DpsURLNormalizePath(url->path); /* Now find right '/' sign and copy the rest to filename */ if((file=strrchr(url->path,'/'))&&(strcmp(file,"/"))){ url->filename = (char*)DpsStrdup(file + 1); *(file+1)=0; } /* Now find right '/' sign and copy the rest to directory */ if ((file = strrchr(url->path,'/'))) { char *p_save = file; for(file--; (file > url->path) && (*file != '/'); file--); file++; if (*file) { *p_save = '\0'; url->directory = (char*)DpsStrdup(file); *p_save = '/'; } } DPS_FREE(s); if (url->hostname != NULL) { DpsRTrim(url->hostname, "."); url->domain_level = 1; for (s = url->hostname; *s; s++) { *s = dps_tolower(*s); if (*s == '.') url->domain_level++; if (strchr(",'\";", (int)*s)) { #ifdef WITH_PARANOIA DpsViolationExit(-1, paran); #endif return DPS_URL_BAD; } } } if (url->hostinfo != NULL) { DpsRTrim(url->hostinfo, "."); s = strchr(url->hostinfo, '@'); for (s = (s == NULL) ? 
url->hostinfo : s + 1; *s; s++) *s = dps_tolower(*s); } if (url->schema != NULL) for (s = url->schema; *s; s++) *s = dps_tolower(*s); /* fprintf(stderr, "url: .path: %s port:%d\n", url->path, url->port);*/ #ifdef WITH_PARANOIA DpsViolationExit(-1, paran); #endif return DPS_OK; }
/*
 * Process one HTTP response header line held in header->data.
 *
 * The line is split in place at the first ':' into a name and a value
 * (the value is trimmed of spaces, tabs and colons with DpsTrim()).
 *
 *  - Content-Type / Content-Encoding values are lowercased in place.
 *  - X-Robots-Tag (only when Doc->Spider.use_robots is set) is tokenised and
 *    each directive adjusts Doc->Spider.index / Doc->Spider.follow or the
 *    "Z" section; the "Index"/"Follow" sections are written only when debug
 *    logging is enabled.
 *  - Set-Cookie (only when Doc->Spider.use_cookies is set) is passed to
 *    DpsCookiesAddStr() and nothing is stored in Doc->Sections for it.
 *
 * Every header except Set-Cookie is then stored in Doc->Sections under its
 * own name; if a "header.<name>" section is configured, the value is also
 * appended to Doc->TextList.
 *
 * header->data is modified in place.
 */
static void DpsParseHTTPHeader(DPS_AGENT *Indexer, DPS_DOCUMENT *Doc, DPS_DSTR *header) {
  /* Directives are comma-separated with optional whitespace; the same
     delimiter set must be used for every token, otherwise "a,b" is split
     only at the first comma. */
  static const char robots_delim[] = " ,\r\n\t";
  char *val, *header_name;
  char secname[128], savec;
  DPS_VAR *Sec;
  DPS_TEXTITEM Item;

  header_name = header->data;
  val = strchr(header_name, ':');

  if (val != NULL) {
    *val++ = '\0';
    val = DpsTrim(val, " \t:");

    if (!strcasecmp(header_name, "Content-Type") || !strcasecmp(header_name, "Content-Encoding")) {
      register char *v;
      for (v = val; *v; v++) *v = (char)dps_tolower((int)*v);

    } else if (Doc->Spider.use_robots && !strcasecmp(header_name, "X-Robots-Tag")) {
      char *lt;    /* tokenizer position, owned by dps_strtok_r() */
      char *rtok;

      for (rtok = dps_strtok_r(val, robots_delim, &lt, &savec);
           rtok != NULL;
           rtok = dps_strtok_r(NULL, robots_delim, &lt, &savec)) {

        if (!strcasecmp(rtok, "ALL")) {
          /* Server parameters are left unchanged. */
        } else if (!strcasecmp(rtok, "NONE")) {
          Doc->Spider.follow = DPS_FOLLOW_NO;
          Doc->Spider.index = 0;
          if (DpsNeedLog(DPS_LOG_DEBUG)) {
            DpsVarListReplaceInt(&Doc->Sections, "Index", 0);
            DpsVarListReplaceInt(&Doc->Sections, "Follow", DPS_FOLLOW_NO);
          }
        } else if (!strcasecmp(rtok, "NOINDEX")) {
          Doc->Spider.index = 0;
          if (DpsNeedLog(DPS_LOG_DEBUG)) DpsVarListReplaceInt(&Doc->Sections, "Index", 0);
        } else if (!strcasecmp(rtok, "NOFOLLOW")) {
          Doc->Spider.follow = DPS_FOLLOW_NO;
          if (DpsNeedLog(DPS_LOG_DEBUG)) DpsVarListReplaceInt(&Doc->Sections, "Follow", DPS_FOLLOW_NO);
        } else if (!strcasecmp(rtok, "NOARCHIVE")) {
          DpsVarListReplaceStr(&Doc->Sections, "Z", "");
        } else if (!strcasecmp(rtok, "INDEX")) {
          /* Server value is left unchanged; only reported for debugging. */
          if (DpsNeedLog(DPS_LOG_DEBUG)) DpsVarListReplaceInt(&Doc->Sections, "Index", Doc->Spider.index);
        } else if (!strcasecmp(rtok, "FOLLOW")) {
          /* Server value is left unchanged; only reported for debugging. */
          if (DpsNeedLog(DPS_LOG_DEBUG)) DpsVarListReplaceInt(&Doc->Sections, "Follow", Doc->Spider.follow);
        }
      }

    } else if (Doc->Spider.use_cookies && !strcasecmp(header_name, "Set-Cookie")) {
      DpsCookiesAddStr(Indexer, &Doc->CurURL, val, 1);
      return;
    }
  }

  /* A line without ':' is stored whole as the name with a placeholder value. */
  DpsVarListReplaceStr(&Doc->Sections, header_name, val ? val : "<NULL>");

  dps_snprintf(secname, sizeof(secname), "header.%s", header_name);
  secname[sizeof(secname) - 1] = '\0';

  if ((Sec = DpsVarListFind(&Doc->Sections, secname)) && val) {
    bzero((void*)&Item, sizeof(Item));
    Item.href = NULL;
    Item.str = val;
    Item.section = Sec->section;
    Item.section_name = secname;
    Item.strict = Sec->strict;
    Item.len = 0;
    (void)DpsTextListAdd(&Doc->TextList, &Item);
  }
}