// returns false on bad mime bool HttpMime::parse ( char *mime , long mimeLen , Url *url ) { // reset locUrl to 0 m_locUrl.reset(); // return if we have no valid complete mime if ( mimeLen == 0 ) return false; // status is on first line m_status = -1; // skip HTTP/x.x till we hit a space char *p = mime; char *pend = mime + mimeLen; while ( p < pend && !is_wspace_a(*p) ) p++; // then skip over spaces while ( p < pend && is_wspace_a(*p) ) p++; // return false on a problem if ( p == pend ) return false; // then read in the http status m_status = atol2 ( p , pend - p ); // if no Content-Type: mime field was provided, assume html m_contentType = CT_HTML; // assume default charset m_charset = NULL; m_charsetLen = 0; // set contentLen, lastModifiedDate, m_cookie p = mime; while ( p < pend ) { // compute the length of the string starting at p and ending // at a \n or \r long len = 0; while ( &p[len] < pend && p[len]!='\n' && p[len]!='\r' ) len++; // . if we could not find a \n or \r there was an error // . MIMEs must always end in \n or \r if ( &p[len] >= pend ) return false; // . stick a NULL at the end of the line // . overwrites \n or \r TEMPORARILY char c = p [ len ]; p [ len ] = '\0'; // parse out some meaningful data if ( strncasecmp ( p , "Content-Length:" ,15) == 0 ) { m_contentLengthPos = p + 15; m_contentLen = atol( m_contentLengthPos); } else if ( strncasecmp ( p , "Last-Modified:" ,14) == 0 ) { m_lastModifiedDate=atotime(p+14); // do not let them exceed current time for purposes // of sorting by date using datedb (see Msg16.cpp) time_t now = time(NULL); if (m_lastModifiedDate > now) m_lastModifiedDate = now; } else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 ) m_contentType = getContentTypePrivate ( p + 13 ); else if ( strncasecmp ( p , "Set-Cookie: " ,11) == 0 ) { m_cookie = p + 11; m_cookieLen = gbstrlen ( p + 11 ); } else if ( strncasecmp ( p , "Location:" , 9) == 0 ) { // point to it char *tt = p + 9; // skip if space if ( *tt == ' ' ) tt++; if ( *tt == ' ' ) tt++; // at least set this for Msg13.cpp to use m_locationField = tt; m_locationFieldLen = gbstrlen(tt); // . we don't add the "www." because of slashdot.com // . we skip initial spaces in this Url::set() routine if(url) m_locUrl.set ( url, p + 9, len - 9, false/*addWWW?*/); } else if ( strncasecmp ( p , "Content-Encoding:", 17) == 0 ) { //only support gzip now, it doesn't seem like servers //implement the other types much m_contentEncodingPos = p+17; if(strstr(m_contentEncodingPos, "gzip")) { m_contentEncoding = ET_GZIP; } else if(strstr(m_contentEncodingPos, "deflate")) { //zlib's compression m_contentEncoding = ET_DEFLATE; } } //else if ( strncasecmp ( p, "Cookie:", 7) == 0 ) // log (LOG_INFO, "mime: Got Cookie = %s", (p+7)); // re-insert the character that we replaced with a '\0' p [ len ] = c; // go to next line p += len; // skip over the cruft at the end of this line while ( p < pend && ( *p=='\r' || *p=='\n' ) ) p++; } return true; }
// . when the Conf::m_proxyIps parm is updated we call this to rebuild // s_iptab, our table of SpiderProxy instances, which has the proxies and // their performance statistics. // . we try to maintain stats of ip/ports that did NOT change when rebuilding. bool buildProxyTable ( ) { // scan the NEW list of proxy ip/port pairs in g_conf char *p = g_conf.m_proxyIps.getBufStart(); HashTableX tmptab; tmptab.set(8,0,16,NULL,0,false,"tmptab"); // scan the user inputted space-separated list of ip:ports // (optional username:password@ip:port) for ( ; *p ; ) { // skip white space if ( is_wspace_a(*p) ) { p++; continue; } // skip http:// if ( strncasecmp(p,"http://",7) == 0 ) { p += 7; continue; } // scan in an ip:port char *s = p; char *portStr = NULL; int32_t dc = 0, pc = 0, gc = 0, bc = 0; const char *msg; char *usernamePwd = NULL; int32_t usernamePwdLen = 0; char *ipStart = p; // scan all characters until we hit \0 or another whitespace for ( ; *s && !is_wspace_a(*s); s++) { if ( *s == '@' ) { // must be username:pwd if ( pc != 1 ) { msg = "bad username:password"; goto hadError; } usernamePwd = p; usernamePwdLen = s - p; if ( usernamePwdLen >= MAXUSERNAMEPWD-2 ) { msg = "username:password too long"; goto hadError; } dc = 0; gc = 0; bc = 0; pc = 0; portStr = NULL; ipStart = s+1; continue; } if ( *s == '.' ) { dc++; continue; } if ( *s == ':' ) { portStr=s; pc++; continue; } if ( is_digit(*s) ) { gc++; continue; } bc++; continue; } // ensure it is a legit ip:port combo msg = NULL; if ( gc < 4 ) msg = "not enough digits for an ip"; if ( pc > 1 ) msg = "too many colons"; if ( dc != 3 ) msg = "need 3 dots for an ip address"; if ( bc ) msg = "got illegal char in ip:port listing"; if ( msg ) { hadError: char c = *s; *s = '\0'; log("buf: %s for %s",msg,p); *s = c; return false; } // convert it int32_t iplen = s - ipStart; if ( portStr ) iplen = portStr - ipStart; int32_t ip = atoip(ipStart,iplen); // another sanity check if ( ip == 0 || ip == -1 ) { log("spider: got bad proxy ip for %s",p); return false; } // and the port default is 80 int32_t port = 80; if ( portStr ) port = atol2(portStr+1,s-portStr-1); if ( port < 0 || port > 65535 ) { log("spider: got bad proxy port for %s",p); return false; } // . we got a legit ip:port // . see if already in our table uint64_t ipKey = (uint32_t)ip; ipKey <<= 16; ipKey |= (uint16_t)(port & 0xffff); // also store into tmptable to see what we need to remove tmptab.addKey(&ipKey); // see if in table int32_t islot = s_iptab.getSlot( &ipKey); // advance p p = s; // if in there, keep it as is if ( islot >= 0 ) continue; // otherwise add new entry SpiderProxy newThing; memset ( &newThing , 0 , sizeof(SpiderProxy)); newThing.m_ip = ip; newThing.m_port = port; newThing.m_lastDownloadTookMS = -1; newThing.m_lastSuccessfulTestMS = -1; gbmemcpy(newThing.m_usernamePwd,usernamePwd,usernamePwdLen); // ensure it is NULL terminated newThing.m_usernamePwd[usernamePwdLen] = '\0'; if ( ! s_iptab.addKey ( &ipKey, &newThing ) ) return false; } redo: int32_t removed = 0; // scan all SpiderProxies in tmptab for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) { // skip empty buckets in hashtable s_iptab if ( ! s_iptab.m_flags[i] ) continue; // get the key int64_t key = *(int64_t *)s_iptab.getKeyFromSlot(i); // must also exist in tmptab, otherwise it got removed by user if ( tmptab.isInTable ( &key ) ) continue; // skip if not in table if ( s_iptab.getSlot ( &key ) < 0 ) { log("sproxy: iptable hashing messed up"); continue; } // shoot, it got removed. not in the new list of ip:ports s_iptab.removeKey ( &key ); removed++; // hashtable is messed up now, start over //goto redo; } if ( removed ) goto redo; return true; }
// returns false on bad mime bool HttpMime::parse(char *mime, int32_t mimeLen, Url *url) { #ifdef _VALGRIND_ VALGRIND_CHECK_MEM_IS_DEFINED(mime,mimeLen); #endif // reset locUrl to 0 m_locUrl.reset(); // return if we have no valid complete mime if (mimeLen == 0) { return false; } // status is on first line m_status = -1; // skip HTTP/x.x till we hit a space char *p = mime; char *pend = mime + mimeLen; while (p < pend && !is_wspace_a(*p)) p++; // then skip over spaces while (p < pend && is_wspace_a(*p)) p++; // return false on a problem if (p == pend) return false; // then read in the http status m_status = atol2(p, pend - p); // if no Content-Type: mime field was provided, assume html m_contentType = CT_HTML; // assume default charset m_charset = NULL; m_charsetLen = 0; // skip over first line getNextLine(); while (getNextLine()) { const char *field = NULL; size_t fieldLen = 0; if (getField(&field, &fieldLen)) { if (parseContentEncoding(field, fieldLen)) { continue; } if (parseContentLength(field, fieldLen)) { continue; } if (parseContentType(field, fieldLen)) { continue; } if (parseLocation(field, fieldLen, url)) { continue; } if (parseSetCookie(field, fieldLen)) { continue; } // add parsing of other header here } } return true; }