// hash the space (or +) separated list of numbers in this string //template<class Key_t, class Val_t> //bool HashTableT<Key_t,Val_t>::hashFromString ( HashTableT *ht , char *x ) { bool hashFromString ( HashTableT<long long,char> *ht , char *x ) { if ( ! x ) return true; char *xend = x + gbstrlen(x); long n = 1; for ( char *s = x ; s < xend ; s++ ) // i am assuming this is ascii here! if (is_wspace_a(*s)||*s == '+') n++; // double # slots to nd*2 so that hashtable is somewhat sparse --> fast if ( ! ht->set ( n * 2 , NULL , 0 , false ) ) return false; // now populate with the docids for ( char *s = x ; s < xend ; ) { // skip the plusses while ( s < xend && (is_wspace_a(*s) || *s == '+') ) s++; // are we done? if ( s >= xend ) break; // get the docid, a long long (64 bits) long long d = atoll ( s ); // add it, should never fail! if ( ! ht->addKey ( d , 1 ) ) return false; // skip till + while ( s < xend && (*s != '+' && !is_wspace_a(*s)) ) s++; // are we done? if ( s >= xend ) break; } return true; }
double atof2 ( const char *s, int32_t len ) { // skip over spaces const char *end = s + len; while ( s < end && is_wspace_a ( *s ) ) { s++; len--; } // return 0 if all spaces if ( s == end ) return 0; char tmpBuf[128]; if ( len >= 128 ) len = 127; //strncpy ( dst , s , len ); const char *p = s; const char *srcEnd = s + len; char *dst = tmpBuf; // remove commas for ( ; p < srcEnd ; p++ ) { // skip commas if ( *p == ',' ) continue; // otherwise store it *dst++ = *p; } // null term *dst = '\0'; //buf[len] = '\0'; return atof ( tmpBuf ); }
bool HttpMime::getField(const char **field, size_t *fieldLen) { size_t currentLinePos = m_valueStartPos; const char *colonPos = (const char *)memchr(m_currentLine + currentLinePos, ':', m_currentLineLen); // no colon if (colonPos == NULL) { return false; } currentLinePos = colonPos - m_currentLine; m_valueStartPos = currentLinePos + 1; *field = m_currentLine; *fieldLen = currentLinePos; // strip ending whitespaces while (*fieldLen > 0 && is_wspace_a(m_currentLine[*fieldLen - 1])) { --(*fieldLen); } logTrace(g_conf.m_logTraceHttpMime, "field='%.*s'", static_cast<int>(*fieldLen), *field); return (*fieldLen > 0); }
bool loadUrls ( ) { static bool s_loaded = false; if ( s_loaded ) return true; s_loaded = true; // use injectme3 file s_ubuf1.load("./injectme3"); // scan for +++URL: xxxxx char *s = s_ubuf1.getBufStart(); for ( ; *s ; s++ ) { if ( strncmp(s,"+++URL: ",8) ) continue; // got one // \0 term it for s_contentPtrs below *s = '\0'; // find end of it s += 8; char *e = s; for ( ; *e && ! is_wspace_a(*e); e++ ); // null term it if ( *e ) *e = '\0'; // store ptr s_ubuf2.pushLong((long)s); // skip past that s = e; // point to content s_cbuf2.pushLong((long)(s+1)); } // make array of url ptrs s_urlPtrs = (char **)s_ubuf2.getBufStart(); s_contentPtrs= (char **)s_cbuf2.getBufStart(); return true; }
bool isInWhiteSpaceList ( char *p , char *buf ) { if ( ! p ) return false; char *match = strstr ( buf , p ); if ( ! match ) return false; int32_t len = gbstrlen(p); // ensure book-ended by whitespace if ( match && (match == buf || is_wspace_a(match[-1])) && (!match[len] || is_wspace_a(match[len])) ) return true; // no match return false; }
bool endsInCurly ( char *s , int32_t slen ) { char *e = s + slen - 1; // don't backup more than 30 chars char *m = e - 30; if ( m < s ) m = s; // \0? if ( e > m && *e == '\0' ) e--; // scan backwards, skipping whitespace for ( ; e > m && is_wspace_a(*e) ; e-- ); // should be a } now to be valid json if ( e >= m && *e == '}' ) return true; return false; }
int64_t atoll2 ( const char *s, int32_t len ) { // skip over spaces const char *end = s + len; while ( s < end && is_wspace_a ( *s ) ) s++; // return 0 if all spaces if ( s == end ) return 0; int32_t i = 0; int64_t val = 0LL; bool negative = false; if ( s[0] == '-' ) { negative = true; i++; } while ( i < len && is_digit(s[i]) ) val = val * 10LL + ( s[i++] - '0'); if ( negative ) return -val; return val; }
long HttpRequest::getLong ( char *field , long defaultLong ) { long len; char *value = getValue ( field, &len, NULL ); // return default if no match if ( ! value || len == 0 ) return defaultLong; // otherwise, it's a match char c = value[len]; value[len] = '\0'; long res = atol ( value ); value[len] = c; if ( res == 0 ) { // may be an error. if so return the default long i = 0; while ( i < len && is_wspace_a(value[i]) ) i++; if ( i < len && (value[i] == '-' || value[i] == '+') ) i++; if ( i >= len || !is_digit(value[i]) ) return defaultLong; } return res; }
bool HttpMime::getValue(const char **value, size_t *valueLen) { // strip starting whitespaces while (is_wspace_a(m_currentLine[m_valueStartPos]) && (m_valueStartPos < m_currentLineLen)) { ++m_valueStartPos; } *value = m_currentLine + m_valueStartPos; *valueLen = m_currentLineLen - m_valueStartPos; const char *semicolonPos = (const char *)memchr(*value, ';', *valueLen); if (semicolonPos) { // value should end at semicolon if present *valueLen = semicolonPos - *value; m_attributeStartPos = semicolonPos - m_currentLine + 1; } logTrace(g_conf.m_logTraceHttpMime, "value='%.*s'", static_cast<int>(*valueLen), *value); return (*valueLen > 0); }
double HttpRequest::getDouble ( char *field , double defaultDouble ) { long len; char *value = getValue ( field, &len, NULL ); // return default if no match if ( ! value || len == 0 ) return defaultDouble; // otherwise, it's a match char c = value[len]; value[len] = '\0'; double res = strtod ( value , NULL ); value[len] = c; if ( res == +0.0 ) { // may be an error. if so return the default long i = 0; while ( i < len && is_wspace_a(value[i]) ) i++; if ( i < len && (value[i] == '-' || value[i] == '+' || value[i] == '.') ) i++; if ( i >= len || !is_digit(value[i]) ) return defaultDouble; } return res; }
bool SearchInput::set ( TcpSocket *sock , HttpRequest *r , Query *q ) { // store list of collection #'s to search here. usually just one. m_collnumBuf.reset(); // zero out everything, set niceness to 0 clear ( 0 ) ; // save it now m_socket = sock; // get coll rec long collLen9; char *coll9 = r->getString ( "c" , &collLen9 ); m_firstCollnum = -1; CollectionRec *cr = NULL; // now convert list of space-separated coll names into list of collnums char *p = coll9; // if no collection list was specified look for "token=" and // use those to make collections. hack for diffbot. char *token = r->getString("token",NULL); // find all collections under this token for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { // must not have a "&c=" if ( p ) break; // must have a "&token=" if ( ! token ) break; // skip if empty CollectionRec *tmpcr = g_collectiondb.m_recs[i]; if ( ! tmpcr ) continue; // skip if does not match token if ( strcmp(token,tmpcr->m_diffbotToken.getBufStart()) ) continue; // . we got a match // . set initial junk if ( ! cr ) { cr = tmpcr; m_firstCollnum = tmpcr->m_collnum; } // save the collection # if ( ! m_collnumBuf.safeMemcpy ( &tmpcr->m_collnum, sizeof(collnum_t) ) ) return false; } // if we had a "&c=..." in the GET request process that if ( p ) { loop: char *end = p; for ( ; *end && ! is_wspace_a(*end) ; end++ ); // temp null char c = *end; *end = '\0'; CollectionRec *tmpcr = g_collectiondb.getRec ( p ); // set defaults from the FIRST one if ( tmpcr && ! cr ) { cr = tmpcr; m_firstCollnum = tmpcr->m_collnum; } if ( ! 
tmpcr ) { g_errno = ENOCOLLREC; log("query: missing collection %s",p); g_msg = " (error: no such collection)"; return false; } // add to our list if (!m_collnumBuf.safeMemcpy(&cr->m_collnum,sizeof(collnum_t))) return false; // restore the \0 character we wrote in there *end = c; // advance p = end; // skip to next collection name if there is one while ( *p && is_wspace_a(*p) ) p++; // now add it's collection # to m_collnumBuf if there if ( *p ) goto loop; } //if (! coll){coll = g_conf.m_defaultColl; collLen = gbstrlen(coll); } //if ( ! coll ) // coll = g_conf.getDefaultColl(r->getHost(), r->getHostLen()); //if ( ! coll || ! coll[0] ) // coll = "main"; //if ( ! coll ) { g_errno = ENOCOLLREC; return false; } //collLen = gbstrlen(coll); //CollectionRec *cr = g_collectiondb.getRec ( coll9 ); //if ( ! cr ) { // g_errno = ENOCOLLREC; // g_msg = " (error: no such collection)"; // return false; //} // set all to 0 just to avoid any inconsistencies //long size = (char *)&m_END_TEST - (char *)&m_START; //memset ( this , 0x00 , size ); //setToDefaults( cr , 0 ); // niceness m_cr = cr; if ( ! cr ) { log("si: collection does not exist"); g_errno = ENOCOLLREC; return false; } //m_coll2 = m_cr->m_coll; //m_collLen2 = gbstrlen(m_coll2); // from ::reset() m_languageWeightFactor = 0.33; // Set IP for language detection. 
// (among other things) if ( sock ) m_queryIP = sock->m_ip; else m_queryIP = 0; m_hr = r; // keep ptr to the query class to use m_q = q; // set this here since its size can be variable m_sq = r->getString("sq",&m_sqLen); // negative docids m_noDocIds = r->getString("nodocids",&m_noDocIdsLen); // negative sites m_noSiteIds = r->getString("nositeids",&m_noSiteIdsLen); // Msg5e calls Msg40 with this set to true in the searchInput // so it can analyze the entire pages of each search result so it // can find the article start/end tag sequence indicators m_getTitleRec = r->getLong("gettrs",0); m_getSitePops = r->getLong("getsitepops",0 ); // does this collection ban this IP? /* long encapIp = 0; m if (! cr->hasSearchPermission ( sock, encapIp ) ) { g_errno = ENOPERM; g_msg = " (error: permission denied)"; return false; } */ // set all search parms in SearchInput to defaults for ( long i = 0 ; i < g_parms.m_numSearchParms ; i++ ) { Parm *m = g_parms.m_searchParms[i]; // sanity if ( m->m_soff < 0 ) { char *xx=NULL;*xx=0; } char *x = (char *)this + m->m_soff; // what is the def val ptr char *def = NULL; if ( m->m_off >= 0 && m->m_obj == OBJ_COLL ) def = ((char *)cr) + m->m_off; else if ( m->m_off >= 0 && m->m_obj == OBJ_CONF ) def = ((char *)&g_conf) + m->m_off; // set it based on type if ( m->m_type == TYPE_LONG ) { long v = 0; if ( def ) v = *(long *)def; else if ( m->m_def ) v = atol(m->m_def); *(long *)x = v; } else if ( m->m_type == TYPE_BOOL ) { long v = 0; if ( def ) v = *(char *)def; else if ( m->m_def ) v = atol(m->m_def); // sanity test! 
if ( v != 0 && v != 1 ) log("query: got non-bool default " "for bool parm %s",m->m_title); if ( v ) *(char *)x = 1; else *(char *)x = 0; } else if ( m->m_type == TYPE_CHAR ) { if ( def ) *(char *)x = *(char *)def; else if ( m->m_def ) *(char *)x = atol(m->m_def); } else if ( m->m_type == TYPE_FLOAT ) { float v = 0; if ( def ) v = *(float *)def; else if ( m->m_def ) v = atof(m->m_def); *(float *)x = (float)v; } else if ( m->m_type == TYPE_DOUBLE ) { double v = 0; if ( def ) v = *(double *)def; else if ( m->m_def ) v = atof(m->m_def); *(double *)x = (double)v; } else if ( m->m_type == TYPE_LONG_LONG ) { long long v = 0; if ( def ) v = *(long long *)def; else if ( m->m_def ) v = atoll(m->m_def); *(long long *)x = (long long)v; } else if ( m->m_type == TYPE_STRING || m->m_type == TYPE_STRINGBOX ) { //if ( m->m_cgi && strcmp ( m->m_cgi, "erpc" ) == 0 ) // log("hey1"); //if ( m->m_cgi && strcmp ( m->m_scgi, "q" ) == 0 ) // log("hey1"); char *v = NULL; if ( def ) v = (char *)def; else if ( m->m_def ) v = m->m_def; *(char **)x = v; // set the length if ( ! v ) *(long *)(x-4) = 0; else *(long *)(x-4) = gbstrlen(v); } } // this is just used to determine in PageResults.cpp if we should // show admin knobs next to each result... // default to off for now. default back on. m_isAdmin = r->getLong("admin",1); //if ( m_isAdmin ) m_isAdmin = g_users.hasPermission ( r,PAGE_MASTER); // local ip? if ( ! r->isLocal() ) m_isAdmin = 0; // default set does not take into account g_conf, // so we will take care of that here ourselves... m_adFeedEnabled = g_conf.m_adFeedEnabled; //m_excludeLinkText = g_conf.m_excludeLinkText; //m_excludeMetaText = g_conf.m_excludeMetaText; // we need to get some cgi values in order to correct the defaults // based on if we're doing an xml feed, have a site: query, etc. 
//long xml = r->getLong ( "xml" , 0 ); // was "raw" long siteLen = 0; r->getString ("site",&siteLen); long sitesLen = 0; char *sites = r->getString ("sites",&sitesLen,NULL); // save it if there if ( sites && sitesLen > 0 && ( ! m_whiteListBuf.safeStrcpy(sites)|| ! m_whiteListBuf.nullTerm() ) ) return log("query: unable to strcpy whitelist"); char format = getFormatFromRequest ( r ); // now override automatic defaults for special cases if ( format != FORMAT_HTML ) { m_familyFilter = 0; // this is causing me a headache when on when i dont know it m_restrictIndexdbForQuery = false; // this is hackish if ( r->getLong("rt",0) ) m_restrictIndexdbForQuery=false; m_numTopicsToDisplay = 0; m_doQueryHighlighting = 0; m_spellCheck = 0; m_refs_numToGenerate = 0; m_refs_docsToScan = 0; // default scoring info to off m_getDocIdScoringInfo = false; } else if ( m_siteLen > 0 ) { m_restrictIndexdbForQuery = false; m_doSiteClustering = false; m_ipRestrictForTopics = false; } else if ( m_whiteListBuf.length() > 0 ) { m_ipRestrictForTopics = false; } m_doIpClustering = false; //m_sitesQueryLen = 0; // set the user ip, "uip" long uip = m_queryIP; char *uipStr = m_hr->getString ("uip" , NULL ); long tmpIp = 0; if ( uipStr ) tmpIp = atoip(uipStr); if ( tmpIp ) uip = tmpIp; // // // BEGIN MAIN PARM SETTING LOOP // // // loop through all possible cgi parms to set SearchInput for ( long i = 0 ; i < g_parms.m_numSearchParms ; i++ ) { Parm *m = g_parms.m_searchParms[i]; char *x = (char *)this + m->m_soff; // what is the parm's cgi name? char *cgi = m->m_scgi; if ( ! cgi ) cgi = m->m_cgi; // sanity check if ( ! m->m_sparm ) { log("query: Failed search input sanity check."); char *xx = NULL; *xx = 0; } // . break it down by type now // . 
get it from request and store it in SearchInput if ( m->m_type == TYPE_LONG ) { // default was set above long def = *(long *)x; // assume default long v = def; // but cgi parms override cookie v = r->getLong ( cgi , v ); // but if its a privledged parm and we're not an admin // then do not allow overrides, but m_priv of 3 means // to not display for clients, but to allow overrides if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def; // bounds checks if ( v < m->m_smin ) v = m->m_smin; if ( v > m->m_smax ) v = m->m_smax; if ( m->m_sminc >= 0 ) { long vmin = *(long *)((char *)cr+m->m_sminc); if ( v < vmin ) v = vmin; } if ( m->m_smaxc >= 0 ) { long vmax = *(long *)((char *)cr+m->m_smaxc); if ( v > vmax ) v = vmax; } // set it *(long *)x = v; // do not print start result num (m->m_sprop is 0 for // "s" now) //if ( cgi[0] == 's' && cgi[1] == '\0' ) continue; // should we propagate it? true by default //if ( ! m->m_sprop ) continue; // if it is the same as its default, and the default is // always from m_def and never from the CollectionRec, // then do not both storing it in here! what's the // point? if ( v == def && m->m_off < 0 ) continue; // if not default do not propagate if ( v == def ) continue; // . include for sure if explicitly provided // . vp will be NULL if "cgi" is not explicitly listed // as a cgi parm. otherwise, even if *vp == '\0', vp // is non-NULL. // . crap, it can be in the cookie now //char *vp = r->getValue(cgi, NULL, NULL); // if not given at all, do not propagate //if ( ! 
vp ) continue; // store in up if different from default, even if // same as default ("def") because default may be // changed by the admin since m->m_off >= 0 //if ( m->m_sprpg && up + gbstrlen(cgi) + 20 < upend ) // up += sprintf ( up , "%s=%li&", cgi , v ); //if ( m->m_sprpp && pp + gbstrlen(cgi) + 80 < ppend ) // pp += sprintf ( pp , "<input type=hidden " // "name=%s value=\"%li\">\n", // cgi , v ); } else if ( m->m_type == TYPE_LONG_LONG ) { // default was set above long def = *(long long *)x; // assume default long long v = def; // but cgi parms override cookie v = r->getLongLong ( cgi , v ); // but if its a privledged parm and we're not an admin // then do not allow overrides, but m_priv of 3 means // to not display for clients, but to allow overrides if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def; // set it *(long long *)x = v; // if it is the same as its default, and the default is // always from m_def and never from the CollectionRec, // then do not both storing it in here! what's the // point? if ( v == def && m->m_off < 0 ) continue; // if not default do not propagate if ( v == def ) continue; } else if ( m->m_type == TYPE_FLOAT ) { // default was set above float def = *(float *)x; // get overriding from http request, if any float v; // but if its a privledged parm and we're not an admin // then do not allow overrides if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def; else v = r->getFloat( cgi , def ); // bounds checks if ( v < m->m_smin ) v = m->m_smin; if ( v > m->m_smax ) v = m->m_smax; if ( m->m_sminc >= 0 ) { float vmin = *(float *)((char *)cr+m->m_sminc); if ( v < vmin ) v = vmin; } if ( m->m_smaxc >= 0 ) { float vmax = *(float *)((char *)cr+m->m_smaxc); if ( v > vmax ) v = vmax; } // set it *(float *)x = v; // do not print start result num //if ( cgi[0] == 's' && cgi[1] == '\0' ) continue; // include for sure if explicitly provided char *vp = r->getValue(cgi, NULL, NULL); if ( ! vp ) continue; // unchanged from default? 
if ( v == def ) continue; // store in up different from default //if ((vp||v!= def) && up + gbstrlen(cgi)+20 < upend ) // up += sprintf ( up , "%s=%f&", cgi , v ); //if ((vp||v!= def) && pp + gbstrlen(cgi)+20 < ppend ) // pp += sprintf ( pp , "<input type=hidden " // "name=%s value=\"%f\">\n", // cgi , v ); } else if ( m->m_type == TYPE_DOUBLE ) { // default was set above double def = *(double *)x; // get overriding from http request, if any double v; // but if its a privledged parm and we're not an admin // then do not allow overrides if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def; else v = r->getDouble( cgi , def ); // bounds checks if ( v < m->m_smin ) v = m->m_smin; if ( v > m->m_smax ) v = m->m_smax; if ( m->m_sminc >= 0 ) { double vmin=*(double *)((char *)cr+m->m_sminc); if ( v < vmin ) v = vmin; } if ( m->m_smaxc >= 0 ) { double vmax=*(double *)((char *)cr+m->m_smaxc); if ( v > vmax ) v = vmax; } // set it *(double *)x = v; // include for sure if explicitly provided char *vp = r->getValue(cgi, NULL, NULL); if ( ! vp ) continue; // unchanged from default? if ( v == def ) continue; } else if ( m->m_type == TYPE_BOOL ) { // default was set above long def = *(char *)x; if ( def != 0 ) def = 1; // normalize // assume default long v = def; // cgi parms override cookie v = r->getBool ( cgi , v ); // but if no perm, use default if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def; if ( v != 0 ) v = 1; // normalize *(char *)x = v; // don't propagate rcache //if ( ! strcmp(cgi,"rcache") ) continue; // should we propagate it? true by default //if ( ! m->m_sprop ) continue; // if it is the same as its default, and the default is // always from m_def and never from the CollectionRec, // then do not both storing it in here! what's the // point? if ( v == def && m->m_off < 0 ) continue; // if not default do not propagate if ( v == def ) continue; // . include for sure if explicitly provided // . vp will be NULL if "cgi" is not explicitly listed // as a cgi parm. 
otherwise, even if *vp == '\0', vp // is non-NULL. // . crap, it can be in the cookie now! //char *vp = r->getValue(cgi, NULL, NULL); // if not given at all, do not propagate //if ( ! vp ) continue; // store in up if different from default, even if // same as default ("def") because default may be // changed by the admin since m->m_off >= 0 //if ( m->m_sprpg && up + gbstrlen(cgi) + 10 < upend ) // up += sprintf ( up , "%s=%li&", cgi , v ); //if ( m->m_sprpp && pp + gbstrlen(cgi) + 80 < ppend ) // pp += sprintf ( pp , "<input type=hidden " // "name=%s value=\"%li\">\n", // cgi , v ); } else if ( m->m_type == TYPE_CHAR ) { // default was set above char def = *(char *)x; *(char *)x = r->getLong ( cgi, def ); // use this long v = *(char *)x; // store in up if different from default, even if // same as default ("def") because default may be // changed by the admin since m->m_off >= 0. nah, // let's try to reduce cgi parm pollution... if ( v == def ) continue; //if ( m->m_sprpg && up + gbstrlen(cgi) + 10 < upend ) // up += sprintf ( up , "%s=%li&", cgi , v ); //if ( m->m_sprpp && pp + gbstrlen(cgi) + 80 < ppend ) // pp += sprintf ( pp , "<input type=hidden " // "name=%s value=\"%li\">\n", // cgi , v ); } else if ( m->m_type == TYPE_STRING || m->m_type == TYPE_STRINGBOX ) { //if ( m->m_cgi && strcmp ( m->m_cgi, "qlang" ) == 0 ) // log("hey2"); char *def = *(char **)x; // get overriding from http request, if any long len = 0; char *v = NULL; // . cgi parms override cookie // . is this url encoded? v = r->getString ( cgi , &len , v ); // if not specified explicitly, default it and continue if ( ! v ) { // sanity if ( ! 
def ) def = ""; *(char **)x = def; // length preceeds char ptr in SearchInput *(long *)(x - 4) = gbstrlen(def); continue; } // if something was specified, override, it might // be length zero, too *(char **)x = v; // length preceeds char ptr in SearchInput *(long *)(x - 4) = len; // do not store if query, that needs to be last so // related topics can append to it //if ( cgi[0] == 'q' && cgi[1] == '\0' ) continue; // should we propagate it? true by default //if ( ! m->m_sprop ) continue; // if not given at all, do not propagate //if ( ! vp ) continue; // if it is the same as its default, and the default is // always from m_def and never from the CollectionRec, // then do not both storing it in here! what's the // point? //if ( v && v == def && !strcmp(def,v) && m->m_off < 0) // continue; // Need to set qcs based on page encoding... // not propagated if (!strncmp(cgi, "qcs", 3)) continue; // do not propagate defaults if ( v == def ) continue; // store in up if different from default, even if // same as default ("def") because default may be // changed by the admin since m->m_off >= 0 //if( m->m_sprpg && up+gbstrlen(cgi)+len+6 < upend ) { // up += sprintf ( up , "%s=", cgi ); // up += urlEncode ( up , upend-up-2 , v , len ); // *up++ = '&'; //} // propogate hidden inputs //if ( m->m_sprpp && up+gbstrlen(cgi)+len+80 < upend ) // pp += sprintf ( pp , "<input type=hidden " // "name=%s value=\"%s\">\n", // cgi , v ); } } // now add the special "qh" parm whose default value changes // depending on if we are widget related or not long qhDefault = 1; m_doQueryHighlighting = r->getLong("qh",qhDefault); // // TODO: use Parms.cpp defaults // TopicGroup *tg = &m_topicGroups[0]; // // // gigabits // // tg->m_numTopics = 50; tg->m_maxTopics = 50; tg->m_docsToScanForTopics = m_docsToScanForTopics; tg->m_minTopicScore = 0; tg->m_maxWordsPerTopic = 6; tg->m_meta[0] = '\0'; tg->m_delimeter = '\0'; tg->m_useIdfForTopics = false; tg->m_dedup = true; // need to be on at least 2 pages! 
tg->m_minDocCount = 2; tg->m_ipRestrict = true; tg->m_dedupSamplePercent = 80; tg->m_topicRemoveOverlaps = true; tg->m_topicSampleSize = 4096; // max sequential punct chars allowedin a topic tg->m_topicMaxPunctLen = 1; m_numTopicGroups = 1; // use "&dg=1" to debug gigabits m_debugGigabits = r->getLong("dg",0); // override m_format = format; // . omit scoring info from the xml feed for now // . we have to roll this out to gk144 net i think //if ( m_format != FORMAT_HTML ) // m_getDocIdScoringInfo = 0; // turn off by default! if ( ! r->getLong("gigabits",0) ) { m_numTopicGroups = 0; } ////////////////////////////////////// // // transform input into classes // ////////////////////////////////////// // USER_ADMIN, ... m_username = g_users.getUsername(r); // if collection is NULL default to one in g_conf //if ( ! m_coll2 || ! m_coll2[0] ) { // //m_coll = g_conf.m_defaultColl; // m_coll2 = g_conf.getDefaultColl(r->getHost(), r->getHostLen()); // m_collLen2 = gbstrlen(m_coll2); //} // reset this m_gblang = 0; // use gblang then! long gglen; char *gg = r->getString ( "clang" , &gglen , NULL ); if ( gg && gglen > 1 ) m_gblang = getLanguageFromAbbr(gg); // allow for "qlang" if still don't have it //long gglen2; //char *gg2 = r->getString ( "qlang" , &gglen2 , NULL ); //if ( m_gblang == 0 && gg2 && gglen2 > 1 ) // m_gblang = getLanguageFromAbbr(gg2); // fix query by removing lang:xx from ask.com queries //char *end = m_query + m_queryLen -8; //if ( m_queryLen > 8 && m_query && end > m_query && // strncmp(end," lang:",6)==0 ) { // char *asklang = m_query+m_queryLen - 2; // m_gblang = getLanguageFromAbbr(asklang); // m_queryLen -= 8; // m_query[m_queryLen] = 0; // //} // . returns false and sets g_errno on error // . sets m_qbuf1 and m_qbuf2 if ( ! 
setQueryBuffers (r) ) return log("query: setQueryBuffers: %s",mstrerror(g_errno)); /* --- Virtual host language detection --- */ if(r->getHost()) { bool langset = getLanguageFromAbbr(m_defaultSortLanguage); char *cp; if(!langset && (cp = strrchr(r->getHost(), '.'))) { uint8_t lang = getLanguageFromUserAgent(++cp); if(lang) { // char langbuf[128]; // sprintf(langbuf, "qlang=%s\0", getLanguageAbbr(lang)); //m_defaultSortLanguage = getLanguageAbbr(lang); char *tmp = getLanguageAbbr(lang); strncpy(m_defaultSortLanguage, tmp, 6); // log(LOG_INFO, // getLanguageString(lang), r->getHost(), this); } } } /* --- End Virtual host language detection --- */ //char *qs1 = m_defaultSortLanguage; // this overrides though //long qlen2; //char *qs2 = r->getString ("qlang",&qlen2,NULL); //if ( qs2 ) qs1 = qs2; //m_queryLang = getLanguageFromAbbr ( qs1 ); m_queryLang = detectQueryLanguage(); char *qs1 = getLangAbbr(m_queryLang); log("query: using default lang of %s",getLangAbbr(m_queryLang)); if ( qs1 && qs1[0] && ! m_queryLang ) log("query: qlang of \"%s\" is NOT SUPPORTED",qs1); char *prepend = r->getString("prepend",NULL,NULL); // . the query to use for highlighting... can be overriden with "hq" // . we need the language id for doing synonyms if ( prepend && prepend[0] ) m_hqq.set2 ( prepend , m_queryLang , true ); else if ( m_highlightQuery && m_highlightQuery[0] ) m_hqq.set2 ( m_highlightQuery , m_queryLang , true ); else if ( m_query && m_query[0] ) m_hqq.set2 ( m_query , m_queryLang , true ); // log it here log("query: got query %s",m_sbuf1.getBufStart()); // . now set from m_qbuf1, the advanced/composite query buffer // . returns false and sets g_errno on error (ETOOMANYOPERANDS) if ( ! 
m_q->set2 ( m_sbuf1.getBufStart(), m_queryLang , m_queryExpansion ) ) { g_msg = " (error: query has too many operands)"; return false; } if ( m_q->m_truncated && m_q->m_isBoolean ) { g_errno = ETOOMANYOPERANDS; g_msg = " (error: query has too many operands)"; return false; } // do not allow querier to use the links: query operator unless they // are admin or the search controls explicitly allow links: //if ( m_q->m_hasLinksOperator && ! m_isAdmin && // !cr->m_allowLinksSearch ) { // g_errno = ENOPERM; // g_msg = " (error: permission denied)"; // return false; //} // miscellaneous m_showBanned = false; //if ( m_isAdmin ) m_showBanned = true; // admin can say &sb=0 explicitly to not show banned results // . if you are searching a diffbot collection, you are the admin // i guess... if ( m_isAdmin || cr->m_isCustomCrawl ) m_showBanned = r->getLong("sb",m_showBanned); if ( m_q->m_hasUrlField ) m_ipRestrictForTopics = false; if ( m_q->m_hasIpField ) { m_ipRestrictForTopics = false; //if( m_isAdmin ) m_showBanned = true; } if ( m_q->m_hasPositiveSiteField ) { m_ipRestrictForTopics = false; m_doSiteClustering = false; } if ( m_q->m_hasQuotaField ) { m_doSiteClustering = false; m_doDupContentRemoval = false; } m_familyFilter = r->getLong("ff",0); long codeLen; char *code = r->getString ("code",&codeLen,NULL); // set m_endUser if ( ! codeLen || ! 
code || strcmp(code,"gbfront")==0 ) m_endUser = true; else m_endUser = false; if(codeLen && !m_endUser) { m_maxResults = cr->m_maxSearchResultsForClients; } else { m_maxResults = cr->m_maxSearchResults; } // don't let admin bewilder himself if ( m_maxResults < 1 ) m_maxResults = 500; // we can't get this kind of constraint from generic Parms routines if ( m_firstResultNum + m_docsWanted > m_maxResults ) m_firstResultNum = m_maxResults - m_docsWanted; if(m_firstResultNum < 0) m_firstResultNum = 0; // DEBUG: temp hack // static bool first = true; // if ( first ) { // first = false; // m_firstResultNum = 10; // } // if useCache is -1 then pick a default value if ( m_useCache == -1 ) { // assume yes as default m_useCache = 1; // . if query has url: or site: term do NOT use cache by def. // . however, if spider is off then use the cache by default if ( g_conf.m_spideringEnabled ) { if ( m_q->m_hasPositiveSiteField ) m_useCache = 0; else if ( m_q->m_hasIpField ) m_useCache = 0; else if ( m_q->m_hasUrlField ) m_useCache = 0; else if ( m_siteLen > 0 ) m_useCache = 0; else if ( m_whiteListBuf.length() ) m_useCache = 0; else if ( m_urlLen > 0 ) m_useCache = 0; } } // never use cache if doing a rerank (msg3b) //if ( m_rerankRuleset >= 0 ) m_useCache = 0; bool readFromCache = false; if ( m_useCache == 1 ) readFromCache = true; if ( m_rcache == 0 ) readFromCache = false; if ( m_useCache == 0 ) readFromCache = false; // if useCache is false, don't write to cache if it was not specified if ( m_wcache == -1 ) { if ( m_useCache == 0 ) m_wcache = 0; else m_wcache = 1; } // save it m_rcache = readFromCache; /* m_language = 0; // convert m_languageCode to a number for m_language if ( m_languageCode ) { m_language = (unsigned char)atoi(m_languageCode); if ( m_language == 0 ) m_language = getLanguageFromAbbr(m_languageCode); } */ // a hack for buzz for backwards compatibility //if ( strstr ( m_q->m_orig,"gbkeyword:r36p1" ) ) // m_ruleset = 36; // // . turn this off for now // . 
it is used in setClusterLevels() to use clusterdb to filter our // search results via Msg39, so it is not the most efficient. // . plus i am deleting most foreign language pages from the index // so we can just focus on english and that will give us more english // pages that we could normally get. we don't have resources to // de-spam the other languages, etc. // . turn it back on, i took out the setClusterLevels() use of that // because we got the langid in the posdb keys now // //m_language = 0; // convert m_defaultSortCountry to a number for m_countryHint m_countryHint = g_countryCode.getIndexOfAbbr(m_defaultSortCountry); return true; }
// Return the value of the specified "field" within this node. // the case of "field" does not matter. char *XmlNode::getFieldValue ( char *field , int32_t *valueLen ) { // reset this to 0 *valueLen = 0; // scan for the field name in our node int32_t flen = gbstrlen(field); char inQuotes = '\0'; int32_t i; // scan the characters in the node, looking for the field name in ascii for ( i = 1; i + flen < m_nodeLen ; i++ ) { // skip the field if it's quoted if ( inQuotes) { if (m_node[i] == inQuotes ) inQuotes = 0; continue; } // set inQuotes to the quote if we're in quotes if ( (m_node[i]=='\"' || m_node[i]=='\'')){ inQuotes = m_node[i]; continue; } // a field name must be preceeded by non-alnum if ( is_alnum_a ( m_node[i-1] ) ) continue; // the first character of this field shout match field[0] if ( to_lower_a (m_node[i]) != to_lower_a(field[0] )) continue; // field just be immediately followed by an = or space if (m_node[i+flen]!='='&&!is_wspace_a(m_node[i+flen]))continue; // field names must match if ( strncasecmp ( &m_node[i], field, flen ) != 0 ) continue; // break cuz we got a match for our field name break; } // return NULL if no matching field if ( i + flen >= m_nodeLen ) return NULL; // advance i over the fieldname so it pts to = or space i += flen; // advance i over spaces while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++; // advance over the equal sign, return NULL if does not exist if ( i < m_nodeLen && m_node[i++] != '=' ) return NULL; // advance i over spaces after the equal sign while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++; // now parse out the value of this field (could be in quotes) inQuotes = '\0'; // set inQuotes to the quote if we're in quotes if ( m_node[i]=='\"' || m_node[i]=='\'') inQuotes = m_node[i++]; // mark this as the start of the value int start=i; // advance i until we hit a space, or we hit a that quote if inQuotes if (inQuotes) { while (i<m_nodeLen && m_node[i] != inQuotes ) i++; } else { while ( i<m_nodeLen && 
!is_wspace_a(m_node[i])&& m_node[i]!='>') i++; } // set the length of the value *valueLen = i - start; // return a ptr to the value return m_node + start; }
// . return the length of a node starting at "node" int32_t getTagLen ( char *node ) { // , int32_t version ) { // see if it's not a node //if ( node[0] != '<' ) return 0; // skip over first < int32_t i ; // . keep looping until we hit a < or > OR while we're in quotes // . ignore < and > when they're in quotes for ( i = 1 ; node[i] ; i++ ) { // this switch should speed things up... no! if ( node[i] != '<' && node[i] != '>' && node[i] != '\"' && node[i] != '\'' ) continue; // this is about 1.3 times faster than above (with -O2 on both) //if ( ! is_tag_control_char ( node[i] ) ) continue; if ( node[i] == '<' ) break; if ( node[i] == '>' ) { break; //if ( node[i-1]!='b') break; //if ( i -2 < 0 ) break; //if ( node[i-2]!='g') break; // we had a "gb>" which means that these 3 chars // we originally a > html encoded entity which // we decoded for easier parsing //continue; } //if (version >= 70 && version < 77) continue; // we can have double quotes within single quotes if ( node [ i ] == '\"' ) { // scan back looking for equal sign... int32_t k; for ( k = i - 1 ; k > 1 ; k-- ) { if ( is_wspace_a(node[k]) ) continue; break; } if ( k <= 1 ) continue; // . if an equal sign did not immediately preceed // this double quote then ignore the double quote // . this now fixes the harwoodmuseum.org issue // talked about below if ( node[k] != '=' ) continue; // skip over this first quote i++; while ( node[i] && node[i]!='\"' ) { // crap some pages have unbalanced quotes. 
// see /test/doc.14541556377486183454.html if ( node[i ]=='>' && node[i-1]=='\"' ) { i--; break; } // like an img tag hits a </a> for // http://www.harwoodmuseum.org/press_deta // il.php?ID=44 // BUT this f***s up // onclick="tb_show('<b>Community Calendar</b>' // on the </b> which is legitamately in quotes //if ( node[i ]=='<' && // node[i+1]=='/' ) { // i--; // break; //} if ( node[i ]=='>' && node[i-1]==' ' && node[i-2]=='\"' ) { i--; break; } // skip this char i++; } // return the length if tag ended abuptly if ( ! node[i] ) return i; // back-to-back quotes? common mistake if ( node[i+1] == '\"' ) i++; continue; } // continue if we don't have a " '" or "='" if ( node [ i ] != '\'' ) continue; if ( node[i-1] != '=' && !is_wspace_a( node[i-1] ) ) continue; // skip to end of quote while ( node[i] && node[i]!='\'' ) i++; } // skip i over the > if ( node[i] == '>' ) i++; // . else we found no closure outside of quotes so be more stringent // . look for closure with regard to quotes else for ( i=1; node[i] && node[i] != '>' && node[i] != '<';i++); // return the LENGTH of the whole node return i ; }
bool Log::logR ( int64_t now, int32_t type, const char *msg, bool forced ) { if ( ! g_loggingEnabled ) { return true; } // return true if we should not log this if ( ! forced && ! shouldLog ( type , msg ) ) { return true; } // get "msg"'s length int32_t msgLen = strlen ( msg ); ScopedLock sl(s_lock); // do a timestamp, too. use the time synced with host #0 because // it is easier to debug because all log timestamps are in sync. if ( now == 0 ) now = gettimeofdayInMillisecondsGlobalNoCore(); // . skip all logging if power out, we do not want to screw things up // . allow logging for 10 seconds after power out though if ( ! g_process.m_powerIsOn && now - g_process.m_powerOffTime >10000){ return false; } // chop off any spaces at the end of the msg. while ( is_wspace_a ( msg [ msgLen - 1 ] ) && msgLen > 0 ) msgLen--; // a tmp buffer char tt [ MAX_LINE_LEN ]; char *p = tt; if (m_logPrefix) { if ( m_logTimestamps ) { if( m_logReadableTimestamps ) { time_t now_t = (time_t)(now / 1000); struct tm tm_buf; struct tm *stm = localtime_r(&now_t,&tm_buf); p += sprintf ( p , "%04d%02d%02d-%02d%02d%02d-%03d %04" PRId32" ", stm->tm_year+1900,stm->tm_mon+1,stm->tm_mday,stm->tm_hour,stm->tm_min,stm->tm_sec,(int)(now%1000), g_hostdb.m_hostId ); } else { if ( g_hostdb.getNumHosts() <= 999 ) p += sprintf ( p , "%" PRIu64 " %03" PRId32 " ", (uint64_t)now , g_hostdb.m_hostId ); else if ( g_hostdb.getNumHosts() <= 9999 ) p += sprintf ( p , "%" PRIu64" %04" PRId32" ", (uint64_t)now , g_hostdb.m_hostId ); else if ( g_hostdb.getNumHosts() <= 99999 ) p += sprintf ( p , "%" PRIu64" %05" PRId32" ", (uint64_t)now , g_hostdb.m_hostId ); } } // Get thread id. pthread_self instead? 
unsigned tid=(unsigned)syscall(SYS_gettid); p += sprintf(p, "%06u ", tid); // Log level p += sprintf(p, "%s ", getTypeString(type)); } // then message itself const char *x = msg; int32_t avail = (MAX_LINE_LEN) - (p - tt) - 1; if ( msgLen > avail ) msgLen = avail; if ( *x == ':' ) x++; if ( *x == ' ' ) x++; strncpy ( p , x , avail ); // capitalize for consistency. no, makes grepping log msgs harder. //if ( is_alpha_a(*p) ) *p = to_upper_a(*p); p += strlen(p); // back up over spaces while ( p[-1] == ' ' ) p--; // end in period or ? or ! //if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' ) // *p++ = '.'; *p ='\0'; // the total length, not including the \0 int32_t tlen = p - tt; // . filter out nasty chars from the message // . replace with ~'s char cs; char *ttp = tt; char *ttpend = tt + tlen; for ( ; ttp < ttpend ; ttp += cs ) { cs = getUtf8CharSize ( ttp ); if ( is_binary_utf8 ( ttp ) ) { for ( int32_t k = 0 ; k < cs ; k++ ) *ttp++ = '.'; // careful not to skip the already skipped bytes cs = 0; continue; } } // . if filesize would be too big then make a new log file // . should make a new m_fd if ( m_logFileSize + tlen+1 > MAXLOGFILESIZE && g_conf.m_logToFile ) makeNewLogFile(); if ( m_fd >= 0 ) { write ( m_fd , tt , tlen ); write ( m_fd , "\n", 1 ); m_logFileSize += tlen + 1; } else { // print it out for now fprintf ( stderr, "%s\n", tt ); } return false; }
// come here once per second i guess void Test::initTestRun ( ) { g_errno = 0; // . all hosts should have their g_conf.m_repairMode parm set // . it is global now, not collection based, since we need to // lock down titledb for the scan and there could be recs from // the collection we are repairing in titledb's rdbtree, which, // when dumped, would mess up our scan. if ( ! g_conf.m_testSpiderEnabled && ! g_conf.m_testParserEnabled ) { char *xx=NULL;*xx=0; } // if both enabled, core if ( g_conf.m_testSpiderEnabled && g_conf.m_testParserEnabled ) { char *xx=NULL;*xx=0; } // if the power went off if ( ! g_process.m_powerIsOn ) return; // return if currently running // no, admin can re-init even if running now //if ( m_isRunning ) { char *xx=NULL;*xx=0; }//return; // must be host #0 only if ( g_hostdb.m_myHost->m_hostId != 0 ) return; // if was initially in this mode, don't do anything //if ( m_testSpiderEnabledSaved ) return; //if ( m_testParserEnabledSaved ) return; // you must have the "test" coll already setup! CollectionRec *cr = g_collectiondb.getRec("test"); if ( ! cr ) { // note it log("test: please add a collection named \"test\" first."); // stop the test g_conf.m_testParserEnabled = false; g_conf.m_testSpiderEnabled = false; // all done return; } char *testDir = getTestDir(); // scan for file named "run.start.%li.txt" which is a dump of all // the conf and parms char filename[100]; File f; long i; for ( i = 0 ; i < 9999 ; i++ ) { // make filename. base it off working dir, g_hostdb.m_dir sprintf ( filename,"%s/%s/run.%li.collparms.txt", g_hostdb.m_dir,testDir,i ); // exist? f.set ( filename ); // open files long status = f.doesExist(); // error? 
if ( status == -1 ) { // note it in the log log("test: doesExist() returned -1"); // end the test g_conf.m_testParserEnabled = false; g_conf.m_testSpiderEnabled = false; // all done return; } // try next i if this one in use if ( status ) continue; // got one break; } // close it f.close(); // create the run.%li.version.txt file char cmd[1000]; char vfile[200]; sprintf(vfile,"%s/%s/run.%li.version.txt",g_hostdb.m_dir,testDir,i); sprintf(cmd, "%s/gb -v >& %s ; " "echo -n \"RUN START TIME: \" >> %s ; " "date >> %s", g_hostdb.m_dir,vfile, vfile, vfile); system(cmd); // save it m_runId = i; cr = g_collectiondb.getRec ( "test" ); if ( ! cr ) { // and no more of this g_conf.m_testParserEnabled = false; g_conf.m_testSpiderEnabled = false; return; } // set these m_coll = cr->m_coll; // turn on spiders //cr->m_spideringEnabled = 1; // crap i guess this too!!! //g_conf.m_spideringEnabled = 1; // // log out the global parms // char fbuf[100]; // print our global parms into a file called run.%li.start.txt sprintf(fbuf,"%s/%s/run.%li.confparms.txt",g_hostdb.m_dir,testDir,i); // this saves it as xml i think g_parms.saveToXml ( (char *)&g_conf , fbuf ); // // log out the coll specific parms // // update name sprintf(fbuf,"%s/%s/run.%li.collparms.txt",g_hostdb.m_dir,testDir,i); // save that g_parms.saveToXml ( (char *)cr , fbuf ); // get the list of urls to download and inject in order sprintf(fbuf,"%s/%s/urls.txt",g_hostdb.m_dir,testDir); // set it f.set ( fbuf ) ; // read it in long fsize = f.getFileSize(); // add one for \0 termination long need = fsize + 1; // read it in char *buf = (char *)mmalloc ( need ,"qatest"); // error? if ( ! 
buf ) { // note it log("test: failed to alloc %li bytes for url buf",fsize); // disable testing g_conf.m_testParserEnabled = false; g_conf.m_testSpiderEnabled = false; // all done return; } // open it f.open ( O_RDONLY ); // read it in long rs = f.read ( buf , fsize , 0 ) ; // check it if ( rs != fsize ) { // note it log("test: failed to read %li bytes of urls.txt file",fsize); // disable testing g_conf.m_testParserEnabled = false; g_conf.m_testSpiderEnabled = false; // all done return; } // save it m_urlBuf = buf; // null term it just in case buf[need-1] = '\0'; // end of it, including the terminating \0 m_urlEnd = buf + need; // init url offset m_urlPtr = m_urlBuf; // reset just in case //m_spiderLinks = false; m_bypassMenuElimination = false; // first check for spiderlinks=1|true for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) { //if ( p[0] != 's' ) continue; //if ( p[1] != 'p' ) continue; //if ( ! strncmp(p,"spiderlinks",11) ) // m_spiderLinks = true; //if ( ! strncmp(p,"bypassmenuelimination",21) ) // m_bypassMenuElimination = true; } // force max spiders to one because one page is often dependent // on the previous page! //if ( ! m_spiderLinks ) cr->m_maxNumSpiders = 1; // need to make it 6 since some priorities essentially lock the // ips up that have urls in higher priorities. i.e. once we dole // a url out for ip X, then if later we add a high priority url for // IP X it can't get spidered until the one that is doled does. //else cr->m_maxNumSpiders = 6; // . first space out all comments // . 
comments are nice because we know why the url is in urls.txt for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) { // skip if not start of a comment line if ( *p != '#' ) continue; // if not preceeded by a \n or start, skip if ( p > m_urlBuf && *(p-1) != '\n' ) continue; // ok, nuke it for ( ; *p && *p !='\n' ; p++ ) *p = ' '; } // if we hit "\nSTOP\n" then white out that and all past it for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) { // skip if not start of a comment line if ( *p != '\n' ) continue; // check it if ( strncmp(p,"\nSTOP\n",6) ) continue; // white out for ( ; *p ; p++ ) { // until we HIT RESUME if ( *p == '\n' && ! strncmp(p,"\nRESUME\n",8) ) { p[1] = ' '; p[2] = ' '; p[3] = ' '; p[4] = ' '; p[5] = ' '; p[6] = ' '; break; } *p = ' '; } // all done //break; } // then NULL terminate all urls by converting all white space to \0s for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) // all non url chars to \0 if ( is_wspace_a(*p) ) *p = '\0'; // flag this m_isRunning = true; // and this m_isAdding = true; m_testStartTime = gettimeofdayInMilliseconds(); // set up dedup table m_dt.set ( 8,0,0,NULL,0,false,MAX_NICENESS,"testdedup"); // remove all old files for now to avoid system diffs log("test: beginning injection"); // . now inject each url in order, one at a time using msg7 i guess // . returns true if all done if ( ! injectLoop() ) return; // close it up //stopIt(); }
// . parse an incoming request // . return false and set g_errno on error // . CAUTION: we destroy "req" by replacing it's last char with a \0 // . last char must be \n or \r for it to be a proper request anyway bool HttpRequest::set ( char *origReq , long origReqLen , TcpSocket *sock ) { // reset number of cgi field terms reset(); if ( ! m_reqBuf.reserve ( origReqLen + 1 ) ) { log("http: failed to copy request: %s",mstrerror(g_errno)); return false; } // copy it to avoid mangling it m_reqBuf.safeMemcpy ( origReq , origReqLen ); // NULL term m_reqBuf.pushChar('\0'); m_reqBufValid = true; // and point to that char *req = m_reqBuf.getBufStart(); long reqLen = m_reqBuf.length() - 1; // save this m_userIP = 0; if ( sock ) m_userIP = sock->m_ip; m_isSSL = 0; if ( sock ) m_isSSL = (bool)sock->m_ssl; // TcpServer should always give us a NULL terminated request if ( req[reqLen] != '\0' ) { char *xx = NULL; *xx = 0; } // how long is the first line, the primary request long i; // for ( i = 0 ; i<reqLen && i<MAX_REQ_LEN && // req[i]!='\n' && req[i]!='\r'; i++); // . now fill up m_buf, used to log the request // . make sure the url was encoded correctly // . we don't want assholes encoding every char so we can't see what // url they are submitting to be spidered/indexed // . also, don't de-code encoded ' ' '+' '?' '=' '&' because that would // change the meaning of the url // . and finally, non-ascii chars that don't display correctly // . this should NULL terminate m_buf, too // . turn this off for now, just try to log a different way // m_bufLen = urlNormCode ( m_buf , MAX_REQ_LEN - 1 , req , i ); // ensure it's big enough to be a valid request if ( reqLen < 5 ) { log("http: got reqlen<5 = %s",req); g_errno = EBADREQUEST; return false; } // or if first line too long //if ( i >= 1024 ) { g_errno = EBADREQUEST; return false; } // get the type, must be GET or HEAD if ( strncmp ( req , "GET " , 4 ) == 0 ) m_requestType = 0; // these means a compressed reply was requested. 
use by query // compression proxies. else if ( strncmp ( req , "ZET " , 4 ) == 0 ) m_requestType = 0; else if ( strncmp ( req , "HEAD " , 5 ) == 0 ) m_requestType = 1; else if ( strncmp ( req , "POST " , 5 ) == 0 ) m_requestType = 2; else { log("http: got bad request cmd: %s",req); g_errno = EBADREQUEST; return false; } // . NULL terminate the request (a destructive operation!) // . this removes the last \n in the trailing \r\n // . shit, but it f***s up POST requests if ( m_requestType != 2 ) { req [ reqLen - 1 ] = '\0'; reqLen--; } // POST requests can be absolutely huge if you are injecting a 100MB // file, so limit our strstrs to the end of the mime char *d = NULL; char dc; // check for body if it was a POST request if ( m_requestType == 2 ) { d = strstr ( req , "\r\n\r\n" ); if ( d ) { dc = *d; *d = '\0'; } else log("http: Got POST request without \\r\\n\\r\\n."); } // . point to the file path // . skip over the "GET " long filenameStart = 4 ; // skip over extra char if it's a "HEAD " request if ( m_requestType == 1 || m_requestType == 2 ) filenameStart++; // are we a redirect? i = filenameStart; m_redirLen = 0; if ( strncmp ( &req[i] , "/?redir=" , 8 ) == 0 ) { for ( long k = i+8; k<reqLen && m_redirLen<126 ; k++) { if ( req[k] == '\r' ) break; if ( req[k] == '\n' ) break; if ( req[k] == '\t' ) break; if ( req[k] == ' ' ) break; m_redir[m_redirLen++] = req[k]; } } m_redir[m_redirLen] = '\0'; // find a \n space \r or ? that delimits the filename for ( i = filenameStart ; i < reqLen ; i++ ) { if ( is_wspace_a ( req [ i ] ) ) break; if ( req [ i ] == '?' ) break; } // now calc the filename length m_filenameLen = i - filenameStart; // return false and set g_errno if it's 0 if ( m_filenameLen <= 0 ) { log("http: got filenameLen<=0: %s",req); g_errno = EBADREQUEST; return false; } // . bitch if too big // . 
leave room for strcatting "index.html" below if ( m_filenameLen >= MAX_HTTP_FILENAME_LEN - 10 ) { log("http: got filenameLen>=max"); g_errno = EBADREQUEST; return false; } // . decode the filename into m_filename and reassign it's length // . decode %2F to / , etc... m_filenameLen = urlDecode(m_filename,req+filenameStart,m_filenameLen); // NULL terminate m_filename m_filename [ m_filenameLen ] = '\0'; // does it have a file extension AFTER the last / in the filename? bool hasExtension = false; for ( long j = m_filenameLen-1 ; j >= 0 ; j-- ) { if ( m_filename[j] == '.' ) { hasExtension = true; break; } if ( m_filename[j] == '/' ) break; } // if it has no file extension append a /index.html if ( ! hasExtension && m_filename [ m_filenameLen - 1 ] == '/' ) { strcat ( m_filename , "index.html" ); m_filenameLen = gbstrlen ( m_filename ); } // set file offset/size defaults m_fileOffset = 0; // -1 means ALL the file from m_fileOffset onwards m_fileSize = -1; // "e" points to where the range actually starts, if any //char *e; // . TODO: speed up by doing one strstr for Range: and maybe range: // . do they have a Range: 0-100\n in the mime denoting a partial get? //char *s = strstr ( req ,"Range:bytes=" ); //e = s + 12; // try alternate formats //if ( ! s ) { s = strstr ( req ,"Range: bytes=" ); e = s + 13; } //if ( ! s ) { s = strstr ( req ,"Range: " ); e = s + 7; } // parse out the range if we got one //if ( s ) { // long x = 0; // sscanf ( e ,"%li-%li" , &m_fileOffset , &x ); // // get all file if range's 2nd number is non-existant // if ( x == 0 ) m_fileSize = -1; // else m_fileSize = x - m_fileOffset; // // ensure legitimacy // if ( m_fileOffset < 0 ) m_fileOffset = 0; //} // reset our hostname m_hostLen = 0; // assume request is NOT from local network //m_isAdmin = false; m_isLocal = false; // get the virtual hostname they want to use char *s = strstr ( req ,"Host:" ); // try alternate formats if ( ! 
s ) s = strstr ( req , "host:" ); // must be on its own line, otherwise it's not valid if ( s && s > req && *(s-1) !='\n' ) s = NULL; // parse out the host if we got one if ( s ) { // skip field name, host: s += 5; // skip e to beginning of the host name after "host:" while ( *s==' ' || *s=='\t' ) s++; // find end of the host name char *end = s; while ( *end && !is_wspace_a(*end) ) end++; // . now *end should be \0, \n, \r, ' ', ... // . get host len m_hostLen = end - s; // truncate if too big if ( m_hostLen >= 255 ) m_hostLen = 254; // copy into hostname memcpy ( m_host , s , m_hostLen ); } // NULL terminate it m_host [ m_hostLen ] = '\0'; // get Referer: field s = strstr ( req ,"Referer:" ); // find another if ( ! s ) s = strstr ( req ,"referer:" ); // must be on its own line, otherwise it's not valid if ( s && s > req && *(s-1) !='\n' ) s = NULL; // assume no referer m_refLen = 0; // parse out the referer if we got one if ( s ) { // skip field name, referer: s += 8; // skip e to beginning of the host name after ':' while ( *s==' ' || *s=='\t' ) s++; // find end of the host name char *end = s; while ( *end && !is_wspace_a(*end) ) end++; // . now *end should be \0, \n, \r, ' ', ... // . get len m_refLen = end - s; // truncate if too big if ( m_refLen >= 255 ) m_refLen = 254; // copy into m_ref memcpy ( m_ref , s , m_refLen ); } // NULL terminate it m_ref [ m_refLen ] = '\0'; // get User-Agent: field s = strstr ( req ,"User-Agent:" ); // find another if ( ! s ) s = strstr ( req ,"user-agent:" ); // must be on its own line, otherwise it's not valid if ( s && s > req && *(s-1) !='\n' ) s = NULL; // assume empty long len = 0; // parse out the referer if we got one if ( s ) { // skip field name, referer: s += 11; // skip e to beginning of the host name after ':' while ( *s==' ' || *s=='\t' ) s++; // find end of the agent name char *end = s; while ( *end && *end!='\n' && *end!='\r' ) end++; // . now *end should be \0, \n, \r, ' ', ... // . 
get agent len len = end - s; // truncate if too big if ( len > 127 ) len = 127; // copy into m_userAgent memcpy ( m_userAgent , s , len ); } // NULL terminate it m_userAgent [ len ] = '\0'; m_isMSIE = false; if ( strstr ( m_userAgent , "MSIE" ) ) m_isMSIE = true; // get Cookie: field s = strstr ( req, "Cookie:" ); // find another if ( !s ) s = strstr ( req, "cookie:" ); // must be on its own line, otherwise it's not valid if ( s && s > req && *(s-1) != '\n' ) s = NULL; // assume empty // m_cookieBufLen = 0; m_cookiePtr = s; // parse out the cookie if we got one if ( s ) { // skip field name, Cookie: s += 7; // skip s to beginning of cookie after ':' while ( *s == ' ' || *s == '\t' ) s++; // find end of the cookie char *end = s; while ( *end && *end != '\n' && *end != '\r' ) end++; // save length m_cookieLen = end - m_cookiePtr; // get cookie len //m_cookieBufLen = end - s; // trunc if too big //if (m_cookieBufLen > 1023) m_cookieBufLen = 1023; // copy into m_cookieBuf //memcpy(m_cookieBuf, s, m_cookieBufLen); } // NULL terminate it if ( m_cookiePtr ) m_cookiePtr[m_cookieLen] = '\0'; //m_cookieBuf[m_cookieBufLen] = '\0'; // convert every '&' in cookie to a \0 for parsing the fields // for ( long j = 0 ; j < m_cookieBufLen ; j++ ) // if ( m_cookieBuf[j] == '&' ) m_cookieBuf[j] = '\0'; // mark it as cgi if it has a ? bool isCgi = ( req [ i ] == '?' ) ; // reset m_filename length to exclude the ?* stuff if ( isCgi ) { // skip over the '?' i++; // find a space the delmits end of cgi long j; for ( j = i; j < reqLen; j++) if (is_wspace_a(req[j])) break; // now add it if ( ! addCgi ( &req[i] , j-i ) ) return false; // update i i = j; } // . set path ptrs // . the whole /cgi/14.cgi?coll=xxx&..... 
thang m_path = req + filenameStart; m_plen = i - filenameStart; // we're local if hostname is 192.168.[0|1].y //if ( strncmp(iptoa(sock->m_ip),"192.168.1.",10) == 0) { // m_isAdmin = true; m_isLocal = true; } //if ( strncmp(iptoa(sock->m_ip),"192.168.0.",10) == 0) { // m_isAdmin = true; m_isLocal = true; } //if(strncmp(iptoa(sock->m_ip),"192.168.1.",10) == 0) m_isLocal = true; //if(strncmp(iptoa(sock->m_ip),"192.168.0.",10) == 0) m_isLocal = true; if ( sock && strncmp(iptoa(sock->m_ip),"192.168.",8) == 0) m_isLocal = true; if ( sock && strncmp(iptoa(sock->m_ip),"10.",3) == 0) m_isLocal = true; // steve cook's comcast at home: // if ( sock && strncmp(iptoa(sock->m_ip),"68.35.100.143",13) == 0) // m_isLocal = true; // procog's ip // if ( sock && strncmp(iptoa(sock->m_ip),"216.168.36.21",13) == 0) // m_isLocal = true; // roadrunner ip // if ( sock && strncmp(iptoa(sock->m_ip),"66.162.42.131",13) == 0) // m_isLocal = true; // cnsp ip //if ( sock && strncmp(iptoa(sock->m_ip),"67.130.216.27",13) == 0) // m_isLocal = true; // emily parker //if ( sock && strncmp(iptoa(sock->m_ip),"69.92.68.202",12) == 0) //m_isLocal = true; // 127.0.0.1 if ( sock && sock->m_ip == 16777343 ) m_isLocal = true; // steve cook's webserver //if ( sock && strncmp(iptoa(sock->m_ip),"216.168.36.21",13) == 0) // m_isLocal = true; // . also if we're coming from lenny at my house consider it local // . this is a security risk, however... TODO: FIX!!! //if ( sock->m_ip == atoip ("68.35.105.199" , 13 ) ) m_isAdmin = true; // . TODO: now add any cgi data from a POST..... // . look after the mime //char *d = NULL; // check for body if it was a POST request //if ( m_requestType == 2 ) d = strstr ( req , "\r\n\r\n" ); // now put d's char back, just in case... does it really matter? 
if ( d ) *d = dc; // return true now if no cgi stuff to parse if ( d ) { char *post = d + 4; long postLen = reqLen-(d+4-req) ; // post sometimes has a \r or\n after it while ( postLen > 0 && post[postLen-1]=='\r' ) postLen--; // add it to m_cgiBuf, filter and everything if ( ! addCgi ( post , postLen ) ) return false; } // sometimes i don't want to be admin //if ( getLong ( "admin" , 1 ) == 0 ) m_isAdmin = false; // success ///// // Handle Extra parms... char *ep = g_conf.m_extraParms; char *epend = g_conf.m_extraParms + g_conf.m_extraParmsLen; char *qstr = m_cgiBuf; long qlen = m_cgiBufLen; while (ep < epend){ char buf[AUTOBAN_TEXT_SIZE]; long bufLen = 0; // get next substring while (*ep && ep < epend && *ep != ' ' && *ep != '\n'){ buf[bufLen++] = *ep++; } // skip whitespace while (*ep && ep < epend && *ep == ' '){ ep++; } // null terminate buf[bufLen] = '\0'; // No match if (!bufLen || !strnstr(qstr, qlen, buf)){ // skip to end of line while (*ep && ep < epend && *ep != '\n') ep++; // skip newline while (*ep && ep < epend && *ep == '\n') ep++; // try next substr continue; } // found a match... // get parm string bufLen = 0; while (*ep && ep < epend && *ep != '\n'){ buf[bufLen++] = *ep++; } buf[bufLen] = '\0'; // skip newline while (*ep && ep < epend && *ep == '\n') ep++; logf(LOG_DEBUG, "query: appending \"%s\" to query", buf); long newSize = m_cgiBuf2Size + bufLen+1; char *newBuf = (char*)mmalloc(newSize, "extraParms"); if (!newBuf){ return log("query: unable to allocate %ld bytes " "for extraParms", newSize); } char *p = newBuf; if (m_cgiBuf2Size) { memcpy(newBuf, m_cgiBuf2, m_cgiBuf2Size); p += m_cgiBuf2Size-1; mfree(m_cgiBuf2, m_cgiBuf2Size, "extraParms"); m_cgiBuf2 = NULL; m_cgiBuf2Size = 0; } memcpy(p, buf, bufLen); m_cgiBuf2 = newBuf; m_cgiBuf2Size = newSize; p += bufLen; *p = '\0'; } // Put '\0' back into the HttpRequest buffer... if (m_cgiBuf){ // do not mangle the "ucontent"! 
long cgiBufLen = m_cgiBufLen; cgiBufLen -= m_ucontentLen; char *buf = m_cgiBuf; for (long i = 0; i < cgiBufLen ; i++) if (buf[i] == '&') buf[i] = '\0'; // don't decode the ucontent= field! long decodeLen = m_cgiBufLen; // so subtract that if ( m_ucontent ) decodeLen -= m_ucontentLen; // decode everything long len = urlDecode ( m_cgiBuf , m_cgiBuf , decodeLen ); // we're parsing crap after the null if the last parm // has no value //memset(m_cgiBuf+len, '\0', m_cgiBufLen-len); m_cgiBufLen = len; // ensure that is null i guess if ( ! m_ucontent ) m_cgiBuf[len] = '\0'; } if (m_cgiBuf2){ char *buf = m_cgiBuf2; for (long i = 0; i < m_cgiBuf2Size-1 ; i++) if (buf[i] == '&') buf[i] = '\0'; long len = urlDecode ( m_cgiBuf2 , m_cgiBuf2 , m_cgiBuf2Size); memset(m_cgiBuf2+len, '\0', m_cgiBuf2Size-len); } // . parse the fields after the ? in a cgi filename // . or fields in the content if it's a POST // . m_cgiBuf must be and is NULL terminated for this parseFields ( m_cgiBuf , m_cgiBufLen ); // Add extra parms to the request. if (m_cgiBuf2Size){ parseFields(m_cgiBuf2, m_cgiBuf2Size); } // urldecode the cookie buf too!! if ( m_cookiePtr ) { char *p = m_cookiePtr; for (long i = 0; i < m_cookieLen ; i++) { //if (p[i] == '&') p[i] = '\0'; // cookies are separated with ';' in the request only if (p[i] == ';') p[i] = '\0'; // a hack for the metacookie=.... // which uses &'s to separate its subcookies // this is a hack for msie's limit of 50 cookies if ( p[i] == '&' ) p[i] = '\0'; // set m_metaCookie to start of meta cookie if ( p[i] == 'm' && p[i+1] == 'e' && strncmp(p,"metacookie",10) == 0 ) m_metaCookie = p; } long len = urlDecode ( m_cookiePtr , m_cookiePtr, m_cookieLen ); // we're parsing crap after the null if the last parm // has no value memset(m_cookiePtr+len, '\0', m_cookieLen-len); m_cookieLen = len; } return true; }
bool Log::logR ( long long now , long type , char *msg , bool asterisk , bool forced ) { // filter if we should //if ( forced ) goto skipfilter; // return true if we should not log this if ( ! forced && ! shouldLog ( type , msg ) ) return true; // skipfilter: // can we log if we're a sig handler? don't take changes if ( g_inSigHandler ) return logLater ( now , type , msg , NULL ); //if ( g_inSigHandler ) return false; // get "msg"'s length long msgLen = gbstrlen ( msg ); #ifdef PTHREADS // lock for threads pthread_mutex_lock ( &s_lock ); #endif // do a timestamp, too. use the time synced with host #0 because // it is easier to debug because all log timestamps are in sync. if ( now == 0 ) now = gettimeofdayInMillisecondsGlobalNoCore(); // . skip all logging if power out, we do not want to screw things up // . allow logging for 10 seconds after power out though if ( ! g_process.m_powerIsOn && now - g_process.m_powerOffTime >10000){ #ifdef PTHREADS pthread_mutex_unlock ( &s_lock ); #endif return false; } //if ( now == 0 ) now = g_nowApprox; // chop off any spaces at the end of the msg. 
while ( is_wspace_a ( msg [ msgLen - 1 ] ) && msgLen > 0 ) msgLen--; // get this pid pid_t pid = getpidtid(); // a tmp buffer char tt [ MAX_LINE_LEN ]; char *p = tt; char *pend = tt + MAX_LINE_LEN; /* // print timestamp, hostid, type if ( g_hostdb.m_numHosts <= 999 ) sprintf ( p , "%llu %03li %s ", now , g_hostdb.m_hostId , getTypeString(type) ); else if ( g_hostdb.m_numHosts <= 9999 ) sprintf ( p , "%llu %04li %s ", now , g_hostdb.m_hostId , getTypeString(type) ); else if ( g_hostdb.m_numHosts <= 99999 ) sprintf ( p , "%llu %05li %s ", now , g_hostdb.m_hostId , getTypeString(type) ); */ // print timestamp, hostid, type if ( m_logTimestamps ) { if ( g_hostdb.m_numHosts <= 999 ) sprintf ( p , "%llu %03li ", now , g_hostdb.m_hostId ); else if ( g_hostdb.m_numHosts <= 9999 ) sprintf ( p , "%llu %04li ", now , g_hostdb.m_hostId ); else if ( g_hostdb.m_numHosts <= 99999 ) sprintf ( p , "%llu %05li ", now , g_hostdb.m_hostId ); p += gbstrlen ( p ); } // msg resource char *x = msg; long cc = 7; // the first 7 bytes or up to the : must be ascii //while ( p < pend && *x && is_alnum_a(*x) ) { *p++ = *x++; cc--; } // space pad //while ( cc-- > 0 ) *p++ = ' '; // ignore the label for now... while ( p < pend && *x && is_alnum_a(*x) ) { x++; cc--; } // thread id if in "thread" if ( pid != s_pid && s_pid != -1 ) { //sprintf ( p , "[%li] " , (long)getpid() ); sprintf ( p , "[%lu] " , (unsigned long)pid ); p += gbstrlen ( p ); } // then message itself long avail = (MAX_LINE_LEN) - (p - tt) - 1; if ( msgLen > avail ) msgLen = avail; if ( *x == ':' ) x++; if ( *x == ' ' ) x++; strncpy ( p , x , avail ); // capitalize for consistency. no, makes grepping log msgs harder. //if ( is_alpha_a(*p) ) *p = to_upper_a(*p); p += gbstrlen(p); // back up over spaces while ( p[-1] == ' ' ) p--; // end in period or ? or ! //if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' 
) // *p++ = '.'; *p ='\0'; // the total length, not including the \0 long tlen = p - tt; // call sprintf, but first make sure we have room in m_buf and in // the arrays. who know how much room the sprintf is going to need??? // NOTE: TODO: this is shaky -- fix it! if ( m_bufPtr + tlen >= 1024 * 32 || m_numErrors >= MAX_LOG_MSGS){ // this sets m_bufPtr to 0 if ( ! dumpLog ( ) ) { fprintf(stderr,"Log::log: could not dump to file!\n"); #ifdef PTHREADS pthread_mutex_unlock ( &s_lock ); #endif return false; } } // . filter out nasty chars from the message // . replace with ~'s char cs; char *ttp = tt; char *ttpend = tt + tlen; for ( ; ttp < ttpend ; ttp += cs ) { cs = getUtf8CharSize ( ttp ); if ( is_binary_utf8 ( ttp ) ) { for ( long k = 0 ; k < cs ; k++ ) *ttp++ = '.'; // careful not to skip the already skipped bytes cs = 0; continue; } // convert \n's and \r's to spaces if ( *ttp == '\n' ) *ttp = ' '; if ( *ttp == '\r' ) *ttp = ' '; if ( *ttp == '\t' ) *ttp = ' '; } if ( m_fd >= 0 ) { write ( m_fd , tt , tlen ); write ( m_fd , "\n", 1 ); } else { // print it out for now fprintf ( stderr, "%s\n", tt ); } // set the stuff in the array m_errorMsg [m_numErrors] = msg; m_errorMsgLen [m_numErrors] = msgLen; m_errorTime [m_numErrors] = now; m_errorType [m_numErrors] = type; // increase the # of errors m_numErrors++; #ifdef PTHREADS // unlock for threads pthread_mutex_unlock ( &s_lock ); #endif return false; }
// . parse an incoming request // . return false and set g_errno on error // . CAUTION: we destroy "req" by replacing it's last char with a \0 // . last char must be \n or \r for it to be a proper request anyway bool HttpRequest::set ( char *origReq , int32_t origReqLen , TcpSocket *sock ) { // reset number of cgi field terms reset(); if ( ! m_reqBuf.reserve ( origReqLen + 1 ) ) { log("http: failed to copy request: %s",mstrerror(g_errno)); return false; } // copy it to avoid mangling it m_reqBuf.safeMemcpy ( origReq , origReqLen ); // NULL term m_reqBuf.pushChar('\0'); m_reqBufValid = true; // and point to that char *req = m_reqBuf.getBufStart(); if( !req ) { log(LOG_ERROR, "http: req is NULL"); g_errno = EBADREQUEST; return false; } int32_t reqLen = m_reqBuf.length() - 1; // save this m_userIP = sock ? sock->m_ip : 0; m_isSSL = sock ? (sock->m_ssl!=NULL) : false; // TcpServer should always give us a NULL terminated request if ( req[reqLen] != '\0' ) { g_process.shutdownAbort(true); } // how long is the first line, the primary request // int32_t i; // for ( i = 0 ; i<reqLen && i<MAX_REQ_LEN && // req[i]!='\n' && req[i]!='\r'; i++); // . now fill up m_buf, used to log the request // . make sure the url was encoded correctly // . we don't want assholes encoding every char so we can't see what // url they are submitting to be spidered/indexed // . also, don't de-code encoded ' ' '+' '?' '=' '&' because that would // change the meaning of the url // . and finally, non-ascii chars that don't display correctly // . this should NULL terminate m_buf, too // . 
turn this off for now, just try to log a different way // m_bufLen = urlNormCode ( m_buf , MAX_REQ_LEN - 1 , req , i ); // ensure it's big enough to be a valid request if ( reqLen < 5 ) { log(LOG_WARN, "http: got reqlen %" PRId32"<5 = %s",reqLen,req); g_errno = EBADREQUEST; return false; } int32_t cmdLen = 0; // or if first line too long //if ( i >= 1024 ) { g_errno = EBADREQUEST; return false; } // get the type, must be GET or HEAD if ( strncmp ( req , "GET " , 4 ) == 0 ) { m_requestType = RT_GET; cmdLen = 3; } // these means a compressed reply was requested. use by query // compression proxies. else if ( strncmp ( req , "ZET " , 4 ) == 0 ) { m_requestType = RT_GET; cmdLen = 3; } else if ( strncmp ( req , "HEAD " , 5 ) == 0 ) { m_requestType = RT_HEAD; cmdLen = 4; } else if ( strncmp ( req , "POST " , 5 ) == 0 ) { m_requestType = RT_POST; cmdLen = 4; } else if ( strncmp ( req , "CONNECT " , 8 ) == 0 ) { // take this out until it stops losing descriptors and works //m_requestType = RT_CONNECT; //cmdLen = 7; // we no longer insert section info. emmanuel gets section // info when injecting a doc now i think in PageInject.cpp. // we do not proxy https requests because we can't // decrypt the page contents to cache them or to insert // the sectiondb voting markup, so it's kinda pointless... // and i'm not aiming to be a full-fledge squid proxy. log("http: CONNECT request not supported because we " "can't insert section markup and we can't cache: %s",req); g_errno = EBADREQUEST; return false; } else { log("http: got bad request cmd: %s",req); g_errno = EBADREQUEST; return false; } // . NULL terminate the request (a destructive operation!) // . this removes the last \n in the trailing \r\n // . 
shit, but it f***s up POST requests if ( m_requestType != RT_POST ) { req [ reqLen - 1 ] = '\0'; reqLen--; } // POST requests can be absolutely huge if you are injecting a 100MB // file, so limit our strstrs to the end of the mime char *d = NULL; char dc; // check for body if it was a POST request if ( m_requestType == RT_POST ) { d = strstr ( req , "\r\n\r\n" ); if ( d ) { dc = *d; *d = '\0'; } else log("http: Got POST request without \\r\\n\\r\\n."); } // is it a proxy request? m_isSquidProxyRequest = false; if ( strncmp ( req + cmdLen + 1, "http://" ,7) == 0 || strncmp ( req + cmdLen + 1, "https://",8) == 0 ) { m_isSquidProxyRequest = true; // set url parms for it m_squidProxiedUrl = req + cmdLen + 1; char *p = m_squidProxiedUrl + 7; if ( *p == '/' ) p++; // https:// ? // stop at whitespace or \0 for ( ; *p && ! is_wspace_a(*p) ; p++ ); // that's the length of it m_squidProxiedUrlLen = p - m_squidProxiedUrl; } else if ( m_requestType == RT_CONNECT ) { m_isSquidProxyRequest = true; // set url parms for it m_squidProxiedUrl = req + cmdLen + 1; // usually its like CONNECT diffbot.com:443 char *p = m_squidProxiedUrl; // stop at whitespace or \0 for ( ; *p && ! is_wspace_a(*p) ; p++ ); // that's the length of it m_squidProxiedUrlLen = p - m_squidProxiedUrl; } // check authentication char *auth = NULL; if ( m_isSquidProxyRequest && req ) auth = strstr(req,"Proxy-authorization: Basic "); //if ( m_isSquidProxyRequest && ! 
auth ) { // log("http: no auth in proxy request %s",req); // g_errno = EBADREQUEST; // return false; //} SafeBuf tmp; if ( auth ) { // find end of it char *p = auth; for ( ; *p && *p != '\r' && *p != '\n' ; p++ ); tmp.base64Decode ( auth , p - auth ); } // assume incorrect username/password bool matched = false; if ( m_isSquidProxyRequest ) { // now try to match in g_conf.m_proxyAuth safebuf of // username:password space-separated list char *p = g_conf.m_proxyAuth.getBufStart(); // loop over those for ( ; p && *p ; ) { // skip initial white space for ( ; *p && is_wspace_a(*p); p++ ); // skip to end of username:password thing char *end = p; for ( ; *end && !is_wspace_a(*end); end++); // save char *start = p; // advance p = end; // this is always a match if ( end-start == 3 && strncmp(start,"*:*",3) == 0 ) { matched = true; break; } // compare now if ( tmp.length() != end-start ) continue; if ( strncmp(tmp.getBufStart(),start,end-start) != 0 ) continue; // we got a match matched = true; break; } } // incorrect username:passwrod? if ( m_isSquidProxyRequest && ! matched ) { log("http: bad username:password in proxy request %s",req); g_errno = EPERMDENIED; return false; } // if proxy request to download a url through us, we are done if ( m_isSquidProxyRequest ) return true; bool multipart = false; if ( m_requestType == 2 ) { // is POST? char *cd ; cd = gb_strcasestr(req,"Content-Type: multipart/form-data"); if ( cd ) multipart = true; } // . point to the file path // . skip over the "GET " int32_t filenameStart = 4 ; // skip over extra char if it's a "HEAD " request if ( m_requestType == RT_HEAD || m_requestType == RT_POST ) filenameStart++; // are we a redirect? 
int32_t i = filenameStart; m_redirLen = 0; if ( strncmp ( &req[i] , "/?redir=" , 8 ) == 0 ) { for ( int32_t k = i+8; k<reqLen && m_redirLen<126 ; k++) { if ( req[k] == '\r' ) break; if ( req[k] == '\n' ) break; if ( req[k] == '\t' ) break; if ( req[k] == ' ' ) break; m_redir[m_redirLen++] = req[k]; } } m_redir[m_redirLen] = '\0'; // find a \n space \r or ? that delimits the filename for ( i = filenameStart ; i < reqLen ; i++ ) { if ( is_wspace_a ( req [ i ] ) ) break; if ( req [ i ] == '?' ) break; } // now calc the filename length m_filenameLen = i - filenameStart; // return false and set g_errno if it's 0 if ( m_filenameLen <= 0 ) { log("http: got filenameLen<=0: %s",req); g_errno = EBADREQUEST; return false; } // . bitch if too big // . leave room for strcatting "index.html" below if ( m_filenameLen >= MAX_HTTP_FILENAME_LEN - 10 ) { log("http: got filenameLen>=max"); g_errno = EBADREQUEST; return false; } // . decode the filename into m_filename and reassign it's length // . decode %2F to / , etc... m_filenameLen = urlDecode(m_filename,req+filenameStart,m_filenameLen); // NULL terminate m_filename m_filename [ m_filenameLen ] = '\0'; // does it have a file extension AFTER the last / in the filename? bool hasExtension = false; for ( int32_t j = m_filenameLen-1 ; j >= 0 ; j-- ) { if ( m_filename[j] == '.' ) { hasExtension = true; break; } if ( m_filename[j] == '/' ) break; } // if it has no file extension append a /index.html if ( ! hasExtension && m_filename [ m_filenameLen - 1 ] == '/' ) { strcat ( m_filename , "index.html" ); m_filenameLen = strlen ( m_filename ); } // . uses the TcpSocket::m_readBuf // . if *p was ? then keep going m_origUrlRequest = origReq + filenameStart; char *p = origReq + m_filenameLen; for ( ; *p && ! 
is_wspace_a(*p) ; p++ ); m_origUrlRequestLen = p - m_origUrlRequest; // set file offset/size defaults m_fileOffset = 0; // -1 means ALL the file from m_fileOffset onwards m_fileSize = -1; // "e" points to where the range actually starts, if any //char *e; // . TODO: speed up by doing one strstr for Range: and maybe range: // . do they have a Range: 0-100\n in the mime denoting a partial get? //char *s = strstr ( req ,"Range:bytes=" ); //e = s + 12; // try alternate formats //if ( ! s ) { s = strstr ( req ,"Range: bytes=" ); e = s + 13; } //if ( ! s ) { s = strstr ( req ,"Range: " ); e = s + 7; } // parse out the range if we got one //if ( s ) { // int32_t x = 0; // sscanf ( e ,"%" PRId32"-%" PRId32 , &m_fileOffset , &x ); // // get all file if range's 2nd number is non-existant // if ( x == 0 ) m_fileSize = -1; // else m_fileSize = x - m_fileOffset; // // ensure legitimacy // if ( m_fileOffset < 0 ) m_fileOffset = 0; //} // reset our hostname m_hostLen = 0; // assume request is NOT from local network //m_isMasterAdmin = false; m_isLocal = false; // get the virtual hostname they want to use char *s = strstr ( req ,"Host:" ); // try alternate formats if ( ! s ) s = strstr ( req , "host:" ); // must be on its own line, otherwise it's not valid if ( s && s > req && *(s-1) !='\n' ) s = NULL; // parse out the host if we got one if ( s ) { // skip field name, host: s += 5; // skip e to beginning of the host name after "host:" while ( *s==' ' || *s=='\t' ) s++; // find end of the host name char *end = s; while ( *end && !is_wspace_a(*end) ) end++; // . now *end should be \0, \n, \r, ' ', ... // . get host len m_hostLen = end - s; // truncate if too big if ( m_hostLen >= 255 ) m_hostLen = 254; // copy into hostname gbmemcpy ( m_host , s , m_hostLen ); } // NULL terminate it m_host [ m_hostLen ] = '\0'; // get Referer: field s = strstr ( req ,"Referer:" ); // find another if ( ! 
s ) s = strstr ( req ,"referer:" ); // must be on its own line, otherwise it's not valid if ( s && s > req && *(s-1) !='\n' ) s = NULL; // assume no referer m_refLen = 0; // parse out the referer if we got one if ( s ) { // skip field name, referer: s += 8; // skip e to beginning of the host name after ':' while ( *s==' ' || *s=='\t' ) s++; // find end of the host name char *end = s; while ( *end && !is_wspace_a(*end) ) end++; // . now *end should be \0, \n, \r, ' ', ... // . get len m_refLen = end - s; // truncate if too big if ( m_refLen >= 255 ) m_refLen = 254; // copy into m_ref gbmemcpy ( m_ref , s , m_refLen ); } // NULL terminate it m_ref [ m_refLen ] = '\0'; // get User-Agent: field s = strstr ( req ,"User-Agent:" ); // find another if ( ! s ) s = strstr ( req ,"user-agent:" ); // must be on its own line, otherwise it's not valid if ( s && s > req && *(s-1) !='\n' ) s = NULL; // assume empty int32_t len = 0; // parse out the referer if we got one if ( s ) { // skip field name, referer: s += 11; // skip e to beginning of the host name after ':' while ( *s==' ' || *s=='\t' ) s++; // find end of the agent name char *end = s; while ( *end && *end!='\n' && *end!='\r' ) end++; // . now *end should be \0, \n, \r, ' ', ... // . 
get agent len len = end - s; // truncate if too big if ( len > 127 ) len = 127; // copy into m_userAgent gbmemcpy ( m_userAgent , s , len ); } // NULL terminate it m_userAgent [ len ] = '\0'; // get Cookie: field s = strstr ( req, "Cookie:" ); // find another if ( !s ) s = strstr ( req, "cookie:" ); // must be on its own line, otherwise it's not valid if ( s && s > req && *(s-1) != '\n' ) s = NULL; // assume empty // m_cookieBufLen = 0; m_cookiePtr = s; // parse out the cookie if we got one if ( s ) { // skip field name, Cookie: s += 7; // skip s to beginning of cookie after ':' while ( *s == ' ' || *s == '\t' ) s++; // find end of the cookie char *end = s; while ( *end && *end != '\n' && *end != '\r' ) end++; // save length m_cookieLen = end - m_cookiePtr; // get cookie len //m_cookieBufLen = end - s; // trunc if too big //if (m_cookieBufLen > 1023) m_cookieBufLen = 1023; // copy into m_cookieBuf //gbmemcpy(m_cookieBuf, s, m_cookieBufLen); } // NULL terminate it if ( m_cookiePtr ) m_cookiePtr[m_cookieLen] = '\0'; //m_cookieBuf[m_cookieBufLen] = '\0'; // convert every '&' in cookie to a \0 for parsing the fields // for ( int32_t j = 0 ; j < m_cookieBufLen ; j++ ) // if ( m_cookieBuf[j] == '&' ) m_cookieBuf[j] = '\0'; // mark it as cgi if it has a ? bool isCgi = ( req [ i ] == '?' ) ; // reset m_filename length to exclude the ?* stuff if ( isCgi ) { // skip over the '?' i++; // find a space the delmits end of cgi int32_t j; for ( j = i; j < reqLen; j++) if (is_wspace_a(req[j])) break; // now add it if ( ! addCgi ( &req[i] , j-i ) ) return false; // update i i = j; } // . set path ptrs // . the whole /cgi/14.cgi?coll=xxx&..... 
thang m_path = req + filenameStart; m_plen = i - filenameStart; // we're local if hostname is 192.168.[0|1].y //if ( strncmp(iptoa(sock->m_ip),"192.168.1.",10) == 0) { // m_isMasterAdmin = true; m_isLocal = true; } //if ( strncmp(iptoa(sock->m_ip),"192.168.0.",10) == 0) { // m_isMasterAdmin = true; m_isLocal = true; } //if(strncmp(iptoa(sock->m_ip),"192.168.1.",10) == 0) m_isLocal = true; //if(strncmp(iptoa(sock->m_ip),"192.168.0.",10) == 0) m_isLocal = true; if ( sock && strncmp(iptoa(sock->m_ip),"192.168.",8) == 0) m_isLocal = true; if ( sock && strncmp(iptoa(sock->m_ip),"10.",3) == 0) m_isLocal = true; // gotta scan all ips in hosts.conf as well... // if we are coming from any of our own hosts.conf c blocks // consider ourselves local uint32_t last = 0; for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) { Host *h = g_hostdb.getHost(i); // save time with this check if ( h->m_ip == last ) continue; // update it last = h->m_ip; // returns number of top bytes in comon int32_t nt = sock ? ipCmp ( sock->m_ip , h->m_ip ) : 0; // at least be in the same c-block as a host in hosts.conf if ( nt < 3 ) continue; m_isLocal = true; break; } // connectips/adminips // for ( int32_t i = 0 ; i < g_conf.m_numConnectIps ; i++ ) { // if ( sock->m_ip != g_conf.m_connectIps[i] ) continue; // m_isLocal = true; // break; // } // 127.0.0.1 if ( sock && sock->m_ip == 16777343 ) m_isLocal = true; // . TODO: now add any cgi data from a POST..... // . look after the mime //char *d = NULL; // check for body if it was a POST request //if ( m_requestType == RT_POST ) d = strstr ( req , "\r\n\r\n" ); // return true now if no cgi stuff to parse if ( d ) { // now put d's char back, just in case... does it really matter? *d = dc; char *post = d + 4; int32_t postLen = reqLen-(d+4-req) ; // post sometimes has a \r or\n after it while ( postLen > 0 && post[postLen-1]=='\r' ) postLen--; // add it to m_cgiBuf, filter and everything if ( ! 
addCgi ( post , postLen ) ) return false; } // Put '\0' back into the HttpRequest buffer... // crap, not if we are multi-part unencoded stuff... if ( m_cgiBuf && ! multipart ) { // do not mangle the "ucontent"! int32_t cgiBufLen = m_cgiBufLen; cgiBufLen -= m_ucontentLen; char *buf = m_cgiBuf; for (int32_t i = 0; i < cgiBufLen ; i++) if (buf[i] == '&') buf[i] = '\0'; // don't decode the ucontent= field! int32_t decodeLen = m_cgiBufLen; // so subtract that if ( m_ucontent ) decodeLen -= m_ucontentLen; // decode everything. fixed for %00 in &content= so it // doesn't set our parms when injecting. int32_t len = urlDecodeNoZeroes(m_cgiBuf,m_cgiBuf,decodeLen); // we're parsing crap after the null if the last parm // has no value //memset(m_cgiBuf+len, '\0', m_cgiBufLen-len); m_cgiBufLen = len; // ensure that is null i guess if ( ! m_ucontent ) m_cgiBuf[len] = '\0'; } if (m_cgiBuf2){ char *buf = m_cgiBuf2; for (int32_t i = 0; i < m_cgiBuf2Size-1 ; i++) if (buf[i] == '&') buf[i] = '\0'; // decode everything. fixed for %00 in &content= so it // doesn't set our parms when injecting. int32_t len = urlDecodeNoZeroes ( m_cgiBuf2 , m_cgiBuf2 , m_cgiBuf2Size); memset(m_cgiBuf2+len, '\0', m_cgiBuf2Size-len); } // . parse the fields after the ? in a cgi filename // . or fields in the content if it's a POST // . m_cgiBuf must be and is NULL terminated for this parseFields ( m_cgiBuf , m_cgiBufLen ); // Add extra parms to the request. if (m_cgiBuf2Size){ parseFields(m_cgiBuf2, m_cgiBuf2Size); } // urldecode the cookie buf too!! if ( m_cookiePtr ) { char *p = m_cookiePtr; for (int32_t i = 0; i < m_cookieLen ; i++) { //if (p[i] == '&') p[i] = '\0'; // cookies are separated with ';' in the request only if (p[i] == ';') p[i] = '\0'; // a hack for the metacookie=.... 
// which uses &'s to separate its subcookies // this is a hack for msie's limit of 50 cookies if ( p[i] == '&' ) p[i] = '\0'; // set m_metaCookie to start of meta cookie if ( p[i] == 'm' && p[i+1] == 'e' && strncmp(p,"metacookie",10) == 0 ) m_metaCookie = p; } int32_t len = urlDecode ( m_cookiePtr , m_cookiePtr, m_cookieLen ); // we're parsing crap after the null if the last parm // has no value memset(m_cookiePtr+len, '\0', m_cookieLen-len); m_cookieLen = len; } return true; }
// returns false on bad mime bool HttpMime::parse ( char *mime , long mimeLen , Url *url ) { // reset locUrl to 0 m_locUrl.reset(); // return if we have no valid complete mime if ( mimeLen == 0 ) return false; // status is on first line m_status = -1; // skip HTTP/x.x till we hit a space char *p = mime; char *pend = mime + mimeLen; while ( p < pend && !is_wspace_a(*p) ) p++; // then skip over spaces while ( p < pend && is_wspace_a(*p) ) p++; // return false on a problem if ( p == pend ) return false; // then read in the http status m_status = atol2 ( p , pend - p ); // if no Content-Type: mime field was provided, assume html m_contentType = CT_HTML; // assume default charset m_charset = NULL; m_charsetLen = 0; // set contentLen, lastModifiedDate, m_cookie p = mime; while ( p < pend ) { // compute the length of the string starting at p and ending // at a \n or \r long len = 0; while ( &p[len] < pend && p[len]!='\n' && p[len]!='\r' ) len++; // . if we could not find a \n or \r there was an error // . MIMEs must always end in \n or \r if ( &p[len] >= pend ) return false; // . stick a NULL at the end of the line // . 
overwrites \n or \r TEMPORARILY char c = p [ len ]; p [ len ] = '\0'; // parse out some meaningful data if ( strncasecmp ( p , "Content-Length:" ,15) == 0 ) { m_contentLengthPos = p + 15; m_contentLen = atol( m_contentLengthPos); } else if ( strncasecmp ( p , "Last-Modified:" ,14) == 0 ) { m_lastModifiedDate=atotime(p+14); // do not let them exceed current time for purposes // of sorting by date using datedb (see Msg16.cpp) time_t now = time(NULL); if (m_lastModifiedDate > now) m_lastModifiedDate = now; } else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 ) m_contentType = getContentTypePrivate ( p + 13 ); else if ( strncasecmp ( p , "Set-Cookie: " ,11) == 0 ) { m_cookie = p + 11; m_cookieLen = gbstrlen ( p + 11 ); } else if ( strncasecmp ( p , "Location:" , 9) == 0 ) { // point to it char *tt = p + 9; // skip if space if ( *tt == ' ' ) tt++; if ( *tt == ' ' ) tt++; // at least set this for Msg13.cpp to use m_locationField = tt; m_locationFieldLen = gbstrlen(tt); // . we don't add the "www." because of slashdot.com // . we skip initial spaces in this Url::set() routine if(url) m_locUrl.set ( url, p + 9, len - 9, false/*addWWW?*/); } else if ( strncasecmp ( p , "Content-Encoding:", 17) == 0 ) { //only support gzip now, it doesn't seem like servers //implement the other types much m_contentEncodingPos = p+17; if(strstr(m_contentEncodingPos, "gzip")) { m_contentEncoding = ET_GZIP; } else if(strstr(m_contentEncodingPos, "deflate")) { //zlib's compression m_contentEncoding = ET_DEFLATE; } } //else if ( strncasecmp ( p, "Cookie:", 7) == 0 ) // log (LOG_INFO, "mime: Got Cookie = %s", (p+7)); // re-insert the character that we replaced with a '\0' p [ len ] = c; // go to next line p += len; // skip over the cruft at the end of this line while ( p < pend && ( *p=='\r' || *p=='\n' ) ) p++; } return true; }
// returns false on bad mime bool HttpMime::parse(char *mime, int32_t mimeLen, Url *url) { #ifdef _VALGRIND_ VALGRIND_CHECK_MEM_IS_DEFINED(mime,mimeLen); #endif // reset locUrl to 0 m_locUrl.reset(); // return if we have no valid complete mime if (mimeLen == 0) { return false; } // status is on first line m_status = -1; // skip HTTP/x.x till we hit a space char *p = mime; char *pend = mime + mimeLen; while (p < pend && !is_wspace_a(*p)) p++; // then skip over spaces while (p < pend && is_wspace_a(*p)) p++; // return false on a problem if (p == pend) return false; // then read in the http status m_status = atol2(p, pend - p); // if no Content-Type: mime field was provided, assume html m_contentType = CT_HTML; // assume default charset m_charset = NULL; m_charsetLen = 0; // skip over first line getNextLine(); while (getNextLine()) { const char *field = NULL; size_t fieldLen = 0; if (getField(&field, &fieldLen)) { if (parseContentEncoding(field, fieldLen)) { continue; } if (parseContentLength(field, fieldLen)) { continue; } if (parseContentType(field, fieldLen)) { continue; } if (parseLocation(field, fieldLen, url)) { continue; } if (parseSetCookie(field, fieldLen)) { continue; } // add parsing of other header here } } return true; }
char *getMatches2 ( Needle *needles , int32_t numNeedles , char *haystack , int32_t haystackSize , char *linkPos , int32_t *needleNum , bool stopAtFirstMatch , bool *hadPreMatch , bool saveQuickTables , int32_t niceness ) { // assume not if ( hadPreMatch ) *hadPreMatch = false; // empty haystack? then no matches if ( ! haystack || haystackSize <= 0 ) return NULL; // JAB: no needles? then no matches if ( ! needles || numNeedles <= 0 ) return NULL; //char tmp[8192]; //char *t = tmp; //char *tend = tmp + 8192; // reset counts to 0 //if ( ! stopAtFirstMatch ) // for ( int32_t i=0 ; i < numNeedles ; i++ ) // needles[i].m_count = 0; // are we responsible for init'ing string lengths? this is much // faster than having to specify lengths manually. for ( int32_t i=0 ; i < numNeedles; i++ ) { // breathe QUICKPOLL(niceness); // clear needles[i].m_count = 0; needles[i].m_firstMatch = NULL; // set the string size in bytes if not provided if ( needles[i].m_stringSize == 0 ) needles[i].m_stringSize = gbstrlen(needles[i].m_string); } // . set up the quick tables. // . utf16 is not as effective here because half the bytes are zeroes! // . TODO: use a static cache of like 4 of these tables where the key // is the Needles ptr ... done int32_t numNeedlesToInit = numNeedles; char space[256 * 4 * sizeof(BITVEC)]; char *buf = NULL; BITVEC *s0; BITVEC *s1; BITVEC *s2; BITVEC *s3; /* static bool s_quickTableInit = false; static char s_qtbuf[128*(12+1)*2]; int32_t slot = -1; if(saveQuickTables) { if ( ! 
s_quickTableInit ) { s_quickTableInit = true; s_quickTables.set(8,4,128,s_qtbuf,256*13,false,0,"qx"); } uint64_t key = (uint32_t)needles; slot = s_quickTables.getSlot(&key); if ( slot >= 0 ) { buf = s_quickTables.getValueFromSlot(slot); numNeedlesToInit = 0; } } */ if(!buf) { buf = space; memset ( buf , 0 , sizeof(BITVEC)*256*4); } /* if( useQuickTables && slot == -1 ) { //buf = (char*)mcalloc(sizeof(uint32_t)*256*5, // "matches"); if(buf) s_quickTables.addKey(&key, &buf); //sanity check, no reason why there needs to be a //limit, I just don't expect there to be this many //static needles at this point. if(s_quickTables.getNumSlotsUsed() > 32){ char *xx=NULL; *xx = 0; } } */ // try 64 bit bit vectors now since we doubled # of needles int32_t offset = 0; s0 = (BITVEC *)(buf + offset); offset += sizeof(BITVEC)*256; s1 = (BITVEC *)(buf + offset); offset += sizeof(BITVEC)*256; s2 = (BITVEC *)(buf + offset); offset += sizeof(BITVEC)*256; s3 = (BITVEC *)(buf + offset); offset += sizeof(BITVEC)*256; BITVEC mask; // set the letter tables, s0[] through sN[], for each needle for ( int32_t i = 0 ; i < numNeedlesToInit ; i++ ) { // breathe QUICKPOLL(niceness); unsigned char *w = (unsigned char *)needles[i].m_string; unsigned char *wend = w + needles[i].m_stringSize; // BITVEC is now 64 bits mask = (1<<(i&0x3f)); // (1<<(i%64)); // if the needle is small, fill up the remaining letter tables // with its mask... so it matches any character in haystack. 
s0[(unsigned char)to_lower_a(*w)] |= mask; s0[(unsigned char)to_upper_a(*w)] |= mask; w += 1;//step; if ( w >= wend ) { for ( int32_t j = 0 ; j < 256 ; j++ ) { s1[j] |= mask; s2[j] |= mask; s3[j] |= mask; } continue; } s1[(unsigned char)to_lower_a(*w)] |= mask; s1[(unsigned char)to_upper_a(*w)] |= mask; w += 1;//step; if ( w >= wend ) { for ( int32_t j = 0 ; j < 256 ; j++ ) { s2[j] |= mask; s3[j] |= mask; } continue; } s2[(unsigned char)to_lower_a(*w)] |= mask; s2[(unsigned char)to_upper_a(*w)] |= mask; w += 1;//step; if ( w >= wend ) { for ( int32_t j = 0 ; j < 256 ; j++ ) { s3[j] |= mask; } continue; } s3[(unsigned char)to_lower_a(*w)] |= mask; s3[(unsigned char)to_upper_a(*w)] |= mask; w += 1;//step; } // return a ptr to the first match if we should, this is it char *retVal = NULL; // debug vars //int32_t debugCount = 0; //int32_t pp = 0; // now find the first needle in the haystack unsigned char *p = (unsigned char *)haystack; unsigned char *pend = (unsigned char *)haystack + haystackSize; char *dend = (char *)pend; // do not breach! pend -= 4; for ( ; p < pend ; p++ ) { // breathe QUICKPOLL(niceness); //if ( (char *)p - (char *)haystack >= 12508 ) // log("hey"); // analytics... // is this a possible match? (this should be VERY fast) mask = s0[*(p+0)]; if ( ! mask ) continue; mask &= s1[*(p+1)]; if ( ! mask ) continue; mask &= s2[*(p+2)]; if ( ! mask ) continue; mask &= s3[*(p+3)]; if ( ! mask ) continue; //debugCount++; /* // display char oo[148]; char *xx ; xx = oo; //gbmemcpy ( xx , p , 8 ); for ( int32_t k = 0 ; k < 5 ; k++ ) { *xx++ = p[k]; } gbmemcpy ( xx , "..." , 3 ); xx += 3; */ // // XXX: do a hashtable lookup here so we have the candidate // matches in a chain... // XXX: for small needles which match frequently let's have // a single char hash table, a 2 byte char hash table, // etc. so if we have small needles we check the hash // in those tables first, but only if mask & SMALL_NEEDLE // is true! 
the single byte needle hash table can just // be a lookup table. just XOR the bytes together for // the hash. // XXX: just hash the mask into a table to get candidate // matches in a chain? but there's 4B hashes!! // we got a good candidate, loop through all the needles for ( int32_t j = 0 ; j < numNeedles ; j++ ) { // skip if does not match mask, will save time if ( ! ((1<<(j&0x3f)) & mask) ) continue; if( needles[j].m_stringSize > 3) { // ensure first 4 bytes matches this needle's if (needles[j].m_string[0]!=to_lower_a(*(p+0))) continue; if (needles[j].m_string[1]!=to_lower_a(*(p+1))) continue; if (needles[j].m_string[2]!=to_lower_a(*(p+2))) continue; if (needles[j].m_string[3]!=to_lower_a(*(p+3))) continue; } // get needle size int32_t msize = needles[j].m_stringSize; // can p possibly be big enough? if ( pend - p < msize ) continue; // needle is "m" now char *m = needles[j].m_string; char *mend = needles[j].m_stringSize + m; // use a tmp ptr for ptr into haystack char *d = (char *)p; // skip first 4 bytes since we know they match if(msize > 3) { d += 4; m += 4; } // loop over each char in "m" //for ( ; *m ; m++ ) { for ( ; m < mend ; m++ ) { //while ( ! *d && d < dend ) d++; //while ( ! *m && m < mend ) m++; // if we are a non alnum, that will match // any string of non-alnums, like a space // for instance. the 0 byte does not count // because it is used in utf16 a lot. this // may trigger some false matches in utf16 // but, oh well... this way "link partner" // will match "link - partner" in the haystk if ( is_wspace_a(*m) && m < mend ) { // skip all in "d" then. while (d<dend&&is_wspace_a(*d)) d++; // advance m then continue; } // make sure we match otherwise if ( *m != to_lower_a(*d) ) break; // ok, we matched, go to next d++; } // if not null, keep going if ( m < mend ) continue; // if this needle is "special" AND it occurs AFTER // linkPos, then do not consider it a match. 
this is // if we have a comment section indicator, like // "div id=\"comment" AND it occurs AFTER linkPos // (the char ptr to our link in the haystack) then // the match does not count. if ( linkPos && needles[j].m_isSection && (char *)p>linkPos ) { // record this for LinkText.cpp if ( hadPreMatch ) *hadPreMatch = true; continue; } // store ptr if NULL if ( ! needles[j].m_firstMatch ) needles[j].m_firstMatch = (char *)p; // return ptr to needle in "haystack" if ( stopAtFirstMatch ) { // ok, we got a match if ( needleNum ) *needleNum = j; //return (char *)p; retVal = (char *)p; p = pend; break; } // otherwise, just count it needles[j].m_count++; // see if we match another needle, fixes bug // of matching "anal" but not "analy[tics]" continue; // advance to next char in the haystack break; } // ok, we did not match any needles, advance p and try again } // // HACK: // // repeat above loop but for the last 4 characters in haystack!! // this fixes a electric fence mem breach core // // it is slower because we check for \0 // pend += 4; for ( ; p < pend ; p++ ) { // breathe QUICKPOLL(niceness); //if ( (char *)p - (char *)haystack >= 12508 ) // log("hey"); // is this a possible match? (this should be VERY fast) mask = s0[*(p+0)]; if ( ! mask ) continue; if ( p+1 < pend ) { mask &= s1[*(p+1)]; if ( ! mask ) continue; } if ( p+2 < pend ) { mask &= s2[*(p+2)]; if ( ! mask ) continue; } if ( p+3 < pend ) { mask &= s3[*(p+3)]; if ( ! mask ) continue; } //debugCount++; /* // display char oo[148]; char *xx ; xx = oo; //gbmemcpy ( xx , p , 8 ); for ( int32_t k = 0 ; k < 5 ; k++ ) { *xx++ = p[k]; } gbmemcpy ( xx , "..." , 3 ); xx += 3; */ // // XXX: do a hashtable lookup here so we have the candidate // matches in a chain... // XXX: for small needles which match frequently let's have // a single char hash table, a 2 byte char hash table, // etc. so if we have small needles we check the hash // in those tables first, but only if mask & SMALL_NEEDLE // is true! 
the single byte needle hash table can just // be a lookup table. just XOR the bytes together for // the hash. // XXX: just hash the mask into a table to get candidate // matches in a chain? but there's 4B hashes!! // we got a good candidate, loop through all the needles for ( int32_t j = 0 ; j < numNeedles ; j++ ) { // skip if does not match mask, will save time if ( ! ((1<<(j&0x3f)) & mask) ) continue; if( needles[j].m_stringSize > 3) { // ensure first 4 bytes matches this needle's if (needles[j].m_string[0]!=to_lower_a(*(p+0))) continue; if (!p[1] || needles[j].m_string[1]!=to_lower_a(*(p+1))) continue; if (!p[2] || needles[j].m_string[2]!=to_lower_a(*(p+2))) continue; if (!p[3] || needles[j].m_string[3]!=to_lower_a(*(p+3))) continue; } // get needle size int32_t msize = needles[j].m_stringSize; // can p possibly be big enough? if ( pend - p < msize ) continue; // needle is "m" now char *m = needles[j].m_string; char *mend = needles[j].m_stringSize + m; // use a tmp ptr for ptr into haystack char *d = (char *)p; // skip first 4 bytes since we know they match if(msize > 3) { d += 4; m += 4; } // loop over each char in "m" //for ( ; *m ; m++ ) { for ( ; m < mend ; m++ ) { //while ( ! *d && d < dend ) d++; //while ( ! *m && m < mend ) m++; // if we are a non alnum, that will match // any string of non-alnums, like a space // for instance. the 0 byte does not count // because it is used in utf16 a lot. this // may trigger some false matches in utf16 // but, oh well... this way "link partner" // will match "link - partner" in the haystk if ( is_wspace_a(*m) && m < mend ) { // skip all in "d" then. while (d<dend&&is_wspace_a(*d)) d++; // advance m then continue; } // make sure we match otherwise if ( *m != to_lower_a(*d) ) break; // ok, we matched, go to next d++; } // if not null, keep going if ( m < mend ) continue; // if this needle is "special" AND it occurs AFTER // linkPos, then do not consider it a match. 
this is // if we have a comment section indicator, like // "div id=\"comment" AND it occurs AFTER linkPos // (the char ptr to our link in the haystack) then // the match does not count. if ( linkPos && needles[j].m_isSection && (char *)p>linkPos ) { // record this for LinkText.cpp if ( hadPreMatch ) *hadPreMatch = true; continue; } // store ptr if NULL if ( ! needles[j].m_firstMatch ) needles[j].m_firstMatch = (char *)p; // return ptr to needle in "haystack" if ( stopAtFirstMatch ) { // ok, we got a match if ( needleNum ) *needleNum = j; //return (char *)p; retVal = (char *)p; p = pend; break; } // otherwise, just count it needles[j].m_count++; // advance to next char in the haystack break; } // ok, we did not match any needles, advance p and try again } //if ( debugCount > 0 ) pp = haystackSize / debugCount; //log("build: debug count = %"INT32" uc=%"INT32" hsize=%"INT32" " // "1 in %"INT32" chars matches.", // debugCount,(int32_t)isHaystackUtf16,haystackSize,pp); // before we exit, clean up return retVal; }
bool HttpMime::getAttribute(const char **attribute, size_t *attributeLen, const char **attributeValue, size_t *attributeValueLen) { // initialize value *attribute = NULL; *attributeLen = 0; *attributeValue = NULL; *attributeValueLen = 0; // no attribute if (m_attributeStartPos == 0) { return false; } // strip starting whitespaces while (is_wspace_a(m_currentLine[m_attributeStartPos]) && (m_attributeStartPos < m_currentLineLen)) { ++m_attributeStartPos; } *attribute = m_currentLine + m_attributeStartPos; *attributeLen = m_currentLineLen - m_attributeStartPos; // next attribute const char *semicolonPos = (const char *)memchr(*attribute, ';', *attributeLen); if (semicolonPos) { *attributeLen = semicolonPos - *attribute; m_attributeStartPos = semicolonPos - m_currentLine + 1; } else { m_attributeStartPos = 0; } // attribute value const char *equalPos = (const char *)memchr(*attribute, '=', *attributeLen); if (equalPos) { *attributeValueLen = *attributeLen; *attributeLen = equalPos - *attribute; *attributeValueLen -= *attributeLen + 1; *attributeValue = equalPos + 1; // strip ending attribute whitespace while (is_wspace_a((*attribute)[*attributeLen - 1])) { --(*attributeLen); } // strip starting attribute value whitespace/quote while (is_wspace_a((*attributeValue)[0]) || (*attributeValue)[0] == '"' || (*attributeValue)[0] == '\'') { ++(*attributeValue); --(*attributeValueLen); } // strip ending attribute value whitespace/quote while (is_wspace_a((*attributeValue)[*attributeValueLen - 1]) || (*attributeValue)[*attributeValueLen - 1] == '"' || (*attributeValue)[*attributeValueLen - 1] == '\'') { --(*attributeValueLen); } } // cater for empty values between semicolon // eg: Set-Cookie: name=value; Path=/; ;SECURE; HttpOnly; if (*attributeLen == 0 && m_attributeStartPos) { return getAttribute(attribute, attributeLen, attributeValue, attributeValueLen); } logTrace(g_conf.m_logTraceHttpMime, "attribute='%.*s' value='%.*s'", static_cast<int>(*attributeLen), *attribute, 
static_cast<int>(*attributeValueLen), *attributeValue); return (*attributeLen > 0); }
// . parse the \0-terminated text in "json" into JsonItems stored back to back
//   in m_sb, each string/number/bool item followed by its \0-terminated value
// . a first pass estimates the space needed so that m_sb never reallocates
//   while we hold pointers into it (names point into m_sb)
// . '{' and '[' create JT_OBJECT / JT_ARRAY items that are pushed on m_stack;
//   '}' and ']' pop the stack and restore the name of the popped parent
// . a quoted string followed by ':' becomes the current field name, any
//   other quoted string becomes a JT_STRING value carrying that name
// . true/false and bare numbers become JT_NUMBER items
// . returns a ptr to the first item (the start of m_sb)
// . returns NULL if "json" is NULL, on allocation failure, if nesting
//   exceeds MAXJSONPARENTS, or on malformed input (g_errno is set to
//   EBADJSONPARSER in the malformed cases handled explicitly below)
JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {

	m_prev = NULL;
	m_stackPtr = 0;
	m_sb.purge();

	JsonItem *ji = NULL;

	if ( ! json ) return NULL;

	// how much space will we need to avoid any reallocs?
	char *p = json;
	bool inQuote = false;
	int32_t need = 0;
	for ( ; *p ; p++ ) {
		// ignore any escaped char. also \x1234
		if ( *p == '\\' ) {
			if ( p[1] ) p++;
			continue;
		}
		if ( *p == '\"' )
			inQuote = ! inQuote;
		if ( inQuote )
			continue;
		if ( *p == '{' ||
		     *p == ',' ||
		     *p == '[' ||
		     *p == ':' )
			// +1 for null terminating string of each item
			need += sizeof(JsonItem) +1;
	}
	// plus the length of the string to store it decoded etc.
	need += p - json;
	// plus a \0 for the value and a \0 for the name of each jsonitem
	need += 2;
	// prevent cores for now
	need += 10;
	// . to prevent safebuf from reallocating do this
	// . safeMemcpy() calls reserve(m_length+len) and reserves
	//   tries to alloc m_length + (m_length+len) so since,
	//   m_length+len should never be more than "need" we need to
	//   double up here
	need *= 2;
	// this should be enough
	if ( ! m_sb.reserve ( need ) ) return NULL;
	// for testing if we realloc
	char *mem = m_sb.getBufStart();

	int32_t  size;

	// the "name cursor": name given to the next value/object/array
	char *NAME = NULL;
	int32_t  NAMELEN = 0;

	// reset p
	p = json;
	// json maybe bad utf8 causing us to miss the \0 char, so use "pend"
	char *pend = json + gbstrlen(json);

	// scan
	for ( ; p < pend ; p += size ) {
		// get size
		size = getUtf8CharSize ( p );

		// skip spaces
		if ( is_wspace_a (*p) ) continue;

		// skip commas
		if ( *p == ',' ) continue;

		// did we hit a '{'? that means the existing json item
		// is a parent of the item(s) inside the {}'s
		if ( *p == '{' ) {
			// if ji is non-null it must be a name like in
			// \"stats\":{\"fetchTime\":2069,....}
			// . this indicates the start of a json object
			ji = addNewItem();
			if ( ! ji ) return NULL;
			// current ji is an object type then
			ji->m_type = JT_OBJECT;
			// set the name
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			// this goes on the stack
			if ( m_stackPtr >= MAXJSONPARENTS ) return NULL;
			m_stack[m_stackPtr++] = ji;
			// and null this
			ji = NULL;
			continue;
		}
		// pop the stack?
		if ( *p == '}' ) {
			// just pop it and restore name cursor
			if ( m_stackPtr > 0 ) {
				JsonItem *px = m_stack[m_stackPtr-1];
				NAME    = px->m_name;
				NAMELEN = px->m_nameLen;
				m_stackPtr--;
			}
			continue;
		}
		// array of things?
		if ( *p == '[' ) {
			// make a newitem to put on stack
			ji = addNewItem();
			if ( ! ji ) return NULL;
			// current ji is an array type then
			ji->m_type = JT_ARRAY;
			// start of array hack. HACK!
			// remember where the array text starts so that the
			// matching ']' can compute its length
			ji->m_valueArray = p;
			// set the name
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			// init to a bogus value. should be set below.
			// at least this should avoid a core in XmlDoc.cpp
			// getTokenizedDiffbotReply()
			ji->m_valueLen = 0;
			// this goes on the stack
			if ( m_stackPtr >= MAXJSONPARENTS ) return NULL;
			m_stack[m_stackPtr++] = ji;
			ji = NULL;
			continue;
		}
		// pop the stack?
		if ( *p == ']' ) {
			// just pop it and restore name cursor
			if ( m_stackPtr > 0 ) {
				JsonItem *px = m_stack[m_stackPtr-1];
				NAME    = px->m_name;
				NAMELEN = px->m_nameLen;
				// start of array hack. HACK!
				char *start = (char *)px->m_valueArray;
				// include ending ']' in length of array
				px->m_valueLen = p - start + 1;
				m_stackPtr--;
			}
			continue;
		}

		// a quote?
		if ( *p == '\"' ) {
			// find end of quote
			char *end = p + 1;
			for ( ; *end ; end++ ) {
				// skip two chars if escaped
				if ( *end == '\\' && end[1] ) {
					end++;
					continue;
				}
				// this quote is unescaped then
				if ( *end == '\"' ) break;
			}
			// field?
			// if the string was never closed "end" sits on the
			// terminating \0; do not step past it or we would
			// read beyond the end of the input
			char *x = end;
			if ( *end ) x = end + 1;
			// skip spaces
			for ( ; *x && is_wspace_a(*x) ; x++ );
			// define the string
			char *str  = p + 1;
			int32_t  slen = end - str;
			// . if a colon follows, it was a field
			if ( *x == ':' ) {
				// we can't be the first thing in the safebuf
				// json must start with { or [ i guess
				// otherwise getFirstItem() won't work!
				if ( m_sb.m_length==0 ) {
					g_errno = EBADJSONPARSER;
					return NULL;
				}
				// let's push this now so we can \0 term
				char *savedStr = m_sb.getBuf();
				m_sb.safeMemcpy ( str , slen );
				m_sb.pushChar('\0');
				// just set the name cursor
				NAME    = savedStr;
				NAMELEN = slen;
			}
			// . otherwise, it was field value, so index it
			// . TODO: later make field names compounded to
			//   better represent nesting?
			// . added 'else if (NAME){' fix for json=\"too small\"
			else if ( NAME ) {
				// make a new one in safebuf. our
				// parent will be the array type item.
				ji = addNewItem();
				if ( ! ji ) return NULL;
				// we are a string
				ji->m_type = JT_STRING;
				// use name cursor
				ji->m_name    = NAME;
				ji->m_nameLen = NAMELEN;
				// get length decoded
				int32_t curr = m_sb.length();
				// store decoded string right after jsonitem
				if ( !m_sb.safeDecodeJSONToUtf8 (str,slen,
								 niceness ))
					return NULL;
				// store length decoded json
				ji->m_valueLen = m_sb.length() - curr;
				// end with a \0
				m_sb.pushChar('\0');
				// ok, this one is done
				ji = NULL;
			}
			else {
				log("json: fieldless name in json");
				g_errno = EBADJSONPARSER;
				return NULL;
			}
			// skip over the string
			size = 0;
			p    = x;
			continue;
		}

		// true or false?
		if ( (*p == 't' && strncmp(p,"true",4)==0) ||
		     (*p == 'f' && strncmp(p,"false",5)==0) ) {
			// make a new one
			ji = addNewItem();
			if ( ! ji ) return NULL;
			// copy the number as a string as well
			int32_t curr = m_sb.length();
			// what is the length of it?
			int32_t slen = 4;
			ji->m_valueLong = 1;
			ji->m_valueDouble = 1.0;
			if ( *p == 'f' ) {
				slen = 5;
				ji->m_valueLong = 0;
				ji->m_valueDouble = 0;
			}
			// store decoded string right after jsonitem
			if ( !m_sb.safeDecodeJSONToUtf8 (p,slen,niceness))
				return NULL;
			// store length decoded json
			ji->m_valueLen = m_sb.length() - curr;
			// end with a \0
			m_sb.pushChar('\0');
			// booleans are stored as numbers (1 / 0)
			ji->m_type = JT_NUMBER;
			// use name cursor
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			ji = NULL;
			// only advance one byte; the remaining letters of
			// the literal match nothing above and are skipped
			size = 1;
			continue;
		}

		// if we hit a digit they might not be in quotes like
		// "crawled":123
		if ( is_digit ( *p ) ||
		     // like .123 ?
		     ( *p == '.' && is_digit(p[1]) ) ) {
			// find end of the number
			char *end = p + 1;
			// . allow '.' for decimal numbers
			// . TODO: allow E for exponent
			for ( ; *end && (is_digit(*end) || *end=='.');end++) ;
			// define the string
			char *str  = p;
			int32_t  slen = end - str;
			// make a new one
			ji = addNewItem();
			if ( ! ji ) return NULL;
			// back up over negative sign? the sign becomes part
			// of the stored text, so the length must grow with
			// it or the last digit would be cut off.
			if ( str > json && str[-1] == '-' ) {
				str--;
				slen++;
			}
			// decode
			ji->m_valueLong = atol(str);
			ji->m_valueDouble = atof(str);
			// copy the number as a string as well
			int32_t curr = m_sb.length();
			// store decoded string right after jsonitem
			if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,niceness))
				return NULL;
			// store length decoded json
			ji->m_valueLen = m_sb.length() - curr;
			// end with a \0
			m_sb.pushChar('\0');
			ji->m_type = JT_NUMBER;
			// use name cursor
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			ji = NULL;
			// skip over the string
			size = 0;
			p    = end;
			continue;
		}
	}

	// for testing if we realloc: names and items point into m_sb, so a
	// moved buffer means dangling pointers. crash on purpose.
	char *memEnd = m_sb.getBufStart();
	if ( mem != memEnd ) { char *xx=NULL;*xx=0; }

	return (JsonItem *)m_sb.getBufStart();
}
// . look up the value of cookie "field" in the cookie held at m_cookiePtr
//   (m_cookieLen bytes) and return a ptr to it, or "defaultStr" if there is
//   no cookie or no match
// . if "len" is non-NULL it is set to 0 up front and to the value length for
//   every match that is examined
// . the value is \0-terminated IN PLACE, so the cookie buffer is modified
// . a match located after m_metaCookie is returned right away; a match
//   located before it is only remembered and returned if no later match wins
// . NOTE(review): the original header said "*next is set to ptr into
//   m_cgiBuf so that the next successive call ... will start at *next", but
//   "next" is never read or written in this function
char *HttpRequest::getStringFromCookie ( char *field , long *len , char *defaultStr , long *next ) {
	// get field len
	long flen = gbstrlen(field);
	// assume none
	if ( len ) *len = 0;
	// if no cookie, forget it
	if ( ! m_cookiePtr ) return defaultStr;
	// the end of the cookie
	//char *pend = m_cookieBuf + m_cookieBufLen;
	char *pend = m_cookiePtr + m_cookieLen;
	char *p    = m_cookiePtr;
	// skip over spaces and punct
	for ( ; p && p < pend ; p++ )
		if ( is_alnum_a(*p) ) break;
	// skip "Cookie:"
	if ( p + 7 < pend && ! strncasecmp(p,"cookie:",7) ) p += 7;
	// skip spaces after that
	for ( ; p && p < pend ; p++ )
		if ( is_alnum_a(*p) ) break;
	// crazy?
	if ( p >= pend ) return defaultStr;

	// last match seen that was NOT past m_metaCookie
	char *savedVal = NULL;
	// so we do not skip the first cookie, jump right in!
	// otherwise we lose the calendar cookie for msie
	// (this jumps INTO the loop body below, bypassing the "find the
	// next \0 separator" part for the very first cookie)
	goto entryPoint;
	// . loop over all xxx=yyy\0 thingies in the cookie
	// . we converted every '&' to a \0 when the cookiebuf was set above
	//for ( char *p = m_cookieBuf ; *p ; p += gbstrlen(p) + 1 ) {
	// . no, we just keep them as &'s because seems like cookies use ;'s
	//   as delimeters not so much &'s. and when we log the cookie in the
	//   log, i wanted to see the whole cookie, so having \0's in the
	//   cookie was messing that up.
	for ( ; p < pend ; p++ ) {
		// need a \0
		// fixes "display=0&map=0&calendar=0;" that is only one cookie.
		// so do not grap value of map or calendar from that!!
		if ( *p ) continue;
		// back to back \0's? be careful how we skip over them!
		// NOTE(review): p[1] is read before checking p+1 < pend
		if ( ! p[1] ) continue;
		// skip that
		if ( ++p >= pend ) break;
		// skip whitespace that follows
		for ( ; p < pend ; p++ )
			if ( ! is_wspace_a(*p) ) break;
		// end of cookie?
		if ( p >= pend ) break;

	entryPoint:
		// check first char
		if ( *p != *field ) continue;
		// does it match? continue if not a match
		if ( strncmp ( p , field , flen ) ) continue;
		// point to value
		char *val = p + flen;
		// must be an equal sign
		if ( *val != '=' ) continue;
		// skip that sign
		val++;
		// . cookies terminate fields by space or ; or &
		// . skip to end of cookie value for this field
		char *e = val;
		// skip over alnum. might also be \0 if this function
		// was already called somewhere else!
		// we NULL separated each cookie and then urldecoded each
		// cookie above in the m_cookieBuf logic. cookies can contain
		// encoded ;'s and &'s so i took this checks out of this while
		// loop. like the widgetHeader has semicolons in it and it
		// stores in the cookie.
		while ( e < pend && *e ) e++;
		// that is the length
		if ( len ) *len = e - val;
		// NULL terminate it, we should have already logged the cookie
		// so it should be ok to NULL terminate now. we already
		// call urlDecode() now above... and make the &'s into \0's
		// NOTE(review): if e == pend this writes the byte at pend;
		// presumably the buffer has room for a terminator - confirm
		*e = '\0';
		// if we were in the meta cookie, return that...
		// otherwise if you visited this site before metacookies
		// were used you might have the cookie outside the meta
		// cookie AND inside the metacookie, and only the value
		// inside the metacookie is legit...
		if ( val > m_metaCookie ) return val;
		// otherwise, save it and try to get from meta cookie
		savedVal = val;
		// length
		//if ( len ) *len = gbstrlen(val);
		// this is the value!
		//return val;
	}
	// did we save something?
	if ( savedVal ) return savedVal;
	// no match
	return defaultStr;
}
// . return the value of the specified "field" within this html tag, "s" // . the case of "field" does not matter char *getFieldValue ( char *s , long slen , char *field , long *valueLen ) { // reset this to 0 *valueLen = 0; // scan for the field name in our node long flen = gbstrlen(field); char inQuotes = '\0'; long i; // make it sane if ( slen > 2000 ) slen = 2000; for ( i = 1; i + flen < slen ; i++ ) { // skip the field if it's quoted if ( inQuotes) { if (s[i] == inQuotes ) inQuotes = 0; continue; } // set inQuotes to the quote if we're in quotes if ( (s[i]=='\"' || s[i]=='\'')){ inQuotes = s[i]; continue; } // if not in quote tag might end if ( s[i] == '>' && ! inQuotes ) return NULL; // a field name must be preceeded by non-alnum if ( is_alnum_a ( s[i-1] ) ) continue; // the first character of this field shout match field[0] if ( to_lower_a (s[i]) != to_lower_a(field[0] )) continue; // field just be immediately followed by an = or space if (s[i+flen]!='='&&!is_wspace_a(s[i+flen]))continue; // field names must match if ( strncasecmp ( &s[i], field, flen ) != 0 ) continue; // break cuz we got a match for our field name break; } // return NULL if no matching field if ( i + flen >= slen ) return NULL; // advance i over the fieldname so it pts to = or space i += flen; // advance i over spaces while ( i < slen && is_wspace_a ( s[i] ) ) i++; // advance over the equal sign, return NULL if does not exist if ( i < slen && s[i++] != '=' ) return NULL; // advance i over spaces after the equal sign while ( i < slen && is_wspace_a ( s[i] ) ) i++; // now parse out the value of this field (could be in quotes) inQuotes = '\0'; // set inQuotes to the quote if we're in quotes if ( s[i]=='\"' || s[i]=='\'') inQuotes = s[i++]; // mark this as the start of the value int start=i; // advance i until we hit a space, or we hit a that quote if inQuotes if (inQuotes) while (i<slen && s[i] != inQuotes ) i++; else while ( i<slen &&!is_wspace_a(s[i])&&s[i]!='>')i++; // set the length of 
the value *valueLen = i - start; // return a ptr to the value return s + start; }
// . so now this adds a list of Synonyms to the m_pools[] and returns a ptr
//   to the first one.
// . then the parent caller can store that ptr in the m_wordToSyn[] array
//   which we pre-alloc upon calling the set() function based on the # of
//   words we got
// . returns # of synonyms stored into "tmpBuf"
// . "tmpBuf" is carved into parallel arrays of MAX_SYNS entries each
//   (m_aids, m_wids0, m_wids1, m_termPtrs, m_termLens, m_numAlnumWords,
//   m_numAlnumWordsInBase, m_src); the caller must provide enough room
// . lookup order: wiktionary synset of the bigram (this word + next alnum
//   word), then of the word itself, then of the word minus a trailing 's,
//   then the same again in English for non-English docs. if a synset is
//   found only its entries are returned. otherwise the stripped /
//   amp-phrase / apostrophe-less variants are added.
long Synonyms::getSynonyms ( Words *words ,
			     long wordNum ,
			     uint8_t langId ,
			     char *tmpBuf ,
			     long niceness ) {

	// punct words have no synoyms
	if ( ! words->m_wordIds[wordNum] ) return 0;

	// store these
	m_words     = words;
	m_docLangId = langId;
	m_niceness  = niceness;

	// sanity check
	// NOTE(review): this runs after m_wordIds[wordNum] was already read
	// above and lets wordNum == m_numWords through - confirm intent
	if ( wordNum > m_words->m_numWords ) { char *xx=NULL;*xx=0; }

	// init the dedup table to dedup wordIds
	HashTableX dt;
	char dbuf[512];
	dt.set(8,0,12,dbuf,512,false,m_niceness,"altwrds");

	long maxSyns = (long)MAX_SYNS;

	char *bufPtr = tmpBuf;

	// point into buffer
	m_aids = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	// then the word ids
	m_wids0 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	// second word ids, for multi alnum word synonyms, i.e. "New Jersey"
	m_wids1 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	// NOTE(review): the four arrays below are laid out with a stride of
	// 4 bytes per entry, which only fits char* and long when both are
	// 4 bytes wide (32-bit build) - confirm before building 64-bit
	m_termPtrs = (char **)bufPtr;
	bufPtr += maxSyns * 4;

	m_termLens = (long *)bufPtr;
	bufPtr += maxSyns * 4;

	m_numAlnumWords = (long *)bufPtr;
	bufPtr += maxSyns * 4;

	m_numAlnumWordsInBase = (long *)bufPtr;
	bufPtr += maxSyns * 4;

	// source
	m_src = bufPtr;
	bufPtr += maxSyns;

	// cursors
	m_aidsPtr  = m_aids;
	m_wids0Ptr = m_wids0;
	m_wids1Ptr = m_wids1;
	m_srcPtr   = m_src;
	m_termPtrsPtr = m_termPtrs;
	m_termLensPtr = m_termLens;
	m_numAlnumWordsPtr = m_numAlnumWords;
	m_numAlnumWordsInBasePtr = m_numAlnumWordsInBase;

	char *w    = m_words->m_words   [wordNum];
	long  wlen = m_words->m_wordLens[wordNum];

	//
	// NOW hit wiktionary
	// Trust this less then our s_exceptions above, but more than
	// our morph computations below
	//

	char sourceId = SOURCE_WIKTIONARY;
	char *ss = NULL;
	long long bwid;
	char wikiLangId = m_docLangId;
	bool hadSpace ;
	long klen ;
	long baseNumAlnumWords;

 tryOtherLang:

	/*
	// if word only exists in one language, assume that language for word
	// even if m_docLangId is langUnknown (0)
	if ( ! ss &&
	     ! m_docLangId &&
	     ! wikiLangId ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		// each lang has its own bit
		long long bits = g_speller.getLangBits64 ( &bwid );
		// skip if not unique
		char count = getNumBitsOn64 ( bits ) ;
		// if we only got one lang we could be, assume that
		if ( count == 1 )
			// get it. bit #0 is english, so add 1
			wikiLangId = getBitPosLL((uint8_t *)&bits) + 1;
		// try setting based on script. greek. russian. etc.
		// if the word was not in the wiktionary.
		// this will be langUnknown if not definitive.
		else
			wikiLangId = getCharacterLanguage(w);
	}
	*/

	// try looking up bigram so "new jersey" gets "nj" as synonym
	// (wordNum+2 is used as the next word, skipping the token in between)
	if ( wikiLangId &&
	     wordNum+2< m_words->m_numWords &&
	     m_words->m_wordIds[wordNum+2]) {
		// get phrase id bigram then
		long conti = 0;
		bwid = hash64Lower_utf8_cont(w,wlen,0,&conti);
		// then the next word
		char *wp2 = m_words->m_words[wordNum+2];
		long wlen2 = m_words->m_wordLens[wordNum+2];
		bwid = hash64Lower_utf8_cont(wp2,wlen2,bwid,&conti);
		baseNumAlnumWords = 2;
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
	}

	// need a language for wiktionary to work with
	if ( wikiLangId && ! ss ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		baseNumAlnumWords = 1;
		//if ( bwid == 1424622907102375150LL)
		//	log("a");
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
		// if that failed try removing 's from word if there
		if ( ! ss &&
		     wlen >= 3 &&
		     w[wlen-2]=='\'' &&
		     w[wlen-1]=='s' ) {
			long long cwid = hash64Lower_utf8(w,wlen-2);
			ss = g_wiktionary.getSynSet( cwid, wikiLangId );
		}
	}

	// even though a document may be in german it often has some
	// english words "pdf download" "copyright" etc. so if the word
	// has no synset in german, try it in english
	// (the wikiLangId != langEnglish test makes this retry happen once)
	if ( //numPresets == 0 &&
	     ! ss &&
	     m_docLangId != langEnglish &&
	     wikiLangId  != langEnglish &&
	     m_docLangId &&
	     g_speller.getSynsInEnglish(w,wlen,m_docLangId,langEnglish) ) {
		// try english
		wikiLangId = langEnglish;
		sourceId   = SOURCE_WIKTIONARY_EN;
		goto tryOtherLang;
	}

	// if it was in wiktionary, just use that synset
	if ( ss ) {
		// prepare the dedup table; it is only initialized once we
		// know a second synset follows
		HashTableX dedup;
		HashTableX *dd = NULL;
		char dbuf[512];
		long count = 0;
	addSynSet:
		// do we have another set following this
		// NOTE(review): this passes m_docLangId even when the synset
		// was found via the English retry (wikiLangId) - confirm
		char *next = g_wiktionary.getNextSynSet(bwid,m_docLangId,ss);
		// if so, init the dedup table then
		if ( next && ! dd ) {
			dd = &dedup;
			dd->set ( 8,0,8,dbuf,512,false,m_niceness,"sddbuf");
		}
		// skip over the pipe i guess
		// (the synset text apparently starts with a 2-char language
		// code, optionally followed by a 3-char "_xx" suffix, then '|')
		char *pipe = ss + 2;
		// zh_ch?
		if ( *pipe == '_' ) pipe += 3;
		// sanity
		if ( *pipe != '|' ) { char *xx=NULL;*xx=0; }
		// point to word list
		char *p = pipe + 1;
		// hash up the list of words, they are in utf8 and
		// separated by ',' with the list ended by '\n'
		char *e = p + 1;
		// save count in case we need to undo
		//long saved = m_numAlts[wordNum];
	hashLoop:
		// skip synonyms that are anagrams because its to ambiguous
		// the are mappings like
		// "PC" -> "PC,Personal Computer"
		// "PC" -> "PC,Probable Cause" ... (lots more!)
		//bool isAnagram = true;
		// advance e to the end of the current synonym
		for ( ; *e !='\n' && *e != ',' ; e++ ) ;
		//	if ( ! is_upper_a(*e) ) isAnagram = false;

		// get it
		long long h = hash64Lower_utf8_nospaces ( p , e - p );

		// skip if same as base word
		if ( h == bwid ) goto getNextSyn;

		// should we check for dups?
		if ( dd ) {
			// skip dups
			if ( dd->isInTable(&h) ) goto getNextSyn;
			// dedup. on failure return what we stored so far
			if ( ! dd->addKey(&h) ) return m_aidsPtr - m_aids;
		}
		// store it
		*m_aidsPtr++ = h;

		// store source
		*m_srcPtr++ = sourceId;

		hadSpace = false;
		klen = e - p;
		for ( long k = 0 ; k < klen ; k++ )
			if ( is_wspace_a(p[k]) ) hadSpace = true;

		*m_termPtrsPtr++ = p;
		*m_termLensPtr++ = e-p;

		// only for multi-word synonyms like "New Jersey"...
		*m_wids0Ptr = 0LL;
		*m_wids1Ptr = 0LL;
		*m_numAlnumWordsPtr = 1;

		// and for multi alnum word synonyms
		if ( hadSpace ) {
			Words sw;
			sw.setx ( p , e - p , m_niceness );
			// NOTE(review): index 2 assumes the synonym tokenizes
			// into at least word, separator, word - confirm
			*(long long *)m_wids0Ptr = sw.m_wordIds[0];
			*(long long *)m_wids1Ptr = sw.m_wordIds[2];
			*(long  *)m_numAlnumWordsPtr = sw.getNumAlnumWords();
		}

		m_wids0Ptr++;
		m_wids1Ptr++;
		m_numAlnumWordsPtr++;

		// how many words did we have to hash to find a synset?
		// i.e. "new jersey" would be 2, to get "nj"
		*m_numAlnumWordsInBasePtr++ = baseNumAlnumWords;

		// do not breach
		if ( ++count >= maxSyns ) goto done;
	getNextSyn:
		// loop for more
		if ( *e == ',' ) { e++; p = e; goto hashLoop; }
		// add in the next syn set, deduped
		if ( next ) { ss = next; goto addSynSet; }
		// wrap it up
	done:
		// all done
		return m_aidsPtr - m_aids;
	}

	// no synset found: fall back to computed variants. each helper
	// returning false ends the function with what was stored so far.

	// strip marks from THIS word
	if ( ! addStripped ( w , wlen,&dt ) ) return m_aidsPtr - m_aids;

	// returns false with g_errno set
	if ( ! addAmpPhrase ( wordNum, &dt ) ) return m_aidsPtr - m_aids;

	// if we end in apostrophe, strip and add
	if ( wlen>= 3 &&
	     w[wlen-1] == 's' &&
	     w[wlen-2]=='\'' &&
	     ! addWithoutApostrophe ( wordNum, &dt ) )
		return m_aidsPtr - m_aids;

	return m_aidsPtr - m_aids;
}
// . sets m_qbuf1[] and m_qbuf2[] // . m_qbuf1[] is the advanced query // . m_qbuf2[] is the query to be used for spell checking // . returns false and set g_errno on error bool SearchInput::setQueryBuffers ( ) { m_sbuf1.reset(); m_sbuf2.reset(); m_sbuf3.reset(); short qcs = csUTF8; if (m_queryCharset && m_queryCharsetLen){ // we need to convert the query string to utf-8 qcs = get_iana_charset(m_queryCharset, m_queryCharsetLen); if (qcs == csUnknown) { //g_errno = EBADCHARSET; //g_msg = "(error: unknown query charset)"; //return false; qcs = csUTF8; } } // prepend sites terms long numSites = 0; char *csStr = NULL; numSites = 0; csStr = get_charset_str(qcs); if ( m_sites && m_sites[0] ) { char *s = m_sites; char *t; long len; m_sbuf1.pushChar('(');//*p++ = '('; loop: // skip white space while ( *s && ! is_alnum_a(*s) ) s++; // bail if done if ( ! *s ) goto done; // get length of it t = s; while ( *t && ! is_wspace_a(*t) ) t++; len = t - s; // add site: term //if ( p + 12 + len >= pend ) goto toobig; if ( numSites > 0 ) m_sbuf1.safeStrcpy ( " UOR " ); m_sbuf1.safeStrcpy ( "site:" ); //p += ucToUtf8(p, pend-p,s, len, csStr, 0,0); m_sbuf1.safeMemcpy ( s , len ); //memcpy ( p , s , len ); p += len; //*p++ = ' '; m_sbuf1.pushChar(' '); s = t; numSites++; goto loop; done: m_sbuf1.safePrintf(") | "); // inc totalLen m_sitesQueryLen = m_sitesLen + (numSites * 10); } // append site: term if ( m_siteLen > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //memcpy ( p , "+site:" , 6 ); p += 6; m_sbuf1.safePrintf("+site:"); //memcpy ( p , m_site , m_siteLen ); p += m_siteLen; m_sbuf1.safeMemcpy(m_site,m_siteLen); } // append gblang: term if( m_gblang > 0 ) { //if( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //p += sprintf( p, "+gblang:%li |", m_gblang ); m_sbuf1.safePrintf( "+gblang:%li |", m_gblang ); } // bookmark here so we can copy into st->m_displayQuery below //long displayQueryOffset = m_sbuf1.length(); // 
append url: term if ( m_urlLen > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //memcpy ( p , "+url:" , 5 ); p += 5; m_sbuf1.safeStrcpy ( "+url:"); //memcpy ( p , m_url , m_urlLen ); p += m_urlLen; m_sbuf1.safeMemcpy ( m_url , m_urlLen ); } // append url: term if ( m_linkLen > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //memcpy ( p , "+link:" , 6 ); p += 6; m_sbuf1.safeStrcpy ( "+link:"); //memcpy ( p , m_link , m_linkLen ); p += m_linkLen; m_sbuf1.safeMemcpy ( m_link , m_linkLen ); } // append the natural query if ( m_queryLen > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //p += ucToUtf8(p, pend-p, m_query, m_queryLen, csStr, 0,0); m_sbuf1.safeMemcpy ( m_query , m_queryLen ); //memcpy ( p , m_query , m_queryLen ); p += m_queryLen; // add to spell checked buf, too //if ( p2 > pstart2 ) *p2++ = ' '; if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); //p2 +=ucToUtf8(p2, pend2-p2, m_query, m_queryLen, csStr, 0,0); m_sbuf2.safeMemcpy ( m_query , m_queryLen ); //memcpy ( p2 , m_query , m_queryLen ); p2 += m_queryLen; } if ( m_query2Len > 0 ) { //if ( p3 > pstart3 ) *p3++ = ' '; if ( m_sbuf3.length() ) m_sbuf3.pushChar(' '); //p3+=ucToUtf8(p3, pend3-p3, m_query2, m_query2Len, csStr,0,0); m_sbuf3.safeMemcpy ( m_query2 , m_query2Len ); } //if (g_errno == EILSEQ){ // illegal character seq // log("query: bad char set"); // g_errno = 0; // if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;} // if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;} //} // append quoted phrases to query if ( m_quoteLen1 > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //*p++ = '+'; //*p++ = '\"'; m_sbuf1.safeStrcpy("+\""); //p += ucToUtf8(p, pend-p, m_quote1, m_quoteLen1, csStr, 0,0); m_sbuf1.safeMemcpy ( m_quote1 , m_quoteLen1 ); //memcpy ( p , m_quote1 , m_quoteLen1 ); p += m_quoteLen1 ; //*p++ = '\"'; m_sbuf1.safeStrcpy("\""); // add to 
spell checked buf, too //if ( p2 > pstart2 ) *p2++ = ' '; if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); //*p2++ = '+'; //*p2++ = '\"'; m_sbuf2.safeStrcpy("+\""); //p2+=ucToUtf8(p2, pend2-p2, m_quote1, m_quoteLen1, csStr,0,0); m_sbuf2.safeMemcpy ( m_quote1 , m_quoteLen1 ); //memcpy ( p2 , m_quote1 , m_quoteLen1 ); p2 += m_quoteLen1 ; //*p2++ = '\"'; m_sbuf2.safeStrcpy("\""); } //if (g_errno == EILSEQ){ // illegal character seq // g_errno = 0; // if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;} // if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;} //} if ( m_quoteLen2 > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //*p++ = '+'; //*p++ = '\"'; m_sbuf1.safeStrcpy("+\""); //p += ucToUtf8(p, pend-p, m_quote2, m_quoteLen2, csStr, 0,0); m_sbuf1.safeMemcpy ( m_quote2 , m_quoteLen2 ); //memcpy ( p , m_quote2 , m_quoteLen2 ); p += m_quoteLen2 ; //*p++ = '\"'; m_sbuf1.safeStrcpy("\""); // add to spell checked buf, too //if ( p2 > pstart2 ) *p2++ = ' '; if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); //*p2++ = '+'; //*p2++ = '\"'; m_sbuf2.safeStrcpy("+\""); //p2+=ucToUtf8(p2, pend2-p2, m_quote2, m_quoteLen2, csStr,0,0); m_sbuf2.safeMemcpy ( m_quote2 , m_quoteLen2 ); //memcpy ( p2 , m_quote2 , m_quoteLen2 ); p2 += m_quoteLen2 ; //*p2++ = '\"'; m_sbuf2.safeStrcpy("\""); } //if (g_errno == EILSEQ){ // illegal character seq // g_errno = 0; // if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;} // if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;} //} // append plus terms if ( m_plusLen > 0 ) { char *s = m_plus, *send = m_plus + m_plusLen; //if ( p > pstart && p < pend ) *p++ = ' '; //if ( p2 > pstart2 && p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); while (s < send) { while (isspace(*s) && s < send) s++; char *s2 = s+1; if (*s == '\"') { // if there's no closing quote just treat // the end of the line as such while (*s2 != '\"' && s2 < send) s2++; if (s2 < send) 
s2++; } else { while (!isspace(*s2) && s2 < send) s2++; } if (s < send) break; //if (p < pend) *p++ = '+'; //if (p2 < pend2) *p2++ = '+'; m_sbuf1.pushChar('+'); m_sbuf2.pushChar('+'); //p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0); //p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0); m_sbuf1.safeMemcpy ( s , s2 - s ); m_sbuf2.safeMemcpy ( s , s2 - s ); /* if (g_errno == EILSEQ) { // illegal character seq g_errno = 0; if (qcs == csUTF8) { qcs = csISOLatin1; goto doOver; } if (qcs != csISOLatin1) { qcs = csUTF8; goto doOver; } } */ s = s2 + 1; if (s < send) { //if (p < pend) *p++ = ' '; //if (p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); } } } // append minus terms if ( m_minusLen > 0 ) { char *s = m_minus, *send = m_minus + m_minusLen; //if ( p > pstart && p < pend ) *p++ = ' '; //if ( p2 > pstart2 && p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); while (s < send) { while (isspace(*s) && s < send) s++; char *s2 = s+1; if (*s == '\"') { // if there's no closing quote just treat // the end of the line as such while (*s2 != '\"' && s2 < send) s2++; if (s2 < send) s2++; } else { while (!isspace(*s2) && s2 < send) s2++; } if (s < send) break; //if (p < pend) *p++ = '-'; //if (p2 < pend2) *p2++ = '-'; m_sbuf1.pushChar('-'); m_sbuf2.pushChar('-'); //p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0); //p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0); m_sbuf1.safeMemcpy ( s , s2 - s ); m_sbuf2.safeMemcpy ( s , s2 - s ); /* if (g_errno == EILSEQ) { // illegal character seq g_errno = 0; if (qcs == csUTF8) { qcs = csISOLatin1; goto doOver; } if (qcs != csISOLatin1) { qcs = csUTF8; goto doOver; } } */ s = s2 + 1; if (s < send) { //if (p < pend) *p++ = ' '; //if (p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); } } } // append gbkeyword:numinlinks if they have 
&mininlinks=X, X>0 long minInlinks = m_hr->getLong("mininlinks",0); if ( minInlinks > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //char *str = "gbkeyword:numinlinks"; //long len = gbstrlen(str); //memcpy ( p , str , len ); //p += len; m_sbuf1.safePrintf ( "gbkeyword:numinlinks"); } // null terms m_sbuf1.pushChar('\0'); m_sbuf2.pushChar('\0'); m_sbuf3.pushChar('\0'); // the natural query m_displayQuery = m_sbuf2.getBufStart();// + displayQueryOffset; if ( ! m_displayQuery ) m_displayQuery = ""; while ( *m_displayQuery == ' ' ) m_displayQuery++; m_displayQueryLen = gbstrlen(m_displayQuery);//p-m_displayQuery //log("query: got query %s",m_sbuf1.getBufStart()); //log("query: got display query %s",m_displayQuery); // urlencoded display query urlEncode(m_qe, MAX_QUERY_LEN*2, m_displayQuery, m_displayQueryLen); return true; }
// . when the Conf::m_proxyIps parm is updated we call this to rebuild // s_iptab, our table of SpiderProxy instances, which has the proxies and // their performance statistics. // . we try to maintain stats of ip/ports that did NOT change when rebuilding. bool buildProxyTable ( ) { // scan the NEW list of proxy ip/port pairs in g_conf char *p = g_conf.m_proxyIps.getBufStart(); HashTableX tmptab; tmptab.set(8,0,16,NULL,0,false,"tmptab"); // scan the user inputted space-separated list of ip:ports // (optional username:password@ip:port) for ( ; *p ; ) { // skip white space if ( is_wspace_a(*p) ) { p++; continue; } // skip http:// if ( strncasecmp(p,"http://",7) == 0 ) { p += 7; continue; } // scan in an ip:port char *s = p; char *portStr = NULL; int32_t dc = 0, pc = 0, gc = 0, bc = 0; const char *msg; char *usernamePwd = NULL; int32_t usernamePwdLen = 0; char *ipStart = p; // scan all characters until we hit \0 or another whitespace for ( ; *s && !is_wspace_a(*s); s++) { if ( *s == '@' ) { // must be username:pwd if ( pc != 1 ) { msg = "bad username:password"; goto hadError; } usernamePwd = p; usernamePwdLen = s - p; if ( usernamePwdLen >= MAXUSERNAMEPWD-2 ) { msg = "username:password too long"; goto hadError; } dc = 0; gc = 0; bc = 0; pc = 0; portStr = NULL; ipStart = s+1; continue; } if ( *s == '.' 
) { dc++; continue; } if ( *s == ':' ) { portStr=s; pc++; continue; } if ( is_digit(*s) ) { gc++; continue; } bc++; continue; } // ensure it is a legit ip:port combo msg = NULL; if ( gc < 4 ) msg = "not enough digits for an ip"; if ( pc > 1 ) msg = "too many colons"; if ( dc != 3 ) msg = "need 3 dots for an ip address"; if ( bc ) msg = "got illegal char in ip:port listing"; if ( msg ) { hadError: char c = *s; *s = '\0'; log("buf: %s for %s",msg,p); *s = c; return false; } // convert it int32_t iplen = s - ipStart; if ( portStr ) iplen = portStr - ipStart; int32_t ip = atoip(ipStart,iplen); // another sanity check if ( ip == 0 || ip == -1 ) { log("spider: got bad proxy ip for %s",p); return false; } // and the port default is 80 int32_t port = 80; if ( portStr ) port = atol2(portStr+1,s-portStr-1); if ( port < 0 || port > 65535 ) { log("spider: got bad proxy port for %s",p); return false; } // . we got a legit ip:port // . see if already in our table uint64_t ipKey = (uint32_t)ip; ipKey <<= 16; ipKey |= (uint16_t)(port & 0xffff); // also store into tmptable to see what we need to remove tmptab.addKey(&ipKey); // see if in table int32_t islot = s_iptab.getSlot( &ipKey); // advance p p = s; // if in there, keep it as is if ( islot >= 0 ) continue; // otherwise add new entry SpiderProxy newThing; memset ( &newThing , 0 , sizeof(SpiderProxy)); newThing.m_ip = ip; newThing.m_port = port; newThing.m_lastDownloadTookMS = -1; newThing.m_lastSuccessfulTestMS = -1; gbmemcpy(newThing.m_usernamePwd,usernamePwd,usernamePwdLen); // ensure it is NULL terminated newThing.m_usernamePwd[usernamePwdLen] = '\0'; if ( ! s_iptab.addKey ( &ipKey, &newThing ) ) return false; } redo: int32_t removed = 0; // scan all SpiderProxies in tmptab for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) { // skip empty buckets in hashtable s_iptab if ( ! 
s_iptab.m_flags[i] ) continue; // get the key int64_t key = *(int64_t *)s_iptab.getKeyFromSlot(i); // must also exist in tmptab, otherwise it got removed by user if ( tmptab.isInTable ( &key ) ) continue; // skip if not in table if ( s_iptab.getSlot ( &key ) < 0 ) { log("sproxy: iptable hashing messed up"); continue; } // shoot, it got removed. not in the new list of ip:ports s_iptab.removeKey ( &key ); removed++; // hashtable is messed up now, start over //goto redo; } if ( removed ) goto redo; return true; }
void HttpMime::addCookie(const httpcookie_t &cookie, const Url ¤tUrl, SafeBuf *cookieJar) { // don't add expired cookie into cookie jar if (cookie.m_expired) { return; } if (cookie.m_domain) { cookieJar->safeMemcpy(cookie.m_domain, cookie.m_domainLen); cookieJar->pushChar('\t'); cookieJar->safeStrcpy(cookie.m_defaultDomain ? "FALSE\t" : "TRUE\t"); } else { cookieJar->safeMemcpy(currentUrl.getHost(), currentUrl.getHostLen()); cookieJar->pushChar('\t'); cookieJar->safeStrcpy("FALSE\t"); } if (cookie.m_path) { cookieJar->safeMemcpy(cookie.m_path, cookie.m_pathLen); cookieJar->pushChar('\t'); } else { if (currentUrl.getPathLen()) { cookieJar->safeMemcpy(currentUrl.getPath(), currentUrl.getPathLen()); } else { cookieJar->pushChar('/'); } cookieJar->pushChar('\t'); } if (cookie.m_secure) { cookieJar->safeStrcpy("TRUE\t"); } else { cookieJar->safeStrcpy("FALSE\t"); } // we're not using expiration field cookieJar->safeStrcpy("0\t"); int32_t currentLen = cookieJar->length(); cookieJar->safeMemcpy(cookie.m_cookie, cookie.m_cookieLen); // cater for multiline cookie const char *currentPos = cookieJar->getBufStart() + currentLen; const char *delPosStart = NULL; int32_t delLength = 0; while (currentPos < cookieJar->getBufPtr() - 1) { if (delPosStart) { if (is_wspace_a(*currentPos) || *currentPos == '\n' || *currentPos == '\r') { ++delLength; } else { break; } } else { if (*currentPos == '\n' || *currentPos == '\r') { delPosStart = currentPos; ++delLength; } } ++currentPos; } cookieJar->removeChunk1(delPosStart, delLength); /// @todo ALC handle httpOnly attribute cookieJar->pushChar('\n'); }