// . s[maxLen] should be the NULL // . returns full length of entity @ "s" if there is a valid one, 0 otherwise // . sets *c to the iso character the entity represents (if there is one) // JAB: const-ness for optimizer... int32_t getEntity_a ( char *s , int32_t maxLen , uint32_t *c ) { // ensure there's an & as first char if ( s[0] != '&' ) return 0; // compute maximum length of entity, if it's indeed an entity int32_t len = 1; if ( s[len]=='#' ) len++; // cut it off after 9 chars to save time while ( len < maxLen && len < 9 && is_alnum_a(s[len]) ) len++; // include the ending ; if any if ( len < maxLen && s[len]==';' ) len++; // char d = s[len]; // s[len]='\0'; // fprintf(stderr,"got entity %s \n",s); // s[len]=d; // we don't have entities longer than "¤" if ( len > 10 ) return 0; // all entites are 3 or more chars (>) if ( len < 3 ) return 0; // . if it's a numeric entity like { use this routine // . pass in the whole she-bang: "...;" or "´...; if ( s[1] == '#' ) { if ( s[2] == 'x' ) *c = getHexadecimalEntity (s, len ); else *c = getDecimalEntity (s, len ); } // otherwise, it's text else *c = getTextEntity ( s , len ); // return 0 if not an entity, length of entity if it is an entity if ( *c ) return len; else return 0; }
// . s[maxLen] should be the NULL // . returns full length of entity @ "s" if there is a valid one, 0 otherwise // . sets *c to the iso character the entity represents (if there is one) // JAB: const-ness for optimizer... int32_t getEntity_a ( const char *s , int32_t maxLen , uint32_t *c ) { // ensure there's an & as first char if ( s[0] != '&' ) { return 0; } // compute maximum length of entity, if it's indeed an entity int32_t len = 1; if ( s[len] == '#' ) { len++; } // cut it off after 9 chars to save time while ( len < maxLen && len < 9 && is_alnum_a( s[len] ) ) { len++; } // character entity reference must end with a semicolon. // some browsers have lenient parsing, but we don't accept invalid // references. if ( len == maxLen || s[len] != ';' ) { //not a valid character entity reference return 0; } len++; // we don't have entities longer than "¤" if ( len > 10 ) { return 0; } // all entites are 3 or more chars (>) if ( len < 3 ) { return 0; } // . if it's a numeric entity like { use this routine // . pass in the whole she-bang: "...;" or "´...; if ( s[1] == '#' ) { if ( s[2] == 'x' ) { *c = getHexadecimalEntity( s, len ); } else { *c = getDecimalEntity( s, len ); } } else { // otherwise, it's text *c = getTextEntity( s, len ); } // return 0 if not an entity, length of entity if it is an entity if ( *c ) { return len; } else { return 0; } }
// . get the # of words in this string int32_t getNumWords ( char *s , int32_t len, int32_t titleVersion ) { int32_t wordCount = 0; bool inWord = false; for ( int32_t i = 0 ; i < len ; i++ ) { if ( ! is_alnum_a ( s[i] ) && s[i]!='\'' ) { inWord = false; continue; } if ( ! inWord ) { inWord = true; wordCount++; } } return wordCount; }
unsigned char Words::isBounded(int wordi) { if(wordi+1 < m_numWords && getWord(wordi)[getWordLen(wordi)] == '/' //|| //getWord(wordi)[getWordLen(wordi)] == '?' ) return(true); if(wordi+1 < m_numWords && (getWord(wordi)[getWordLen(wordi)] == '.' || getWord(wordi)[getWordLen(wordi)] == '?') && is_alnum_a(getWord(wordi)[getWordLen(wordi)+1]) ) return(true); if(wordi > 0 && (getWord(wordi)[-1] == '/' || getWord(wordi)[-1] == '?')) return(true); return(false); }
// . look up the tag whose name starts at "s"
// . the name extends over ascii alnums, '-' and '_' (facebook uses
//   underscores, like <start_time>)
// . if "retp" is given, *retp is set to the matching NodeType, or to NULL
//   when there is no match
// . returns the NodeType's m_nodeId, or 0 when the name is unknown
nodeid_t getTagId ( char *s , NodeType **retp ) {
	// table built on first use: hash of tag name -> NodeType ptr
	static bool       s_init = false;
	static HashTableX s_ht;
	static char       s_buf[10000];
	if ( ! s_init ) {
		// set the flag first: the self-test at the bottom of this
		// block calls back into this function
		s_init = true;
		// NOTE(review): the first two arguments are 4 and 4 while the
		// key handed to addKey()/getValue() is an int64_t and the
		// value is a NodeType pointer -- confirm HashTableX::set()'s
		// key/data size semantics, in particular on 64-bit builds
		s_ht.set ( 4 ,4,1024,s_buf,10000,false,0,"tagids");//niceness=0
		// number of entries in g_nodes
		static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
		// index every known tag name by its upper-cased hash
		for ( int32_t k = 0 ; k < nn ; k++ ) {
			NodeType *nt   = &g_nodes[k];
			char     *name = nt->m_nodeName;
			int64_t   key  = hash64Upper_a ( name,gbstrlen(name),0LL );
			if ( s_ht.addKey(&key,&nt) ) continue;
			// could not add it: crash on purpose
			char *xx=NULL;*xx=0;
		}
		// sanity: table must still have the slot count we asked for
		if ( s_ht.m_numSlots != 1024 ) { char *xx=NULL;*xx=0; }
		// sanity: a well-known tag must resolve correctly
		if ( getTagId ( "br" ) != TAG_BR ) { char *xx=NULL;*xx=0; }
	}
	// find the end of the tag name
	char *e = s;
	while ( *e && ( is_alnum_a(*e) || *e=='-' || *e=='_' ) )
		e++;
	// hash the name and probe the table
	int64_t h = hash64Upper_a ( s , e - s , 0 );
	NodeType **ntp = (NodeType **)s_ht.getValue(&h);
	// unknown tag
	if ( ! ntp ) {
		if ( retp ) *retp = NULL;
		return 0;
	}
	// known tag
	if ( retp ) *retp = *ntp;
	return (*ntp)->m_nodeId;
}
// . look up "field" in the cookie held at m_cookiePtr/m_cookieLen and
//   return a pointer to its value inside that buffer
// . field       : cookie name to search for; compared case-sensitively
// . len         : if non-NULL, set to 0 up front and to the value's length
//                 each time a matching field is found
// . defaultStr  : returned when there is no cookie or no match
// . next        : not referenced anywhere in this body
// . SIDE EFFECT : writes a '\0' into the cookie buffer right after the
//                 value of every matching field it visits
// . a match whose value pointer is greater than m_metaCookie is returned
//   at once; any other match is remembered and the scan continues, so the
//   last such match is what gets returned if nothing better turns up
// NOTE(review): the original header said "*next is set to ptr into
// m_cgiBuf so that the next successive call ... will start at *next";
// this implementation never touches "next" -- confirm callers do not
// rely on it.
char *HttpRequest::getStringFromCookie ( char *field ,
					  long *len ,
					  char *defaultStr ,
					  long *next ) {
	// get field len
	long flen = gbstrlen(field);
	// assume none
	if ( len ) *len = 0;
	// if no cookie, forget it
	if ( ! m_cookiePtr ) return defaultStr;
	// the end of the cookie
	//char *pend = m_cookieBuf + m_cookieBufLen;
	char *pend = m_cookiePtr + m_cookieLen;
	char *p    = m_cookiePtr;
	// advance to the first ascii alnum char (skips spaces and punct)
	for ( ; p && p < pend ; p++ )
		if ( is_alnum_a(*p) ) break;
	// skip a leading "Cookie:" (any case)
	if ( p + 7 < pend && ! strncasecmp(p,"cookie:",7) ) p += 7;
	// again advance to the first ascii alnum char
	for ( ; p && p < pend ; p++ )
		if ( is_alnum_a(*p) ) break;
	// nothing left to look at?
	if ( p >= pend ) return defaultStr;
	// best match found outside the "return immediately" case below
	char *savedVal = NULL;
	// so we do not skip the first cookie, jump right in!
	// otherwise we lose the calendar cookie for msie.
	// this enters the loop body at the comparison step, bypassing the
	// "find the next \0 separator" logic for the very first field.
	goto entryPoint;
	// . loop over the remaining fields in the cookie
	// . a new field is only recognized right after a '\0' byte
	// NOTE(review): older comments here disagree on whether separators
	// in the buffer are '\0', '&' or ';' -- the code below only ever
	// treats '\0' as a separator.
	//for ( char *p = m_cookieBuf ; *p ; p += gbstrlen(p) + 1 ) {
	for ( ; p < pend ; p++ ) {
		// need a \0
		// fixes "display=0&map=0&calendar=0;" that is only one cookie.
		// so do not grap value of map or calendar from that!!
		if ( *p ) continue;
		// back to back \0's? be careful how we skip over them!
		// NOTE(review): p[1] is read without comparing p+1 to pend
		if ( ! p[1] ) continue;
		// step past the \0
		if ( ++p >= pend ) break;
		// skip whitespace that follows
		for ( ; p < pend ; p++ )
			if ( ! is_wspace_a(*p) ) break;
		// end of cookie?
		if ( p >= pend ) break;
	entryPoint:
		// cheap reject on the first char
		if ( *p != *field ) continue;
		// does it match? continue if not a match
		if ( strncmp ( p , field , flen ) ) continue;
		// point to value
		char *val = p + flen;
		// the name must be followed directly by an equal sign
		if ( *val != '=' ) continue;
		// skip that sign
		val++;
		// . the value runs up to the next '\0' or the end of the cookie
		// . encoded ;'s and &'s may appear inside a value, so they are
		//   deliberately not treated as terminators here
		char *e = val;
		while ( e < pend && *e ) e++;
		// that is the length
		if ( len ) *len = e - val;
		// NULL terminate it in place.
		// NOTE(review): when no '\0' was found e == pend, so this
		// writes one byte past m_cookiePtr + m_cookieLen -- confirm
		// the buffer has room for it.
		*e = '\0';
		// if we were in the meta cookie, return that...
		// otherwise if you visited this site before metacookies
		// were used you might have the cookie outside the meta
		// cookie AND inside the metacookie, and only the value
		// inside the metacookie is legit...
		if ( val > m_metaCookie ) return val;
		// otherwise, save it and try to get from meta cookie
		savedVal = val;
		// length
		//if ( len ) *len = gbstrlen(val);
		// this is the value!
		//return val;
	}
	// did we save something?
	if ( savedVal ) return savedVal;
	// no match
	return defaultStr;
}
// Return the value of the specified "field" within this node. // the case of "field" does not matter. char *XmlNode::getFieldValue ( char *field , int32_t *valueLen ) { // reset this to 0 *valueLen = 0; // scan for the field name in our node int32_t flen = gbstrlen(field); char inQuotes = '\0'; int32_t i; // scan the characters in the node, looking for the field name in ascii for ( i = 1; i + flen < m_nodeLen ; i++ ) { // skip the field if it's quoted if ( inQuotes) { if (m_node[i] == inQuotes ) inQuotes = 0; continue; } // set inQuotes to the quote if we're in quotes if ( (m_node[i]=='\"' || m_node[i]=='\'')){ inQuotes = m_node[i]; continue; } // a field name must be preceeded by non-alnum if ( is_alnum_a ( m_node[i-1] ) ) continue; // the first character of this field shout match field[0] if ( to_lower_a (m_node[i]) != to_lower_a(field[0] )) continue; // field just be immediately followed by an = or space if (m_node[i+flen]!='='&&!is_wspace_a(m_node[i+flen]))continue; // field names must match if ( strncasecmp ( &m_node[i], field, flen ) != 0 ) continue; // break cuz we got a match for our field name break; } // return NULL if no matching field if ( i + flen >= m_nodeLen ) return NULL; // advance i over the fieldname so it pts to = or space i += flen; // advance i over spaces while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++; // advance over the equal sign, return NULL if does not exist if ( i < m_nodeLen && m_node[i++] != '=' ) return NULL; // advance i over spaces after the equal sign while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++; // now parse out the value of this field (could be in quotes) inQuotes = '\0'; // set inQuotes to the quote if we're in quotes if ( m_node[i]=='\"' || m_node[i]=='\'') inQuotes = m_node[i++]; // mark this as the start of the value int start=i; // advance i until we hit a space, or we hit a that quote if inQuotes if (inQuotes) { while (i<m_nodeLen && m_node[i] != inQuotes ) i++; } else { while ( i<m_nodeLen && 
!is_wspace_a(m_node[i])&& m_node[i]!='>') i++; } // set the length of the value *valueLen = i - start; // return a ptr to the value return m_node + start; }
// . called by Xml class // . returns the length of the node // . TODO: "node" is now guaranteed to be \0 terminated -- make this faster int32_t XmlNode::set ( char *node , bool pureXml , int32_t version ) { // save head of node m_node = node; // sanity check static bool s_check = false; if ( ! s_check ) { s_check = true; // how many NodeTypes do we have in g_nodes? static int32_t nn = sizeof(g_nodes) / sizeof(NodeType); // set the hash table for ( int32_t i = 0 ; i < nn ; i++ ) { // sanity if ( g_nodes[i].m_nodeId != i ) { char *xx=NULL;*xx=0;} } } // . reset this // . need to do here instead of in Links.cpp because sometimes // we think an anchor tag indicates a link, but it is really // just an <a href="javascript:..."> function call and Links.cpp // ignored it but we are expecting this to be valid! m_isSelfLink = 0; // reset //m_linkNum = -1; // CDATA tag was identified in earlier versions as a text node. Now // it is identified as a CDATA tag node. But gb.conf and others always // pass their version as 0 if ( node[0] == '<' && node[1] == '!' && node[2] == '[' && node[3] == 'C' && node[4] == 'D' && node[5] == 'A' && node[6] == 'T' && node[7] == 'A' && node[8] == '[' ) return setCDATANode ( node ); // if "node" isn't the start of a tag then set it as a Text Node if ( *node != '<' || ! isTagStart ( node ) ) {//, 0, version ) ) { // . set this node as a text node! // . nodeId for text nodes is 0 m_nodeId = 0; m_node = node; m_hasBackTag = false; m_hash = 0; int32_t i = 0; //char inCDATA = 0; // inc i as int32_t as it's NOT the beginning of a tag while ( node[i] && (node[i] != '<' || ! isTagStart ( node+i)))//,versin))) i++; m_nodeLen = i; m_pairTagNum = -1; return m_nodeLen; } // . see if it's a comment (node end is "-->" for comments) // . comments are special cases if ( node[1] == '!' ) { if ( node[2]=='-' && node[3]=='-' ) return setCommentNode ( node ); // this means comment too: // <![if ....]> if ( node[2]=='[' ) return setCommentNode2 ( node ); } // . 
otherwise it's a regular tag // . might be <!DOCTYPE ...> or something though m_nodeLen = getTagLen ( node );//, version ); // . get the node's name's length (i-1) // . node name ends at non alnum char // . we can have hyphens in node name (TODO: even at beginning???) int32_t tagNameStart = 1; // . skip over backslash in the back tags // . or skip over / or ? or ! now // . tag names must start with a letter, fwiw if ( ! is_alnum_a(node[tagNameStart]) /* == '/'*/ ) tagNameStart++; int32_t i = tagNameStart; // skip i to end of tagName. this should only allow ascii chars // to be "tag name chars" for ( ; i < m_nodeLen && is_tagname_char(node[i]) ; i++ ); // set the tagName and tagNameLen m_tagName = &node [ tagNameStart ]; m_tagNameLen = i - tagNameStart; // break point //if ( m_tagNameLen == 3 && m_tagName[0]=='!' && // m_tagName[1]=='-' && m_tagName[2]=='-' ) // fprintf(stderr,"man!"); // . set the node's hash -- used cuz it's faster than strcmp // . just hash the letters as upper case // . tag names are never utf8, so use the ascii ha m_hash = hash64Upper_a ( m_tagName , m_tagNameLen , 0LL); // if we're pure xml, don't allow any html tags accept <!-- --> if ( pureXml ) { m_hasBackTag = true; m_isBreaking = true; m_isVisible = true; //m_nodeId = TAG_XMLTAG;//1; // this returns 1 if tag is not in the list m_nodeId = setNodeInfo ( m_hash );//&m_hasBackTag , } // . determine if the nodeId for this node // . determine if it breaks lines (for phrasing purposes) else m_nodeId = setNodeInfo ( m_hash );//&m_hasBackTag , //&m_isBreaking , &m_isVisible ); // . no back tag if / follow name // . this was only for "pureXml" but now i do it for all tags! if ( m_node [ m_nodeLen - 2 ] == '/' ) m_hasBackTag = false; if ( m_node [ m_nodeLen - 2 ] == '?' ) m_hasBackTag = false; return m_nodeLen; }
// . split the text at "s" into consecutive "words" and append them to
//   m_words[] / m_wordLens[] / m_wordIds[] (and m_tagIds[] if present)
// . three kinds of word are produced:
//     - tag words   : a whole tag, when m_hasTags is set
//     - punct words : a run of non-word chars (m_wordIds entry is 0)
//     - alnum words : a run of word chars (counted in m_numAlnumWords)
// . s              : text to tokenize; scanning also stops at a '\0'
// . nodeLen        : byte length of the text; only checked at the top of
//                    each word (label "uptop"), the inner scans run until
//                    they hit a '\0' or a char of the other class
// . computeWordIds : if true, m_wordIds[] of each alnum word gets
//                    hash64Lower_utf8() of the word
// . niceness       : handed to QUICKPOLL() inside the scan loops
// . stops early once m_numWords reaches m_preCount
// . always returns true
// . control flow is label based: "uptop" starts the next word, "again"
//   resumes an alnum word after an embedded apostrophe, "done" finishes
bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {
	// i is the scan offset into s, j the start offset of an alnum word
	long i = 0;
	long j;
	long wlen;
	// only reported at the end; nothing in the live code increments it
	long badCount = 0;
	// true once the current alnum word has absorbed one apostrophe
	bool hadApostrophe = false;
	// script of the most recently seen word char
	UCScript oldScript = ucScriptCommon;
	UCScript saved;
	UCProps props;
 uptop:
	// bad utf8 can cause a breach
	if ( i >= nodeLen ) goto done;
	if ( ! s[i] ) goto done;
	// not on a word char: emit a tag word or a punct word
	if ( ! is_alnum_utf8(s+i) ) {
		// out of preallocated slots
		if ( m_numWords >= m_preCount ) goto done;
		// tag?
		if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
			// get the tag id
			// NOTE(review): m_tagIds is written here without the
			// "if (m_tagIds)" guard used further down; presumably
			// m_hasTags implies it is allocated -- confirm
			if ( s[i+1]=='/' ) {
				// skip over "</" and flag it as a back tag
				m_tagIds [m_numWords] = ::getTagId(s+i+2);
				m_tagIds [m_numWords] |= BACKBIT;
			}
			else
				m_tagIds [m_numWords] = ::getTagId(s+i+1);
			// word start
			m_words    [m_numWords] = s + i;
			m_wordIds  [m_numWords] = 0LL;
			// the word spans the whole tag
			long tagLen = getTagLen(s+i);
			m_wordLens [m_numWords] = tagLen;
			m_numWords++;
			// advance
			i += tagLen;
			goto uptop;
		}
		// it is a punct word, find end of it
		char *start = s+i;
		for ( ; s[i] ; i += getUtf8CharSize(s+i)){
			// stop on < if we got tags
			if ( s[i] == '<' && m_hasTags ) break;
			// breathe
			QUICKPOLL(niceness);
			// if we are simple ascii, skip quickly
			if ( is_ascii(s[i]) ) {
				// accumulate NON-alnum chars
				if ( ! is_alnum_a(s[i]) ) continue;
				// an ascii alnum starts the next word
				oldScript = ucScriptCommon;
				// stop, we got alnum
				break;
			}
			// non-ascii: decode the code point
			UChar32 c = utf8Decode ( s+i );
			// keep accumulating while it is NOT a word char
			if ( ! ucIsWordChar ( c ) ) continue;
			// a word char: remember its script first
			oldScript = ucGetScript ( c );
			// then stop
			break;
		}
		m_words    [ m_numWords ] = start;
		m_wordLens [ m_numWords ] = s+i - start;
		m_wordIds  [ m_numWords ] = 0LL;
		if (m_tagIds) m_tagIds[m_numWords] = 0;
		m_numWords++;
		goto uptop;
	}
	// get an alnum word
	j = i;
 again:
	for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
		// breathe
		QUICKPOLL(niceness);
		// simple ascii?
		if ( is_ascii(s[i]) ) {
			// accumulate alnum chars
			if ( is_alnum_a(s[i]) ) continue;
			// update
			oldScript = ucScriptCommon;
			// otherwise, stop we got punct
			break;
		}
		// get the code point of the utf8 char
		UChar32 c = utf8Decode ( s+i );
		// get props
		props = ucProperties ( c );
		// ignorable/extending chars stay inside the word
		if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
		// stop unless UC_WORDCHAR is set
		if ( ! ( props & UC_WORDCHAR ) ) {
			// reset script between words
			oldScript = ucScriptCommon;
			break;
		}
		// save it
		saved = oldScript;
		// update here
		oldScript = ucGetScript(c);
		// treat ucScriptLatin as common so we can have latin1
		// like char without breaking the word!
		if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
		// chars with any of these props end the word right after
		// themselves
		if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
			// include it
			i += getUtf8CharSize(s+i);
			// but stop
			break;
		}
		// a change of script also ends the word
		if ( saved != oldScript ) break;
	}
	// . java++, A++, C++ exception
	// . A+, C+, exception
	// . a trailing "++" or "+" joins the word if no word char follows
	// . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE
	if ( s[i]=='+' ) {
		if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
		else if ( !is_alnum_utf8(&s[i+1]) ) i++;
	}
	// . c#, j#, ...
	if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;
	// allow for words like we're dave's and i'm: at most one apostrophe
	// per word, and only if a word char follows it
	if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){
		i++;
		hadApostrophe = true;
		goto again;
	}
	hadApostrophe = false;
	// get word length
	wlen = i - j;
	// out of preallocated slots
	if ( m_numWords >= m_preCount ) goto done;
	m_words   [ m_numWords ] = &s[j];
	m_wordLens[ m_numWords ] = wlen;
	// . Lars says it's better to leave the accented chars intact
	// . google agrees
	// . but what about "re'sume"?
	if ( computeWordIds ) {
		long long h = hash64Lower_utf8(&s[j],wlen);
		m_wordIds [m_numWords] = h;
		// until we get an accent removal algo, no accent-stripped
		// ids are computed here; possibly use the query synonym
		// pipeline to search without accents. MDW
	}
	if (m_tagIds) m_tagIds[m_numWords] = 0;
	m_numWords++;
	m_numAlnumWords++;
	// back to the top for the punct word (or end) that follows
	goto uptop;
 done:
	// bad programming warning: more words than were preallocated.
	// logs, then crashes on purpose.
	if ( m_numWords > m_preCount ) {
		log(LOG_LOGIC,
		    "build: words: set: Fix counting routine.");
		char *xx = NULL; *xx = 0;
	}
	// total length runs from s to the end of the last word
	if ( m_numWords <= 0 ) m_totalLen = 0;
	else m_totalLen = m_words[m_numWords-1] - s
		     + m_wordLens[m_numWords-1];
	if ( badCount ) log("words: had %li bad utf8 chars",badCount);
	return true;
}
// . return the value of the specified "field" within this html tag, "s" // . the case of "field" does not matter char *getFieldValue ( char *s , long slen , char *field , long *valueLen ) { // reset this to 0 *valueLen = 0; // scan for the field name in our node long flen = gbstrlen(field); char inQuotes = '\0'; long i; // make it sane if ( slen > 2000 ) slen = 2000; for ( i = 1; i + flen < slen ; i++ ) { // skip the field if it's quoted if ( inQuotes) { if (s[i] == inQuotes ) inQuotes = 0; continue; } // set inQuotes to the quote if we're in quotes if ( (s[i]=='\"' || s[i]=='\'')){ inQuotes = s[i]; continue; } // if not in quote tag might end if ( s[i] == '>' && ! inQuotes ) return NULL; // a field name must be preceeded by non-alnum if ( is_alnum_a ( s[i-1] ) ) continue; // the first character of this field shout match field[0] if ( to_lower_a (s[i]) != to_lower_a(field[0] )) continue; // field just be immediately followed by an = or space if (s[i+flen]!='='&&!is_wspace_a(s[i+flen]))continue; // field names must match if ( strncasecmp ( &s[i], field, flen ) != 0 ) continue; // break cuz we got a match for our field name break; } // return NULL if no matching field if ( i + flen >= slen ) return NULL; // advance i over the fieldname so it pts to = or space i += flen; // advance i over spaces while ( i < slen && is_wspace_a ( s[i] ) ) i++; // advance over the equal sign, return NULL if does not exist if ( i < slen && s[i++] != '=' ) return NULL; // advance i over spaces after the equal sign while ( i < slen && is_wspace_a ( s[i] ) ) i++; // now parse out the value of this field (could be in quotes) inQuotes = '\0'; // set inQuotes to the quote if we're in quotes if ( s[i]=='\"' || s[i]=='\'') inQuotes = s[i++]; // mark this as the start of the value int start=i; // advance i until we hit a space, or we hit a that quote if inQuotes if (inQuotes) while (i<slen && s[i] != inQuotes ) i++; else while ( i<slen &&!is_wspace_a(s[i])&&s[i]!='>')i++; // set the length of 
the value *valueLen = i - start; // return a ptr to the value return s + start; }
// . MDW: TODO: bring this back when we have a subdir for each collection // . add a new rec // . returns false and sets g_errno on error // . use a collnum_t of -1 if it is new bool Collectiondb::addRec ( char *coll , char *cpc , long cpclen , bool isNew , collnum_t collnum , bool isDump , bool saveIt ) { // sanity check if ( ( isNew && collnum >= 0) || (!isNew && collnum < 0) ) { log(LOG_LOGIC,"admin: Bad parms passed to addRec."); char *xx = NULL; *xx = 0; } // ensure coll name is legit char *p = coll; for ( ; *p ; p++ ) { if ( is_alnum_a(*p) ) continue; if ( *p == '-' ) continue; break; } if ( *p ) { g_errno = EBADENGINEER; log("admin: \"%s\" is a malformed collection name because it " "contains the '%c' character.",coll,*p); return false; } // . scan for holes // . i is also known as the collection id long i ; if ( collnum >= 0 ) i = (long)collnum; else for ( i = 0 ; i < m_numRecs ; i++ ) if ( ! m_recs[i] ) break; // ceiling? if ( i >= MAX_COLLS ) { g_errno = ENOBUFS; return log("admin: Limit of %li collection reached. " "Collection not created.",(long)MAX_COLLS); } // if empty... bail, no longer accepted, use "main" if ( ! coll || !coll[0] ) { g_errno = EBADENGINEER; return log("admin: Trying to create a new collection " "but no collection name provided. 
Use the \"c\" " "cgi parameter to specify it."); } // or if too big if ( gbstrlen(coll) > MAX_COLL_LEN ) { g_errno = ENOBUFS; return log("admin: Trying to create a new collection " "whose name \"%s\" of %i chars is longer than the " "max of %li chars.",coll,gbstrlen(coll), (long)MAX_COLL_LEN); } // ensure does not already exist in memory if ( getCollnum ( coll ) >= 0 ) { g_errno = EEXIST; return log("admin: Trying to create collection \"%s\" but " "already exists in memory.",coll); } // MDW: ensure not created on disk since time of last load char dname[512]; sprintf(dname, "%scoll.%s.%li/",g_hostdb.m_dir,coll,i); if ( isNew && opendir ( dname ) ) { g_errno = EEXIST; return log("admin: Trying to create collection %s but " "directory %s already exists on disk.",coll,dname); } //char fname[512]; // ending '/' is ALWAYS included in g_hostdb.m_dir //sprintf ( fname , "%s%li.%s.conf",g_hostdb.m_dir,i,coll); //File f; //f.set ( fname ); //if ( f.doesExist() ) { // g_errno = EEXIST; // return log("admin: Trying to create collection \"%s\" but " // "file %s already exists on disk.",coll,fname); //} // create the record in memory m_recs[i] = new (CollectionRec); if ( ! m_recs[i] ) return log("admin: Failed to allocated %li bytes for new " "collection record for \"%s\".", (long)sizeof(CollectionRec),coll); mnew ( m_recs[i] , sizeof(CollectionRec) , "CollectionRec" ); // get copy collection CollectionRec *cpcrec = NULL; if ( cpc && cpc[0] ) cpcrec = getRec ( cpc , cpclen ); if ( cpc && cpc[0] && ! cpcrec ) log("admin: Collection \"%s\" to copy config from does not " "exist.",cpc); // get the default.conf from working dir if there g_parms.setToDefault( (char *)m_recs[i] ); if ( isNew ) { // the default conf file char tmp1[1024]; sprintf ( tmp1 , "%sdefault.conf" , g_hostdb.m_dir ); // . set our parms from the file. // . 
accepts OBJ_COLLECTIONREC or OBJ_CONF g_parms.setFromFile ( m_recs[i] , NULL , tmp1 ); } // this will override all if ( cpcrec ) { // copy it, but not the timedb hashtable, etc. long size = (char *)&(cpcrec->m_END_COPY) - (char *)cpcrec; // JAB: bad memcpy - no donut! // this is not how objects are supposed to be copied!!! memcpy ( m_recs[i] , cpcrec , size);//sizeof(CollectionRec) ); // perform the cleanup that a copy constructor might do... //for (int rx = 0; rx < MAX_FILTERS; rx++) // m_recs[i]->m_pRegExParser[rx] = NULL; // don't NUKE the filters! // m_recs[i]->m_numRegExs = 0; // OK - done with cleaning up... // but never copy over the collection hostname, that is // problematic m_recs[i]->m_collectionHostname [0] = '\0'; m_recs[i]->m_collectionHostname1[0] = '\0'; m_recs[i]->m_collectionHostname2[0] = '\0'; } // set coll id and coll name for coll id #i strcpy ( m_recs[i]->m_coll , coll ); m_recs[i]->m_collLen = gbstrlen ( coll ); m_recs[i]->m_collnum = i; // point to this, so Rdb and RdbBase can reference it coll = m_recs[i]->m_coll; // . if has no password or ip add the default password, footbar // . no, just don't have any password, just use the 127.0.0.1 ip // that is the loopback /* if ( m_recs[i]->m_numAdminIps == 0 && m_recs[i]->m_numAdminPwds == 0 ) { m_recs[i]->m_numAdminIps = 1; m_recs[i]->m_adminIps[0] = atoip("0.0.0.0",7); //strcpy ( m_recs[i]->m_adminPwds[0] , "footbar23" ); //m_recs[i]->m_numAdminPwds = 1; //log("admin: Using default password for new collection of " // "'footbar23'."); } */ // collection name HACK for backwards compatibility //if ( strcmp ( coll , "main" ) == 0 ) { // m_recs[i]->m_coll[0] = '\0'; // m_recs[i]->m_collLen = 0; // //coll[0] = '\0'; //} // MDW: create the new directory if ( isNew ) { retry22: if ( ::mkdir ( dname , S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP | S_IROTH | S_IXOTH ) ) { // valgrind? 
if ( errno == EINTR ) goto retry22; g_errno = errno; mdelete ( m_recs[i] , sizeof(CollectionRec) , "CollectionRec" ); delete ( m_recs[i]); m_recs[i] = NULL; return log("admin: Creating directory %s had error: " "%s.", dname,mstrerror(g_errno)); } // save it into this dir... might fail! if ( ! m_recs[i]->save() ) { mdelete ( m_recs[i] , sizeof(CollectionRec) , "CollectionRec" ); delete ( m_recs[i]); m_recs[i] = NULL; return log("admin: Failed to save file %s: %s", dname,mstrerror(g_errno)); } } // load if not new if ( ! isNew && ! m_recs[i]->load ( coll , i ) ) { mdelete ( m_recs[i], sizeof(CollectionRec), "CollectionRec" ); delete ( m_recs[i]); m_recs[i] = NULL; return log("admin: Failed to load conf for collection " "\"%s\".",coll); } // mark it as needing to be saved instead m_recs[i]->m_needsSave = false; // force this to off for now //m_recs[i]->m_queryExpansion = false; // reserve it if ( i >= m_numRecs ) m_numRecs = i + 1; // count it m_numRecsUsed++; // update the time updateTime(); // if we are doing a dump from the command line, skip this stuff if ( isDump ) return true; bool verify = true; if(isNew) verify = false; // tell rdbs to add one, too //if ( ! g_indexdb.addColl ( coll, verify ) ) goto hadError; if ( ! g_posdb.addColl ( coll, verify ) ) goto hadError; //if ( ! g_datedb.addColl ( coll, verify ) ) goto hadError; if ( ! g_titledb.addColl ( coll, verify ) ) goto hadError; //if ( ! g_revdb.addColl ( coll, verify ) ) goto hadError; //if ( ! g_sectiondb.addColl ( coll, verify ) ) goto hadError; if ( ! g_tagdb.addColl ( coll, verify ) ) goto hadError; //if ( ! g_catdb.addColl ( coll, verify ) ) goto hadError; //if ( ! g_checksumdb.addColl ( coll, verify ) ) goto hadError; if ( ! g_spiderdb.addColl ( coll, verify ) ) goto hadError; if ( ! g_doledb.addColl ( coll, verify ) ) goto hadError; //if ( ! g_tfndb.addColl ( coll, verify ) ) goto hadError; if ( ! g_clusterdb.addColl ( coll, verify ) ) goto hadError; if ( ! 
g_linkdb.addColl ( coll, verify ) ) goto hadError; // debug message log ( LOG_INFO, "admin: added collection \"%s\" (%li).",coll,(long)i); // tell SpiderCache about this collection, it will create a // SpiderCollection class for it. //g_spiderCache.reset1(); // . make it set is CollectionRec::m_sortByDateTable now // . everyone else uses setTimeOfDayInMilliseconds() in fctypes.cpp // to call this function once their clock is synced with host #0 //if ( g_hostdb.m_initialized && g_hostdb.m_hostId == 0 ) // initSortByDateTable(coll); //else if ( g_hostdb.m_initialized && isClockInSync() ) // initSortByDateTable(coll); // . do it for all regard-less // . once clock is in sync with host #0 we may do it again! //if ( g_hostdb.m_initialized ) // initSortByDateTable(coll); // success return true; hadError: log("admin: Had error adding new collection: %s.",mstrerror(g_errno)); // do not delete it, might have failed to add because not enough // memory to read in the tree *-saved.dat file on disk!! and if // you delete in then core the *-saved.dat file gets overwritten!!! return false; /* g_indexdb.getRdb()->delColl ( coll ); g_datedb.getRdb()->delColl ( coll ); g_timedb.getRdb()->delColl ( coll ); g_titledb.getRdb()->delColl ( coll ); g_revdb.getRdb()->delColl ( coll ); g_sectiondb.getRdb()->delColl ( coll ); g_placedb.getRdb()->delColl ( coll ); g_tagdb.getRdb()->delColl ( coll ); //g_catdb.getRdb()->delColl ( coll ); //g_checksumdb.getRdb()->delColl ( coll ); g_spiderdb.getRdb()->delColl ( coll ); g_doledb.getRdb()->delColl ( coll ); g_tfndb.getRdb()->delColl ( coll ); g_clusterdb.getRdb()->delColl ( coll ); g_linkdb.getRdb()->delColl ( coll ); deleteRec ( coll ); return false; */ }
// . s[maxLen] should be the NULL // . returns full length of entity @ "s" if there is a valid one, 0 otherwise // . sets *c to the iso character the entity represents (if there is one) // JAB: const-ness for optimizer... int32_t getEntity_a ( const char *s, int32_t maxLen, uint32_t codepoint[2], int32_t *codepointCount, int32_t *utf8Len ) { //TODO: handle multi-codepoint entitites *utf8Len=0; // ensure there's an & as first char if ( s[0] != '&' ) { return 0; } // compute maximum length of entity, if it's indeed an entity int32_t len = 1; if ( s[len] == '#' ) { len++; } // cut it off after <32> chars to save time and also to avoid parsing // obscenely long incorrect entitites (eg an ampersand followed by 2MB of letters) while ( len < maxLen && len < max_entity_name_len && is_alnum_a( s[len] ) ) { len++; } // character entity reference must end with a semicolon. // some browsers have lenient parsing, but we don't accept invalid // references. if ( len == maxLen || s[len] != ';' ) { //not a valid character entity reference return 0; } len++; // we don't have entities longer than what w3c specified if ( len > max_entity_name_len+1 ) { return 0; } // all entites are 3 or more chars (>) if ( len < 3 ) { return 0; } // . if it's a numeric entity like { use this routine // . pass in the whole she-bang: "...;" or "´...; if ( s[1] == '#' ) { if ( s[2] == 'x' ) { codepoint[0] = getHexadecimalEntity( s, len ); *codepointCount = 1; } else { codepoint[0] = getDecimalEntity( s, len ); *codepointCount = 1; } } else { // otherwise, it's a named entity const Entity *entity = getTextEntity( s, len ); if(entity) { memcpy(codepoint, entity->codepoint, entity->codepoints*sizeof(int32_t)); *codepointCount = entity->codepoints; *utf8Len = (int32_t)entity->utf8Len; return len; } else { return 0; //unknown named entity } } // return 0 if not an entity, length of entity if it is an entity if ( codepoint[0] ) { return len; } else { return 0; } }
static bool isTLD ( char *tld , int32_t tldLen ) { int32_t pcount = 0; // now they are random! for ( int32_t i = 0 ; i < tldLen ; i++ ) { // period count if ( tld[i] == '.' ) { pcount++; continue; } if ( ! is_alnum_a(tld[i]) && tld[i] != '-' ) return false; } if ( pcount == 0 ) return true; if ( pcount >= 2 ) return false; // otherwise, if one period, check table to see if qualified // we use this as our hashtable static bool s_isInitialized = false; // . i shrunk this list a lot // . see backups for the hold list static const char * const s_tlds[] = { // From: https://data.iana.org/TLD/tlds-alpha-by-domain.txt "AAA", "AARP", "ABB", "ABBOTT", "ABBVIE", "ABOGADO", "ABUDHABI", "AC", "ACADEMY", "ACCENTURE", "ACCOUNTANT", "ACCOUNTANTS", "ACO", "ACTIVE", "ACTOR", "AD", "ADAC", "ADS", "ADULT", "AE", "AEG", "AERO", "AF", "AFL", "AG", "AGAKHAN", "AGENCY", "AI", "AIG", "AIRFORCE", "AIRTEL", "AKDN", "AL", "ALIBABA", "ALIPAY", "ALLFINANZ", "ALLY", "ALSACE", "AM", "AMICA", "AMSTERDAM", "ANALYTICS", "ANDROID", "ANQUAN", "AO", "APARTMENTS", "APP", "APPLE", "AQ", "AQUARELLE", "AR", "ARAMCO", "ARCHI", "ARMY", "ARPA", "ARTE", "AS", "ASIA", "ASSOCIATES", "AT", "ATTORNEY", "AU", "AUCTION", "AUDI", "AUDIO", "AUTHOR", "AUTO", "AUTOS", "AVIANCA", "AW", "AWS", "AX", "AXA", "AZ", "AZURE", "BA", "BABY", "BAIDU", "BAND", "BANK", "BAR", "BARCELONA", "BARCLAYCARD", "BARCLAYS", "BAREFOOT", "BARGAINS", "BAUHAUS", "BAYERN", "BB", "BBC", "BBVA", "BCG", "BCN", "BD", "BE", "BEATS", "BEER", "BENTLEY", "BERLIN", "BEST", "BET", "BF", "BG", "BH", "BHARTI", "BI", "BIBLE", "BID", "BIKE", "BING", "BINGO", "BIO", "BIZ", "BJ", "BLACK", "BLACKFRIDAY", "BLOOMBERG", "BLUE", "BM", "BMS", "BMW", "BN", "BNL", "BNPPARIBAS", "BO", "BOATS", "BOEHRINGER", "BOM", "BOND", "BOO", "BOOK", "BOOTS", "BOSCH", "BOSTIK", "BOT", "BOUTIQUE", "BR", "BRADESCO", "BRIDGESTONE", "BROADWAY", "BROKER", "BROTHER", "BRUSSELS", "BS", "BT", "BUDAPEST", "BUGATTI", "BUILD", "BUILDERS", "BUSINESS", "BUY", "BUZZ", "BV", "BW", "BY", "BZ", 
"BZH", "CA", "CAB", "CAFE", "CAL", "CALL", "CAMERA", "CAMP", "CANCERRESEARCH", "CANON", "CAPETOWN", "CAPITAL", "CAR", "CARAVAN", "CARDS", "CARE", "CAREER", "CAREERS", "CARS", "CARTIER", "CASA", "CASH", "CASINO", "CAT", "CATERING", "CBA", "CBN", "CC", "CD", "CEB", "CENTER", "CEO", "CERN", "CF", "CFA", "CFD", "CG", "CH", "CHANEL", "CHANNEL", "CHASE", "CHAT", "CHEAP", "CHLOE", "CHRISTMAS", "CHROME", "CHURCH", "CI", "CIPRIANI", "CIRCLE", "CISCO", "CITIC", "CITY", "CITYEATS", "CK", "CL", "CLAIMS", "CLEANING", "CLICK", "CLINIC", "CLINIQUE", "CLOTHING", "CLOUD", "CLUB", "CLUBMED", "CM", "CN", "CO", "COACH", "CODES", "COFFEE", "COLLEGE", "COLOGNE", "COM", "COMMBANK", "COMMUNITY", "COMPANY", "COMPARE", "COMPUTER", "COMSEC", "CONDOS", "CONSTRUCTION", "CONSULTING", "CONTACT", "CONTRACTORS", "COOKING", "COOL", "COOP", "CORSICA", "COUNTRY", "COUPON", "COUPONS", "COURSES", "CR", "CREDIT", "CREDITCARD", "CREDITUNION", "CRICKET", "CROWN", "CRS", "CRUISES", "CSC", "CU", "CUISINELLA", "CV", "CW", "CX", "CY", "CYMRU", "CYOU", "CZ", "DABUR", "DAD", "DANCE", "DATE", "DATING", "DATSUN", "DAY", "DCLK", "DE", "DEALER", "DEALS", "DEGREE", "DELIVERY", "DELL", "DELOITTE", "DELTA", "DEMOCRAT", "DENTAL", "DENTIST", "DESI", "DESIGN", "DEV", "DIAMONDS", "DIET", "DIGITAL", "DIRECT", "DIRECTORY", "DISCOUNT", "DJ", "DK", "DM", "DNP", "DO", "DOCS", "DOG", "DOHA", "DOMAINS", "DOWNLOAD", "DRIVE", "DUBAI", "DURBAN", "DVAG", "DZ", "EARTH", "EAT", "EC", "EDEKA", "EDU", "EDUCATION", "EE", "EG", "EMAIL", "EMERCK", "ENERGY", "ENGINEER", "ENGINEERING", "ENTERPRISES", "EPSON", "EQUIPMENT", "ER", "ERNI", "ES", "ESQ", "ESTATE", "ET", "EU", "EUROVISION", "EUS", "EVENTS", "EVERBANK", "EXCHANGE", "EXPERT", "EXPOSED", "EXPRESS", "EXTRASPACE", "FAGE", "FAIL", "FAIRWINDS", "FAITH", "FAMILY", "FAN", "FANS", "FARM", "FASHION", "FAST", "FEEDBACK", "FERRERO", "FI", "FILM", "FINAL", "FINANCE", "FINANCIAL", "FIRESTONE", "FIRMDALE", "FISH", "FISHING", "FIT", "FITNESS", "FJ", "FK", "FLICKR", "FLIGHTS", "FLORIST", "FLOWERS", 
"FLSMIDTH", "FLY", "FM", "FO", "FOO", "FOOTBALL", "FORD", "FOREX", "FORSALE", "FORUM", "FOUNDATION", "FOX", "FR", "FRESENIUS", "FRL", "FROGANS", "FRONTIER", "FTR", "FUND", "FURNITURE", "FUTBOL", "FYI", "GA", "GAL", "GALLERY", "GALLO", "GALLUP", "GAME", "GARDEN", "GB", "GBIZ", "GD", "GDN", "GE", "GEA", "GENT", "GENTING", "GF", "GG", "GGEE", "GH", "GI", "GIFT", "GIFTS", "GIVES", "GIVING", "GL", "GLASS", "GLE", "GLOBAL", "GLOBO", "GM", "GMAIL", "GMBH", "GMO", "GMX", "GN", "GOLD", "GOLDPOINT", "GOLF", "GOO", "GOOG", "GOOGLE", "GOP", "GOT", "GOV", "GP", "GQ", "GR", "GRAINGER", "GRAPHICS", "GRATIS", "GREEN", "GRIPE", "GROUP", "GS", "GT", "GU", "GUCCI", "GUGE", "GUIDE", "GUITARS", "GURU", "GW", "GY", "HAMBURG", "HANGOUT", "HAUS", "HDFCBANK", "HEALTH", "HEALTHCARE", "HELP", "HELSINKI", "HERE", "HERMES", "HIPHOP", "HITACHI", "HIV", "HK", "HM", "HN", "HOCKEY", "HOLDINGS", "HOLIDAY", "HOMEDEPOT", "HOMES", "HONDA", "HORSE", "HOST", "HOSTING", "HOTELES", "HOTMAIL", "HOUSE", "HOW", "HR", "HSBC", "HT", "HTC", "HU", "HYUNDAI", "IBM", "ICBC", "ICE", "ICU", "ID", "IE", "IFM", "IINET", "IL", "IM", "IMAMAT", "IMMO", "IMMOBILIEN", "IN", "INDUSTRIES", "INFINITI", "INFO", "ING", "INK", "INSTITUTE", "INSURANCE", "INSURE", "INT", "INTERNATIONAL", "INVESTMENTS", "IO", "IPIRANGA", "IQ", "IR", "IRISH", "IS", "ISELECT", "ISMAILI", "IST", "ISTANBUL", "IT", "ITAU", "IWC", "JAGUAR", "JAVA", "JCB", "JCP", "JE", "JETZT", "JEWELRY", "JLC", "JLL", "JM", "JMP", "JNJ", "JO", "JOBS", "JOBURG", "JOT", "JOY", "JP", "JPMORGAN", "JPRS", "JUEGOS", "KAUFEN", "KDDI", "KE", "KERRYHOTELS", "KERRYLOGISTICS", "KERRYPROPERTIES", "KFH", "KG", "KH", "KI", "KIA", "KIM", "KINDER", "KITCHEN", "KIWI", "KM", "KN", "KOELN", "KOMATSU", "KP", "KPMG", "KPN", "KR", "KRD", "KRED", "KUOKGROUP", "KW", "KY", "KYOTO", "KZ", "LA", "LACAIXA", "LAMBORGHINI", "LAMER", "LANCASTER", "LAND", "LANDROVER", "LANXESS", "LASALLE", "LAT", "LATROBE", "LAW", "LAWYER", "LB", "LC", "LDS", "LEASE", "LECLERC", "LEGAL", "LEXUS", "LGBT", "LI", 
"LIAISON", "LIDL", "LIFE", "LIFEINSURANCE", "LIFESTYLE", "LIGHTING", "LIKE", "LIMITED", "LIMO", "LINCOLN", "LINDE", "LINK", "LIPSY", "LIVE", "LIVING", "LIXIL", "LK", "LOAN", "LOANS", "LOCUS", "LOL", "LONDON", "LOTTE", "LOTTO", "LOVE", "LR", "LS", "LT", "LTD", "LTDA", "LU", "LUPIN", "LUXE", "LUXURY", "LV", "LY", "MA", "MADRID", "MAIF", "MAISON", "MAKEUP", "MAN", "MANAGEMENT", "MANGO", "MARKET", "MARKETING", "MARKETS", "MARRIOTT", "MBA", "MC", "MD", "ME", "MED", "MEDIA", "MEET", "MELBOURNE", "MEME", "MEMORIAL", "MEN", "MENU", "MEO", "MG", "MH", "MIAMI", "MICROSOFT", "MIL", "MINI", "MK", "ML", "MLS", "MM", "MMA", "MN", "MO", "MOBI", "MOBILY", "MODA", "MOE", "MOI", "MOM", "MONASH", "MONEY", "MONTBLANC", "MORMON", "MORTGAGE", "MOSCOW", "MOTORCYCLES", "MOV", "MOVIE", "MOVISTAR", "MP", "MQ", "MR", "MS", "MT", "MTN", "MTPC", "MTR", "MU", "MUSEUM", "MUTUAL", "MUTUELLE", "MV", "MW", "MX", "MY", "MZ", "NA", "NADEX", "NAGOYA", "NAME", "NATURA", "NAVY", "NC", "NE", "NEC", "NET", "NETBANK", "NETWORK", "NEUSTAR", "NEW", "NEWS", "NEXT", "NEXTDIRECT", "NEXUS", "NF", "NG", "NGO", "NHK", "NI", "NICO", "NIKON", "NINJA", "NISSAN", "NISSAY", "NL", "NO", "NOKIA", "NORTHWESTERNMUTUAL", "NORTON", "NOWRUZ", "NP", "NR", "NRA", "NRW", "NTT", "NU", "NYC", "NZ", "OBI", "OFFICE", "OKINAWA", "OLAYAN", "OM", "OMEGA", "ONE", "ONG", "ONL", "ONLINE", "OOO", "ORACLE", "ORANGE", "ORG", "ORGANIC", "ORIGINS", "OSAKA", "OTSUKA", "OVH", "PA", "PAGE", "PAMPEREDCHEF", "PANERAI", "PARIS", "PARS", "PARTNERS", "PARTS", "PARTY", "PASSAGENS", "PE", "PET", "PF", "PG", "PH", "PHARMACY", "PHILIPS", "PHOTO", "PHOTOGRAPHY", "PHOTOS", "PHYSIO", "PIAGET", "PICS", "PICTET", "PICTURES", "PID", "PIN", "PING", "PINK", "PIZZA", "PK", "PL", "PLACE", "PLAY", "PLAYSTATION", "PLUMBING", "PLUS", "PM", "PN", "POHL", "POKER", "P**N", "POST", "PR", "PRAXI", "PRESS", "PRO", "PROD", "PRODUCTIONS", "PROF", "PROGRESSIVE", "PROMO", "PROPERTIES", "PROPERTY", "PROTECTION", "PS", "PT", "PUB", "PW", "PWC", "PY", "QA", "QPON", "QUEBEC", 
"QUEST", "RACING", "RE", "READ", "REALTOR", "REALTY", "RECIPES", "RED", "REDSTONE", "REDUMBRELLA", "REHAB", "REISE", "REISEN", "REIT", "REN", "RENT", "RENTALS", "REPAIR", "REPORT", "REPUBLICAN", "REST", "RESTAURANT", "REVIEW", "REVIEWS", "REXROTH", "RICH", "RICOH", "RIO", "RIP", "RO", "ROCHER", "ROCKS", "RODEO", "ROOM", "RS", "RSVP", "RU", "RUHR", "RUN", "RW", "RWE", "RYUKYU", "SA", "SAARLAND", "SAFE", "SAFETY", "SAKURA", "SALE", "SALON", "SAMSUNG", "SANDVIK", "SANDVIKCOROMANT", "SANOFI", "SAP", "SAPO", "SARL", "SAS", "SAXO", "SB", "SBI", "SBS", "SC", "SCA", "SCB", "SCHAEFFLER", "SCHMIDT", "SCHOLARSHIPS", "SCHOOL", "SCHULE", "SCHWARZ", "SCIENCE", "SCOR", "SCOT", "SD", "SE", "SEAT", "SECURITY", "SEEK", "SELECT", "SENER", "SERVICES", "SEVEN", "SEW", "SEX", "SEXY", "SFR", "SG", "SH", "SHARP", "SHAW", "SHELL", "SHIA", "SHIKSHA", "SHOES", "SHOUJI", "SHOW", "SHRIRAM", "SI", "SINA", "SINGLES", "SITE", "SJ", "SK", "SKI", "SKIN", "SKY", "SKYPE", "SL", "SM", "SMILE", "SN", "SNCF", "SO", "SOCCER", "SOCIAL", "SOFTBANK", "SOFTWARE", "SOHU", "SOLAR", "SOLUTIONS", "SONG", "SONY", "SOY", "SPACE", "SPIEGEL", "SPOT", "SPREADBETTING", "SR", "SRL", "ST", "STADA", "STAR", "STARHUB", "STATEBANK", "STATEFARM", "STATOIL", "STC", "STCGROUP", "STOCKHOLM", "STORAGE", "STORE", "STREAM", "STUDIO", "STUDY", "STYLE", "SU", "SUCKS", "SUPPLIES", "SUPPLY", "SUPPORT", "SURF", "SURGERY", "SUZUKI", "SV", "SWATCH", "SWISS", "SX", "SY", "SYDNEY", "SYMANTEC", "SYSTEMS", "SZ", "TAB", "TAIPEI", "TALK", "TAOBAO", "TATAMOTORS", "TATAR", "TATTOO", "TAX", "TAXI", "TC", "TCI", "TD", "TEAM", "TECH", "TECHNOLOGY", "TEL", "TELECITY", "TELEFONICA", "TEMASEK", "TENNIS", "TEVA", "TF", "TG", "TH", "THD", "THEATER", "THEATRE", "TICKETS", "TIENDA", "TIFFANY", "TIPS", "TIRES", "TIROL", "TJ", "TK", "TL", "TM", "TMALL", "TN", "TO", "TODAY", "TOKYO", "TOOLS", "TOP", "TORAY", "TOSHIBA", "TOTAL", "TOURS", "TOWN", "TOYOTA", "TOYS", "TR", "TRADE", "TRADING", "TRAINING", "TRAVEL", "TRAVELERS", "TRAVELERSINSURANCE", "TRUST", 
"TRV", "TT", "TUBE", "TUI", "TUNES", "TUSHU", "TV", "TVS", "TW", "TZ", "UA", "UBS", "UG", "UK", "UNICOM", "UNIVERSITY", "UNO", "UOL", "US", "UY", "UZ", "VA", "VACATIONS", "VANA", "VC", "VE", "VEGAS", "VENTURES", "VERISIGN", "VERSICHERUNG", "VET", "VG", "VI", "VIAJES", "VIDEO", "VIG", "VIKING", "VILLAS", "VIN", "VIP", "VIRGIN", "VISION", "VISTA", "VISTAPRINT", "VIVA", "VLAANDEREN", "VN", "VODKA", "VOLKSWAGEN", "VOTE", "VOTING", "VOTO", "VOYAGE", "VU", "VUELOS", "WALES", "WALTER", "WANG", "WANGGOU", "WARMAN", "WATCH", "WATCHES", "WEATHER", "WEATHERCHANNEL", "WEBCAM", "WEBER", "WEBSITE", "WED", "WEDDING", "WEIBO", "WEIR", "WF", "WHOSWHO", "WIEN", "WIKI", "WILLIAMHILL", "WIN", "WINDOWS", "WINE", "WME", "WOLTERSKLUWER", "WORK", "WORKS", "WORLD", "WS", "WTC", "WTF", "XBOX", "XEROX", "XIHUAN", "XIN", "XN--11B4C3D", "XN--1CK2E1B", "XN--1QQW23A", "XN--30RR7Y", "XN--3BST00M", "XN--3DS443G", "XN--3E0B707E", "XN--3PXU8K", "XN--42C2D9A", "XN--45BRJ9C", "XN--45Q11C", "XN--4GBRIM", "XN--55QW42G", "XN--55QX5D", "XN--5TZM5G", "XN--6FRZ82G", "XN--6QQ986B3XL", "XN--80ADXHKS", "XN--80AO21A", "XN--80ASEHDB", "XN--80ASWG", "XN--8Y0A063A", "XN--90A3AC", "XN--90AIS", "XN--9DBQ2A", "XN--9ET52U", "XN--9KRT00A", "XN--B4W605FERD", "XN--BCK1B9A5DRE4C", "XN--C1AVG", "XN--C2BR7G", "XN--CCK2B3B", "XN--CG4BKI", "XN--CLCHC0EA0B2G2A9GCD", "XN--CZR694B", "XN--CZRS0T", "XN--CZRU2D", "XN--D1ACJ3B", "XN--D1ALF", "XN--E1A4C", "XN--ECKVDTC9D", "XN--EFVY88H", "XN--ESTV75G", "XN--FCT429K", "XN--FHBEI", "XN--FIQ228C5HS", "XN--FIQ64B", "XN--FIQS8S", "XN--FIQZ9S", "XN--FJQ720A", "XN--FLW351E", "XN--FPCRJ9C3D", "XN--FZC2C9E2C", "XN--G2XX48C", "XN--GCKR3F0F", "XN--GECRJ9C", "XN--H2BRJ9C", "XN--HXT814E", "XN--I1B6B1A6A2E", "XN--IMR513N", "XN--IO0A7I", "XN--J1AEF", "XN--J1AMH", "XN--J6W193G", "XN--JLQ61U9W7B", "XN--JVR189M", "XN--KCRX77D1X4A", "XN--KPRW13D", "XN--KPRY57D", "XN--KPU716F", "XN--KPUT3I", "XN--L1ACC", "XN--LGBBAT1AD8J", "XN--MGB9AWBF", "XN--MGBA3A3EJT", "XN--MGBA3A4F16A", "XN--MGBA7C0BBN0A", 
"XN--MGBAAM7A8H", "XN--MGBAB2BD", "XN--MGBAYH7GPA", "XN--MGBB9FBPOB", "XN--MGBBH1A71E", "XN--MGBC0A9AZCG", "XN--MGBCA7DZDO", "XN--MGBERP4A5D4AR", "XN--MGBPL2FH", "XN--MGBT3DHD", "XN--MGBTX2B", "XN--MGBX4CD0AB", "XN--MIX891F", "XN--MK1BU44C", "XN--MXTQ1M", "XN--NGBC5AZD", "XN--NGBE9E0A", "XN--NODE", "XN--NQV7F", "XN--NQV7FS00EMA", "XN--NYQY26A", "XN--O3CW4H", "XN--OGBPF8FL", "XN--P1ACF", "XN--P1AI", "XN--PBT977C", "XN--PGBS0DH", "XN--PSSY2U", "XN--Q9JYB4C", "XN--QCKA1PMC", "XN--QXAM", "XN--RHQV96G", "XN--ROVU88B", "XN--S9BRJ9C", "XN--SES554G", "XN--T60B56A", "XN--TCKWE", "XN--UNUP4Y", "XN--VERMGENSBERATER-CTB", "XN--VERMGENSBERATUNG-PWB", "XN--VHQUV", "XN--VUQ861B", "XN--W4R85EL8FHU5DNRA", "XN--WGBH1C", "XN--WGBL6A", "XN--XHQ521B", "XN--XKC2AL3HYE2A", "XN--XKC2DL3A5EE0H", "XN--Y9A3AQ", "XN--YFRO4I67O", "XN--YGBI2AMMX", "XN--ZFR164B", "XPERIA", "XXX", "XYZ", "YACHTS", "YAHOO", "YAMAXUN", "YANDEX", "YE", "YODOBASHI", "YOGA", "YOKOHAMA", "YOU", "YOUTUBE", "YT", "YUN", "ZA", "ZARA", "ZERO", "ZIP", "ZM", "ZONE", "ZUERICH", "ZW", "AB.CA", "AC.AE", "AC.AT", "AC.CN", "AC.CR", "AC.CY", "AC.FJ", "AC.GG", "AC.ID", "AC.IL", "AC.IM", "AC.IN", "AC.JE", "AC.JP", "AC.KR", "AC.NZ", "AC.PA", "AC.TH", "AC.UG", "AC.UK", "AC.YU", "AC.ZA", "AD.JP", "AH.CN", "ALDERNEY.GG", "ALT.ZA", "ART.BR", "ART.DO", "ARTS.CO", "ARTS.VE", "ASN.AU", "ASN.LV", "BBS.TR", "BC.CA", "BIB.VE", "BJ.CN", "CO.AT", "CO.AO", "CO.CK", "CO.CR", "CO.GG", "CO.HU", "CO.ID", "CO.IL", "CO.IM", "CO.IN", "CO.JE", "CO.JP", "CO.KR", "COM.AR", "COM.AU", "COM.AZ", "COM.BB", "COM.BM", "COM.BR", "COM.BS", "COM.CN", "COM.CO", "COM.CU", "COM.CY", "COM.DO", "COM.EC", "COM.EG", "COM.FJ", "COM.GE", "COM.GU", "COM.HK", "COM.JO", "COM.KH", "COM.LA", "COM.LB", "COM.LC", "COM.LV", "COM.LY", "COM.MM", "COM.MO", "COM.MT", "COM.MX", "COM.MY", "COM.NA", "COM.NC", "COM.NI", "COM.NP", "COM.PA", "COM.PE", "COM.PH", "COM.PL", "COM.PY", "COM.RU", "COM.SG", "COM.SH", "COM.SY", "COM.TN", "COM.TR", "COM.TW", "COM.UA", "COM.UY", "COM.VE", "CONF.AU", 
"CONF.LV", "CO.NZ", "COOP", "CO.AE", "CO.SV", "CO.TH", "CO.UG", "CO.UK", "CO.VE", "CO.VI", "CO.YU", "CO.ZA", "CQ.CN", "CSIRO.AU", "ED.CR", "EDU.BM", "EDU.AR", "EDU.CN", "EDU.CO", "EDU.DO", "EDU.EC", "EDU.EG", "EDU.GE", "EDU.GU", "EDU.JO", "EDU.LC", "EDU.LV", "EDU.MM", "EDU.MO", "EDU.MY", "EDUNET.TN", "EDU.PA", "EDU.PY", "EDU.SG", "EDU.SH", "EDU.TR", "EDU.TW", "EDU.UY", "EDU.VE", "EDU.YU", "EDU.ZA", "ENS.TN", "ERNET.IN", "ESP.BR", "ETC.BR", "EUN.EG", "FI.CR", "FIN.EC", "FIN.TN", "FIRM.CO", "FIRM.VE", "G12.BR", "GD.CN", "GEN.NZ", "GOB.PA", "GO.CR", "GO.ID", "GO.KR", "GO.TH", "GO.UG", "GOV.AE", "GOV.AR", "GOV.AU", "GOV.BM", "GOV.BR", "GOV.CN", "GOV.CO", "GOV.CY", "GOV.DO", "GOV.EC", "GOV.EG", "GOVE.TW", "GOV.FJ", "GOV.GE", "GOV.GG", "GOV.GU", "GOV.IL", "GOV.IM", "GOV.IN", "GOV.JE", "GOV.JO", "GOV.JP", "GOV.LB", "GOV.LC", "GOV.LV", "GOV.MM", "GOV.MO", "GOV.MY", "GOV.SG", "GOV.SH", "GOV.TN", "GOVT.NZ", "GOV.TR", "GOV.UA", "GOV.UK", "GOV.VE", "GOV.ZA", "GS.CN", "GUERNSEY.GG", "GX.CN", "GZ.CN", "HB.CN", "HE.CN", "HI.CN", "HK.CN", "HL.CN", "HN.CN", "ID.AU", "ID.FJ", "ID.LV", "IND.BR", "IND.GG", "IND.JE", "IND.TN", "INF.BR", "INFO.AU", "INFO.CO", "INFO.HU", "INFO.TN", "INFO.VE", "INT.CO", "INTL.TN", "INT.VE", "JERSEY.JE", "JL.CN", "JS.CN", "K12.EC", "K12.IL", "K12.TR", "LKD.CO.IM", "LN.CN", "LTD.GG", "LTD.JE", "LTD.UK", "MB.CA", "MED.EC", "MIL.BR", "MIL.CO", "MIL.DO", "MIL.EC", "MIL.GE", "MIL.GU", "MIL.ID", "MIL.LB", "MIL.LV", "MIL.PH", "MIL.SH", "MIL.TR", "MIL.VE", "MIL.ZA", "MO.CN", "MOD.UK", "MUNI.IL", "MUSEUM", "NAME", "NAT.TN", "NB.CA", "NET.AR", "NET.AU", "NET.AZ", "NET.BB", "NET.BM", "NET.BR", "NET.BS", "NET.CN", "NET.CU", "NET.CY", "NET.DO", "NET.EC", "NET.EG", "NET.GE", "NET.GG", "NET.GU", "NET.HK", "NET.ID", "NET.IL", "NET.IM", "NET.IN", "NET.JE", "NET.JO", "NET.JP", "NET.KH", "NET.LA", "NET.LB", "NET.LC", "NET.LV", "NET.LY", "NET.MM", "NET.MO", "NET.MT", "NET.MX", "NET.MY", "NET.NA", "NET.NC", "NET.NP", "NET.NZ", "NET.PA", "NET.PE", "NET.PH", "NET.PL", "NET.PY", 
"NET.RU", "NET.SG", "NET.SH", "NET.SY", "NET.TH", "NET.TN", "NET.TR", "NET.TW", "NET.UA", "NET.UK", "NET.UY", "NET.VE", "NET.VI", "NET.ZA", "NF.CA", "NGO.PH", "NGO.ZA", "NHS.UK", "NIC.IM", "NIC.IN", "NM.CN", "NM.KR", "NOM.CO", "NOM.VE", "NOM.ZA", "NS.CA", "NSK.SU", "NT.CA", "NUI.HU", "NX.CN", "ON.CA", "OR.CR", "ORG.AE", "ORG.AR", "ORG.AU", "ORG.AZ", "ORG.BB", "ORG.BM", "ORG.BR", "ORG.BS", "ORG.CN", "ORG.CO", "ORG.CU", "ORG.CY", "ORG.DO", "ORG.EC", "ORG.EG", "ORG.FJ", "ORG.GE", "ORG.GG", "ORG.GU", "ORG.HK", "ORG.HU", "ORG.IL", "ORG.IM", "ORG.JE", "ORG.JP", "ORG.KH", "ORG.LA", "ORG.LB", "ORG.LC", "ORG.LV", "ORG.LY", "ORG.MM", "ORG.MO", "ORG.MT", "ORG.MX", "ORG.MY", "ORG.NA", "ORG.NC", "ORG.NZ", "ORG.PA", "ORG.PE", "ORG.PH", "ORG.PL", "ORG.PY", "ORG.RU", "ORG.SG", "ORG.SH", "ORG.SY", "ORG.TN", "ORG.TR", "ORG.TW", "ORG.UK", "ORG.UY", "ORG.VE", "ORG.VI", "ORG.YU", "ORG.ZA", "OR.ID", "OR.KR", "OR.TH", "ORT.NP", "OR.UG", "OZ.AU", "PE.CA", "PLC.CO.IM", "PLC.UK", "POLICE.UK", "PRIV.HU", "PSI.BR", "PVT.GE", "QC.CA", "QH.CN", "REC.BR", "REC.CO", "REC.VE", "RE.KR", "RES.IN", "RNRT.TN", "RNS.TN", "RNU.TN", "SA.CR", "SARK.GG", "SC.CN", "SCH.GG", "SCH.JE", "SCHOOL.FJ", "SCHOOL.ZA", "SCH.UK", "SCI.EG", "SH.CN", "SK.CA", "SLD.PA", "SN.CN", "STORE.CO", "STORE.VE", "SX.CN", "TEC.VE", "TELEMEMO.AU", "TJ.CN", "TM.HU", "TMP.BR", "TM.ZA", "TOURISM.TN", "TW.CN", "WEB.CO", "WEB.DO", "WEB.VE", "WEB.ZA", "XJ.CN", "XZ.CN", "YK.CA", "YN.CN", "ZJ.CN" }; if ( ! s_isInitialized ) { // set up the hash table if ( ! s_table.set ( 8 , 0, sizeof(s_tlds)*2,NULL,0,false,0, "tldtbl") ) return log("build: Could not init table of TLDs."); // now add in all the stop words int32_t n = (int32_t)sizeof(s_tlds)/ sizeof(char *); for ( int32_t i = 0 ; i < n ; i++ ) { const char *d = s_tlds[i]; int32_t dlen = gbstrlen ( d ); int64_t dh = hash64Lower_a ( d , dlen ); if ( ! 
s_table.addKey (&dh,NULL) ) return log("build: dom table failed"); } s_isInitialized = true; } int64_t h = hash64Lower_a ( tld , tldLen ); // gbstrlen(tld)); return s_table.isInTable ( &h );//getScoreFromTermId ( h ); }
// // hardcoded support for popular formats and sites // bool SiteGetter::setRecognizedSite ( ) { // clear just in case g_errno = 0; // get path of url char *p = m_url; for ( ; *p && *p != ':' ; p++ ); // error? if ( *p != ':' ) return false; // skip :// p += 3; // save host ptr char *host = p; // then another / for the path for ( ; *p && *p != '/' ; p++ ); // error? if ( *p != '/' ) return false; // // ok, "p" now points to the path // char *path = p; // convenience vars int32_t len = 0; // . deal with site indicators // . these are applied to all domains uniformly // . if it is xyz.com/users/ use xyz.com/users/fred/ as the site // a lot of times these were not indivual blogs, but the blog subsite // of a site... http://dccc.org/blog/P4575/ //if ( strncasecmp(p,"/blogs/" , 7) == 0 ) len = 7; //if ( strncasecmp(p,"/blog/" , 6) == 0 ) len = 6; // commented out a bunch cuz they were profiles mostly, not blogs... if ( strncasecmp(p,"/~" , 2) == 0 ) len = 2; // assume this is a username. skip the first / //if ( sitepathdepth == 1 ) len = 1; if ( strncasecmp(p,"/users/" , 7) == 0 ) len = 7; if ( strncasecmp(p,"/user/" , 6) == 0 ) len = 6; if ( strncasecmp(p,"/members/" , 9) == 0 ) len = 9; if ( strncasecmp(p,"/membres/" , 9) == 0 ) len = 9; if ( strncasecmp(p,"/member/" , 8) == 0 ) len = 8; if ( strncasecmp(p,"/membre/" , 8) == 0 ) len = 8; if ( strncasecmp(p,"/member.php?u=",14) == 0 ) len = 14; // point to after the /users/, /blogs/, /user/, /blog/ or /~xxx/ p += len; // assume there is NOT an alpha char after this char username = false; // . skip to next / OR ? // . stop at . or -, because we do not allow those in usernames and // they are often indicative of filenames without file extensions // . no, fix http://www.rus-obr.ru/users/maksim-sokolov (no - or _ or.) while ( len && *p && *p!= '/'&&*p!='?' ) { // sometimes usernames are numbers!!! 
//if ( is_alpha_a(*p) ) username = true; // http://stackoverflow.com/users/271376/sigterm if ( is_alnum_a(*p) ) username = true; p++; } // if we hit this, not a username //if ( *p=='.' || *p == '-' || *p == '_' ) username = false; // did we get a match? // . www.cits.ucsb.edu/users/michael-osborne // . www.cits.ucsb.edu/users/michael-osborne/ // . after /blog/ or /~ should be another / or \0, not a period, // because that indicates probably a filename, which is not right, // because we are expecting a username! if ( username && p - host + 6 < MAX_SITE_LEN ) { // jump up here to store storeIt: // for parsing char *x = m_site; // store www first if its a domain only url if ( ! m_hasSubdomain ) { gbmemcpy ( x , "www." , 4 ); x += 4; } // store it gbmemcpy ( x , host , p - host ); x += p - host; // set the length of it m_siteLen = x - m_site; // make it end on a '/' if we can if ( m_site[m_siteLen-1] != '/' && // watch out for /?uid=xxxx crap m_site[m_siteLen-1] != '=' ) { // force the / then m_site[m_siteLen] = '/'; m_siteLen++; } // null term the site m_site [ m_siteLen ] = '\0'; return true; } // // popular homesteads // int32_t depth = 0; // term host char c = *path; *path = '\0'; if ( strstr(host,"vimeo.com" ) ) depth = 1; if ( strstr(host,"www.myspace.com") ) depth = 1; if ( strstr(host,"twitter.com" ) ) depth = 1; if ( strstr(host,"www.facebook.com") ) depth = 1; // revert *path = c; // return false to indicate no recognized site detected if ( ! depth ) return false; // skip over the initial root / after the hostname p = path + 1; // no path really? root path? just return the hostname then if ( ! *p && path - host + 6 < MAX_SITE_LEN ) { // for parsing char *x = m_site; // store www first if its a domain only url if ( ! m_hasSubdomain ) { gbmemcpy ( x , "www." 
, 4 ); x += 4; } // store it gbmemcpy ( x , host , path - host ); x += path - host; m_siteLen = x - m_site; m_site [ m_siteLen ] = '\0'; return true; } // for depth for ( ; *p ; p++ ) if ( *p == '/' && --depth == 0 ) break; if ( p - host + 6 >= MAX_SITE_LEN ) return false; goto storeIt; return true; }
// . splits the text at "s" into words and appends them to the parallel arrays
//   m_words / m_wordLens / m_wordIds / m_nodes (and m_tagIds when non-NULL)
// . three kinds of word are produced: a tag word (only when m_hasTags and
//   isTagStart() accept it), a punctuation word (run of non-alnum chars) and
//   an alnum word
// . "nodeLen" bounds the scan, but it is only tested at "uptop", i.e. between
//   words; the inner loops stop on a NUL byte
// . "computeWordIds": when true, alnum words get hash64Lower_utf8() of their
//   bytes as word id; when false m_wordIds is not written for alnum words.
//   tag and punctuation words always get id 0
// . stops early once m_numWords reaches m_preCount
// . always returns true
bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds ) {
	// current byte offset into s
	int32_t i = 0;
	// start offset of the alnum word being built
	int32_t j;
	int32_t wlen;

	// have we already absorbed one apostrophe into the current alnum word?
	bool hadApostrophe = false;

	// script of the most recent word char; used to split on script change
	UCScript oldScript = ucScriptCommon;
	UCScript saved;
	UCProps props;

 uptop:
	// bad utf8 can cause a breach
	if ( i >= nodeLen ) {
		goto done;
	}

	if ( ! s[i] ) {
		goto done;
	}

	if ( !is_alnum_utf8( s + i ) ) {
		if ( m_numWords >= m_preCount ) {
			goto done;
		}

		// tag?
		if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
			// get the tag id
			if( m_tagIds ) {
				if ( s[i + 1] == '/' ) {
					// skip over / and flag it as a back (closing) tag
					m_tagIds[m_numWords] = ::getTagId( s + i + 2 );
					m_tagIds[m_numWords] |= BACKBIT;
				} else {
					m_tagIds[m_numWords] = ::getTagId( s + i + 1 );
				}
			}

			m_words[m_numWords] = s + i;
			m_wordIds[m_numWords] = 0LL;

			// skip till end
			int32_t tagLen = getTagLen( s + i );
			m_wordLens[m_numWords] = tagLen;
			m_nodes[m_numWords] = 0;
			m_numWords++;

			// advance
			i += tagLen;
			goto uptop;
		}

		// it is a punct word, find end of it
		// NOTE(review): if s[i] is '<' with m_hasTags set but
		// isTagStart() rejected it above, the loop below breaks on its
		// first test, so a zero-length word is recorded and i does not
		// advance -- confirm callers can never feed such input, else
		// this repeats until m_preCount is reached
		char *start = s+i;
		for ( ; s[i] ; i += getUtf8CharSize(s+i)) {
			// stop on < if we got tags
			if ( s[i] == '<' && m_hasTags ) {
				break;
			}

			// if we are simple ascii, skip quickly
			if ( is_ascii(s[i]) ) {
				// accumulate NON-alnum chars
				if ( ! is_alnum_a(s[i]) ) {
					continue;
				}

				// update: the alnum word about to start is ascii
				oldScript = ucScriptCommon;

				// otherwise, stop we got alnum
				break;
			}

			// if we are utf8 we stop on special props
			UChar32 c = utf8Decode ( s+i );

			// keep going unless it is a word char
			if ( ! ucIsWordChar ( c ) ) {
				continue;
			}

			// update first though: remember the script of the char
			// that starts the next alnum word
			oldScript = ucGetScript ( c );

			// then stop
			break;
		}
		m_words [ m_numWords ] = start;
		m_wordLens [ m_numWords ] = s+i - start;
		m_wordIds [ m_numWords ] = 0LL;
		m_nodes [ m_numWords ] = 0;

		if (m_tagIds) {
			m_tagIds[m_numWords] = 0;
		}

		m_numWords++;
		goto uptop;
	}

	// get an alnum word
	j = i;
 again:
	for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
		// simple ascii?
		if ( is_ascii(s[i]) ) {
			// accumulate alnum chars
			if ( is_alnum_a(s[i]) ) continue;
			// update
			oldScript = ucScriptCommon;
			// otherwise, stop we got punct
			break;
		}
		// get the code point of the utf8 char
		UChar32 c = utf8Decode ( s+i );
		// get props
		props = ucProperties ( c );
		// chars flagged ignorable or extending stay inside the word
		if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
		// stop? if UC_WORDCHAR is set, that means its an alnum
		if ( ! ( props & UC_WORDCHAR ) ) {
			// reset script between words
			oldScript = ucScriptCommon;
			break;
		}
		// save it
		saved = oldScript;
		// update here
		oldScript = ucGetScript(c);
		// treat ucScriptLatin (30) as common so we can have latin1
		// like char without breaking the word!
		if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
		// chars with any of these props end the word right after
		// themselves (presumably so each such char is its own word
		// boundary -- confirm intent)
		if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
			// include it
			i += getUtf8CharSize(s+i);
			// but stop
			break;
		}
		// script change?
		if ( saved != oldScript ) break;
	}

	// . java++, A++, C++ exception
	// . A+, C+, exception
	// . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE
	if ( s[i]=='+' ) {
		if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
		else if ( !is_alnum_utf8(&s[i+1]) ) i++;
	}
	// . c#, j#, ...
	if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;

	// comma is ok if the word so far is 1 to 3 digits and the comma is
	// followed by exactly three digits, like 1,234 or 12,345,678
	// NOTE(review): s[i-1] assumes the loop above consumed at least one
	// byte (i > j) -- confirm is_alnum_utf8() guarantees that
	if ( s[i]==',' && i-j <= 3 && is_digit(s[i-1]) ) {
		// if word so far is 2 or 3 chars, make sure digits
		if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
		if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
		// scan forward over each ",ddd" group
		while ( s[i] == ',' && is_digit(s[i+1]) && is_digit(s[i+2]) && is_digit(s[i+3]) && ! is_digit(s[i+4]) ) {
			i += 4;
		}
	}

	// decimal point? only between two digits
	if ( s[i] == '.' && is_digit(s[i-1]) && is_digit(s[i+1]) ) {
		// allow the decimal point
		i++;
		// skip over string of digits
		while ( is_digit(s[i]) ) i++;
	}

 nogo:

	// allow for words like we're dave's and i'm (at most one apostrophe
	// per word, and it must be followed by an alnum char)
	if ( s[i] == '\'' && s[i + 1] && is_alnum_utf8( &s[i + 1] ) && !hadApostrophe ) {
		i++;
		hadApostrophe = true;
		goto again;
	}
	hadApostrophe = false;

	// get word length
	wlen = i - j;
	// out of preallocated slots? then drop this word and finish
	if ( m_numWords >= m_preCount ) goto done;
	m_words [ m_numWords ] = &s[j];
	m_wordLens[ m_numWords ] = wlen;

	if ( computeWordIds ) {
		int64_t h = hash64Lower_utf8(&s[j],wlen);
		m_wordIds [m_numWords] = h;
	}

	m_nodes[m_numWords] = 0;
	if (m_tagIds) m_tagIds[m_numWords] = 0;
	m_numWords++;
	m_numAlnumWords++;
	// get a punct word
	goto uptop;

 done:
	// bad programming warning: we wrote more words than were pre-counted
	if ( m_numWords > m_preCount ) {
		log(LOG_LOGIC, "build: words: set: Fix counting routine.");
		gbshutdownLogicError();
	}

	return true;
}
bool Log::logR ( long long now , long type , char *msg , bool asterisk , bool forced ) { // filter if we should //if ( forced ) goto skipfilter; // return true if we should not log this if ( ! forced && ! shouldLog ( type , msg ) ) return true; // skipfilter: // can we log if we're a sig handler? don't take changes if ( g_inSigHandler ) return logLater ( now , type , msg , NULL ); //if ( g_inSigHandler ) return false; // get "msg"'s length long msgLen = gbstrlen ( msg ); #ifdef PTHREADS // lock for threads pthread_mutex_lock ( &s_lock ); #endif // do a timestamp, too. use the time synced with host #0 because // it is easier to debug because all log timestamps are in sync. if ( now == 0 ) now = gettimeofdayInMillisecondsGlobalNoCore(); // . skip all logging if power out, we do not want to screw things up // . allow logging for 10 seconds after power out though if ( ! g_process.m_powerIsOn && now - g_process.m_powerOffTime >10000){ #ifdef PTHREADS pthread_mutex_unlock ( &s_lock ); #endif return false; } //if ( now == 0 ) now = g_nowApprox; // chop off any spaces at the end of the msg. 
while ( is_wspace_a ( msg [ msgLen - 1 ] ) && msgLen > 0 ) msgLen--; // get this pid pid_t pid = getpidtid(); // a tmp buffer char tt [ MAX_LINE_LEN ]; char *p = tt; char *pend = tt + MAX_LINE_LEN; /* // print timestamp, hostid, type if ( g_hostdb.m_numHosts <= 999 ) sprintf ( p , "%llu %03li %s ", now , g_hostdb.m_hostId , getTypeString(type) ); else if ( g_hostdb.m_numHosts <= 9999 ) sprintf ( p , "%llu %04li %s ", now , g_hostdb.m_hostId , getTypeString(type) ); else if ( g_hostdb.m_numHosts <= 99999 ) sprintf ( p , "%llu %05li %s ", now , g_hostdb.m_hostId , getTypeString(type) ); */ // print timestamp, hostid, type if ( m_logTimestamps ) { if ( g_hostdb.m_numHosts <= 999 ) sprintf ( p , "%llu %03li ", now , g_hostdb.m_hostId ); else if ( g_hostdb.m_numHosts <= 9999 ) sprintf ( p , "%llu %04li ", now , g_hostdb.m_hostId ); else if ( g_hostdb.m_numHosts <= 99999 ) sprintf ( p , "%llu %05li ", now , g_hostdb.m_hostId ); p += gbstrlen ( p ); } // msg resource char *x = msg; long cc = 7; // the first 7 bytes or up to the : must be ascii //while ( p < pend && *x && is_alnum_a(*x) ) { *p++ = *x++; cc--; } // space pad //while ( cc-- > 0 ) *p++ = ' '; // ignore the label for now... while ( p < pend && *x && is_alnum_a(*x) ) { x++; cc--; } // thread id if in "thread" if ( pid != s_pid && s_pid != -1 ) { //sprintf ( p , "[%li] " , (long)getpid() ); sprintf ( p , "[%lu] " , (unsigned long)pid ); p += gbstrlen ( p ); } // then message itself long avail = (MAX_LINE_LEN) - (p - tt) - 1; if ( msgLen > avail ) msgLen = avail; if ( *x == ':' ) x++; if ( *x == ' ' ) x++; strncpy ( p , x , avail ); // capitalize for consistency. no, makes grepping log msgs harder. //if ( is_alpha_a(*p) ) *p = to_upper_a(*p); p += gbstrlen(p); // back up over spaces while ( p[-1] == ' ' ) p--; // end in period or ? or ! //if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' 
) // *p++ = '.'; *p ='\0'; // the total length, not including the \0 long tlen = p - tt; // call sprintf, but first make sure we have room in m_buf and in // the arrays. who know how much room the sprintf is going to need??? // NOTE: TODO: this is shaky -- fix it! if ( m_bufPtr + tlen >= 1024 * 32 || m_numErrors >= MAX_LOG_MSGS){ // this sets m_bufPtr to 0 if ( ! dumpLog ( ) ) { fprintf(stderr,"Log::log: could not dump to file!\n"); #ifdef PTHREADS pthread_mutex_unlock ( &s_lock ); #endif return false; } } // . filter out nasty chars from the message // . replace with ~'s char cs; char *ttp = tt; char *ttpend = tt + tlen; for ( ; ttp < ttpend ; ttp += cs ) { cs = getUtf8CharSize ( ttp ); if ( is_binary_utf8 ( ttp ) ) { for ( long k = 0 ; k < cs ; k++ ) *ttp++ = '.'; // careful not to skip the already skipped bytes cs = 0; continue; } // convert \n's and \r's to spaces if ( *ttp == '\n' ) *ttp = ' '; if ( *ttp == '\r' ) *ttp = ' '; if ( *ttp == '\t' ) *ttp = ' '; } if ( m_fd >= 0 ) { write ( m_fd , tt , tlen ); write ( m_fd , "\n", 1 ); } else { // print it out for now fprintf ( stderr, "%s\n", tt ); } // set the stuff in the array m_errorMsg [m_numErrors] = msg; m_errorMsgLen [m_numErrors] = msgLen; m_errorTime [m_numErrors] = now; m_errorType [m_numErrors] = type; // increase the # of errors m_numErrors++; #ifdef PTHREADS // unlock for threads pthread_mutex_unlock ( &s_lock ); #endif return false; }
bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) { // get collection name //int32_t nclen; //char *nc = r->getString ( "nc" , &nclen ); //int32_t cpclen; //char *cpc = r->getString ( "cpc" , &cpclen ); g_errno = 0; //bool cast = r->getLong("cast",0); const char *msg = NULL; // if any host in network is dead, do not do this //if ( g_hostdb.hasDeadHost() ) msg = "A host in the network is dead."; char format = r->getReplyFormat(); if ( format == FORMAT_XML || format == FORMAT_JSON ) { // no addcoll given? int32_t page = g_pages.getDynamicPageNumber ( r ); const char *addcoll = r->getString("addcoll",NULL); const char *delcoll = r->getString("delcoll",NULL); if ( ! addcoll ) addcoll = r->getString("addColl",NULL); if ( ! delcoll ) delcoll = r->getString("delColl",NULL); if ( page == PAGE_ADDCOLL && ! addcoll ) { g_errno = EBADENGINEER; const char *msg = "no addcoll parm provided"; return g_httpServer.sendErrorReply(s,g_errno,msg,NULL); } if ( page == PAGE_DELCOLL && ! delcoll ) { g_errno = EBADENGINEER; const char *msg = "no delcoll parm provided"; return g_httpServer.sendErrorReply(s,g_errno,msg,NULL); } return g_httpServer.sendSuccessReply(s,format); } // error? const char *action = r->getString("action",NULL); const char *addColl = r->getString("addcoll",NULL); char buf [ 64*1024 ]; SafeBuf p(buf, 64*1024); // // CLOUD SEARCH ENGINE SUPPORT - GIGABOT ERRORS // SafeBuf gtmp; char *gmsg = NULL; // is it too big? if ( action && addColl && strlen(addColl) > MAX_COLL_LEN ) { gtmp.safePrintf("search engine name is too long"); gmsg = gtmp.getBufStart(); } // from Collectiondb.cpp::addNewColl() ensure coll name is legit const char *x = addColl; for ( ; x && *x ; x++ ) { if ( is_alnum_a(*x) ) continue; if ( *x == '-' ) continue; if ( *x == '_' ) continue; // underscore now allowed break; } if ( x && *x ) { g_errno = EBADENGINEER; gtmp.safePrintf("<font color=red>Error. 
\"%s\" is a " "malformed name because it " "contains the '%c' character.</font><br><br>", addColl,*x); gmsg = gtmp.getBufStart(); } // // END GIGABOT ERRORS // // // CLOUD SEARCH ENGINE SUPPORT // // if added the coll successfully, do not print same page, jump to // printing the basic settings page so they can add sites to it. // crap, this GET request, "r", is missing the "c" parm sometimes. // we need to use the "addcoll" parm anyway. maybe print a meta // redirect then? char guide = r->getLong("guide",0); // do not redirect if gmsg is set, there was a problem with the name if ( action && ! msg && format == FORMAT_HTML && guide && ! gmsg ) { //return g_parms.sendPageGeneric ( s, r, PAGE_BASIC_SETTINGS ); // just redirect to it if ( addColl ) p.safePrintf("<meta http-equiv=Refresh " "content=\"0; URL=/admin/settings" "?guide=1&c=%s\">", addColl); return g_httpServer.sendDynamicPage (s, p.getBufStart(), p.length()); } // print standard header g_pages.printAdminTop ( &p , s , r , NULL, "onload=document." "getElementById('acbox').focus();"); if ( g_errno ) { msg = mstrerror( g_errno ); } if ( msg && ! guide ) { const char *cc = "deleting"; if ( add ) cc = "adding"; p.safePrintf ( "<center>\n" "<font color=red>" "<b>Error %s collection: %s. " "See log file for details.</b>" "</font>" "</center><br>\n",cc,msg); } // // CLOUD SEARCH ENGINE SUPPORT // if ( add && guide ) printGigabotAdvice ( &p , PAGE_ADDCOLL , r , gmsg ); // print the add collection box if ( add /*&& (! nc[0] || g_errno ) */ ) { const char *t1 = "Add Collection"; if ( guide ) t1 = "Add Search Engine"; p.safePrintf ( "<center>\n<table %s>\n" "<tr class=hdrow><td colspan=2>" "<center><b>%s</b></center>" "</td></tr>\n" ,TABLE_STYLE ,t1 ); const char *t2 = "collection"; if ( guide ) t2 = "search engine"; const char *str = addColl; if ( ! 
addColl ) str = ""; p.safePrintf ( "<tr bgcolor=#%s>" "<td><b>name of new %s to add</td>\n" "<td><input type=text name=addcoll size=30 " "id=acbox " "value=\"%s\">" "</td></tr>\n" , LIGHT_BLUE , t2 , str ); // don't show the clone box if we are under gigabot the guide if ( ! guide ) p.safePrintf( "<tr bgcolor=#%s>" "<td><b>clone settings from this " "collection</b>" "<br><font size=1>Copy settings from " "this pre-existing collection. Leave " "blank to " "accept default values.</font></td>\n" "<td><input type=text name=clonecoll " "size=30>" "</td>" "</tr>" , LIGHT_BLUE ); // collection pwds p.safePrintf( "<tr bgcolor=#%s>" "<td><b>collection passwords" "</b>" "<br><font size=1>List of white space separated " "passwords allowed to adminster collection." "</font>" "</td>\n" "<td><input type=text name=collpwd " "size=60>" "</td>" "</tr>" , LIGHT_BLUE ); // ips box for security p.safePrintf( "<tr bgcolor=#%s>" "<td><b>collection ips" "</b>" "<br><font size=1>List of white space separated " "IPs allowed to adminster collection." "</font>" "</td>\n" "<td><input type=text name=collips " "size=60>" "</td>" "</tr>" , LIGHT_BLUE ); // now list collections from which to copy the config //p.safePrintf ( // "<tr><td><b>copy configuration from this " // "collection</b><br><font size=1>Leave blank to " // "accept default values.</font></td>\n" // "<td><input type=text name=cpc value=\"%s\" size=30>" // "</td></tr>\n",coll); p.safePrintf ( "</table></center><br>\n"); // wrap up the form started by printAdminTop g_pages.printAdminBottom ( &p ); int32_t bufLen = p.length(); return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen); } // if we added a collection, print its page //if ( add && nc[0] && ! 
g_errno ) // return g_parms.sendPageGeneric2 ( s , r , PAGE_SEARCH , // nc , pwd ); if ( g_collectiondb.m_numRecsUsed <= 0 ) goto skip; // print all collections out in a checklist so you can check the // ones you want to delete, the values will be the id of that collectn p.safePrintf ( "<center>\n<table %s>\n" "<tr class=hdrow><td><center><b>Delete Collections" "</b></center></td></tr>\n" "<tr bgcolor=#%s><td>" "<center><b>Select the collections you wish to delete. " //"<font color=red>This feature is currently under " //"development.</font>" "</b></center></td></tr>\n" "<tr bgcolor=#%s><td>" // table within a table "<center><table width=20%%>\n", TABLE_STYLE, LIGHT_BLUE, DARK_BLUE ); for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { CollectionRec *cr = g_collectiondb.m_recs[i]; if ( ! cr ) continue; p.safePrintf ( "<tr bgcolor=#%s><td>" "<input type=checkbox name=delcoll value=\"%s\"> " "%s</td></tr>\n", DARK_BLUE, cr->m_coll,cr->m_coll); } p.safePrintf( "</table></center></td></tr></table><br>\n" ); skip: // wrap up the form started by printAdminTop g_pages.printAdminBottom ( &p ); int32_t bufLen = p.length(); return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen); }
// . sets m_qbuf1[] and m_qbuf2[] // . m_qbuf1[] is the advanced query // . m_qbuf2[] is the query to be used for spell checking // . returns false and set g_errno on error bool SearchInput::setQueryBuffers ( ) { m_sbuf1.reset(); m_sbuf2.reset(); m_sbuf3.reset(); short qcs = csUTF8; if (m_queryCharset && m_queryCharsetLen){ // we need to convert the query string to utf-8 qcs = get_iana_charset(m_queryCharset, m_queryCharsetLen); if (qcs == csUnknown) { //g_errno = EBADCHARSET; //g_msg = "(error: unknown query charset)"; //return false; qcs = csUTF8; } } // prepend sites terms long numSites = 0; char *csStr = NULL; numSites = 0; csStr = get_charset_str(qcs); if ( m_sites && m_sites[0] ) { char *s = m_sites; char *t; long len; m_sbuf1.pushChar('(');//*p++ = '('; loop: // skip white space while ( *s && ! is_alnum_a(*s) ) s++; // bail if done if ( ! *s ) goto done; // get length of it t = s; while ( *t && ! is_wspace_a(*t) ) t++; len = t - s; // add site: term //if ( p + 12 + len >= pend ) goto toobig; if ( numSites > 0 ) m_sbuf1.safeStrcpy ( " UOR " ); m_sbuf1.safeStrcpy ( "site:" ); //p += ucToUtf8(p, pend-p,s, len, csStr, 0,0); m_sbuf1.safeMemcpy ( s , len ); //memcpy ( p , s , len ); p += len; //*p++ = ' '; m_sbuf1.pushChar(' '); s = t; numSites++; goto loop; done: m_sbuf1.safePrintf(") | "); // inc totalLen m_sitesQueryLen = m_sitesLen + (numSites * 10); } // append site: term if ( m_siteLen > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //memcpy ( p , "+site:" , 6 ); p += 6; m_sbuf1.safePrintf("+site:"); //memcpy ( p , m_site , m_siteLen ); p += m_siteLen; m_sbuf1.safeMemcpy(m_site,m_siteLen); } // append gblang: term if( m_gblang > 0 ) { //if( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //p += sprintf( p, "+gblang:%li |", m_gblang ); m_sbuf1.safePrintf( "+gblang:%li |", m_gblang ); } // bookmark here so we can copy into st->m_displayQuery below //long displayQueryOffset = m_sbuf1.length(); // 
append url: term if ( m_urlLen > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //memcpy ( p , "+url:" , 5 ); p += 5; m_sbuf1.safeStrcpy ( "+url:"); //memcpy ( p , m_url , m_urlLen ); p += m_urlLen; m_sbuf1.safeMemcpy ( m_url , m_urlLen ); } // append url: term if ( m_linkLen > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //memcpy ( p , "+link:" , 6 ); p += 6; m_sbuf1.safeStrcpy ( "+link:"); //memcpy ( p , m_link , m_linkLen ); p += m_linkLen; m_sbuf1.safeMemcpy ( m_link , m_linkLen ); } // append the natural query if ( m_queryLen > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //p += ucToUtf8(p, pend-p, m_query, m_queryLen, csStr, 0,0); m_sbuf1.safeMemcpy ( m_query , m_queryLen ); //memcpy ( p , m_query , m_queryLen ); p += m_queryLen; // add to spell checked buf, too //if ( p2 > pstart2 ) *p2++ = ' '; if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); //p2 +=ucToUtf8(p2, pend2-p2, m_query, m_queryLen, csStr, 0,0); m_sbuf2.safeMemcpy ( m_query , m_queryLen ); //memcpy ( p2 , m_query , m_queryLen ); p2 += m_queryLen; } if ( m_query2Len > 0 ) { //if ( p3 > pstart3 ) *p3++ = ' '; if ( m_sbuf3.length() ) m_sbuf3.pushChar(' '); //p3+=ucToUtf8(p3, pend3-p3, m_query2, m_query2Len, csStr,0,0); m_sbuf3.safeMemcpy ( m_query2 , m_query2Len ); } //if (g_errno == EILSEQ){ // illegal character seq // log("query: bad char set"); // g_errno = 0; // if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;} // if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;} //} // append quoted phrases to query if ( m_quoteLen1 > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //*p++ = '+'; //*p++ = '\"'; m_sbuf1.safeStrcpy("+\""); //p += ucToUtf8(p, pend-p, m_quote1, m_quoteLen1, csStr, 0,0); m_sbuf1.safeMemcpy ( m_quote1 , m_quoteLen1 ); //memcpy ( p , m_quote1 , m_quoteLen1 ); p += m_quoteLen1 ; //*p++ = '\"'; m_sbuf1.safeStrcpy("\""); // add to 
spell checked buf, too //if ( p2 > pstart2 ) *p2++ = ' '; if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); //*p2++ = '+'; //*p2++ = '\"'; m_sbuf2.safeStrcpy("+\""); //p2+=ucToUtf8(p2, pend2-p2, m_quote1, m_quoteLen1, csStr,0,0); m_sbuf2.safeMemcpy ( m_quote1 , m_quoteLen1 ); //memcpy ( p2 , m_quote1 , m_quoteLen1 ); p2 += m_quoteLen1 ; //*p2++ = '\"'; m_sbuf2.safeStrcpy("\""); } //if (g_errno == EILSEQ){ // illegal character seq // g_errno = 0; // if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;} // if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;} //} if ( m_quoteLen2 > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //*p++ = '+'; //*p++ = '\"'; m_sbuf1.safeStrcpy("+\""); //p += ucToUtf8(p, pend-p, m_quote2, m_quoteLen2, csStr, 0,0); m_sbuf1.safeMemcpy ( m_quote2 , m_quoteLen2 ); //memcpy ( p , m_quote2 , m_quoteLen2 ); p += m_quoteLen2 ; //*p++ = '\"'; m_sbuf1.safeStrcpy("\""); // add to spell checked buf, too //if ( p2 > pstart2 ) *p2++ = ' '; if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); //*p2++ = '+'; //*p2++ = '\"'; m_sbuf2.safeStrcpy("+\""); //p2+=ucToUtf8(p2, pend2-p2, m_quote2, m_quoteLen2, csStr,0,0); m_sbuf2.safeMemcpy ( m_quote2 , m_quoteLen2 ); //memcpy ( p2 , m_quote2 , m_quoteLen2 ); p2 += m_quoteLen2 ; //*p2++ = '\"'; m_sbuf2.safeStrcpy("\""); } //if (g_errno == EILSEQ){ // illegal character seq // g_errno = 0; // if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;} // if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;} //} // append plus terms if ( m_plusLen > 0 ) { char *s = m_plus, *send = m_plus + m_plusLen; //if ( p > pstart && p < pend ) *p++ = ' '; //if ( p2 > pstart2 && p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); while (s < send) { while (isspace(*s) && s < send) s++; char *s2 = s+1; if (*s == '\"') { // if there's no closing quote just treat // the end of the line as such while (*s2 != '\"' && s2 < send) s2++; if (s2 < send) 
s2++; } else { while (!isspace(*s2) && s2 < send) s2++; } if (s < send) break; //if (p < pend) *p++ = '+'; //if (p2 < pend2) *p2++ = '+'; m_sbuf1.pushChar('+'); m_sbuf2.pushChar('+'); //p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0); //p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0); m_sbuf1.safeMemcpy ( s , s2 - s ); m_sbuf2.safeMemcpy ( s , s2 - s ); /* if (g_errno == EILSEQ) { // illegal character seq g_errno = 0; if (qcs == csUTF8) { qcs = csISOLatin1; goto doOver; } if (qcs != csISOLatin1) { qcs = csUTF8; goto doOver; } } */ s = s2 + 1; if (s < send) { //if (p < pend) *p++ = ' '; //if (p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); } } } // append minus terms if ( m_minusLen > 0 ) { char *s = m_minus, *send = m_minus + m_minusLen; //if ( p > pstart && p < pend ) *p++ = ' '; //if ( p2 > pstart2 && p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); while (s < send) { while (isspace(*s) && s < send) s++; char *s2 = s+1; if (*s == '\"') { // if there's no closing quote just treat // the end of the line as such while (*s2 != '\"' && s2 < send) s2++; if (s2 < send) s2++; } else { while (!isspace(*s2) && s2 < send) s2++; } if (s < send) break; //if (p < pend) *p++ = '-'; //if (p2 < pend2) *p2++ = '-'; m_sbuf1.pushChar('-'); m_sbuf2.pushChar('-'); //p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0); //p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0); m_sbuf1.safeMemcpy ( s , s2 - s ); m_sbuf2.safeMemcpy ( s , s2 - s ); /* if (g_errno == EILSEQ) { // illegal character seq g_errno = 0; if (qcs == csUTF8) { qcs = csISOLatin1; goto doOver; } if (qcs != csISOLatin1) { qcs = csUTF8; goto doOver; } } */ s = s2 + 1; if (s < send) { //if (p < pend) *p++ = ' '; //if (p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); } } } // append gbkeyword:numinlinks if they have 
&mininlinks=X, X>0 long minInlinks = m_hr->getLong("mininlinks",0); if ( minInlinks > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //char *str = "gbkeyword:numinlinks"; //long len = gbstrlen(str); //memcpy ( p , str , len ); //p += len; m_sbuf1.safePrintf ( "gbkeyword:numinlinks"); } // null terms m_sbuf1.pushChar('\0'); m_sbuf2.pushChar('\0'); m_sbuf3.pushChar('\0'); // the natural query m_displayQuery = m_sbuf2.getBufStart();// + displayQueryOffset; if ( ! m_displayQuery ) m_displayQuery = ""; while ( *m_displayQuery == ' ' ) m_displayQuery++; m_displayQueryLen = gbstrlen(m_displayQuery);//p-m_displayQuery //log("query: got query %s",m_sbuf1.getBufStart()); //log("query: got display query %s",m_displayQuery); // urlencoded display query urlEncode(m_qe, MAX_QUERY_LEN*2, m_displayQuery, m_displayQueryLen); return true; }