/*
 * Compose the per-request HTTP header block: Content-Type, Content-Length,
 * Date and Referer, followed by the NL that ends the header section.
 *
 * parmPtr        connection parameters; parmPtr->request owns the buffer
 *                (request->composedHeader) that the result is written into.
 * contentType    value for the Content-Type header.
 * contentLength  value for the Content-Length header.
 *
 * Returns request->composedHeader on success.  Returns NULL when parmPtr or
 * parmPtr->request is NULL, or when the buffer cannot be (re)allocated.
 *
 * Note: This shares a buffer with httpGetStaticRequestHdrs, so the returned
 * value can no longer be used once httpGetStaticRequestHdrs is invoked.
 */
const char *httpGetRequestHdrs(WspHttpParms_t *parmPtr, const char *contentType,
                               const char *contentLength)
{
   int   headerSize = 0;
   char *date       = NULL;
   char *header     = NULL;

   XPTDEBUG((" httpGetRequestHdrs(%lx, %s, %s)\n", (unsigned long) parmPtr, contentType, contentLength));

   if ((parmPtr == NULL) || (parmPtr->request == NULL))
      return NULL;

   /* Only fetch the date once we know there is a request to build. */
   date = getRFC822Date();

   /* Size every header with the same tag name that addTag() writes below. */
   headerSize += getTagLen("Content-Type", contentType);
   headerSize += getTagLen("Content-Length", contentLength);
   headerSize += getTagLen("Date", date);
   headerSize += getTagLen("Referer", parmPtr->request->referer);
   headerSize += xppStrlen(NL);
   headerSize += 1;                     /* For null terminator */

   /*
    * Grow through a temporary so a failed reallocation does not overwrite
    * (and thereby lose) the buffer the request still owns.
    * NOTE(review): assumes xppRealloc leaves the old block valid on failure,
    * as realloc() does - confirm against the xpp wrapper.
    */
   if (parmPtr->request->composedHeader != NULL)
      header = (char *) xppRealloc(parmPtr->request->composedHeader, headerSize);
   else
      header = (char *) xppMalloc(headerSize);

   if (header == NULL)
      return NULL;

   parmPtr->request->composedHeader = header;

   /* Start from an empty string; the tags are appended one after another. */
   xppMemset(header, 0, headerSize);

   addTag(header, "Content-Type", contentType);
   addTag(header, "Content-Length", contentLength);
   addTag(header, "Date", date);
   addTag(header, "Referer", parmPtr->request->referer);
   xppStrcat(header, NL);

   XPTDEBUG((" httpGetRequestHdrs() response = %s\n", parmPtr->request->composedHeader));

   return parmPtr->request->composedHeader;
} /* End httpGetRequestHdrs */
/*
 * Compute the buffer size needed for the static request headers:
 * Cache-Control, Accept, Accept-Charset, From, User-Agent, Accept-Encoding,
 * Accept-Language and host, plus the terminating NL and a null terminator.
 *
 * request  request parameters supplying the header values.
 *
 * Returns the total size in bytes.
 */
int calcStaticHeaderSize(WspHttpRequestParms_t *request)
{
   XPTDEBUG((" calcStaticHeaderSize(%lx)\n", (unsigned long) request));

   /* One row per header, in the order the sizes are accumulated. */
   const struct {
      const char *tag;
      const char *value;
   } headers[] = {
      { "Cache-Control",   D_CACHE_CONTROL         },
      { "Accept",          request->accept         },
      { "Accept-Charset",  request->acceptCharset  },
      { "From",            request->from           },
      { "User-Agent",      D_USER_AGENT            },
      { "Accept-Encoding", request->acceptEncoding },
      { "Accept-Language", request->acceptLanguage },
      { "host",            request->host           },
   };
   const int headerCount = (int) (sizeof headers / sizeof headers[0]);

   int total = 0;
   int idx;

   for (idx = 0; idx < headerCount; idx++)
      total += getTagLen(headers[idx].tag, headers[idx].value);

   total += xppStrlen(NL);   /* line that ends the header section */
   total += 1;               /* For null terminator */

   return total;
} /* End calcStaticHeaderSize() */
// . called by Xml class // . returns the length of the node // . TODO: "node" is now guaranteed to be \0 terminated -- make this faster int32_t XmlNode::set ( char *node , bool pureXml , int32_t version ) { // save head of node m_node = node; // sanity check static bool s_check = false; if ( ! s_check ) { s_check = true; // how many NodeTypes do we have in g_nodes? static int32_t nn = sizeof(g_nodes) / sizeof(NodeType); // set the hash table for ( int32_t i = 0 ; i < nn ; i++ ) { // sanity if ( g_nodes[i].m_nodeId != i ) { char *xx=NULL;*xx=0;} } } // . reset this // . need to do here instead of in Links.cpp because sometimes // we think an anchor tag indicates a link, but it is really // just an <a href="javascript:..."> function call and Links.cpp // ignored it but we are expecting this to be valid! m_isSelfLink = 0; // reset //m_linkNum = -1; // CDATA tag was identified in earlier versions as a text node. Now // it is identified as a CDATA tag node. But gb.conf and others always // pass their version as 0 if ( node[0] == '<' && node[1] == '!' && node[2] == '[' && node[3] == 'C' && node[4] == 'D' && node[5] == 'A' && node[6] == 'T' && node[7] == 'A' && node[8] == '[' ) return setCDATANode ( node ); // if "node" isn't the start of a tag then set it as a Text Node if ( *node != '<' || ! isTagStart ( node ) ) {//, 0, version ) ) { // . set this node as a text node! // . nodeId for text nodes is 0 m_nodeId = 0; m_node = node; m_hasBackTag = false; m_hash = 0; int32_t i = 0; //char inCDATA = 0; // inc i as int32_t as it's NOT the beginning of a tag while ( node[i] && (node[i] != '<' || ! isTagStart ( node+i)))//,versin))) i++; m_nodeLen = i; m_pairTagNum = -1; return m_nodeLen; } // . see if it's a comment (node end is "-->" for comments) // . comments are special cases if ( node[1] == '!' ) { if ( node[2]=='-' && node[3]=='-' ) return setCommentNode ( node ); // this means comment too: // <![if ....]> if ( node[2]=='[' ) return setCommentNode2 ( node ); } // . 
otherwise it's a regular tag // . might be <!DOCTYPE ...> or something though m_nodeLen = getTagLen ( node );//, version ); // . get the node's name's length (i-1) // . node name ends at non alnum char // . we can have hyphens in node name (TODO: even at beginning???) int32_t tagNameStart = 1; // . skip over backslash in the back tags // . or skip over / or ? or ! now // . tag names must start with a letter, fwiw if ( ! is_alnum_a(node[tagNameStart]) /* == '/'*/ ) tagNameStart++; int32_t i = tagNameStart; // skip i to end of tagName. this should only allow ascii chars // to be "tag name chars" for ( ; i < m_nodeLen && is_tagname_char(node[i]) ; i++ ); // set the tagName and tagNameLen m_tagName = &node [ tagNameStart ]; m_tagNameLen = i - tagNameStart; // break point //if ( m_tagNameLen == 3 && m_tagName[0]=='!' && // m_tagName[1]=='-' && m_tagName[2]=='-' ) // fprintf(stderr,"man!"); // . set the node's hash -- used cuz it's faster than strcmp // . just hash the letters as upper case // . tag names are never utf8, so use the ascii ha m_hash = hash64Upper_a ( m_tagName , m_tagNameLen , 0LL); // if we're pure xml, don't allow any html tags accept <!-- --> if ( pureXml ) { m_hasBackTag = true; m_isBreaking = true; m_isVisible = true; //m_nodeId = TAG_XMLTAG;//1; // this returns 1 if tag is not in the list m_nodeId = setNodeInfo ( m_hash );//&m_hasBackTag , } // . determine if the nodeId for this node // . determine if it breaks lines (for phrasing purposes) else m_nodeId = setNodeInfo ( m_hash );//&m_hasBackTag , //&m_isBreaking , &m_isVisible ); // . no back tag if / follow name // . this was only for "pureXml" but now i do it for all tags! if ( m_node [ m_nodeLen - 2 ] == '/' ) m_hasBackTag = false; if ( m_node [ m_nodeLen - 2 ] == '?' ) m_hasBackTag = false; return m_nodeLen; }
bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) { long i = 0; long j; //long k = 0; long wlen; //unsigned long e; //long skip; long badCount = 0; bool hadApostrophe = false; UCScript oldScript = ucScriptCommon; UCScript saved; UCProps props; uptop: // bad utf8 can cause a breach if ( i >= nodeLen ) goto done; if ( ! s[i] ) goto done; if ( ! is_alnum_utf8(s+i) ) { // && m_numWords < m_preCount ) { if ( m_numWords >= m_preCount ) goto done; // tag? if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) { // get the tag id if ( s[i+1]=='/' ) { // skip over / m_tagIds [m_numWords] = ::getTagId(s+i+2); m_tagIds [m_numWords] |= BACKBIT; } else m_tagIds [m_numWords] = ::getTagId(s+i+1); // word start m_words [m_numWords] = s + i; m_wordIds [m_numWords] = 0LL; // skip till end long tagLen = getTagLen(s+i); // ,niceness); m_wordLens [m_numWords] = tagLen; m_numWords++; // advance i += tagLen; goto uptop; } // it is a punct word, find end of it char *start = s+i; //for (;s[i] && ! is_alnum_utf8(s+i);i+=getUtf8CharSize(s+i)); for ( ; s[i] ; i += getUtf8CharSize(s+i)){ // stop on < if we got tags if ( s[i] == '<' && m_hasTags ) break; // breathe QUICKPOLL(niceness); // if we are simple ascii, skip quickly if ( is_ascii(s[i]) ) { // accumulate NON-alnum chars if ( ! is_alnum_a(s[i]) ) continue; // update oldScript = ucScriptCommon; // otherwise, stop we got alnum break; } // if we are utf8 we stop on special props UChar32 c = utf8Decode ( s+i ); // stop if word char if ( ! ucIsWordChar ( c ) ) continue; // update first though oldScript = ucGetScript ( c ); // then stop break; } m_words [ m_numWords ] = start; m_wordLens [ m_numWords ] = s+i - start; m_wordIds [ m_numWords ] = 0LL; if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; goto uptop; } // get an alnum word j = i; again: //for ( ; is_alnum_utf8 (&s[i] ) ; i += getUtf8CharSize(s+i) ); for ( ; s[i] ; i += getUtf8CharSize(s+i) ) { // breathe QUICKPOLL(niceness); // simple ascii? 
if ( is_ascii(s[i]) ) { // accumulate alnum chars if ( is_alnum_a(s[i]) ) continue; // update oldScript = ucScriptCommon; // otherwise, stop we got punct break; } // get the code point of the utf8 char UChar32 c = utf8Decode ( s+i ); // get props props = ucProperties ( c ); // good stuff? if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue; // stop? if UC_WORCHAR is set, that means its an alnum if ( ! ( props & UC_WORDCHAR ) ) { // reset script between words oldScript = ucScriptCommon; break; } // save it saved = oldScript; // update here oldScript = ucGetScript(c); // treat ucScriptLatin (30) as common so we can have latin1 // like char without breaking the word! if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon; // stop on this crap too i guess. like japanes chars? if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) { // include it i += getUtf8CharSize(s+i); // but stop break; } // script change? if ( saved != oldScript ) break; } // . java++, A++, C++ exception // . A+, C+, exception // . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE if ( s[i]=='+' ) { if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2; else if ( !is_alnum_utf8(&s[i+1]) ) i++; } // . c#, j#, ... if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++; // allow for words like we're dave's and i'm if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){ i++; hadApostrophe = true; goto again; } hadApostrophe = false; // get word length wlen = i - j; if ( m_numWords >= m_preCount ) goto done; m_words [ m_numWords ] = &s[j]; m_wordLens[ m_numWords ] = wlen; // . Lars says it's better to leave the accented chars intact // . google agrees // . but what about "re'sume"? if ( computeWordIds ) { long long h = hash64Lower_utf8(&s[j],wlen); m_wordIds [m_numWords] = h; // until we get an accent removal algo, comment this // out and possibly use the query synonym pipeline // to search without accents. 
MDW //long long h2 = hash64AsciiLowerE(&s[j],wlen); //if ( h2 != h ) m_stripWordIds [m_numWords] = h2; //else m_stripWordIds [m_numWords] = 0LL; //m_stripWordIds[m_numWords] = 0; } if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; m_numAlnumWords++; // break on \0 or MAX_WORDS //if ( ! s[i] ) goto done; // get a punct word goto uptop; /* j = i; // delineate the "punctuation" word for ( ; s[i] && !is_alnum_utf8(&s[i]);i+=getUtf8CharSize(s+i)); // bad utf8 could cause us to breach the node, so watch out! if ( i > nodeLen ) { badCount++; i = nodeLen; } // get word length wlen = i - j; if ( m_numWords >= m_preCount ) goto done; m_words [m_numWords ] = &s[j]; m_wordLens [m_numWords ] = wlen; m_wordIds [m_numWords ] = 0LL; if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; */ done: // bad programming warning if ( m_numWords > m_preCount ) { log(LOG_LOGIC, "build: words: set: Fix counting routine."); char *xx = NULL; *xx = 0; } // compute total length if ( m_numWords <= 0 ) m_totalLen = 0; else m_totalLen = m_words[m_numWords-1] - s + m_wordLens[m_numWords-1]; if ( badCount ) log("words: had %li bad utf8 chars",badCount); return true; }
bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds ) { int32_t i = 0; int32_t j; int32_t wlen; bool hadApostrophe = false; UCScript oldScript = ucScriptCommon; UCScript saved; UCProps props; uptop: // bad utf8 can cause a breach if ( i >= nodeLen ) { goto done; } if ( ! s[i] ) { goto done; } if ( !is_alnum_utf8( s + i ) ) { if ( m_numWords >= m_preCount ) { goto done; } // tag? if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) { // get the tag id if( m_tagIds ) { if ( s[i + 1] == '/' ) { // skip over / m_tagIds[m_numWords] = ::getTagId( s + i + 2 ); m_tagIds[m_numWords] |= BACKBIT; } else { m_tagIds[m_numWords] = ::getTagId( s + i + 1 ); } } m_words[m_numWords] = s + i; m_wordIds[m_numWords] = 0LL; // skip till end int32_t tagLen = getTagLen( s + i ); m_wordLens[m_numWords] = tagLen; m_nodes[m_numWords] = 0; m_numWords++; // advance i += tagLen; goto uptop; } // it is a punct word, find end of it char *start = s+i; for ( ; s[i] ; i += getUtf8CharSize(s+i)) { // stop on < if we got tags if ( s[i] == '<' && m_hasTags ) { break; } // if we are simple ascii, skip quickly if ( is_ascii(s[i]) ) { // accumulate NON-alnum chars if ( ! is_alnum_a(s[i]) ) { continue; } // update oldScript = ucScriptCommon; // otherwise, stop we got alnum break; } // if we are utf8 we stop on special props UChar32 c = utf8Decode ( s+i ); // stop if word char if ( ! ucIsWordChar ( c ) ) { continue; } // update first though oldScript = ucGetScript ( c ); // then stop break; } m_words [ m_numWords ] = start; m_wordLens [ m_numWords ] = s+i - start; m_wordIds [ m_numWords ] = 0LL; m_nodes [ m_numWords ] = 0; if (m_tagIds) { m_tagIds[m_numWords] = 0; } m_numWords++; goto uptop; } // get an alnum word j = i; again: for ( ; s[i] ; i += getUtf8CharSize(s+i) ) { // simple ascii? 
if ( is_ascii(s[i]) ) { // accumulate alnum chars if ( is_alnum_a(s[i]) ) continue; // update oldScript = ucScriptCommon; // otherwise, stop we got punct break; } // get the code point of the utf8 char UChar32 c = utf8Decode ( s+i ); // get props props = ucProperties ( c ); // good stuff? if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue; // stop? if UC_WORCHAR is set, that means its an alnum if ( ! ( props & UC_WORDCHAR ) ) { // reset script between words oldScript = ucScriptCommon; break; } // save it saved = oldScript; // update here oldScript = ucGetScript(c); // treat ucScriptLatin (30) as common so we can have latin1 // like char without breaking the word! if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon; // stop on this crap too i guess. like japanes chars? if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) { // include it i += getUtf8CharSize(s+i); // but stop break; } // script change? if ( saved != oldScript ) break; } // . java++, A++, C++ exception // . A+, C+, exception // . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE if ( s[i]=='+' ) { if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2; else if ( !is_alnum_utf8(&s[i+1]) ) i++; } // . c#, j#, ... if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++; // comma is ok if like ,ddd!d if ( s[i]==',' && i-j <= 3 && is_digit(s[i-1]) ) { // if word so far is 2 or 3 chars, make sure digits if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo; if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo; // scan forward while ( s[i] == ',' && is_digit(s[i+1]) && is_digit(s[i+2]) && is_digit(s[i+3]) && ! is_digit(s[i+4]) ) { i += 4; } } // decimal point? if ( s[i] == '.' 
&& is_digit(s[i-1]) && is_digit(s[i+1]) ) { // allow the decimal point i++; // skip over string of digits while ( is_digit(s[i]) ) i++; } nogo: // allow for words like we're dave's and i'm if ( s[i] == '\'' && s[i + 1] && is_alnum_utf8( &s[i + 1] ) && !hadApostrophe ) { i++; hadApostrophe = true; goto again; } hadApostrophe = false; // get word length wlen = i - j; if ( m_numWords >= m_preCount ) goto done; m_words [ m_numWords ] = &s[j]; m_wordLens[ m_numWords ] = wlen; if ( computeWordIds ) { int64_t h = hash64Lower_utf8(&s[j],wlen); m_wordIds [m_numWords] = h; } m_nodes[m_numWords] = 0; if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; m_numAlnumWords++; // get a punct word goto uptop; done: // bad programming warning if ( m_numWords > m_preCount ) { log(LOG_LOGIC, "build: words: set: Fix counting routine."); gbshutdownLogicError(); } return true; }