// Run recognition on a single sentence and print every recognized word on
// its own line.
void test_sentence(PersonRecog &pr, const char *sentence) {
    Words words;
    pr.recog(sentence, words);
    for (int idx = 0; idx < words.size(); idx += 1) {
        printf("%s\n", words[idx].c_str());
    }
}
// Print up to `show` guess suggestions for the current candidate set,
// best first.  Every key word is scored by the entropy of the split it
// induces over active_words; storing -entropy makes an ascending sort
// yield a best-first ordering.
// NOTE(review): reserve() uses active_words.size() although keys_.size()
// entries are pushed — harmless (capacity hint only), but confirm intent.
void Dictionary::printInteractiveSuggestions(Words const& active_words, unsigned const show) const {
	assert(!active_words.empty());
	assert(show > 0);
	vector<pair<double, Word>> suggestions;
	suggestions.reserve(active_words.size());
	puts("Suggestions:");
	{
		// Score every key word by the quality of the split it produces.
		DividedWordList dl(wordLength());
		auto it = keys_.cbegin();
		do {
			dl.build(*this, active_words, *it);
			// Negated so ascending sort puts highest entropy first.
			suggestions.push_back(make_pair(-dl.entropy(), *it));
		} while (++it != keys_.cend());
	}
	sort(suggestions.begin(), suggestions.end());
	{
		// Print the top `show` entries, numbered from 1.
		auto it = suggestions.cbegin();
		unsigned i = 0;
		do {
			printf("\t%2u: ", ++i);
			describeWord(stdout, it->second);
			putchar('\n');
		} while (++it != suggestions.cend() && i < show);
	}
	putchar('\n');
}
// When player has entered something, it is parsed elsewhere uint8 AgiEngine::testSaid(uint8 nwords, uint8 *cc) { AgiGame *state = &_game; AgiEngine *vm = state->_vm; Words *words = vm->_words; int c, n = words->getEgoWordCount(); int z = 0; if (vm->getFlag(VM_FLAG_SAID_ACCEPTED_INPUT) || !vm->getFlag(VM_FLAG_ENTERED_CLI)) return false; // FR: // I think the reason for the code below is to add some speed.... // // if (nwords != num_ego_words) // return false; // // In the disco scene in Larry 1 when you type "examine blonde", // inside the logic is expected ( said("examine", "blonde", "rol") ) // where word("rol") = 9999 // // According to the interpreter code 9999 means that whatever the // user typed should be correct, but it looks like code 9999 means that // if the string is empty at this point, the entry is also correct... // // With the removal of this code, the behavior of the scene was // corrected for (c = 0; nwords && n; c++, nwords--, n--) { z = READ_LE_UINT16(cc); cc += 2; switch (z) { case 9999: // rest of line (empty string counts to...) nwords = 1; break; case 1: // any word break; default: if (words->getEgoWordId(c) != z) return false; break; } } // The entry string should be entirely parsed, or last word = 9999 if (n && z != 9999) return false; // The interpreter string shouldn't be entirely parsed, but next // word must be 9999. if (nwords != 0 && READ_LE_UINT16(cc) != 9999) return false; setFlag(VM_FLAG_SAID_ACCEPTED_INPUT, true); return true; }
// Split `string` on spaces and tabs, appending each token to `words`.
// Runs of consecutive separators never produce empty tokens.
void tokenize(const char* string, Words& words) {
    const char* cursor = string;
    for (;;) {
        // Skip any run of separators preceding the next token.
        while (*cursor == ' ' || *cursor == '\t')
            ++cursor;
        if (*cursor == '\0')
            return;
        // Scan to the end of the token.
        const char* tokenStart = cursor;
        while (*cursor != '\0' && *cursor != ' ' && *cursor != '\t')
            ++cursor;
        words.push_back(std::string(tokenStart, cursor - tokenStart));
        if (*cursor == '\0')
            return;
        // Step past the separator and look for the next token.
        ++cursor;
    }
}
// Add a new match group for the text in s/slen: tokenize it into the next
// free Words/Bits/Pos slot and collect any query matches via the other
// addMatches() overload.  Returns true on success (or when the group limit
// is already reached); false on a set/alloc failure.  Groups producing no
// matches are reset and discarded so the slot can be reused.
bool Matches::addMatches( char *s, int32_t slen, mf_t flags ) {
	// . do not breach
	// . happens a lot with a lot of link info text
	if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
		return true;
	}
	// get some new ptrs for this match group
	Words *wp = &m_wordsArray [ m_numMatchGroups ];
	Bits *bp = &m_bitsArray [ m_numMatchGroups ];
	Pos *pb = &m_posArray [ m_numMatchGroups ];
	// set the words class for this match group
	if ( !wp->set( s, slen, true ) ) {
		return false;
	}
	// bits vector
	if ( ! bp->setForSummary ( wp ) ) {
		return false;
	}
	// position vector
	if ( ! pb->set ( wp ) ) {
		return false;
	}
	// record the start
	int32_t startNumMatches = m_numMatches;
	// sometimes it returns true w/o incrementing this
	int32_t n = m_numMatchGroups;
	// . add all the Match classes from this match group
	// . this increments m_numMatchGroups on success
	bool status = addMatches( wp, NULL, NULL, bp, pb, flags );
	// if this matchgroup had some, matches, then keep it
	if ( m_numMatches > startNumMatches ) {
		return status;
	}
	// otherwise, reset it, useless
	wp->reset();
	bp->reset();
	pb->reset();
	// do not decrement the counter if we never incremented it
	if ( n == m_numMatchGroups ) {
		return status;
	}
	// ok, remove it
	m_numMatchGroups--;
	return status;
}
int main( int argc, char * argv[] ) { char from[10] = "aablls?"; if( argc >= 2 ) { std::string arg(argv[1]); std::sort( arg.begin(), arg.end() ); strcpy( from, arg.c_str() ); } if( argc >= 3 ) { min_length = atoi( argv[2] ); } logf( 1, CLEAR "Score %s -> %i\n", from, ScoreString( from ) ); const int MAX_LEN = 32; int lengths[MAX_LEN] = {0}; if( FILE * fp = fopen( "enable1.txt", "rt" ) ) { double start = PerfTime(); while( !feof( fp ) ) { char buf[MAX_LEN]; char * got = fgets( buf, 63, fp ); if( got ) { char *end = buf + strlen(buf) -1; while( end > buf && !isalpha( *end ) ) { *end = 0; end -= 1; } int l = strlen(buf); lengths[l] += 1; if( l <= 15 ) { words.push_back(std::string(buf)); } } } double loaded = PerfTime(); double runTime = PerfTime(); Words gathered = GatherAngrams( from ); std::sort( gathered.begin(), gathered.end(), XScoresLessThanY ); for( auto s : gathered ) { logf( 1, CLEAR "%s - %i\n", s.c_str(), ScoreString( s ) ); } double done = PerfTime(); logf( 1, CLEAR "Timing (%f s) Loading\n", loaded-start ); //logf( 1, CLEAR "Timing (%f s) Preparing\n", prepared-prepping); logf( 1, CLEAR "Timing (%f s) running\n", done-runTime ); } return 0; }
// Tokenize a UTF-8 document: parse the (x)html with Xml, extract its text
// into a fixed 64KB scratch buffer, and hand the text to Words.
// NOTE(review): the 64KB cap silently truncates larger documents and the
// malloc result is unchecked -- presumably acceptable for this test tool.
void parse_doc_icu(char *s, int len, bool doHash, char *charset){
	Xml xml;
	xml.set( s, len, TITLEREC_CURRENT_VERSION, 0, CT_HTML );
	// Extract text from (x)html
	char *text_buf = (char*)malloc(64*1024);
	int32_t textLen = xml.getText( text_buf, 64 * 1024, 0, 99999999, doFilterSpaces );
	// Tokenize only the extracted text length, not the raw input length.
	Words w;
	w.set(text_buf, textLen, doHash);
	free(text_buf);
}
void parse_doc_8859_1(char *s, int len, bool doHash,char *charset) { Xml xml; xml.set( s, len, TITLEREC_CURRENT_VERSION, 0, CT_HTML ); // Extract text from (x)html char *text_buf = (char*)malloc(len+1); xml.getText( text_buf, len, 0, 99999999, doFilterSpaces ); Words words; // just tokenize words words.set(text_buf, len, doHash); free(text_buf); }
// Rebuild this list: clear every bucket, then place each active word into
// the bucket indexed by how many of its letters match `guess` exactly.
void DividedWordList::build(Dictionary const& dictionary, Words const& ws, Word const guess) {
	assert(size() == dictionary.wordLength() + 1);
	assert(!ws.empty());
	// Empty all buckets before redistributing.
	for (auto& bucket : *this)
		bucket.clear();
	// Bucket index = number of correctly-placed letters shared with guess.
	for (auto wit = ws.cbegin(); wit != ws.cend(); ++wit) {
		unsigned const matched = correct_letters(*wit, guess);
		assert(matched <= dictionary.wordLength());
		(*this)[matched].update(dictionary, *wit);
	}
}
void RconConnection::EnableEvents(void) { Words command; command.clear(); command.push_back("admin.eventsEnabled"); command.push_back("true"); if(sendRequest(command)) throw string("sendRequest failed :: eventsEnabled"); TextRconPacket response = getResponse(); if(!response.m_isResponse || response.m_data[0] != "OK") throw string("eventsEnabled failed"); }
void LoadDic() { const int MAX_LEN = 32; int lengths[MAX_LEN] = {0}; if( FILE * fp = fopen( "enable1.txt", "rt" ) ) { //double start = PerfTime(); while( !feof( fp ) ) { char buf[MAX_LEN]; char * got = fgets( buf, 63, fp ); if( got ) { char *end = buf + strlen(buf) -1; while( end > buf && !isalpha( *end ) ) { *end = 0; end -= 1; } int l = strlen(buf); lengths[l] += 1; if( l <= 15 ) { words.push_back(std::string(buf)); wordSet.insert(std::string(buf)); } } } fclose( fp ); //double loaded = PerfTime(); } }
// Handle the reply to a version query: expect {"OK", gameName, buildId}.
virtual void onServerResponse(const Words& words) {
	const bool wellFormed = (words.size() == 3) && (words[0] == "OK");
	if (wellFormed)
		printf("Server version: Game %s, build ID %s\n", words[1].c_str(), words[2].c_str());
	else
		printf("Invalid response to version query\n");
}
// Returns true when words[i] begins a phrase crediting a submission
// (e.g. "From", "submitted by", "sent in by").  For multi-word phrases,
// `i` is advanced past the extra words consumed.  Always requires at
// least one word to follow the phrase.
bool submissionsSequence(const Words& words, unsigned int& i) {
	if (i + 1 >= words.size())
		return false;
	// One-word markers.
	if (words[i] == "From" || words[i] == "from" ||
	    words[i] == "From:" || words[i] == "from:" ||
	    words[i] == "Merged" || words[i] == "Integrated")
		return true;
	if (i + 2 >= words.size())
		return false;
	// Two-word markers: consume one extra word.
	if ((words[i] == "submitted" && words[i + 1] == "by") ||
	    (words[i] == "Folded" && words[i + 1] == "in") ||
	    (words[i] == "Rolled" && words[i + 1] == "in") ||
	    (words[i] == "Checked" && words[i + 1] == "in")) {
		i += 1;
		return true;
	}
	if (i + 3 >= words.size())
		return false;
	// Three-word marker: consume two extra words.
	if (words[i] == "sent" && words[i + 1] == "in" && words[i + 2] == "by") {
		i += 2;
		return true;
	}
	return false;
}
// Perform the two-step hashed-password login handshake:
//  1. send "login.hashed" with no argument; the server answers OK plus a
//     hex-encoded salt;
//  2. decode the salt to raw bytes, append the plaintext password, MD5 the
//     result, uppercase the hex digest, and send it back as
//     "login.hashed <hash>".
// Throws a string describing the failure at any step.
void RconConnection::Login(void) {
	Words loginCommand;
	loginCommand.push_back("login.hashed");
	if(sendRequest(loginCommand))
		throw string("Login failed");
	TextRconPacket response = getResponse();
	if(response.m_isResponse && response.m_data[0] == "OK") {
		string salt = response.m_data[1];
		const char* hex_str = salt.c_str();
		string hash, saltHex;
		uint32_t ch;
		// Decode the salt two hex digits at a time into raw bytes.
		for( ; sscanf( hex_str, "%2x", &ch) == 1 ; hex_str += 2)
			hash += ch;
		saltHex = hash;
		// Hash input is the raw salt bytes followed by the password.
		hash.append( this->password );
		hash = MD5String( (char*)hash.c_str() );
		// The server compares against an uppercase hex digest.
		boost::to_upper(hash);
		loginCommand.clear();
		loginCommand.push_back("login.hashed");
		loginCommand.push_back(hash);
		if(sendRequest(loginCommand))
			throw string("sendRequest failed :: Login");
		response = getResponse();
		if(response.m_isResponse && response.m_data[0] == "InvalidPasswordHash")
			throw string("Login failed :: InvalidPasswordHash (Salt: " + salt + " | SaltHex: "+ saltHex +" | Hash: " + hash + ")");
	}
	else
		throw string("Login failed");
}
// Run person recognition over every line of `file`, printing each
// recognized word on its own line.  Reports an error if the file cannot
// be opened.
void test_file(PersonRecog &pr, const char *file) {
    ifstream fi(file);
    string line;
    Words words;
    if (!fi) {
        printf("can not open file: %s\n", file);
        return;
    }
    while (getline(fi, line)) {
        pr.recog(line.c_str(), words);
        for (int k = 0; k < words.size(); ++k)
            printf("%s\n", words[k].c_str());
    }
    fi.close();
}
// Tokenize a UTF-8 document (older Xml/Words API variant): parse the
// html, extract up to 64KB of text, and hand the text to Words.
// NOTE(review): malloc is unchecked and documents longer than 64KB are
// truncated -- presumably acceptable for this parsing test harness.
void parse_doc_icu(char *s, int len, bool doHash, char *charset){
	Xml xml;
	xml.set(csUTF8,s,len,false, 0,false, TITLEREC_CURRENT_VERSION);
	//fprintf(stderr,"\nparse_doc_icu\n");
	// Extract text from (x)html
	char *text_buf = (char*)malloc(64*1024);
	long textLen = xml.getText(text_buf, 64*1024, 0, 99999999, false,
				   true, false, doFilterSpaces, false);
	// Tokenize only the extracted text length, not the raw input length.
	Words w;
	w.set(true,false, text_buf, textLen, TITLEREC_CURRENT_VERSION,doHash);
	free(text_buf);
}
void Output::print(const Words& message, bool useComma, bool useAnd) { if (!_streams[_curStream].print()) return; for (int i = 0; i < message.length(); ++i) { stream() << message[i]; if (useComma) { if ((message.length() > 2) && (i != message.length() - 1)) stream() << ","; if ((i == message.length() - 2) && (useAnd)) stream() << " and"; } if (i != message.length() - 1) stream() << " "; } stream() << flush; }
// Collect every loaded dictionary word that can be formed from the rack
// `from`, where '?' is a wildcard standing for any single letter.  Words
// shorter than min_length are skipped.  Assumes rack and dictionary words
// are lowercase a-z (plus '?'): indexing is *s - 'a'.
// NOTE(review): strlen(s) < min_length compares size_t to min_length --
// confirm min_length is never negative.
Words GatherAngrams( const char * from ) {
	int wild = 0;
	// Per-letter counts and a 26-bit presence mask for the rack.
	int find_freq[26] = {0};
	uint32_t find_have = 0;
	for( const char *s = from; *s; ++s ) {
		if( *s == '?' ) {
			wild += 1;
		} else {
			const int ord = *s - 'a';
			find_freq[ord] += 1;
			find_have |= (1<<ord);
		}
	}
	// Mask of letters NOT present in the rack.
	uint32_t find_dont = ~find_have;
	Words matchList;
	const int count = words.size();
	for( int i = 0; i < count; ++i ) {
		const char *s = words[i].c_str();
		if( strlen( s ) < min_length ) continue;
		// Per-letter counts / presence mask for the candidate word.
		int freq[26] = {0};
		uint32_t have = 0;
		for( ; *s; ++s ) {
			const int ord = *s - 'a';
			freq[ord] += 1;
			have |= (1<<ord);
		}
		// Without wildcards, any letter outside the rack disqualifies
		// the word immediately.  (& binds tighter than &&.)
		if( wild == 0 && find_dont & have ) continue;
		// Count how many letters the word needs beyond what the rack
		// supplies; wildcards must cover the shortfall.
		int over = 0;
		for( int c = 0; c < 26; ++c ) {
			if( freq[c] > find_freq[c] ) over += freq[c] - find_freq[c];
		}
		if( over <= wild ) {
			logf( 1, RED "From (%s) -> (%s)\n", from, words[i].c_str() );
			matchList.push_back( words[i] );
		}
	}
	return matchList;
}
// Test helper: drive the full summary-generation pipeline over htmlInput
// and leave the result in `summary`.  Mirrors the production setup order:
// xml -> words -> bits -> sections -> query -> title -> pos ->
// summary-bits -> phrases -> matches -> summary.  ASSERT_TRUE aborts the
// test on any stage that fails to initialize.
static void generateSummary( Summary &summary, char *htmlInput, const char *queryStr, const char *urlStr ) {
	Xml xml;
	ASSERT_TRUE(xml.set(htmlInput, strlen(htmlInput), 0, CT_HTML));
	Words words;
	ASSERT_TRUE(words.set(&xml, true));
	Bits bits;
	ASSERT_TRUE(bits.set(&words));
	Url url;
	url.set(urlStr);
	Sections sections;
	ASSERT_TRUE(sections.set(&words, &bits, &url, "", CT_HTML));
	Query query;
	ASSERT_TRUE(query.set2(queryStr, langEnglish, true));
	// LinkInfo must be zeroed; only its size field is populated here.
	LinkInfo linkInfo;
	memset ( &linkInfo , 0 , sizeof(LinkInfo) );
	linkInfo.m_lisize = sizeof(LinkInfo);
	// Title capped at 80 characters.
	Title title;
	ASSERT_TRUE(title.setTitle(&xml, &words, 80, &query, &linkInfo, &url, NULL, 0, CT_HTML, langEnglish));
	Pos pos;
	ASSERT_TRUE(pos.set(&words));
	Bits bitsForSummary;
	ASSERT_TRUE(bitsForSummary.setForSummary(&words));
	Phrases phrases;
	ASSERT_TRUE(phrases.set(&words, &bits));
	Matches matches;
	matches.setQuery(&query);
	ASSERT_TRUE(matches.set(&words, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo));
	// 180-char summary, at most 3 excerpts of at most 3 sentences each.
	summary.setSummary(&xml, &words, &sections, &pos, &query, 180, 3, 3, 180, &url, &matches, title.getTitle(), title.getTitleLen());
}
// Parse an input line of exactly AMOUNT_ARGUMENTS numeric fields into the
// solver's data: two points (x, y each) followed by the circle radius.
// Throws std::invalid_argument when the field count is wrong.
SDataForSolver CSolver::ExtractData(const std::string & inputString) {
	Words tokens = SplitWords(inputString);
	// Reject malformed input before touching any field.
	if (tokens.size() != AMOUNT_ARGUMENTS) {
		throw std::invalid_argument(MESSAGE_INCORRECT_AMOUNT_ARGUMENTS);
	}
	SDataForSolver result;
	result.firstPoint.x = stof(tokens[0]);
	result.firstPoint.y = stof(tokens[1]);
	result.secondPoint.x = stof(tokens[2]);
	result.secondPoint.y = stof(tokens[3]);
	result.radiusCircle = stof(tokens[4]);
	return result;
}
// Depth-first construction of the complete guessing strategy for
// active_words.  A leaf (single remaining word) is reported via
// acceptor->acceptSolution.  An interior node picks the key word whose
// split of the candidates has the highest entropy, reports it via
// acceptBranch, and recurses into every non-empty bucket with that
// bucket's answer index pushed onto *runningAnswersp.
void Dictionary::recursiveSolve(AbstractSolutionAcceptor * const acceptor, Words const& active_words, AnswerSequence * const runningAnswersp) const {
	assert(!active_words.empty());
	if (active_words.size() == 1) {
		// We found an end node
		acceptor->acceptSolution(*runningAnswersp, active_words.front());
	} else {
		DividedWordList dl(wordLength());
		double best_entropy = -1.0;
		Word best_word = 0;
		// Of all of the words in active_words, find the one that
		// will gives us the most even split (and thus the highest entropy)
		{
			auto it = keys_.cbegin();
			do {
				dl.build(*this, active_words, *it);
				double const entropy = dl.entropy();
				if (entropy > best_entropy) {
					best_entropy = entropy;
					best_word = *it;
				}
			} while (++it != keys_.cend());
		}
		assert(popcount32(best_word) == wordLength());
		// Now that we have found our best word, recompute the split
		dl.build(*this, active_words, best_word);
		// NOTE(review): exact float equality holds only because the
		// recomputation repeats identical operations on identical input.
		assert(dl.entropy() == best_entropy);
		acceptor->acceptBranch(*runningAnswersp, best_word, best_entropy, dl);
		// Now recursively descend
		for (unsigned i = 0; i < wordLength() + 1; i++) {
			const WordsWithTotalChoices& wwtc = dl[i];
			if (wwtc.count() > 0) {
				// Record which answer selects this bucket, recurse,
				// then backtrack.
				runningAnswersp->push_back(i);
				recursiveSolve(acceptor, wwtc.choices(), runningAnswersp);
				runningAnswersp->pop_back();
			}
		}
	}
}
void parse_doc_8859_1(char *s, int len, bool doHash,char *charset) { Xml xml; xml.set(csASCII,s,len,false, 0, false, TITLEREC_CURRENT_VERSION); //fprintf(stderr,"\nparse_doc_8859_1\n"); // Extract text from (x)html char *text_buf = (char*)malloc(len+1); xml.getText(text_buf, len, 0, 99999999, false, true, false, doFilterSpaces, false); Words words; // just tokenize words words.set(false, text_buf, TITEREC_CURRENT_VERSION, doHash); free(text_buf); }
bool CheckWord( const char *word ) { #if 1 return wordSet.count( std::string( word ) ) == 1; #else const int count = words.size(); for( int i = 0; i < count; ++i ) { const char *s = words[i].c_str(); if( strcmp( s, word ) == 0 ) { return true; } } return false; #endif }
// Handle the reply to a login attempt: a single-token response of "OK"
// or "InvalidPassword"; anything else is malformed.
virtual void onServerResponse(const Words& words) {
	if (words.size() != 1) {
		printf("Invalid response to login query\n");
		return;
	}
	if (words[0] == "OK")
		printf("Logged in successfully\n");
	else if (words[0] == "InvalidPassword")
		printf("Invalid password\n");
	else
		printf("Invalid response to login query\n");
}
// Interactive solver loop.  Reports how many candidates remain (listing
// them when five or fewer), then repeatedly prints suggestions and asks
// the player for a "<word> <result>" move, recursing into the candidate
// bucket that move selects.
void Dictionary::recursiveInteractive(Words const& active_words) const {
	if (active_words.empty()) {
		puts("No choices remain.\n");
	} else if (active_words.size() == 1) {
		fputs("Only choice remaining is: ", stdout);
		describeWord(stdout, active_words.front());
		puts("\n");
	} else {
		printf("%u choices remain", static_cast<unsigned>(active_words.size()));
		if (active_words.size() <= 5) {
			// If we're near the end, print a list
			fputs(": ", stdout);
			auto it = active_words.cbegin();
			describeWord(stdout, *it);
			// BUGFIX: advance before printing.  The old do/while
			// printed *it before ++it, so the first word appeared
			// twice and the last word was never printed.
			while (++it != active_words.cend()) {
				fputs(", ", stdout);
				describeWord(stdout, *it);
			}
			putchar('\n');
		} else {
			puts(".");
		}
		putchar('\n');
		string cmd;
		for (;;) {
			printInteractiveSuggestions(active_words);
			fputs("Give your move in form \"<word> <result>\" like \"TAXI 2\"\n\n"
			      "> ", stdout);
			fflush(stdout);
			if (!read_uppercase_word(stdin, &cmd))
				break; // EOF
			Word tried;
			unsigned matched;
			if (parse_move(cmd, wordLength(), &tried, &matched)) {
				// Split on the tried word and descend into the
				// bucket matching the reported result.
				DividedWordList dl(wordLength());
				dl.build(*this, active_words, tried);
				recursiveInteractive(dl[matched].choices());
				break;
			}
			puts("Invalid move.\n");
		}
	}
}
// Populate `words` and `labels` from a set of RTABMap databases.
// Per database: read its signatures, then its labels (label reading needs
// the accumulated signatures).  Afterwards build the global word index
// from all signatures, cluster the word descriptors, and move everything
// into the output containers.  Always returns true.
bool RTABMapDBAdapter::readData(const std::vector<std::string> &dbPaths, Words &words, Labels &labels) {
	// Read data from databases
	std::map<int, std::map<int, std::unique_ptr<rtabmap::Signature>>> allSignatures;
	std::list<std::unique_ptr<Label>> allLabels;
	// Databases are identified by their position in dbPaths.
	int dbId = 0;
	for (const auto &dbPath : dbPaths) {
		auto dbSignatures = readSignatures(dbPath);
		allSignatures.emplace(dbId, std::move(dbSignatures));
		auto dbLabels = readLabels(dbPath, dbId, allSignatures);
		std::move(dbLabels.begin(), dbLabels.end(), std::back_inserter(allLabels));
		dbId++;
	}
	std::cout << "Building Index for Words" << std::endl;
	std::list<std::unique_ptr<Word>> allWords = createWords(allSignatures);
	std::cout << "Total Number of words: " << allWords.size() << std::endl;
	// Tally descriptor rows across all databases (diagnostic output).
	long count = 0;
	for (const auto &word : allWords) {
		for (const auto &desc : word->getDescriptorsByDb()) {
			count += desc.second.rows;
		}
	}
	std::cout << "Total Number of descriptors: " << count << std::endl;
	allWords = clusterPointsInWords(allWords);
	// Re-tally after clustering to report the reduced point count.
	count = 0;
	for (const auto &word : allWords) {
		for (const auto &desc : word->getDescriptorsByDb()) {
			count += desc.second.rows;
		}
	}
	std::cout << "Total Number of points: " << count << std::endl;
	words.putWords(std::move(allWords));
	std::cout << "Building Index for Labels" << std::endl;
	labels.putLabels(std::move(allLabels));
	return true;
}
// returns false if blocked, true otherwise
// Re-entrant page-serving state machine for the cached-page ("get") view.
// Loads the doc's title rec, then renders the cached page (optionally with
// disclaimer header, query-term highlighting, and XML/JSON wrappers) and
// sends it over the state's TCP socket.  Any call that would block
// registers itself as the XmlDoc callback and returns false; it is invoked
// again when the data is ready.  On completion (or error reply) the State2
// is destroyed.
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;
	if ( ! xd->m_loaded ) {
		// setting just the docid. niceness is 0.
		//xd->set3 ( st->m_docId , st->m_coll , 0 );
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}
	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// now force it to load old title rec
	//char **tr = xd->getTitleRec();
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 ) return sendErrorReply ( st , ENOTFOUND);
	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isAdmin && *na ) return sendErrorReply ( st , ENOCACHE );
	SafeBuf *sb = &st->m_sb;
	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}
	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";
	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    //buf,bufLen,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    //"text/html",
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}
	/*
	// this was calling XmlDoc and setting sections, etc. to
	// get the SpiderReply junk... no no no
	// is it banned or filtered? this ignores the TagRec in the titleRec
	// and uses msg8a to get it fresh instead
	char *vi = xd->getIsFiltered();//Visible( );
	// wait if blocked
	if ( vi == (void *)-1 ) return false;
	// error?
	if ( ! vi ) return sendErrorReply ( st , g_errno );
	// banned?
	if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
	*/
	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	//long len = xd->size_utf8Content - 1;
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );
	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %li is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}
	char *content = xd->ptr_utf8Content;
	long contentLen = xd->size_utf8Content - 1;
	// shortcut
	char strip = st->m_strip;
	// alloc buffer now
	//char *buf = NULL;
	//long bufMaxSize = 0;
	//bufMaxSize = len + ( 32 * 1024 ) ;
	//bufMaxSize = contentLen + ( 32 * 1024 ) ;
	//buf = (char *)mmalloc ( bufMaxSize , "PageGet2" );
	//char *p = buf;
	//char *bufEnd = buf + bufMaxSize;
	//if ( ! buf ) {
	//	return sendErrorReply ( st , g_errno );
	//}
	// for undoing the header
	//char *start1 = p;
	long startLen1 = sb->length();
	// we are always utfu
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
				"content=\"text/html;charset=utf8\">\n");
	// base href
	//Url *base = &xd->m_firstUrl;
	//if ( xd->ptr_redirUrl.m_url[0] )
	//	base = &xd->m_redirUrl;
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	//Url *redir = *xd->getRedirUrl();
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
		//p += gbstrlen ( p );
	}
	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
				"body{background-color:white;color:black;}\n"
				"</style>\n");
		//p += gbstrlen ( p );
	}
	//char format = st->m_format;
	if ( format == FORMAT_XML ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();
	// for undoing the stuff below
	long startLen2 = sb->length();//p;
	// query should be NULL terminated
	char *q = st->m_q;
	long qlen = st->m_qlen;
	char styleTitle[128] = "font-size:14px;font-weight:600;"
			       "color:#000000;";
	char styleText[128] = "font-size:14px;font-weight:400;"
			      "color:#000000;";
	char styleLink[128] = "font-size:14px;font-weight:400;"
			      "color:#0000ff;";
	char styleTell[128] = "font-size:14px;font-weight:600;"
			      "color:#cc0000;";
	// get the url of the title rec
	Url *f = xd->getFirstUrl();
	bool printDisclaimer = st->m_printDisclaimer;
	if ( xd->m_contentType == CT_JSON ) printDisclaimer = false;
	if ( format == FORMAT_XML ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;
	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;
	// Format the spider time once; reused by the disclaimer and the
	// XML/JSON metadata below.
	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}
	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	// CNS: if ( ! st->m_clickNScroll ) {
	if ( printDisclaimer ) {
		sb->safePrintf(//sprintf ( p ,
			       //"<BASE HREF=\"%s\">"
			       //"<table border=1 width=100%%>"
			       //"<tr><td>"
			       "<table border=\"1\" bgcolor=\"#" BGCOLOR "\" cellpadding=\"10\" "
			       //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\""
			       "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			       "<tr"
			       //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\""
			       "><td>"
			       //"<font face=times,sans-serif color=black size=-1>"
			       "<span style=\"%s\">"
			       "This is Gigablast's cached page of </span>"
			       "<a href=\"%s\" style=\"%s\">%s</a>"
			       "" ,
			       styleTitle, f->getUrl(), styleLink, f->getUrl() );
		//p += gbstrlen ( p );
		// then the rest
		//sprintf(p ,
		sb->safePrintf(
			       "<span style=\"%s\">. "
			       "Gigablast is not responsible for the content of "
			       "this page.</span>",
			       styleTitle );
		//p += gbstrlen ( p );
		sb->safePrintf ( "<br/><span style=\"%s\">"
				 "Cached: </span>"
				 "<span style=\"%s\">",
				 styleTitle, styleText );
		//p += gbstrlen ( p );
		// then the spider date in GMT
		// time_t lastSpiderDate = xd->m_spideredTime;
		// struct tm *timeStruct = gmtime ( &lastSpiderDate );
		// char tbuf[100];
		// strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
		//p += gbstrlen ( p );
		sb->safeStrcpy(tbuf);
		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
				"/get?"
				"q=%s&c=%s&rtq=%li&"
				"d=%lli&strip=1\""
				" style=\"%s\">"
				"[stripped]</a>",
				q , st->m_coll , (long)st->m_rtq,
				st->m_docId, styleLink );
		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					"//web.archive.org/web/*/%s\""
					" style=\"%s\">"
					"[older copies]</a>" ,
					f->getUrl(), styleLink );
		}
		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
					"[NOARCHIVE]</b></span>",
					styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				       "[BANNED]</b></span>",
				       styleTell );
		}
		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				       "These search terms have been "
				       "highlighted: ",
				       styleText );
			//p += gbstrlen ( p );
		}
	}
	// how much space left in p?
	//long avail = bufEnd - p;
	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// . use the external ip of our gateway
	// . construct the NAT mapped port
	// . you should have used iptables to map port to the correct
	//   internal ip:port
	//unsigned long ip =g_conf.m_mainExternalIp ; // h->m_externalIp;
	//unsigned short port=g_conf.m_mainExternalPort;//h->m_externalHttpPort
	// local check
	//if ( st->m_isLocal ) {
	unsigned long ip = h->m_ip;
	unsigned short port = h->m_httpPort;
	//}
	//sprintf ( x , "http://%s:%li/get?q=" , iptoa ( ip ) , port );
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80)
		sprintf(x,"http://%s/get?q=",iptoa(ip));
	else
		sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	long elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	//sprintf ( x, "&seq=%li&rtq=%lid=%lli",
	//	(long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId());
	sprintf ( x, "&d=%lli",st->m_docId );
	x += gbstrlen(x);
	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );
	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q , // content being highlighted, utf8
		 qlen , // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true , // computeIds
		 false ); // hasHtmlEntities?
	// . assign scores of 0 to query words that should be ignored
	// . TRICKY: loop over words in qq.m_qwords, but they should be 1-1
	//   with words in qw.
	// . sanity check
	//if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;}
	// declare up here
	Matches m;
	// do the loop
	//Scores ss;
	//ss.set ( &qw , NULL );
	//for ( long i = 0 ; i < qq.m_numWords ; i++ )
	//	if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	//m.addMatches ( &qw , &ss , true );
	m.addMatches ( &qw );
	long hilen = 0;
	// CNS: if ( ! st->m_clickNScroll ) {
	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( //p       ,
				 //avail   ,
				 sb ,
				 &qw , // words to highlight
				 &m , // matches relative to qw
				 false , // doSteming
				 false , // st->m_clickAndScroll ,
				 (char *)thisUrl );// base url for ClcknScrll
		//p += hilen;
		// now an hr
		//memcpy ( p , "</span></table></table>\n" , 24 ); p += 24;
		sb->safeStrcpy("</span></table></table>\n");
	}
	bool includeHeader = st->m_includeHeader;
	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON ) includeHeader = false;
	if ( format == FORMAT_XML ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;
	//mfree(uq, uqCapacity, "PageGet");
	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2;
		else sb->m_length=startLen1;//p=start1;
	}
	//sb->safeStrcpy(tbuf);
	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
			       lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}
	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}
	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd = NULL;
	// Scan the buffer for the first <title>...</title> pair (case
	// insensitive), capping the search at 500 bytes past the open tag.
	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd = e;
		}
		break;
	}
	// . print title at top!
	// . consider moving
	if ( titleStart ) {
		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";
		//p += sprintf ( p ,
		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );
		long printLinks = st->m_r.getLong("links",0);
		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(//p += sprintf ( p ,
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       //"<center>"
				       " "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       //"</center>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );
		if ( printLinks ) {
			sb->safePrintf(//p += sprintf ( p ,
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       " "
				       "<b>PAGE TITLE:</b> "
				       );
			long tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}
		sb->safePrintf( "</table><br>\n" );
	}
	// is the content preformatted?
	bool pre = false;
	char ctype = (char)xd->m_contentType;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC ) pre = true ; // filtered msword
	if ( ctype == CT_PS ) pre = true ; // filtered postscript
	if ( format == FORMAT_XML ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;
	// if it is content-type text, add a <pre>
	if ( pre ) {//p + 5 < bufEnd && pre ) {
		sb->safePrintf("<pre>");
		//p += 5;
	}
	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen,
					(long)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, line OOM
	if ( contentLen == -1 ) {
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
		return sendErrorReply ( st , g_errno );
	}
	Xml xml;
	Words ww;
	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;
	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON )
		queryHighlighting = false;
	// XML/JSON output goes through a temp buf so the content can be
	// encoded before being appended to sb.
	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;
	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		//p += contentLen ;
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		//Words *ww = xd->getWords();
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// sanity check
		//if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
		// how much space left in p?
		//avail = bufEnd - p;
		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb , // p , avail ,
				 &ww , &m ,
				 false /*doStemming?*/ ,
				 st->m_clickAndScroll ,
				 thisUrl /*base url for click & scroll*/);
		//p += hilen;
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}
	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}
	if ( format == FORMAT_JSON ) {
		sb->safePrintf("\t\"content\":\"\n");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}
	// if it is content-type text, add a </pre>
	if ( pre ) { // p + 6 < bufEnd && pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
		//p += 6;
	}
	// calculate bufLen
	//long bufLen = p - buf;
	long ct = xd->m_contentType;
	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;
	if ( ct == CT_XML ) {
		// encode the xml tags into <tagname> sequences
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// free out buffer that we alloc'd before returning since this
		// should have copied it into another buffer
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
		// reassign
		//buf = newbuf.getBufStart();
		//bufLen = newbuf.length();
		sb->stealBuf ( &newbuf );
	}
	// now encapsulate it in html head/tail and send it off
	// sendErr:
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";
	if ( xd->m_contentType == CT_JSON )
		contentType = "application/json";
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";
	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    //buf,bufLen,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						    -1, NULL, "utf8" );
	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);
	// free out buffer that we alloc'd before returning since this
	// should have copied it into another buffer
	//if ( ct == CT_XML ) newbuf.purge();
	//else if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
	// and convey the status
	return status;
}
// . so now this adds a list of Synonyms to the m_pools[] and returns a ptr
//   to the first one.
// . then the parent caller can store that ptr in the m_wordToSyn[] array
//   which we pre-alloc upon calling the set() function based on the # of
//   words we got
// . returns # of synonyms stored into "tmpBuf"
// . "tmpBuf" is carved up below into parallel per-synonym arrays (ids,
//   term ptrs/lens, sources, ...), each sized for MAX_SYNS entries --
//   NOTE(review): caller presumably supplies at least TMPSYNBUFSIZE
//   bytes; confirm against callers
long Synonyms::getSynonyms ( Words *words ,
			     long wordNum ,
			     uint8_t langId ,
			     char *tmpBuf ,
			     long niceness ) {

	// punct words have no synoyms
	if ( ! words->m_wordIds[wordNum] ) return 0;

	// store these
	m_words     = words;
	m_docLangId = langId;
	m_niceness  = niceness;

	// sanity check
	if ( wordNum > m_words->m_numWords ) { char *xx=NULL;*xx=0; }

	// init the dedup table to dedup wordIds
	HashTableX dt;
	char dbuf[512];
	dt.set(8,0,12,dbuf,512,false,m_niceness,"altwrds");

	long maxSyns = (long)MAX_SYNS;

	// carve "tmpBuf" up into the parallel output arrays; the cursor
	// pointers below (m_aidsPtr etc.) advance in lockstep, one slot
	// per stored synonym
	char *bufPtr = tmpBuf;
	// point into buffer
	m_aids = (long long *)bufPtr;
	bufPtr += maxSyns * 8;
	// then the word ids
	m_wids0 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;
	// second word ids, for multi alnum word synonyms, i.e. "New Jersey"
	m_wids1 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;
	m_termPtrs = (char **)bufPtr;
	bufPtr += maxSyns * 4;
	m_termLens = (long *)bufPtr;
	bufPtr += maxSyns * 4;
	m_numAlnumWords = (long *)bufPtr;
	bufPtr += maxSyns * 4;
	m_numAlnumWordsInBase = (long *)bufPtr;
	bufPtr += maxSyns * 4;
	// source
	m_src = bufPtr;
	bufPtr += maxSyns;

	// cursors
	m_aidsPtr                = m_aids;
	m_wids0Ptr               = m_wids0;
	m_wids1Ptr               = m_wids1;
	m_srcPtr                 = m_src;
	m_termPtrsPtr            = m_termPtrs;
	m_termLensPtr            = m_termLens;
	m_numAlnumWordsPtr       = m_numAlnumWords;
	m_numAlnumWordsInBasePtr = m_numAlnumWordsInBase;

	char *w    = m_words->m_words   [wordNum];
	long  wlen = m_words->m_wordLens[wordNum];

	//
	// NOW hit wiktionary
	// Trust this less then our s_exceptions above, but more than
	// our morph computations below
	//

	char sourceId = SOURCE_WIKTIONARY;
	char *ss = NULL;
	long long bwid;
	char wikiLangId = m_docLangId;
	bool hadSpace ;
	long klen ;
	long baseNumAlnumWords;

	// we jump back here to retry the lookup in english when the
	// doc-language lookup found nothing (see below)
 tryOtherLang:

	/*
	// if word only exists in one language, assume that language for word
	// even if m_docLangId is langUnknown (0)
	if ( ! ss && ! m_docLangId && ! wikiLangId ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		// each lang has its own bit
		long long bits = g_speller.getLangBits64 ( &bwid );
		// skip if not unique
		char count = getNumBitsOn64 ( bits ) ;
		// if we only got one lang we could be, assume that
		if ( count == 1 )
			// get it. bit #0 is english, so add 1
			wikiLangId = getBitPosLL((uint8_t *)&bits) + 1;
		// try setting based on script. greek. russian. etc.
		// if the word was not in the wiktionary.
		// this will be langUnknown if not definitive.
		else
			wikiLangId = getCharacterLanguage(w);
	}
	*/

	// try looking up bigram so "new jersey" gets "nj" as synonym
	// (wordNum+2 skips the punctuation "word" between alnum words)
	if ( wikiLangId &&
	     wordNum+2< m_words->m_numWords &&
	     m_words->m_wordIds[wordNum+2]) {
		// get phrase id bigram then
		long conti = 0;
		bwid = hash64Lower_utf8_cont(w,wlen,0,&conti);
		// then the next word
		char *wp2  = m_words->m_words[wordNum+2];
		long  wlen2 = m_words->m_wordLens[wordNum+2];
		bwid = hash64Lower_utf8_cont(wp2,wlen2,bwid,&conti);
		baseNumAlnumWords = 2;
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
	}

	// need a language for wiktionary to work with.
	// bigram missed (or not tried): fall back to the single word
	if ( wikiLangId && ! ss ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		baseNumAlnumWords = 1;
		//if ( bwid == 1424622907102375150LL)
		//	log("a");
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
		// if that failed try removing 's from word if there
		if ( ! ss &&
		     wlen >= 3 &&
		     w[wlen-2]=='\'' &&
		     w[wlen-1]=='s' ) {
			long long cwid = hash64Lower_utf8(w,wlen-2);
			ss = g_wiktionary.getSynSet( cwid, wikiLangId );
		}
	}

	// even though a document may be in german it often has some
	// english words "pdf download" "copyright" etc. so if the word
	// has no synset in german, try it in english
	if ( //numPresets == 0 &&
	     ! ss &&
	     m_docLangId != langEnglish &&
	     wikiLangId  != langEnglish &&
	     m_docLangId &&
	     g_speller.getSynsInEnglish(w,wlen,m_docLangId,langEnglish) ) {
		// try english
		wikiLangId = langEnglish;
		sourceId   = SOURCE_WIKTIONARY_EN;
		goto tryOtherLang;
	}

	// if it was in wiktionary, just use that synset
	if ( ss ) {
		// prepare the dedup table; only initialized below if the
		// word actually has more than one synset
		HashTableX dedup;
		HashTableX *dd = NULL;
		char dbuf[512];
		long count = 0;
	addSynSet:
		// do we have another set following this
		char *next = g_wiktionary.getNextSynSet(bwid,m_docLangId,ss);
		// if so, init the dedup table then
		if ( next && ! dd ) {
			dd = &dedup;
			dd->set ( 8,0,8,dbuf,512,false,m_niceness,"sddbuf");
		}
		// skip over the pipe i guess
		char *pipe = ss + 2;
		// zh_ch?
		if ( *pipe == '_' ) pipe += 3;
		// sanity
		if ( *pipe != '|' ) { char *xx=NULL;*xx=0; }
		// point to word list
		char *p = pipe + 1;
		// hash up the list of words, they are in utf8 and
		char *e = p + 1;
		// save count in case we need to undo
		//long saved = m_numAlts[wordNum];
	hashLoop:
		// skip synonyms that are anagrams because its to ambiguous
		// the are mappings like
		// "PC" -> "PC,Personal Computer"
		// "PC" -> "PC,Probable Cause" ... (lots more!)
		//bool isAnagram = true;
		// advance "e" to the comma/newline terminating this term
		for ( ; *e !='\n' && *e != ',' ; e++ ) ;
		//	if ( ! is_upper_a(*e) ) isAnagram = false;
		// get it
		long long h = hash64Lower_utf8_nospaces ( p , e - p );
		// skip if same as base word
		if ( h == bwid ) goto getNextSyn;
		// should we check for dups?
		if ( dd ) {
			// skip dups
			if ( dd->isInTable(&h) ) goto getNextSyn;
			// dedup. return false with g_errno set on error
			if ( ! dd->addKey(&h) ) return m_aidsPtr - m_aids;
		}
		// store it
		*m_aidsPtr++ = h;
		// store source
		*m_srcPtr++ = sourceId;
		// does this synonym itself contain whitespace, i.e. is it
		// a multi-word term like "New Jersey"?
		hadSpace = false;
		klen = e - p;
		for ( long k = 0 ; k < klen ; k++ )
			if ( is_wspace_a(p[k]) ) hadSpace = true;
		*m_termPtrsPtr++ = p;
		*m_termLensPtr++ = e-p;
		// only for multi-word synonyms like "New Jersey"...
		*m_wids0Ptr = 0LL;
		*m_wids1Ptr = 0LL;
		*m_numAlnumWordsPtr = 1;
		// and for multi alnum word synonyms
		if ( hadSpace ) {
			Words sw;
			sw.setx ( p , e - p , m_niceness );
			// word 0 and word 2 (word 1 is the space between)
			*(long long *)m_wids0Ptr = sw.m_wordIds[0];
			*(long long *)m_wids1Ptr = sw.m_wordIds[2];
			*(long *)m_numAlnumWordsPtr = sw.getNumAlnumWords();
		}
		m_wids0Ptr++;
		m_wids1Ptr++;
		m_numAlnumWordsPtr++;
		// how many words did we have to hash to find a synset?
		// i.e. "new jersey" would be 2, to get "nj"
		*m_numAlnumWordsInBasePtr++ = baseNumAlnumWords;
		// do not breach
		if ( ++count >= maxSyns ) goto done;
	getNextSyn:
		// loop for more
		if ( *e == ',' ) { e++; p = e; goto hashLoop; }
		// add in the next syn set, deduped
		if ( next ) { ss = next; goto addSynSet; }
		// wrap it up
	done:
		// all done
		return m_aidsPtr - m_aids;
	}

	// no wiktionary hit: fall back to morphological variants.
	// strip marks from THIS word, return -1 w/ g_errno set on error
	if ( ! addStripped ( w , wlen,&dt ) ) return m_aidsPtr - m_aids;

	// returns false with g_errno set
	if ( ! addAmpPhrase ( wordNum, &dt ) ) return m_aidsPtr - m_aids;

	// if we end in apostrophe, strip and add
	if ( wlen>= 3 &&
	     w[wlen-1] == 's' &&
	     w[wlen-2]=='\'' &&
	     ! addWithoutApostrophe ( wordNum, &dt ) )
		return m_aidsPtr - m_aids;

	return m_aidsPtr - m_aids;
}
// Parse a contributors/submissions text file and tally the number of
// contributions per contributor name in "names".
// @param names  map of NamePair -> contribution count, updated in place
// @param file   path of the text file to scan
void readContributors(NameMap& names, const string& file)
{
    osgDB::ifstream fin(file.c_str());

    // Tokenize the file into whitespace-separated words.
    // BUGFIX: test the extraction itself rather than the stream state
    // before reading -- the old loop ("while(fin) { fin >> keyword;
    // push_back; }") appended one final empty token after the stream
    // hit EOF, because the failed extraction still ran push_back.
    Words words;
    string keyword;
    while (fin >> keyword) words.push_back(keyword);

    string blank_string;
    for (unsigned int i = 0; i < words.size(); ++i)
    {
        if (submissionsSequence(words, i))
        {
            // A submissions marker: the next one or two tokens are a
            // contributor's first (and optionally last) name.
            if (i + 2 < words.size() && validName(words[i + 1]))
            {
                NamePair name = createName(words[i + 1], words[i + 2]);
                nameCorrection(name);
                if (!name.first.empty()) ++names[name];
                i += 2;
            }
            else if (i + 1 < words.size() && validName(words[i + 1]))
            {
                NamePair name = createName(words[i + 1], blank_string);
                nameCorrection(name);
                if (!name.first.empty()) ++names[name];
                i += 1;
            }
        }
        else
        {
            // Bare first-name mentions are credited to the core
            // maintainers.
            if (words[i] == "robert")
            {
                ++names[NameRobertOsfield];
            }
            else if (words[i] == "don")
            {
                ++names[NameDonBurns];
            }
        }
    }

    // reassign first name entries to their full names entries:
    // map ordering places a ("X","") entry immediately before any
    // ("X","Y") entry, so merge the count into the following entry
    // and erase the first-name-only one.
    if (names.size() > 1)
    {
        for (NameMap::iterator itr = names.begin(); itr != names.end(); )
        {
            if (itr->first.second.empty())
            {
                NameMap::iterator next_itr = itr;
                ++next_itr;
                if (next_itr != names.end() && itr->first.first == next_itr->first.first)
                {
                    next_itr->second += itr->second;
                    // std::map::erase only invalidates the erased
                    // iterator; next_itr stays valid.
                    names.erase(itr);
                    itr = next_itr;
                }
                else
                {
                    ++itr;
                }
            }
            else
            {
                ++itr;
            }
        }
    }

    // remove the double entries from Robert's contributions
    if (names.size() > 1)
    {
        for (NameMap::iterator itr = names.begin(); itr != names.end(); ++itr)
        {
            if (itr->first != NameRobertOsfield && itr->first != NameDonBurns)
            {
                names[NameRobertOsfield] -= itr->second;
            }
        }
    }
}
// langId is language of the query
// . returns an order-independent (XOR-combined) hash of the query's
//   non-stopword terms, each term represented by the minimum word id
//   among itself and all of its synonyms, so synonymous/reordered
//   phrasings of the same query hash to the same value
// . NOTE(review): on hash-table allocation failure this returns
//   false (0), which is indistinguishable from a legitimate 0 hash
long long getSynBaseHash64 ( char *qstr , uint8_t langId ) {
	Words ww;
	ww.set3 ( qstr );
	long nw = ww.getNumWords();
	long long *wids = ww.getWordIds();
	//char **wptrs = ww.getWords();
	//long *wlens = ww.getWordLens();
	long long baseHash64 = 0LL;
	Synonyms syn;
	// assume english if unknown to fix 'pandora's tower'
	// vs 'pandoras tower' where both words are in both
	// english and german so langid is unknown
	if ( langId == langUnknown ) langId = langEnglish;
	// . store re-written query into here then hash that string
	// . this way we can get rid of spaces
	//char rebuf[1024];
	//char *p = rebuf;
	//if ( strstr(qstr,"cheatcodes") )
	//	log("hey");
	// for deduping
	HashTableX dups;
	if ( ! dups.set ( 8,0,1024,NULL,0,false,0,"qhddup") ) return false;
	// scan the words
	for ( long i = 0 ; i < nw ; i++ ) {
		// skip if not alnum
		if ( ! wids[i] ) continue;
		// get its synonyms into tmpBuf
		char tmpBuf[TMPSYNBUFSIZE];
		// . assume niceness of 0 for now
		// . make sure to get all synsets!! ('love' has two synsets)
		long naids = syn.getSynonyms (&ww,i,langId,tmpBuf,0);
		// term freq algo
		//long pop = g_speller.getPhrasePopularity(NULL,
		//					 wids[i],
		//					 true,
		//					 langId);
		// is it a queryStopWord like "the" or "and"?
		bool isQueryStop = ::isQueryStopWord(NULL,0,wids[i]);
		// a more restrictive list
		bool isStop = ::isStopWord(NULL,0,wids[i]);
		if ( ::isCommonQueryWordInEnglish(wids[i]) ) isStop = true;
		// find the smallest one
		unsigned long long min = wids[i];
		//char *minWordPtr = wptrs[i];
		//long minWordLen = wlens[i];
		// declare up here since we have a goto below
		long j;
		// add to table too; if the base word id was already seen
		// (directly or as some earlier word's synonym) treat this
		// word as a repeat
		if ( dups.isInTable ( &min ) ) goto gotdup;
		// add to it
		if ( ! dups.addKey ( &min ) ) return false;
		// now scan the synonyms, they do not include "min" in them
		for ( j = 0 ; j < naids ; j++ ) {
			// get it
			unsigned long long aid64;
			aid64 = (unsigned long long)syn.m_aids[j];
			// if any syn already hashed then skip it and count
			// as a repeated term. we have to do it this way
			// rather than just getting the minimum synonym
			// word id, because 'love' has two synsets and
			// 'like', a synonym of 'love' only has one synset
			// and they end up having different minimum synonym
			// word ids!!!
			if ( dups.isInTable ( &aid64 ) ) break;
			// add it. this could fail!
			if ( ! dups.addKey ( &aid64 ) ) return false;
			// set it?
			if ( aid64 >= min ) continue;
			// got a new min
			min = aid64;
			//minWordPtr = syn.m_termPtrs[j];
			//minWordLen = syn.m_termLens[j];
			// get largest term freq of all synonyms
			//long pop2 = g_speller.getPhrasePopularity(NULL,aid64,
			//					  true,langId);
			//if ( pop2 > pop ) pop = pop2;
		}
		// early break out means a hit in dups table
		if ( j < naids ) {
		gotdup:
			// do not count as repeat if query stop word
			// because they often repeat
			if ( isQueryStop ) continue;
			// count # of repeated word forms
			//nrwf++;
			continue;
		}
		// hash that now
		// do not include stop words in synbasehash so
		// 'search the web' != 'search web'
		if ( ! isStop ) {
			// no! make it order independent so 'search the web'
			// equals 'web the search' and 'engine search'
			// equals 'search engine'
			//baseHash64 <<= 1LL;
			baseHash64 ^= min;
		}
		// count it, but only if not a query stop word like "and"
		// or "the" or "a". # of unique word forms.
		//if ( ! isQueryStop ) nuwf++;
		// get term freq
		//if ( pop > maxPop ) maxPop = pop;
		// control word?
		//if ( wids[i] == cw1 ) ncwf++;
	}
	return baseHash64;
}