// Run recognition on a single sentence and print every recognized word on
// its own line.
void test_sentence(PersonRecog &pr, const char *sentence) {
    Words words;
    pr.recog(sentence, words);
    for (int idx = 0; idx < words.size(); idx += 1) {
        printf("%s\n", words[idx].c_str());
    }
}
// Print up to `show` guess suggestions for the current candidate set,
// best first.  Every key word is scored by the entropy of the split it
// induces over active_words; storing -entropy makes an ascending sort
// yield a best-first ordering.
// NOTE(review): reserve() uses active_words.size() although keys_.size()
// entries are pushed — harmless (capacity hint only), but confirm intent.
void Dictionary::printInteractiveSuggestions(Words const& active_words, unsigned const show) const {
	assert(!active_words.empty());
	assert(show > 0);
	vector<pair<double, Word>> suggestions;
	suggestions.reserve(active_words.size());
	puts("Suggestions:");
	{
		// Score every key word by the quality of the split it produces.
		DividedWordList dl(wordLength());
		auto it = keys_.cbegin();
		do {
			dl.build(*this, active_words, *it);
			// Negated so ascending sort puts highest entropy first.
			suggestions.push_back(make_pair(-dl.entropy(), *it));
		} while (++it != keys_.cend());
	}
	sort(suggestions.begin(), suggestions.end());
	{
		// Print the top `show` entries, numbered from 1.
		auto it = suggestions.cbegin();
		unsigned i = 0;
		do {
			printf("\t%2u: ", ++i);
			describeWord(stdout, it->second);
			putchar('\n');
		} while (++it != suggestions.cend() && i < show);
	}
	putchar('\n');
}
// When player has entered something, it is parsed elsewhere uint8 AgiEngine::testSaid(uint8 nwords, uint8 *cc) { AgiGame *state = &_game; AgiEngine *vm = state->_vm; Words *words = vm->_words; int c, n = words->getEgoWordCount(); int z = 0; if (vm->getFlag(VM_FLAG_SAID_ACCEPTED_INPUT) || !vm->getFlag(VM_FLAG_ENTERED_CLI)) return false; // FR: // I think the reason for the code below is to add some speed.... // // if (nwords != num_ego_words) // return false; // // In the disco scene in Larry 1 when you type "examine blonde", // inside the logic is expected ( said("examine", "blonde", "rol") ) // where word("rol") = 9999 // // According to the interpreter code 9999 means that whatever the // user typed should be correct, but it looks like code 9999 means that // if the string is empty at this point, the entry is also correct... // // With the removal of this code, the behavior of the scene was // corrected for (c = 0; nwords && n; c++, nwords--, n--) { z = READ_LE_UINT16(cc); cc += 2; switch (z) { case 9999: // rest of line (empty string counts to...) nwords = 1; break; case 1: // any word break; default: if (words->getEgoWordId(c) != z) return false; break; } } // The entry string should be entirely parsed, or last word = 9999 if (n && z != 9999) return false; // The interpreter string shouldn't be entirely parsed, but next // word must be 9999. if (nwords != 0 && READ_LE_UINT16(cc) != 9999) return false; setFlag(VM_FLAG_SAID_ACCEPTED_INPUT, true); return true; }
// Split `string` on spaces and tabs, appending each token to `words`.
// Runs of consecutive separators never produce empty tokens.
void tokenize(const char* string, Words& words) {
    const char* cursor = string;
    for (;;) {
        // Skip any run of separators preceding the next token.
        while (*cursor == ' ' || *cursor == '\t')
            ++cursor;
        if (*cursor == '\0')
            return;
        // Scan to the end of the token.
        const char* tokenStart = cursor;
        while (*cursor != '\0' && *cursor != ' ' && *cursor != '\t')
            ++cursor;
        words.push_back(std::string(tokenStart, cursor - tokenStart));
        if (*cursor == '\0')
            return;
        // Step past the separator and look for the next token.
        ++cursor;
    }
}
// Add a new match group for the text in s/slen: tokenize it into the next
// free Words/Bits/Pos slot and collect any query matches via the other
// addMatches() overload.  Returns true on success (or when the group limit
// is already reached); false on a set/alloc failure.  Groups producing no
// matches are reset and discarded so the slot can be reused.
bool Matches::addMatches( char *s, int32_t slen, mf_t flags ) {
	// . do not breach
	// . happens a lot with a lot of link info text
	if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
		return true;
	}
	// get some new ptrs for this match group
	Words *wp = &m_wordsArray [ m_numMatchGroups ];
	Bits *bp = &m_bitsArray [ m_numMatchGroups ];
	Pos *pb = &m_posArray [ m_numMatchGroups ];
	// set the words class for this match group
	if ( !wp->set( s, slen, true ) ) {
		return false;
	}
	// bits vector
	if ( ! bp->setForSummary ( wp ) ) {
		return false;
	}
	// position vector
	if ( ! pb->set ( wp ) ) {
		return false;
	}
	// record the start
	int32_t startNumMatches = m_numMatches;
	// sometimes it returns true w/o incrementing this
	int32_t n = m_numMatchGroups;
	// . add all the Match classes from this match group
	// . this increments m_numMatchGroups on success
	bool status = addMatches( wp, NULL, NULL, bp, pb, flags );
	// if this matchgroup had some, matches, then keep it
	if ( m_numMatches > startNumMatches ) {
		return status;
	}
	// otherwise, reset it, useless
	wp->reset();
	bp->reset();
	pb->reset();
	// do not decrement the counter if we never incremented it
	if ( n == m_numMatchGroups ) {
		return status;
	}
	// ok, remove it
	m_numMatchGroups--;
	return status;
}
int main( int argc, char * argv[] ) { char from[10] = "aablls?"; if( argc >= 2 ) { std::string arg(argv[1]); std::sort( arg.begin(), arg.end() ); strcpy( from, arg.c_str() ); } if( argc >= 3 ) { min_length = atoi( argv[2] ); } logf( 1, CLEAR "Score %s -> %i\n", from, ScoreString( from ) ); const int MAX_LEN = 32; int lengths[MAX_LEN] = {0}; if( FILE * fp = fopen( "enable1.txt", "rt" ) ) { double start = PerfTime(); while( !feof( fp ) ) { char buf[MAX_LEN]; char * got = fgets( buf, 63, fp ); if( got ) { char *end = buf + strlen(buf) -1; while( end > buf && !isalpha( *end ) ) { *end = 0; end -= 1; } int l = strlen(buf); lengths[l] += 1; if( l <= 15 ) { words.push_back(std::string(buf)); } } } double loaded = PerfTime(); double runTime = PerfTime(); Words gathered = GatherAngrams( from ); std::sort( gathered.begin(), gathered.end(), XScoresLessThanY ); for( auto s : gathered ) { logf( 1, CLEAR "%s - %i\n", s.c_str(), ScoreString( s ) ); } double done = PerfTime(); logf( 1, CLEAR "Timing (%f s) Loading\n", loaded-start ); //logf( 1, CLEAR "Timing (%f s) Preparing\n", prepared-prepping); logf( 1, CLEAR "Timing (%f s) running\n", done-runTime ); } return 0; }
// Tokenize a UTF-8 document: parse the (x)html with Xml, extract its text
// into a fixed 64KB scratch buffer, and hand the text to Words.
// NOTE(review): the 64KB cap silently truncates larger documents and the
// malloc result is unchecked -- presumably acceptable for this test tool.
void parse_doc_icu(char *s, int len, bool doHash, char *charset){
	Xml xml;
	xml.set( s, len, TITLEREC_CURRENT_VERSION, 0, CT_HTML );
	// Extract text from (x)html
	char *text_buf = (char*)malloc(64*1024);
	int32_t textLen = xml.getText( text_buf, 64 * 1024, 0, 99999999, doFilterSpaces );
	// Tokenize only the extracted text length, not the raw input length.
	Words w;
	w.set(text_buf, textLen, doHash);
	free(text_buf);
}
void parse_doc_8859_1(char *s, int len, bool doHash,char *charset) { Xml xml; xml.set( s, len, TITLEREC_CURRENT_VERSION, 0, CT_HTML ); // Extract text from (x)html char *text_buf = (char*)malloc(len+1); xml.getText( text_buf, len, 0, 99999999, doFilterSpaces ); Words words; // just tokenize words words.set(text_buf, len, doHash); free(text_buf); }
// Rebuild this list: clear every bucket, then place each active word into
// the bucket indexed by how many of its letters match `guess` exactly.
void DividedWordList::build(Dictionary const& dictionary, Words const& ws, Word const guess) {
	assert(size() == dictionary.wordLength() + 1);
	assert(!ws.empty());
	// Empty all buckets before redistributing.
	for (auto& bucket : *this)
		bucket.clear();
	// Bucket index = number of correctly-placed letters shared with guess.
	for (auto wit = ws.cbegin(); wit != ws.cend(); ++wit) {
		unsigned const matched = correct_letters(*wit, guess);
		assert(matched <= dictionary.wordLength());
		(*this)[matched].update(dictionary, *wit);
	}
}
void RconConnection::EnableEvents(void) { Words command; command.clear(); command.push_back("admin.eventsEnabled"); command.push_back("true"); if(sendRequest(command)) throw string("sendRequest failed :: eventsEnabled"); TextRconPacket response = getResponse(); if(!response.m_isResponse || response.m_data[0] != "OK") throw string("eventsEnabled failed"); }
void LoadDic() { const int MAX_LEN = 32; int lengths[MAX_LEN] = {0}; if( FILE * fp = fopen( "enable1.txt", "rt" ) ) { //double start = PerfTime(); while( !feof( fp ) ) { char buf[MAX_LEN]; char * got = fgets( buf, 63, fp ); if( got ) { char *end = buf + strlen(buf) -1; while( end > buf && !isalpha( *end ) ) { *end = 0; end -= 1; } int l = strlen(buf); lengths[l] += 1; if( l <= 15 ) { words.push_back(std::string(buf)); wordSet.insert(std::string(buf)); } } } fclose( fp ); //double loaded = PerfTime(); } }
// Handle the reply to a version query: expect {"OK", gameName, buildId}.
virtual void onServerResponse(const Words& words) {
	const bool wellFormed = (words.size() == 3) && (words[0] == "OK");
	if (wellFormed)
		printf("Server version: Game %s, build ID %s\n", words[1].c_str(), words[2].c_str());
	else
		printf("Invalid response to version query\n");
}
// Returns true when words[i] begins a phrase crediting a submission
// (e.g. "From", "submitted by", "sent in by").  For multi-word phrases,
// `i` is advanced past the extra words consumed.  Always requires at
// least one word to follow the phrase.
bool submissionsSequence(const Words& words, unsigned int& i) {
	if (i + 1 >= words.size())
		return false;
	// One-word markers.
	if (words[i] == "From" || words[i] == "from" ||
	    words[i] == "From:" || words[i] == "from:" ||
	    words[i] == "Merged" || words[i] == "Integrated")
		return true;
	if (i + 2 >= words.size())
		return false;
	// Two-word markers: consume one extra word.
	if ((words[i] == "submitted" && words[i + 1] == "by") ||
	    (words[i] == "Folded" && words[i + 1] == "in") ||
	    (words[i] == "Rolled" && words[i + 1] == "in") ||
	    (words[i] == "Checked" && words[i + 1] == "in")) {
		i += 1;
		return true;
	}
	if (i + 3 >= words.size())
		return false;
	// Three-word marker: consume two extra words.
	if (words[i] == "sent" && words[i + 1] == "in" && words[i + 2] == "by") {
		i += 2;
		return true;
	}
	return false;
}
// Perform the two-step hashed-password login handshake:
//  1. send "login.hashed" with no argument; the server answers OK plus a
//     hex-encoded salt;
//  2. decode the salt to raw bytes, append the plaintext password, MD5 the
//     result, uppercase the hex digest, and send it back as
//     "login.hashed <hash>".
// Throws a string describing the failure at any step.
void RconConnection::Login(void) {
	Words loginCommand;
	loginCommand.push_back("login.hashed");
	if(sendRequest(loginCommand))
		throw string("Login failed");
	TextRconPacket response = getResponse();
	if(response.m_isResponse && response.m_data[0] == "OK") {
		string salt = response.m_data[1];
		const char* hex_str = salt.c_str();
		string hash, saltHex;
		uint32_t ch;
		// Decode the salt two hex digits at a time into raw bytes.
		for( ; sscanf( hex_str, "%2x", &ch) == 1 ; hex_str += 2)
			hash += ch;
		saltHex = hash;
		// Hash input is the raw salt bytes followed by the password.
		hash.append( this->password );
		hash = MD5String( (char*)hash.c_str() );
		// The server compares against an uppercase hex digest.
		boost::to_upper(hash);
		loginCommand.clear();
		loginCommand.push_back("login.hashed");
		loginCommand.push_back(hash);
		if(sendRequest(loginCommand))
			throw string("sendRequest failed :: Login");
		response = getResponse();
		if(response.m_isResponse && response.m_data[0] == "InvalidPasswordHash")
			throw string("Login failed :: InvalidPasswordHash (Salt: " + salt + " | SaltHex: "+ saltHex +" | Hash: " + hash + ")");
	}
	else
		throw string("Login failed");
}
// Run person recognition over every line of `file`, printing each
// recognized word on its own line.  Reports an error if the file cannot
// be opened.
void test_file(PersonRecog &pr, const char *file) {
    ifstream fi(file);
    string line;
    Words words;
    if (!fi) {
        printf("can not open file: %s\n", file);
        return;
    }
    while (getline(fi, line)) {
        pr.recog(line.c_str(), words);
        for (int k = 0; k < words.size(); ++k)
            printf("%s\n", words[k].c_str());
    }
    fi.close();
}
// Tokenize a UTF-8 document (older Xml/Words API variant): parse the
// html, extract up to 64KB of text, and hand the text to Words.
// NOTE(review): malloc is unchecked and documents longer than 64KB are
// truncated -- presumably acceptable for this parsing test harness.
void parse_doc_icu(char *s, int len, bool doHash, char *charset){
	Xml xml;
	xml.set(csUTF8,s,len,false, 0,false, TITLEREC_CURRENT_VERSION);
	//fprintf(stderr,"\nparse_doc_icu\n");
	// Extract text from (x)html
	char *text_buf = (char*)malloc(64*1024);
	long textLen = xml.getText(text_buf, 64*1024, 0, 99999999, false,
				   true, false, doFilterSpaces, false);
	// Tokenize only the extracted text length, not the raw input length.
	Words w;
	w.set(true,false, text_buf, textLen, TITLEREC_CURRENT_VERSION,doHash);
	free(text_buf);
}
void Output::print(const Words& message, bool useComma, bool useAnd) { if (!_streams[_curStream].print()) return; for (int i = 0; i < message.length(); ++i) { stream() << message[i]; if (useComma) { if ((message.length() > 2) && (i != message.length() - 1)) stream() << ","; if ((i == message.length() - 2) && (useAnd)) stream() << " and"; } if (i != message.length() - 1) stream() << " "; } stream() << flush; }
// Collect every loaded dictionary word that can be formed from the rack
// `from`, where '?' is a wildcard standing for any single letter.  Words
// shorter than min_length are skipped.  Assumes rack and dictionary words
// are lowercase a-z (plus '?'): indexing is *s - 'a'.
// NOTE(review): strlen(s) < min_length compares size_t to min_length --
// confirm min_length is never negative.
Words GatherAngrams( const char * from ) {
	int wild = 0;
	// Per-letter counts and a 26-bit presence mask for the rack.
	int find_freq[26] = {0};
	uint32_t find_have = 0;
	for( const char *s = from; *s; ++s ) {
		if( *s == '?' ) {
			wild += 1;
		} else {
			const int ord = *s - 'a';
			find_freq[ord] += 1;
			find_have |= (1<<ord);
		}
	}
	// Mask of letters NOT present in the rack.
	uint32_t find_dont = ~find_have;
	Words matchList;
	const int count = words.size();
	for( int i = 0; i < count; ++i ) {
		const char *s = words[i].c_str();
		if( strlen( s ) < min_length ) continue;
		// Per-letter counts / presence mask for the candidate word.
		int freq[26] = {0};
		uint32_t have = 0;
		for( ; *s; ++s ) {
			const int ord = *s - 'a';
			freq[ord] += 1;
			have |= (1<<ord);
		}
		// Without wildcards, any letter outside the rack disqualifies
		// the word immediately.  (& binds tighter than &&.)
		if( wild == 0 && find_dont & have ) continue;
		// Count how many letters the word needs beyond what the rack
		// supplies; wildcards must cover the shortfall.
		int over = 0;
		for( int c = 0; c < 26; ++c ) {
			if( freq[c] > find_freq[c] ) over += freq[c] - find_freq[c];
		}
		if( over <= wild ) {
			logf( 1, RED "From (%s) -> (%s)\n", from, words[i].c_str() );
			matchList.push_back( words[i] );
		}
	}
	return matchList;
}
// Test helper: drive the full summary-generation pipeline over htmlInput
// and leave the result in `summary`.  Mirrors the production setup order:
// xml -> words -> bits -> sections -> query -> title -> pos ->
// summary-bits -> phrases -> matches -> summary.  ASSERT_TRUE aborts the
// test on any stage that fails to initialize.
static void generateSummary( Summary &summary, char *htmlInput, const char *queryStr, const char *urlStr ) {
	Xml xml;
	ASSERT_TRUE(xml.set(htmlInput, strlen(htmlInput), 0, CT_HTML));
	Words words;
	ASSERT_TRUE(words.set(&xml, true));
	Bits bits;
	ASSERT_TRUE(bits.set(&words));
	Url url;
	url.set(urlStr);
	Sections sections;
	ASSERT_TRUE(sections.set(&words, &bits, &url, "", CT_HTML));
	Query query;
	ASSERT_TRUE(query.set2(queryStr, langEnglish, true));
	// LinkInfo must be zeroed; only its size field is populated here.
	LinkInfo linkInfo;
	memset ( &linkInfo , 0 , sizeof(LinkInfo) );
	linkInfo.m_lisize = sizeof(LinkInfo);
	// Title capped at 80 characters.
	Title title;
	ASSERT_TRUE(title.setTitle(&xml, &words, 80, &query, &linkInfo, &url, NULL, 0, CT_HTML, langEnglish));
	Pos pos;
	ASSERT_TRUE(pos.set(&words));
	Bits bitsForSummary;
	ASSERT_TRUE(bitsForSummary.setForSummary(&words));
	Phrases phrases;
	ASSERT_TRUE(phrases.set(&words, &bits));
	Matches matches;
	matches.setQuery(&query);
	ASSERT_TRUE(matches.set(&words, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo));
	// 180-char summary, at most 3 excerpts of at most 3 sentences each.
	summary.setSummary(&xml, &words, &sections, &pos, &query, 180, 3, 3, 180, &url, &matches, title.getTitle(), title.getTitleLen());
}
// Parse an input line of exactly AMOUNT_ARGUMENTS numeric fields into the
// solver's data: two points (x, y each) followed by the circle radius.
// Throws std::invalid_argument when the field count is wrong.
SDataForSolver CSolver::ExtractData(const std::string & inputString) {
	Words tokens = SplitWords(inputString);
	// Reject malformed input before touching any field.
	if (tokens.size() != AMOUNT_ARGUMENTS) {
		throw std::invalid_argument(MESSAGE_INCORRECT_AMOUNT_ARGUMENTS);
	}
	SDataForSolver result;
	result.firstPoint.x = stof(tokens[0]);
	result.firstPoint.y = stof(tokens[1]);
	result.secondPoint.x = stof(tokens[2]);
	result.secondPoint.y = stof(tokens[3]);
	result.radiusCircle = stof(tokens[4]);
	return result;
}
// Depth-first construction of the complete guessing strategy for
// active_words.  A leaf (single remaining word) is reported via
// acceptor->acceptSolution.  An interior node picks the key word whose
// split of the candidates has the highest entropy, reports it via
// acceptBranch, and recurses into every non-empty bucket with that
// bucket's answer index pushed onto *runningAnswersp.
void Dictionary::recursiveSolve(AbstractSolutionAcceptor * const acceptor, Words const& active_words, AnswerSequence * const runningAnswersp) const {
	assert(!active_words.empty());
	if (active_words.size() == 1) {
		// We found an end node
		acceptor->acceptSolution(*runningAnswersp, active_words.front());
	} else {
		DividedWordList dl(wordLength());
		double best_entropy = -1.0;
		Word best_word = 0;
		// Of all of the words in active_words, find the one that
		// will gives us the most even split (and thus the highest entropy)
		{
			auto it = keys_.cbegin();
			do {
				dl.build(*this, active_words, *it);
				double const entropy = dl.entropy();
				if (entropy > best_entropy) {
					best_entropy = entropy;
					best_word = *it;
				}
			} while (++it != keys_.cend());
		}
		assert(popcount32(best_word) == wordLength());
		// Now that we have found our best word, recompute the split
		dl.build(*this, active_words, best_word);
		// NOTE(review): exact float equality holds only because the
		// recomputation repeats identical operations on identical input.
		assert(dl.entropy() == best_entropy);
		acceptor->acceptBranch(*runningAnswersp, best_word, best_entropy, dl);
		// Now recursively descend
		for (unsigned i = 0; i < wordLength() + 1; i++) {
			const WordsWithTotalChoices& wwtc = dl[i];
			if (wwtc.count() > 0) {
				// Record which answer selects this bucket, recurse,
				// then backtrack.
				runningAnswersp->push_back(i);
				recursiveSolve(acceptor, wwtc.choices(), runningAnswersp);
				runningAnswersp->pop_back();
			}
		}
	}
}
void parse_doc_8859_1(char *s, int len, bool doHash,char *charset) { Xml xml; xml.set(csASCII,s,len,false, 0, false, TITLEREC_CURRENT_VERSION); //fprintf(stderr,"\nparse_doc_8859_1\n"); // Extract text from (x)html char *text_buf = (char*)malloc(len+1); xml.getText(text_buf, len, 0, 99999999, false, true, false, doFilterSpaces, false); Words words; // just tokenize words words.set(false, text_buf, TITEREC_CURRENT_VERSION, doHash); free(text_buf); }
bool CheckWord( const char *word ) { #if 1 return wordSet.count( std::string( word ) ) == 1; #else const int count = words.size(); for( int i = 0; i < count; ++i ) { const char *s = words[i].c_str(); if( strcmp( s, word ) == 0 ) { return true; } } return false; #endif }
// Handle the reply to a login attempt: a single-token response of "OK"
// or "InvalidPassword"; anything else is malformed.
virtual void onServerResponse(const Words& words) {
	if (words.size() != 1) {
		printf("Invalid response to login query\n");
		return;
	}
	if (words[0] == "OK")
		printf("Logged in successfully\n");
	else if (words[0] == "InvalidPassword")
		printf("Invalid password\n");
	else
		printf("Invalid response to login query\n");
}
// Interactive solver loop.  Reports how many candidates remain (listing
// them when five or fewer), then repeatedly prints suggestions and asks
// the player for a "<word> <result>" move, recursing into the candidate
// bucket that move selects.
void Dictionary::recursiveInteractive(Words const& active_words) const {
	if (active_words.empty()) {
		puts("No choices remain.\n");
	} else if (active_words.size() == 1) {
		fputs("Only choice remaining is: ", stdout);
		describeWord(stdout, active_words.front());
		puts("\n");
	} else {
		printf("%u choices remain", static_cast<unsigned>(active_words.size()));
		if (active_words.size() <= 5) {
			// If we're near the end, print a list
			fputs(": ", stdout);
			auto it = active_words.cbegin();
			describeWord(stdout, *it);
			// BUGFIX: advance before printing.  The old do/while
			// printed *it before ++it, so the first word appeared
			// twice and the last word was never printed.
			while (++it != active_words.cend()) {
				fputs(", ", stdout);
				describeWord(stdout, *it);
			}
			putchar('\n');
		} else {
			puts(".");
		}
		putchar('\n');
		string cmd;
		for (;;) {
			printInteractiveSuggestions(active_words);
			fputs("Give your move in form \"<word> <result>\" like \"TAXI 2\"\n\n"
			      "> ", stdout);
			fflush(stdout);
			if (!read_uppercase_word(stdin, &cmd))
				break; // EOF
			Word tried;
			unsigned matched;
			if (parse_move(cmd, wordLength(), &tried, &matched)) {
				// Split on the tried word and descend into the
				// bucket matching the reported result.
				DividedWordList dl(wordLength());
				dl.build(*this, active_words, tried);
				recursiveInteractive(dl[matched].choices());
				break;
			}
			puts("Invalid move.\n");
		}
	}
}
// Populate `words` and `labels` from a set of RTABMap databases.
// Per database: read its signatures, then its labels (label reading needs
// the accumulated signatures).  Afterwards build the global word index
// from all signatures, cluster the word descriptors, and move everything
// into the output containers.  Always returns true.
bool RTABMapDBAdapter::readData(const std::vector<std::string> &dbPaths, Words &words, Labels &labels) {
	// Read data from databases
	std::map<int, std::map<int, std::unique_ptr<rtabmap::Signature>>> allSignatures;
	std::list<std::unique_ptr<Label>> allLabels;
	// Databases are identified by their position in dbPaths.
	int dbId = 0;
	for (const auto &dbPath : dbPaths) {
		auto dbSignatures = readSignatures(dbPath);
		allSignatures.emplace(dbId, std::move(dbSignatures));
		auto dbLabels = readLabels(dbPath, dbId, allSignatures);
		std::move(dbLabels.begin(), dbLabels.end(), std::back_inserter(allLabels));
		dbId++;
	}
	std::cout << "Building Index for Words" << std::endl;
	std::list<std::unique_ptr<Word>> allWords = createWords(allSignatures);
	std::cout << "Total Number of words: " << allWords.size() << std::endl;
	// Tally descriptor rows across all databases (diagnostic output).
	long count = 0;
	for (const auto &word : allWords) {
		for (const auto &desc : word->getDescriptorsByDb()) {
			count += desc.second.rows;
		}
	}
	std::cout << "Total Number of descriptors: " << count << std::endl;
	allWords = clusterPointsInWords(allWords);
	// Re-tally after clustering to report the reduced point count.
	count = 0;
	for (const auto &word : allWords) {
		for (const auto &desc : word->getDescriptorsByDb()) {
			count += desc.second.rows;
		}
	}
	std::cout << "Total Number of points: " << count << std::endl;
	words.putWords(std::move(allWords));
	std::cout << "Building Index for Labels" << std::endl;
	labels.putLabels(std::move(allLabels));
	return true;
}
// returns false if blocked, true otherwise
// Re-entrant page-serving state machine for the cached-page ("get") view.
// Loads the doc's title rec, then renders the cached page (optionally with
// disclaimer header, query-term highlighting, and XML/JSON wrappers) and
// sends it over the state's TCP socket.  Any call that would block
// registers itself as the XmlDoc callback and returns false; it is invoked
// again when the data is ready.  On completion (or error reply) the State2
// is destroyed.
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;
	if ( ! xd->m_loaded ) {
		// setting just the docid. niceness is 0.
		//xd->set3 ( st->m_docId , st->m_coll , 0 );
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}
	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// now force it to load old title rec
	//char **tr = xd->getTitleRec();
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 ) return sendErrorReply ( st , ENOTFOUND);
	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isAdmin && *na ) return sendErrorReply ( st , ENOCACHE );
	SafeBuf *sb = &st->m_sb;
	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}
	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";
	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    //buf,bufLen,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    //"text/html",
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}
	/*
	// this was calling XmlDoc and setting sections, etc. to
	// get the SpiderReply junk... no no no
	// is it banned or filtered? this ignores the TagRec in the titleRec
	// and uses msg8a to get it fresh instead
	char *vi = xd->getIsFiltered();//Visible( );
	// wait if blocked
	if ( vi == (void *)-1 ) return false;
	// error?
	if ( ! vi ) return sendErrorReply ( st , g_errno );
	// banned?
	if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
	*/
	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	//long len = xd->size_utf8Content - 1;
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );
	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %li is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}
	char *content = xd->ptr_utf8Content;
	long contentLen = xd->size_utf8Content - 1;
	// shortcut
	char strip = st->m_strip;
	// alloc buffer now
	//char *buf = NULL;
	//long bufMaxSize = 0;
	//bufMaxSize = len + ( 32 * 1024 ) ;
	//bufMaxSize = contentLen + ( 32 * 1024 ) ;
	//buf = (char *)mmalloc ( bufMaxSize , "PageGet2" );
	//char *p = buf;
	//char *bufEnd = buf + bufMaxSize;
	//if ( ! buf ) {
	//	return sendErrorReply ( st , g_errno );
	//}
	// for undoing the header
	//char *start1 = p;
	long startLen1 = sb->length();
	// we are always utfu
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
				"content=\"text/html;charset=utf8\">\n");
	// base href
	//Url *base = &xd->m_firstUrl;
	//if ( xd->ptr_redirUrl.m_url[0] )
	//	base = &xd->m_redirUrl;
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	//Url *redir = *xd->getRedirUrl();
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
		//p += gbstrlen ( p );
	}
	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
				"body{background-color:white;color:black;}\n"
				"</style>\n");
		//p += gbstrlen ( p );
	}
	//char format = st->m_format;
	if ( format == FORMAT_XML ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();
	// for undoing the stuff below
	long startLen2 = sb->length();//p;
	// query should be NULL terminated
	char *q = st->m_q;
	long qlen = st->m_qlen;
	char styleTitle[128] = "font-size:14px;font-weight:600;"
			       "color:#000000;";
	char styleText[128] = "font-size:14px;font-weight:400;"
			      "color:#000000;";
	char styleLink[128] = "font-size:14px;font-weight:400;"
			      "color:#0000ff;";
	char styleTell[128] = "font-size:14px;font-weight:600;"
			      "color:#cc0000;";
	// get the url of the title rec
	Url *f = xd->getFirstUrl();
	bool printDisclaimer = st->m_printDisclaimer;
	if ( xd->m_contentType == CT_JSON ) printDisclaimer = false;
	if ( format == FORMAT_XML ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;
	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;
	// Format the spider time once; reused by the disclaimer and the
	// XML/JSON metadata below.
	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}
	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	// CNS: if ( ! st->m_clickNScroll ) {
	if ( printDisclaimer ) {
		sb->safePrintf(//sprintf ( p ,
			       //"<BASE HREF=\"%s\">"
			       //"<table border=1 width=100%%>"
			       //"<tr><td>"
			       "<table border=\"1\" bgcolor=\"#" BGCOLOR "\" cellpadding=\"10\" "
			       //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\""
			       "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			       "<tr"
			       //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\""
			       "><td>"
			       //"<font face=times,sans-serif color=black size=-1>"
			       "<span style=\"%s\">"
			       "This is Gigablast's cached page of </span>"
			       "<a href=\"%s\" style=\"%s\">%s</a>"
			       "" ,
			       styleTitle, f->getUrl(), styleLink, f->getUrl() );
		//p += gbstrlen ( p );
		// then the rest
		//sprintf(p ,
		sb->safePrintf(
			       "<span style=\"%s\">. "
			       "Gigablast is not responsible for the content of "
			       "this page.</span>",
			       styleTitle );
		//p += gbstrlen ( p );
		sb->safePrintf ( "<br/><span style=\"%s\">"
				 "Cached: </span>"
				 "<span style=\"%s\">",
				 styleTitle, styleText );
		//p += gbstrlen ( p );
		// then the spider date in GMT
		// time_t lastSpiderDate = xd->m_spideredTime;
		// struct tm *timeStruct = gmtime ( &lastSpiderDate );
		// char tbuf[100];
		// strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
		//p += gbstrlen ( p );
		sb->safeStrcpy(tbuf);
		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
				"/get?"
				"q=%s&c=%s&rtq=%li&"
				"d=%lli&strip=1\""
				" style=\"%s\">"
				"[stripped]</a>",
				q , st->m_coll , (long)st->m_rtq,
				st->m_docId, styleLink );
		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					"//web.archive.org/web/*/%s\""
					" style=\"%s\">"
					"[older copies]</a>" ,
					f->getUrl(), styleLink );
		}
		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
					"[NOARCHIVE]</b></span>",
					styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				       "[BANNED]</b></span>",
				       styleTell );
		}
		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				       "These search terms have been "
				       "highlighted: ",
				       styleText );
			//p += gbstrlen ( p );
		}
	}
	// how much space left in p?
	//long avail = bufEnd - p;
	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// . use the external ip of our gateway
	// . construct the NAT mapped port
	// . you should have used iptables to map port to the correct
	//   internal ip:port
	//unsigned long ip =g_conf.m_mainExternalIp ; // h->m_externalIp;
	//unsigned short port=g_conf.m_mainExternalPort;//h->m_externalHttpPort
	// local check
	//if ( st->m_isLocal ) {
	unsigned long ip = h->m_ip;
	unsigned short port = h->m_httpPort;
	//}
	//sprintf ( x , "http://%s:%li/get?q=" , iptoa ( ip ) , port );
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80)
		sprintf(x,"http://%s/get?q=",iptoa(ip));
	else
		sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	long elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	//sprintf ( x, "&seq=%li&rtq=%lid=%lli",
	//	(long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId());
	sprintf ( x, "&d=%lli",st->m_docId );
	x += gbstrlen(x);
	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );
	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q , // content being highlighted, utf8
		 qlen , // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true , // computeIds
		 false ); // hasHtmlEntities?
	// . assign scores of 0 to query words that should be ignored
	// . TRICKY: loop over words in qq.m_qwords, but they should be 1-1
	//   with words in qw.
	// . sanity check
	//if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;}
	// declare up here
	Matches m;
	// do the loop
	//Scores ss;
	//ss.set ( &qw , NULL );
	//for ( long i = 0 ; i < qq.m_numWords ; i++ )
	//	if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	//m.addMatches ( &qw , &ss , true );
	m.addMatches ( &qw );
	long hilen = 0;
	// CNS: if ( ! st->m_clickNScroll ) {
	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( //p       ,
				 //avail   ,
				 sb ,
				 &qw , // words to highlight
				 &m , // matches relative to qw
				 false , // doSteming
				 false , // st->m_clickAndScroll ,
				 (char *)thisUrl );// base url for ClcknScrll
		//p += hilen;
		// now an hr
		//memcpy ( p , "</span></table></table>\n" , 24 ); p += 24;
		sb->safeStrcpy("</span></table></table>\n");
	}
	bool includeHeader = st->m_includeHeader;
	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON ) includeHeader = false;
	if ( format == FORMAT_XML ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;
	//mfree(uq, uqCapacity, "PageGet");
	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2;
		else sb->m_length=startLen1;//p=start1;
	}
	//sb->safeStrcpy(tbuf);
	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
			       lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}
	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}
	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd = NULL;
	// Scan the buffer for the first <title>...</title> pair (case
	// insensitive), capping the search at 500 bytes past the open tag.
	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd = e;
		}
		break;
	}
	// . print title at top!
	// . consider moving
	if ( titleStart ) {
		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";
		//p += sprintf ( p ,
		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );
		long printLinks = st->m_r.getLong("links",0);
		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(//p += sprintf ( p ,
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       //"<center>"
				       " "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       //"</center>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );
		if ( printLinks ) {
			sb->safePrintf(//p += sprintf ( p ,
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       " "
				       "<b>PAGE TITLE:</b> "
				       );
			long tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}
		sb->safePrintf( "</table><br>\n" );
	}
	// is the content preformatted?
	bool pre = false;
	char ctype = (char)xd->m_contentType;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC ) pre = true ; // filtered msword
	if ( ctype == CT_PS ) pre = true ; // filtered postscript
	if ( format == FORMAT_XML ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;
	// if it is content-type text, add a <pre>
	if ( pre ) {//p + 5 < bufEnd && pre ) {
		sb->safePrintf("<pre>");
		//p += 5;
	}
	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen,
					(long)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, line OOM
	if ( contentLen == -1 ) {
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
		return sendErrorReply ( st , g_errno );
	}
	Xml xml;
	Words ww;
	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;
	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON )
		queryHighlighting = false;
	// XML/JSON output goes through a temp buf so the content can be
	// encoded before being appended to sb.
	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;
	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		//p += contentLen ;
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		//Words *ww = xd->getWords();
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// sanity check
		//if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
		// how much space left in p?
		//avail = bufEnd - p;
		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb , // p , avail ,
				 &ww , &m ,
				 false /*doStemming?*/ ,
				 st->m_clickAndScroll ,
				 thisUrl /*base url for click & scroll*/);
		//p += hilen;
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}
	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}
	if ( format == FORMAT_JSON ) {
		sb->safePrintf("\t\"content\":\"\n");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}
	// if it is content-type text, add a </pre>
	if ( pre ) { // p + 6 < bufEnd && pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
		//p += 6;
	}
	// calculate bufLen
	//long bufLen = p - buf;
	long ct = xd->m_contentType;
	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;
	if ( ct == CT_XML ) {
		// encode the xml tags into <tagname> sequences
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// free out buffer that we alloc'd before returning since this
		// should have copied it into another buffer
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
		// reassign
		//buf = newbuf.getBufStart();
		//bufLen = newbuf.length();
		sb->stealBuf ( &newbuf );
	}
	// now encapsulate it in html head/tail and send it off
	// sendErr:
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";
	if ( xd->m_contentType == CT_JSON )
		contentType = "application/json";
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";
	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    //buf,bufLen,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						    -1, NULL, "utf8" );
	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);
	// free out buffer that we alloc'd before returning since this
	// should have copied it into another buffer
	//if ( ct == CT_XML ) newbuf.purge();
	//else if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
	// and convey the status
	return status;
}
// . so now this adds a list of Synonyms to the m_pools[] and returns a ptr
//   to the first one.
// . then the parent caller can store that ptr in the m_wordToSyn[] array
//   which we pre-alloc upon calling the set() function based on the # of
//   words we got
// . returns # of synonyms stored into "tmpBuf"
// . "tmpBuf" is carved up below into parallel per-synonym arrays (ids,
//   term ptrs/lens, sources, ...), each sized for MAX_SYNS entries --
//   NOTE(review): caller presumably supplies at least TMPSYNBUFSIZE
//   bytes; confirm against callers
long Synonyms::getSynonyms ( Words *words ,
			     long wordNum ,
			     uint8_t langId ,
			     char *tmpBuf ,
			     long niceness ) {

	// punct words have no synoyms
	if ( ! words->m_wordIds[wordNum] ) return 0;

	// store these
	m_words     = words;
	m_docLangId = langId;
	m_niceness  = niceness;

	// sanity check
	if ( wordNum > m_words->m_numWords ) { char *xx=NULL;*xx=0; }

	// init the dedup table to dedup wordIds
	HashTableX dt;
	char dbuf[512];
	dt.set(8,0,12,dbuf,512,false,m_niceness,"altwrds");

	long maxSyns = (long)MAX_SYNS;

	// carve "tmpBuf" up into the parallel output arrays; the cursor
	// pointers below (m_aidsPtr etc.) advance in lockstep, one slot
	// per stored synonym
	char *bufPtr = tmpBuf;
	// point into buffer
	m_aids = (long long *)bufPtr;
	bufPtr += maxSyns * 8;
	// then the word ids
	m_wids0 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;
	// second word ids, for multi alnum word synonyms, i.e. "New Jersey"
	m_wids1 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;
	m_termPtrs = (char **)bufPtr;
	bufPtr += maxSyns * 4;
	m_termLens = (long *)bufPtr;
	bufPtr += maxSyns * 4;
	m_numAlnumWords = (long *)bufPtr;
	bufPtr += maxSyns * 4;
	m_numAlnumWordsInBase = (long *)bufPtr;
	bufPtr += maxSyns * 4;
	// source
	m_src = bufPtr;
	bufPtr += maxSyns;

	// cursors
	m_aidsPtr                = m_aids;
	m_wids0Ptr               = m_wids0;
	m_wids1Ptr               = m_wids1;
	m_srcPtr                 = m_src;
	m_termPtrsPtr            = m_termPtrs;
	m_termLensPtr            = m_termLens;
	m_numAlnumWordsPtr       = m_numAlnumWords;
	m_numAlnumWordsInBasePtr = m_numAlnumWordsInBase;

	char *w    = m_words->m_words   [wordNum];
	long  wlen = m_words->m_wordLens[wordNum];

	//
	// NOW hit wiktionary
	// Trust this less then our s_exceptions above, but more than
	// our morph computations below
	//

	char sourceId = SOURCE_WIKTIONARY;
	char *ss = NULL;
	long long bwid;
	char wikiLangId = m_docLangId;
	bool hadSpace ;
	long klen ;
	long baseNumAlnumWords;

	// we jump back here to retry the lookup in english when the
	// doc-language lookup found nothing (see below)
 tryOtherLang:

	/*
	// if word only exists in one language, assume that language for word
	// even if m_docLangId is langUnknown (0)
	if ( ! ss && ! m_docLangId && ! wikiLangId ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		// each lang has its own bit
		long long bits = g_speller.getLangBits64 ( &bwid );
		// skip if not unique
		char count = getNumBitsOn64 ( bits ) ;
		// if we only got one lang we could be, assume that
		if ( count == 1 )
			// get it. bit #0 is english, so add 1
			wikiLangId = getBitPosLL((uint8_t *)&bits) + 1;
		// try setting based on script. greek. russian. etc.
		// if the word was not in the wiktionary.
		// this will be langUnknown if not definitive.
		else
			wikiLangId = getCharacterLanguage(w);
	}
	*/

	// try looking up bigram so "new jersey" gets "nj" as synonym
	// (wordNum+2 skips the punctuation "word" between alnum words)
	if ( wikiLangId &&
	     wordNum+2< m_words->m_numWords &&
	     m_words->m_wordIds[wordNum+2]) {
		// get phrase id bigram then
		long conti = 0;
		bwid = hash64Lower_utf8_cont(w,wlen,0,&conti);
		// then the next word
		char *wp2  = m_words->m_words[wordNum+2];
		long  wlen2 = m_words->m_wordLens[wordNum+2];
		bwid = hash64Lower_utf8_cont(wp2,wlen2,bwid,&conti);
		baseNumAlnumWords = 2;
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
	}

	// need a language for wiktionary to work with.
	// bigram missed (or not tried): fall back to the single word
	if ( wikiLangId && ! ss ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		baseNumAlnumWords = 1;
		//if ( bwid == 1424622907102375150LL)
		//	log("a");
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
		// if that failed try removing 's from word if there
		if ( ! ss &&
		     wlen >= 3 &&
		     w[wlen-2]=='\'' &&
		     w[wlen-1]=='s' ) {
			long long cwid = hash64Lower_utf8(w,wlen-2);
			ss = g_wiktionary.getSynSet( cwid, wikiLangId );
		}
	}

	// even though a document may be in german it often has some
	// english words "pdf download" "copyright" etc. so if the word
	// has no synset in german, try it in english
	if ( //numPresets == 0 &&
	     ! ss &&
	     m_docLangId != langEnglish &&
	     wikiLangId  != langEnglish &&
	     m_docLangId &&
	     g_speller.getSynsInEnglish(w,wlen,m_docLangId,langEnglish) ) {
		// try english
		wikiLangId = langEnglish;
		sourceId   = SOURCE_WIKTIONARY_EN;
		goto tryOtherLang;
	}

	// if it was in wiktionary, just use that synset
	if ( ss ) {
		// prepare the dedup table; only initialized below if the
		// word actually has more than one synset
		HashTableX dedup;
		HashTableX *dd = NULL;
		char dbuf[512];
		long count = 0;
	addSynSet:
		// do we have another set following this
		char *next = g_wiktionary.getNextSynSet(bwid,m_docLangId,ss);
		// if so, init the dedup table then
		if ( next && ! dd ) {
			dd = &dedup;
			dd->set ( 8,0,8,dbuf,512,false,m_niceness,"sddbuf");
		}
		// skip over the pipe i guess
		char *pipe = ss + 2;
		// zh_ch?
		if ( *pipe == '_' ) pipe += 3;
		// sanity
		if ( *pipe != '|' ) { char *xx=NULL;*xx=0; }
		// point to word list
		char *p = pipe + 1;
		// hash up the list of words, they are in utf8 and
		char *e = p + 1;
		// save count in case we need to undo
		//long saved = m_numAlts[wordNum];
	hashLoop:
		// skip synonyms that are anagrams because its to ambiguous
		// the are mappings like
		// "PC" -> "PC,Personal Computer"
		// "PC" -> "PC,Probable Cause" ... (lots more!)
		//bool isAnagram = true;
		// advance "e" to the comma/newline terminating this term
		for ( ; *e !='\n' && *e != ',' ; e++ ) ;
		//	if ( ! is_upper_a(*e) ) isAnagram = false;
		// get it
		long long h = hash64Lower_utf8_nospaces ( p , e - p );
		// skip if same as base word
		if ( h == bwid ) goto getNextSyn;
		// should we check for dups?
		if ( dd ) {
			// skip dups
			if ( dd->isInTable(&h) ) goto getNextSyn;
			// dedup. return false with g_errno set on error
			if ( ! dd->addKey(&h) ) return m_aidsPtr - m_aids;
		}
		// store it
		*m_aidsPtr++ = h;
		// store source
		*m_srcPtr++ = sourceId;
		// does this synonym itself contain whitespace, i.e. is it
		// a multi-word term like "New Jersey"?
		hadSpace = false;
		klen = e - p;
		for ( long k = 0 ; k < klen ; k++ )
			if ( is_wspace_a(p[k]) ) hadSpace = true;
		*m_termPtrsPtr++ = p;
		*m_termLensPtr++ = e-p;
		// only for multi-word synonyms like "New Jersey"...
		*m_wids0Ptr = 0LL;
		*m_wids1Ptr = 0LL;
		*m_numAlnumWordsPtr = 1;
		// and for multi alnum word synonyms
		if ( hadSpace ) {
			Words sw;
			sw.setx ( p , e - p , m_niceness );
			// word 0 and word 2 (word 1 is the space between)
			*(long long *)m_wids0Ptr = sw.m_wordIds[0];
			*(long long *)m_wids1Ptr = sw.m_wordIds[2];
			*(long *)m_numAlnumWordsPtr = sw.getNumAlnumWords();
		}
		m_wids0Ptr++;
		m_wids1Ptr++;
		m_numAlnumWordsPtr++;
		// how many words did we have to hash to find a synset?
		// i.e. "new jersey" would be 2, to get "nj"
		*m_numAlnumWordsInBasePtr++ = baseNumAlnumWords;
		// do not breach
		if ( ++count >= maxSyns ) goto done;
	getNextSyn:
		// loop for more
		if ( *e == ',' ) { e++; p = e; goto hashLoop; }
		// add in the next syn set, deduped
		if ( next ) { ss = next; goto addSynSet; }
		// wrap it up
	done:
		// all done
		return m_aidsPtr - m_aids;
	}

	// no wiktionary hit: fall back to morphological variants.
	// strip marks from THIS word, return -1 w/ g_errno set on error
	if ( ! addStripped ( w , wlen,&dt ) ) return m_aidsPtr - m_aids;

	// returns false with g_errno set
	if ( ! addAmpPhrase ( wordNum, &dt ) ) return m_aidsPtr - m_aids;

	// if we end in apostrophe, strip and add
	if ( wlen>= 3 &&
	     w[wlen-1] == 's' &&
	     w[wlen-2]=='\'' &&
	     ! addWithoutApostrophe ( wordNum, &dt ) )
		return m_aidsPtr - m_aids;

	return m_aidsPtr - m_aids;
}
// Parse a contributors/submissions text file and tally the number of
// contributions per contributor name in "names".
// @param names  map of NamePair -> contribution count, updated in place
// @param file   path of the text file to scan
void readContributors(NameMap& names, const string& file)
{
    osgDB::ifstream fin(file.c_str());

    // Tokenize the file into whitespace-separated words.
    // BUGFIX: test the extraction itself rather than the stream state
    // before reading -- the old loop ("while(fin) { fin >> keyword;
    // push_back; }") appended one final empty token after the stream
    // hit EOF, because the failed extraction still ran push_back.
    Words words;
    string keyword;
    while (fin >> keyword) words.push_back(keyword);

    string blank_string;
    for (unsigned int i = 0; i < words.size(); ++i)
    {
        if (submissionsSequence(words, i))
        {
            // A submissions marker: the next one or two tokens are a
            // contributor's first (and optionally last) name.
            if (i + 2 < words.size() && validName(words[i + 1]))
            {
                NamePair name = createName(words[i + 1], words[i + 2]);
                nameCorrection(name);
                if (!name.first.empty()) ++names[name];
                i += 2;
            }
            else if (i + 1 < words.size() && validName(words[i + 1]))
            {
                NamePair name = createName(words[i + 1], blank_string);
                nameCorrection(name);
                if (!name.first.empty()) ++names[name];
                i += 1;
            }
        }
        else
        {
            // Bare first-name mentions are credited to the core
            // maintainers.
            if (words[i] == "robert")
            {
                ++names[NameRobertOsfield];
            }
            else if (words[i] == "don")
            {
                ++names[NameDonBurns];
            }
        }
    }

    // reassign first name entries to their full names entries:
    // map ordering places a ("X","") entry immediately before any
    // ("X","Y") entry, so merge the count into the following entry
    // and erase the first-name-only one.
    if (names.size() > 1)
    {
        for (NameMap::iterator itr = names.begin(); itr != names.end(); )
        {
            if (itr->first.second.empty())
            {
                NameMap::iterator next_itr = itr;
                ++next_itr;
                if (next_itr != names.end() && itr->first.first == next_itr->first.first)
                {
                    next_itr->second += itr->second;
                    // std::map::erase only invalidates the erased
                    // iterator; next_itr stays valid.
                    names.erase(itr);
                    itr = next_itr;
                }
                else
                {
                    ++itr;
                }
            }
            else
            {
                ++itr;
            }
        }
    }

    // remove the double entries from Robert's contributions
    if (names.size() > 1)
    {
        for (NameMap::iterator itr = names.begin(); itr != names.end(); ++itr)
        {
            if (itr->first != NameRobertOsfield && itr->first != NameDonBurns)
            {
                names[NameRobertOsfield] -= itr->second;
            }
        }
    }
}
// langId is language of the query
// . returns an order-independent (XOR-combined) hash of the query's
//   non-stopword terms, each term represented by the minimum word id
//   among itself and all of its synonyms, so synonymous/reordered
//   phrasings of the same query hash to the same value
// . NOTE(review): on hash-table allocation failure this returns
//   false (0), which is indistinguishable from a legitimate 0 hash
long long getSynBaseHash64 ( char *qstr , uint8_t langId ) {
	Words ww;
	ww.set3 ( qstr );
	long nw = ww.getNumWords();
	long long *wids = ww.getWordIds();
	//char **wptrs = ww.getWords();
	//long *wlens = ww.getWordLens();
	long long baseHash64 = 0LL;
	Synonyms syn;
	// assume english if unknown to fix 'pandora's tower'
	// vs 'pandoras tower' where both words are in both
	// english and german so langid is unknown
	if ( langId == langUnknown ) langId = langEnglish;
	// . store re-written query into here then hash that string
	// . this way we can get rid of spaces
	//char rebuf[1024];
	//char *p = rebuf;
	//if ( strstr(qstr,"cheatcodes") )
	//	log("hey");
	// for deduping
	HashTableX dups;
	if ( ! dups.set ( 8,0,1024,NULL,0,false,0,"qhddup") ) return false;
	// scan the words
	for ( long i = 0 ; i < nw ; i++ ) {
		// skip if not alnum
		if ( ! wids[i] ) continue;
		// get its synonyms into tmpBuf
		char tmpBuf[TMPSYNBUFSIZE];
		// . assume niceness of 0 for now
		// . make sure to get all synsets!! ('love' has two synsets)
		long naids = syn.getSynonyms (&ww,i,langId,tmpBuf,0);
		// term freq algo
		//long pop = g_speller.getPhrasePopularity(NULL,
		//					 wids[i],
		//					 true,
		//					 langId);
		// is it a queryStopWord like "the" or "and"?
		bool isQueryStop = ::isQueryStopWord(NULL,0,wids[i]);
		// a more restrictive list
		bool isStop = ::isStopWord(NULL,0,wids[i]);
		if ( ::isCommonQueryWordInEnglish(wids[i]) ) isStop = true;
		// find the smallest one
		unsigned long long min = wids[i];
		//char *minWordPtr = wptrs[i];
		//long minWordLen = wlens[i];
		// declare up here since we have a goto below
		long j;
		// add to table too; if the base word id was already seen
		// (directly or as some earlier word's synonym) treat this
		// word as a repeat
		if ( dups.isInTable ( &min ) ) goto gotdup;
		// add to it
		if ( ! dups.addKey ( &min ) ) return false;
		// now scan the synonyms, they do not include "min" in them
		for ( j = 0 ; j < naids ; j++ ) {
			// get it
			unsigned long long aid64;
			aid64 = (unsigned long long)syn.m_aids[j];
			// if any syn already hashed then skip it and count
			// as a repeated term. we have to do it this way
			// rather than just getting the minimum synonym
			// word id, because 'love' has two synsets and
			// 'like', a synonym of 'love' only has one synset
			// and they end up having different minimum synonym
			// word ids!!!
			if ( dups.isInTable ( &aid64 ) ) break;
			// add it. this could fail!
			if ( ! dups.addKey ( &aid64 ) ) return false;
			// set it?
			if ( aid64 >= min ) continue;
			// got a new min
			min = aid64;
			//minWordPtr = syn.m_termPtrs[j];
			//minWordLen = syn.m_termLens[j];
			// get largest term freq of all synonyms
			//long pop2 = g_speller.getPhrasePopularity(NULL,aid64,
			//					  true,langId);
			//if ( pop2 > pop ) pop = pop2;
		}
		// early break out means a hit in dups table
		if ( j < naids ) {
		gotdup:
			// do not count as repeat if query stop word
			// because they often repeat
			if ( isQueryStop ) continue;
			// count # of repeated word forms
			//nrwf++;
			continue;
		}
		// hash that now
		// do not include stop words in synbasehash so
		// 'search the web' != 'search web'
		if ( ! isStop ) {
			// no! make it order independent so 'search the web'
			// equals 'web the search' and 'engine search'
			// equals 'search engine'
			//baseHash64 <<= 1LL;
			baseHash64 ^= min;
		}
		// count it, but only if not a query stop word like "and"
		// or "the" or "a". # of unique word forms.
		//if ( ! isQueryStop ) nuwf++;
		// get term freq
		//if ( pop > maxPop ) maxPop = pop;
		// control word?
		//if ( wids[i] == cw1 ) ncwf++;
	}
	return baseHash64;
}