/**
 * Evaluates the weighted kernel expression relating one sample to a cluster:
 *
 *   K(s, s) - 2 * sum_j w_j K(s, j) / W + sum_i sum_j w_i w_j K(i, j) / W^2
 *
 * where i and j range over the ids in clusterIdSet, w is read from
 * mWeightArray, K is the functor behind mpKernel and W is the sum of w_j.
 *
 * @param sampleId      id of the sample being measured against the cluster
 * @param clusterIdSet  ids of the samples that make up the cluster
 * @return the value of the expression above, or FLT_MAX when the cluster is
 *         empty or its total weight is exactly zero
 */
float KernelKmeansClusterer::distanceToCluster(int sampleId,
		const boost::unordered_set<int>& clusterIdSet) {
	typedef boost::unordered_set<int>::const_iterator IdIterator;

	// Nothing to measure against: report the largest representable distance.
	if (clusterIdSet.empty()) {
		return FLT_MAX;
	}

	const IdIterator idEnd = clusterIdSet.end();

	float totalWeight = 0.0f;          // W: sum of member weights
	float weightedSampleKernel = 0.0f; // sum_j w_j K(sample, j)
	float weightedPairKernel = 0.0f;   // sum_i sum_j w_i w_j K(i, j)

	for (IdIterator outer = clusterIdSet.begin(); outer != idEnd; ++outer) {
		const int memberId = *outer;
		const float memberWeight = mWeightArray[memberId];
		totalWeight += memberWeight;

		// Kernel values are narrowed to float before being accumulated.
		const float sampleKernel = (*mpKernel)(sampleId, memberId);
		weightedSampleKernel += memberWeight * sampleKernel;

		// Pair term: this member against every member (itself included).
		for (IdIterator inner = clusterIdSet.begin(); inner != idEnd;
				++inner) {
			const int otherId = *inner;
			const float pairKernel = (*mpKernel)(memberId, otherId);
			weightedPairKernel += memberWeight * mWeightArray[otherId]
					* pairKernel;
		}
	}

	// The self kernel is evaluated before the zero-weight check on purpose,
	// so the kernel functor sees the same sequence of calls in every case.
	const float selfKernel = (*mpKernel)(sampleId, sampleId);

	// A cluster with no weight cannot be normalised; treat it as unreachable.
	if (totalWeight == 0.0f) {
		return FLT_MAX;
	}

	return selfKernel - 2 * weightedSampleKernel / totalWeight
			+ weightedPairKernel / (totalWeight * totalWeight);
}
Exemple #2
0
/**
 * Collects fixed-length "shortcut" keys from a string.
 *
 * The string is split into segments at every '*' or '^'. Within each segment
 * that starts early enough (see the scan limit below), every substring of
 * RULE_KEY_HASH_LENGTH characters is appended to vs, except substrings listed
 * in the commonFilters set (fragments of "http://www.").
 *
 * @param str  text to extract keys from
 * @param vs   output; accepted keys are appended, existing content is kept
 * @return true if at least one key was appended to vs, false otherwise
 *
 * NOTE(review): commonFilters is filled lazily without synchronisation;
 * confirm callers do not invoke this concurrently on first use.
 */
static bool collectShortcuts(const std::string & str, StringVector & vs) 
{
	// Keys that are never accepted; populated once on the first call.
	static boost::unordered_set<std::string> commonFilters;
	if(commonFilters.empty()) {
#if RULE_KEY_HASH_LENGTH==7 // 7
		commonFilters.insert("http://");
		commonFilters.insert("ttp://w");
		commonFilters.insert("tp://ww");
		commonFilters.insert("p://www");
		commonFilters.insert("://www.");
#elif RULE_KEY_HASH_LENGTH==8 // 8
		commonFilters.insert("http://w");
		commonFilters.insert("ttp://ww");
		commonFilters.insert("tp://www");
		commonFilters.insert("p://www.");
#elif RULE_KEY_HASH_LENGTH==9 // 9
		commonFilters.insert("http://ww");
		commonFilters.insert("ttp://www");
		commonFilters.insert("tp://www.");
#endif
	}

	typedef std::string::size_type size_type;

	const size_type keyLength = static_cast<size_type>(RULE_KEY_HASH_LENGTH);
	const size_type length = str.length();

	// Guard the unsigned subtraction below: for strings no longer than one
	// key, "length - keyLength" would be zero or wrap around to a huge value.
	// No key can be collected from such a string under the scan limit, so
	// return early instead of relying on wraparound.
	if (length <= keyLength) {
		return false;
	}

	// Segments are only considered when they start before this offset.
	const size_type maxSegmentStart = 80;
	// NOTE(review): the limit is exclusive, so a segment starting exactly at
	// length - keyLength (room for one key) is never scanned. This matches
	// the previous behaviour; confirm whether "<=" was intended.
	const size_type scanLimit = abpmin(length - keyLength, maxSegmentStart);

	bool foundShortcut = false;
	size_type segmentStart = 0;
	while (segmentStart < scanLimit) 
	{
		// A segment runs up to the next wildcard/separator or the string end.
		size_type segmentEnd = str.find_first_of("*^", segmentStart);
		if (segmentEnd == std::string::npos) {
			segmentEnd = length;
		}

		// Slide a keyLength-wide window over the segment. segmentEnd >= k
		// holds throughout, so the unsigned difference cannot wrap.
		for (size_type k = segmentStart; segmentEnd - k >= keyLength; ++k) 
		{
			const std::string key = str.substr(k, keyLength);
			if (commonFilters.find(key) != commonFilters.end())
				continue; // too common to be a useful key
			foundShortcut = true;
			vs.push_back(key);
		}

		// Resume just past the character that ended this segment.
		segmentStart = segmentEnd + 1;
	}
	return foundShortcut;
}