/**
 * Computes the weighted kernel-space distance between one sample and a cluster.
 *
 * With K the kernel functor (*mpKernel), w the entries of mWeightArray and
 * C the ids in clusterIdSet, the returned value is
 *
 *   K(s, s) - 2 * sum_{i in C} w_i K(s, i) / W
 *           + sum_{i in C} sum_{j in C} w_i w_j K(i, j) / (W * W)
 *
 * where s is sampleId and W = sum_{i in C} w_i.
 *
 * The kernel is evaluated once per member against the sample and once per
 * ordered pair of members, so the number of kernel evaluations grows
 * quadratically with the cluster size.
 *
 * @param sampleId     id of the sample whose distance is wanted
 * @param clusterIdSet ids of the samples that make up the cluster
 * @return the value of the expression above, or FLT_MAX when the cluster is
 *         empty or its weights sum to exactly zero
 */
float KernelKmeansClusterer::distanceToCluster(int sampleId, const boost::unordered_set<int>& clusterIdSet) {
    typedef boost::unordered_set<int>::const_iterator IdIter;

    if (clusterIdSet.empty()) {
        return FLT_MAX;
    }

    const IdIter first = clusterIdSet.begin();
    const IdIter last = clusterIdSet.end();

    float totalWeight = 0.0f;        // W: sum of the member weights
    float sampleKernelTotal = 0.0f;  // sum of w_i * K(s, i)
    float pairKernelTotal = 0.0f;    // sum of w_i * w_j * K(i, j)

    for (IdIter outer = first; outer != last; ++outer) {
        const int memberId = *outer;
        const float memberWeight = mWeightArray[memberId];
        totalWeight += memberWeight;

        const float kernelToSample = (*mpKernel)(sampleId, memberId);
        sampleKernelTotal += memberWeight * kernelToSample;

        for (IdIter inner = first; inner != last; ++inner) {
            const int otherId = *inner;
            const float kernelBetween = (*mpKernel)(memberId, otherId);
            pairKernelTotal += memberWeight * mWeightArray[otherId] * kernelBetween;
        }
    }

    const float selfKernel = (*mpKernel)(sampleId, sampleId);

    // A zero total weight would make both divisions below meaningless.
    if (totalWeight == 0.0f) {
        return FLT_MAX;
    }

    return selfKernel - 2 * sampleKernelTotal / totalWeight + pairKernelTotal / (totalWeight * totalWeight);
}
static bool collectShortcuts(const std::string & str, StringVector & vs) { static boost::unordered_set<std::string> commonFilters; if(commonFilters.empty()) { #if RULE_KEY_HASH_LENGTH==7 // 7 commonFilters.insert("http://"); commonFilters.insert("ttp://w"); commonFilters.insert("tp://ww"); commonFilters.insert("p://www"); commonFilters.insert("://www."); #elif RULE_KEY_HASH_LENGTH==8 // 8 commonFilters.insert("http://w"); commonFilters.insert("ttp://ww"); commonFilters.insert("tp://www"); commonFilters.insert("p://www."); #elif RULE_KEY_HASH_LENGTH==9 // 9 commonFilters.insert("http://ww"); commonFilters.insert("ttp://www"); commonFilters.insert("tp://www."); #endif } int i = 0; bool isFindShoutcut = false; while (i < abpmin(str.length() - RULE_KEY_HASH_LENGTH,80)) { unsigned int j = i; for (; j < str.length(); j++) { if ((str[j] == '*' || str[j] == '^')) { break; } } for (unsigned int k = i; j - k >= RULE_KEY_HASH_LENGTH; k++) { std::string key = str.substr(k, RULE_KEY_HASH_LENGTH); if(commonFilters.find(key)!=commonFilters.end()) continue; isFindShoutcut = true; vs.push_back(key); //append(key); } i = j + 1; } return isFindShoutcut; }