// REVIEW: the poolSize can be pulled from the numeric array RDKit::INT_VECT MaxMinPicks(MaxMinPicker *picker, python::object distMat, int poolSize, int pickSize, python::object firstPicks, int seed) { if (pickSize >= poolSize) { throw ValueErrorException("pickSize must be less than poolSize"); } if (!PyArray_Check(distMat.ptr())) { throw ValueErrorException("distance mat argument must be a numpy matrix"); } PyArrayObject *copy; copy = (PyArrayObject *)PyArray_ContiguousFromObject(distMat.ptr(), PyArray_DOUBLE, 1, 1); double *dMat = (double *)copy->data; RDKit::INT_VECT firstPickVect; for (unsigned int i = 0; i < python::extract<unsigned int>(firstPicks.attr("__len__")()); ++i) { firstPickVect.push_back(python::extract<int>(firstPicks[i])); } RDKit::INT_VECT res = picker->pick(dMat, poolSize, pickSize, firstPickVect, seed); Py_DECREF(copy); return res; }
// Clusters the pool into pickSize clusters (via this->cluster()) and returns
// one representative per cluster.
//
// The representative of a cluster is the member with the smallest sum of
// squared distances (looked up with getDistFromLTM) to the other members of
// the same cluster; on ties the member that appears first in the cluster wins.
//
//  - distMat:  distance matrix, must be non-null
//  - poolSize: forwarded to cluster()
//  - pickSize: number of clusters / picks requested
//
// Returns a vector with one entry per cluster, in cluster order.
RDKit::INT_VECT HierarchicalClusterPicker::pick(const double *distMat,
                                                unsigned int poolSize,
                                                unsigned int pickSize) const {
  PRECONDITION(distMat, "bad distance matrix");
  RDKit::VECT_INT_VECT clusters = this->cluster(distMat, poolSize, pickSize);
  CHECK_INVARIANT(clusters.size() == pickSize, "");

  // the last step: find a representative element from each of the
  // remaining clusters
  RDKit::INT_VECT picks;
  picks.reserve(pickSize);
  for (unsigned int i = 0; i < pickSize; ++i) {
    // Start from a defined value: the first member of the cluster, or -1 for
    // an empty cluster. Previously this was left uninitialized, so an
    // indeterminate value was returned whenever no candidate improved on
    // minSumD2 (e.g. an empty cluster or NaN distances).
    int pick = clusters[i].empty() ? -1 : *(clusters[i].begin());
    double minSumD2 = RDKit::MAX_DOUBLE;
    for (RDKit::INT_VECT_CI cxi1 = clusters[i].begin();
         cxi1 != clusters[i].end(); ++cxi1) {
      int curPick = (*cxi1);
      // sum of squared distances from curPick to every other cluster member
      double d2sum = 0.0;
      for (RDKit::INT_VECT_CI cxi2 = clusters[i].begin();
           cxi2 != clusters[i].end(); ++cxi2) {
        if (cxi1 == cxi2) {
          continue;
        }
        double d = getDistFromLTM(distMat, curPick, (*cxi2));
        d2sum += (d * d);
      }
      if (d2sum < minSumD2) {
        pick = curPick;
        minSumD2 = d2sum;
      }
    }
    picks.push_back(pick);
  }
  return picks;
}
// Python-facing helper: copies the integers of the Python sequence maskBits
// into an INT_VECT and passes that vector to ranker->setMaskBits().
void SetMaskBits(InfoBitRanker *ranker, python::object maskBits) {
  PySequenceHolder<int> pySeq(maskBits);

  RDKit::INT_VECT maskIds;
  maskIds.reserve(pySeq.size());

  unsigned int idx = 0;
  while (idx < pySeq.size()) {
    maskIds.push_back(pySeq[idx]);
    ++idx;
  }

  ranker->setMaskBits(maskIds);
}
// Python-facing helper: copies the integers of the Python sequence bitList
// into an INT_VECT and passes that vector to cmGen->setBitIdList().
void setBitList(BitCorrMatGenerator *cmGen, python::object bitList) {
  PySequenceHolder<int> pySeq(bitList);
  unsigned int count = pySeq.size();

  RDKit::INT_VECT bitIds;
  bitIds.reserve(count);

  unsigned int pos = 0;
  while (pos < count) {
    bitIds.push_back(pySeq[pos]);
    ++pos;
  }

  cmGen->setBitIdList(bitIds);
}
// Python-facing wrapper around MaxMinPicker::lazyPick() that uses a Python
// callable as the distance function.
//
//  - picker:     the picker instance the call is forwarded to
//  - distFunc:   Python object wrapped in a pyobjFunctor
//  - poolSize:   forwarded to lazyPick() unchanged
//  - pickSize:   forwarded to lazyPick() unchanged
//  - firstPicks: Python sequence of ints, copied into an INT_VECT
//  - seed:       forwarded to lazyPick() unchanged
//  - useCache:   forwarded to the pyobjFunctor constructor
//
// Returns whatever picker->lazyPick() returns.
RDKit::INT_VECT LazyMaxMinPicks(MaxMinPicker *picker, python::object distFunc,
                                int poolSize, int pickSize,
                                python::object firstPicks, int seed,
                                bool useCache) {
  // Query the Python-side length once; the original re-ran the __len__ call
  // and the extract on every loop iteration.
  const unsigned int nFirstPicks =
      python::extract<unsigned int>(firstPicks.attr("__len__")());
  RDKit::INT_VECT firstPickVect;
  firstPickVect.reserve(nFirstPicks);
  for (unsigned int i = 0; i < nFirstPicks; ++i) {
    firstPickVect.push_back(python::extract<int>(firstPicks[i]));
  }

  pyobjFunctor functor(distFunc, useCache);
  return picker->lazyPick(functor, poolSize, pickSize, firstPickVect, seed);
}
// Python-facing wrapper around MaxMinPicker::lazyPick() that computes
// distances through a pyBVFunctor built from objs and method.
//
//  - picker:     the picker instance the call is forwarded to
//  - objs:       Python object handed to the pyBVFunctor constructor
//  - poolSize:   forwarded to lazyPick() unchanged
//  - pickSize:   forwarded to lazyPick() unchanged
//  - firstPicks: Python sequence of ints, copied into an INT_VECT
//  - seed:       forwarded to lazyPick() unchanged
//  - method:     distance method handed to the pyBVFunctor constructor
//
// Returns whatever picker->lazyPick() returns.
RDKit::INT_VECT LazyVectorMaxMinPicks(MaxMinPicker *picker,
                                      python::object objs, int poolSize,
                                      int pickSize, python::object firstPicks,
                                      int seed, DistanceMethod method) {
  pyBVFunctor functor(objs, method);

  // Query the Python-side length once; the original re-ran the __len__ call
  // and the extract on every loop iteration.
  const unsigned int nFirstPicks =
      python::extract<unsigned int>(firstPicks.attr("__len__")());
  RDKit::INT_VECT firstPickVect;
  firstPickVect.reserve(nFirstPicks);
  for (unsigned int i = 0; i < nFirstPicks; ++i) {
    firstPickVect.push_back(python::extract<int>(firstPicks[i]));
  }

  return picker->lazyPick(functor, poolSize, pickSize, firstPickVect, seed);
}
// Python-facing wrapper around MaxMinPicker::lazyPick() for a sequence of
// ExplicitBitVect objects, using a pyBVFunctor constructed with TANIMOTO.
//
//  - picker:     the picker instance the call is forwarded to
//  - objs:       Python sequence; its first poolSize items are extracted as
//                const ExplicitBitVect pointers
//  - poolSize:   number of items read from objs; also forwarded to lazyPick()
//  - pickSize:   forwarded to lazyPick() unchanged
//  - firstPicks: Python sequence of ints, copied into an INT_VECT
//  - seed:       forwarded to lazyPick() unchanged
//  - useCache:   forwarded to the pyBVFunctor constructor
//
// Returns whatever picker->lazyPick() returns.
RDKit::INT_VECT LazyVectorMaxMinPicks(MaxMinPicker *picker,
                                      python::object objs, int poolSize,
                                      int pickSize, python::object firstPicks,
                                      int seed, bool useCache) {
  // NOTE(review): poolSize is trusted here; a negative value would be
  // converted to a huge vector size - confirm callers validate it.
  std::vector<const ExplicitBitVect *> bvs(poolSize);
  for (int i = 0; i < poolSize; ++i) {
    bvs[i] = python::extract<const ExplicitBitVect *>(objs[i]);
  }
  pyBVFunctor<ExplicitBitVect> functor(bvs, TANIMOTO, useCache);

  // Query the Python-side length once; the original re-ran the __len__ call
  // and the extract on every loop iteration.
  const unsigned int nFirstPicks =
      python::extract<unsigned int>(firstPicks.attr("__len__")());
  RDKit::INT_VECT firstPickVect;
  firstPickVect.reserve(nFirstPicks);
  for (unsigned int i = 0; i < nFirstPicks; ++i) {
    firstPickVect.push_back(python::extract<int>(firstPicks[i]));
  }

  return picker->lazyPick(functor, poolSize, pickSize, firstPickVect, seed);
}
// Runs the hierarchical clustering driver (distdriver_) on the pool and then
// replays its merge history until only pickSize clusters remain.
//
//  - distMat:  distance matrix handed to distdriver_, must be non-null
//  - poolSize: number of items in the pool
//  - pickSize: number of clusters wanted, must not exceed poolSize
//
// Returns the surviving clusters, each as a vector of pool indices, ordered
// by the index of the cluster they were merged into.
RDKit::VECT_INT_VECT HierarchicalClusterPicker::cluster(
    const double *distMat, unsigned int poolSize,
    unsigned int pickSize) const {
  PRECONDITION(distMat, "Invalid Distance Matrix");
  PRECONDITION((poolSize >= pickSize),
               "pickSize cannot be larger than the poolSize");

  // Do the clustering
  long int method = (long int)d_method;
  // Form the product in 64-bit arithmetic: the original multiplied two
  // unsigned ints, which wraps for large pools before the widening to long.
  long int len = static_cast<long int>(
      static_cast<unsigned long long>(poolSize) * (poolSize - 1));
  long int *ia = (long int *)calloc(poolSize, sizeof(long int));
  long int *ib = (long int *)calloc(poolSize, sizeof(long int));
  real *crit = (real *)calloc(poolSize, sizeof(real));
  CHECK_INVARIANT(ia, "failed to allocate memory");
  CHECK_INVARIANT(ib, "failed to allocate memory");
  CHECK_INVARIANT(crit, "failed to allocate memory");
  long int poolSize2 = static_cast<long int>(poolSize);

  distdriver_(&poolSize2,       // number of items in the pool
              &len,             // number of entries in the distance matrix
              (real *)distMat,  // distance matrix
              &method,          // the clustering method (ward, slink etc.)
              ia,               // int vector with clustering history
              ib,               // one more clustering history matrix
              crit  // presumably the height differences between merged
                    // clusters - not used below
              );

  // We have the clustering history; now merge clusters until the number of
  // clusters equals the number of picks we need.
  // A bit of explanation on the vectors "ia" and "ib":
  //  - we start with each item in the pool as an individual cluster
  //  - then we use the vectors ia and ib to merge them.
  // ia and ib provide the ids of the clusters that need to be merged;
  // it is assumed that when a cluster ia[j] is merged with ib[j],
  // ia[j] is replaced by the new cluster in the cluster list.
  RDKit::VECT_INT_VECT clusters;
  clusters.reserve(poolSize);
  for (unsigned int i = 0; i < poolSize; ++i) {
    RDKit::INT_VECT cls;
    cls.push_back(i);
    clusters.push_back(cls);
  }

  // do the merging, each round of this loop eliminates one cluster
  // NOTE(review): with pickSize == 0 this reads poolSize history entries;
  // confirm distdriver_ fills that many, otherwise the last ids are 0.
  RDKit::INT_VECT removed;
  removed.reserve(poolSize - pickSize);
  for (unsigned int i = 0; i < (poolSize - pickSize); ++i) {
    // the history ids are shifted down by one to index into clusters
    int cx1 = ia[i] - 1;
    int cx2 = ib[i] - 1;
    // add the items from cluster cx2 to cx1
    clusters[cx1].insert(clusters[cx1].end(), clusters[cx2].begin(),
                         clusters[cx2].end());
    // mark the second cluster as removed
    removed.push_back(cx2);
  }
  free(ia);
  free(ib);
  free(crit);

  // sort removed so that looping will be easier later
  std::sort(removed.begin(), removed.end());
  // some error checking here: uniquifying removed should not change it
  // REVIEW can we put this inside a #ifdef DEBUG?
  RDKit::INT_VECT_CI nEnd = std::unique(removed.begin(), removed.end());
  CHECK_INVARIANT(
      nEnd == removed.end(),
      "Somehow there are duplicates in the list of removed clusters");

  // Collect every cluster that was not merged into another one. "removed" is
  // sorted, so a single cursor j walks it in step with i. The bounds check on
  // j is required: the original read removed[j] past the end of the vector
  // (always when nothing was removed, i.e. poolSize == pickSize).
  RDKit::VECT_INT_VECT res;
  res.reserve(pickSize);
  unsigned int j = 0;
  for (unsigned int i = 0; i < poolSize; ++i) {
    if (j < removed.size() && static_cast<int>(i) == removed[j]) {
      ++j;
      continue;
    }
    res.push_back(clusters[i]);
  }
  return res;
}