// virtual GMatrix* GAgglomerativeTransducer::transduceInner(const GMatrix& features1, const GMatrix& labels1, const GMatrix& features2) { // Init the metric if(!m_pMetric) setMetric(new GRowDistance(), true); m_pMetric->init(&features1.relation(), false); // Make a dataset with all featuers GMatrix featuresAll(features1.relation().clone()); featuresAll.reserve(features1.rows() + features2.rows()); GReleaseDataHolder hFeaturesAll(&featuresAll); for(size_t i = 0; i < features1.rows(); i++) featuresAll.takeRow((double*)features1[i]); for(size_t i = 0; i < features2.rows(); i++) featuresAll.takeRow((double*)features2[i]); // Find enough neighbors to form a connected graph GNeighborGraph* pNF = NULL; size_t neighbors = 6; while(true) { GKdTree* pKdTree = new GKdTree(&featuresAll, neighbors, m_pMetric, false); pNF = new GNeighborGraph(pKdTree, true); pNF->fillCache(); if(pNF->isConnected()) break; if(neighbors + 1 >= featuresAll.rows()) { delete(pNF); throw Ex("internal problem--a graph with so many neighbors must be connected"); } neighbors = std::min((neighbors * 3) / 2, featuresAll.rows() - 1); } // Sort all the neighbors by their distances size_t count = featuresAll.rows() * neighbors; vector< std::pair<double,size_t> > distNeighs; distNeighs.resize(count); double* pDistances = pNF->squaredDistanceTable(); size_t* pRows = pNF->cache(); size_t index = 0; vector< std::pair<double,size_t> >::iterator it = distNeighs.begin(); for(size_t i = 0; i < count; i++) { if(*pRows < featuresAll.rows()) { it->first = *pDistances; it->second = i; it++; } else index--; pRows++; pDistances++; } std::sort(distNeighs.begin(), it); // Transduce GMatrix* pOut = new GMatrix(labels1.relation().clone()); Holder<GMatrix> hOut(pOut); pOut->newRows(features2.rows()); pOut->setAll(-1); size_t* pSiblings = new size_t[featuresAll.rows()]; // a cyclical linked list of each row in the cluster ArrayHolder<size_t> hSiblings(pSiblings); for(size_t lab = 0; lab < labels1.cols(); lab++) { // Assign each row to its own cluster GIndexVec::makeIndexVec(pSiblings, featuresAll.rows()); // init such that each row is in a cluster of 1 size_t missingLabels = features2.rows(); // Merge until we have the desired number of clusters pRows = pNF->cache(); for(vector< std::pair<double,size_t> >::iterator dn = distNeighs.begin(); dn != it; dn++) { // Get the next two closest points size_t a = dn->second / neighbors; size_t b = pRows[dn->second]; GAssert(a != b && a < featuresAll.rows() && b < featuresAll.rows()); int labelA = (a < features1.rows() ? (int)labels1[a][lab] : (int)pOut->row(a - features1.rows())[lab]); int labelB = (b < features1.rows() ? (int)labels1[b][lab] : (int)pOut->row(b - features1.rows())[lab]); // Merge the clusters if(labelA >= 0 && labelB >= 0) continue; // Both points are already labeled, so there is no point in merging their clusters if(labelA < 0 && labelB >= 0) // Make sure that if one of them has a valid label, it is point a { std::swap(a, b); std::swap(labelA, labelB); } if(labelA >= 0) { for(size_t i = pSiblings[b]; true; i = pSiblings[i]) // Label every row in cluster b { GAssert(i >= features1.rows()); GAssert(pOut->row(i - features1.rows())[lab] == (double)-1); pOut->row(i - features1.rows())[lab] = labelA; missingLabels--; if(i == b) break; } if(missingLabels <= 0) break; } std::swap(pSiblings[a], pSiblings[b]); // This line joins the cyclical linked lists into one big cycle } } return hOut.release(); }
// virtual GMatrix* GGraphCutTransducer::transduceInner(const GMatrix& features1, const GMatrix& labels1, const GMatrix& features2) { // Use k-NN to compute a distance metric with good scale factors for prediction GKNN knn; knn.setNeighborCount(m_neighborCount); //knn.setOptimizeScaleFactors(true); knn.train(features1, labels1); GRowDistanceScaled* pMetric = knn.metric(); // Merge the features into one dataset and build a kd-tree GMatrix both(features1.relation().clone()); GReleaseDataHolder hBoth(&both); both.reserve(features1.rows() + features2.rows()); for(size_t i = 0; i < features1.rows(); i++) both.takeRow((double*)features1[i]); for(size_t i = 0; i < features2.rows(); i++) both.takeRow((double*)features2[i]); GRowDistanceScaled metric2; GKdTree neighborFinder(&both, m_neighborCount, &metric2, false); GVec::copy(metric2.scaleFactors(), pMetric->scaleFactors(), features1.cols()); // Transduce GMatrix* pOut = new GMatrix(labels1.relation().clone()); Holder<GMatrix> hOut(pOut); pOut->newRows(features2.rows()); pOut->setAll(0); for(size_t lab = 0; lab < labels1.cols(); lab++) { // Use max-flow/min-cut graph-cut to separate out each label value int valueCount = (int)labels1.relation().valueCount(lab); for(int val = 1; val < valueCount; val++) { // Add neighborhood edges GGraphCut gc(features1.rows() + features2.rows() + 2); for(size_t i = 0; i < both.rows(); i++) { neighborFinder.neighbors(m_pNeighbors, m_pDistances, i); for(size_t j = 0; j < m_neighborCount; j++) { if(m_pNeighbors[j] >= both.rows()) continue; gc.addEdge(2 + i, 2 + m_pNeighbors[j], (float)(1.0 / std::max(sqrt(m_pDistances[j]), 1e-9))); // connect neighbors } } // Add source and sink edges for(size_t i = 0; i < features1.rows(); i++) { if((int)labels1[i][0] == val) gc.addEdge(0, 2 + i, 1e12f); // connect to source else gc.addEdge(1, 2 + i, 1e12f); // connect to sink } // Cut gc.cut(0, 1); // Label the unlabeled rows for(size_t i = 0; i < features2.rows(); i++) { if(gc.isSource(2 + features1.rows() + i)) pOut->row(i)[lab] = (double)val; } } } return hOut.release(); }