/*
 * Python wrapper for minibatch(): runs minibatch k-means over `data`,
 * updating `centroids` in place.
 *
 * Python signature: minibatch(data, centroids, n_samples, max_iter,
 *                             bic_ratio_termination, reassignment_ratio)
 *   data       -- contiguous (N, D) float64 ndarray
 *   centroids  -- contiguous (k, D) float64 ndarray, modified in place
 *
 * Returns a new reference to the (mutated) centroids array, or NULL with a
 * Python exception set when validation fails.
 */
static PyObject * py_minibatch(PyObject *Py_UNUSED(self), PyObject *args) {
    PyArrayObject *data;
    PyArrayObject *centroids;
    int n_samples, max_iter;
    double reassignment_ratio, bic_ratio_termination;

    /* BUG FIX: "&centroids" had been mangled into the cent-sign character
     * ("¢roids"), which does not compile. */
    if (!PyArg_ParseTuple(args, "OOiidd", &data, &centroids, &n_samples,
                          &max_iter, &bic_ratio_termination,
                          &reassignment_ratio)) {
        return NULL;
    }

    /* The require_* helpers are presumed to set an exception and bail out of
     * this function on failure -- TODO confirm they are early-return macros. */
    require_contiguous_ndarray(data, 2, NPY_DOUBLE, "double");
    require_contiguous_ndarray(centroids, 2, NPY_DOUBLE, "double");
    require_dimension_match(data, 1, centroids, 1, "features");

    const npy_intp N = PyArray_DIM(data, 0);      /* number of data points   */
    const npy_intp D = PyArray_DIM(data, 1);      /* feature dimensionality  */
    const npy_intp k = PyArray_DIM(centroids, 0); /* number of clusters      */

    require_positive_as_int(N, "samples");
    require_positive_as_int(D, "features");
    require_positive_as_int(k, "centroids");
    require_positive(max_iter, "allowed iterations");
    require_positive(n_samples, "requested samples");

    if (n_samples > (int)N) {
        PyErr_SetString(PyExc_RuntimeError, "more samples requested than data.");
        return NULL;
    }

    minibatch(PyArray_DATA(data), PyArray_DATA(centroids), n_samples, max_iter,
              bic_ratio_termination, reassignment_ratio,
              (int)k, (int)N, (int)D);

    /* The caller receives the same array object that was passed in, so give
     * it a fresh reference before returning. */
    Py_XINCREF(centroids);
    return (PyObject *)centroids;
}
// Reading minibatch. Minibatch SequencePacker::ReadMinibatch() { assert(m_streamBufferSizes.size() == m_streamBuffers.size()); const auto sequences = m_transformer->GetNextSequences(m_minibatchSize); Minibatch minibatch(sequences.m_endOfEpoch); if (sequences.m_data.empty()) { return minibatch; } // For each stream packing the minibatch. minibatch.m_data.reserve(sequences.m_data.size()); for (size_t streamIndex = 0; streamIndex < sequences.m_data.size(); ++streamIndex) { minibatch.m_data.push_back(PackStreamMinibatch(sequences.m_data[streamIndex], streamIndex)); } return minibatch; }
// Reads the next set of sequences and packs each stream (dense or sparse)
// into the current rotating buffer. Returns a Minibatch whose m_endOfEpoch
// flag mirrors the underlying reader; the minibatch is empty when no
// sequences were available.
Minibatch SequencePacker::ReadMinibatch()
{
    auto sequences = GetNextSequences();
    const auto& batch = sequences.m_data;

    Minibatch minibatch(sequences.m_endOfEpoch);
    if (batch.empty())
    {
        return minibatch;
    }

    auto& currentBuffer = m_streamBuffers[m_currentBufferIndex];

    assert(m_outputStreamDescriptions.size() == batch.size());
    // BUG FIX: loop index was a signed int compared against the unsigned
    // batch.size(); use size_t to avoid the signed/unsigned mismatch.
    for (size_t streamIndex = 0; streamIndex < batch.size(); ++streamIndex)
    {
        const auto& streamBatch = batch[streamIndex];
        if (m_checkSampleShape[streamIndex])
        {
            CheckSampleShape(streamBatch, m_outputStreamDescriptions[streamIndex]);
        }

        // Pack according to the stream's storage type; both packers fill the
        // current buffer and return the layout describing it.
        const auto& type = m_outputStreamDescriptions[streamIndex]->m_storageType;
        auto pMBLayout = (type == StorageType::dense)
            ? PackDenseStream(streamBatch, streamIndex)
            : PackSparseStream(streamBatch, streamIndex);

        auto& buffer = currentBuffer[streamIndex];

        auto streamMinibatch = std::make_shared<StreamMinibatch>();
        streamMinibatch->m_data = buffer.m_data.get();
        streamMinibatch->m_layout = pMBLayout;
        minibatch.m_data.push_back(streamMinibatch);
    }

    // Rotate to the next buffer so the data handed out above stays valid
    // while the following minibatch is being packed.
    m_currentBufferIndex = (m_currentBufferIndex + 1) % m_numberOfBuffers;
    return minibatch;
}
/*
 * X-means clustering: starts from k_min centroids and repeatedly tries to
 * split each cluster into two children, keeping a split whenever the
 * children's Bayesian information criterion beats the parent's. Stops when a
 * full pass splits nothing or k_max centroids are reached.
 *
 * data       -- (N, D) row-major sample matrix
 * centroids  -- (k_max, D) row-major buffer; first k_min rows hold the
 *               initial centers, first `return value` rows hold the result
 * n_samples  -- minibatch size handed to minibatch() (expected <= N)
 * max_iter   -- iteration cap for each minibatch() run
 *
 * Returns the final number of centroids k, with k_min <= k <= k_max.
 */
int xmeans(double *data, double *centroids, int n_samples, int max_iter,
           int k_min, int k_max, int N, int D) {
    _LOG("Initializing\n");
    /* NOTE(review): malloc results are unchecked; an allocation failure
     * would crash below. Preserved from the original contract. */
    /* (The original also allocated an unused `sample_indicies` scratch
     * buffer; that dead allocation has been removed.) */
    int *centroid_counts = (int*) malloc(k_max * sizeof(int));
    int *cluster_cache = (int*) malloc(n_samples * sizeof(int));
    int *test_sample_indices = (int*) malloc(n_samples * sizeof(int));
    double *test_vector = (double*) malloc(D * sizeof(double));
    double *test_centroids = (double*) malloc(2 * D * sizeof(double));
    double *centroid_distances = (double*) malloc(k_max * sizeof(double));

    double distance;
    int new_k = -1; /* clusters added this pass; -1 means "no pass run yet" */
    int k = k_min;

    for (int i = 0; i < 2 * D; i++) {
        test_centroids[i] = 0.0;
    }

    _LOG("Starting xmeans\n");
    while (k < k_max && new_k != 0) {
        _LOG("Iteration k=%d\n", k);
        _LOG("\tRunning MiniBatch over full set\n");
        minibatch(data, centroids, n_samples, max_iter, k, N, D);

        _LOG("\tGetting centroid distances\n");
        /* Distance from every centroid to its nearest other centroid.
         * TODO: optimize this O(k^2) distance calculation. */
        for (int c1 = 0; c1 < k; c1++) {
            centroid_distances[c1] = -1;
            for (int c2 = 0; c2 < k; c2++) {
                if (c1 != c2) {
                    distance = distance_metric(centroids + c1*D, centroids + c2*D, D);
                    if (centroid_distances[c1] == -1 || distance < centroid_distances[c1]) {
                        centroid_distances[c1] = distance;
                    }
                }
            }
        }

        new_k = 0;
        for (int c = 0; c < k; c++) {
            _LOG("\tRunning 2means on cluster c=%d\n", c);
            if (k + new_k >= k_max) {
                _LOG("\tNot continuing with splitting clusters\n");
                break;
            }

            /* Random direction along which to separate the two children. */
            for (int j = 0; j < D; j++) {
                test_vector[j] = rand() / (double)RAND_MAX;
            }

            /* BUG FIX: `dist` was declared int, truncating the split offset
             * to zero whenever the nearest-centroid distance was < 4. */
            double dist = centroid_distances[c] / 4.0;
            /* NOTE(review): the children are placed at +/- dist*test_vector
             * relative to the ORIGIN, not relative to centroids[c]; typical
             * x-means offsets from the parent centroid -- confirm intent. */
            for (int j = 0; j < D; j++) {
                test_centroids[    j] =      dist * test_vector[j];
                test_centroids[D + j] = -1 * dist * test_vector[j];
            }

            int n = generate_random_indicies_in_cluster(data, centroids,
                    test_sample_indices, c, n_samples, 1000, k, N, D);

            /* BUG FIX: only the two child counters are consumed by the
             * 2-means iteration below; the original zeroed `n` entries,
             * which under-clears when n < 2. */
            for (int i = 0; i < 2; i++) {
                centroid_counts[i] = 0;
            }
            minibatch_iteration(data, test_centroids, test_sample_indices,
                                centroid_counts, cluster_cache, n, 2, N, D);

            double parent_bic = bayesian_information_criterion(data, centroids, k, N, D);
            double children_bic = bayesian_information_criterion(data, test_centroids, 2, N, D);
            _LOG("\t\tParent BIC: %f, Child BIC: %f\n", parent_bic, children_bic);
            if (children_bic > parent_bic) {
                _LOG("\t\tUsing children\n");
                /* Child 0 replaces the parent slot; child 1 goes into the
                 * next free centroid row. */
                int empty_k = k + new_k;
                for (int i = 0; i < D; i++) {
                    centroids[c*D + i] = test_centroids[i];
                    centroids[empty_k*D + i] = test_centroids[D + i];
                }
                new_k += 1;
            } else {
                _LOG("\t\tUsing parents\n");
            }
        }
        k += new_k;
    }

    _LOG("Cleaning up\n");
    free(centroid_counts);
    free(cluster_cache);
    free(test_sample_indices);
    free(test_vector);
    free(test_centroids);
    free(centroid_distances);
    return k;
}