Example #1
0
/* Python wrapper: minibatch(data, centroids, n_samples, max_iter,
 *                           bic_ratio_termination, reassignment_ratio)
 *
 * Validates the two 2-D double ndarrays, runs the C-level minibatch()
 * k-means routine (centroids are updated in place), and returns the
 * centroids array back to Python as a new reference.
 */
static PyObject *
py_minibatch(PyObject *Py_UNUSED(self), PyObject *args)
{
    PyArrayObject *data;
    PyArrayObject *centroids;
    int n_samples, max_iter;
    double reassignment_ratio, bic_ratio_termination;

    /* "O" performs no type check, so data/centroids are borrowed refs that
     * are only assumed to be ndarrays until validated below.  Note the two
     * doubles arrive as (bic_ratio_termination, reassignment_ratio) — the
     * reverse of the declaration order above; harmless but easy to misread. */
    if (!PyArg_ParseTuple(args, "OOiidd",
                          &data, &centroids, &n_samples, &max_iter,
                          &bic_ratio_termination, &reassignment_ratio)) {
        return NULL;
    }

    /* NOTE(review): the results of these require_* checks are ignored here;
     * confirm they abort the function on failure (e.g. via a macro that
     * returns), otherwise execution continues with invalid arrays. */
    require_contiguous_ndarray(data,      2, NPY_DOUBLE, "double");
    require_contiguous_ndarray(centroids, 2, NPY_DOUBLE, "double");

    /* Both arrays must agree on the feature dimension (axis 1). */
    require_dimension_match(data, 1, centroids, 1, "features");

    const npy_intp N = PyArray_DIM(data, 0);      /* number of data points  */
    const npy_intp D = PyArray_DIM(data, 1);      /* feature dimensionality */
    const npy_intp k = PyArray_DIM(centroids, 0); /* number of centroids    */

    require_positive_as_int(N, "samples");
    require_positive_as_int(D, "features");
    require_positive_as_int(k, "centroids");
    require_positive(max_iter, "allowed iterations");
    require_positive(n_samples, "requested samples");

    /* Cannot sample a mini-batch larger than the data set itself. */
    if (n_samples > (int)N) {
        PyErr_SetString(PyExc_RuntimeError,
                        "more samples requested than data.");
        return NULL;
    }

    minibatch(
        PyArray_DATA(data),
        PyArray_DATA(centroids),
        n_samples,
        max_iter,
        bic_ratio_termination,
        reassignment_ratio,
        (int)k, (int)N, (int)D
    );

    /* Hand a new reference to the (in-place updated) centroids back to
     * the caller; ParseTuple only gave us a borrowed one. */
    Py_XINCREF(centroids);
    return (PyObject *)centroids;
}
Example #2
0
// Pulls the next set of sequences from the transformer and packs each
// stream of that set into the returned minibatch.
Minibatch SequencePacker::ReadMinibatch()
{
    assert(m_streamBufferSizes.size() == m_streamBuffers.size());

    const auto fetched = m_transformer->GetNextSequences(m_minibatchSize);
    const auto& streams = fetched.m_data;

    Minibatch result(fetched.m_endOfEpoch);
    if (!streams.empty())
    {
        // One packed entry per input stream.
        result.m_data.reserve(streams.size());
        for (size_t i = 0; i < streams.size(); ++i)
        {
            result.m_data.push_back(PackStreamMinibatch(streams[i], i));
        }
    }

    return result;
}
Example #3
0
// Reads the next minibatch: fetches the next sequences, packs every stream
// into the current buffer slot, and advances the buffer index round-robin
// over m_numberOfBuffers.
Minibatch SequencePacker::ReadMinibatch()
{
    auto sequences = GetNextSequences();
    const auto& batch = sequences.m_data;

    Minibatch minibatch(sequences.m_endOfEpoch);
    if (batch.empty())
    {
        return minibatch;
    }

    auto& currentBuffer = m_streamBuffers[m_currentBufferIndex];

    assert(m_outputStreamDescriptions.size() == batch.size());

    // size_t index: avoids the signed/unsigned comparison the previous
    // int-typed loop variable produced against batch.size().
    for (size_t streamIndex = 0; streamIndex < batch.size(); ++streamIndex)
    {
        const auto& streamBatch = batch[streamIndex];

        // Optional per-stream validation of sample shapes.
        if (m_checkSampleShape[streamIndex])
        {
            CheckSampleShape(streamBatch, m_outputStreamDescriptions[streamIndex]);
        }

        // Dense and sparse streams are packed differently; both yield a layout.
        const auto& type = m_outputStreamDescriptions[streamIndex]->m_storageType;
        auto pMBLayout = (type == StorageType::dense) ?
            PackDenseStream(streamBatch, streamIndex) : PackSparseStream(streamBatch, streamIndex);

        auto& buffer = currentBuffer[streamIndex];

        // Expose the packed data (non-owning pointer into the buffer) and
        // its layout to the consumer.
        auto streamMinibatch = std::make_shared<StreamMinibatch>();
        streamMinibatch->m_data = buffer.m_data.get();
        streamMinibatch->m_layout = pMBLayout;
        minibatch.m_data.push_back(streamMinibatch);
    }

    // Rotate to the next buffer so this one can be consumed while the
    // following read fills another.
    m_currentBufferIndex = (m_currentBufferIndex + 1) % m_numberOfBuffers;
    return minibatch;
}
Example #4
0
File: xmeans.c  Project: zackw/pyxmeans
/*
 * xmeans: grow a k-means clustering from k_min toward k_max clusters.
 *
 * Repeatedly runs mini-batch k-means over the full data set, then tries to
 * split each cluster in two; a split is kept when the two children score a
 * better BIC than the parent.  Stops when no split improves BIC or k_max
 * is reached.
 *
 * data      : N x D row-major sample matrix
 * centroids : k_max x D row-major centroid matrix (first k_min rows are the
 *             initial centroids; updated in place)
 * Returns the final number of clusters k (k_min <= k <= k_max).
 */
int xmeans(double *data, double *centroids, int n_samples, int max_iter, int k_min, int k_max, int N, int D)  {
    // assert(k < n_samples < N)
    // assert(data.shape == (N, D)
    // assert(centoids.shape == (k, D)

    _LOG("Initializing\n");
    // NOTE(review): malloc results are unchecked; a failed allocation here
    // would crash in the loops below.
    int *sample_indicies = (int*) malloc(n_samples * sizeof(int));
    int *centroid_counts = (int*) malloc(k_max * sizeof(int));
    int *cluster_cache = (int*) malloc(n_samples * sizeof(int));

    int *test_sample_indicies = (int*) malloc(n_samples * sizeof(int));
    double *test_vector = (double*) malloc(D * sizeof(double));
    double *test_centroids = (double*) malloc(2 * D * sizeof(double));
    double *centroid_distances = (double*) malloc(k_max * sizeof(double));

    double distance;
    int new_k = -1;   /* -1 sentinel so the first while-test passes */
    int k = k_min;

    for (int i=0; i<2*D; i++) {
        test_centroids[i] = 0.0;
    }

    _LOG("Starting xmeans\n");
    while (k < k_max && new_k != 0) {
        _LOG("Iteration k=%d\n", k);

        _LOG("\tRunning MiniBatch over full set\n");
        minibatch(data, centroids, n_samples, max_iter, k, N, D);

        _LOG("\tGetting centroid distances\n");
        // For each centroid, record the distance to its nearest other
        // centroid (used to scale the split perturbation below).
        // TODO: optimize this distance calculation
        for(int c1=0; c1<k; c1++) {
            centroid_distances[c1] = -1;
            for(int c2=0; c2<k; c2++) {
                if (c1 != c2) {
                    distance = distance_metric(centroids + c1*D, centroids + c2*D, D);
                    if (centroid_distances[c1] == -1 || distance < centroid_distances[c1]) {
                        centroid_distances[c1] = distance;
                    }
                }
            }
        }

        new_k = 0;
        for(int c=0; c<k; c++) {
            _LOG("\tRunning 2means on cluster c=%d\n", c);
            if (k + new_k >= k_max) {
                _LOG("\tNot continuing with splitting clusters\n");
                break;
            }
            // Random direction in which to push the two trial children apart.
            for(int j=0; j<D; j++) {
                test_vector[j] = rand() / (double)RAND_MAX;
            }
            // BUG FIX: dist was declared int, truncating the offset to 0 for
            // any nearest-centroid distance < 4 and crippling the split.
            double dist = centroid_distances[c] / 4.0;
            for(int j=0; j<D; j++) {
                test_centroids[    j] =      dist * test_vector[j];
                test_centroids[D + j] = -1 * dist * test_vector[j];
            }

            int n = generate_random_indicies_in_cluster(data, centroids, test_sample_indicies, c, n_samples, 1000, k, N, D);
            // BUG FIX: centroid_counts holds one count per centroid (k_max
            // entries) and this trial uses only 2 centroids; the previous
            // loop zeroed n entries, overflowing the buffer when n > k_max.
            for (int i=0; i<2; i++) {
                centroid_counts[i] = 0;
            }
            minibatch_iteration(data, test_centroids, test_sample_indicies, centroid_counts, cluster_cache, n, 2, N, D);

            // Keep the split only if the two children model the data better
            // (higher BIC) than the single parent.
            double parent_bic = bayesian_information_criterion(data, centroids, k, N, D);
            double children_bic = bayesian_information_criterion(data, test_centroids, 2, N, D);
            _LOG("\t\tParent BIC: %f, Child BIC: %f\n", parent_bic, children_bic);
            if (children_bic > parent_bic) {
                _LOG("\t\tUsing children\n");
                // One child replaces the parent; the other takes the next
                // free centroid slot.
                int empty_k = k+new_k;
                for(int i=0; i<D; i++) {
                    centroids[c*D + i] = test_centroids[i];
                    centroids[empty_k*D + i] = test_centroids[D + i];
                }
                new_k += 1;
            } else {
                _LOG("\t\tUsing parents\n");
            }

        }
        k += new_k;
    }


    _LOG("Cleaning up\n");
    free(sample_indicies);
    free(centroid_counts);
    free(cluster_cache);

    free(test_sample_indicies);
    free(test_vector);
    free(test_centroids);
    free(centroid_distances);
    return k;
}