/*
 * Brute-force nearest-centre lookup.
 *
 * Walks every one of the tree->K rows stored back to back in tree->means
 * (tree->dim floats per row), evaluates l2sqr() between the row and `pt`,
 * and remembers the row with the smallest value.  Ties keep the earliest
 * row because the comparison is strict.
 *
 *   tree  supplies means, dim and K.
 *   pt    query vector, tree->dim floats.
 *   cnt   out: receives tree->K (one l2sqr() evaluation per row).
 *
 * Returns the index of the best row.
 *
 * Row 0 is read unconditionally, so the table must hold at least one row.
 * l2sqr() is presumably the squared Euclidean distance -- confirm against
 * its definition.
 */
unsigned kd_tree_ln_search (kd_tree_t *tree, const float *pt, unsigned *cnt)
{
    const float *row = tree->means;
    unsigned best = 0;
    float best_dist = l2sqr(row, pt, tree->dim);   /* seed with row 0 */

    for (unsigned idx = 1; idx < tree->K; ++idx) {
        row += tree->dim;                          /* step to the next row */
        const float cur = l2sqr(row, pt, tree->dim);
        if (cur < best_dist) {
            best_dist = cur;
            best = idx;
        }
    }

    *cnt = tree->K;
    return best;
}
/*
 * Visit one child slot during a kd-tree search.
 *
 * When kd_is_leaf(lon, stat->K) says the slot is a leaf, lon.leaf is used
 * as a row index into stat->means: the l2sqr() value between the query
 * stat->pt and that row is computed, stat->cnt is bumped, and
 * stat->nn / stat->nn_dist are replaced if the new value is strictly
 * smaller.  Otherwise the slot is an inner node and the search continues
 * through kd_search_node() with d2b passed along unchanged.
 *
 *   lon   leaf index or node handle.
 *   stat  running search state (query, means table, best-so-far, counter).
 *   d2b   forwarded untouched to kd_search_node(); its meaning is defined
 *         there (presumably a distance-to-boundary bound -- confirm).
 */
static inline void kd_search_leaf_or_node (union kd_leaf_or_node lon,
                            struct kd_search_stat *stat,
                            float d2b) {
    /* Inner node: hand over to the node search and stop here. */
    if (!kd_is_leaf(lon, stat->K)) {
        kd_search_node(lon.node, stat, d2b);
        return;
    }

    /* Leaf: score the candidate row against the query. */
    const float cand_dist = l2sqr(stat->pt, stat->means + stat->dim * lon.leaf, stat->dim);
    stat->cnt++;
    if (cand_dist < stat->nn_dist) {
        stat->nn_dist = cand_dist;
        stat->nn = lon.leaf;
    }
}
// ${LSHKIT_HOME}/tools/fitdata.cpp
/**
 * Fit a log-linear model of k-nearest-neighbour distance versus dataset
 * size and rank, and print the fitted parameters.
 *
 * Steps:
 *  1. Load the data matrix from a fixed path and shuffle the point indices.
 *  2. Sample P random pairs and compute the arithmetic mean (gM) and the
 *     geometric mean (gG) of their l2sqr distances.
 *  3. Pick Q query points, then scan the data in F interleaved folds.  The
 *     top-K lists are NOT reset between folds, so after fold l they reflect
 *     a scan over (l + 1) / F of the points.
 *  4. After each fold, for every rank k record one regression row
 *         X = [1, log(size * (l + 1) / F), log(k + 1)]
 *     with targets yM = log(mean over queries of the k-th distance) and
 *     yG = mean over queries of log(k-th distance).
 *  5. Solve both least-squares problems with GSL and print three lines to
 *     stdout: "gM\tgG", the three yM coefficients, the three yG coefficients.
 *
 * If the data set is too small to sample pairs or to leave any neighbours
 * after the queries are taken out, a message is written to std::cerr and the
 * function returns without printing results.
 */
void fitdata_example()
{
    const std::string data_file("./data/search_algorithm/lshkit/audio.data");
    const unsigned N = 0;  // number of points to use (0 = use all points).
    const unsigned P = 50000;  // number of pairs to sample.
    unsigned Q = 1000;  // number of queries to sample.
    unsigned K = 100;  // search for K nearest neighbors.
    const unsigned F = 10;  // divide the sample to F folds.

    // load matrix.
    lshkit::Matrix<float> data(data_file);

    // idx is a random permutation of the row numbers; all later sampling
    // goes through it.
    // NOTE(review): std::random_shuffle was removed in C++17; kept as-is so
    // the shuffle order does not change on toolchains that still provide it.
    std::vector<unsigned> idx(data.getSize());
    for (unsigned i = 0; i < idx.size(); ++i) idx[i] = i;
    random_shuffle(idx.begin(), idx.end());

    if (N > 0 && N < data.getSize()) idx.resize(N);

    // Pair sampling below rejects i == j, so it can only terminate when at
    // least two distinct points exist; with zero points idx.size()-1 would
    // also wrap around.
    if (idx.size() < 2)
    {
        std::cerr << "fitdata_example: need at least 2 points, got "
                  << idx.size() << std::endl;
        return;
    }

    lshkit::metric::l2sqr<float> l2sqr(data.getDim());

    lshkit::DefaultRng rng;
    boost::variate_generator<lshkit::DefaultRng &, lshkit::UniformUnsigned> gen(rng, lshkit::UniformUnsigned(0, idx.size()-1));

    double gM = 0.0;  // arithmetic mean of sampled pair distances.
    double gG = 0.0;  // geometric mean of sampled pair distances.
    {
        // sample P pairs of points
        for (unsigned k = 0; k < P; ++k)
        {
            double dist, logdist;
            // Redraw until the pair is distinct and log(dist) passes
            // local::is_good_value (e.g. identical points give log(0)).
            for (;;)
            {
                unsigned i = gen();
                unsigned j = gen();
                if (i == j) continue;
                dist = l2sqr(data[idx[i]], data[idx[j]]);
                logdist = std::log(dist);
                if (local::is_good_value(logdist)) break;
            }
            gM += dist;
            gG += logdist;
        }
        gM /= P;
        gG /= P;
        gG = std::exp(gG);  // exp(mean(log d)) == geometric mean.
    }

    // Clamp Q and K so that K neighbours can exist outside the query set.
    if (Q > idx.size()) Q = idx.size();
    if (K > idx.size() - Q) K = idx.size() - Q;

    // With K == 0 the regression below would have F * K == 0 rows, so there
    // is nothing to fit.
    if (K == 0)
    {
        std::cerr << "fitdata_example: not enough points (" << idx.size()
                  << ") to rank neighbours for " << Q << " queries"
                  << std::endl;
        return;
    }

    // sample query.
    std::vector<unsigned> qry(Q);

    // Presumably fills qry with Q positions in [0, idx.size()) -- see
    // lshkit::SampleQueries for the exact contract.
    lshkit::SampleQueries(&qry, idx.size(), rng);

    // do the queries.
    std::vector<lshkit::Topk<unsigned> > topks(Q);
    for (unsigned i = 0; i < Q; ++i) topks[i].reset(K);

    /* ... */
    // One regression row per (fold, rank) pair; three predictors per row.
    gsl_matrix *X = gsl_matrix_alloc(F * K, 3);
    gsl_vector *yM = gsl_vector_alloc(F * K);
    gsl_vector *yG = gsl_vector_alloc(F * K);
    gsl_vector *pM = gsl_vector_alloc(3);
    gsl_vector *pG = gsl_vector_alloc(3);
    gsl_matrix *cov = gsl_matrix_alloc(3,3);

    std::vector<double> M(K);  // per-rank sum (then log-mean) of distances.
    std::vector<double> G(K);  // per-rank sum (then mean) of log-distances.

    boost::progress_display progress(F, std::cerr);
    unsigned m = 0;  // next free row in X / yM / yG.
    for (unsigned l = 0; l < F; l++)
    {
        // Scan every F-th point starting at offset l, feeding each query's
        // running top-K list.
        for (unsigned i = l; i< idx.size(); i += F)
        {
            for (unsigned j = 0; j < Q; j++)
            {
                const unsigned id = qry[j];
                if (i != id)  // a query is never its own neighbour.
                {
                    float d = l2sqr(data[idx[id]], data[idx[i]]);
                    if (local::is_good_value(std::log(double(d)))) topks[j] << lshkit::Topk<unsigned>::Element(i, d);
                }
            }
        }

        std::fill(M.begin(), M.end(), 0.0);
        std::fill(G.begin(), G.end(), 0.0);

        // Accumulate the k-th entry of every query's list.
        for (unsigned i = 0; i < Q; i++)
        {
            for (unsigned k = 0; k < K; k++)
            {
                M[k] += topks[i][k].dist;
                G[k] += std::log(topks[i][k].dist);
            }
        }

        for (unsigned k = 0; k < K; k++)
        {
            M[k] = std::log(M[k]/Q);
            G[k] /= Q;
            gsl_matrix_set(X, m, 0, 1.0);
            // NOTE(review): uses data.getSize(), which equals idx.size()
            // only while N leaves the index list untruncated.
            gsl_matrix_set(X, m, 1, std::log(double(data.getSize() * (l + 1)) / double(F)));
            gsl_matrix_set(X, m, 2, std::log(double(k + 1)));
            gsl_vector_set(yM, m, M[k]);
            gsl_vector_set(yG, m, G[k]);
            ++m;
        }

        ++progress;
    }

    gsl_multifit_linear_workspace *work = gsl_multifit_linear_alloc(F * K, 3);

    double chisq;

    // Both fits share the design matrix; cov and chisq are overwritten by
    // the second call and are not reported.
    gsl_multifit_linear(X, yM, pM, cov, &chisq, work);
    gsl_multifit_linear(X, yG, pG, cov, &chisq, work);

    std::cout << gM << '\t' << gG << std::endl;
    std::cout << gsl_vector_get(pM, 0) << '\t'
         << gsl_vector_get(pM, 1) << '\t'
         << gsl_vector_get(pM, 2) << std::endl;
    std::cout << gsl_vector_get(pG, 0) << '\t'
         << gsl_vector_get(pG, 1) << '\t'
         << gsl_vector_get(pG, 2) << std::endl;

    // Release every GSL object allocated above, including the fit workspace.
    gsl_multifit_linear_free(work);
    gsl_matrix_free(X);
    gsl_matrix_free(cov);
    gsl_vector_free(pM);
    gsl_vector_free(pG);
    gsl_vector_free(yM);
    gsl_vector_free(yG);
}