Example #1
0
// Greedy k-medoids clustering over document_list.
//
// Starts from the medoid ids returned by random_subset(clusters, length) and
// runs at most KMEDOIDS_STEPS improvement rounds. In each round every document
// that is not already the medoid of its cluster is tried as a replacement for
// that medoid; the single replacement with the lowest compute_cost() value is
// applied, provided it is strictly cheaper than the current configuration.
// The search stops early once no replacement lowers the cost.
//
// NOTE(review): document_list is indexed with id_ values below, so this
// presumably relies on id_ being the document's position in document_list.
void cluster_document::kmedoids(int clusters, std::vector<cluster_document> &document_list) {
  const int length = document_list.size();
  std::vector<int> medoid_ids = random_subset(clusters, length);
  recompute_clusters(medoid_ids, document_list);
  double current_cost = compute_cost(medoid_ids, document_list);

  for (int rounds_left = KMEDOIDS_STEPS; rounds_left > 0; --rounds_left) {
    int winner = -1;                  // id of the best replacement medoid, -1 if none
    double winner_cost = current_cost;

    for (auto &candidate: document_list) {
      const int previous = medoid_ids[candidate.cluster_];
      if (previous != candidate.id_) {
        // Tentatively promote the candidate, measure, then roll back.
        medoid_ids[candidate.cluster_] = candidate.id_;
        const double trial_cost = compute_cost(medoid_ids, document_list);
        if (trial_cost < winner_cost) {
          winner_cost = trial_cost;
          winner = candidate.id_;
        }
        medoid_ids[candidate.cluster_] = previous;
      }
    }

    if (winner == -1) {
      // No single swap improves the cost: local optimum reached.
      break;
    }

    current_cost = winner_cost;
    auto &promoted = document_list[winner];
    medoid_ids[promoted.cluster_] = promoted.id_;
    recompute_clusters(medoid_ids, document_list);
  }
}
Example #2
0
// k-means style clustering in which every centroid is an actual document.
//
// Starts from the centroid ids returned by random_subset(clusters, length) and
// performs KMEANS_STEPS rounds of:
//   1. reassignment - each document's cluster_ is set to the cluster whose
//      centroid is strictly closest (ties keep the lower cluster index);
//   2. centroid update - within each cluster, the document with the smallest
//      sum of distances to the other members becomes the new centroid
//      (ties keep the document that comes first in document_list).
//
// The result is left in the cluster_ field of each document; the centroid ids
// are local to this function. Note that cluster_ is not refreshed after the
// final centroid update.
//
// NOTE(review): ids are used as indices into cluster_distance, so this
// presumably relies on id_ being the document's position in document_list.
void cluster_document::kmeans(int clusters, std::vector<cluster_document> &document_list) {
  const int length = static_cast<int>(document_list.size());
  std::vector<int> centroid_ids = random_subset(clusters, length);

  // Bounded with `> 0` (as kmedoids does) so that a negative KMEANS_STEPS
  // cannot turn this into an endless loop.
  for (int steps = KMEANS_STEPS; steps > 0; --steps) {
    //std::cerr << "Steps left: " << steps << std::endl;
    // Reassign every document to its nearest centroid.
    for (auto &document: document_list) {
      document.cluster_ = 0;
      for (int cluster = 0; cluster < clusters; ++cluster) {
        int id = centroid_ids[cluster];
        int old_id = centroid_ids[document.cluster_];
        if (document.distance(id) < document.distance(old_id)) {
          document.cluster_ = cluster;
        }
      }
    }

    // For each document, sum the distances to the other members of its cluster.
    std::vector<double> cluster_distance(length);
    for (int i = 0; i < length; ++i) {
      for (int j = i + 1; j < length; ++j) {
        if (document_list[i].cluster_ ==
            document_list[j].cluster_) {
          cluster_distance[i] += document_list[i].distance(j);
          cluster_distance[j] += document_list[j].distance(i);
        }
      }
    }

    // Pick the member with the smallest distance sum as each cluster's centroid.
    // -1 marks "no centroid chosen yet" while scanning.
    const std::vector<int> previous_centroids = centroid_ids;
    centroid_ids.assign(clusters, -1);
    for (auto &document: document_list) {
      if (centroid_ids[document.cluster_] == -1 ||
          cluster_distance[centroid_ids[document.cluster_]] >
          cluster_distance[document.id_]) {
        centroid_ids[document.cluster_] = document.id_;
      }
    }

    // A cluster can end up with no members (e.g. two centroids at equal
    // distance from every document: the strict `<` above always favours the
    // lower cluster index). Keep its previous centroid so the -1 sentinel is
    // never used as a document id on the next round.
    for (int cluster = 0; cluster < clusters; ++cluster) {
      if (centroid_ids[cluster] == -1) {
        centroid_ids[cluster] = previous_centroids[cluster];
      }
    }
  }
}
/*
 * Randomised self-test for random_subset(values, m).
 *
 * For every array size 0..100 and every subset size 0..array size, draws 100
 * subsets of the array {0, 1, ..., size-1} and asserts that each one has the
 * requested number of elements, contains only values from the array, and
 * contains no value twice. Prints one progress line per array size.
 */
int main()
{
	const size_t largest_array = 100;
	const int trials_per_case = 100;

	for (size_t array_size = 0; array_size <= largest_array; ++array_size)
	{
		for (size_t subset_size = 0; subset_size <= array_size; ++subset_size)
		{
			/* source array holds 0 .. array_size-1, so a value is valid iff it is < array_size */
			std::vector< size_t > pool(array_size);

			for (size_t slot = 0; slot < array_size; ++slot)
			{
				pool[slot] = slot;
			}

			for (int trial = 0; trial != trials_per_case; ++trial)
			{
				std::vector< size_t > picked = random_subset(pool, subset_size);

				assert(picked.size() == subset_size);

				/* one flag per possible value; a flag already set means a duplicate */
				std::vector< bool > already_picked(array_size, false);

				for (size_t k = 0; k < picked.size(); ++k)
				{
					const size_t value = picked[k];

					assert(value < array_size);
					assert(!already_picked[value]);
					already_picked[value] = true;
				}
			}
		}

		std::cout << "passed random tests for arrays of size " << array_size << std::endl;
	}

	return 0;
}
Example #4
0
/**
 * Command-line driver for random_permutation() and random_subset().
 *
 * Options:
 *   -s SEED   seed handed to srand() (optional)
 *   -a ALG    algorithm, selected by the first character of ALG:
 *             'p' = permutation, 's' = subset (required)
 *   -h        print usage and exit
 *
 * Positional arguments:
 *   permutation: the integers to permute
 *   subset:      the subset size M, followed by the integers to choose from
 *
 * The result is printed on stdout as space-separated integers on one line
 * (sorted ascending in subset mode). Exits with status 1 on any usage error
 * and after printing the -h usage text.
 */
int main(int argc, char** argv)
{
    int opt_char;
    enum algorithm_t {
        e_null = 0,
        e_permutation = 1,
        e_subset = 2,
    };
    algorithm_t algorithm = e_null;

    /**
     * Long options are not portable with getopt, so only short options
     * are supported.
     */
    while ((opt_char = getopt(argc, argv, "hs:a:")) != -1) {
        switch (opt_char) {
        case 's':
            {
            int seed = static_cast<int>(strtol(optarg, 0, 10));
            srand(seed);
            break;
            }

        case 'a':
            switch (*optarg) {
            case 'p':
                algorithm = e_permutation;
                break;
            case 's':
                algorithm = e_subset;
                break;
            default:
                fprintf(stderr, "unsupported algorithm %s\n", optarg);
                exit(1);
            }
            break;

        case 'h':
            fprintf(stderr,
                    "Usage: %s [-s SEED] -a p INTEGER...\n"
                    "       %s [-s SEED] -a s M INTEGER...\n",
                    argv[0], argv[0]);
            exit(1);

        case '?':
            exit(1);
        }
    }

    if (!algorithm) {
        fprintf(stderr, "algorithm(-a) not specified\n");
        exit(1);
    }

    switch(algorithm) {
    case e_null:
        /* unreachable: rejected above */
        break;
    case e_permutation:
        {
            int len = argc-optind;
            int* a = new int[len];
            for (int i = 0; i < len; ++i)
                a[i] = static_cast<int>(strtol(argv[optind+i], 0, 10));

            random_permutation(len, a);
            for (int i = 0; i < len; ++i)
                fprintf(stdout, "%d ", a[i]);
            fprintf(stdout, "\n");
            delete[] a;  /* allocated with new[] */
            break;
        }
    case e_subset:
        {
            /* The first positional argument is the subset size; without it
             * argv[optind] is the terminating null pointer. */
            if (optind >= argc) {
                fprintf(stderr, "subset size not specified\n");
                exit(1);
            }
            long requested = strtol(argv[optind], 0, 10);
            int len = argc-optind-1;
            /* A subset cannot be negative in size or larger than its source. */
            if (requested < 0 || requested > len) {
                fprintf(stderr, "subset size %ld out of range [0, %d]\n",
                        requested, len);
                exit(1);
            }
            int m = static_cast<int>(requested);
            int* b = new int[m];
            int* a = new int[len];
            for (int i = 0; i < len; ++i)
                a[i] = static_cast<int>(strtol(argv[optind+1+i], 0, 10));

            random_subset(len, a, m, b);
            std::sort(b, b+m);
            for (int i = 0; i < m; ++i)
                fprintf(stdout, "%d ", b[i]);
            fprintf(stdout, "\n");
            delete[] a;  /* allocated with new[] */
            delete[] b;
            break;
        }
    }
    return 0;
}