This program includes both the sequential and parallel implementations of the k-means algorithm. Added command line flags allow the user to either generate random points to be clustered, or use an points.txt file with a formatted list of points and output a clusters.txt file with the list of cluster centroids.
#include <iostream>
#include <sstream>
#include <vector>
#include <string>
#include <random>
#include <algorithm>
#include <iterator>
#include <chrono>
// struct point
struct point {
double x, y;
double sqr_dist(const point& p) const {
double dx = p.x - x;
double dy = p.y - y;
return dx * dx + dy * dy;
}
};
static std::istream& operator>>(std::istream& in, point& p) {
in >> p.x >> p.y;
return in;
}
static std::ostream& operator<<(std::ostream& out, const point& p) {
out << p.x << " " << p.y;
return out;
}
// struct cluster
struct cluster {
double x, y;
int n;
bool save_as(point& p) const {
double nx = x / n;
double ny = y / n;
if (nx == p.x && ny == p.y) return false;
p.x = nx;
p.y = ny;
return true;
}
void reset() {
x = y = 0.0;
n = 0;
}
cluster& operator+=(const point& p) {
x += p.x;
y += p.y;
n++;
return *this;
}
cluster& operator+=(const cluster& c) {
x += c.x;
y += c.y;
n += c.n;
return *this;
}
};
// k-means functions
static void find_closest(const point& p, const std::vector<point>& centers, std::vector<cluster>& clusters) {
double min_d = std::numeric_limits<double>::infinity();
size_t min_ci;
for (size_t ci = 0; ci < centers.size(); ci++) {
double d = centers[ci].sqr_dist(p);
if (d < min_d) {
min_d = d;
min_ci = ci;
}
}
clusters[min_ci] += p;
}
#if defined(_OPENMP)
static void find_closest(const std::vector<point>& points, const std::vector<point>& centers, std::vector<cluster>& clusters) {
#pragma omp parallel
{
std::vector<cluster> local_clusters {clusters.size()};
#pragma omp for
for (size_t pi = 0; pi < points.size(); pi++) {
find_closest(points[pi], centers, local_clusters);
}
#pragma omp critical
for (size_t ci = 0; ci < clusters.size(); ci++) {
clusters[ci] += local_clusters[ci];
}
}
}
#else // not defined(_OPENMP)
static void find_closest(const std::vector<point>& points, const std::vector<point>& centers, std::vector<cluster>& clusters) {
for (size_t pi = 0; pi < points.size(); pi++) {
find_closest(points[pi], centers, clusters);
}
}
#endif // defined(_OPENMP)
static bool update_centers(std::vector<point>& centers, std::vector<cluster>& clusters) {
bool mod = false;
for (size_t ci = 0; ci < centers.size(); ci++) {
if (clusters[ci].save_as(centers[ci])) mod = true;
clusters[ci].reset();
}
return mod;
}
// initialization functions
static size_t parse_count(const char* s, size_t min, size_t max, const char* msg) {
std::istringstream ss(s);
size_t n;
if (! (ss >> n) || ! ss.eof() || n < min || n > max) {
std::cerr << msg << ": " << s << "\n";
return 0;
}
return n;
}
static bool init_stdin(std::vector<point>& points, std::vector<point>& centers, char** argv) {
// read points from stdin
std::copy(std::istream_iterator<point>(std::cin), std::istream_iterator<point>(), std::back_inserter(points));
size_t cc = parse_count(argv[1], 2, points.size(), "Invalid cluster count");
if (!cc) return false;
centers.resize(cc);
// pick n random points as initial centers
std::shuffle(points.begin(), points.end(), std::default_random_engine{});
std::copy_n(points.begin(), centers.size(), centers.begin());
return true;
}
static bool init_random(std::vector<point>& points, std::vector<point>& centers, char** argv) {
size_t pc = parse_count(argv[2], 2, 1'000'000'000, "Invalid point count");
if (!pc) return false;
size_t cc = parse_count(argv[1], 2, pc, "Invalid cluster count");
if (!cc) return false;
points.resize(pc);
centers.resize(cc);
std::default_random_engine rnd {std::random_device{}()};
std::uniform_real_distribution<double> dist {-10.0, 10.0};
auto rnd_point = [&](){ return point { dist(rnd), dist(rnd) }; };
// pick random points and initial centers
std::generate(points.begin(), points.end(), rnd_point);
std::generate(centers.begin(), centers.end(), rnd_point);
return true;
}
static void usage(char** argv) {
std::cerr << "Usage:\n";
std::cerr << argv[0] << " <cluster count>\n read points from stdin\n";
std::cerr << argv[0] << " <cluster count> <point count>\n generate random points\n";
}
// main function
int main(int argc, char** argv) {
std::vector<point> points;
std::vector<point> centers;
if (argc == 2) {
if (! init_stdin(points, centers, argv)) return EXIT_FAILURE;
}
else if (argc == 3) {
if (! init_random(points, centers, argv)) return EXIT_FAILURE;
}
else {
usage(argv);
return EXIT_FAILURE;
}
std::vector<cluster> clusters {centers.size()};
auto t0 = std::chrono::steady_clock::now();
const int max_iter = 30;
bool mod = true;
for (int i = 0; mod && i < max_iter; i++) {
std::cerr << "Iteration: " << i << "\n";
find_closest(points, centers, clusters);
mod = update_centers(centers, clusters);
}
auto t1 = std::chrono::steady_clock::now();
auto d = std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t0);
std::cerr << "Time: " << d.count() << " seconds\n";
// write centers to stdout
std::copy(centers.begin(), centers.end(), std::ostream_iterator<point>(std::cout, "\n"));
}
CXXFLAGS := -Wall -Wextra -Wpedantic -O3
CXX := g++-10
# CXX := clang++
# CXXFLAGS += -mlinker-version=0 -L/usr/local/Cellar/llvm/11.1.0_1/lib
SEQ := kmeans_sequential
PAR := kmeans_parallel
$(PAR): CXXFLAGS += -fopenmp
all: $(SEQ) $(PAR)
$(SEQ).cpp $(PAR).cpp: kmeans.cpp
ln -s $< $@
init: $(SEQ).cpp $(PAR).cpp
$(SEQ): $(SEQ).cpp
$(PAR): $(PAR).cpp
run_$(SEQ)_file: $(SEQ)
./$(SEQ) 3 < points.txt | tee clusters.txt
run_$(SEQ)_random: $(SEQ)
./$(SEQ) 20 20000000
valgrind_$(SEQ)_file: $(SEQ)
valgrind ./$(SEQ) 3 < points.txt
valgrind_$(SEQ)_random: $(SEQ)
valgrind ./$(SEQ) 20 1000000
run_$(PAR)_file: $(PAR)
OMP_NUM_THREADS=4 ./$(PAR) 3 < points.txt | tee clusters.txt
run_$(PAR)_random: $(PAR)
OMP_NUM_THREADS=4 ./$(PAR) 20 20000000
valgrind_$(PAR)_file: $(PAR)
OMP_NUM_THREADS=4 valgrind --suppressions=$(PAR).supp ./$(PAR) 3 < points.txt
valgrind_$(PAR)_random: $(PAR)
OMP_NUM_THREADS=4 valgrind --suppressions=$(PAR).supp ./$(PAR) 20 1000000
zip:
$(RM) *.zip
zip kmeans.zip kmeans.cpp Makefile points.txt clusters.txt
clean:
$(RM) *.zip $(SEQ) $(PAR) $(SEQ).cpp $(PAR).cpp
9 9
1 1
-1 -1
3 3
10 10
-2 -2
7 8
0.2 0
-1 0
6 10