Clustering
C++20 header-only: DBSCAN, HDBSCAN, k-means.
Loading...
Searching...
No Matches
kmeans.h
Go to the documentation of this file.
1#pragma once
2
3#include <BS_thread_pool.hpp>
4#include <cstddef>
5#include <cstdint>
6#include <optional>
7#include <type_traits>
8
15#include "clustering/ndarray.h"
16
17namespace clustering {
18
38template <class T, class Algo = kmeans::LloydFusedGemm<T>, class Seeder = kmeans::AutoSeeder<T>>
40class KMeans {
41 static_assert(std::is_same_v<T, float>,
42 "KMeans<T> supports only float; add a double specialization to extend.");
43
44public:
53 explicit KMeans(std::size_t k, std::size_t nJobs = 0)
54 : m_k(k), m_nJobs(math::clampedJobCount(nJobs)), m_centroids({0, 0}), m_labels({0}) {
56 // Defer pool construction to @ref run: at small shapes every hot phase gates serial, and
57 // spawning nJobs workers in the ctor burns tens of microseconds of thread-create futex
58 // traffic that the fit never amortizes. @ref run emplaces on first need and reuses.
59 }
60
61 KMeans(const KMeans &) = delete;
62 KMeans &operator=(const KMeans &) = delete;
63 KMeans(KMeans &&) = delete;
64 KMeans &operator=(KMeans &&) = delete;
65 ~KMeans() = default;
66
82 void run(const NDArray<T, 2> &X, std::size_t maxIter = 300, T tol = T{1e-4},
83 std::uint64_t seed = 0) {
84 const std::size_t n = X.dim(0);
85 const std::size_t d = X.dim(1);
86
89
90 ensureOutputShape(n, d);
91
92 if (n == 0 || d == 0) {
93 m_nIter = 0;
94 m_converged = true;
95 m_inertia = 0.0;
96 return;
97 }
98
99 if (!m_pool.has_value() && math::shouldSpawnPool(n * d * m_k, m_nJobs)) {
100 m_pool.emplace(m_nJobs);
101 }
102 const math::Pool pool{m_pool.has_value() ? &*m_pool : nullptr};
103 m_seeder.run(X, m_k, seed, pool, m_centroids);
104 m_lloyd.run(X, m_centroids, m_k, maxIter, tol, pool, m_labels, m_inertia, m_nIter, m_converged);
105 }
106
108 [[nodiscard]] const NDArray<std::int32_t, 1> &labels() const noexcept { return m_labels; }
110 [[nodiscard]] const NDArray<T, 2, Layout::Contig> &centroids() const noexcept {
111 return m_centroids;
112 }
113
114 [[nodiscard]] double inertia() const noexcept { return m_inertia; }
116 [[nodiscard]] std::size_t nIter() const noexcept { return m_nIter; }
118 [[nodiscard]] bool converged() const noexcept { return m_converged; }
119
121 void reset() {
122 m_centroids = NDArray<T, 2, Layout::Contig>({0, 0});
123 m_labels = NDArray<std::int32_t, 1>({0});
124 m_inertia = 0.0;
125 m_nIter = 0;
126 m_converged = false;
127 m_lloyd = Algo{};
128 m_seeder = Seeder{};
129 }
130
131private:
132 void ensureOutputShape(std::size_t n, std::size_t d) {
133 if (m_centroids.dim(0) != m_k || m_centroids.dim(1) != d) {
134 m_centroids = NDArray<T, 2, Layout::Contig>({m_k, d});
135 }
136 if (m_labels.dim(0) != n) {
137 m_labels = NDArray<std::int32_t, 1>({n});
138 }
139 }
140
141 std::size_t m_k;
142 std::size_t m_nJobs;
143 std::optional<BS::light_thread_pool> m_pool;
144 NDArray<T, 2, Layout::Contig> m_centroids;
145 NDArray<std::int32_t, 1> m_labels;
146 double m_inertia = 0.0;
147 std::size_t m_nIter = 0;
148 bool m_converged = false;
149
150 Algo m_lloyd{};
151 Seeder m_seeder{};
152};
153
154} // namespace clustering
#define CLUSTERING_ALWAYS_ASSERT(cond)
Release-active assertion: evaluates cond in every build configuration.
double inertia() const noexcept
Final inertia: Kahan-summed f64 total of per-point squared distance to assignment.
Definition kmeans.h:114
KMeans & operator=(KMeans &&)=delete
void run(const NDArray< T, 2 > &X, std::size_t maxIter=300, T tol=T{1e-4}, std::uint64_t seed=0)
Fit to X.
Definition kmeans.h:82
std::size_t nIter() const noexcept
Iterations executed before tol or maxIter fired.
Definition kmeans.h:116
bool converged() const noexcept
True iff the last run stopped because centroid shift fell at or below tol.
Definition kmeans.h:118
void reset()
Release every scratch buffer. The next run call reallocates against its shape.
Definition kmeans.h:121
KMeans & operator=(const KMeans &)=delete
KMeans(KMeans &&)=delete
const NDArray< T, 2, Layout::Contig > & centroids() const noexcept
k x d fitted centroids.
Definition kmeans.h:110
KMeans(const KMeans &)=delete
KMeans(std::size_t k, std::size_t nJobs=0)
Construct a reusable k-means fitter.
Definition kmeans.h:53
const NDArray< std::int32_t, 1 > & labels() const noexcept
Length-n assignment; each entry is in [0, k).
Definition kmeans.h:108
Represents a multidimensional array (NDArray) of a fixed number of dimensions N and element type T.
Definition ndarray.h:136
size_t dim(std::size_t index) const noexcept
Returns the size of a specific dimension of the NDArray.
Definition ndarray.h:461
Contract for the Lloyd driver that KMeans<T> delegates to.
Definition lloyd.h:23
Contract for the seeder that produces initial centroids for the Lloyd driver.
Definition seeder.h:23
bool shouldSpawnPool(std::size_t totalOps, std::size_t nJobs, std::size_t minOpsPerWorker=std::size_t{1}<< 15) noexcept
Decide whether spawning a pool with nJobs workers is worth it for totalOps of arithmetic work.
Definition thread.h:43