Clustering
C++20 header-only: DBSCAN, HDBSCAN, k-means.
Loading...
Searching...
No Matches
afkmc2_seeder.h
Go to the documentation of this file.
1#pragma once
2
3#include <cstddef>
4#include <cstdint>
5#include <cstring>
6#include <type_traits>
7
9#include "clustering/math/detail/avx2_helpers.h"
10#include "clustering/math/rng.h"
12#include "clustering/ndarray.h"
13
15
16using math::detail::sqEuclideanRowPtr;
17
41template <class T> class AfkMc2Seeder {
42public:
43 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
44 "AfkMc2Seeder<T> requires T to be float or double");
45
46#ifdef CLUSTERING_KMEANS_AFKMC2_K_FLOOR
54 static constexpr std::size_t kFloor = CLUSTERING_KMEANS_AFKMC2_K_FLOOR;
55#else
57 static constexpr std::size_t kFloor = 100;
58#endif
59
60#ifdef CLUSTERING_KMEANS_AFKMC2_CHAIN_LENGTH
69 static constexpr std::size_t chainLengthDefault = CLUSTERING_KMEANS_AFKMC2_CHAIN_LENGTH;
70#else
72 static constexpr std::size_t chainLengthDefault = 200;
73#endif
74
75 AfkMc2Seeder() : m_q({0}), m_qCum({0}) {}
76
86 void run(const NDArray<T, 2, Layout::Contig> &X, std::size_t k, std::uint64_t seed,
87 math::Pool pool, NDArray<T, 2, Layout::Contig> &outCentroids) {
88 runChain(X, k, chainLengthDefault, seed, pool, outCentroids);
89 }
90
91private:
92 void runChain(const NDArray<T, 2, Layout::Contig> &X, std::size_t k, std::size_t m,
93 std::uint64_t seed, math::Pool pool, NDArray<T, 2, Layout::Contig> &outCentroids) {
94 const std::size_t n = X.dim(0);
95 const std::size_t d = X.dim(1);
96
97 CLUSTERING_ALWAYS_ASSERT(outCentroids.isMutable());
98 CLUSTERING_ALWAYS_ASSERT(outCentroids.dim(0) == k);
99 CLUSTERING_ALWAYS_ASSERT(outCentroids.dim(1) == d);
103
104 (void)pool;
105
106 ensureShape(n);
107
108 math::pcg64 rng;
109 rng.seed(seed);
110
111 const T *xData = X.data();
112 T *centroidsData = outCentroids.data();
113 T *qData = m_q.data();
114
115 // Step 1: first centroid uniformly.
116 const auto first = static_cast<std::size_t>(math::randUniformU64(rng) % n);
117 std::memcpy(centroidsData, xData + (first * d), d * sizeof(T));
118
119 if (k == 1) {
120 return;
121 }
122
123 // Step 2: proposal distribution q(i) = 0.5 * d(x_i, c_1)^2 / sumD2 + 0.5 * 1/n.
124 // Squared distance to the first centroid drives the data-proximal half; the 1/n floor
125 // keeps every point reachable by the chain even when sumD2 is dominated by outliers.
126 const T *firstRow = centroidsData;
127 T sumD2 = T{0};
128 for (std::size_t i = 0; i < n; ++i) {
129 const T d2 = sqEuclideanRowPtr(xData + (i * d), firstRow, d);
130 qData[i] = d2;
131 sumD2 += d2;
132 }
133
134 const T invN = T{1} / static_cast<T>(n);
135 if (sumD2 > T{0}) {
136 const T invSum = T{1} / sumD2;
137 for (std::size_t i = 0; i < n; ++i) {
138 qData[i] = (T{0.5} * qData[i] * invSum) + (T{0.5} * invN);
139 }
140 } else {
141 // Degenerate: every point coincides with c_1. Fall back to uniform so the chain stays
142 // ergodic over the point set.
143 for (std::size_t i = 0; i < n; ++i) {
144 qData[i] = invN;
145 }
146 }
147
148 // Cumulative prefix sum over q for O(log n) index draws via inverse-CDF. With the 0.5/n
149 // floor above, sumQ is strictly positive for any n >= 1.
150 T *qCumData = m_qCum.data();
151 T running = T{0};
152 for (std::size_t i = 0; i < n; ++i) {
153 running += qData[i];
154 qCumData[i] = running;
155 }
156 const T qTotal = qCumData[n - 1];
157
158 auto sampleFromQ = [&]() noexcept -> std::size_t {
159 const T u = math::randUnit<T>(rng) * qTotal;
160 std::size_t lo = 0;
161 std::size_t hi = n;
162 while (lo < hi) {
163 const std::size_t mid = lo + ((hi - lo) / 2);
164 if (qCumData[mid] > u) {
165 hi = mid;
166 } else {
167 lo = mid + 1;
168 }
169 }
170 return lo < n ? lo : n - 1;
171 };
172
173 // Step 3: for each remaining centroid, run a length-m Markov chain. Distances to the
174 // current centroid set are recomputed against all chosen rows -- O(c) per candidate where
175 // c is the count of already-placed centroids.
176 auto distToChosen = [&](std::size_t pointIdx, std::size_t chosenCount) noexcept -> T {
177 const T *row = xData + (pointIdx * d);
178 T best = sqEuclideanRowPtr(row, centroidsData, d);
179 for (std::size_t c = 1; c < chosenCount; ++c) {
180 const T cand = sqEuclideanRowPtr(row, centroidsData + (c * d), d);
181 if (cand < best) {
182 best = cand;
183 }
184 }
185 return best;
186 };
187
188 for (std::size_t c = 1; c < k; ++c) {
189 std::size_t xIdx = sampleFromQ();
190 T xDist = distToChosen(xIdx, c);
191 T xQ = qData[xIdx];
192
193 for (std::size_t step = 0; step < m; ++step) {
194 const std::size_t yIdx = sampleFromQ();
195 const T yDist = distToChosen(yIdx, c);
196 const T yQ = qData[yIdx];
197
198 // Acceptance ratio is (yDist / yQ) / (xDist / xQ); reorder as (yDist * xQ) vs
199 // (xDist * yQ) to skip the division. Draw u every step so the RNG sequence depends
200 // only on (seed, n, k, m) and never on the branch outcomes inside the chain --
201 // essential for bit-identical repeatability across runs. When denom is zero the
202 // current weight vanishes and any proposal is indistinguishable or strictly better;
203 // accept unconditionally.
204 const T numer = yDist * xQ;
205 const T denom = xDist * yQ;
206 const T u = math::randUnit<T>(rng);
207 const bool accept = (denom <= T{0}) || ((u * denom) < numer);
208
209 if (accept) {
210 xIdx = yIdx;
211 xDist = yDist;
212 xQ = yQ;
213 }
214 }
215
216 std::memcpy(centroidsData + (c * d), xData + (xIdx * d), d * sizeof(T));
217 }
218 }
219
220 void ensureShape(std::size_t n) {
221 if (m_q.dim(0) != n) {
222 m_q = NDArray<T, 1>({n});
223 }
224 if (m_qCum.dim(0) != n) {
225 m_qCum = NDArray<T, 1>({n});
226 }
227 }
228
229 NDArray<T, 1> m_q;
230 NDArray<T, 1> m_qCum;
231};
232
233} // namespace clustering::kmeans
#define CLUSTERING_ALWAYS_ASSERT(cond)
Release-active assertion: evaluates cond in every build configuration.
Represents a multidimensional array (NDArray) of a fixed number of dimensions N and element type T.
Definition ndarray.h:136
size_t dim(std::size_t index) const noexcept
Returns the size of a specific dimension of the NDArray.
Definition ndarray.h:461
const T * data() const noexcept
Provides read-only access to the internal data array.
Definition ndarray.h:503
bool isMutable() const noexcept
Reports whether writes through operator(), Accessor, or flatIndex are allowed.
Definition ndarray.h:488
void run(const NDArray< T, 2, Layout::Contig > &X, std::size_t k, std::uint64_t seed, math::Pool pool, NDArray< T, 2, Layout::Contig > &outCentroids)
Seed k centroids from X into outCentroids.
static constexpr std::size_t chainLengthDefault
Default Markov-chain length per centroid pick.
static constexpr std::size_t kFloor
Minimum k below which the AFK-MC2 chain's log-k bound is too loose to win.
T randUnit(Rng &rng) noexcept
Draw a uniform variate in the half-open unit interval [0, 1).
Definition rng.h:152
std::uint64_t randUniformU64(Rng &rng) noexcept
Draw a 64-bit unsigned integer uniformly at random from the full u64 range.
Definition rng.h:139
Thin injection wrapper around a BS::light_thread_pool.
Definition thread.h:63
128-bit state for the PCG-XSL-RR 64-bit output generator (Melissa O'Neill).
Definition rng.h:30
void seed(std::uint64_t seedValue, std::uint64_t stream=0) noexcept
Initialize the generator per PCG's canonical seeding procedure.
Definition rng.h:46