Clustering
C++20 header-only: DBSCAN, HDBSCAN, k-means.
lloyd_fused_gemm.h
#pragma once

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <type_traits>
#include <utility>

#include "clustering/kmeans/detail/convergence.h"
#include "clustering/kmeans/detail/empty_cluster.h"
#include "clustering/math/detail/avx2_helpers.h"
#include "clustering/math/detail/columnwise_reduce_avx2.h"
#include "clustering/math/detail/gemm_outer_prepacked.h"
#include "clustering/math/detail/gemm_pack.h"
#include "clustering/math/detail/matrix_desc.h"
#include "clustering/math/detail/pairwise_argmin_outer.h"
#include "clustering/ndarray.h"

namespace clustering::kmeans {

namespace detail {

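// Even partition of [first, first + n) into at most `desired` blocks: the first
// `remainder` blocks take `block_size + 1` rows, the rest `block_size`.
// Worked example (illustrative numbers): BlockPartition(0, 10, 4) yields
// block_size = 2 and remainder = 2, so the blocks cover [0,3), [3,6), [6,8),
// [8,10), and blockIndexOf(6) == 2.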
struct BlockPartition {
  std::size_t first_index = 0;
  std::size_t block_size = 0;
  std::size_t remainder = 0;
  std::size_t num_blocks = 0;

  BlockPartition(std::size_t first, std::size_t n, std::size_t desired) noexcept
      : first_index(first) {
    if (n == 0 || desired == 0) {
      num_blocks = 0;
      return;
    }
    num_blocks = std::min(desired, n);
    block_size = n / num_blocks;
    remainder = n % num_blocks;
    if (block_size == 0) {
      block_size = 1;
      num_blocks = n;
    }
  }

  [[nodiscard]] std::size_t blockIndexOf(std::size_t lo) const noexcept {
    const std::size_t rel = lo - first_index;
    const std::size_t big = remainder * (block_size + 1);
    if (rel < big) {
      return rel / (block_size + 1);
    }
    return remainder + ((rel - big) / block_size);
  }
};

// Maximum d for the direct-compute argmin hot path.
inline constexpr std::size_t kDirectArgminMaxD = 8;

} // namespace detail

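// Minimal usage sketch (shapes illustrative; centroid seeding, e.g. k-means++,
// is the caller's responsibility):
//
//   LloydFusedGemm<float> lloyd;
//   NDArray<float, 2, Layout::Contig> X({n, d});          // row-major samples
//   NDArray<float, 2, Layout::Contig> centroids({k, d});  // caller-seeded
//   NDArray<std::int32_t, 1> labels({n});
//   double inertia = 0.0;
//   std::size_t nIter = 0;
//   bool converged = false;
//   lloyd.run(X, centroids, k, /*maxIter=*/300, /*tol=*/1e-4F, math::Pool{},
//             labels, inertia, nIter, converged);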
template <class T> class LloydFusedGemm {
public:
  static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
                "LloydFusedGemm<T> requires T to be float or double");

  LloydFusedGemm()
      : m_centroidsOld({0, 0}), m_cSqNorms({0}), m_sums({0, 0}), m_counts({0}), m_minDistSq({0}),
        m_shiftSq({0}), m_partialSums({0}), m_partialComps({0}), m_partialCounts({0}),
        m_foldComp({0}), m_packedB({0}), m_packedCSqNorms({0}), m_distsChunk({0, 0}),
        m_gemmApArena({0}), m_xNormsSq({0}), m_varSum({0}), m_varSumSq({0}), m_u({0}), m_l({0}),
        m_shiftEuclidean({0}), m_halfDistToNearestOther({0}), m_elkanBounds({0, 0}),
        m_centerDist({0, 0}) {}

  // n threshold at which the centroid accumulator switches to Kahan-compensated summation.
#ifdef CLUSTERING_KMEANS_KAHAN_N_THRESHOLD
  static constexpr std::size_t kahanNThreshold = CLUSTERING_KMEANS_KAHAN_N_THRESHOLD;
#else
  static constexpr std::size_t kahanNThreshold = 100000;
#endif

  // Run the Lloyd loop against caller-seeded centroids.
  void run(const NDArray<T, 2, Layout::Contig> &X, NDArray<T, 2, Layout::Contig> &centroids,
           std::size_t k, std::size_t maxIter, T tol, math::Pool pool,
           NDArray<std::int32_t, 1> &outLabels, double &outInertia, std::size_t &outNIter,
           bool &outConverged) {
    const std::size_t n = X.dim(0);
    const std::size_t d = X.dim(1);

    CLUSTERING_ALWAYS_ASSERT(centroids.dim(0) == k);
    CLUSTERING_ALWAYS_ASSERT(centroids.dim(1) == d);
    CLUSTERING_ALWAYS_ASSERT(outLabels.dim(0) == n);

    if (n == 0 || d == 0) {
      outNIter = 0;
      outConverged = true;
      outInertia = 0.0;
      return;
    }

    const std::size_t workerCount = pool.workerCount();
    ensureShape(n, d, k, workerCount);

    // Sklearn-compatible tol semantics: the threshold on sum(||deltac_j||^2) is @c tol * mean_var,
    // where @c mean_var is the mean of per-column variances of @p X. This is scale-invariant,
    // which is the property callers expect when they pass the same numeric @c tol across
    // datasets of different magnitudes. The raw-L2-shift convention our earlier prose described
    // made @c tol=1e-4 hundreds of thousands of times tighter than sklearn at the same numeric
    // value, which inflated the Lloyd iteration count by 3-4x on typical blob data.
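    // Illustrative numbers (not from a benchmark): with mean per-column variance
    // 4.0 and tol = 1e-4, iteration stops once sum_j ||c_j_new - c_j_old||^2 <= 4e-4.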
    const T shiftSqThreshold = tol * meanColumnVariance(X);
    const bool useKahan = n >= kahanNThreshold;

    // X is input-only; its squared-row-norms are reused across every Lloyd iteration's
    // argmin post-pass. Compute once per run() so the iteration budget doesn't eat an
    // O(n*d) pass for every assignment.
    for (std::size_t i = 0; i < n; ++i) {
      m_xNormsSq(i) = math::detail::sqNormRow<T, Layout::Contig>(X, i);
    }

    refreshCentroidSqNorms(centroids);

    std::size_t iter = 0;
    bool converged = false;

    // Hamerly pruning starts once @c d leaves the direct small-D path. Fused-argmin shapes seed
    // valid per-point bounds after the first dense assignment; chunked shapes seed them inline
    // during the argmin post-pass. @c k is capped by @c kHamerlyMaxK because the per-row scan
    // uses a stack-allocated distance buffer; above that, Elkan handles bounded shapes and the
    // rest fall back to unbounded assignment.
    const bool hamerlyEligible = (d > detail::kDirectArgminMaxD) && (k <= kHamerlyMaxK) && (k >= 2);
    // Elkan keeps k lower bounds per sample instead of Hamerly's one, pruning far more distance
    // work once k exceeds Hamerly's regime. The @c n * k bound matrix grows linearly in both,
    // so we gate on an @c n * k envelope bound (memory ceiling) and require @c k above the
    // Hamerly cap so the two paths don't overlap.
    const bool elkanEligible = (d > math::defaults::pairwiseArgminMaxD) && (k > kHamerlyMaxK) &&
                               (k <= kElkanMaxK) && (n * k <= kElkanNKLimit) && (k >= 2);

    while (iter < maxIter) {
      if (hamerlyEligible && iter > 0) {
        runHamerlyAssignment(X, centroids, outLabels, pool);
      } else if (elkanEligible && iter > 0) {
        runElkanAssignment(X, centroids, outLabels, pool);
      } else {
        // First iteration (or no-prune shape) goes through the dispatcher. The chunked path
        // seeds Hamerly's @c m_u and @c m_l inline from the argmin post-pass, and Elkan's
        // @c m_elkanBounds matrix is filled from the same per-sample scan. The fused path only
        // returns the winning distance, so seed Hamerly's conservative lower bound separately.
        runAssignment(X, centroids, outLabels, pool);
        if (hamerlyEligible && iter == 0 && assignmentUsesFusedArgmin(X, centroids)) {
          seedHamerlyBoundsFromLabels(X, centroids, outLabels, pool);
        }
      }

      std::memcpy(m_centroidsOld.data(), centroids.data(),
                  centroids.dim(0) * centroids.dim(1) * sizeof(T));

      if (useKahan) {
        scatterAndFoldKahan(X, outLabels, k, pool);
      } else {
        scatterAndFoldPlain(X, outLabels, k, pool);
      }

      // Empty-cluster reseed: furthest-point pass bounded by the counts scan. m_minDistSq still
      // holds the decomposed-formula residual from the assignment above; the noise tail is
      // bounded by per-point `||c||^2 + ||x||^2` cancellation, smaller than the inter-blob
      // distance the donor is selected against, so the argmax selection is preserved in
      // practice on benchmark data. The donor's minDistSq is zeroed so successive empties
      // cannot reseed to the same point.
      (void)::clustering::kmeans::detail::reseedEmptyClusters<T>(X, centroids, m_sums, m_counts,
                                                                 m_minDistSq);

      finalizeMeans(centroids);
      refreshCentroidSqNorms(centroids);

      math::centroidShift<T>(m_centroidsOld, centroids, m_shiftSq, pool);
      const T totalShift = ::clustering::kmeans::detail::totalShiftSqKahan<T>(m_shiftSq);

      ++iter;
      if (totalShift <= shiftSqThreshold) {
        converged = true;
        break;
      }
    }

    // Re-assign labels against the final centroids. At convergence the bounds Hamerly maintains
    // are already tight for the pre-update centroids; feeding one more bound-aware pass against
    // the tiny final shift prunes nearly every row and is an order of magnitude cheaper than a
    // full chunked GEMM assignment. Force the serial fan-out so the per-worker submit/wait pair
    // doesn't dominate the trivial post-convergence work; the chunked fallback still fans out
    // when the shape never enabled Hamerly.
    if (hamerlyEligible && iter > 0) {
      runHamerlyAssignment(X, centroids, outLabels, math::Pool{});
    } else if (elkanEligible && iter > 0) {
      runElkanAssignment(X, centroids, outLabels, math::Pool{});
    } else {
      runAssignment(X, centroids, outLabels, pool);
    }
    if (!assignmentProducesDirectMinDistSq(X, centroids)) {
      recomputeMinDistSqDirect(X, centroids, outLabels, pool);
    }

    // Inertia: Kahan-summed in f64 to pin the 1% gate at large (n, k) envelopes where the
    // naive single-pass f32 add would drift.
    double sum = 0.0;
    double comp = 0.0;
    for (std::size_t i = 0; i < n; ++i) {
      const auto addend = static_cast<double>(m_minDistSq(i));
      const double y = addend - comp;
      const double t = sum + y;
      comp = (t - sum) - y;
      sum = t;
    }

    outInertia = sum;
    outNIter = iter;
    outConverged = converged;
  }

private:
  [[nodiscard]] T meanColumnVariance(const NDArray<T, 2, Layout::Contig> &X) {
    const std::size_t n = X.dim(0);
    const std::size_t d = X.dim(1);
    if (n == 0 || d == 0) {
      return T{0};
    }
    const T *xData = X.data();

    // Per-column accumulators kept in scratch so repeat runs at the same shape skip the
    // allocator. Row-major walk (X traversed in storage order) keeps every column load
    // inside the same cache line as its neighbors -- the natural column-major alternative
    // misses L1 once per load at large @p d.
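    // Single-pass moments: Var(col t) = colSumSq[t]/n - (colSum[t]/n)^2, so one
    // walk over X yields every per-column variance. The cancellation this form can
    // suffer is tolerable here because the result only scales the convergence
    // threshold.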
    if (m_varSum.dim(0) != d) {
      m_varSum = NDArray<T, 1>({d});
      m_varSumSq = NDArray<T, 1>({d});
    }
    T *colSum = m_varSum.data();
    T *colSumSq = m_varSumSq.data();
    for (std::size_t t = 0; t < d; ++t) {
      colSum[t] = T{0};
      colSumSq[t] = T{0};
    }
    for (std::size_t i = 0; i < n; ++i) {
      const T *row = xData + (i * d);
      math::detail::columnwiseAccumSumSq<T>(row, d, colSum, colSumSq);
    }
    const auto nInv = static_cast<T>(1) / static_cast<T>(n);
    T acc = T{0};
    for (std::size_t t = 0; t < d; ++t) {
      const T mean = colSum[t] * nInv;
      acc += (colSumSq[t] * nInv) - (mean * mean);
    }
    return acc / static_cast<T>(d);
  }

  void ensureShape(std::size_t n, std::size_t d, std::size_t k, std::size_t workerCount) {
    const bool shapeChanged = (n != m_n) || (d != m_d) || (k != m_k);
    const bool workerChanged = (workerCount != m_workerCount);
    if (!shapeChanged && !workerChanged) {
      return;
    }

    const std::size_t chunkCap = math::pairwiseArgminChunkRows;
    const std::size_t blocks = workerCount == 0 ? std::size_t{1} : workerCount;
    const std::size_t safeK = (k == 0) ? std::size_t{1} : k;

    if (shapeChanged) {
      m_centroidsOld = NDArray<T, 2, Layout::Contig>({k, d});
      m_cSqNorms = NDArray<T, 1>({k});
      m_sums = NDArray<T, 2, Layout::Contig>({k, d});
      m_counts = NDArray<std::int32_t, 1>({k});
      m_minDistSq = NDArray<T, 1>({n});
      m_xNormsSq = NDArray<T, 1>({n});
      m_shiftSq = NDArray<T, 1>({k});
      m_foldComp = NDArray<T, 1>({k * d});
      // Hamerly bound scratch: per-point upper/lower Euclidean bounds and per-cluster sqrt
      // shifts. Seeded by the first iteration's full scan and maintained by the bounds-aware
      // reassignment on subsequent iterations.
      m_u = NDArray<T, 1>({n});
      m_l = NDArray<T, 1>({n});
      m_shiftEuclidean = NDArray<T, 1>({k});
      m_halfDistToNearestOther = NDArray<T, 1>({k});
      // Elkan's @c n * k bound matrix and the per-pair centroid-distance matrix are allocated
      // whenever the shape fits the envelope; rows past the caller's active-k index stay zero
      // and the allocation cost amortizes across Lloyd iterations on the same shape.
      if (n * k <= kElkanNKLimit && k <= kElkanMaxK) {
        m_elkanBounds = NDArray<T, 2, Layout::Contig>({n, k});
        m_centerDist = NDArray<T, 2, Layout::Contig>({k, k});
      } else {
        m_elkanBounds = NDArray<T, 2, Layout::Contig>({0, 0});
        m_centerDist = NDArray<T, 2, Layout::Contig>({0, 0});
      }
      // Packed-B sizing: the fused fast path at d<=pairwiseArgminMaxD uses the flat
      // panel-per-centroid layout (ceil(k/Nr)*Nr*d); the chunked path uses the tiled
      // (jcIdx, pcIdx) layout that @c gemmRunPrepacked expects and is what supports d > kKc
      // and k > kNc without envelope asserts. The chunked path is also the universal
      // fallback (double, unaligned rows, and non-AVX2 builds reach it at any d), so the
      // scratch is sized for whichever layout is larger instead of gating on d.
      const std::size_t packedBSize =
          std::max(math::detail::packedBScratchSizeFloatsTiled<T>(k, d),
                   math::detail::packedBScratchSizeFloats(k, d));
      const std::size_t packedNormsSize = math::detail::packedCSqNormsScratchSizeFloats(k);
      m_packedB = NDArray<T, 1>({packedBSize == 0 ? std::size_t{1} : packedBSize});
      m_packedCSqNorms = NDArray<T, 1>({packedNormsSize == 0 ? std::size_t{1} : packedNormsSize});
      // Per-worker distance tile for the chunked path: one chunkCap*k slab per worker so
      // the chunk fan-out runs without touching a shared tile.
      m_distsChunk = NDArray<T, 2, Layout::Contig>({blocks * chunkCap, safeK});
    } else if (workerChanged) {
      // Only the per-worker distance slabs depend on workerCount here; the A-pack arena
      // and partial slabs below are resized on every pass through this function.
      m_distsChunk = NDArray<T, 2, Layout::Contig>({blocks * chunkCap, safeK});
    }

    // Per-block scratch sizing for scatter-and-fold. Block count caps at workerCount (see
    // BlockPartition); we size to the upper bound so both serial and parallel dispatch fit
    // without reallocation inside the loop.
    m_partialSums = NDArray<T, 1>({blocks * k * d});
    m_partialComps = NDArray<T, 1>({blocks * k * d});
    m_partialCounts = NDArray<std::int32_t, 1>({blocks * k});

    // Gemm A-pack arena sized to @c blocks * kMc * kKc so @c gemmRunPrepacked's per-worker
    // slice indexing stays in-bounds on every fan-out path.
    const std::size_t apSize = blocks * math::detail::kMc<T> * math::detail::kKc<T>;
    m_gemmApArena = NDArray<T, 1>({apSize});

    m_n = n;
    m_d = d;
    m_k = k;
    m_workerCount = workerCount;
  }

  void refreshCentroidSqNorms(const NDArray<T, 2, Layout::Contig> &centroids) noexcept {
    const std::size_t k = centroids.dim(0);
    const std::size_t d = centroids.dim(1);
    for (std::size_t c = 0; c < k; ++c) {
      const T *row = centroids.data() + (c * d);
      T s = T{0};
      for (std::size_t t = 0; t < d; ++t) {
        s += row[t] * row[t];
      }
      m_cSqNorms(c) = s;
    }
  }

  void finalizeMeans(NDArray<T, 2, Layout::Contig> &centroids) noexcept {
    const std::size_t k = centroids.dim(0);
    const std::size_t d = centroids.dim(1);
    for (std::size_t c = 0; c < k; ++c) {
      const std::int32_t cnt = m_counts(c);
      if (cnt <= 0) {
        continue;
      }
      const T inv = T{1} / static_cast<T>(cnt);
      const T *src = m_sums.data() + (c * d);
      T *dst = centroids.data() + (c * d);
      for (std::size_t t = 0; t < d; ++t) {
        dst[t] = src[t] * inv;
      }
    }
  }

  void runAssignment(const NDArray<T, 2, Layout::Contig> &X,
                     const NDArray<T, 2, Layout::Contig> &centroids,
                     NDArray<std::int32_t, 1> &labels, math::Pool pool) {
#ifdef CLUSTERING_USE_AVX2
    if constexpr (std::is_same_v<T, float>) {
      const std::size_t d = X.dim(1);
      if (X.template isAligned<32>() && centroids.template isAligned<32>() && d != 0) {
        if (d <= detail::kDirectArgminMaxD) {
          math::detail::pairwiseArgminDirectSmallDF32(X, centroids, labels, m_minDistSq, pool);
          return;
        }
        math::detail::pairwiseArgminOuterAvx2F32WithScratch(X, centroids, m_cSqNorms, labels,
                                                            m_minDistSq, m_packedB.data(),
                                                            m_packedCSqNorms.data(), pool);
        return;
      }
    }
#endif
    runChunkedMaterializedAssignment(X, centroids, labels, pool);
  }

  [[nodiscard]] bool
  assignmentProducesDirectMinDistSq(const NDArray<T, 2, Layout::Contig> &X,
                                    const NDArray<T, 2, Layout::Contig> &C) noexcept {
#ifdef CLUSTERING_USE_AVX2
    if constexpr (std::is_same_v<T, float>) {
      const std::size_t d = X.dim(1);
      return X.template isAligned<32>() && C.template isAligned<32>() && d != 0 &&
             d <= detail::kDirectArgminMaxD;
    } else {
      (void)X;
      (void)C;
      return false;
    }
#else
    (void)X;
    (void)C;
    return false;
#endif
  }

  [[nodiscard]] bool assignmentUsesFusedArgmin(const NDArray<T, 2, Layout::Contig> &X,
                                               const NDArray<T, 2, Layout::Contig> &C) noexcept {
#ifdef CLUSTERING_USE_AVX2
    if constexpr (std::is_same_v<T, float>) {
      const std::size_t d = X.dim(1);
      return X.template isAligned<32>() && C.template isAligned<32>() &&
             d > detail::kDirectArgminMaxD;
    } else {
      (void)X;
      (void)C;
      return false;
    }
#else
    (void)X;
    (void)C;
    return false;
#endif
  }

  void packCentroidsTiled(const NDArray<T, 2, Layout::Contig> &centroids) noexcept {
    constexpr std::size_t kNr = math::detail::kKernelNr<T>;
    constexpr std::size_t kKcVal = math::detail::kKc<T>;
    constexpr std::size_t kNcVal = math::detail::kNc<T>;
    const std::size_t k = centroids.dim(0);
    const std::size_t d = centroids.dim(1);
    const auto cTransposed = centroids.t();
    const auto cDesc = ::clustering::detail::describeMatrix(cTransposed);
    T *bp = m_packedB.data();
    std::size_t jcBase = 0;
    for (std::size_t jc = 0; jc < k; jc += kNcVal) {
      const std::size_t nc = (jc + kNcVal <= k) ? kNcVal : (k - jc);
      const std::size_t roundedNc = ((nc + kNr - 1) / kNr) * kNr;
      std::size_t pcOffInJc = 0;
      for (std::size_t pc = 0; pc < d; pc += kKcVal) {
        const std::size_t kc = (pc + kKcVal <= d) ? kKcVal : (d - pc);
        math::detail::packB<T>(cDesc, pc, kc, jc, nc, bp + jcBase + pcOffInJc);
        pcOffInJc += kc * roundedNc;
      }
      jcBase += d * roundedNc;
    }
  }

  void runChunkedMaterializedAssignment(const NDArray<T, 2, Layout::Contig> &X,
                                        const NDArray<T, 2, Layout::Contig> &centroids,
                                        NDArray<std::int32_t, 1> &labels,
                                        math::Pool pool) noexcept {
    const std::size_t n = X.dim(0);
    const std::size_t k = centroids.dim(0);
    const std::size_t d = X.dim(1);
    if (n == 0 || k == 0) {
      return;
    }

    packCentroidsTiled(centroids);

    constexpr std::size_t kMcVal = math::detail::kMc<T>;
    constexpr std::size_t kKcVal = math::detail::kKc<T>;
    const std::size_t chunkCap = math::pairwiseArgminChunkRows;
    const std::size_t numChunks = (n + chunkCap - 1) / chunkCap;
    const T *bp = m_packedB.data();
    T *apArena = m_gemmApArena.data();
    T *distsBase = m_distsChunk.data();
    const T *cNormsBase = m_cSqNorms.data();
    T *minDistBase = m_minDistSq.data();
    std::int32_t *labelsBase = labels.data();
    const T *xBase = X.data();
    // Hamerly bound seeding happens inline in the argmin post-pass; the cost is a handful of
    // extra comparisons per row plus two @c sqrt calls, far below a second pass over @p X.
    T *uBase = m_u.data();
    T *lBase = m_l.data();
    // Elkan bound seeding lights up only when the scratch matrix was sized for this shape;
    // at shapes past @ref kElkanNKLimit or @ref kElkanMaxK the pointer stays null and the
    // per-row loop skips the per-cluster bound stores.
    T *elkanBoundsBase = m_elkanBounds.dim(0) == n ? m_elkanBounds.data() : nullptr;

    auto runOneChunk = [&](std::size_t chunkIdx) noexcept {
      const std::size_t iBase = chunkIdx * chunkCap;
      const std::size_t chunkRows = (iBase + chunkCap <= n) ? chunkCap : (n - iBase);
      const std::size_t w = math::Pool::workerIndex();
      T *distsChunk = distsBase + (w * chunkCap * k);
      T *apSlice = apArena + (w * kMcVal * kKcVal);

      auto xChunk = NDArray<T, 2, Layout::Contig>::borrow(const_cast<T *>(xBase) + (iBase * d),
                                                          {chunkRows, d});
      auto distsView = NDArray<T, 2>::borrow(distsChunk, {chunkRows, k});
      const auto xDesc = ::clustering::detail::describeMatrix(xChunk);
      auto distsDesc = ::clustering::detail::describeMatrixMut(distsView);
      // Serial GEMM inside the chunk; the outer fan-out already owns parallelism.
      math::detail::gemmRunPrepacked<T>(xDesc, bp, d, k, distsDesc, T{-2}, T{0}, apSlice,
                                        math::Pool{});
      const T *xNormsChunk = m_xNormsSq.data() + iBase;
      for (std::size_t i = 0; i < chunkRows; ++i) {
        const T xn = xNormsChunk[i];
        const T *row = distsChunk + (i * k);
        T *elkanRow = elkanBoundsBase != nullptr ? elkanBoundsBase + ((iBase + i) * k) : nullptr;
        T bestVal = std::numeric_limits<T>::infinity();
        T secondVal = std::numeric_limits<T>::infinity();
        std::int32_t bestIdx = 0;
        for (std::size_t j = 0; j < k; ++j) {
          T v = row[j] + xn + cNormsBase[j];
          if (v < T{0}) {
            v = T{0};
          }
          if (elkanRow != nullptr) {
            elkanRow[j] = std::sqrt(v);
          }
          if (v < bestVal) {
            secondVal = bestVal;
            bestVal = v;
            bestIdx = static_cast<std::int32_t>(j);
          } else if (v < secondVal) {
            secondVal = v;
          }
        }
        minDistBase[iBase + i] = bestVal;
        labelsBase[iBase + i] = bestIdx;
        uBase[iBase + i] = std::sqrt(bestVal);
        lBase[iBase + i] = std::sqrt(secondVal);
      }
    };

    if (pool.shouldParallelize(numChunks, 1, 2) && pool.pool != nullptr) {
      pool.pool
          ->submit_blocks(std::size_t{0}, numChunks,
                          [&](std::size_t lo, std::size_t hi) {
                            for (std::size_t c = lo; c < hi; ++c) {
                              runOneChunk(c);
                            }
                          })
          .wait();
    } else {
      for (std::size_t c = 0; c < numChunks; ++c) {
        runOneChunk(c);
      }
    }
  }

  void scatterAndFoldPlain(const NDArray<T, 2, Layout::Contig> &X,
                           const NDArray<std::int32_t, 1> &labels, std::size_t k, math::Pool pool) {
    const std::size_t n = X.dim(0);
    const std::size_t d = X.dim(1);

    T *partialSums = m_partialSums.data();
    std::int32_t *partialCounts = m_partialCounts.data();

    for (std::size_t c = 0; c < k; ++c) {
      m_counts(c) = 0;
      for (std::size_t t = 0; t < d; ++t) {
        m_sums(c, t) = T{0};
      }
    }
    if (n == 0 || d == 0) {
      return;
    }

    const bool willParallelize = pool.shouldParallelizeWork(n * d) &&
                                 pool.shouldParallelize(n, 64, 2) && pool.pool != nullptr;
    const std::size_t desiredBlocks = willParallelize ? pool.workerCount() : std::size_t{1};
    const detail::BlockPartition part(0, n, desiredBlocks);
    const std::size_t numBlocks = part.num_blocks == 0 ? std::size_t{1} : part.num_blocks;

    for (std::size_t b = 0; b < numBlocks; ++b) {
      T *slab = partialSums + (b * k * d);
      std::int32_t *cslab = partialCounts + (b * k);
      for (std::size_t e = 0; e < k * d; ++e) {
        slab[e] = T{0};
      }
      for (std::size_t c = 0; c < k; ++c) {
        cslab[c] = 0;
      }
    }

    auto scatterRange = [&](std::size_t lo, std::size_t hi) noexcept {
      const std::size_t b = part.blockIndexOf(lo);
      T *slab = partialSums + (b * k * d);
      std::int32_t *cslab = partialCounts + (b * k);
      for (std::size_t i = lo; i < hi; ++i) {
        const std::int32_t lbl = labels(i);
        if (lbl < 0 || std::cmp_greater_equal(lbl, k)) {
          continue;
        }
        const auto row = static_cast<std::size_t>(lbl);
        const T *xRow = X.data() + (i * d);
        T *dst = slab + (row * d);
        for (std::size_t t = 0; t < d; ++t) {
          dst[t] += xRow[t];
        }
        cslab[row] += 1;
      }
    };

    if (willParallelize) {
      pool.pool
          ->submit_blocks(
              std::size_t{0}, n, [&](std::size_t lo, std::size_t hi) { scatterRange(lo, hi); },
              numBlocks)
          .wait();
    } else {
      scatterRange(0, n);
    }

    // Ascending-block-index fold. Deterministic at fixed (n, k, d, nJobs); changing this order
    // changes the last bit of the per-cluster sum and breaks bit-identity.
    for (std::size_t b = 0; b < numBlocks; ++b) {
      const T *slab = partialSums + (b * k * d);
      const std::int32_t *cslab = partialCounts + (b * k);
      for (std::size_t c = 0; c < k; ++c) {
        m_counts(c) += cslab[c];
        const T *src = slab + (c * d);
        T *dstRow = &m_sums(c, 0);
        for (std::size_t t = 0; t < d; ++t) {
          dstRow[t] += src[t];
        }
      }
    }
  }

  void scatterAndFoldKahan(const NDArray<T, 2, Layout::Contig> &X,
                           const NDArray<std::int32_t, 1> &labels, std::size_t k, math::Pool pool) {
    const std::size_t n = X.dim(0);
    const std::size_t d = X.dim(1);

    T *partialSums = m_partialSums.data();
    T *partialComps = m_partialComps.data();
    std::int32_t *partialCounts = m_partialCounts.data();
    T *foldComp = m_foldComp.data();

    for (std::size_t c = 0; c < k; ++c) {
      m_counts(c) = 0;
      for (std::size_t t = 0; t < d; ++t) {
        m_sums(c, t) = T{0};
      }
    }
    for (std::size_t e = 0; e < k * d; ++e) {
      foldComp[e] = T{0};
    }
    if (n == 0 || d == 0) {
      return;
    }

    const bool willParallelize = pool.shouldParallelizeWork(n * d) &&
                                 pool.shouldParallelize(n, 64, 2) && pool.pool != nullptr;
    const std::size_t desiredBlocks = willParallelize ? pool.workerCount() : std::size_t{1};
    const detail::BlockPartition part(0, n, desiredBlocks);
    const std::size_t numBlocks = part.num_blocks == 0 ? std::size_t{1} : part.num_blocks;

    for (std::size_t b = 0; b < numBlocks; ++b) {
      T *slab = partialSums + (b * k * d);
      T *cslab = partialComps + (b * k * d);
      std::int32_t *nslab = partialCounts + (b * k);
      for (std::size_t e = 0; e < k * d; ++e) {
        slab[e] = T{0};
        cslab[e] = T{0};
      }
      for (std::size_t c = 0; c < k; ++c) {
        nslab[c] = 0;
      }
    }

    auto scatterRange = [&](std::size_t lo, std::size_t hi) noexcept {
      const std::size_t b = part.blockIndexOf(lo);
      T *slab = partialSums + (b * k * d);
      T *cslab = partialComps + (b * k * d);
      std::int32_t *nslab = partialCounts + (b * k);
      for (std::size_t i = lo; i < hi; ++i) {
        const std::int32_t lbl = labels(i);
        if (lbl < 0 || std::cmp_greater_equal(lbl, k)) {
          continue;
        }
        const auto row = static_cast<std::size_t>(lbl);
        const T *xRow = X.data() + (i * d);
        T *sumRow = slab + (row * d);
        T *compRow = cslab + (row * d);
        math::detail::kahanAddRow<T>(xRow, d, sumRow, compRow);
        nslab[row] += 1;
      }
    };

    if (willParallelize) {
      pool.pool
          ->submit_blocks(
              std::size_t{0}, n, [&](std::size_t lo, std::size_t hi) { scatterRange(lo, hi); },
              numBlocks)
          .wait();
    } else {
      scatterRange(0, n);
    }

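    // Fold in ascending block order, mirroring the plain path's determinism note. Each
    // block contributes its compensated sum as src[t] - comp[t], and the fold itself is
    // Kahan-compensated through foldRow so cross-block rounding error never re-enters.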
    for (std::size_t b = 0; b < numBlocks; ++b) {
      const T *slab = partialSums + (b * k * d);
      const T *cslab = partialComps + (b * k * d);
      const std::int32_t *nslab = partialCounts + (b * k);
      for (std::size_t c = 0; c < k; ++c) {
        m_counts(c) += nslab[c];
        const T *src = slab + (c * d);
        const T *comp = cslab + (c * d);
        T *dstRow = &m_sums(c, 0);
        T *foldRow = foldComp + (c * d);
        for (std::size_t t = 0; t < d; ++t) {
          const T addend = src[t] - comp[t];
          const T y = addend - foldRow[t];
          const T tVal = dstRow[t] + y;
          foldRow[t] = (tVal - dstRow[t]) - y;
          dstRow[t] = tVal;
        }
      }
    }
  }

  void recomputeMinDistSqDirect(const NDArray<T, 2, Layout::Contig> &X,
                                const NDArray<T, 2, Layout::Contig> &centroids,
                                const NDArray<std::int32_t, 1> &labels, math::Pool pool) noexcept {
    const std::size_t n = X.dim(0);
    const std::size_t d = X.dim(1);
    const std::size_t k = centroids.dim(0);
    if (n == 0 || d == 0 || k == 0) {
      return;
    }

    auto runRowRange = [&](std::size_t lo, std::size_t hi) noexcept {
      for (std::size_t i = lo; i < hi; ++i) {
        const std::int32_t lbl = labels(i);
        if (lbl < 0 || std::cmp_greater_equal(lbl, k)) {
          m_minDistSq(i) = T{0};
          continue;
        }
        const T *xRow = X.data() + (i * d);
        const T *cRow = centroids.data() + (static_cast<std::size_t>(lbl) * d);
        m_minDistSq(i) = math::detail::sqEuclideanRowPtr<T>(xRow, cRow, d);
      }
    };

    if (pool.shouldParallelize(n, 64, 2) && pool.pool != nullptr) {
      pool.pool
          ->submit_blocks(std::size_t{0}, n,
                          [&](std::size_t lo, std::size_t hi) { runRowRange(lo, hi); })
          .wait();
    } else {
      runRowRange(0, n);
    }
  }

  void seedHamerlyBoundsFromLabels(const NDArray<T, 2, Layout::Contig> &X,
                                   const NDArray<T, 2, Layout::Contig> &centroids,
                                   const NDArray<std::int32_t, 1> &labels,
                                   math::Pool pool) noexcept {
    const std::size_t n = X.dim(0);
    const std::size_t d = X.dim(1);
    const std::size_t k = centroids.dim(0);
    if (n == 0 || d == 0 || k == 0) {
      return;
    }

    auto seedRange = [&](std::size_t lo, std::size_t hi) noexcept {
      for (std::size_t i = lo; i < hi; ++i) {
        const std::int32_t lbl = labels(i);
        if (lbl < 0 || std::cmp_greater_equal(lbl, k)) {
          m_minDistSq(i) = T{0};
          m_u(i) = std::numeric_limits<T>::infinity();
          m_l(i) = T{0};
          continue;
        }
        const T *xRow = X.data() + (i * d);
        const T *cRow = centroids.data() + (static_cast<std::size_t>(lbl) * d);
        const T tightSq = math::detail::sqEuclideanRowPtr<T>(xRow, cRow, d);
        m_minDistSq(i) = tightSq;
        m_u(i) = std::sqrt(tightSq);
        m_l(i) = T{0};
      }
    };

    if (pool.shouldParallelize(n, 64, 2) && pool.pool != nullptr) {
      pool.pool
          ->submit_blocks(std::size_t{0}, n,
                          [&](std::size_t lo, std::size_t hi) { seedRange(lo, hi); })
          .wait();
    } else {
      seedRange(0, n);
    }
  }

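  // Hamerly cap on k: the per-row rescan in runHamerlyAssignment keeps its distance
  // buffer on the stack (std::array<T, kHamerlyMaxK>), so larger k must go to Elkan
  // or unbounded assignment.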
  static constexpr std::size_t kHamerlyMaxK = 64;

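  // Elkan cap on k; shapes above this fall back to unbounded assignment (see
  // elkanEligible in run()).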
  static constexpr std::size_t kElkanMaxK = 4096;

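  // Memory ceiling on Elkan's n * k bound matrix: 32 Mi entries, i.e. 128 MiB of
  // bounds for float and 256 MiB for double.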
  static constexpr std::size_t kElkanNKLimit = std::size_t{32} << 20;

  void runHamerlyAssignment(const NDArray<T, 2, Layout::Contig> &X,
                            const NDArray<T, 2, Layout::Contig> &centroids,
                            NDArray<std::int32_t, 1> &labels, math::Pool pool) noexcept {
    const std::size_t n = X.dim(0);
    const std::size_t d = X.dim(1);
    const std::size_t k = centroids.dim(0);
    if (n == 0 || d == 0 || k == 0 || k > kHamerlyMaxK) {
      return;
    }
    const T *xData = X.data();
    const T *cData = centroids.data();
    T *uData = m_u.data();
    T *lData = m_l.data();
    T *minDistData = m_minDistSq.data();
    std::int32_t *labelsData = labels.data();

    // Per-cluster Euclidean shift + top-2 of shifts. The second-largest shift is the amount we
    // subtract from `l(x)` when x's assigned cluster is the one with the largest shift --
    // otherwise the largest shift is the loose bound donor for every non-assigned cluster.
    T sMax = T{0};
    T s2Max = T{0};
    std::size_t argMax = 0;
    T *shiftData = m_shiftEuclidean.data();
    for (std::size_t c = 0; c < k; ++c) {
      const T s = std::sqrt(m_shiftSq(c));
      shiftData[c] = s;
      if (s > sMax) {
        s2Max = sMax;
        sMax = s;
        argMax = c;
      } else if (s > s2Max) {
        s2Max = s;
      }
    }
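
    // Illustrative numbers: shifts {0.5, 0.2, 0.1} give sMax = 0.5 with argMax = 0 and
    // s2Max = 0.2, so a point assigned to cluster 0 decays l(x) by 0.2 while points in
    // any other cluster decay by 0.5.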

    // Per-cluster half-distance to the nearest other centroid. When `u(x)` for a sample
    // assigned to cluster @c c clears this threshold, the triangle inequality pins the sample
    // in @c c: any other @c c' is at least @c 2 * halfDist[c] away from @c c, so
    // `||x - c'|| >= 2 * halfDist[c] - u(x) >= u(x) >= ||x - c||`. Populating it is
    // `O(k^2 * d)`, negligible next to Hamerly's per-sample work at `k <= 64`.
    T *halfDistData = m_halfDistToNearestOther.data();
    for (std::size_t c = 0; c < k; ++c) {
      T nearestSq = std::numeric_limits<T>::infinity();
      const T *caRow = cData + (c * d);
      for (std::size_t cp = 0; cp < k; ++cp) {
        if (cp == c) {
          continue;
        }
        const T dsq = math::detail::sqEuclideanRowPtr<T>(caRow, cData + (cp * d), d);
        if (dsq < nearestSq) {
          nearestSq = dsq;
        }
      }
      halfDistData[c] = T{0.5} * std::sqrt(nearestSq);
    }

    auto processRange = [&](std::size_t lo, std::size_t hi) noexcept {
      std::array<T, kHamerlyMaxK> distBuf{};
      for (std::size_t i = lo; i < hi; ++i) {
        const std::int32_t a = labelsData[i];
        if (a < 0 || std::cmp_greater_equal(a, k)) {
          continue;
        }
        const auto au = static_cast<std::size_t>(a);
        T ui = uData[i] + shiftData[au];
        T li = lData[i] - ((au == argMax) ? s2Max : sMax);

        if (ui <= li) {
          uData[i] = ui;
          lData[i] = li;
          continue;
        }

        // Lemma 1 shortcut: if the upper bound clears the half-distance to the nearest other
        // centroid, the sample's label cannot have changed -- no need to recompute
        // `||x - c_a||`. @c ui is still the post-shift bound, which stays valid; @c li is
        // allowed to decay here because the outer per-sample gate will exact-recompute it on
        // the next iteration that forces a tightening or a full scan.
        if (ui <= halfDistData[au]) {
          uData[i] = ui;
          lData[i] = li;
          continue;
        }

        const T *xi = xData + (i * d);
        const T *caRow = cData + (au * d);
        const T tightSq = math::detail::sqEuclideanRowPtr<T>(xi, caRow, d);
        ui = std::sqrt(tightSq);

        if (ui <= li) {
          uData[i] = ui;
          lData[i] = li;
          minDistData[i] = tightSq;
          continue;
        }

        detail::sqEuclideanRowToBatch<T>(xi, cData, k, d, distBuf.data());
        T best = std::numeric_limits<T>::infinity();
        T second = std::numeric_limits<T>::infinity();
        std::int32_t bestIdx = 0;
        for (std::size_t j = 0; j < k; ++j) {
          const T v = distBuf[j];
          if (v < best) {
            second = best;
            best = v;
            bestIdx = static_cast<std::int32_t>(j);
          } else if (v < second) {
            second = v;
          }
        }
        labelsData[i] = bestIdx;
        minDistData[i] = best;
        uData[i] = std::sqrt(best);
        lData[i] = std::sqrt(second);
      }
    };

    if (pool.shouldParallelize(n, 64, 2) && pool.pool != nullptr) {
      pool.pool
          ->submit_blocks(std::size_t{0}, n,
                          [&](std::size_t lo, std::size_t hi) { processRange(lo, hi); })
          .wait();
    } else {
      processRange(0, n);
    }
  }

  void runElkanAssignment(const NDArray<T, 2, Layout::Contig> &X,
                          const NDArray<T, 2, Layout::Contig> &centroids,
                          NDArray<std::int32_t, 1> &labels, math::Pool pool) noexcept {
    const std::size_t n = X.dim(0);
    const std::size_t d = X.dim(1);
    const std::size_t k = centroids.dim(0);
    if (n == 0 || d == 0 || k == 0 || m_elkanBounds.dim(0) != n || m_elkanBounds.dim(1) != k) {
      return;
    }
    const T *xData = X.data();
    const T *cData = centroids.data();
    T *uData = m_u.data();
    T *boundsData = m_elkanBounds.data();
    T *minDistData = m_minDistSq.data();
    std::int32_t *labelsData = labels.data();

    // Per-cluster shift (Euclidean) used to update all bounds once at the top of the pass.
    T *shiftData = m_shiftEuclidean.data();
    for (std::size_t c = 0; c < k; ++c) {
      shiftData[c] = std::sqrt(m_shiftSq(c));
    }

    // Pairwise centroid distances. Symmetric; fill the upper triangle and mirror. `O(k^2 * d)`,
    // amortized against the @c n * k inner scan below.
    T *centerDistData = m_centerDist.data();
    T *halfDistData = m_halfDistToNearestOther.data();
    for (std::size_t c = 0; c < k; ++c) {
      centerDistData[(c * k) + c] = T{0};
      T nearest = std::numeric_limits<T>::infinity();
      for (std::size_t cp = 0; cp < k; ++cp) {
        if (cp == c) {
          continue;
        }
        T dist;
        if (cp > c) {
          const T dsq = math::detail::sqEuclideanRowPtr<T>(cData + (c * d), cData + (cp * d), d);
          dist = std::sqrt(dsq);
          centerDistData[(c * k) + cp] = dist;
          centerDistData[(cp * k) + c] = dist;
        } else {
          dist = centerDistData[(c * k) + cp];
        }
        if (dist < nearest) {
          nearest = dist;
        }
      }
      halfDistData[c] = T{0.5} * nearest;
    }

    auto processRange = [&](std::size_t lo, std::size_t hi) noexcept {
      for (std::size_t i = lo; i < hi; ++i) {
        std::int32_t a = labelsData[i];
        if (a < 0 || std::cmp_greater_equal(a, k)) {
          continue;
        }
        auto au = static_cast<std::size_t>(a);
        T u = uData[i] + shiftData[au];
        T *lRow = boundsData + (i * k);
        // Bound-shift pass for this sample: looser lower bounds against all clusters. Done
        // inline so the per-sample walk touches the row exactly once.
        for (std::size_t c = 0; c < k; ++c) {
          T lnew = lRow[c] - shiftData[c];
          if (lnew < T{0}) {
            lnew = T{0};
          }
          lRow[c] = lnew;
        }

        if (u <= halfDistData[au]) {
          uData[i] = u;
          continue;
        }

        bool uTight = false;
        const T *xi = xData + (i * d);
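        // Elkan's per-candidate pruning: a cluster c is skipped while u <= l(x, c)
        // (lower bound) or u <= 0.5 * d(c_a, c) (inter-center bound); u is tightened
        // to the exact distance at most once, lazily, before paying for any candidate.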
        for (std::size_t c = 0; c < k; ++c) {
          if (c == au) {
            continue;
          }
          const T lc = lRow[c];
          const T half = T{0.5} * centerDistData[(au * k) + c];
          if (u <= lc || u <= half) {
            continue;
          }
          if (!uTight) {
            const T tightSq = math::detail::sqEuclideanRowPtr<T>(xi, cData + (au * d), d);
            u = std::sqrt(tightSq);
            minDistData[i] = tightSq;
            uTight = true;
            if (u <= lc || u <= half) {
              continue;
            }
          }
          const T dSq = math::detail::sqEuclideanRowPtr<T>(xi, cData + (c * d), d);
          const T dEuc = std::sqrt(dSq);
          lRow[c] = dEuc;
          if (dEuc < u) {
            au = c;
            a = static_cast<std::int32_t>(c);
            u = dEuc;
            minDistData[i] = dSq;
          }
        }
        uData[i] = u;
        labelsData[i] = a;
      }
    };

    if (pool.shouldParallelize(n, 64, 2) && pool.pool != nullptr) {
      pool.pool
          ->submit_blocks(std::size_t{0}, n,
                          [&](std::size_t lo, std::size_t hi) { processRange(lo, hi); })
          .wait();
    } else {
      processRange(0, n);
    }
  }

  NDArray<T, 2, Layout::Contig> m_centroidsOld;
  NDArray<T, 1> m_cSqNorms;
  NDArray<T, 2, Layout::Contig> m_sums;
  NDArray<std::int32_t, 1> m_counts;
  NDArray<T, 1> m_minDistSq;
  NDArray<T, 1> m_shiftSq;
  NDArray<T, 1> m_partialSums;
  NDArray<T, 1> m_partialComps;
  NDArray<std::int32_t, 1> m_partialCounts;
  NDArray<T, 1> m_foldComp;
  NDArray<T, 1> m_packedB;
  NDArray<T, 1> m_packedCSqNorms;
  NDArray<T, 2, Layout::Contig> m_distsChunk;
  NDArray<T, 1> m_gemmApArena;
  NDArray<T, 1> m_xNormsSq;
  NDArray<T, 1> m_varSum;
  NDArray<T, 1> m_varSumSq;
  NDArray<T, 1> m_u;
  NDArray<T, 1> m_l;
  NDArray<T, 1> m_shiftEuclidean;
  NDArray<T, 1> m_halfDistToNearestOther;
  NDArray<T, 2, Layout::Contig> m_elkanBounds;
  NDArray<T, 2, Layout::Contig> m_centerDist;

  std::size_t m_n = 0;
  std::size_t m_d = 0;
  std::size_t m_k = 0;
  std::size_t m_workerCount = 0;
};

} // namespace clustering::kmeans