Clustering
C++20 header-only: DBSCAN, HDBSCAN, k-means.
Loading...
Searching...
No Matches
auto_seeder.h
Go to the documentation of this file.
1#pragma once
2
3#include <cstddef>
4#include <cstdint>
5#include <variant>
6
10#include "clustering/ndarray.h"
11
12namespace clustering::kmeans {
13
15
30template <class T, AutoSeederMode Mode = AutoSeederMode::kSingleRun> class AutoSeeder {
31public:
32#ifdef CLUSTERING_KMEANS_AFKMC2_N_THRESHOLD
39 static constexpr std::size_t afkmc2NThreshold = CLUSTERING_KMEANS_AFKMC2_N_THRESHOLD;
40#else
42 static constexpr std::size_t afkmc2NThreshold = 500000;
43#endif
44
46 static constexpr std::size_t afkmc2KFloor = AfkMc2Seeder<T>::kFloor;
47#ifdef CLUSTERING_KMEANS_AFKMC2_BEST_OF_N_THRESHOLD
53 static constexpr std::size_t afkmc2BestOfNThreshold =
54 CLUSTERING_KMEANS_AFKMC2_BEST_OF_N_THRESHOLD;
55#else
57 static constexpr std::size_t afkmc2BestOfNThreshold = 5000;
58#endif
59
60#ifdef CLUSTERING_KMEANS_AFKMC2_BEST_OF_D_THRESHOLD
66 static constexpr std::size_t afkmc2BestOfDThreshold =
67 CLUSTERING_KMEANS_AFKMC2_BEST_OF_D_THRESHOLD;
68#else
70 static constexpr std::size_t afkmc2BestOfDThreshold = 8;
71#endif
72
73#ifdef CLUSTERING_KMEANS_AFKMC2_BEST_OF_K_FLOOR
79 static constexpr std::size_t afkmc2BestOfKFloor = CLUSTERING_KMEANS_AFKMC2_BEST_OF_K_FLOOR;
80#else
82 static constexpr std::size_t afkmc2BestOfKFloor = 16;
83#endif
84
85#ifdef CLUSTERING_KMEANS_AFKMC2_BEST_OF_WORK_THRESHOLD
91 static constexpr std::size_t afkmc2BestOfWorkThreshold =
92 CLUSTERING_KMEANS_AFKMC2_BEST_OF_WORK_THRESHOLD;
93#else
95 static constexpr std::size_t afkmc2BestOfWorkThreshold = 80000;
96#endif
97
100
102 void run(const NDArray<T, 2> &X, std::size_t k, std::uint64_t seed, math::Pool pool,
103 NDArray<T, 2> &outCentroids) {
104 const std::size_t n = X.dim(0);
105 const std::size_t d = X.dim(1);
106 ensureShape(n, d, k);
107 std::visit([&](auto &s) { s.run(X, k, seed, pool, outCentroids); }, m_held);
108 }
109
110private:
111 void ensureShape(std::size_t n, std::size_t d, std::size_t k) {
112 if (n == m_lastN && d == m_lastD && k == m_lastK) {
113 return;
114 }
115 if (shouldUseAfkmc2(n, d, k)) {
116 if (!std::holds_alternative<AfkMc2Seeder<T>>(m_held)) {
117 m_held.template emplace<AfkMc2Seeder<T>>();
118 }
119 } else {
120 if (!std::holds_alternative<GreedyKmppSeeder<T>>(m_held)) {
121 m_held.template emplace<GreedyKmppSeeder<T>>();
122 }
123 }
124 m_lastN = n;
125 m_lastD = d;
126 m_lastK = k;
127 }
128
129 [[nodiscard]] static constexpr bool shouldUseAfkmc2(std::size_t n, std::size_t d,
130 std::size_t k) noexcept {
131 const bool largeKEnvelope = (n >= afkmc2NThreshold) && (k >= afkmc2KFloor);
132 if constexpr (Mode == AutoSeederMode::kBestOf) {
133 const bool bestOfEnvelope = (n >= afkmc2BestOfNThreshold) && (d >= afkmc2BestOfDThreshold) &&
134 (k >= afkmc2BestOfKFloor) &&
135 (n >= ((afkmc2BestOfWorkThreshold + d - 1) / d));
136 return largeKEnvelope || bestOfEnvelope;
137 } else {
138 (void)d;
139 return largeKEnvelope;
140 }
141 }
142
143 std::variant<GreedyKmppSeeder<T>, AfkMc2Seeder<T>> m_held{
144 std::in_place_type<GreedyKmppSeeder<T>>};
145 std::size_t m_lastN = 0;
146 std::size_t m_lastD = 0;
147 std::size_t m_lastK = 0;
148};
149
150} // namespace clustering::kmeans
Represents a multidimensional array (NDArray) of a fixed number of dimensions N and element type T.
Definition ndarray.h:136
size_t dim(std::size_t index) const noexcept
Returns the size of a specific dimension of the NDArray.
Definition ndarray.h:461
static constexpr std::size_t chainLengthDefault
Default Markov-chain length per centroid pick.
static constexpr std::size_t kFloor
Minimum k below which the AFK-MC2 chain's log-k bound is too loose to win.
Seeder that picks between greedy k-means++ and AFK-MC2 against workload shape.
Definition auto_seeder.h:30
static constexpr std::size_t afkmc2NThreshold
n threshold above which AFK-MC2 is preferred over greedy k-means++.
Definition auto_seeder.h:42
void run(const NDArray< T, 2 > &X, std::size_t k, std::uint64_t seed, math::Pool pool, NDArray< T, 2 > &outCentroids)
Seed outCentroids with the dispatched seeder; see the class docs for the dispatch rule.
static constexpr std::size_t afkmc2KFloor
Minimum k at which AFK-MC2 is considered; mirrors AfkMc2Seeder::kFloor.
Definition auto_seeder.h:46
static constexpr std::size_t afkmc2BestOfKFloor
Minimum k for best-of restart AFK-MC2 dispatch.
Definition auto_seeder.h:82
static constexpr std::size_t afkmc2BestOfNThreshold
Best-of restart threshold on n above which AFK-MC2 is preferred.
Definition auto_seeder.h:57
static constexpr std::size_t afkmc2BestOfWorkThreshold
Minimum n * d work envelope for best-of restart AFK-MC2 dispatch.
Definition auto_seeder.h:95
static constexpr std::size_t afkmc2BestOfDThreshold
Best-of restart threshold on d above which AFK-MC2 is preferred.
Definition auto_seeder.h:70
static constexpr std::size_t afkmc2ChainLengthDefault
Default Markov-chain length passed through to AFK-MC2.
Definition auto_seeder.h:99
Thin injection wrapper around a BS::light_thread_pool.
Definition thread.h:63