Clustering
C++20 header-only: DBSCAN, HDBSCAN, k-means.
Loading...
Searching...
No Matches
hdbscan.h
Go to the documentation of this file.
1#pragma once
2
3#include <BS_thread_pool.hpp>
4#include <algorithm>
5#include <array>
6#include <cmath>
7#include <cstddef>
8#include <cstdint>
9#include <limits>
10#include <optional>
11#include <span>
12#include <type_traits>
13#include <utility>
14#include <vector>
15
17#include "clustering/hdbscan/detail/condensed_tree.h"
18#include "clustering/hdbscan/detail/eom_extract.h"
19#include "clustering/hdbscan/detail/glosh.h"
20#include "clustering/hdbscan/detail/leaf_extract.h"
21#include "clustering/hdbscan/detail/single_linkage.h"
26#include "clustering/ndarray.h"
27
29
37enum class ClusterSelectionMethod : std::uint8_t {
40};
41
52enum class MinSamplesConvention : std::uint8_t {
59};
60
61} // namespace clustering::hdbscan
62
63namespace clustering {
64
107template <class T, class MstBackend = hdbscan::AutoMstBackend<T>>
108 requires hdbscan::MstBackendStrategy<MstBackend, T>
109class HDBSCAN {
110 static_assert(std::is_same_v<T, float>,
111 "HDBSCAN<T> supports only float; a double specialization is out of scope.");
112
113public:
123 std::span<const std::int32_t> parent;
125 std::span<const std::int32_t> child;
127 std::span<const T> lambda;
129 std::span<const std::int32_t> childSize;
130
132 [[nodiscard]] bool empty() const noexcept { return parent.empty(); }
134 [[nodiscard]] std::size_t size() const noexcept { return parent.size(); }
135 };
136
152 explicit HDBSCAN(
153 std::size_t minClusterSize, std::size_t minSamples = 0,
155 std::size_t nJobs = 0,
157 : m_minClusterSize(minClusterSize), m_minSamples(minSamples), m_method(method),
158 m_nJobs(math::clampedJobCount(nJobs)), m_convention(convention), m_labels({0}),
159 m_outlierScores({0}) {
160 CLUSTERING_ALWAYS_ASSERT(minClusterSize >= 2);
161 // Defer pool construction to @ref run: at small shapes every hot phase gates serial, and
162 // spawning nJobs workers in the ctor burns thread-create overhead that the fit never
163 // amortizes at small shapes. @ref run emplaces on first need and reuses.
164 }
165
166 HDBSCAN(const HDBSCAN &) = delete;
167 HDBSCAN &operator=(const HDBSCAN &) = delete;
168 HDBSCAN(HDBSCAN &&) = delete;
169 HDBSCAN &operator=(HDBSCAN &&) = delete;
170 ~HDBSCAN() = default;
171
183 void run(const NDArray<T, 2> &X) {
184 const std::size_t n = X.dim(0);
185
186 CLUSTERING_ALWAYS_ASSERT(m_minClusterSize >= 2);
187
188 // Translate the user-facing @c minSamples to the non-self neighbour count the MST backends
189 // consume. Under the scikit-learn convention the query point counts as one of the neighbours
190 // and we feed the backend one less; under Campello we pass it through.
191 const std::size_t requestedMinSamples = (m_minSamples == 0) ? m_minClusterSize : m_minSamples;
192 if (m_convention == hdbscan::MinSamplesConvention::kSklearn) {
193 CLUSTERING_ALWAYS_ASSERT(requestedMinSamples >= 2);
194 } else {
195 CLUSTERING_ALWAYS_ASSERT(requestedMinSamples >= 1);
196 }
197 const std::size_t effectiveMinSamples =
198 (m_convention == hdbscan::MinSamplesConvention::kSklearn) ? requestedMinSamples - 1
199 : requestedMinSamples;
200 CLUSTERING_ALWAYS_ASSERT(effectiveMinSamples >= 1);
201 CLUSTERING_ALWAYS_ASSERT(effectiveMinSamples < n);
202 CLUSTERING_ALWAYS_ASSERT(n >= m_minClusterSize);
204 static_cast<std::size_t>(std::numeric_limits<std::int32_t>::max()));
205
206 // Lazy pool spawn: the MST backend is the only phase that can parallelise at all; post-MST is
207 // serial by contract. Size the work gate by the backend's dominant shape (N * N) so small
208 // inputs never pay the thread-create cost.
209 if (!m_pool.has_value() &&
210 math::shouldSpawnPool(n * n, m_nJobs, /*minOpsPerWorker=*/1U << 15)) {
211 m_pool.emplace(m_nJobs);
212 }
213 const math::Pool pool{m_pool.has_value() ? &*m_pool : nullptr};
214
215 // Phase 1: MST via the pinned backend. The backend writes edges and core distances into
216 // `m_mstOutput`.
217 m_backend.run(X, effectiveMinSamples, pool, m_mstOutput);
218
219 // Convert squared distances to linear distances before the post-MST pipeline consumes them.
220 // The backends store squared Euclidean internally (avoids an @c sqrt per pair-distance); the
221 // MST structure is invariant under @c d -> `sqrt(d)` (monotone) but the condensed-tree
222 // stability DP compares absolute lambda values whose outcome is not invariant under that
223 // transform. Linearising here aligns the lambda scale with the reference implementation and
224 // with outlier-score bounds users expect.
225 {
226 const std::size_t nCore = m_mstOutput.coreDistances.dim(0);
227 T *coreData = m_mstOutput.coreDistances.data();
228 for (std::size_t i = 0; i < nCore; ++i) {
229 coreData[i] = std::sqrt(coreData[i]);
230 }
231 for (auto &edge : m_mstOutput.edges) {
232 edge.weight = std::sqrt(edge.weight);
233 }
234 }
235
236 // Phase 2: build the single-linkage dendrogram from the MST edges.
237 hdbscan::detail::SingleLinkageTree<T> slt;
238 hdbscan::detail::buildSingleLinkageTree(m_mstOutput, n, slt);
239
240 // Phase 3: condense the dendrogram under `minClusterSize`.
241 hdbscan::detail::CondensedTree<T> condensed;
242 hdbscan::detail::condenseTree(slt, n, m_minClusterSize, condensed);
243
244 // Phase 4: cluster extraction (EOM or leaf).
245 std::vector<std::int32_t> labels;
247 hdbscan::detail::extractEom(condensed, n, labels);
248 } else {
249 hdbscan::detail::extractLeaf(condensed, n, labels);
250 }
251
252 // Phase 5: GLOSH outlier scores.
253 std::vector<T> scores;
254 hdbscan::detail::computeGlosh(condensed, n, labels, scores);
255
256 // Finalise result accessors. The label array lands in the public NDArray buffer; ditto the
257 // outlier-score array. The condensed tree is retained in its parallel-array form so the
258 // public view can borrow from it without an additional copy.
259 m_labels = NDArray<std::int32_t, 1>(std::array<std::size_t, 1>{n});
260 std::int32_t maxLabel = -1;
261 for (std::size_t i = 0; i < n; ++i) {
262 m_labels(i) = labels[i];
263 maxLabel = std::max(maxLabel, labels[i]);
264 }
265 m_outlierScores = NDArray<T, 1>(std::array<std::size_t, 1>{n});
266 for (std::size_t i = 0; i < n; ++i) {
267 m_outlierScores(i) = scores[i];
268 }
269 m_nClusters = (maxLabel < 0) ? std::size_t{0} : static_cast<std::size_t>(maxLabel) + 1;
270
271 m_ctParent = std::move(condensed.parent);
272 m_ctChild = std::move(condensed.child);
273 m_ctLambda = std::move(condensed.lambdaVal);
274 m_ctChildSize = std::move(condensed.childSize);
275 }
276
279 [[nodiscard]] const NDArray<std::int32_t, 1> &labels() const noexcept { return m_labels; }
280
283 [[nodiscard]] const NDArray<T, 1> &outlierScores() const noexcept { return m_outlierScores; }
284
287 [[nodiscard]] std::size_t nClusters() const noexcept { return m_nClusters; }
288
291 [[nodiscard]] CondensedTreeView condensedTree() const noexcept {
292 return CondensedTreeView{
293 .parent = std::span<const std::int32_t>(m_ctParent.data(), m_ctParent.size()),
294 .child = std::span<const std::int32_t>(m_ctChild.data(), m_ctChild.size()),
295 .lambda = std::span<const T>(m_ctLambda.data(), m_ctLambda.size()),
296 .childSize = std::span<const std::int32_t>(m_ctChildSize.data(), m_ctChildSize.size()),
297 };
298 }
299
301 void reset() {
302 m_labels = NDArray<std::int32_t, 1>({0});
303 m_outlierScores = NDArray<T, 1>({0});
304 m_nClusters = 0;
305 m_ctParent = std::vector<std::int32_t>{};
306 m_ctChild = std::vector<std::int32_t>{};
307 m_ctLambda = std::vector<T>{};
308 m_ctChildSize = std::vector<std::int32_t>{};
309 m_mstOutput = hdbscan::MstOutput<T>{};
310 m_backend = MstBackend{};
311 }
312
313private:
314 std::size_t m_minClusterSize;
315 std::size_t m_minSamples;
317 std::size_t m_nJobs;
322 std::optional<BS::light_thread_pool> m_pool;
324 NDArray<T, 1> m_outlierScores;
325 std::size_t m_nClusters = 0;
326
327 // Condensed-tree parallel arrays, filled by the post-MST pipeline and surfaced through
328 // @ref condensedTree as a read-only view.
329 std::vector<std::int32_t> m_ctParent;
330 std::vector<std::int32_t> m_ctChild;
331 std::vector<T> m_ctLambda;
332 std::vector<std::int32_t> m_ctChildSize;
333
334 MstBackend m_backend{};
335 hdbscan::MstOutput<T> m_mstOutput{};
336};
337
338} // namespace clustering
#define CLUSTERING_ALWAYS_ASSERT(cond)
Release-active assertion: evaluates cond in every build configuration.
CondensedTreeView condensedTree() const noexcept
Borrowed view over the condensed tree from the most recent run, or an empty view if no fit has produc...
Definition hdbscan.h:291
HDBSCAN(std::size_t minClusterSize, std::size_t minSamples=0, hdbscan::ClusterSelectionMethod method=hdbscan::ClusterSelectionMethod::kEom, std::size_t nJobs=0, hdbscan::MinSamplesConvention convention=hdbscan::MinSamplesConvention::kSklearn)
Construct a reusable HDBSCAN fitter.
Definition hdbscan.h:152
HDBSCAN(const HDBSCAN &)=delete
std::size_t nClusters() const noexcept
Total number of clusters discovered by the most recent run, or 0 if no fit has produced a result yet.
Definition hdbscan.h:287
HDBSCAN & operator=(HDBSCAN &&)=delete
HDBSCAN & operator=(const HDBSCAN &)=delete
const NDArray< std::int32_t, 1 > & labels() const noexcept
Length-n assignment; -1 marks noise.
Definition hdbscan.h:279
void run(const NDArray< T, 2 > &X)
Fit to X.
Definition hdbscan.h:183
HDBSCAN(HDBSCAN &&)=delete
const NDArray< T, 1 > & outlierScores() const noexcept
Length-n per-point GLOSH outlier scores in [0, 1].
Definition hdbscan.h:283
void reset()
Release every scratch buffer. The next run call reallocates against its shape.
Definition hdbscan.h:301
Represents a multidimensional array (NDArray) of a fixed number of dimensions N and element type T.
Definition ndarray.h:136
size_t dim(std::size_t index) const noexcept
Returns the size of a specific dimension of the NDArray.
Definition ndarray.h:461
ClusterSelectionMethod
Cluster extraction method on the condensed tree.
Definition hdbscan.h:37
@ kEom
Excess-of-mass selection (the published default).
Definition hdbscan.h:38
@ kLeaf
Leaf-cluster selection; every condensed-tree leaf becomes a cluster.
Definition hdbscan.h:39
MinSamplesConvention
Semantics of the minSamples parameter at core-distance extraction.
Definition hdbscan.h:52
@ kCampello
minSamples counts only non-self neighbours, matching Campello 2015 directly.
Definition hdbscan.h:58
@ kSklearn
minSamples counts the query point itself as one of the k neighbours.
Definition hdbscan.h:56
bool shouldSpawnPool(std::size_t totalOps, std::size_t nJobs, std::size_t minOpsPerWorker=std::size_t{1}<< 15) noexcept
Decide whether spawning a pool with nJobs workers is worth it for totalOps of arithmetic work.
Definition thread.h:43
Read-only view over the condensed-tree result.
Definition hdbscan.h:121
std::size_t size() const noexcept
Number of rows in the condensed tree.
Definition hdbscan.h:134
std::span< const std::int32_t > childSize
Size of the child subtree at its birth lambda.
Definition hdbscan.h:129
std::span< const std::int32_t > child
Child cluster or leaf point id for each row.
Definition hdbscan.h:125
bool empty() const noexcept
True when the view holds no rows (no fit or reset called).
Definition hdbscan.h:132
std::span< const T > lambda
Lambda value (= 1 / distance) at which child detaches from parent.
Definition hdbscan.h:127
std::span< const std::int32_t > parent
Parent cluster id for each row of the condensed tree.
Definition hdbscan.h:123
Frozen output contract of every MST backend.
Definition mst_output.h:41
Thin injection wrapper around a BS::light_thread_pool.
Definition thread.h:63