3#include <BS_thread_pool.hpp>
17#include "clustering/hdbscan/detail/condensed_tree.h"
18#include "clustering/hdbscan/detail/eom_extract.h"
19#include "clustering/hdbscan/detail/glosh.h"
20#include "clustering/hdbscan/detail/leaf_extract.h"
21#include "clustering/hdbscan/detail/single_linkage.h"
107template <
class T,
class MstBackend = hdbscan::AutoMstBackend<T>>
108 requires hdbscan::MstBackendStrategy<MstBackend, T>
110 static_assert(std::is_same_v<T, float>,
111 "HDBSCAN<T> supports only float; a double specialization is out of scope.");
125 std::span<const std::int32_t>
child;
132 [[nodiscard]]
bool empty() const noexcept {
return parent.empty(); }
134 [[nodiscard]] std::size_t
size() const noexcept {
return parent.size(); }
153 std::size_t minClusterSize, std::size_t minSamples = 0,
155 std::size_t nJobs = 0,
157 : m_minClusterSize(minClusterSize), m_minSamples(minSamples), m_method(method),
158 m_nJobs(
math::clampedJobCount(nJobs)), m_convention(convention), m_labels({0}),
159 m_outlierScores({0}) {
184 const std::size_t n = X.
dim(0);
191 const std::size_t requestedMinSamples = (m_minSamples == 0) ? m_minClusterSize : m_minSamples;
197 const std::size_t effectiveMinSamples =
199 : requestedMinSamples;
204 static_cast<std::size_t
>(std::numeric_limits<std::int32_t>::max()));
209 if (!m_pool.has_value() &&
211 m_pool.emplace(m_nJobs);
213 const math::Pool pool{m_pool.has_value() ? &*m_pool :
nullptr};
217 m_backend.run(X, effectiveMinSamples, pool, m_mstOutput);
226 const std::size_t nCore = m_mstOutput.coreDistances.dim(0);
227 T *coreData = m_mstOutput.coreDistances.data();
228 for (std::size_t i = 0; i < nCore; ++i) {
229 coreData[i] = std::sqrt(coreData[i]);
231 for (
auto &edge : m_mstOutput.edges) {
232 edge.weight = std::sqrt(edge.weight);
237 hdbscan::detail::SingleLinkageTree<T> slt;
238 hdbscan::detail::buildSingleLinkageTree(m_mstOutput, n, slt);
241 hdbscan::detail::CondensedTree<T> condensed;
242 hdbscan::detail::condenseTree(slt, n, m_minClusterSize, condensed);
245 std::vector<std::int32_t>
labels;
247 hdbscan::detail::extractEom(condensed, n,
labels);
249 hdbscan::detail::extractLeaf(condensed, n,
labels);
253 std::vector<T> scores;
254 hdbscan::detail::computeGlosh(condensed, n,
labels, scores);
260 std::int32_t maxLabel = -1;
261 for (std::size_t i = 0; i < n; ++i) {
263 maxLabel = std::max(maxLabel,
labels[i]);
265 m_outlierScores =
NDArray<T, 1>(std::array<std::size_t, 1>{n});
266 for (std::size_t i = 0; i < n; ++i) {
267 m_outlierScores(i) = scores[i];
269 m_nClusters = (maxLabel < 0) ? std::size_t{0} :
static_cast<std::size_t
>(maxLabel) + 1;
271 m_ctParent = std::move(condensed.parent);
272 m_ctChild = std::move(condensed.child);
273 m_ctLambda = std::move(condensed.lambdaVal);
274 m_ctChildSize = std::move(condensed.childSize);
287 [[nodiscard]] std::size_t
nClusters() const noexcept {
return m_nClusters; }
293 .parent = std::span<const std::int32_t>(m_ctParent.data(), m_ctParent.size()),
294 .child = std::span<const std::int32_t>(m_ctChild.data(), m_ctChild.size()),
295 .lambda = std::span<const T>(m_ctLambda.data(), m_ctLambda.size()),
296 .childSize = std::span<const std::int32_t>(m_ctChildSize.data(), m_ctChildSize.size()),
305 m_ctParent = std::vector<std::int32_t>{};
306 m_ctChild = std::vector<std::int32_t>{};
307 m_ctLambda = std::vector<T>{};
308 m_ctChildSize = std::vector<std::int32_t>{};
310 m_backend = MstBackend{};
314 std::size_t m_minClusterSize;
315 std::size_t m_minSamples;
322 std::optional<BS::light_thread_pool> m_pool;
325 std::size_t m_nClusters = 0;
329 std::vector<std::int32_t> m_ctParent;
330 std::vector<std::int32_t> m_ctChild;
331 std::vector<T> m_ctLambda;
332 std::vector<std::int32_t> m_ctChildSize;
334 MstBackend m_backend{};
335 hdbscan::MstOutput<T> m_mstOutput{};
#define CLUSTERING_ALWAYS_ASSERT(cond)
Release-active assertion: evaluates cond in every build configuration.
CondensedTreeView condensedTree() const noexcept
Borrowed view over the condensed tree from the most recent run, or an empty view if no fit has produc...
HDBSCAN(std::size_t minClusterSize, std::size_t minSamples=0, hdbscan::ClusterSelectionMethod method=hdbscan::ClusterSelectionMethod::kEom, std::size_t nJobs=0, hdbscan::MinSamplesConvention convention=hdbscan::MinSamplesConvention::kSklearn)
Construct a reusable HDBSCAN fitter.
HDBSCAN(const HDBSCAN &)=delete
std::size_t nClusters() const noexcept
Total number of clusters discovered by the most recent run, or 0 if no fit has produced a result yet.
HDBSCAN & operator=(HDBSCAN &&)=delete
HDBSCAN & operator=(const HDBSCAN &)=delete
const NDArray< std::int32_t, 1 > & labels() const noexcept
Length-n assignment; -1 marks noise.
void run(const NDArray< T, 2 > &X)
Fit to X.
HDBSCAN(HDBSCAN &&)=delete
const NDArray< T, 1 > & outlierScores() const noexcept
Length-n per-point GLOSH outlier scores in [0, 1].
void reset()
Release every scratch buffer. The next run call reallocates against its shape.
Represents a multidimensional array (NDArray) of a fixed number of dimensions N and element type T.
size_t dim(std::size_t index) const noexcept
Returns the size of a specific dimension of the NDArray.
ClusterSelectionMethod
Cluster extraction method on the condensed tree.
@ kEom
Excess-of-mass selection (the published default).
@ kLeaf
Leaf-cluster selection; every condensed-tree leaf becomes a cluster.
MinSamplesConvention
Semantics of the minSamples parameter at core-distance extraction.
@ kCampello
minSamples counts only non-self neighbours, matching Campello 2015 directly.
@ kSklearn
minSamples counts the query point itself as one of the k neighbours.
bool shouldSpawnPool(std::size_t totalOps, std::size_t nJobs, std::size_t minOpsPerWorker=std::size_t{1}<< 15) noexcept
Decide whether spawning a pool with nJobs workers is worth it for totalOps of arithmetic work.
Read-only view over the condensed-tree result.
std::size_t size() const noexcept
Number of rows in the condensed tree.
std::span< const std::int32_t > childSize
Size of the child subtree at its birth lambda.
std::span< const std::int32_t > child
Child cluster or leaf point id for each row.
bool empty() const noexcept
True when the view holds no rows (no fit or reset called).
std::span< const T > lambda
Lambda value (= 1 / distance) at which child detaches from parent.
std::span< const std::int32_t > parent
Parent cluster id for each row of the condensed tree.
Frozen output contract of every MST backend.
Thin injection wrapper around a BS::light_thread_pool.