Clustering
C++20 header-only: DBSCAN, HDBSCAN, k-means.
Loading...
Searching...
No Matches
boruvka_mst_backend.h
Go to the documentation of this file.
1#pragma once
2
3#include <algorithm>
4#include <array>
5#include <cstddef>
6#include <cstdint>
7#include <type_traits>
8#include <utility>
9#include <vector>
10
12#include "clustering/hdbscan/detail/boruvka_traversal.h"
15#include "clustering/math/dsu.h"
17#include "clustering/ndarray.h"
18
19namespace clustering::hdbscan {
20
41template <class T> class BoruvkaMstBackend {
42 static_assert(
43 std::is_same_v<T, float>,
44 "BoruvkaMstBackend<T> supports only float; a double specialization is out of scope.");
45
46public:
47 BoruvkaMstBackend() = default;
48
63 void run(const NDArray<T, 2> &X, std::size_t minSamples, math::Pool pool, MstOutput<T> &out) {
64 const std::size_t n = X.dim(0);
65 CLUSTERING_ALWAYS_ASSERT(minSamples >= 1);
66 CLUSTERING_ALWAYS_ASSERT(minSamples < n);
67
68 out.edges.clear();
69 out.edges.reserve(n - 1);
70 out.coreDistances = NDArray<T, 1>(std::array<std::size_t, 1>{n});
71
72 // Build the tree once; reused for the kNN core-distance query and for every Boruvka
73 // round's per-point nearest-out-of-component traversal. LeafSize=32 halves the internal
74 // node count against the default 16-leaf tree; at d>=4 the batched-distance leaf kernel
75 // amortises the larger leaf's arithmetic well enough that the saved per-node AABB gap and
76 // pivot distance wins net positive on every shape exercised by HDBSCAN's dispatch gates.
78
79 // Phase 1: core distances via kNN at k = minSamples. knnQuery returns neighbours sorted
80 // ascending by squared distance, so the minSamples-th (1-based) neighbour is at column
81 // @c minSamples - 1. Squared distances are the in-memory unit across the pipeline; the
82 // downstream MRD weight and Prim oracle use the same scale.
83 const auto kSigned = static_cast<std::int32_t>(minSamples);
84 auto [knnIdx, knnSqDist] = tree.knnQuery(kSigned, pool);
85 T *coreDistData = out.coreDistances.data();
86 for (std::size_t i = 0; i < n; ++i) {
87 coreDistData[i] = knnSqDist(i, minSamples - 1);
88 }
89
90 // Phase 2: Boruvka rounds. Each round computes one best out-of-component candidate edge
91 // per component via the per-point traversal, then unions those edges in ascending order of
92 // weight. Processing in ascending weight keeps the MRD-MST total identical to the classical
93 // Kruskal/Prim result, modulo equal-weight ties which admit multiple valid MSTs.
95 std::vector<std::int32_t> componentOf(n, std::int32_t{0});
96 std::vector<detail::ComponentBestEdge<T>> bestPerComponent(n);
97 std::vector<std::int32_t> activeRoots;
98 activeRoots.reserve(n);
99
100 bool firstRound = true;
101 while (uf.countComponents() > 1) {
102 for (std::size_t i = 0; i < n; ++i) {
103 componentOf[i] = static_cast<std::int32_t>(uf.find(static_cast<std::uint32_t>(i)));
104 }
105
106 // First-round fast path: every point is its own component; the kNN seed inside
107 // @c nearestOutComponent picks the right answer without a tree walk.
108 detail::nearestOutComponent<T>(tree, std::span<const std::int32_t>(componentOf),
109 out.coreDistances, knnIdx, knnSqDist, pool, bestPerComponent,
110 /*allSingletonComponents=*/firstRound);
111 firstRound = false;
112
113 // Collect unique root candidates that produced a finite best. A component without a
114 // finite best on a >1-component graph would be a correctness failure -- the fan-out
115 // must have reached every component's points. Assert rather than silently skip so the
116 // bug surfaces at the failing round.
117 activeRoots.clear();
118 for (std::size_t i = 0; i < n; ++i) {
119 if (std::cmp_equal(componentOf[i], i)) {
120 activeRoots.push_back(static_cast<std::int32_t>(i));
121 }
122 }
123
124 // Apply in ascending-weight order. Tie-break by (u, v) keeps the output deterministic
125 // across equal-weight candidates that may arrive from different components.
126 std::sort(activeRoots.begin(), activeRoots.end(),
127 [&](std::int32_t a, std::int32_t b) noexcept {
128 const auto &ea = bestPerComponent[static_cast<std::size_t>(a)];
129 const auto &eb = bestPerComponent[static_cast<std::size_t>(b)];
130 if (ea.weight != eb.weight) {
131 return ea.weight < eb.weight;
132 }
133 if (ea.u != eb.u) {
134 return ea.u < eb.u;
135 }
136 return ea.v < eb.v;
137 });
138
139 bool unitedAny = false;
140 for (const std::int32_t c : activeRoots) {
141 const auto &edge = bestPerComponent[static_cast<std::size_t>(c)];
142 CLUSTERING_ALWAYS_ASSERT(edge.u >= 0 && edge.v >= 0);
143 if (uf.unite(static_cast<std::uint32_t>(edge.u), static_cast<std::uint32_t>(edge.v))) {
144 out.edges.push_back(MstEdge<T>{edge.u, edge.v, edge.weight});
145 unitedAny = true;
146 }
147 }
148
149 // Safety gate: a round that fails to unite any component would loop forever. This cannot
150 // happen on a connected graph when every component has a finite best, but assert rather
151 // than infinite-loop if the invariant ever breaks.
152 CLUSTERING_ALWAYS_ASSERT(unitedAny);
153 }
154
155 CLUSTERING_ALWAYS_ASSERT(out.edges.size() == n - 1);
156
157 // Canonical output ordering: ascending by weight. The single-linkage tree construction
158 // downstream consumes edges in this order, matching the Prim backend's implicit Prim-pop
159 // ordering (which is weight-monotone outside of tie groups). Sort explicitly here so the
160 // output contract is uniform across backends.
161 std::sort(out.edges.begin(), out.edges.end(),
162 [](const MstEdge<T> &a, const MstEdge<T> &b) noexcept {
163 if (a.weight != b.weight) {
164 return a.weight < b.weight;
165 }
166 if (a.u != b.u) {
167 return a.u < b.u;
168 }
169 return a.v < b.v;
170 });
171 }
172};
173
174} // namespace clustering::hdbscan
#define CLUSTERING_ALWAYS_ASSERT(cond)
Release-active assertion: evaluates cond in every build configuration.
Implements a KDTree data structure.
Definition kdtree.h:92
Represents a multidimensional array (NDArray) of a fixed number of dimensions N and element type T.
Definition ndarray.h:136
size_t dim(std::size_t index) const noexcept
Returns the size of a specific dimension of the NDArray.
Definition ndarray.h:461
const T * data() const noexcept
Provides read-only access to the internal data array.
Definition ndarray.h:503
Disjoint-set-union with iterative path compression and union-by-rank.
Definition dsu.h:22
Idx find(Idx x) noexcept
Root of the component containing x, with path compression applied.
Definition dsu.h:47
bool unite(Idx a, Idx b) noexcept
Merge the components containing a and b.
Definition dsu.h:73
std::size_t countComponents() const noexcept
Current number of distinct components.
Definition dsu.h:112
void run(const NDArray< T, 2 > &X, std::size_t minSamples, math::Pool pool, MstOutput< T > &out)
Build the MRD-weighted minimum spanning tree of X.
One edge of the minimum spanning tree of mutual-reachability distances.
Definition mst_output.h:22
std::int32_t u
First endpoint of the MST edge (0-based vertex index).
Definition mst_output.h:24
std::int32_t v
Second endpoint of the MST edge (0-based vertex index).
Definition mst_output.h:26
Frozen output contract of every MST backend.
Definition mst_output.h:41
NDArray< T, 1 > coreDistances
Per-point core distance (length N; self-excluded kNN distance at minSamples).
Definition mst_output.h:45
std::vector< MstEdge< T > > edges
The N - 1 MST edges, in insertion order.
Definition mst_output.h:43
Thin injection wrapper around a BS::light_thread_pool.
Definition thread.h:63