Clustering
C++20 header-only: DBSCAN, HDBSCAN, k-means.
Loading...
Searching...
No Matches
dbscan.h
Go to the documentation of this file.
1#pragma once
2
3#include <BS_thread_pool.hpp>
4#include <algorithm>
5#include <cstddef>
6#include <cstdint>
7#include <optional>
8#include <vector>
9
14#include "clustering/ndarray.h"
15
16namespace clustering {
17
36template <class T, class QueryModel = index::AutoRangeIndex<T>>
38class DBSCAN {
39public:
40 static constexpr std::int32_t UNCLASSIFIED = -2;
41 static constexpr std::int32_t NOISY = -1;
42
52 explicit DBSCAN(T eps, std::size_t minPts, std::size_t nJobs = 0)
53 : m_eps(eps), m_minPts(minPts), m_nJobs(math::clampedJobCount(nJobs)), m_labels({0}) {
54 CLUSTERING_ALWAYS_ASSERT(minPts >= 1);
55 // Defer pool construction to @ref run: at small shapes every hot phase gates serial, and
56 // spawning nJobs workers in the ctor burns tens of microseconds of thread-create futex
57 // traffic that the fit never amortizes. @ref run emplaces on first need and reuses.
58 }
59
60 DBSCAN(const DBSCAN &) = delete;
61 DBSCAN &operator=(const DBSCAN &) = delete;
62 DBSCAN(DBSCAN &&) = delete;
63 DBSCAN &operator=(DBSCAN &&) = delete;
64 ~DBSCAN() = default;
65
78 void run(const NDArray<T, 2> &X) {
79 const std::size_t n = X.dim(0);
80 ensureLabelsShape(n);
81 m_clusterId = 0;
82
83 if (n == 0) {
84 return;
85 }
86
87 // One adjacency query per point is the unit of work the range-index backends fan out on;
88 // shouldSpawnPool with minOpsPerWorker=16 matches the KDTree adjacency sweep's own
89 // `shouldParallelize(n, 4, 2)` gate (@c n / 4 >= 2 * workerCount => `n >= 8` * workerCount)
90 // so the pool spawn and the backend fan-out fire at the same shape. @c n * d would
91 // under-estimate DBSCAN work -- the backend does per-query tree walks that are much heavier
92 // than @c d ops -- and caused @c n_jobs=16 at low @c d to fall back to serial here while
93 // every worker-side kernel would have cleared its own gate.
94 if (!m_pool.has_value() && math::shouldSpawnPool(n, m_nJobs, /*minOpsPerWorker=*/16)) {
95 m_pool.emplace(m_nJobs);
96 }
97 const math::Pool pool{m_pool.has_value() ? &*m_pool : nullptr};
98
99 QueryModel queryModel(X);
100 const auto adj = queryModel.query(m_eps, pool);
101
102 std::vector<std::uint8_t> isCore(n, 0);
103 for (std::size_t i = 0; i < n; ++i) {
104 isCore[i] = (adj[i].size() >= m_minPts) ? 1U : 0U;
105 }
106
107 std::int32_t *labelPtr = m_labels.data();
108 std::fill(labelPtr, labelPtr + n, UNCLASSIFIED);
109
110 for (std::size_t i = 0; i < n; ++i) {
111 if (labelPtr[i] == UNCLASSIFIED && isCore[i] != 0) {
112 expandCluster(i, isCore, adj, labelPtr);
113 ++m_clusterId;
114 }
115 }
116
117 std::replace(labelPtr, labelPtr + n, UNCLASSIFIED, NOISY);
118 }
119
121 [[nodiscard]] const NDArray<std::int32_t, 1> &labels() const noexcept { return m_labels; }
122
124 [[nodiscard]] std::size_t nClusters() const noexcept { return m_clusterId; }
125
127 void reset() {
128 m_labels = NDArray<std::int32_t, 1>({0});
129 m_clusterId = 0;
130 }
131
132private:
133 void ensureLabelsShape(std::size_t n) {
134 if (m_labels.dim(0) != n) {
135 m_labels = NDArray<std::int32_t, 1>({n});
136 }
137 }
138
150 void expandCluster(std::size_t seed, const std::vector<std::uint8_t> &isCore,
151 const std::vector<std::vector<std::int32_t>> &adj, std::int32_t *labels) {
152 const auto clusterLabel = static_cast<std::int32_t>(m_clusterId);
153 labels[seed] = clusterLabel;
154 std::vector<std::size_t> frontier;
155 frontier.reserve(adj[seed].size());
156 for (const std::int32_t n : adj[seed]) {
157 frontier.push_back(static_cast<std::size_t>(n));
158 }
159
160 while (!frontier.empty()) {
161 std::vector<std::size_t> next;
162 next.reserve(frontier.size());
163 for (const std::size_t point : frontier) {
164 if (labels[point] != UNCLASSIFIED) {
165 continue;
166 }
167 labels[point] = clusterLabel;
168 if (isCore[point] == 0) {
169 continue;
170 }
171 for (const std::int32_t n : adj[point]) {
172 if (labels[static_cast<std::size_t>(n)] == UNCLASSIFIED) {
173 next.push_back(static_cast<std::size_t>(n));
174 }
175 }
176 }
177 frontier.swap(next);
178 }
179 }
180
181 T m_eps;
182 std::size_t m_minPts;
183 std::size_t m_nJobs;
184 std::size_t m_clusterId = 0;
188 std::optional<BS::light_thread_pool> m_pool;
192 NDArray<std::int32_t, 1> m_labels;
193};
194
195} // namespace clustering
#define CLUSTERING_ALWAYS_ASSERT(cond)
Release-active assertion: evaluates cond in every build configuration.
void run(const NDArray< T, 2 > &X)
Fit to X.
Definition dbscan.h:78
std::size_t nClusters() const noexcept
Total number of clusters discovered by the most recent run.
Definition dbscan.h:124
DBSCAN(T eps, std::size_t minPts, std::size_t nJobs=0)
Construct a reusable DBSCAN fitter.
Definition dbscan.h:52
DBSCAN(const DBSCAN &)=delete
const NDArray< std::int32_t, 1 > & labels() const noexcept
Per-point cluster labels after run; NOISY marks outliers.
Definition dbscan.h:121
DBSCAN & operator=(const DBSCAN &)=delete
static constexpr std::int32_t NOISY
Label assigned to points that no cluster claimed.
Definition dbscan.h:41
DBSCAN(DBSCAN &&)=delete
DBSCAN & operator=(DBSCAN &&)=delete
void reset()
Release every scratch buffer. The next run call reallocates against its shape.
Definition dbscan.h:127
static constexpr std::int32_t UNCLASSIFIED
Sentinel for a point not yet visited.
Definition dbscan.h:40
Represents a multidimensional array (NDArray) of a fixed number of dimensions N and element type T.
Definition ndarray.h:136
size_t dim(std::size_t index) const noexcept
Returns the size of a specific dimension of the NDArray.
Definition ndarray.h:461
Contract for spatial indexes that can surface the radius-neighborhood adjacency over a borrowed point...
Definition range_query.h:24
bool shouldSpawnPool(std::size_t totalOps, std::size_t nJobs, std::size_t minOpsPerWorker=std::size_t{1}<< 15) noexcept
Decide whether spawning a pool with nJobs workers is worth it for totalOps of arithmetic work.
Definition thread.h:43
Thin injection wrapper around a BS::light_thread_pool.
Definition thread.h:63