Clustering
C++20 header-only: DBSCAN, HDBSCAN, k-means.
Loading...
Searching...
No Matches
prim_mst_backend.h
Go to the documentation of this file.
1#pragma once
2
3#include <algorithm>
4#include <array>
5#include <atomic>
6#include <cstddef>
7#include <cstdint>
8#include <future>
9#include <limits>
10#include <thread>
11#include <type_traits>
12#include <utility>
13#include <vector>
14
18#include "clustering/math/detail/avx2_helpers.h"
19#include "clustering/math/detail/sq_distances_block.h"
21#include "clustering/ndarray.h"
22
23namespace clustering::hdbscan {
24
/// Compute budget that gates the streaming Prim backend, expressed as the maximum
/// point count it will accept; run() asserts `n * n` stays within this window
/// before allocating anything.
inline constexpr std::size_t kPrimMaxN = std::size_t{16384};

/// Equivalent byte-budget phrasing of @c kPrimMaxN, kept so callers that gate on
/// `n * n * sizeof(T) <= kPrimMrdMatrixByteBudget` agree with run()'s own check.
inline constexpr std::size_t kPrimMrdMatrixByteBudget = kPrimMaxN * kPrimMaxN * sizeof(float);

/// Thresholds gating the dense symmetric core-distance pass: the dense scan only
/// amortises at sufficiently large @c n, wide enough @c d, and small enough
/// @c minSamples (see PrimMstBackend::shouldUseDenseCore).
inline constexpr std::size_t kPrimDenseCoreMinN = 1024;
inline constexpr std::size_t kPrimDenseCoreMinD = 17;
inline constexpr std::size_t kPrimDenseCoreMaxMinSamples = 64;
/// Gates for the persistent-worker relax loop: it needs at least this many pool
/// workers, and at least this much per-worker relax work per round, to pay for
/// its spin barrier (see PrimMstBackend::shouldUsePersistentParallelRelax).
inline constexpr std::size_t kPrimPersistentRelaxMinWorkers = 8;
inline constexpr std::size_t kPrimPersistentRelaxMinOpsPerWorker = std::size_t{1} << 14;
56
template <class T> class PrimMstBackend {
  // Float-only by contract: the dot/distance kernels this backend dispatches to
  // (avx2_helpers.h, sq_distances_block.h) are float specialisations.
  static_assert(std::is_same_v<T, float>,
                "PrimMstBackend<T> supports only float; a double specialization is out of scope.");

public:
  PrimMstBackend() = default;

  /// Build the MRD-weighted minimum spanning tree of @c X.
  ///
  /// Phase 1 fills `out.coreDistances` with each point's squared distance to its
  /// @c minSamples -th nearest neighbour (self-excluded), via either a dense
  /// symmetric scan or a KDTree kNN query depending on shape. Phase 2 runs
  /// streaming Prim over the implicit complete graph whose edge weight between
  /// @c a and @c b is the mutual-reachability distance
  /// `max(core[a], core[b], sqDist(a, b))`.
  ///
  /// NOTE(review): distances stay *squared* Euclidean end to end (core distances
  /// and edge weights alike) — confirm downstream consumers expect squared MRD.
  ///
  /// @param X          Row-major `(n, d)` point matrix.
  /// @param minSamples Core-distance neighbour count; requires `1 <= minSamples < n`.
  /// @param pool       Pool wrapper; a null underlying pool forces the serial paths.
  /// @param out        Receives the `n - 1` edges (in insertion order) and the
  ///                   per-point core distances.
  void run(const NDArray<T, 2> &X, std::size_t minSamples, math::Pool pool, MstOutput<T> &out) {
    const std::size_t n = X.dim(0);
    const std::size_t d = X.dim(1);
    CLUSTERING_ALWAYS_ASSERT(minSamples >= 1);
    CLUSTERING_ALWAYS_ASSERT(minSamples < n);

    // Refuse @c n that would push the `O(n^2 * d)` inner work past the dispatcher's intended
    // Prim window. Phrased as `n <= kNsqBudget / n` rather than @c n*n <= kNsqBudget to avoid
    // the intermediate overflowing @c std::size_t at large @c n. Fires before any allocation so
    // out-of-budget callers surface deterministically.
    constexpr std::size_t kNsqBudget = kPrimMrdMatrixByteBudget / sizeof(T);
    CLUSTERING_ALWAYS_ASSERT(n <= kNsqBudget / n);

    out.edges.clear();
    out.edges.reserve(n - 1);
    out.coreDistances = NDArray<T, 1>(std::array<std::size_t, 1>{n});
    T *coreDistData = out.coreDistances.data();
    const T *xData = X.data();
    const bool useDenseCore = shouldUseDenseCore(n, d, minSamples);
    // Row @c i starts at @c xData + i*d, so every row is 32-byte aligned iff the base pointer
    // is aligned and the row stride @c d*sizeof(T) is a multiple of 32. Either condition can
    // fail independently: NumPy buffers only guarantee element alignment, and @c d is caller-
    // driven. When both hold we can use the strict-aligned dot kernel; otherwise the generic
    // kernel's per-operand alignment check is required to stay correct.
    const bool rowsAligned32 =
        X.template isAligned<32>() && (d % (std::size_t{32} / sizeof(T)) == 0);

    // Phase 1: core distances. The dense path caches per-row squared norms so each
    // pairwise squared distance later reduces to one dot product via
    // @c sqEuclideanFromDot; @c rowNorms stays empty on the KDTree path.
    std::vector<T> rowNorms;
    if (useDenseCore) {
      rowNorms.resize(n);
      for (std::size_t i = 0; i < n; ++i) {
        const T *row = xData + (i * d);
        rowNorms[i] = rowsAligned32 ? math::detail::dotRowAligned32Ptr(row, row, d)
                                    : math::detail::dotRowPtr(row, row, d);
      }
      computeDenseCoreDistances(X, rowNorms, minSamples, rowsAligned32, pool, coreDistData);
    } else {
      // Shapes that fail @c shouldUseDenseCore take the KDTree kNN path: the dense symmetric
      // scan does not amortise at small @c n, low @c d, or large @c minSamples (where the per-
      // update top-@c k rescan dominates).
      const KDTree<T> tree(X);
      const auto kSigned = static_cast<std::int32_t>(minSamples);
      auto [knnIdx, knnSqDist] = tree.knnQuery(kSigned, pool);
      (void)knnIdx; // only the distances matter here; neighbour ids are discarded
      for (std::size_t i = 0; i < n; ++i) {
        // knnQuery is self-excluded, so column minSamples-1 is the minSamples-th neighbour.
        coreDistData[i] = knnSqDist(i, minSamples - 1);
      }
    }

    // Phase 2: streaming Prim. Maintain `edgeWeight[v]` = best-known incident MRD weight to
    // the growing tree, `parent[v]` = the in-tree vertex realising that weight, and a visited
    // bitmap. Each iteration picks the smallest-weight unvisited @c target via a linear scan,
    // emits the edge `(parent[target], target, edgeWeight[target])`, then relaxes every other
    // unvisited @c v by recomputing `sqDist(target, v)` and lifting to MRD.
    std::vector<std::uint8_t> visited(n, std::uint8_t{0});
    std::vector<std::int32_t> parent(n, std::int32_t{0});
    std::vector<T> edgeWeight(n, std::numeric_limits<T>::max());

    // Squared Euclidean distance from point @c tIdx (row pointer @c rowT) to point @c v.
    // The dense-core variant reuses the cached row norms; the fallback recomputes row-vs-row.
    auto sqDistance = [&](const T *rowT, std::size_t tIdx, std::size_t v) noexcept {
      if (useDenseCore) {
        const T *rowV = xData + (v * d);
        const T dot = rowsAligned32 ? math::detail::dotRowAligned32Ptr(rowT, rowV, d)
                                    : math::detail::dotRowPtr(rowT, rowV, d);
        return math::detail::sqEuclideanFromDot(rowNorms[tIdx], rowNorms[v], dot);
      }
      return math::detail::sqEuclideanRowPtr(rowT, xData + (v * d), d);
    };

    // Relax every unvisited vertex in [lo, hi) against @c target: lift the squared distance
    // to MRD with the two core distances and keep the improvement, if any.
    auto relaxRange = [&](std::size_t lo, std::size_t hi, std::int32_t target, std::size_t tIdx,
                          T coreT, const T *rowT) noexcept {
      for (std::size_t v = lo; v < hi; ++v) {
        if (visited[v] != 0U) {
          continue;
        }
        const T sq = sqDistance(rowT, tIdx, v);
        // MRD lift: w = max(sq, core[target], core[v]), spelled branch-by-branch.
        T w = sq;
        if (coreT > w) {
          w = coreT;
        }
        const T coreV = coreDistData[v];
        if (coreV > w) {
          w = coreV;
        }
        if (w < edgeWeight[v]) {
          parent[v] = target;
          edgeWeight[v] = w;
        }
      }
    };

    // Fused relax + argmin over [lo, hi): same relaxation as @c relaxRange, but also returns
    // the smallest post-relax @c edgeWeight and its vertex so the serial path avoids a second
    // full scan per iteration.
    auto relaxRangeAndFindNext = [&](std::size_t lo, std::size_t hi, std::int32_t target,
                                     std::size_t tIdx, T coreT,
                                     const T *rowT) noexcept -> std::pair<std::int32_t, T> {
      std::int32_t bestV = -1;
      T bestW = std::numeric_limits<T>::max();
      for (std::size_t v = lo; v < hi; ++v) {
        if (visited[v] != 0U) {
          continue;
        }
        const T sq = sqDistance(rowT, tIdx, v);
        T w = sq;
        if (coreT > w) {
          w = coreT;
        }
        const T coreV = coreDistData[v];
        if (coreV > w) {
          w = coreV;
        }
        if (w < edgeWeight[v]) {
          parent[v] = target;
          edgeWeight[v] = w;
        }
        if (edgeWeight[v] < bestW) {
          bestW = edgeWeight[v];
          bestV = static_cast<std::int32_t>(v);
        }
      }
      return {bestV, bestW};
    };

    // Plain argmin over all unvisited vertices; used after a parallel relaxRange dispatch,
    // where per-block minima were not collected.
    auto findNext = [&]() noexcept -> std::pair<std::int32_t, T> {
      std::int32_t bestV = -1;
      T bestW = std::numeric_limits<T>::max();
      for (std::size_t v = 0; v < n; ++v) {
        if (visited[v] != 0U) {
          continue;
        }
        if (edgeWeight[v] < bestW) {
          bestW = edgeWeight[v];
          bestV = static_cast<std::int32_t>(v);
        }
      }
      return {bestV, bestW};
    };

    // Persistent-worker variant of the whole Prim loop: spin up one long-lived task per pool
    // worker and drive rounds through a phase counter, instead of paying a submit_blocks
    // dispatch per iteration. Returns true when it ran the loop to completion (run() is then
    // done); false when the shape/pool does not qualify and the per-iteration path below runs.
    auto persistentRelaxFrom = [&]() -> bool {
      if (!shouldUsePersistentParallelRelax(n, d, useDenseCore, pool)) {
        return false;
      }

      // Pack the per-worker barrier flag into the same 64 B cache line as its local-best
      // reduction slot. Each line is written only by its owning participant (worker or main),
      // and read by main only when @c done matches the current phase counter, so the release
      // store on @c done synchronises the non-atomic @c vertex and @c weight writes that
      // preceded it. Per-line ownership eliminates the cross-core RMW contention a shared
      // @c completed counter would incur on each phase close.
      struct alignas(64) LocalBest {
        std::atomic<std::uint32_t> done{0};
        std::int32_t vertex = -1;
        T weight = std::numeric_limits<T>::max();
      };

      // Main thread is participant 0; the remaining pool workers are 1..workerTasks.
      const std::size_t workerTasks = pool.workerCount() - 1;
      const std::size_t participantCount = workerTasks + 1;
      std::vector<LocalBest> localBest(participantCount);

      // Contiguous block partition of [0, n) across participants.
      auto blockBegin = [&](std::size_t id) noexcept { return (n * id) / participantCount; };
      auto blockEnd = [&](std::size_t id) noexcept { return (n * (id + 1)) / participantCount; };
      // Relax participant @c id's block against @c target and return the block-local argmin;
      // mirrors @c relaxRangeAndFindNext but derives coreT/rowT itself.
      auto relaxBlock = [&](std::size_t id,
                            std::int32_t target) noexcept -> std::pair<std::int32_t, T> {
        const auto tIdx = static_cast<std::size_t>(target);
        const T coreT = coreDistData[tIdx];
        const T *const rowT = xData + (tIdx * d);
        std::int32_t bestV = -1;
        T bestW = std::numeric_limits<T>::max();
        for (std::size_t v = blockBegin(id); v < blockEnd(id); ++v) {
          if (visited[v] != 0U) {
            continue;
          }
          const T sq = sqDistance(rowT, tIdx, v);
          T w = sq;
          if (coreT > w) {
            w = coreT;
          }
          const T coreV = coreDistData[v];
          if (coreV > w) {
            w = coreV;
          }
          if (w < edgeWeight[v]) {
            parent[v] = target;
            edgeWeight[v] = w;
          }
          if (edgeWeight[v] < bestW) {
            bestW = edgeWeight[v];
            bestV = static_cast<std::int32_t>(v);
          }
        }
        return {bestV, bestW};
      };

      std::atomic<std::uint32_t> phase{0}; // bumped once per round; doubles as the epoch id
      std::atomic<std::uint32_t> ready{0}; // count of workers that captured the initial phase
      std::atomic<bool> stop{false};       // set before the final phase bump to retire workers
      std::int32_t currentTarget = 0;      // published to workers by the release bump of @c phase

      // Worker body: spin until @c phase advances past the last value seen, then relax the
      // owned block against @c currentTarget, publish the block-local best, and release-store
      // the new phase id into the owned @c done flag. The acquire load of @c phase makes the
      // main thread's preceding @c currentTarget / @c visited writes visible.
      auto workerLoop = [&](std::size_t id) {
        std::uint32_t seen = phase.load(std::memory_order_acquire);
        ready.fetch_add(1, std::memory_order_release);
        for (;;) {
          std::uint32_t next = phase.load(std::memory_order_acquire);
          while (next == seen) {
            spinPause();
            next = phase.load(std::memory_order_acquire);
          }
          seen = next;
          if (stop.load(std::memory_order_acquire)) {
            return;
          }
          auto [bv, bw] = relaxBlock(id, currentTarget);
          localBest[id].vertex = bv;
          localBest[id].weight = bw;
          localBest[id].done.store(seen, std::memory_order_release);
        }
      };

      std::vector<std::future<void>> futures;
      futures.reserve(workerTasks);
      for (std::size_t id = 1; id < participantCount; ++id) {
        futures.emplace_back(pool.pool->submit_task([&, id] { workerLoop(id); }));
      }
      // Do not bump @c phase until every worker has captured the initial value, or a late
      // starter could observe the already-bumped counter as its baseline and skip round one.
      while (ready.load(std::memory_order_acquire) != workerTasks) {
        spinPause();
      }

      // Min-reduce the per-participant bests gathered during the just-closed phase.
      auto reduceBest = [&]() noexcept -> std::pair<std::int32_t, T> {
        std::int32_t bestV = -1;
        T bestW = std::numeric_limits<T>::max();
        for (const LocalBest &candidate : localBest) {
          if (candidate.vertex >= 0 && candidate.weight < bestW) {
            bestW = candidate.weight;
            bestV = candidate.vertex;
          }
        }
        return {bestV, bestW};
      };

      // One Prim round: publish @c target, open the phase, relax main's own block, wait for
      // every worker to close the phase, then reduce.
      auto relaxRound = [&](std::int32_t target) noexcept -> std::pair<std::int32_t, T> {
        currentTarget = target;
        const std::uint32_t newPhase =
            phase.fetch_add(1, std::memory_order_acq_rel) + std::uint32_t{1};
        auto [bv, bw] = relaxBlock(0, target);
        localBest[0].vertex = bv;
        localBest[0].weight = bw;
        // Per-worker flag wait: one cache line per worker, written exactly once per phase by
        // its owner and read exactly once per phase by main. Replaces a shared atomic counter
        // whose cross-core RMW serialised the barrier close.
        for (std::size_t id = 1; id < participantCount; ++id) {
          while (localBest[id].done.load(std::memory_order_acquire) != newPhase) {
            spinPause();
          }
        }
        return reduceBest();
      };

      // Seed: vertex 0 enters the tree at weight 0; the first round populates @c edgeWeight.
      visited[0] = 1U;
      edgeWeight[0] = T{0};
      auto [nextV, nextW] = relaxRound(static_cast<std::int32_t>(0));

      while (out.edges.size() + 1 < n) {
        // Complete graph with finite MRD weights: the argmin must find a vertex each round.
        CLUSTERING_ALWAYS_ASSERT(nextV >= 0);

        const auto bIdx = static_cast<std::size_t>(nextV);
        visited[bIdx] = 1U;
        out.edges.push_back(MstEdge<T>{parent[bIdx], nextV, nextW});

        if (out.edges.size() + 1 == n) {
          break;
        }
        auto next = relaxRound(nextV);
        nextV = next.first;
        nextW = next.second;
      }

      // Retire the workers: @c stop is made visible by the release bump of @c phase, which is
      // the same wake-up edge the workers already spin on.
      stop.store(true, std::memory_order_release);
      phase.fetch_add(1, std::memory_order_release);
      for (auto &future : futures) {
        future.get();
      }
      return true;
    };

    if (persistentRelaxFrom()) {
      return;
    }

    // Fallback per-iteration relax: one parallel (or serial) relaxation sweep from @c target,
    // returning the next vertex to add and its weight.
    auto relaxFrom = [&](std::int32_t target) noexcept -> std::pair<std::int32_t, T> {
      const auto tIdx = static_cast<std::size_t>(target);
      const T coreT = coreDistData[tIdx];
      const T *rowT = xData + (tIdx * d);
      // Per-iter parallel dispatch: the gate uses the per-worker op budget `(n*d / nWorkers)`
      // so very small @c n stays serial and avoids submit_blocks overhead.
      if (pool.pool != nullptr && pool.shouldParallelizeWork(n * d)) {
        pool.pool
            ->submit_blocks(std::size_t{0}, n,
                            [&](std::size_t lo, std::size_t hi) {
                              relaxRange(lo, hi, target, tIdx, coreT, rowT);
                            })
            .wait();
        return findNext();
      }
      return relaxRangeAndFindNext(0, n, target, tIdx, coreT, rowT);
    };

    // Seed: vertex 0 is in the tree with weight 0. The first relax populates @c edgeWeight for
    // every other vertex so the first argmin scan has finite values.
    visited[0] = 1U;
    edgeWeight[0] = T{0};
    auto [nextV, nextW] = relaxFrom(static_cast<std::int32_t>(0));

    while (out.edges.size() + 1 < n) {
      // The graph is complete (every pair has a finite MRD), so on a connected workload the
      // argmin always finds a finite entry. Asserting here flags any contract violation that
      // would otherwise leave the spanning tree short of @c n - 1 edges.
      CLUSTERING_ALWAYS_ASSERT(nextV >= 0);

      const auto bIdx = static_cast<std::size_t>(nextV);
      visited[bIdx] = 1U;
      out.edges.push_back(MstEdge<T>{parent[bIdx], nextV, nextW});

      if (out.edges.size() + 1 == n) {
        break;
      }
      auto next = relaxFrom(nextV);
      nextV = next.first;
      nextW = next.second;
    }
  }

private:
  /// Shape gate for the dense symmetric core-distance pass: it only amortises with
  /// enough points, wide enough rows, and a small enough per-row top-k state.
  [[nodiscard]] static constexpr bool shouldUseDenseCore(std::size_t n, std::size_t d,
                                                         std::size_t minSamples) noexcept {
    return n >= kPrimDenseCoreMinN && d >= kPrimDenseCoreMinD &&
           minSamples <= kPrimDenseCoreMaxMinSamples;
  }

  /// Gate for the persistent-worker relax loop: dense-core shapes only, a live pool
  /// with at least @c kPrimPersistentRelaxMinWorkers workers, and enough per-round
  /// work per worker (`n*d` ops against the relaxed budget) to pay for the barrier.
  [[nodiscard]] static bool shouldUsePersistentParallelRelax(std::size_t n, std::size_t d,
                                                             bool useDenseCore,
                                                             math::Pool pool) noexcept {
    return useDenseCore && pool.pool != nullptr &&
           pool.workerCount() >= kPrimPersistentRelaxMinWorkers &&
           pool.shouldParallelizeWork(n * d, kPrimPersistentRelaxMinOpsPerWorker);
  }

  /// Busy-wait hint used by the spin barriers: @c _mm_pause on AVX2 builds,
  /// otherwise a scheduler yield.
  static void spinPause() noexcept {
#ifdef CLUSTERING_USE_AVX2
    _mm_pause();
#else
    std::this_thread::yield();
#endif
  }

  /// Fold candidate squared distance @c sq into row @c row's running set of its
  /// @c minSamples smallest values. @c worstSlot caches the slot index of the row's
  /// current maximum so the common reject case is a single comparison (the negated
  /// form also rejects NaN candidates); on accept, the worst slot is overwritten
  /// and the maximum rescanned in O(minSamples).
  static void updateTopK(T *topK, std::vector<std::size_t> &worstSlot, std::size_t minSamples,
                         std::size_t row, T sq) noexcept {
    T *const rowTopK = topK + (row * minSamples);
    std::size_t worst = worstSlot[row];
    if (!(sq < rowTopK[worst])) {
      return;
    }
    rowTopK[worst] = sq;
    worst = 0;
    T worstValue = rowTopK[0];
    for (std::size_t s = 1; s < minSamples; ++s) {
      if (rowTopK[s] > worstValue) {
        worstValue = rowTopK[s];
        worst = s;
      }
    }
    worstSlot[row] = worst;
  }

  /// Dense-core phase 1: write each point's @c minSamples -th smallest squared
  /// distance to any other point into @c coreDistData. The serial path walks the
  /// symmetric upper triangle once, updating both endpoint rows per pair; the
  /// pooled path switches to row-independent full scans (rationale below).
  static void computeDenseCoreDistances(const NDArray<T, 2> &X, const std::vector<T> &rowNorms,
                                        std::size_t minSamples, bool rowsAligned32, math::Pool pool,
                                        T *coreDistData) {
    const std::size_t n = X.dim(0);
    const std::size_t d = X.dim(1);
    const T *const xData = X.data();
    // Per-row top-k buffers start at +inf; worstSlot[i] indexes row i's current maximum.
    std::vector<T> topK(n * minSamples, std::numeric_limits<T>::max());
    std::vector<std::size_t> worstSlot(n, 0);

    // With enough workers, row-independent scans win despite computing each pair twice: every row
    // owns its top-k state, so the pool path has no cross-row writes and can reuse the batched
    // four-neighbour distance kernel that amortises AVX2 horizontal sums.
    if (pool.pool != nullptr && pool.workerCount() >= 4 && pool.shouldParallelizeWork(n * n * d)) {
      pool.pool
          ->submit_blocks(std::size_t{0}, n,
                          [&](std::size_t lo, std::size_t hi) {
                            computeDenseCoreDistancesRows(X, minSamples, lo, hi, topK.data(),
                                                          worstSlot);
                          })
          .wait();
      // Core distance = the k-th smallest, i.e. the maximum held in each row's top-k.
      for (std::size_t i = 0; i < n; ++i) {
        coreDistData[i] = topK[(i * minSamples) + worstSlot[i]];
      }
      return;
    }

    // Serial symmetric scan: each (i, j) pair is computed once and folded into both rows.
    for (std::size_t i = 0; i < n; ++i) {
      const T *const rowI = xData + (i * d);
      const T normI = rowNorms[i];
      for (std::size_t j = i + 1; j < n; ++j) {
        const T *const rowJ = xData + (j * d);
        const T dot = rowsAligned32 ? math::detail::dotRowAligned32Ptr(rowI, rowJ, d)
                                    : math::detail::dotRowPtr(rowI, rowJ, d);
        const T sq = math::detail::sqEuclideanFromDot(normI, rowNorms[j], dot);
        updateTopK(topK.data(), worstSlot, minSamples, i, sq);
        updateTopK(topK.data(), worstSlot, minSamples, j, sq);
      }
    }

    for (std::size_t i = 0; i < n; ++i) {
      coreDistData[i] = topK[(i * minSamples) + worstSlot[i]];
    }
  }

  /// Row-block worker for the pooled dense-core path: for each owned row @c i in
  /// [lo, hi), stream all @c n points through the batched AoS distance kernel in
  /// @c kBlockRows -sized chunks and fold every non-self distance into row @c i's
  /// top-k. Rows are owned exclusively per block, so the @c topK / @c worstSlot
  /// writes never race across workers.
  static void computeDenseCoreDistancesRows(const NDArray<T, 2> &X, std::size_t minSamples,
                                            std::size_t lo, std::size_t hi, T *topK,
                                            std::vector<std::size_t> &worstSlot) noexcept {
    constexpr std::size_t kBlockRows = 64;
    const std::size_t n = X.dim(0);
    const std::size_t d = X.dim(1);
    const T *const xData = X.data();
    std::array<T, kBlockRows> distances{};

    for (std::size_t i = lo; i < hi; ++i) {
      const T *const rowI = xData + (i * d);
      for (std::size_t base = 0; base < n; base += kBlockRows) {
        const std::size_t count = std::min(kBlockRows, n - base);
        math::detail::sqDistancesAosBlock(rowI, xData + (base * d), count, d, distances.data());
        for (std::size_t offset = 0; offset < count; ++offset) {
          const std::size_t j = base + offset;
          // Skip the zero self-distance: core distances are self-excluded.
          if (j != i) {
            updateTopK(topK, worstSlot, minSamples, i, distances[offset]);
          }
        }
      }
    }
  }
};
554
555} // namespace clustering::hdbscan
#define CLUSTERING_ALWAYS_ASSERT(cond)
Release-active assertion: evaluates cond in every build configuration.
Implements a KDTree data structure.
Definition kdtree.h:92
std::pair< NDArray< std::int32_t, 2 >, NDArray< T, 2 > > knnQuery(std::int32_t k, math::Pool pool) const
Returns the k nearest neighbours of every indexed point, self-excluded.
Definition kdtree.h:241
Represents a multidimensional array (NDArray) of a fixed number of dimensions N and element type T.
Definition ndarray.h:136
size_t dim(std::size_t index) const noexcept
Returns the size of a specific dimension of the NDArray.
Definition ndarray.h:461
const T * data() const noexcept
Provides read-only access to the internal data array.
Definition ndarray.h:503
void run(const NDArray< T, 2 > &X, std::size_t minSamples, math::Pool pool, MstOutput< T > &out)
Build the MRD-weighted minimum spanning tree of X.
constexpr std::size_t kPrimDenseCoreMinD
constexpr std::size_t kPrimMaxN
Compute budget that gates the streaming Prim backend, expressed as the maximum point count it will accept.
constexpr std::size_t kPrimDenseCoreMinN
Thresholds gating the dense symmetric core-distance pass.
constexpr std::size_t kPrimPersistentRelaxMinWorkers
constexpr std::size_t kPrimMrdMatrixByteBudget
Equivalent byte-budget phrasing of kPrimMaxN, kept so callers that gate on n*n*sizeof(T) <= kPrimMrdMatrixByteBudget agree with run()'s own check.
constexpr std::size_t kPrimPersistentRelaxMinOpsPerWorker
constexpr std::size_t kPrimDenseCoreMaxMinSamples
One edge of the minimum spanning tree of mutual-reachability distances.
Definition mst_output.h:22
Frozen output contract of every MST backend.
Definition mst_output.h:41
NDArray< T, 1 > coreDistances
Per-point core distance (length N; self-excluded kNN distance at minSamples).
Definition mst_output.h:45
std::vector< MstEdge< T > > edges
The N - 1 MST edges, in insertion order.
Definition mst_output.h:43
Thin injection wrapper around a BS::light_thread_pool.
Definition thread.h:63
std::size_t workerCount() const noexcept
Number of worker threads available, or 1 in serial mode.
Definition thread.h:72
BS::light_thread_pool * pool
Underlying pool, or nullptr to force serial execution.
Definition thread.h:65
bool shouldParallelizeWork(std::size_t totalOps, std::size_t minOpsPerWorker=std::size_t{1}<< 15) const noexcept
Decide whether totalOps warrants parallel dispatch, based on work volume.
Definition thread.h:118