11#include "clustering/math/detail/pairwise_threshold_outer.h"
16#ifdef CLUSTERING_USE_AVX2
// Pairwise dispatch computes workloads like n * m * d in std::size_t
// (see the `work` estimate in pairwiseSqEuclidean); require a 64-bit
// size_t so those products cannot wrap on supported targets.
static_assert(
    sizeof(std::size_t) >= 8,
    "pairwise dispatch assumes a 64-bit std::size_t");
39#ifdef CLUSTERING_USE_AVX2
/// Horizontal sum of all 8 float lanes of an AVX2 register.
/// NOTE(review): the signature line and closing brace were missing from the
/// extracted view; reconstructed from the documented prototype
/// `float horizontalSumAvx2(__m256 v) noexcept`.
inline float horizontalSumAvx2(__m256 v) noexcept {
  // Swap the 128-bit halves and add, so each half holds the cross-half sums.
  const __m256 permute = _mm256_permute2f128_ps(v, v, 1);
  const __m256 s1 = _mm256_add_ps(v, permute);
  // Two horizontal adds collapse the remaining four partial sums into lane 0.
  const __m256 s2 = _mm256_hadd_ps(s1, s1);
  const __m256 s3 = _mm256_hadd_ps(s2, s2);
  return _mm_cvtss_f32(_mm256_castps256_ps128(s3));
}
/// Horizontal sum of all 4 double lanes of an AVX2 register.
/// NOTE(review): signature/closing brace reconstructed — they were missing
/// from the extracted view; mirrors the float overload above.
inline double horizontalSumAvx2(__m256d v) noexcept {
  // Swap the 128-bit halves and add: both halves now hold the cross-half sums.
  const __m256d permute = _mm256_permute2f128_pd(v, v, 1);
  const __m256d s1 = _mm256_add_pd(v, permute);
  // One horizontal add reduces the two remaining partials into lane 0.
  const __m256d s2 = _mm256_hadd_pd(s1, s1);
  return _mm_cvtsd_f64(_mm256_castpd256_pd128(s2));
}
57 __m256 acc = _mm256_setzero_ps();
58 const bool xAligned = (
reinterpret_cast<std::uintptr_t
>(xRow) % 32) == 0;
59 const bool yAligned = (
reinterpret_cast<std::uintptr_t
>(yRow) % 32) == 0;
61 for (; k + 8 <= d; k += 8) {
62 const __m256 vx = xAligned ? _mm256_load_ps(xRow + k) : _mm256_loadu_ps(xRow + k);
63 const __m256 vy = yAligned ? _mm256_load_ps(yRow + k) : _mm256_loadu_ps(yRow + k);
64 const __m256 diff = _mm256_sub_ps(vx, vy);
65 acc = _mm256_add_ps(acc, _mm256_mul_ps(diff, diff));
69 const float diff = xRow[k] - yRow[k];
75inline double sqEuclideanRowAvx2(
const double *xRow,
const double *yRow, std::size_t d)
noexcept {
76 __m256d acc = _mm256_setzero_pd();
77 const bool xAligned = (
reinterpret_cast<std::uintptr_t
>(xRow) % 32) == 0;
78 const bool yAligned = (
reinterpret_cast<std::uintptr_t
>(yRow) % 32) == 0;
80 for (; k + 4 <= d; k += 4) {
81 const __m256d vx = xAligned ? _mm256_load_pd(xRow + k) : _mm256_loadu_pd(xRow + k);
82 const __m256d vy = yAligned ? _mm256_load_pd(yRow + k) : _mm256_loadu_pd(yRow + k);
83 const __m256d diff = _mm256_sub_pd(vx, vy);
84 acc = _mm256_add_pd(acc, _mm256_mul_pd(diff, diff));
88 const double diff = xRow[k] - yRow[k];
/// Lane count of a 256-bit AVX2 vector for element type T: 8 floats or
/// 4 doubles (callers constrain T to those two types via static_assert).
template <class T>
constexpr std::size_t kAvx2Lanes = std::is_same_v<T, float> ? 8 : 4;
// sqEuclideanRow<T, LX, LY>: squared Euclidean distance between row i of X
// and row j of Y (per the documented prototype).
// NOTE(review): this span is a partial extraction — the name/signature line,
// the AVX2 dispatch branch, and the accumulate/return lines are missing.
98template <
class T, Layout LX, Layout LY>
100 std::size_t j)
noexcept {
// Shared inner dimension; assumes X.dim(1) == Y.dim(1) — TODO confirm upstream.
101 const std::size_t d = X.dim(1);
102#ifdef CLUSTERING_USE_AVX2
// Raw row pointers for the contiguous AVX2 fast path (rows are i*d apart).
105 const T *xRow = X.data() + (i * d);
106 const T *yRow = Y.data() + (j * d);
// Scalar fallback: accumulate (X(i,k) - Y(j,k))^2 over the row.
112 for (std::size_t k = 0; k < d; ++k) {
113 const T diff = X(i, k) - Y(j, k);
119#ifdef CLUSTERING_USE_AVX2
122 __m256 acc = _mm256_setzero_ps();
123 const bool aligned = (
reinterpret_cast<std::uintptr_t
>(xRow) % 32) == 0;
125 for (; k + 8 <= d; k += 8) {
126 const __m256 v = aligned ? _mm256_load_ps(xRow + k) : _mm256_loadu_ps(xRow + k);
127 acc = _mm256_add_ps(acc, _mm256_mul_ps(v, v));
131 tail += xRow[k] * xRow[k];
137 __m256d acc = _mm256_setzero_pd();
138 const bool aligned = (
reinterpret_cast<std::uintptr_t
>(xRow) % 32) == 0;
140 for (; k + 4 <= d; k += 4) {
141 const __m256d v = aligned ? _mm256_load_pd(xRow + k) : _mm256_loadu_pd(xRow + k);
142 acc = _mm256_add_pd(acc, _mm256_mul_pd(v, v));
146 tail += xRow[k] * xRow[k];
// sqNormRow<T, LX>: sum of squares of row i of X; mirrors sqEuclideanRow's
// AVX2-contiguous / scalar-fallback split.
// NOTE(review): partial extraction — the name/signature line, the AVX2
// dispatch branch, and the loop body/return are missing from this view.
153template <
class T, Layout LX>
155 const std::size_t d = X.dim(1);
156#ifdef CLUSTERING_USE_AVX2
// Contiguous raw-row pointer handed to the AVX2 kernel.
159 const T *xRow = X.data() + (i * d);
// Scalar fallback accumulating X(i, k)^2 over the row.
165 for (std::size_t k = 0; k < d; ++k) {
// rowNormsSq: norms(i) = sum_k X(i, k)^2, parallelized over row blocks.
// NOTE(review): partial extraction — the signature line, the per-row body
// (presumably a sqNormRow call — TODO confirm), and the serial fallback
// are missing from this view.
186template <
class T, Layout LX>
// Compile-time guard: only float/double element types are supported.
188 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
189 "rowNormsSq<T> requires T to be float or double");
194 const std::size_t n = X.
dim(0);
// Row-block worker run either serially or from the pool.
199 auto runRowRange = [&](std::size_t lo, std::size_t hi)
noexcept {
200 for (std::size_t i = lo; i < hi; ++i) {
// Dispatch row blocks [0, n) onto the thread pool.
207 ->submit_blocks(std::size_t{0}, n,
208 [&](std::size_t lo, std::size_t hi) { runRowRange(lo, hi); })
// pairwiseSqEuclideanGemm: large-path pairwise squared distances via the
// identity ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2; gemm supplies -2 X Y^T.
// NOTE(review): partial extraction — the signature line, the early-return
// body, and the xNorms/yNorms computation are missing from this view.
232template <
class T, Layout LX, Layout LY>
235 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
236 "pairwiseSqEuclideanGemm<T> requires T to be float or double");
243 const std::size_t n = X.
dim(0);
244 const std::size_t m = Y.
dim(0);
// Empty operands short-circuit (body of the branch not visible here).
245 if (n == 0 || m == 0) {
// out := -2 * X * Y^T (alpha = -2, beta = 0) using the transposed view of Y.
254 gemm(X, Y.
t(), out, pool, T{-2}, T{0});
// Broadcast pass: add ||x_i||^2 + ||y_j||^2 to each cell and clamp to 0,
// since the GEMM identity can go slightly negative in floating point.
256 auto runBroadcastRange = [&](std::size_t lo, std::size_t hi)
noexcept {
257 for (std::size_t i = lo; i < hi; ++i) {
258 const T xi = xNorms(i);
259 for (std::size_t j = 0; j < m; ++j) {
262 const T v = (out(i, j) + xi) + yNorms(j);
263 out(i, j) = std::max(v, T{0});
// Total cell count feeds the parallel/serial decision; rows go to the pool
// in blocks, with a serial fallback over all n rows.
268 const std::size_t totalCells = n * m;
271 ->submit_blocks(std::size_t{0}, n,
272 [&](std::size_t lo, std::size_t hi) { runBroadcastRange(lo, hi); })
275 runBroadcastRange(0, n);
// pairwiseSqEuclideanSimd: small-path kernel — one per-pair distance per
// (i, j) cell of out.
// NOTE(review): partial extraction — the signature line and the inner-loop
// body (presumably out(i, j) = sqEuclideanRow(...) — TODO confirm) are
// missing from this view.
295template <
class T, Layout LX, Layout LY>
298 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
299 "pairwiseSqEuclideanSimd<T> requires T to be float or double");
306 const std::size_t n = X.
dim(0);
307 const std::size_t m = Y.
dim(0);
// Nothing to do for empty operands.
308 if (n == 0 || m == 0) {
// Row-block worker: fills every column j for rows i in [lo, hi).
312 auto runRowRange = [&](std::size_t lo, std::size_t hi)
noexcept {
313 for (std::size_t i = lo; i < hi; ++i) {
314 for (std::size_t j = 0; j < m; ++j) {
// Dispatch row blocks [0, n) onto the thread pool.
322 ->submit_blocks(std::size_t{0}, n,
323 [&](std::size_t lo, std::size_t hi) { runRowRange(lo, hi); })
// pairwiseSqEuclidean: public entry point — dispatches between the per-pair
// SIMD kernel and the GEMM-identity path based on total workload.
// NOTE(review): partial extraction — the signature line and the dispatch
// branches are missing from this view.
349template <
class T, Layout LX = Layout::Contig, Layout LY = Layout::Contig>
352 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
353 "pairwiseSqEuclidean<T> requires T to be float or double");
360 const std::size_t n = X.
dim(0);
361 const std::size_t m = Y.
dim(0);
362 if (n == 0 || m == 0) {
// Scalar-op estimate n * m * d, presumably compared against
// pairwiseGemmThreshold — TODO confirm (comparison line not visible).
366 const std::size_t work = n * m * X.
dim(1);
// pairwiseSqEuclideanWithDispatchInfo: test-only twin of pairwiseSqEuclidean
// that runs the same dispatch and reports which kernel fired (PairwisePath).
// NOTE(review): partial extraction — the signature line, the dispatch
// branches, and the returns are missing from this view.
393template <
class T, Layout LX = Layout::Contig, Layout LY = Layout::Contig>
397 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
398 "pairwiseSqEuclideanWithDispatchInfo<T> requires T to be float or double");
405 const std::size_t n = X.
dim(0);
406 const std::size_t m = Y.
dim(0);
407 if (n == 0 || m == 0) {
// Same n * m * d workload heuristic as the public entry point.
411 const std::size_t work = n * m * X.
dim(1);
// canUseFusedThreshold: runtime predicate — true when the fused AVX2
// threshold kernel may be used for this (X, Y) pair.
// NOTE(review): partial extraction — the signature line and the return
// statements of each branch are missing from this view.
431template <
class T, Layout LX, Layout LY>
433#ifdef CLUSTERING_USE_AVX2
435 const std::size_t n = X.dim(0);
436 const std::size_t m = Y.dim(0);
437 const std::size_t d = X.dim(1);
// Degenerate shapes cannot use (and do not need) the fused kernel.
438 if (n == 0 || m == 0 || d == 0) {
// Requires at least one full 8-lane vector; d is also capped at
// kThresholdMaxD (bound defined elsewhere in the project).
441 if (d < 8 || d > kThresholdMaxD) {
// Both operands must be 32-byte aligned for the aligned-load kernel.
444 if (!X.template isAligned<32>() || !Y.template isAligned<32>()) {
// pairwiseSqEuclideanThresholdedMaterialized: fallback for the thresholded
// API — computes each pair's squared distance and emits (i, j) when it is
// within radiusSq.
// NOTE(review): partial extraction — the signature line, the distSq
// computation, and the emit call are missing from this view.
470template <
class T, Layout LX, Layout LY,
class Emit>
// Emit must be callable as emit(i, j).
471 requires std::invocable<Emit &, std::size_t, std::size_t>
475 const std::size_t n = X.
dim(0);
476 const std::size_t m = Y.
dim(0);
477 if (n == 0 || m == 0) {
// Row-block worker; not noexcept, presumably because Emit may throw —
// TODO confirm.
481 auto runRowRange = [&](std::size_t lo, std::size_t hi) {
482 for (std::size_t i = lo; i < hi; ++i) {
483 for (std::size_t j = 0; j < m; ++j) {
// Inclusive threshold: pairs exactly at radiusSq are emitted.
485 if (distSq <= radiusSq) {
// Dispatch row blocks [0, n) onto the thread pool.
497 ->submit_blocks(std::size_t{0}, n,
498 [&](std::size_t lo, std::size_t hi) { runRowRange(lo, hi); })
// pairwiseSqEuclideanThresholded: emit every row pair (i, j) whose squared
// Euclidean distance is at most radiusSq; uses the fused AVX2 float32
// kernel when available.
// NOTE(review): partial extraction — the signature's leading parameters,
// the eligibility branch, the norm computation, and the fallback call are
// missing from this view.
529template <
class T, Layout LX = Layout::Contig, Layout LY = Layout::Contig,
class Emit>
// Emit must be callable as emit(i, j).
530 requires std::invocable<Emit &, std::size_t, std::size_t>
532 T radiusSq,
Pool pool, Emit &&emit) {
533 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
534 "pairwiseSqEuclideanThresholded<T> requires T to be float or double");
537 const std::size_t n = X.
dim(0);
538 const std::size_t m = Y.
dim(0);
539 if (n == 0 || m == 0) {
543#ifdef CLUSTERING_USE_AVX2
// Fused float32 kernel folds the threshold test into the distance pass
// (detail:: helper from pairwise_threshold_outer.h, included at the top).
550 detail::pairwiseThresholdOuterAvx2F32(X, Y, xNorms, yNorms, radiusSq, pool, emit);
// pairwiseSqEuclideanThresholdedSymmetric: X == Y variant — scans only the
// upper triangle (j starts at i), roughly halving the work.
// NOTE(review): partial extraction — the signature line, the distSq
// computation, and the emit call are missing from this view.
581template <
class T, Layout LX = Layout::Contig,
class Emit>
// Emit must be callable as emit(i, j).
582 requires std::invocable<Emit &, std::size_t, std::size_t>
585 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
586 "pairwiseSqEuclideanThresholdedSymmetric<T> requires T to be float or double");
588 const std::size_t n = X.
dim(0);
593#ifdef CLUSTERING_USE_AVX2
// Fused AVX2 float32 kernel, symmetric (single-operand) form.
598 detail::pairwiseThresholdOuterAvx2F32Symmetric(X, xNorms, radiusSq, pool, emit);
// Scalar fallback over the upper triangle, diagonal included (j = i).
607 auto runRowRange = [&](std::size_t lo, std::size_t hi) {
608 for (std::size_t i = lo; i < hi; ++i) {
609 for (std::size_t j = i; j < n; ++j) {
// Inclusive threshold, matching the non-symmetric variant.
611 if (distSq <= radiusSq) {
// Dispatch row blocks [0, n) onto the thread pool.
620 ->submit_blocks(std::size_t{0}, n,
621 [&](std::size_t lo, std::size_t hi) { runRowRange(lo, hi); })
#define CLUSTERING_ALWAYS_ASSERT(cond)
Release-active assertion: evaluates cond in every build configuration.
Represents a multidimensional array (NDArray) of a fixed number of dimensions N and element type T.
size_t dim(std::size_t index) const noexcept
Returns the size of a specific dimension of the NDArray.
NDArray< T, 2, Layout::MaybeStrided > t() noexcept
Transposes a rank-2 NDArray into a borrowed view with swapped axes.
bool isMutable() const noexcept
Reports whether writes through operator(), Accessor, or flatIndex are allowed.
constexpr std::size_t pairwiseGemmThreshold
Workload threshold at which pairwiseSqEuclidean switches from the per-pair SIMD kernel to the GEMM-identity kernel.
PairwisePath pairwiseSqEuclideanWithDispatchInfo(const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y, NDArray< T, 2 > &out, Pool pool)
Test-only: runs the same dispatch as pairwiseSqEuclidean and reports which kernel fired.
float horizontalSumAvx2(__m256 v) noexcept
float sqNormRowAvx2(const float *xRow, std::size_t d) noexcept
PairwisePath
Tag identifying which inner kernel executed for a pairwise distance request.
float sqEuclideanRowAvx2(const float *xRow, const float *yRow, std::size_t d) noexcept
T sqEuclideanRow(const NDArray< T, 2, LX > &X, std::size_t i, const NDArray< T, 2, LY > &Y, std::size_t j) noexcept
constexpr std::size_t kAvx2Lanes
void rowNormsSq(const NDArray< T, 2, LX > &X, NDArray< T, 1 > &norms, Pool pool)
Row-wise sum of squares: norms(i) = sum_k X(i, k)^2.
void pairwiseSqEuclideanThresholdedMaterialized(const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y, T radiusSq, Pool pool, Emit &&emit)
Materialized fallback for the thresholded-emit API: compute each pair's squared distance via sqEuclideanRow, then emit (i, j) for every pair whose distance is at most radiusSq.
bool canUseFusedThreshold(const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y) noexcept
Runtime predicate: true when the fused AVX2 threshold path is eligible.
void pairwiseSqEuclideanSimd(const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y, NDArray< T, 2 > &out, Pool pool)
Small-path pairwise squared Euclidean via SIMD accumulation per (i, j) pair.
void pairwiseSqEuclideanGemm(const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y, NDArray< T, 2 > &out, Pool pool)
Large-path pairwise squared Euclidean via the GEMM identity.
T sqNormRow(const NDArray< T, 2, LX > &X, std::size_t i) noexcept
void gemm(const NDArray< T, 2, LA > &A, const NDArray< T, 2, LB > &B, NDArray< T, 2 > &C, Pool pool, T alpha=T{1}, T beta=T{0})
One-shot dense matrix-matrix multiply: C := alpha * A * B + beta * C.
void pairwiseSqEuclideanThresholded(const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y, T radiusSq, Pool pool, Emit &&emit)
Emit every row pair (i, j) whose squared Euclidean distance is at most radiusSq.
void pairwiseSqEuclideanThresholdedSymmetric(const NDArray< T, 2, LX > &X, T radiusSq, Pool pool, Emit &&emit)
Symmetric variant of pairwiseSqEuclideanThresholded for the X == Y case.
void pairwiseSqEuclidean(const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y, NDArray< T, 2 > &out, Pool pool)
Pairwise squared Euclidean distances between rows of two matrices.
T sum(const NDArray< T, 1, L > &x) noexcept
Naive single-pass sum of a rank-1 array.
Thin injection wrapper around a BS::light_thread_pool.
BS::light_thread_pool * pool
Underlying pool, or nullptr to force serial execution.
bool shouldParallelize(std::size_t totalWork, std::size_t minChunk, std::size_t minTasksPerWorker=2) const noexcept
Decide whether totalWork warrants parallel dispatch.