11#include "clustering/math/detail/pairwise_threshold_outer.h"
16#ifdef CLUSTERING_USE_AVX2
// Pairwise dispatch computes workloads like n * m * d in std::size_t
// (see the `work` estimate in pairwiseSqEuclidean); require a 64-bit
// size_t so those products cannot wrap on supported targets.
static_assert(
    sizeof(std::size_t) >= 8,
    "pairwise dispatch assumes a 64-bit std::size_t");
39#ifdef CLUSTERING_USE_AVX2
/// Horizontal sum of all 8 float lanes of an AVX2 register.
/// NOTE(review): the signature line and closing brace were missing from the
/// extracted view; reconstructed from the documented prototype
/// `float horizontalSumAvx2(__m256 v) noexcept`.
inline float horizontalSumAvx2(__m256 v) noexcept {
  // Swap the 128-bit halves and add, so each half holds the cross-half sums.
  const __m256 permute = _mm256_permute2f128_ps(v, v, 1);
  const __m256 s1 = _mm256_add_ps(v, permute);
  // Two horizontal adds collapse the remaining four partial sums into lane 0.
  const __m256 s2 = _mm256_hadd_ps(s1, s1);
  const __m256 s3 = _mm256_hadd_ps(s2, s2);
  return _mm_cvtss_f32(_mm256_castps256_ps128(s3));
}
/// Horizontal sum of all 4 double lanes of an AVX2 register.
/// NOTE(review): signature/closing brace reconstructed — they were missing
/// from the extracted view; mirrors the float overload above.
inline double horizontalSumAvx2(__m256d v) noexcept {
  // Swap the 128-bit halves and add: both halves now hold the cross-half sums.
  const __m256d permute = _mm256_permute2f128_pd(v, v, 1);
  const __m256d s1 = _mm256_add_pd(v, permute);
  // One horizontal add reduces the two remaining partials into lane 0.
  const __m256d s2 = _mm256_hadd_pd(s1, s1);
  return _mm_cvtsd_f64(_mm256_castpd256_pd128(s2));
}
57 __m256 acc = _mm256_setzero_ps();
58 const bool xAligned = (
reinterpret_cast<std::uintptr_t
>(xRow) % 32) == 0;
59 const bool yAligned = (
reinterpret_cast<std::uintptr_t
>(yRow) % 32) == 0;
61 for (; k + 8 <= d; k += 8) {
62 const __m256 vx = xAligned ? _mm256_load_ps(xRow + k) : _mm256_loadu_ps(xRow + k);
63 const __m256 vy = yAligned ? _mm256_load_ps(yRow + k) : _mm256_loadu_ps(yRow + k);
64 const __m256 diff = _mm256_sub_ps(vx, vy);
65 acc = _mm256_add_ps(acc, _mm256_mul_ps(diff, diff));
69 const float diff = xRow[k] - yRow[k];
75inline double sqEuclideanRowAvx2(
const double *xRow,
const double *yRow, std::size_t d)
noexcept {
76 __m256d acc = _mm256_setzero_pd();
77 const bool xAligned = (
reinterpret_cast<std::uintptr_t
>(xRow) % 32) == 0;
78 const bool yAligned = (
reinterpret_cast<std::uintptr_t
>(yRow) % 32) == 0;
80 for (; k + 4 <= d; k += 4) {
81 const __m256d vx = xAligned ? _mm256_load_pd(xRow + k) : _mm256_loadu_pd(xRow + k);
82 const __m256d vy = yAligned ? _mm256_load_pd(yRow + k) : _mm256_loadu_pd(yRow + k);
83 const __m256d diff = _mm256_sub_pd(vx, vy);
84 acc = _mm256_add_pd(acc, _mm256_mul_pd(diff, diff));
88 const double diff = xRow[k] - yRow[k];
/// Lane count of a 256-bit AVX2 vector for element type T: 8 floats or
/// 4 doubles (callers constrain T to those two types via static_assert).
template <class T>
constexpr std::size_t kAvx2Lanes = std::is_same_v<T, float> ? 8 : 4;
// sqEuclideanRow<T, LX, LY>: squared Euclidean distance between row i of X
// and row j of Y (per the documented prototype).
// NOTE(review): this span is a partial extraction — the name/signature line,
// the AVX2 dispatch branch, and the accumulate/return lines are missing.
98template <
class T, Layout LX, Layout LY>
100 std::size_t j)
noexcept {
// Shared inner dimension; assumes X.dim(1) == Y.dim(1) — TODO confirm upstream.
101 const std::size_t d = X.dim(1);
102#ifdef CLUSTERING_USE_AVX2
// Raw row pointers for the contiguous AVX2 fast path (rows are i*d apart).
105 const T *xRow = X.data() + (i * d);
106 const T *yRow = Y.data() + (j * d);
// Scalar fallback: accumulate (X(i,k) - Y(j,k))^2 over the row.
112 for (std::size_t k = 0; k < d; ++k) {
113 const T diff = X(i, k) - Y(j, k);
119#ifdef CLUSTERING_USE_AVX2
122 __m256 acc = _mm256_setzero_ps();
123 const bool aligned = (
reinterpret_cast<std::uintptr_t
>(xRow) % 32) == 0;
125 for (; k + 8 <= d; k += 8) {
126 const __m256 v = aligned ? _mm256_load_ps(xRow + k) : _mm256_loadu_ps(xRow + k);
127 acc = _mm256_add_ps(acc, _mm256_mul_ps(v, v));
131 tail += xRow[k] * xRow[k];
137 __m256d acc = _mm256_setzero_pd();
138 const bool aligned = (
reinterpret_cast<std::uintptr_t
>(xRow) % 32) == 0;
140 for (; k + 4 <= d; k += 4) {
141 const __m256d v = aligned ? _mm256_load_pd(xRow + k) : _mm256_loadu_pd(xRow + k);
142 acc = _mm256_add_pd(acc, _mm256_mul_pd(v, v));
146 tail += xRow[k] * xRow[k];
// sqNormRow<T, LX>: sum of squares of row i of X; mirrors sqEuclideanRow's
// AVX2-contiguous / scalar-fallback split.
// NOTE(review): partial extraction — the name/signature line, the AVX2
// dispatch branch, and the loop body/return are missing from this view.
153template <
class T, Layout LX>
155 const std::size_t d = X.dim(1);
156#ifdef CLUSTERING_USE_AVX2
// Contiguous raw-row pointer handed to the AVX2 kernel.
159 const T *xRow = X.data() + (i * d);
// Scalar fallback accumulating X(i, k)^2 over the row.
165 for (std::size_t k = 0; k < d; ++k) {
// rowNormsSq: norms(i) = sum_k X(i, k)^2, parallelized over row blocks.
// NOTE(review): partial extraction — the signature line, the per-row body
// (presumably a sqNormRow call — TODO confirm), and the serial fallback
// are missing from this view.
186template <
class T, Layout LX>
// Compile-time guard: only float/double element types are supported.
188 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
189 "rowNormsSq<T> requires T to be float or double");
194 const std::size_t n = X.
dim(0);
// Row-block worker run either serially or from the pool.
199 auto runRowRange = [&](std::size_t lo, std::size_t hi)
noexcept {
200 for (std::size_t i = lo; i < hi; ++i) {
// Dispatch row blocks [0, n) onto the thread pool.
207 ->submit_blocks(std::size_t{0}, n,
208 [&](std::size_t lo, std::size_t hi) { runRowRange(lo, hi); })
// pairwiseSqEuclideanGemm: large-path pairwise squared distances via the
// identity ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2; gemm supplies -2 X Y^T.
// NOTE(review): partial extraction — the signature line, the early-return
// body, and the xNorms/yNorms computation are missing from this view.
232template <
class T, Layout LX, Layout LY>
235 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
236 "pairwiseSqEuclideanGemm<T> requires T to be float or double");
243 const std::size_t n = X.
dim(0);
244 const std::size_t m = Y.
dim(0);
// Empty operands short-circuit (body of the branch not visible here).
245 if (n == 0 || m == 0) {
// out := -2 * X * Y^T (alpha = -2, beta = 0) using the transposed view of Y.
254 gemm(X, Y.
t(), out, pool, T{-2}, T{0});
// Broadcast pass: add ||x_i||^2 + ||y_j||^2 to each cell and clamp to 0,
// since the GEMM identity can go slightly negative in floating point.
256 auto runBroadcastRange = [&](std::size_t lo, std::size_t hi)
noexcept {
257 for (std::size_t i = lo; i < hi; ++i) {
258 const T xi = xNorms(i);
259 for (std::size_t j = 0; j < m; ++j) {
262 const T v = (out(i, j) + xi) + yNorms(j);
263 out(i, j) = std::max(v, T{0});
// Total cell count feeds the parallel/serial decision; rows go to the pool
// in blocks, with a serial fallback over all n rows.
268 const std::size_t totalCells = n * m;
271 ->submit_blocks(std::size_t{0}, n,
272 [&](std::size_t lo, std::size_t hi) { runBroadcastRange(lo, hi); })
275 runBroadcastRange(0, n);
// pairwiseSqEuclideanSimd: small-path kernel — one per-pair distance per
// (i, j) cell of out.
// NOTE(review): partial extraction — the signature line and the inner-loop
// body (presumably out(i, j) = sqEuclideanRow(...) — TODO confirm) are
// missing from this view.
295template <
class T, Layout LX, Layout LY>
298 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
299 "pairwiseSqEuclideanSimd<T> requires T to be float or double");
306 const std::size_t n = X.
dim(0);
307 const std::size_t m = Y.
dim(0);
// Nothing to do for empty operands.
308 if (n == 0 || m == 0) {
// Row-block worker: fills every column j for rows i in [lo, hi).
312 auto runRowRange = [&](std::size_t lo, std::size_t hi)
noexcept {
313 for (std::size_t i = lo; i < hi; ++i) {
314 for (std::size_t j = 0; j < m; ++j) {
// Dispatch row blocks [0, n) onto the thread pool.
322 ->submit_blocks(std::size_t{0}, n,
323 [&](std::size_t lo, std::size_t hi) { runRowRange(lo, hi); })
// pairwiseSqEuclidean: public entry point — dispatches between the per-pair
// SIMD kernel and the GEMM-identity path based on total workload.
// NOTE(review): partial extraction — the signature line and the dispatch
// branches are missing from this view.
349template <
class T, Layout LX = Layout::Contig, Layout LY = Layout::Contig>
352 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
353 "pairwiseSqEuclidean<T> requires T to be float or double");
360 const std::size_t n = X.
dim(0);
361 const std::size_t m = Y.
dim(0);
362 if (n == 0 || m == 0) {
// Scalar-op estimate n * m * d, presumably compared against
// pairwiseGemmThreshold — TODO confirm (comparison line not visible).
366 const std::size_t work = n * m * X.
dim(1);
// pairwiseSqEuclideanWithDispatchInfo: test-only twin of pairwiseSqEuclidean
// that runs the same dispatch and reports which kernel fired (PairwisePath).
// NOTE(review): partial extraction — the signature line, the dispatch
// branches, and the returns are missing from this view.
393template <
class T, Layout LX = Layout::Contig, Layout LY = Layout::Contig>
397 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
398 "pairwiseSqEuclideanWithDispatchInfo<T> requires T to be float or double");
405 const std::size_t n = X.
dim(0);
406 const std::size_t m = Y.
dim(0);
407 if (n == 0 || m == 0) {
// Same n * m * d workload heuristic as the public entry point.
411 const std::size_t work = n * m * X.
dim(1);
// canUseFusedThreshold: runtime predicate — true when the fused AVX2
// threshold kernel may be used for this (X, Y) pair.
// NOTE(review): partial extraction — the signature line and the return
// statements of each branch are missing from this view.
431template <
class T, Layout LX, Layout LY>
433#ifdef CLUSTERING_USE_AVX2
435 const std::size_t n = X.dim(0);
436 const std::size_t m = Y.dim(0);
437 const std::size_t d = X.dim(1);
// Degenerate shapes cannot use (and do not need) the fused kernel.
438 if (n == 0 || m == 0 || d == 0) {
// Requires at least one full 8-lane vector; d is also capped at
// kThresholdMaxD (bound defined elsewhere in the project).
441 if (d < 8 || d > kThresholdMaxD) {
// Both operands must be 32-byte aligned for the aligned-load kernel.
444 if (!X.template isAligned<32>() || !Y.template isAligned<32>()) {
// pairwiseSqEuclideanThresholdedMaterialized: fallback for the thresholded
// API — computes each pair's squared distance and emits (i, j) when it is
// within radiusSq.
// NOTE(review): partial extraction — the signature line, the distSq
// computation, and the emit call are missing from this view.
470template <
class T, Layout LX, Layout LY,
class Emit>
// Emit must be callable as emit(i, j).
471 requires std::invocable<Emit &, std::size_t, std::size_t>
475 const std::size_t n = X.
dim(0);
476 const std::size_t m = Y.
dim(0);
477 if (n == 0 || m == 0) {
// Row-block worker; not noexcept, presumably because Emit may throw —
// TODO confirm.
481 auto runRowRange = [&](std::size_t lo, std::size_t hi) {
482 for (std::size_t i = lo; i < hi; ++i) {
483 for (std::size_t j = 0; j < m; ++j) {
// Inclusive threshold: pairs exactly at radiusSq are emitted.
485 if (distSq <= radiusSq) {
// Dispatch row blocks [0, n) onto the thread pool.
497 ->submit_blocks(std::size_t{0}, n,
498 [&](std::size_t lo, std::size_t hi) { runRowRange(lo, hi); })
// pairwiseSqEuclideanThresholded: emit every row pair (i, j) whose squared
// Euclidean distance is at most radiusSq; uses the fused AVX2 float32
// kernel when available.
// NOTE(review): partial extraction — the signature's leading parameters,
// the eligibility branch, the norm computation, and the fallback call are
// missing from this view.
529template <
class T, Layout LX = Layout::Contig, Layout LY = Layout::Contig,
class Emit>
// Emit must be callable as emit(i, j).
530 requires std::invocable<Emit &, std::size_t, std::size_t>
532 T radiusSq,
Pool pool, Emit &&emit) {
533 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
534 "pairwiseSqEuclideanThresholded<T> requires T to be float or double");
537 const std::size_t n = X.
dim(0);
538 const std::size_t m = Y.
dim(0);
539 if (n == 0 || m == 0) {
543#ifdef CLUSTERING_USE_AVX2
// Fused float32 kernel folds the threshold test into the distance pass
// (detail:: helper from pairwise_threshold_outer.h, included at the top).
550 detail::pairwiseThresholdOuterAvx2F32(X, Y, xNorms, yNorms, radiusSq, pool, emit);
// pairwiseSqEuclideanThresholdedSymmetric: X == Y variant — scans only the
// upper triangle (j starts at i), roughly halving the work.
// NOTE(review): partial extraction — the signature line, the distSq
// computation, and the emit call are missing from this view.
581template <
class T, Layout LX = Layout::Contig,
class Emit>
// Emit must be callable as emit(i, j).
582 requires std::invocable<Emit &, std::size_t, std::size_t>
585 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
586 "pairwiseSqEuclideanThresholdedSymmetric<T> requires T to be float or double");
588 const std::size_t n = X.
dim(0);
593#ifdef CLUSTERING_USE_AVX2
// Fused AVX2 float32 kernel, symmetric (single-operand) form.
598 detail::pairwiseThresholdOuterAvx2F32Symmetric(X, xNorms, radiusSq, pool, emit);
// Scalar fallback over the upper triangle, diagonal included (j = i).
607 auto runRowRange = [&](std::size_t lo, std::size_t hi) {
608 for (std::size_t i = lo; i < hi; ++i) {
609 for (std::size_t j = i; j < n; ++j) {
// Inclusive threshold, matching the non-symmetric variant.
611 if (distSq <= radiusSq) {
// Dispatch row blocks [0, n) onto the thread pool.
620 ->submit_blocks(std::size_t{0}, n,
621 [&](std::size_t lo, std::size_t hi) { runRowRange(lo, hi); })
#define CLUSTERING_ALWAYS_ASSERT(cond)
Release-active assertion: evaluates cond in every build configuration.
Represents a multidimensional array (NDArray) of a fixed number of dimensions N and element type T.
size_t dim(std::size_t index) const noexcept
Returns the size of a specific dimension of the NDArray.
NDArray< T, 2, Layout::MaybeStrided > t() noexcept
Transposes a rank-2 NDArray into a borrowed view with swapped axes.
bool isMutable() const noexcept
Reports whether writes through operator(), Accessor, or flatIndex are allowed.
constexpr std::size_t pairwiseGemmThreshold
Workload threshold at which pairwiseSqEuclidean switches from the per-pair SIMD kernel to the GEMM-identity kernel.
PairwisePath pairwiseSqEuclideanWithDispatchInfo(const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y, NDArray< T, 2 > &out, Pool pool)
Test-only: runs the same dispatch as pairwiseSqEuclidean and reports which kernel fired.
float horizontalSumAvx2(__m256 v) noexcept
float sqNormRowAvx2(const float *xRow, std::size_t d) noexcept
PairwisePath
Tag identifying which inner kernel executed for a pairwise distance request.
float sqEuclideanRowAvx2(const float *xRow, const float *yRow, std::size_t d) noexcept
T sqEuclideanRow(const NDArray< T, 2, LX > &X, std::size_t i, const NDArray< T, 2, LY > &Y, std::size_t j) noexcept
constexpr std::size_t kAvx2Lanes
void rowNormsSq(const NDArray< T, 2, LX > &X, NDArray< T, 1 > &norms, Pool pool)
Row-wise sum of squares: norms(i) = sum_k X(i, k)^2.
void pairwiseSqEuclideanThresholdedMaterialized(const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y, T radiusSq, Pool pool, Emit &&emit)
Materialized fallback for the thresholded-emit API: compute each pair's squared distance via sqEuclideanRow, then emit (i, j) for every pair whose distance is at most radiusSq.
bool canUseFusedThreshold(const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y) noexcept
Runtime predicate: true when the fused AVX2 threshold path is eligible.
void pairwiseSqEuclideanSimd(const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y, NDArray< T, 2 > &out, Pool pool)
Small-path pairwise squared Euclidean via SIMD accumulation per (i, j) pair.
void pairwiseSqEuclideanGemm(const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y, NDArray< T, 2 > &out, Pool pool)
Large-path pairwise squared Euclidean via the GEMM identity.
T sqNormRow(const NDArray< T, 2, LX > &X, std::size_t i) noexcept
void gemm(const NDArray< T, 2, LA > &A, const NDArray< T, 2, LB > &B, NDArray< T, 2 > &C, Pool pool, T alpha=T{1}, T beta=T{0})
One-shot dense matrix-matrix multiply: C := alpha * A * B + beta * C.
void pairwiseSqEuclideanThresholded(const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y, T radiusSq, Pool pool, Emit &&emit)
Emit every row pair (i, j) whose squared Euclidean distance is at most radiusSq.
void pairwiseSqEuclideanThresholdedSymmetric(const NDArray< T, 2, LX > &X, T radiusSq, Pool pool, Emit &&emit)
Symmetric variant of pairwiseSqEuclideanThresholded for the X == Y case.
void pairwiseSqEuclidean(const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y, NDArray< T, 2 > &out, Pool pool)
Pairwise squared Euclidean distances between rows of two matrices.
T sum(const NDArray< T, 1, L > &x) noexcept
Naive single-pass sum of a rank-1 array.
Thin injection wrapper around a BS::light_thread_pool.
BS::light_thread_pool * pool
Underlying pool, or nullptr to force serial execution.
bool shouldParallelize(std::size_t totalWork, std::size_t minChunk, std::size_t minTasksPerWorker=2) const noexcept
Decide whether totalWork warrants parallel dispatch.