Namespaces
namespace	defaults
namespace	distance
namespace	detail

Classes
class	GemmPlan
	Reusable GEMM plan: packs `B` once at construction, amortizes the packing cost across repeated `execute` calls with varying `A`. More...
struct	pcg64
	128-bit state for the PCG-XSL-RR 64-bit output generator (Melissa O'Neill). More...
struct	xoshiro256ss
	256-bit state for Vigna & Blackman's xoshiro256** generator. More...
struct	Pool
	Thin injection wrapper around a `BS::light_thread_pool`. More...

Functions
template<class T>
T	pointAabbGapSq (const T *point, std::span< const T > boxMin, std::span< const T > boxMax) noexcept
	Squared gap distance between a point and an axis-aligned bounding box.
template<class T>
void	centroidShift (const NDArray< T, 2, Layout::Contig > &cOld, const NDArray< T, 2, Layout::Contig > &cNew, NDArray< T, 1 > &outShiftSq, Pool pool)
	Per-row squared shift between two centroid matrices of identical shape.
template<class T, std::size_t N, Layout LA, Layout LB>
bool	arrayEqual (const NDArray< T, N, LA > &a, const NDArray< T, N, LB > &b) noexcept
	Element-wise exact equality between two NDArrays of matching shape.
template<class T, std::size_t N, Layout LA, Layout LB>
bool	allClose (const NDArray< T, N, LA > &a, const NDArray< T, N, LB > &b, T rtol=T(1e-5), T atol=T(1e-8)) noexcept
	Element-wise approximate equality with NumPy-style asymmetric tolerance.
template<class T, Layout LA, Layout LB, class Backend = defaults::Backend>
void	gemm (const NDArray< T, 2, LA > &A, const NDArray< T, 2, LB > &B, NDArray< T, 2 > &C, Pool pool, T alpha=T{1}, T beta=T{0})
	One-shot dense matrix-matrix multiply: `C` := alpha * A * B + beta * C.
template<class T, Layout LX = Layout::Contig, Layout LY = Layout::Contig>
void	pairwiseSqEuclidean (const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y, NDArray< T, 2 > &out, Pool pool)
	Pairwise squared Euclidean distances between rows of two matrices.
template<class T, Layout LX = Layout::Contig, Layout LY = Layout::Contig, class Emit>
void	pairwiseSqEuclideanThresholded (const NDArray< T, 2, LX > &X, const NDArray< T, 2, LY > &Y, T radiusSq, Pool pool, Emit &&emit)
	Emit every row pair (i, j) whose squared Euclidean distance is at most `radiusSq`.
template<class T, Layout LX = Layout::Contig, class Emit>
void	pairwiseSqEuclideanThresholdedSymmetric (const NDArray< T, 2, LX > &X, T radiusSq, Pool pool, Emit &&emit)
	Symmetric variant of pairwiseSqEuclideanThresholded for the X == Y case.
std::array< std::size_t, 2 >	chunkedMaterializedScratchShape (std::size_t n, std::size_t k) noexcept
	Required shape for the chunked materialized argmin scratch buffer.
template<class T, Layout LX = Layout::Contig, Layout LC = Layout::Contig>
void	pairwiseArgminSqEuclidean (const NDArray< T, 2, LX > &X, const NDArray< T, 2, LC > &C, const NDArray< T, 1 > &cSqNorms, NDArray< std::int32_t, 1 > &labels, NDArray< T, 1 > &outMinDistSq, Pool pool)
	Per-row argmin and minimum squared distance of rows of `X` against rows of `C`.
template<class T, Layout L>
T	sum (const NDArray< T, 1, L > &x) noexcept
	Naive single-pass sum of a rank-1 array.
template<class T, Layout L>
std::size_t	argmin (const NDArray< T, 1, L > &x) noexcept
	Index of the first minimum in a rank-1 array.
template<class T, Layout L>
std::size_t	argmax (const NDArray< T, 1, L > &x) noexcept
	Index of the first maximum in a rank-1 array.
template<class T, Layout L>
void	topk (const NDArray< T, 1, L > &x, std::size_t k, std::span< std::size_t > outIdx) noexcept
	Indices of the top-`k` largest values, written in descending value order.
std::uint64_t	advanceState (pcg64 &rng) noexcept
	Advance a pcg64 one step and return the 64-bit XSL-RR output.
std::uint64_t	advanceState (xoshiro256ss &rng) noexcept
	Advance a xoshiro256ss one step and return the 64-bit output.
template<class Rng>
std::uint32_t	randUniformU32 (Rng &rng) noexcept
	Draw a 32-bit unsigned integer uniformly at random from the full u32 range.
template<class Rng>
std::uint64_t	randUniformU64 (Rng &rng) noexcept
	Draw a 64-bit unsigned integer uniformly at random from the full u64 range.
template<class T, class Rng>
T	randUnit (Rng &rng) noexcept
	Draw a uniform variate in the half-open unit interval [0, 1).
template<class T, Layout L, class Rng>
std::size_t	weightedCategorical (const NDArray< T, 1, L > &weights, Rng &rng) noexcept
	Sample one category index proportionally to non-negative weights.
template<class T, Layout L, class Rng>
void	aExpjReservoir (const NDArray< T, 1, L > &weights, std::size_t k, Rng &rng, std::span< std::size_t > outIdx) noexcept
	Efraimidis-Spirakis weighted reservoir sampling (A-Exp variant, log-key form).
std::size_t	clampedJobCount (std::size_t nJobs) noexcept
	Clamp a caller-supplied `nJobs` to a valid `BS::light_thread_pool` worker count.
bool	shouldSpawnPool (std::size_t totalOps, std::size_t nJobs, std::size_t minOpsPerWorker=std::size_t{1}<< 15) noexcept
	Decide whether spawning a pool with `nJobs` workers is worth it for `totalOps` of arithmetic work.

Variables
constexpr std::size_t	pairwiseArgminChunkRows = 256
	Chunk height used by the materialized argmin path when striping over `n`.

Function Documentation

◆ advanceState() [1/2]

std::uint64_t clustering::math::advanceState ( pcg64 & rng )

inlinenoexcept

Advance a pcg64 one step and return the 64-bit XSL-RR output.

Parameters

rng	In-out generator state; mutated by the call.

Returns: Next 64-bit output word.

Definition at line 63 of file rng.h.

◆ advanceState() [2/2]

std::uint64_t clustering::math::advanceState ( xoshiro256ss & rng )

inlinenoexcept

Advance a xoshiro256ss one step and return the 64-bit output.

Parameters

rng	In-out generator state; mutated by the call.

Returns: Next 64-bit output word.

Definition at line 111 of file rng.h.

◆ aExpjReservoir()

template<class T, Layout L, class Rng>

void clustering::math::aExpjReservoir	(	const NDArray< T, 1, L > &	weights,
		std::size_t	k,
		Rng &	rng,
		std::span< std::size_t >	outIdx )

inlinenoexcept

Efraimidis-Spirakis weighted reservoir sampling (A-Exp variant, log-key form).

For each item i draws u_i uniformly in (0, 1) and computes the key key_i = log(u_i) / w_i; the k items with the largest keys are selected. The log form is algebraically identical to the paper's u_i^(1/w_i) but avoids silent underflow when w_i is small (the naive pow form collapses to 0 once 1/w_i exceeds ~1075, biasing the selection).

Reference: Efraimidis & Spirakis, "Weighted random sampling with a reservoir," IPL 97 (2006), https://arxiv.org/pdf/1012.0256.

Template Parameters

T	Element type of the weight array (`float` or `double`).
L	Layout tag of the weight array; accepts both contiguous and strided.
Rng	Generator type accepted by advanceState.

Parameters

weights	Rank-1 array of strictly positive weights.
k	Number of indices to select; must satisfy k <= weights.dim(0).
rng	In-out generator state.
outIdx	Output buffer of exactly `k` positions; filled with the sampled indices in unspecified order.

Definition at line 233 of file rng.h.

◆ allClose()

template<class T, std::size_t N, Layout LA, Layout LB>

bool clustering::math::allClose	(	const NDArray< T, N, LA > &	a,
		const NDArray< T, N, LB > &	b,
		T	rtol = T(1e-5),
		T	atol = T(1e-8) )

noexcept

Element-wise approximate equality with NumPy-style asymmetric tolerance.

True when |a_i - b_i| <= atol + rtol * |b_i| for every element. b sits on the right-hand side of the tolerance term intentionally, matching numpy.allclose; the relation is not symmetric in the two arguments.

Template Parameters

T	Element type; must be a floating-point type.
N	Rank of both arrays.
LA	Layout tag of `a`.
LB	Layout tag of `b`.

Parameters

a	Left-hand operand.
b	Right-hand operand; `rtol` scales against \|b\|.
rtol	Relative tolerance, defaults to `1e-5`.
atol	Absolute tolerance, defaults to `1e-8`.

Returns: true iff shapes match and every element pair is within tolerance.

Definition at line 78 of file equality.h.

◆ argmax()

template<class T, Layout L>

std::size_t clustering::math::argmax ( const NDArray< T, 1, L > & x )

inlinenoexcept

Index of the first maximum in a rank-1 array.

Mirror of argmin with strict > comparison. Asserts non-empty input.

Template Parameters

T	Element type; `float` or `double`.
L	Layout tag.

Parameters

x	Non-empty rank-1 array.

Returns: Index of the first element attaining the maximum value.

Definition at line 74 of file reduce.h.

◆ argmin()

template<class T, Layout L>

std::size_t clustering::math::argmin ( const NDArray< T, 1, L > & x )

inlinenoexcept

Index of the first minimum in a rank-1 array.

Strict < comparison during the scan: on ties the earliest index wins. Asserts the input is non-empty; an empty argmin has no meaningful answer.

Template Parameters

T	Element type; `float` or `double`.
L	Layout tag.

Parameters

x	Non-empty rank-1 array.

Returns: Index of the first element attaining the minimum value.

Definition at line 47 of file reduce.h.

◆ arrayEqual()

template<class T, std::size_t N, Layout LA, Layout LB>

bool clustering::math::arrayEqual	(	const NDArray< T, N, LA > &	a,
		const NDArray< T, N, LB > &	b )

noexcept

Element-wise exact equality between two NDArrays of matching shape.

Shapes are compared dimension-by-dimension before any elements are read; a shape mismatch reports false without touching element storage. Empty shapes (any axis zero) compare equal vacuously.

Template Parameters

T	Element type of both arrays (must match).
N	Rank of both arrays (must match; enforced by the template signature).
LA	Layout tag of `a`.
LB	Layout tag of `b`.

Parameters

a	Left-hand operand.
b	Right-hand operand.

Returns: true iff a.dim(k) == b.dim(k) for every axis and every pairwise element is bitwise-equal under operator==.

Definition at line 29 of file equality.h.

◆ centroidShift()

template<class T>

void clustering::math::centroidShift	(	const NDArray< T, 2, Layout::Contig > &	cOld,
		const NDArray< T, 2, Layout::Contig > &	cNew,
		NDArray< T, 1 > &	outShiftSq,
		Pool	pool )

Per-row squared shift between two centroid matrices of identical shape.

Writes outShiftSq(c) = sum_t (cNew(c, t) - cOld(c, t))^2 for every cluster c. Composes from detail::sqEuclideanRow, which picks an AVX2 reduction on contiguous 32-byte-aligned inputs and falls back to a scalar loop otherwise. The outer row loop fans out over pool using the same shouldParallelize gate as rowNormsSq.

Template Parameters

T	Element type (`float` or `double`).

Parameters

cOld	Previous centroids (k x d), contiguous.
cNew	Current centroids (k x d), contiguous.
outShiftSq	Rank-1 output of length k; isMutable() must be true.
pool	Parallelism injection.

Definition at line 28 of file centroid_shift.h.

◆ chunkedMaterializedScratchShape()

std::array< std::size_t, 2 > clustering::math::chunkedMaterializedScratchShape	(	std::size_t	n,
		std::size_t	k )

inlinenodiscardnoexcept

Required shape for the chunked materialized argmin scratch buffer.

The caller sizes an owning NDArray<T, 2> with these dimensions and forwards it as distsScratch to pairwiseArgminMaterializedWithScratch. The height collapses to n when n is smaller than the chunk row cap so small inputs do not overallocate.

Parameters

n	Row count of the data matrix.
k	Row count of the centroid matrix.

Definition at line 37 of file pairwise_argmin.h.

◆ clampedJobCount()

std::size_t clustering::math::clampedJobCount ( std::size_t nJobs )

inlinenoexcept

Clamp a caller-supplied nJobs to a valid BS::light_thread_pool worker count.

Algorithm wrappers (KMeans, DBSCAN) accept nJobs via the scikit-learn convention: 0 means "use hardware concurrency." std::thread::hardware_concurrency() can itself return 0 on exotic targets; this helper promotes that to 1 so the pool-worker count is always valid.

Parameters

nJobs Caller-supplied worker count; 0 means "match hardware concurrency."

Returns: Non-zero worker count suitable for constructing a BS::light_thread_pool.

Definition at line 21 of file thread.h.

◆ gemm()

template<class T, Layout LA, Layout LB, class Backend = defaults::Backend>

void clustering::math::gemm	(	const NDArray< T, 2, LA > &	A,
		const NDArray< T, 2, LB > &	B,
		NDArray< T, 2 > &	C,
		Pool	pool,
		T	alpha = T{1},
		T	beta = T{0} )

One-shot dense matrix-matrix multiply: C := alpha * A * B + beta * C.

Template Parameters

T	Element type (`float` or `double`).
LA	Layout tag of A; CTAD-resolved at the call site so gemm(A, B.t(), C, pool) binds without explicit template arguments.
LB	Layout tag of B.
Backend	Backend tag; defaulted to `defaults::Backend`. Swap project-wide via `CLUSTERING_MATH_DEFAULT_BACKEND` or per call site by naming the argument.

Parameters

A	Input matrix (M x K).
B	Input matrix (K x N).
C	Output matrix (M x N). isMutable() must be true; enforced by an always-assert that fires in release too.
pool	Parallelism injection; forwarded to the backend.
alpha	Scalar multiplier on `A*B`.
beta	Scalar multiplier on the prior `C`.

Definition at line 31 of file gemm.h.

◆ pairwiseArgminSqEuclidean()

template<class T, Layout LX = Layout::Contig, Layout LC = Layout::Contig>

void clustering::math::pairwiseArgminSqEuclidean	(	const NDArray< T, 2, LX > &	X,
		const NDArray< T, 2, LC > &	C,
		const NDArray< T, 1 > &	cSqNorms,
		NDArray< std::int32_t, 1 > &	labels,
		NDArray< T, 1 > &	outMinDistSq,
		Pool	pool )

Per-row argmin and minimum squared distance of rows of X against rows of C.

Writes labels(i) = argmin_j ||X(i) - C(j)||^2 and outMinDistSq(i) = min_j ||X(i) - C(j)||^2 for every row of X. cSqNorms must hold ||C(j)||^2 per row of C; callers typically produce it via rowNormsSq.

Dispatches between a fused AVX2 outer driver (d <= defaults::pairwiseArgminMaxD, float, contiguous, 32-byte aligned X and C) and a materialized two-step (pairwiseSqEuclidean + per-row scan) otherwise. Both paths produce labels consistent within float-reassociation tolerance; strict-less-than tie-break mirrors math::argmin.

Template Parameters

T	Element type (`float` or `double`).
LX	Layout tag of `X`; CTAD-resolved.
LC	Layout tag of `C`; CTAD-resolved.

Parameters

X	Data matrix (n x d).
C	Centroid matrix (k x d); must have the same column count as `X`.
cSqNorms	Row-1 array of squared norms, length `k`.
labels	Mutable row-1 output of length `n`, holds the argmin per row.
outMinDistSq	Mutable row-1 output of length `n`, holds the minimum distance per row.
pool	Parallelism injection; forwarded to the selected path.

Definition at line 232 of file pairwise_argmin.h.

◆ pairwiseSqEuclidean()

template<class T, Layout LX = Layout::Contig, Layout LY = Layout::Contig>

void clustering::math::pairwiseSqEuclidean	(	const NDArray< T, 2, LX > &	X,
		const NDArray< T, 2, LY > &	Y,
		NDArray< T, 2 > &	out,
		Pool	pool )

Pairwise squared Euclidean distances between rows of two matrices.

Writes out(i, j) = sum_k (X(i, k) - Y(j, k))^2 for every row pair. out must be mutable-owned and contiguous; shape mismatches trigger a release-active assert. Internally dispatches between a SIMD-per-pair kernel (small workloads) and a GEMM-identity kernel (large workloads) against defaults::pairwiseGemmThreshold on n*m*d.

Template Parameters

T	Element type (`float` or `double`).
LX	Layout tag of `X`; CTAD-resolved so strided views (e.g. Z.t()) bind without explicit template arguments.
LY	Layout tag of `Y`.

Parameters

X	Left operand (n x d).
Y	Right operand (m x d).
out	Output matrix (n x m). isMutable() must be true.
pool	Parallelism injection; forwarded to the selected kernel.

Definition at line 350 of file pairwise.h.

◆ pairwiseSqEuclideanThresholded()

template<class T, Layout LX = Layout::Contig, Layout LY = Layout::Contig, class Emit>

void clustering::math::pairwiseSqEuclideanThresholded	(	const NDArray< T, 2, LX > &	X,
		const NDArray< T, 2, LY > &	Y,
		T	radiusSq,
		Pool	pool,
		Emit &&	emit )

Emit every row pair (i, j) whose squared Euclidean distance is at most radiusSq.

Dispatches between a fused AVX2 tile kernel that streams survivors directly out of the microkernel epilogue (no materialized (n x m) tile) and a materialized fallback (pairwiseSqEuclidean plus a scalar survivor scan). The fused path fires emit in row-major order within each tile; the materialized path fires globally row-major. Both paths invoke emit exactly once per surviving cell.

Template Parameters

T	Element type (`float` or `double`).
LX	Layout tag of `X`.
LY	Layout tag of `Y`.
Emit	`std::invocable<std::size_t, std::size_t>` callable.

Parameters

X	Left operand (n x d).
Y	Right operand (m x d).
radiusSq	Non-negative squared radius; interpreted as `r*r` so callers already squaring the radius avoid a redundant multiplication.
pool	Parallelism injection forwarded to the selected path.
emit	Callback invoked for each surviving (row, col) pair; row indexes into `X`, col indexes into `Y`. The caller owns thread-safety of `emit` if `pool` is non-serial.

Definition at line 531 of file pairwise.h.

◆ pairwiseSqEuclideanThresholdedSymmetric()

template<class T, Layout LX = Layout::Contig, class Emit>

void clustering::math::pairwiseSqEuclideanThresholdedSymmetric	(	const NDArray< T, 2, LX > &	X,
		T	radiusSq,
		Pool	pool,
		Emit &&	emit )

Symmetric variant of pairwiseSqEuclideanThresholded for the X == Y case.

Adjacency over a single point cloud is symmetric: ||x_i - x_j||^2 = ||x_j - x_i||^2, so half the (i, j) pairs the general entry would compute are mirrors of the other half. This entry skips the lower-triangular work entirely and invokes emit exactly once per surviving upper-triangular cell, with row <= col always. The caller is responsible for any mirror push (typically adj[row].push(col) plus adj[col].push(row) when row != col), which keeps the kernel agnostic to the consumer's adjacency layout.

Falls back to a scalar materialised path that walks only j >= i when the AVX2 fast path is ineligible (wrong type, strided view, tiny d, AVX2 off).

Template Parameters

T	Element type (`float` or `double`).
LX	Layout tag of `X`.
Emit	`std::invocable<std::size_t, std::size_t>` callable.

Parameters

X	Point matrix (n x d).
radiusSq	Non-negative squared radius; interpreted as `r*r`.
pool	Parallelism injection forwarded to the selected path.
emit	Callback invoked once per upper-triangular surviving (row, col) with row <= col; the caller mirrors as needed.

Definition at line 583 of file pairwise.h.

◆ pointAabbGapSq()

template<class T>

T clustering::math::pointAabbGapSq	(	const T *	point,
		std::span< const T >	boxMin,
		std::span< const T >	boxMax )

inlinenodiscardnoexcept

Squared gap distance between a point and an axis-aligned bounding box.

Per dimension, either the point lies inside the extent (zero gap) or the gap is the signed distance to the nearer face. The return is the sum of squared gaps, a strict lower bound on the squared Euclidean distance from the point to any point inside the box. Used as the prune key in single-tree KDTree traversals.

When CLUSTERING_USE_AVX2 is defined dispatches to the AVX2 kernel which processes d in ceil(d/8) ymm-wide tiles plus a scalar tail. The vector path is branch-free per dimension; the tail loop matches the scalar reference exactly so dimensions entirely below 8 (e.g. d=2 or d=4) get the same answer as the scalar fallback.

Numerically equivalent to the scalar reference within (ceil(d/8)+1)*ULP-2 on finite inputs: one ULP-2 budget per accumulated 8-wide partial plus one for the horizontal reduction. Strict ULP-2 holds at d<=8 (single tile, two roundings).

Warning: Bit-equivalence with the scalar reference is not guaranteed: the AVX2 horizontal reduction differs from a scalar left-fold under FMA. NaN inputs also diverge: the scalar predicate treats NaN as "inside" (gap = 0), the vector path propagates NaN through the max chain. Callers requiring NaN-safe behaviour must pre-filter.

Template Parameters

T	Scalar element type; `float` only (a `double` specialization is out of scope).

Parameters

point	Length-`d` query coordinates.
boxMin	AABB minimum coordinates; length defines `d`.
boxMax	AABB maximum coordinates; length must equal boxMin.size().

Returns: sum_{j=0..d-1} max(0, max(boxMin[j]-point[j], point[j]-boxMax[j]))^2.

Definition at line 44 of file aabb.h.

◆ randUniformU32()

template<class Rng>

std::uint32_t clustering::math::randUniformU32 ( Rng & rng )

inlinenoexcept

Draw a 32-bit unsigned integer uniformly at random from the full u32 range.

Returns the top 32 bits of the 64-bit output; in generators like PCG64 these are higher-quality than the low bits.

Definition at line 132 of file rng.h.

◆ randUniformU64()

template<class Rng>

std::uint64_t clustering::math::randUniformU64 ( Rng & rng )

inlinenoexcept

Draw a 64-bit unsigned integer uniformly at random from the full u64 range.

Definition at line 139 of file rng.h.

◆ randUnit()

template<class T, class Rng>

T clustering::math::randUnit ( Rng & rng )

inlinenoexcept

Draw a uniform variate in the half-open unit interval [0, 1).

For double, returns (u >> 11) * 2^-53, the canonical bias-free form that selects one of 2^53 representable doubles in [0, 1). For float, returns (u >> 40) * 2^-24 over 2^24 representable floats. Both forms round-trip without the 1.0 endpoint.

Template Parameters

T	Either `float` or `double`.

Definition at line 152 of file rng.h.

◆ shouldSpawnPool()

bool clustering::math::shouldSpawnPool	(	std::size_t	totalOps,
		std::size_t	nJobs,
		std::size_t	minOpsPerWorker = std::size_t{1} << 15 )

inlinenoexcept

Decide whether spawning a pool with nJobs workers is worth it for totalOps of arithmetic work.

Complements Pool::shouldParallelizeWork: that method gates fan-out once a pool is already attached, whereas this free helper decides whether to pay the one-time thread-spawn cost at all. Returning false at small shapes lets the algorithm wrappers skip tens of microseconds of pthread-create traffic a fully-serial fit would never amortize.

Parameters

totalOps	Approximate total arithmetic operation count across all workers.
nJobs	Target worker count (see clampedJobCount for the clamp rule).
minOpsPerWorker	Minimum per-worker op budget that amortizes dispatch overhead.

Returns: true when the pool should be spawned, false when serial-only pays.

Definition at line 43 of file thread.h.

◆ sum()

template<class T, Layout L>

T clustering::math::sum ( const NDArray< T, 1, L > & x )

inlinenoexcept

Naive single-pass sum of a rank-1 array.

Straight accumulation with no compensation. Accurate enough when all magnitudes are close and n is modest.

Template Parameters

T	Element type; `float` or `double`.
L	Layout tag; contiguous and strided inputs are both accepted via x(i).

Parameters

x	Rank-1 array; empty input returns T(0).

Returns: Sum of the elements, or T(0) for an empty input.

Definition at line 25 of file reduce.h.

◆ topk()

template<class T, Layout L>

void clustering::math::topk	(	const NDArray< T, 1, L > &	x,
		std::size_t	k,
		std::span< std::size_t >	outIdx )

inlinenoexcept

Indices of the top-k largest values, written in descending value order.

Stages (value, index) pairs and calls std::partial_sort_copy with a comparator that orders by value descending; equal values tie-break by index ascending so the output is stable and deterministic. The output span is an std::span<std::size_t> rather than an NDArray<std::size_t,1> because NDArray's element type must be float or double.

Template Parameters

T	Element type; `float` or `double`.
L	Layout tag of the input.

Parameters

x	Rank-1 array of values to rank.
k	Number of indices to emit; must satisfy k <= x.dim(0).
outIdx	Output buffer of exactly `k` positions; filled with indices sorted by descending value.

Definition at line 107 of file reduce.h.

◆ weightedCategorical()

template<class T, Layout L, class Rng>

std::size_t clustering::math::weightedCategorical	(	const NDArray< T, 1, L > &	weights,
		Rng &	rng )

inlinenoexcept

Sample one category index proportionally to non-negative weights.

Implements the k-means++ seeding primitive. Draws u in [0, 1), scales to the weight total, and walks the cumulative sum returning the first index whose prefix-sum is strictly greater than u*total. The strict comparison matches numpy.searchsorted(side='right') and avoids the classical <= vs < off-by-one that biases toward index 0 when a weight equals the threshold exactly.

Template Parameters

T	Element type of the weight array (`float` or `double`).
L	Layout tag of the weight array; accepts both contiguous and strided.
Rng	Generator type accepted by advanceState.

Parameters

weights	Non-empty rank-1 array of non-negative weights with at least one positive entry.
rng	In-out generator state.

Returns: Index in [0, weights.dim(0)).

Definition at line 179 of file rng.h.

Variable Documentation

◆ pairwiseArgminChunkRows

std::size_t clustering::math::pairwiseArgminChunkRows = 256

inlineconstexpr

Chunk height used by the materialized argmin path when striping over n.

Matches scikit-learn's CHUNK_SIZE = 256 for euclidean-distance assignment so the per-chunk distance tile (chunkRows * k * sizeof(T)) stays L2-resident on targets with >=256 KiB L2.

Definition at line 24 of file pairwise_argmin.h.

Namespaces

Classes

Functions

Variables

Function Documentation

◆ advanceState() [1/2]

◆ advanceState() [2/2]

◆ aExpjReservoir()

◆ allClose()

◆ argmax()

◆ argmin()

◆ arrayEqual()

◆ centroidShift()

◆ chunkedMaterializedScratchShape()

◆ clampedJobCount()

◆ gemm()

◆ pairwiseArgminSqEuclidean()

◆ pairwiseSqEuclidean()

◆ pairwiseSqEuclideanThresholded()

◆ pairwiseSqEuclideanThresholdedSymmetric()

◆ pointAabbGapSq()

◆ randUniformU32()

◆ randUniformU64()

◆ randUnit()

◆ shouldSpawnPool()

◆ sum()

◆ topk()

◆ weightedCategorical()

Variable Documentation

◆ pairwiseArgminChunkRows