clustering/lloyd__fused__gemm_8h_source.html

#pragma once


#include <algorithm>

#include <array>

#include <cmath>

#include <cstddef>

#include <cstdint>

#include <cstring>

#include <limits>

#include <type_traits>

#include <utility>


#include "clustering/always_assert.h"

#include "clustering/kmeans/detail/convergence.h"

#include "clustering/kmeans/detail/empty_cluster.h"

#include "clustering/kmeans/policy/greedy_kmpp_seeder.h"

#include "clustering/math/centroid_shift.h"

#include "clustering/math/defaults.h"

#include "clustering/math/detail/avx2_helpers.h"

#include "clustering/math/detail/columnwise_reduce_avx2.h"

#include "clustering/math/detail/gemm_outer_prepacked.h"

#include "clustering/math/detail/gemm_pack.h"

#include "clustering/math/detail/matrix_desc.h"

#include "clustering/math/detail/pairwise_argmin_outer.h"

#include "clustering/math/pairwise.h"

#include "clustering/math/pairwise_argmin.h"

#include "clustering/math/reduce.h"

#include "clustering/math/thread.h"

#include "clustering/ndarray.h"


namespace clustering::kmeans {


namespace detail {


struct BlockPartition {

  std::size_t first_index = 0;

  std::size_t block_size = 0;

  std::size_t remainder = 0;

  std::size_t num_blocks = 0;


  BlockPartition(std::size_t first, std::size_t n, std::size_t desired) noexcept

      : first_index(first) {

    if (n == 0 || desired == 0) {

      num_blocks = 0;

      return;

    }

    num_blocks = std::min(desired, n);

    block_size = n / num_blocks;

    remainder = n % num_blocks;

    if (block_size == 0) {

      block_size = 1;

      num_blocks = n;

    }

  }


  [[nodiscard]] std::size_t blockIndexOf(std::size_t lo) const noexcept {

    const std::size_t rel = lo - first_index;

    const std::size_t big = remainder * (block_size + 1);

    if (rel < big) {

      return rel / (block_size + 1);

    }

    return remainder + ((rel - big) / block_size);

  }

};


inline constexpr std::size_t kDirectArgminMaxD = 8;


} // namespace detail


template <class T> class LloydFusedGemm {

public:

  static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,

                "LloydFusedGemm<T> requires T to be float or double");


  LloydFusedGemm()

      : m_centroidsOld({0, 0}), m_cSqNorms({0}), m_sums({0, 0}), m_counts({0}), m_minDistSq({0}),

        m_shiftSq({0}), m_partialSums({0}), m_partialComps({0}), m_partialCounts({0}),

        m_foldComp({0}), m_packedB({0}), m_packedCSqNorms({0}), m_distsChunk({0, 0}),

        m_gemmApArena({0}), m_xNormsSq({0}), m_varSum({0}), m_varSumSq({0}), m_u({0}), m_l({0}),

        m_shiftEuclidean({0}), m_halfDistToNearestOther({0}), m_elkanBounds({0, 0}),

        m_centerDist({0, 0}) {}


#ifdef CLUSTERING_KMEANS_KAHAN_N_THRESHOLD

  static constexpr std::size_t kahanNThreshold = CLUSTERING_KMEANS_KAHAN_N_THRESHOLD;

#else

  static constexpr std::size_t kahanNThreshold = 100000;

#endif


  void run(const NDArray<T, 2, Layout::Contig> &X, NDArray<T, 2, Layout::Contig> &centroids,

           std::size_t k, std::size_t maxIter, T tol, math::Pool pool,

           NDArray<std::int32_t, 1> &outLabels, double &outInertia, std::size_t &outNIter,

           bool &outConverged) {

    const std::size_t n = X.dim(0);

    const std::size_t d = X.dim(1);


    CLUSTERING_ALWAYS_ASSERT(k >= 1);

    CLUSTERING_ALWAYS_ASSERT(n >= k);

    CLUSTERING_ALWAYS_ASSERT(centroids.dim(0) == k);

    CLUSTERING_ALWAYS_ASSERT(centroids.dim(1) == d);

    CLUSTERING_ALWAYS_ASSERT(outLabels.dim(0) == n);


    if (n == 0 || d == 0) {

      outNIter = 0;

      outConverged = true;

      outInertia = 0.0;

      return;

    }


    const std::size_t workerCount = pool.workerCount();

    ensureShape(n, d, k, workerCount);


    // Sklearn-compatible tol semantics: the threshold on sum(||deltac_j||^2) is @c tol * mean_var

    // where @c mean_var is the mean of per-column variances of @p X. This is scale-invariant,

    // which is the property callers expect when they pass the same numeric @c tol across

    // datasets of different magnitudes. The raw-L2-shift convention our earlier prose described

    // made @c tol=1e-4 hundreds-of-thousands of times tighter than sklearn at the same numeric

    // value, which inflated the Lloyd iteration count by 3-4x on typical blob data.

    const T shiftSqThreshold = tol * meanColumnVariance(X);

    const bool useKahan = n >= kahanNThreshold;


    // X is input-only; its squared-row-norms are reused across every Lloyd iteration's

    // argmin post-pass. Compute once per run() so the iteration budget doesn't eat an

    // O(n*d) pass for every assignment.

    for (std::size_t i = 0; i < n; ++i) {

      m_xNormsSq(i) = math::detail::sqNormRow<T, Layout::Contig>(X, i);

    }


    refreshCentroidSqNorms(centroids);


    std::size_t iter = 0;

    bool converged = false;


    // Hamerly pruning starts once @c d leaves the direct small-D path. Fused-argmin shapes seed

    // valid per-point bounds after the first dense assignment; chunked shapes seed them inline

    // during the argmin post-pass. @c k is capped by @c kHamerlyMaxK because the per-row scan

    // uses a stack-allocated distance buffer; above that, Elkan handles bounded shapes and the

    // rest fall back to unbounded assignment.

    const bool hamerlyEligible = (d > detail::kDirectArgminMaxD) && (k <= kHamerlyMaxK) && (k >= 2);

    // Elkan keeps k lower bounds per sample instead of Hamerly's one, pruning far more distance

    // work once k exceeds Hamerly's regime. The @c n * k bound matrix grows linearly in both,

    // so we gate on an @c n * k envelope bound (memory ceiling) and require @c k above the

    // Hamerly cap so the two paths don't overlap.

    const bool elkanEligible = (d > math::defaults::pairwiseArgminMaxD) && (k > kHamerlyMaxK) &&

                               (k <= kElkanMaxK) && (n * k <= kElkanNKLimit) && (k >= 2);


    while (iter < maxIter) {

      if (hamerlyEligible && iter > 0) {

        runHamerlyAssignment(X, centroids, outLabels, pool);

      } else if (elkanEligible && iter > 0) {

        runElkanAssignment(X, centroids, outLabels, pool);

      } else {

        // First iteration (or no-prune shape) goes through the dispatcher. The chunked path

        // seeds Hamerly's @c m_u and @c m_l inline from the argmin post-pass, and Elkan's

        // @c m_elkanBounds matrix is filled from the same per-sample scan. The fused path only

        // returns the winning distance, so seed Hamerly's conservative lower bound separately.

        runAssignment(X, centroids, outLabels, pool);

        if (hamerlyEligible && iter == 0 && assignmentUsesFusedArgmin(X, centroids)) {

          seedHamerlyBoundsFromLabels(X, centroids, outLabels, pool);

        }

      }


      std::memcpy(m_centroidsOld.data(), centroids.data(),

                  centroids.dim(0) * centroids.dim(1) * sizeof(T));


      if (useKahan) {

        scatterAndFoldKahan(X, outLabels, k, pool);

      } else {

        scatterAndFoldPlain(X, outLabels, k, pool);

      }


      // Empty-cluster reseed: furthest-point pass bounded by the counts scan. m_minDistSq still

      // holds the decomposed-formula residual from the assignment above; the noise tail is

      // bounded by per-point `||c||^2 + ||x||^2` cancellation, smaller than the inter-blob

      // distance the donor is selected against, so the argmax selection is preserved in

      // practice on benchmark data. The donor's minDistSq is zeroed so successive empties

      // cannot reseed to the same point.

      (void)::clustering::kmeans::detail::reseedEmptyClusters<T>(X, centroids, m_sums, m_counts,

                                                                 m_minDistSq);


      finalizeMeans(centroids);

      refreshCentroidSqNorms(centroids);


      math::centroidShift<T>(m_centroidsOld, centroids, m_shiftSq, pool);

      const T totalShift = ::clustering::kmeans::detail::totalShiftSqKahan<T>(m_shiftSq);


      ++iter;

      if (totalShift <= shiftSqThreshold) {

        converged = true;

        break;

      }

    }


    // Re-assign labels against the final centroids. At convergence the bounds Hamerly maintains

    // are already tight for the pre-update centroids; feeding one more bound-aware pass against

    // the tiny final shift prunes nearly every row and is an order of magnitude cheaper than a

    // full chunked GEMM assignment. Force the serial fan-out so the per-worker submit/wait pair

    // doesn't dominate the trivial post-convergence work; the chunked fallback still fans out

    // when the shape never enabled Hamerly.

    if (hamerlyEligible && iter > 0) {

      runHamerlyAssignment(X, centroids, outLabels, math::Pool{});

    } else if (elkanEligible && iter > 0) {

      runElkanAssignment(X, centroids, outLabels, math::Pool{});

    } else {

      runAssignment(X, centroids, outLabels, pool);

    }

    if (!assignmentProducesDirectMinDistSq(X, centroids)) {

      recomputeMinDistSqDirect(X, centroids, outLabels, pool);

    }


    // Inertia: Kahan-summed in f64 to pin the 1% gate at large (n, k) envelopes where the

    // naive single-pass f32 add would drift.

    double sum = 0.0;

    double comp = 0.0;

    for (std::size_t i = 0; i < n; ++i) {

      const auto addend = static_cast<double>(m_minDistSq(i));

      const double y = addend - comp;

      const double t = sum + y;

      comp = (t - sum) - y;

      sum = t;

    }


    outInertia = sum;

    outNIter = iter;

    outConverged = converged;

  }


private:

  [[nodiscard]] T meanColumnVariance(const NDArray<T, 2, Layout::Contig> &X) {

    const std::size_t n = X.dim(0);

    const std::size_t d = X.dim(1);

    if (n == 0 || d == 0) {

      return T{0};

    }

    const T *xData = X.data();


    // Per-column accumulators kept in scratch so repeat runs at the same shape skip the

    // allocator. Row-major walk (x traversed in storage order) keeps every column load

    // inside the same cache line as its neighbors -- the natural column-major alternative

    // misses L1 once per load at large @p d.

    if (m_varSum.dim(0) != d) {

      m_varSum = NDArray<T, 1>({d});

      m_varSumSq = NDArray<T, 1>({d});

    }

    T *colSum = m_varSum.data();

    T *colSumSq = m_varSumSq.data();

    for (std::size_t t = 0; t < d; ++t) {

      colSum[t] = T{0};

      colSumSq[t] = T{0};

    }

    for (std::size_t i = 0; i < n; ++i) {

      const T *row = xData + (i * d);

      math::detail::columnwiseAccumSumSq<T>(row, d, colSum, colSumSq);

    }

    const auto nInv = static_cast<T>(1) / static_cast<T>(n);

    T acc = T{0};

    for (std::size_t t = 0; t < d; ++t) {

      const T mean = colSum[t] * nInv;

      acc += (colSumSq[t] * nInv) - (mean * mean);

    }

    return acc / static_cast<T>(d);

  }


  void ensureShape(std::size_t n, std::size_t d, std::size_t k, std::size_t workerCount) {

    const bool shapeChanged = (n != m_n) || (d != m_d) || (k != m_k);

    const bool workerChanged = (workerCount != m_workerCount);

    if (!shapeChanged && !workerChanged) {

      return;

    }


    const bool needsChunk = d > math::defaults::pairwiseArgminMaxD;

    const std::size_t chunkCap = math::pairwiseArgminChunkRows;

    const std::size_t blocks = workerCount == 0 ? std::size_t{1} : workerCount;


    if (shapeChanged) {

      m_centroidsOld = NDArray<T, 2, Layout::Contig>({k, d});

      m_cSqNorms = NDArray<T, 1>({k});

      m_sums = NDArray<T, 2, Layout::Contig>({k, d});

      m_counts = NDArray<std::int32_t, 1>({k});

      m_minDistSq = NDArray<T, 1>({n});

      m_xNormsSq = NDArray<T, 1>({n});

      m_shiftSq = NDArray<T, 1>({k});

      m_foldComp = NDArray<T, 1>({k * d});

      // Hamerly bound scratch: per-point upper/lower Euclidean bounds and per-cluster sqrt

      // shifts. Seeded by the first iteration's full scan and maintained by the bounds-aware

      // reassignment on subsequent iterations.

      m_u = NDArray<T, 1>({n});

      m_l = NDArray<T, 1>({n});

      m_shiftEuclidean = NDArray<T, 1>({k});

      m_halfDistToNearestOther = NDArray<T, 1>({k});

      // Elkan's @c n * k bound matrix and the per-pair centroid-distance matrix are allocated

      // unconditionally; rows past the caller's active-k index stay zero and the allocation

      // cost amortizes across Lloyd iterations on the same shape.

      if (n * k <= kElkanNKLimit && k <= kElkanMaxK) {

        m_elkanBounds = NDArray<T, 2, Layout::Contig>({n, k});

        m_centerDist = NDArray<T, 2, Layout::Contig>({k, k});

      } else {

        m_elkanBounds = NDArray<T, 2, Layout::Contig>({0, 0});

        m_centerDist = NDArray<T, 2, Layout::Contig>({0, 0});

      }

      // Packed-B sizing: the fused fast path at d<=pairwiseArgminMaxD uses the flat

      // panel-per-centroid layout (ceil(k/Nr)*Nr*d); the chunked fallback uses the tiled

      // (jcIdx, pcIdx) layout that @c gemmRunPrepacked expects and is what supports d > kKc

      // and k > kNc without envelope asserts.

      const std::size_t packedBSize = needsChunk

                                          ? math::detail::packedBScratchSizeFloatsTiled<T>(k, d)

                                          : math::detail::packedBScratchSizeFloats(k, d);

      const std::size_t packedNormsSize = math::detail::packedCSqNormsScratchSizeFloats(k);

      m_packedB = NDArray<T, 1>({packedBSize == 0 ? std::size_t{1} : packedBSize});

      m_packedCSqNorms = NDArray<T, 1>({packedNormsSize == 0 ? std::size_t{1} : packedNormsSize});

      // Per-worker distance tile for the chunked path: one chunkCap*k slab per worker so

      // the chunk fan-out runs without touching a shared tile.

      const std::size_t distRows = needsChunk ? (blocks * chunkCap) : std::size_t{1};

      const std::size_t safeK = (k == 0) ? std::size_t{1} : k;

      const std::size_t distCols = needsChunk ? safeK : std::size_t{1};

      m_distsChunk = NDArray<T, 2, Layout::Contig>({distRows, distCols});

    } else if (workerChanged) {

      // Only the per-worker slabs depend on workerCount; resize them if d triggered needsChunk.

      if (needsChunk) {

        const std::size_t distRows = blocks * chunkCap;

        const std::size_t distCols = (k == 0 ? std::size_t{1} : k);

        m_distsChunk = NDArray<T, 2, Layout::Contig>({distRows, distCols});

      }

    }


    // Per-block scratch sizing for scatter-and-fold. Block count caps at workerCount (see

    // BlockPartition); we size to the upper bound so both serial and parallel dispatch fit

    // without reallocation inside the loop.

    m_partialSums = NDArray<T, 1>({blocks * k * d});

    m_partialComps = NDArray<T, 1>({blocks * k * d});

    m_partialCounts = NDArray<std::int32_t, 1>({blocks * k});


    // Gemm A-pack arena sized to @c blocks * kMc * kKc so @c gemmRunPrepacked's per-worker

    // slice indexing stays in-bounds on every fan-out path.

    const std::size_t apSize = blocks * math::detail::kMc<T> * math::detail::kKc<T>;

    m_gemmApArena = NDArray<T, 1>({needsChunk ? apSize : std::size_t{1}});


    m_n = n;

    m_d = d;

    m_k = k;

    m_workerCount = workerCount;

  }


  void refreshCentroidSqNorms(const NDArray<T, 2, Layout::Contig> &centroids) noexcept {

    const std::size_t k = centroids.dim(0);

    const std::size_t d = centroids.dim(1);

    for (std::size_t c = 0; c < k; ++c) {

      const T *row = centroids.data() + (c * d);

      T s = T{0};

      for (std::size_t t = 0; t < d; ++t) {

        s += row[t] * row[t];

      }

      m_cSqNorms(c) = s;

    }

  }


  void finalizeMeans(NDArray<T, 2, Layout::Contig> &centroids) noexcept {

    const std::size_t k = centroids.dim(0);

    const std::size_t d = centroids.dim(1);

    for (std::size_t c = 0; c < k; ++c) {

      const std::int32_t cnt = m_counts(c);

      if (cnt <= 0) {

        continue;

      }

      const T inv = T{1} / static_cast<T>(cnt);

      const T *src = m_sums.data() + (c * d);

      T *dst = centroids.data() + (c * d);

      for (std::size_t t = 0; t < d; ++t) {

        dst[t] = src[t] * inv;

      }

    }

  }


  void runAssignment(const NDArray<T, 2, Layout::Contig> &X,

                     const NDArray<T, 2, Layout::Contig> &centroids,

                     NDArray<std::int32_t, 1> &labels, math::Pool pool) {

#ifdef CLUSTERING_USE_AVX2

    if constexpr (std::is_same_v<T, float>) {

      const std::size_t d = X.dim(1);

      if (X.template isAligned<32>() && centroids.template isAligned<32>() && d != 0) {

        if (d <= detail::kDirectArgminMaxD) {

          math::detail::pairwiseArgminDirectSmallDF32(X, centroids, labels, m_minDistSq, pool);

          return;

        }

        if (d <= math::defaults::pairwiseArgminMaxD) {

          math::detail::pairwiseArgminOuterAvx2F32WithScratch(X, centroids, m_cSqNorms, labels,

                                                              m_minDistSq, m_packedB.data(),

                                                              m_packedCSqNorms.data(), pool);

          return;

        }

      }

    }

#endif

    runChunkedMaterializedAssignment(X, centroids, labels, pool);

  }


  [[nodiscard]] bool

  assignmentProducesDirectMinDistSq(const NDArray<T, 2, Layout::Contig> &X,

                                    const NDArray<T, 2, Layout::Contig> &C) noexcept {

#ifdef CLUSTERING_USE_AVX2

    if constexpr (std::is_same_v<T, float>) {

      const std::size_t d = X.dim(1);

      return X.template isAligned<32>() && C.template isAligned<32>() && d != 0 &&

             d <= detail::kDirectArgminMaxD;

    } else {

      (void)X;

      (void)C;

      return false;

    }

#else

    (void)X;

    (void)C;

    return false;

#endif

  }


  [[nodiscard]] bool assignmentUsesFusedArgmin(const NDArray<T, 2, Layout::Contig> &X,

                                               const NDArray<T, 2, Layout::Contig> &C) noexcept {

#ifdef CLUSTERING_USE_AVX2

    if constexpr (std::is_same_v<T, float>) {

      const std::size_t d = X.dim(1);

      return X.template isAligned<32>() && C.template isAligned<32>() &&

             d > detail::kDirectArgminMaxD && d <= math::defaults::pairwiseArgminMaxD;

    } else {

      (void)X;

      (void)C;

      return false;

    }

#else

    (void)X;

    (void)C;

    return false;

#endif

  }


  void packCentroidsTiled(const NDArray<T, 2, Layout::Contig> &centroids) noexcept {

    constexpr std::size_t kNr = math::detail::kKernelNr<T>;

    constexpr std::size_t kKcVal = math::detail::kKc<T>;

    constexpr std::size_t kNcVal = math::detail::kNc<T>;

    const std::size_t k = centroids.dim(0);

    const std::size_t d = centroids.dim(1);

    const auto cTransposed = centroids.t();

    const auto cDesc = ::clustering::detail::describeMatrix(cTransposed);

    T *bp = m_packedB.data();

    std::size_t jcBase = 0;

    for (std::size_t jc = 0; jc < k; jc += kNcVal) {

      const std::size_t nc = (jc + kNcVal <= k) ? kNcVal : (k - jc);

      const std::size_t roundedNc = ((nc + kNr - 1) / kNr) * kNr;

      std::size_t pcOffInJc = 0;

      for (std::size_t pc = 0; pc < d; pc += kKcVal) {

        const std::size_t kc = (pc + kKcVal <= d) ? kKcVal : (d - pc);

        math::detail::packB<T>(cDesc, pc, kc, jc, nc, bp + jcBase + pcOffInJc);

        pcOffInJc += kc * roundedNc;

      }

      jcBase += d * roundedNc;

    }

  }


  void runChunkedMaterializedAssignment(const NDArray<T, 2, Layout::Contig> &X,

                                        const NDArray<T, 2, Layout::Contig> &centroids,

                                        NDArray<std::int32_t, 1> &labels,

                                        math::Pool pool) noexcept {

    const std::size_t n = X.dim(0);

    const std::size_t k = centroids.dim(0);

    const std::size_t d = X.dim(1);

    if (n == 0 || k == 0) {

      return;

    }


    packCentroidsTiled(centroids);


    constexpr std::size_t kMcVal = math::detail::kMc<T>;

    constexpr std::size_t kKcVal = math::detail::kKc<T>;

    const std::size_t chunkCap = math::pairwiseArgminChunkRows;

    const std::size_t numChunks = (n + chunkCap - 1) / chunkCap;

    const T *bp = m_packedB.data();

    T *apArena = m_gemmApArena.data();

    T *distsBase = m_distsChunk.data();

    const T *cNormsBase = m_cSqNorms.data();

    T *minDistBase = m_minDistSq.data();

    std::int32_t *labelsBase = labels.data();

    const T *xBase = X.data();

    // Hamerly bound seeding happens inline in the argmin post-pass; the cost is a handful of

    // extra comparisons per row plus two @c sqrt calls, far below a second pass over @p X.

    T *uBase = m_u.data();

    T *lBase = m_l.data();

    // Elkan bound seeding lights up only when the scratch matrix was sized for this shape;

    // at shapes past @ref kElkanNKLimit or @ref kElkanMaxK the pointer stays null and the

    // per-row loop skips the per-cluster bound stores.

    T *elkanBoundsBase = m_elkanBounds.dim(0) == n ? m_elkanBounds.data() : nullptr;


    auto runOneChunk = [&](std::size_t chunkIdx) noexcept {

      const std::size_t iBase = chunkIdx * chunkCap;

      const std::size_t chunkRows = (iBase + chunkCap <= n) ? chunkCap : (n - iBase);

      const std::size_t w = math::Pool::workerIndex();

      T *distsChunk = distsBase + (w * chunkCap * k);

      T *apSlice = apArena + (w * kMcVal * kKcVal);


      auto xChunk = NDArray<T, 2, Layout::Contig>::borrow(const_cast<T *>(xBase) + (iBase * d),

                                                          {chunkRows, d});

      auto distsView = NDArray<T, 2>::borrow(distsChunk, {chunkRows, k});

      const auto xDesc = ::clustering::detail::describeMatrix(xChunk);

      auto distsDesc = ::clustering::detail::describeMatrixMut(distsView);

      // Serial GEMM inside the chunk; outer fan-out already owns parallelism.

      math::detail::gemmRunPrepacked<T>(xDesc, bp, d, k, distsDesc, T{-2}, T{0}, apSlice,

                                        math::Pool{});


      const T *xNormsChunk = m_xNormsSq.data() + iBase;

      for (std::size_t i = 0; i < chunkRows; ++i) {

        const T xn = xNormsChunk[i];

        const T *row = distsChunk + (i * k);

        T *elkanRow = elkanBoundsBase != nullptr ? elkanBoundsBase + ((iBase + i) * k) : nullptr;

        T bestVal = std::numeric_limits<T>::infinity();

        T secondVal = std::numeric_limits<T>::infinity();

        std::int32_t bestIdx = 0;

        for (std::size_t j = 0; j < k; ++j) {

          T v = row[j] + xn + cNormsBase[j];

          if (v < T{0}) {

            v = T{0};

          }

          if (elkanRow != nullptr) {

            elkanRow[j] = std::sqrt(v);

          }

          if (v < bestVal) {

            secondVal = bestVal;

            bestVal = v;

            bestIdx = static_cast<std::int32_t>(j);

          } else if (v < secondVal) {

            secondVal = v;

          }

        }

        minDistBase[iBase + i] = bestVal;

        labelsBase[iBase + i] = bestIdx;

        uBase[iBase + i] = std::sqrt(bestVal);

        lBase[iBase + i] = std::sqrt(secondVal);

      }

    };


    if (pool.shouldParallelize(numChunks, 1, 2) && pool.pool != nullptr) {

      pool.pool

          ->submit_blocks(std::size_t{0}, numChunks,

                          [&](std::size_t lo, std::size_t hi) {

                            for (std::size_t c = lo; c < hi; ++c) {

                              runOneChunk(c);

                            }

                          })

          .wait();

    } else {

      for (std::size_t c = 0; c < numChunks; ++c) {

        runOneChunk(c);

      }

    }

  }


  void scatterAndFoldPlain(const NDArray<T, 2, Layout::Contig> &X,

                           const NDArray<std::int32_t, 1> &labels, std::size_t k, math::Pool pool) {

    const std::size_t n = X.dim(0);

    const std::size_t d = X.dim(1);


    T *partialSums = m_partialSums.data();

    std::int32_t *partialCounts = m_partialCounts.data();


    for (std::size_t c = 0; c < k; ++c) {

      m_counts(c) = 0;

      for (std::size_t t = 0; t < d; ++t) {

        m_sums(c, t) = T{0};

      }

    }

    if (n == 0 || d == 0) {

      return;

    }


    const bool willParallelize = pool.shouldParallelizeWork(n * d) &&

                                 pool.shouldParallelize(n, 64, 2) && pool.pool != nullptr;

    const std::size_t desiredBlocks = willParallelize ? pool.workerCount() : std::size_t{1};

    const detail::BlockPartition part(0, n, desiredBlocks);

    const std::size_t numBlocks = part.num_blocks == 0 ? std::size_t{1} : part.num_blocks;


    for (std::size_t b = 0; b < numBlocks; ++b) {

      T *slab = partialSums + (b * k * d);

      std::int32_t *cslab = partialCounts + (b * k);

      for (std::size_t e = 0; e < k * d; ++e) {

        slab[e] = T{0};

      }

      for (std::size_t c = 0; c < k; ++c) {

        cslab[c] = 0;

      }

    }


    auto scatterRange = [&](std::size_t lo, std::size_t hi) noexcept {

      const std::size_t b = part.blockIndexOf(lo);

      T *slab = partialSums + (b * k * d);

      std::int32_t *cslab = partialCounts + (b * k);

      for (std::size_t i = lo; i < hi; ++i) {

        const std::int32_t lbl = labels(i);

        if (lbl < 0 || std::cmp_greater_equal(lbl, k)) {

          continue;

        }

        const auto row = static_cast<std::size_t>(lbl);

        const T *xRow = X.data() + (i * d);

        T *dst = slab + (row * d);

        for (std::size_t t = 0; t < d; ++t) {

          dst[t] += xRow[t];

        }

        cslab[row] += 1;

      }

    };


    if (willParallelize) {

      pool.pool

          ->submit_blocks(

              std::size_t{0}, n, [&](std::size_t lo, std::size_t hi) { scatterRange(lo, hi); },

              numBlocks)

          .wait();

    } else {

      scatterRange(0, n);

    }


    // Ascending-block-index fold. Deterministic at fixed (n, k, d, nJobs); changing this order

    // changes the last-bit of the per-cluster sum and breaks bit-identity.

    for (std::size_t b = 0; b < numBlocks; ++b) {

      const T *slab = partialSums + (b * k * d);

      const std::int32_t *cslab = partialCounts + (b * k);

      for (std::size_t c = 0; c < k; ++c) {

        m_counts(c) += cslab[c];

        const T *src = slab + (c * d);

        T *dstRow = &m_sums(c, 0);

        for (std::size_t t = 0; t < d; ++t) {

          dstRow[t] += src[t];

        }

      }

    }

  }


  void scatterAndFoldKahan(const NDArray<T, 2, Layout::Contig> &X,

                           const NDArray<std::int32_t, 1> &labels, std::size_t k, math::Pool pool) {

    const std::size_t n = X.dim(0);

    const std::size_t d = X.dim(1);


    T *partialSums = m_partialSums.data();

    T *partialComps = m_partialComps.data();

    std::int32_t *partialCounts = m_partialCounts.data();

    T *foldComp = m_foldComp.data();


    for (std::size_t c = 0; c < k; ++c) {

      m_counts(c) = 0;

      for (std::size_t t = 0; t < d; ++t) {

        m_sums(c, t) = T{0};

      }

    }

    for (std::size_t e = 0; e < k * d; ++e) {

      foldComp[e] = T{0};

    }

    if (n == 0 || d == 0) {

      return;

    }


    const bool willParallelize = pool.shouldParallelizeWork(n * d) &&

                                 pool.shouldParallelize(n, 64, 2) && pool.pool != nullptr;

    const std::size_t desiredBlocks = willParallelize ? pool.workerCount() : std::size_t{1};

    const detail::BlockPartition part(0, n, desiredBlocks);

    const std::size_t numBlocks = part.num_blocks == 0 ? std::size_t{1} : part.num_blocks;


    for (std::size_t b = 0; b < numBlocks; ++b) {

      T *slab = partialSums + (b * k * d);

      T *cslab = partialComps + (b * k * d);

      std::int32_t *nslab = partialCounts + (b * k);

      for (std::size_t e = 0; e < k * d; ++e) {

        slab[e] = T{0};

        cslab[e] = T{0};

      }

      for (std::size_t c = 0; c < k; ++c) {

        nslab[c] = 0;

      }

    }


    auto scatterRange = [&](std::size_t lo, std::size_t hi) noexcept {

      const std::size_t b = part.blockIndexOf(lo);

      T *slab = partialSums + (b * k * d);

      T *cslab = partialComps + (b * k * d);

      std::int32_t *nslab = partialCounts + (b * k);

      for (std::size_t i = lo; i < hi; ++i) {

        const std::int32_t lbl = labels(i);

        if (lbl < 0 || std::cmp_greater_equal(lbl, k)) {

          continue;

        }

        const auto row = static_cast<std::size_t>(lbl);

        const T *xRow = X.data() + (i * d);

        T *sumRow = slab + (row * d);

        T *compRow = cslab + (row * d);

        math::detail::kahanAddRow<T>(xRow, d, sumRow, compRow);

        nslab[row] += 1;

      }

    };


    if (willParallelize) {

      pool.pool

          ->submit_blocks(

              std::size_t{0}, n, [&](std::size_t lo, std::size_t hi) { scatterRange(lo, hi); },

              numBlocks)

          .wait();

    } else {

      scatterRange(0, n);

    }


    for (std::size_t b = 0; b < numBlocks; ++b) {

      const T *slab = partialSums + (b * k * d);

      const T *cslab = partialComps + (b * k * d);

      const std::int32_t *nslab = partialCounts + (b * k);

      for (std::size_t c = 0; c < k; ++c) {

        m_counts(c) += nslab[c];

        const T *src = slab + (c * d);

        const T *comp = cslab + (c * d);

        T *dstRow = &m_sums(c, 0);

        T *foldRow = foldComp + (c * d);

        for (std::size_t t = 0; t < d; ++t) {

          const T addend = src[t] - comp[t];

          const T y = addend - foldRow[t];

          const T tVal = dstRow[t] + y;

          foldRow[t] = (tVal - dstRow[t]) - y;

          dstRow[t] = tVal;

        }

      }

    }

  }


  void recomputeMinDistSqDirect(const NDArray<T, 2, Layout::Contig> &X,

                                const NDArray<T, 2, Layout::Contig> &centroids,

                                const NDArray<std::int32_t, 1> &labels, math::Pool pool) noexcept {

    const std::size_t n = X.dim(0);

    const std::size_t d = X.dim(1);

    const std::size_t k = centroids.dim(0);

    if (n == 0 || d == 0 || k == 0) {

      return;

    }


    auto runRowRange = [&](std::size_t lo, std::size_t hi) noexcept {

      for (std::size_t i = lo; i < hi; ++i) {

        const std::int32_t lbl = labels(i);

        if (lbl < 0 || std::cmp_greater_equal(lbl, k)) {

          m_minDistSq(i) = T{0};

          continue;

        }

        const T *xRow = X.data() + (i * d);

        const T *cRow = centroids.data() + (static_cast<std::size_t>(lbl) * d);

        m_minDistSq(i) = math::detail::sqEuclideanRowPtr<T>(xRow, cRow, d);

      }

    };


    if (pool.shouldParallelize(n, 64, 2) && pool.pool != nullptr) {

      pool.pool

          ->submit_blocks(std::size_t{0}, n,

                          [&](std::size_t lo, std::size_t hi) { runRowRange(lo, hi); })

          .wait();

    } else {

      runRowRange(0, n);

    }

  }


  void seedHamerlyBoundsFromLabels(const NDArray<T, 2, Layout::Contig> &X,

                                   const NDArray<T, 2, Layout::Contig> &centroids,

                                   const NDArray<std::int32_t, 1> &labels,

                                   math::Pool pool) noexcept {

    const std::size_t n = X.dim(0);

    const std::size_t d = X.dim(1);

    const std::size_t k = centroids.dim(0);

    if (n == 0 || d == 0 || k == 0) {

      return;

    }


    auto seedRange = [&](std::size_t lo, std::size_t hi) noexcept {

      for (std::size_t i = lo; i < hi; ++i) {

        const std::int32_t lbl = labels(i);

        if (lbl < 0 || std::cmp_greater_equal(lbl, k)) {

          m_minDistSq(i) = T{0};

          m_u(i) = std::numeric_limits<T>::infinity();

          m_l(i) = T{0};

          continue;

        }

        const T *xRow = X.data() + (i * d);

        const T *cRow = centroids.data() + (static_cast<std::size_t>(lbl) * d);

        const T tightSq = math::detail::sqEuclideanRowPtr<T>(xRow, cRow, d);

        m_minDistSq(i) = tightSq;

        m_u(i) = std::sqrt(tightSq);

        m_l(i) = T{0};

      }

    };


    if (pool.shouldParallelize(n, 64, 2) && pool.pool != nullptr) {

      pool.pool

          ->submit_blocks(std::size_t{0}, n,

                          [&](std::size_t lo, std::size_t hi) { seedRange(lo, hi); })

          .wait();

    } else {

      seedRange(0, n);

    }

  }


  static constexpr std::size_t kHamerlyMaxK = 64;


  static constexpr std::size_t kElkanMaxK = 4096;


  static constexpr std::size_t kElkanNKLimit = std::size_t{32} << 20;


  void runHamerlyAssignment(const NDArray<T, 2, Layout::Contig> &X,

                            const NDArray<T, 2, Layout::Contig> &centroids,

                            NDArray<std::int32_t, 1> &labels, math::Pool pool) noexcept {

    const std::size_t n = X.dim(0);

    const std::size_t d = X.dim(1);

    const std::size_t k = centroids.dim(0);

    if (n == 0 || d == 0 || k == 0 || k > kHamerlyMaxK) {

      return;

    }

    const T *xData = X.data();

    const T *cData = centroids.data();

    T *uData = m_u.data();

    T *lData = m_l.data();

    T *minDistData = m_minDistSq.data();

    std::int32_t *labelsData = labels.data();


    // Per-cluster Euclidean shift + top-2 of shifts. The second-largest shift is the amount we

    // subtract from `l(x)` when x's assigned cluster is the one with the largest shift --

    // otherwise the largest shift is the loose bound donor for every non-assigned cluster.

    T sMax = T{0};

    T s2Max = T{0};

    std::size_t argMax = 0;

    T *shiftData = m_shiftEuclidean.data();

    for (std::size_t c = 0; c < k; ++c) {

      const T s = std::sqrt(m_shiftSq(c));

      shiftData[c] = s;

      if (s > sMax) {

        s2Max = sMax;

        sMax = s;

        argMax = c;

      } else if (s > s2Max) {

        s2Max = s;

      }

    }


    // Per-cluster half-distance to the nearest other centroid. When `u(x)` for a sample

    // assigned to cluster @c c clears this threshold, triangle inequality pins the sample in

    // @c c: any other @c c' is at least @c 2 * halfDist[c] away, so `||x - c'||` >= 2 *

    // halfDist[c] - u(x) >= u(x) >= ||x - c||. Populating it is `O(k^2 * d)`, negligible next

    // to Hamerly's per-sample work at `k <= 64`.

    T *halfDistData = m_halfDistToNearestOther.data();

    for (std::size_t c = 0; c < k; ++c) {

      T nearestSq = std::numeric_limits<T>::infinity();

      const T *caRow = cData + (c * d);

      for (std::size_t cp = 0; cp < k; ++cp) {

        if (cp == c) {

          continue;

        }

        const T dsq = math::detail::sqEuclideanRowPtr<T>(caRow, cData + (cp * d), d);

        if (dsq < nearestSq) {

          nearestSq = dsq;

        }

      }

      halfDistData[c] = T{0.5} * std::sqrt(nearestSq);

    }


    auto processRange = [&](std::size_t lo, std::size_t hi) noexcept {

      std::array<T, kHamerlyMaxK> distBuf{};

      for (std::size_t i = lo; i < hi; ++i) {

        const std::int32_t a = labelsData[i];

        if (a < 0 || std::cmp_greater_equal(a, k)) {

          continue;

        }

        const auto au = static_cast<std::size_t>(a);

        T ui = uData[i] + shiftData[au];

        T li = lData[i] - ((au == argMax) ? s2Max : sMax);


        if (ui <= li) {

          uData[i] = ui;

          lData[i] = li;

          continue;

        }


        // Lemma 1 shortcut: if the upper bound clears the half-distance to the nearest other

        // centroid, the sample's label cannot have changed -- no need to recompute

        // `||x - c_a||`. @c ui is still the post-shift bound, which stays valid; @c li is

        // allowed to decay here because the outer per-sample gate will exact-recompute it on

        // the next iteration that forces a tightening or a full scan.

        if (ui <= halfDistData[au]) {

          uData[i] = ui;

          lData[i] = li;

          continue;

        }


        const T *xi = xData + (i * d);

        const T *caRow = cData + (au * d);

        const T tightSq = math::detail::sqEuclideanRowPtr<T>(xi, caRow, d);

        ui = std::sqrt(tightSq);


        if (ui <= li) {

          uData[i] = ui;

          lData[i] = li;

          minDistData[i] = tightSq;

          continue;

        }


        detail::sqEuclideanRowToBatch<T>(xi, cData, k, d, distBuf.data());

        T best = std::numeric_limits<T>::infinity();

        T second = std::numeric_limits<T>::infinity();

        std::int32_t bestIdx = 0;

        for (std::size_t j = 0; j < k; ++j) {

          const T v = distBuf[j];

          if (v < best) {

            second = best;

            best = v;

            bestIdx = static_cast<std::int32_t>(j);

          } else if (v < second) {

            second = v;

          }

        }

        labelsData[i] = bestIdx;

        minDistData[i] = best;

        uData[i] = std::sqrt(best);

        lData[i] = std::sqrt(second);

      }

    };


    if (pool.shouldParallelize(n, 64, 2) && pool.pool != nullptr) {

      pool.pool

          ->submit_blocks(std::size_t{0}, n,

                          [&](std::size_t lo, std::size_t hi) { processRange(lo, hi); })

          .wait();

    } else {

      processRange(0, n);

    }

  }


  void runElkanAssignment(const NDArray<T, 2, Layout::Contig> &X,

                          const NDArray<T, 2, Layout::Contig> &centroids,

                          NDArray<std::int32_t, 1> &labels, math::Pool pool) noexcept {

    const std::size_t n = X.dim(0);

    const std::size_t d = X.dim(1);

    const std::size_t k = centroids.dim(0);

    if (n == 0 || d == 0 || k == 0 || m_elkanBounds.dim(0) != n || m_elkanBounds.dim(1) != k) {

      return;

    }

    const T *xData = X.data();

    const T *cData = centroids.data();

    T *uData = m_u.data();

    T *boundsData = m_elkanBounds.data();

    T *minDistData = m_minDistSq.data();

    std::int32_t *labelsData = labels.data();


    // Per-cluster shift (Euclidean) used to update all bounds once at the top of the pass.

    T *shiftData = m_shiftEuclidean.data();

    for (std::size_t c = 0; c < k; ++c) {

      shiftData[c] = std::sqrt(m_shiftSq(c));

    }


    // Pairwise centroid distances. Symmetric; fill upper triangle and mirror. `O(k^2 * d)`,

    // amortized against the @c n * k inner scan below.

    T *centerDistData = m_centerDist.data();

    T *halfDistData = m_halfDistToNearestOther.data();

    for (std::size_t c = 0; c < k; ++c) {

      centerDistData[(c * k) + c] = T{0};

      T nearest = std::numeric_limits<T>::infinity();

      for (std::size_t cp = 0; cp < k; ++cp) {

        if (cp == c) {

          continue;

        }

        T dist;

        if (cp > c) {

          const T dsq = math::detail::sqEuclideanRowPtr<T>(cData + (c * d), cData + (cp * d), d);

          dist = std::sqrt(dsq);

          centerDistData[(c * k) + cp] = dist;

          centerDistData[(cp * k) + c] = dist;

        } else {

          dist = centerDistData[(c * k) + cp];

        }

        if (dist < nearest) {

          nearest = dist;

        }

      }

      halfDistData[c] = T{0.5} * nearest;

    }


    auto processRange = [&](std::size_t lo, std::size_t hi) noexcept {

      for (std::size_t i = lo; i < hi; ++i) {

        std::int32_t a = labelsData[i];

        if (a < 0 || std::cmp_greater_equal(a, k)) {

          continue;

        }

        auto au = static_cast<std::size_t>(a);

        T u = uData[i] + shiftData[au];

        T *lRow = boundsData + (i * k);

        // Bound-shift pass for this sample: looser lower bounds against all clusters. Done

        // inline so the per-sample walk touches the row exactly once.

        for (std::size_t c = 0; c < k; ++c) {

          T lnew = lRow[c] - shiftData[c];

          if (lnew < T{0}) {

            lnew = T{0};

          }

          lRow[c] = lnew;

        }


        if (u <= halfDistData[au]) {

          uData[i] = u;

          continue;

        }


        bool uTight = false;

        const T *xi = xData + (i * d);

        for (std::size_t c = 0; c < k; ++c) {

          if (c == au) {

            continue;

          }

          const T lc = lRow[c];

          const T half = T{0.5} * centerDistData[(au * k) + c];

          if (u <= lc || u <= half) {

            continue;

          }

          if (!uTight) {

            const T tightSq = math::detail::sqEuclideanRowPtr<T>(xi, cData + (au * d), d);

            u = std::sqrt(tightSq);

            minDistData[i] = tightSq;

            uTight = true;

            if (u <= lc || u <= half) {

              continue;

            }

          }

          const T dSq = math::detail::sqEuclideanRowPtr<T>(xi, cData + (c * d), d);

          const T dEuc = std::sqrt(dSq);

          lRow[c] = dEuc;

          if (dEuc < u) {

            au = c;

            a = static_cast<std::int32_t>(c);

            u = dEuc;

            minDistData[i] = dSq;

          }

        }

        uData[i] = u;

        labelsData[i] = a;

      }

    };


    if (pool.shouldParallelize(n, 64, 2) && pool.pool != nullptr) {

      pool.pool

          ->submit_blocks(std::size_t{0}, n,

                          [&](std::size_t lo, std::size_t hi) { processRange(lo, hi); })

          .wait();

    } else {

      processRange(0, n);

    }

  }


  NDArray<T, 2, Layout::Contig> m_centroidsOld;

  NDArray<T, 1> m_cSqNorms;

  NDArray<T, 2, Layout::Contig> m_sums;

  NDArray<std::int32_t, 1> m_counts;

  NDArray<T, 1> m_minDistSq;

  NDArray<T, 1> m_shiftSq;

  NDArray<T, 1> m_partialSums;

  NDArray<T, 1> m_partialComps;

  NDArray<std::int32_t, 1> m_partialCounts;

  NDArray<T, 1> m_foldComp;

  NDArray<T, 1> m_packedB;

  NDArray<T, 1> m_packedCSqNorms;

  NDArray<T, 2, Layout::Contig> m_distsChunk;

  NDArray<T, 1> m_gemmApArena;

  NDArray<T, 1> m_xNormsSq;

  NDArray<T, 1> m_varSum;

  NDArray<T, 1> m_varSumSq;

  NDArray<T, 1> m_u;

  NDArray<T, 1> m_l;

  NDArray<T, 1> m_shiftEuclidean;

  NDArray<T, 1> m_halfDistToNearestOther;

  NDArray<T, 2, Layout::Contig> m_elkanBounds;

  NDArray<T, 2, Layout::Contig> m_centerDist;


  std::size_t m_n = 0;

  std::size_t m_d = 0;

  std::size_t m_k = 0;

  std::size_t m_workerCount = 0;

};


} // namespace clustering::kmeans

always_assert.h

CLUSTERING_ALWAYS_ASSERT
#define CLUSTERING_ALWAYS_ASSERT(cond)
Release-active assertion: evaluates cond in every build configuration.
Definition always_assert.h:30

centroid_shift.h

clustering::NDArray
Represents a multidimensional array (NDArray) of a fixed number of dimensions N and element type T.
Definition ndarray.h:136

clustering::NDArray::dim
size_t dim(std::size_t index) const noexcept
Returns the size of a specific dimension of the NDArray.
Definition ndarray.h:461

clustering::NDArray::borrow
static NDArray borrow(T *ptr, std::array< std::size_t, N > shape) noexcept
Borrows a contiguous buffer as an NDArray without taking ownership.
Definition ndarray.h:570

clustering::NDArray::data
const T * data() const noexcept
Provides read-only access to the internal data array.
Definition ndarray.h:503

clustering::kmeans::LloydFusedGemm::LloydFusedGemm
LloydFusedGemm()
Definition lloyd_fused_gemm.h:103

clustering::kmeans::LloydFusedGemm::run
void run(const NDArray< T, 2, Layout::Contig > &X, NDArray< T, 2, Layout::Contig > &centroids, std::size_t k, std::size_t maxIter, T tol, math::Pool pool, NDArray< std::int32_t, 1 > &outLabels, double &outInertia, std::size_t &outNIter, bool &outConverged)
Run the Lloyd loop against caller-seeded centroids.
Definition lloyd_fused_gemm.h:146

clustering::kmeans::LloydFusedGemm::kahanNThreshold
static constexpr std::size_t kahanNThreshold
n threshold at which the centroid accumulator switches to Kahan-compensated summation.
Definition lloyd_fused_gemm.h:123

defaults.h

greedy_kmpp_seeder.h

clustering::kmeans::detail
Definition greedy_kmpp_seeder.h:29

clustering::kmeans::detail::kDirectArgminMaxD
constexpr std::size_t kDirectArgminMaxD
Maximum d for the direct-compute argmin hot path.
Definition lloyd_fused_gemm.h:81

clustering::kmeans::detail::sqEuclideanRowToBatch
void sqEuclideanRowToBatch(const T *x, const T *candData, std::size_t L, std::size_t d, T *out) noexcept
Squared Euclidean distance from one x row to a batch of L candidate rows.
Definition greedy_kmpp_seeder.h:284

clustering::kmeans
Definition afkmc2_seeder.h:14

clustering::math::defaults::pairwiseArgminMaxD
constexpr std::size_t pairwiseArgminMaxD
Maximum feature dimension for which the fused pairwiseArgminSqEuclidean driver is used.
Definition defaults.h:76

clustering::math::detail::sqNormRow
T sqNormRow(const NDArray< T, 2, LX > &X, std::size_t i) noexcept
Definition pairwise.h:154

clustering::math::pairwiseArgminChunkRows
constexpr std::size_t pairwiseArgminChunkRows
Chunk height used by the materialized argmin path when striping over n.
Definition pairwise_argmin.h:24

clustering::math::centroidShift
void centroidShift(const NDArray< T, 2, Layout::Contig > &cOld, const NDArray< T, 2, Layout::Contig > &cNew, NDArray< T, 1 > &outShiftSq, Pool pool)
Per-row squared shift between two centroid matrices of identical shape.
Definition centroid_shift.h:28

ndarray.h

pairwise.h

pairwise_argmin.h

reduce.h

clustering::math::Pool
Thin injection wrapper around a BS::light_thread_pool.
Definition thread.h:63

clustering::math::Pool::workerIndex
static std::size_t workerIndex() noexcept
Stable index of the calling worker thread within the owning pool.
Definition thread.h:82

clustering::math::Pool::workerCount
std::size_t workerCount() const noexcept
Number of worker threads available, or 1 in serial mode.
Definition thread.h:72

thread.h