Clustering
C++20 header-only: DBSCAN, HDBSCAN, k-means.
Loading...
Searching...
No Matches
gemm_plan.h
Go to the documentation of this file.
1#pragma once
2
3#include <cstddef>
4#include <cstdint>
5#include <type_traits>
6#include <vector>
7
8#include "clustering/math/detail/gemm_outer_prepacked.h"
9#include "clustering/math/detail/gemm_pack.h"
10#include "clustering/math/detail/matrix_desc.h"
11#include "clustering/math/detail/reference_gemm.h"
13#include "clustering/ndarray.h"
14
15namespace clustering::math {
16
40template <class T, class Backend = detail::ReferenceGemm> class GemmPlan {
41 static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
42 "GemmPlan: T must be float or double");
43
44public:
54 template <Layout LB>
56 : m_kDim(B.dim(0)), m_nDim(B.dim(1)), m_workerCount(pool.workerCount()), m_pool(pool) {
57 constexpr std::size_t kNr = detail::kKernelNr<T>;
58 constexpr std::size_t kNcVal = detail::kNc<T>;
59 constexpr std::size_t kKcVal = detail::kKc<T>;
60 constexpr std::size_t kMcVal = detail::kMc<T>;
61
62 m_scratch.assign(m_workerCount * kMcVal * kKcVal, T{0});
63
64 if (m_kDim == 0 || m_nDim == 0) {
65 // No Bp storage needed: execute() treats K==0 as the BLAS C<-beta*C identity without
66 // reading Bp, and N==0 is a no-op.
67 return;
68 }
69
70 // Two-pass: first compute total Bp size, then pack. Keeping the sizing pass separate lets
71 // us call reserve/resize exactly once; the pack loop walks the same offset arithmetic that
72 // gemmRunPrepacked uses, so the two stay structurally locked.
73 std::size_t total = 0;
74 for (std::size_t jc = 0; jc < m_nDim; jc += kNcVal) {
75 const std::size_t nc = (jc + kNcVal <= m_nDim) ? kNcVal : (m_nDim - jc);
76 const std::size_t roundedNc = ((nc + kNr - 1) / kNr) * kNr;
77 total += m_kDim * roundedNc;
78 }
79 m_Bp.assign(total, T{0});
80
81 auto Bd = ::clustering::detail::describeMatrix(B);
82
83 std::size_t jcBase = 0;
84 for (std::size_t jc = 0; jc < m_nDim; jc += kNcVal) {
85 const std::size_t nc = (jc + kNcVal <= m_nDim) ? kNcVal : (m_nDim - jc);
86 const std::size_t roundedNc = ((nc + kNr - 1) / kNr) * kNr;
87
88 std::size_t pcOffInJc = 0;
89 for (std::size_t pc = 0; pc < m_kDim; pc += kKcVal) {
90 const std::size_t kc = (pc + kKcVal <= m_kDim) ? kKcVal : (m_kDim - pc);
91 detail::packB<T>(Bd, pc, kc, jc, nc, m_Bp.data() + jcBase + pcOffInJc);
92 pcOffInJc += kc * roundedNc;
93 }
94 jcBase += m_kDim * roundedNc;
95 }
96 }
97
108 template <Layout LA>
109 void execute(const NDArray<T, 2, LA> &A, NDArray<T, 2> &C, T alpha = T{1},
110 T beta = T{0}) const noexcept {
111 if (A.dim(0) == 0 || m_nDim == 0) {
112 return;
113 }
114
115 // Pass the full scratch base -- gemmRunPrepacked slices per-worker inside its Mc dispatch via
116 // Pool::workerIndex(). On the serial path workerIndex() returns 0, so slice 0 is used.
117 auto Ad = ::clustering::detail::describeMatrix(A);
118 auto Cd = ::clustering::detail::describeMatrixMut(C);
119 detail::gemmRunPrepacked<T>(Ad, m_Bp.data(), m_kDim, m_nDim, Cd, alpha, beta, m_scratch.data(),
120 m_pool);
121 }
122
124 [[nodiscard]] std::size_t kDim() const noexcept { return m_kDim; }
125
127 [[nodiscard]] std::size_t nDim() const noexcept { return m_nDim; }
128
131 [[nodiscard]] const T *debugBpData() const noexcept { return m_Bp.data(); }
132
135 [[nodiscard]] std::size_t debugScratchSize() const noexcept { return m_scratch.size(); }
136
137 GemmPlan(const GemmPlan &) = delete;
138 GemmPlan &operator=(const GemmPlan &) = delete;
140 GemmPlan(GemmPlan &&) noexcept = default;
142 GemmPlan &operator=(GemmPlan &&) noexcept = default;
143 ~GemmPlan() = default;
144
145private:
146 std::size_t m_kDim = 0;
147 std::size_t m_nDim = 0;
148 std::size_t m_workerCount = 1;
149 Pool m_pool{};
150 std::vector<T, ::clustering::detail::AlignedAllocator<T, 32>> m_Bp;
151 // mutable: execute() is const on the plan's observable shape but the scratch is a per-call
152 // mutation surface sliced by worker index.
153 mutable std::vector<T, ::clustering::detail::AlignedAllocator<T, 32>> m_scratch;
154};
155
156} // namespace clustering::math
Represents a multidimensional array (NDArray) of a fixed number of dimensions N and element type T.
Definition ndarray.h:136
size_t dim(std::size_t index) const noexcept
Returns the size of a specific dimension of the NDArray.
Definition ndarray.h:461
const T * debugBpData() const noexcept
Debug accessor exposing the packed B pointer so tests can pin alignment.
Definition gemm_plan.h:131
void execute(const NDArray< T, 2, LA > &A, NDArray< T, 2 > &C, T alpha=T{1}, T beta=T{0}) const noexcept
Execute the plan: compute C := alpha * A * B + beta * C against the pre-packed B captured at construction.
Definition gemm_plan.h:109
GemmPlan(GemmPlan &&) noexcept=default
Defaulted move constructor; transfers the packed B panel and scratch.
std::size_t nDim() const noexcept
Column count captured at construction (B.cols).
Definition gemm_plan.h:127
GemmPlan & operator=(const GemmPlan &)=delete
std::size_t debugScratchSize() const noexcept
Debug accessor exposing the scratch capacity so tests can pin the sizing formula.
Definition gemm_plan.h:135
GemmPlan(const NDArray< T, 2, LB > &B, Pool pool)
Construct the plan and fully pre-pack B into m_Bp.
Definition gemm_plan.h:55
std::size_t kDim() const noexcept
Inner dimension captured at construction (B.rows).
Definition gemm_plan.h:124
GemmPlan(const GemmPlan &)=delete
Thin injection wrapper around a BS::light_thread_pool.
Definition thread.h:63