Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp | 143 |
1 file changed, 97 insertions, 46 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
index eff487719..bdccd0532 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,6 +35,7 @@
 #include <algorithm>
 #include <cassert>
+#include <cmath>
 
 // Some macros used to decide how much working space to allocate.
 // Round allocations up to the next cache line.
@@ -62,9 +63,6 @@ class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> {
     const unsigned int _nbatches;
     const unsigned int _nmulti;
 
-    const bool _trA;
-    const bool _trB;
-
     const Activation _act;
 
     const int _maxthreads;
@@ -173,16 +171,13 @@ class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> {
     // Internal execute function.
     // This supports both the "pretransposed" and "standard" interfaces via the template parameter.
-    void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) {
+    void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int, int) {
         /* Make sure we've been set up correctly. */
         assert(_B_transposed);
         assert(_working_space);
         assert(this->_Aptr);
         assert(this->_Cptr);
 
-        UNUSED(mthreadid);
-        UNUSED(nthreadid);
-
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
@@ -255,8 +250,7 @@
                                          first_m,
                                          last_m,
                                          current.k0(),
-                                         current.kmax(),
-                                         _trA);
+                                         current.kmax());
             }
         }
@@ -308,6 +302,36 @@
         }
     }
 
+    static unsigned int get_k_block_size(const GemmArgs &args) {
+        // Work out blocking parameters, or override from provided GemmConfig
+        if (args._cfg && args._cfg->inner_block_size) {
+            return args._cfg->inner_block_size;
+        }
+
+        const unsigned int L1_size = args._ci->get_L1_cache_size();
+        unsigned int k_block;
+
+        // k_block: Find out how much of the larger array can be loaded into half the cache.
+        // This should account for associative caches.
+        k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+        // Needs to be (at least a single) multiple of the K unroll level.
+        k_block /= strategy::k_unroll();
+        k_block = std::max(k_block, 1U) * strategy::k_unroll();
+
+        // Now tune to presented problem size; this is how many blocks we need.
+        unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
+
+        // So divide the space equally into that many blocks.
+        k_block = iceildiv(args._Ksize, numk_blocks);
+
+        // And round UP to the K unroll level required.
+        k_block = iceildiv(k_block, strategy::k_unroll());
+        k_block *= strategy::k_unroll();
+
+        return k_block;
+    }
+
 public:
     GemmInterleavedPretransposed2d(GemmInterleavedPretransposed2d &) = delete;
     GemmInterleavedPretransposed2d & operator= (GemmInterleavedPretransposed2d &) = delete;
@@ -320,12 +344,10 @@
     , _Ksize(args._Ksize)
     , _nbatches(args._nbatches)
     , _nmulti(args._nmulti)
-    , _trA(args._trA)
-    , _trB(args._trB)
     , _act(args._act)
     , _maxthreads(args._maxthreads)
-    , _nthreads(args._maxthreads)
-
+    , _nthreads(args._maxthreads)
+    , _k_block(get_k_block_size(args))
     // Work out the rounded size of M - needed for some buffers.
     , _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
     , _Mround     ( _Mround_div * strategy::out_height() )
@@ -333,36 +355,10 @@ public:
     , _Nround_div ( iceildiv(_Nsize, strategy::out_width()) )
     , _Nround     ( _Nround_div * strategy::out_width() )
 {
-
-    assert(args._pretransposed_hint);
     assert(_maxthreads > 0);
 
-    const unsigned int L1_size = _ci->get_L1_cache_size();
     const unsigned int L2_size = _ci->get_L2_cache_size();
 
-    // Work out blocking parameters, or override from provided GemmConfig
-    if (args._cfg && args._cfg->inner_block_size) {
-        _k_block = args._cfg->inner_block_size;
-    } else {
-        // k_block: Find out how much of the larger array can be loaded into half the cache.
-        // This should account for associative caches.
-        _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
-        // Needs to be (at least a single) multiple of the K unroll level.
-        _k_block /= strategy::k_unroll();
-        _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
-
-        // Now tune to presented problem size; this is how many blocks we need.
-        unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
-
-        // So divide the space equally into that many blocks.
-        _k_block = iceildiv(_Ksize, num_k_blocks);
-
-        // And round UP to the K unroll level required.
-        _k_block = iceildiv(_k_block, strategy::k_unroll());
-        _k_block *= strategy::k_unroll();
-    }
-
     if (args._cfg && args._cfg->outer_block_size) {
         _x_block = args._cfg->outer_block_size;
     } else {
@@ -389,7 +385,11 @@
         unsigned m = (_Mround / strategy::out_height()) * _nbatches;
         unsigned n = _Nround_div;
 
-        return { m, n, 1u, 1u, 1u, 1u };
+        return { m, n };
+    }
+
+    bool supports_dynamic_scheduling() const override {
+        return true;
     }
 
     // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
@@ -401,8 +401,6 @@ public:
         /* This particular GEMM implementation can only be broken up over the M & N
          * dimensions, we inform the frame work of this limitation via the get_window_size function
          */
-        assert(ndrange_popcount(work_range) <= 2);
-
         const auto m_start = work_range.get_position(0);
         const auto n_start = work_range.get_position(1);
         const auto m_size  = work_range.get_size(0);
@@ -416,7 +414,7 @@ public:
         execute_pretranspose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid);
     }
 
-    std::size_t get_working_size()const override {
+    std::size_t get_working_size() const override {
        /* Because we do not know how schedular will break up
         * the task, we need to ensure that alloc enough
        * space to be able to handle the case where every thread
@@ -498,7 +496,7 @@ public:
            k_size *= strategy::k_unroll();
 
            strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
-                                     current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);
+                                     current.x0(), current.xmax(), current.k0(), current.kmax());
 
            buffer += (x_size * k_size);
        } while (current.advance());
@@ -508,7 +506,60 @@ public:
        _B_transposed = reinterpret_cast<Toi *>(in_buffer);
     }
 
-    ~GemmInterleavedPretransposed2d() override { }
+    // Estimate cycles for given problem given provided parameters
+    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+        unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
+        unsigned int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches;
+        unsigned int n_blocks = iceildiv(args._Nsize, strategy::out_width());
+
+        uint64_t total_macs    = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
+        uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi);
+        uint64_t merge_bytes   = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
+
+        // Wide problems incur extra preparation cost, as it is done per thread.
+        // Duplicate the logic the scheduler will later use to figure out how much that will affect us
+        float ratio = m_blocks / static_cast<float>(n_blocks);
+
+        unsigned int ideal_height = static_cast<unsigned int>(std::sqrt(args._maxthreads * ratio) + 0.5);
+        unsigned int height = 1;
+
+        if (ideal_height == 0) {
+            height = 1;
+        } else {
+            for (unsigned int adj=0; adj<ideal_height; adj++) {
+                const unsigned int round_down = ideal_height - adj;
+                if (args._maxthreads % round_down == 0) {
+                    height = round_down;
+                    break;
+                }
+
+                const unsigned int round_up = ideal_height + adj;
+                if (args._maxthreads % round_up == 0) {
+                    height = round_up;
+                    break;
+                }
+            }
+        }
+
+        // We've computed the height here - we need to multiply the amount of preparation effort by the width (which is total threads / height)
+        prepare_bytes *= (args._maxthreads / height);
+
+        float mac_cycles     = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+        float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
+        float merge_cycles   = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;
+
+        float total_cycles = mac_cycles + prepare_cycles + merge_cycles;
+
+        // We can't thread over multis, which might be a problem in some
+        // threaded cases.  Penalize that here.
+        float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches * iceildiv(args._Nsize, strategy::out_width())) * 0.9;
+
+        if (parallelism_available < args._maxthreads) {
+            total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
+        }
+
+        return static_cast<uint64_t>(total_cycles);
+    }
 };
 
 } // namespace arm_gemm
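Note: the k_block sizing heuristic factored into get_k_block_size() above can be exercised in isolation. The following is a minimal standalone sketch of the same arithmetic, assuming illustrative values only (a 32 KiB L1 cache, out_width = 12, out_height = 8, k_unroll = 1, fp32 elements, K = 1000); these are placeholder numbers, not the parameters of any particular kernel strategy.

// Standalone sketch of the k_block sizing heuristic used by
// GemmInterleavedPretransposed2d::get_k_block_size().
// All parameters below are illustrative assumptions.
#include <algorithm>
#include <cstdio>

static unsigned int iceildiv(unsigned int a, unsigned int b) {
    return (a + b - 1) / b;
}

int main() {
    const unsigned int L1_size    = 32 * 1024; // assumed 32 KiB L1 cache
    const unsigned int out_width  = 12;        // hypothetical strategy::out_width()
    const unsigned int out_height = 8;         // hypothetical strategy::out_height()
    const unsigned int k_unroll   = 1;         // hypothetical strategy::k_unroll()
    const unsigned int Ksize      = 1000;      // example problem depth
    const unsigned int elem_size  = 4;         // sizeof(Toi) for fp32

    // Fit the larger operand tile into half the L1 cache.
    unsigned int k_block = (L1_size / 2) / (elem_size * std::max(out_width, out_height));

    // Keep at least one multiple of the K unroll level.
    k_block = std::max(k_block / k_unroll, 1U) * k_unroll;

    // Split K into that many equal blocks, then round back up to the unroll.
    unsigned int num_k_blocks = iceildiv(Ksize, k_block);
    k_block = iceildiv(Ksize, num_k_blocks);
    k_block = iceildiv(k_block, k_unroll) * k_unroll;

    // With the assumptions above: 16384 / 48 = 341 -> 3 blocks -> k_block = 334.
    std::printf("k_block = %u (%u blocks over K=%u)\n", k_block, num_k_blocks, Ksize);
    return 0;
}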
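Note: estimate_cycles() above duplicates the scheduler's 2D decomposition in order to predict how often the per-thread B preparation is repeated: it picks a thread-grid height near sqrt(maxthreads * m_blocks / n_blocks) that divides the thread count evenly, then scales prepare_bytes by the resulting grid width (maxthreads / height). The sketch below mirrors just that selection step, using made-up block and thread counts for illustration; it is not part of the library.

// Sketch of the 2D thread-grid "height" selection mirrored by estimate_cycles().
// Block and thread counts here are hypothetical examples.
#include <cmath>
#include <cstdio>

static unsigned int pick_height(unsigned int maxthreads, unsigned int m_blocks, unsigned int n_blocks) {
    // Aim for a grid whose aspect ratio matches the m/n block ratio.
    float ratio = m_blocks / static_cast<float>(n_blocks);
    unsigned int ideal_height = static_cast<unsigned int>(std::sqrt(maxthreads * ratio) + 0.5f);

    if (ideal_height == 0) {
        return 1;
    }

    // Walk outwards from the ideal height until a divisor of maxthreads is found.
    for (unsigned int adj = 0; adj < ideal_height; adj++) {
        const unsigned int round_down = ideal_height - adj;
        if (maxthreads % round_down == 0) {
            return round_down;
        }
        const unsigned int round_up = ideal_height + adj;
        if (maxthreads % round_up == 0) {
            return round_up;
        }
    }
    return 1;
}

int main() {
    // Example: 8 threads on a tall problem (64 M-blocks x 4 N-blocks).
    const unsigned int maxthreads = 8, m_blocks = 64, n_blocks = 4;
    const unsigned int height = pick_height(maxthreads, m_blocks, n_blocks);

    // Per-thread B preparation is repeated across the grid width (maxthreads / height).
    std::printf("grid = %u x %u (height x width)\n", height, maxthreads / height);
    return 0;
}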