// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// multi_thread_gemm.h: Multi-threaded GEMM entry point.
|
|
// Readers note: To understand this file, it is useful to first
|
|
// read and understand the much simpler single_thread_gemm.h.
|
|
|
|
#ifndef GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_
|
|
#define GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_
|
|
|
|
#include <pthread.h>
|
|
#include <unistd.h>
|
|
#include <vector>
|
|
|
|
#include "single_thread_gemm.h"
|
|
|
|
namespace gemmlowp {
|
|
|
|
// On X86 and ARM platforms we enable a busy-wait spinlock before waiting on a
|
|
// pthread conditional variable. In order to implement that correctly we need
|
|
// to put some explicit memory load/store barriers.
|
|
|
|
#if defined(GEMMLOWP_ALLOW_INLINE_ASM) && !defined(GEMMLOWP_NO_BUSYWAIT) && \
|
|
(defined(GEMMLOWP_ARM) || defined(GEMMLOWP_X86))
|
|
|
|
#define GEMMLOWP_USE_BUSYWAIT
|
|
|
|
const int kMaxBusyWaitNOPs = 32 * 1000 * 1000;
|
|
|
|
#define GEMMLOWP_NOP "nop\n"
|
|
|
|
#define GEMMLOWP_STRING_CONCAT_4(X) X X X X
|
|
#define GEMMLOWP_NOP4 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP)
|
|
#define GEMMLOWP_NOP16 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP4)
|
|
#define GEMMLOWP_NOP64 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP16)
|
|
|
|
inline int Do256NOPs() {
|
|
asm volatile(GEMMLOWP_NOP64);
|
|
return 64;
|
|
}
|
|
|
|
#undef GEMMLOWP_STRING_CONCAT_4
|
|
#undef GEMMLOWP_NOP256
|
|
#undef GEMMLOWP_NOP64
|
|
#undef GEMMLOWP_NOP16
|
|
#undef GEMMLOWP_NOP4
|
|
#undef GEMMLOWP_NOP
|
|
|
|
// Store-store memory barrier: makes stores issued before the barrier
// visible before any store issued after it. Used by writers (e.g.
// Worker::StartWork, BlockingCounter::DecrementCount) to publish an
// update of a shared variable to busy-waiting readers.
inline void WriteBarrier() {
#ifdef GEMMLOWP_ARM_32
  // No store-only barrier is used on ARMv7 here; fall back to the
  // project's full MemoryBarrier().
  MemoryBarrier();
#elif defined(GEMMLOWP_ARM_64)
  // "dmb ishst": data memory barrier, inner-shareable domain, stores only.
  asm volatile("dmb ishst" ::: "memory");
#elif defined(GEMMLOWP_X86)
  asm volatile("sfence" ::: "memory");
#else
#error "Unsupported architecture for WriteBarrier."
#endif
}
|
|
|
|
// Load-load memory barrier: makes loads issued before the barrier complete
// before any load issued after it. Used by busy-waiting readers (see
// WaitForVariableChange) after observing a change of the watched variable,
// pairing with the writer's WriteBarrier.
inline void ReadBarrier() {
#ifdef GEMMLOWP_ARM_32
  // No load-only barrier is used on ARMv7 here; fall back to the
  // project's full MemoryBarrier().
  MemoryBarrier();
#elif defined(GEMMLOWP_ARM_64)
  // "dmb ishld": data memory barrier, inner-shareable domain, loads only.
  asm volatile("dmb ishld" ::: "memory");
#elif defined(GEMMLOWP_X86)
  asm volatile("lfence" ::: "memory");
#else
#error "Unsupported architecture for ReadBarrier."
#endif
}
|
|
|
|
#endif
|
|
|
|
// Waits until *var != initial_value.
|
|
//
|
|
// Returns the new value of *var. The guarantee here is that
|
|
// the return value is different from initial_value, and that that
|
|
// new value has been taken by *var at some point during the
|
|
// execution of this function. There is no guarantee that this is
|
|
// still the value of *var when this function returns, since *var is
|
|
// not assumed to be guarded by any lock.
|
|
//
|
|
// First does some busy-waiting for a fixed number of no-op cycles,
|
|
// then falls back to passive waiting for the given condvar, guarded
|
|
// by the given mutex.
|
|
//
|
|
// The idea of doing some initial busy-waiting is to help get
|
|
// better and more consistent multithreading benefits for small GEMM sizes.
|
|
// Busy-waiting help ensuring that if we need to wake up soon after having
|
|
// started waiting, then we can wake up quickly (as opposed to, say,
|
|
// having to wait to be scheduled again by the OS). On the other hand,
|
|
// we must still eventually revert to passive waiting for longer waits
|
|
// (e.g. worker threads having finished a GEMM and waiting until the next GEMM)
|
|
// so as to avoid permanently spinning.
|
|
//
|
|
template <typename T>
T WaitForVariableChange(volatile T* var, T initial_value, pthread_cond_t* cond,
                        pthread_mutex_t* mutex) {
#ifdef GEMMLOWP_USE_BUSYWAIT
  // If we are on a platform that supports it, spin for some time.
  {
    int nops = 0;
    // First, trivial case where the variable already changed value.
    T new_value = *var;
    if (new_value != initial_value) {
      // Pairs with the writer's WriteBarrier: no loads issued after this
      // point may be satisfied before we observed the new value.
      ReadBarrier();
      return new_value;
    }
    // Then try busy-waiting, up to a fixed budget of NOPs
    // (kMaxBusyWaitNOPs), so we never spin forever.
    while (nops < kMaxBusyWaitNOPs) {
      nops += Do256NOPs();
      new_value = *var;
      if (new_value != initial_value) {
        ReadBarrier();
        return new_value;
      }
    }
  }
#endif

  // Finally, do real passive waiting.
  // Note: the variable must be re-checked under the mutex before waiting,
  // otherwise a change made between the spin above and pthread_cond_wait
  // could be missed and the signal lost.
  pthread_mutex_lock(mutex);
  T new_value = *var;
  if (new_value == initial_value) {
    pthread_cond_wait(cond, mutex);
    new_value = *var;
    // The writer only signals after changing *var (under the same mutex),
    // so a wakeup with an unchanged value would be a protocol violation.
    assert(new_value != initial_value);
  }
  pthread_mutex_unlock(mutex);
  return new_value;
}
|
|
|
|
// A BlockingCounter lets one thread to wait for N events to occur.
|
|
// This is how the master thread waits for all the worker threads
|
|
// to have finished working.
|
|
class BlockingCounter {
 public:
  BlockingCounter()
      : cond_(PTHREAD_COND_INITIALIZER),
        mutex_(PTHREAD_MUTEX_INITIALIZER),
        count_(0),
        initial_count_(0) {}

  // Sets/resets the counter; initial_count is the number of
  // decrementing events that the Wait() call will be waiting for.
  // Must not be called while a previous count is still outstanding
  // (asserted below).
  void Reset(std::size_t initial_count) {
    pthread_mutex_lock(&mutex_);
    assert(count_ == 0);
    initial_count_ = initial_count;
    count_ = initial_count_;
    pthread_mutex_unlock(&mutex_);
  }

  // Decrements the counter; if the counter hits zero, signals
  // the thread that was waiting for that, and returns true.
  // Otherwise (if the decremented count is still nonzero),
  // returns false.
  bool DecrementCount() {
    pthread_mutex_lock(&mutex_);
    assert(count_ > 0);
    count_--;
#ifdef GEMMLOWP_USE_BUSYWAIT
    // Publish the decrement to the busy-waiting reader in Wait(),
    // which reads count_ without holding mutex_.
    WriteBarrier();
#endif
    if (count_ == 0) {
      pthread_cond_signal(&cond_);
    }
    bool retval = count_ == 0;
    pthread_mutex_unlock(&mutex_);
    return retval;
  }

  // Waits for the N other threads (N having been set by Reset())
  // to hit the BlockingCounter.
  void Wait() {
    ScopedProfilingLabel label("BlockingCounter::Wait");
    // count_ is intentionally read without the mutex here; correctness
    // relies on the barriers and on WaitForVariableChange re-checking
    // under the mutex before blocking.
    while (count_) {
      MemoryBarrier();
      const std::size_t count_value = count_;
      if (count_value) {
        WaitForVariableChange(&count_, count_value, &cond_, &mutex_);
      }
    }
  }

 private:
  pthread_cond_t cond_;
  pthread_mutex_t mutex_;
  // Remaining number of DecrementCount() calls Wait() is waiting for.
  std::size_t count_;
  // Value set by the last Reset(); kept for reference.
  std::size_t initial_count_;
};
|
|
|
|
// A workload for a worker.
|
|
struct Task {
|
|
Task() : local_allocator(nullptr) {}
|
|
virtual ~Task() {}
|
|
virtual void Run() = 0;
|
|
Allocator* local_allocator;
|
|
};
|
|
|
|
// A worker thread.
|
|
class Worker {
 public:
  enum class State {
    ThreadStartup,  // The initial state before the thread main loop runs.
    Ready,          // Is not working, has not yet received new work to do.
    HasWork,        // Has work to do.
    ExitAsSoonAsPossible  // Should exit at earliest convenience.
  };

  // Spawns the underlying thread immediately; the thread will notify
  // counter_to_decrement_when_ready once it reaches the Ready state.
  explicit Worker(BlockingCounter* counter_to_decrement_when_ready)
      : task_(nullptr),
        state_cond_(PTHREAD_COND_INITIALIZER),
        state_mutex_(PTHREAD_MUTEX_INITIALIZER),
        state_(State::ThreadStartup),
        counter_to_decrement_when_ready_(counter_to_decrement_when_ready) {
    pthread_create(&thread_, nullptr, ThreadFunc, this);
  }

  // Asks the thread to exit and joins it (blocks until it has exited).
  ~Worker() {
    ChangeState(State::ExitAsSoonAsPossible);
    pthread_join(thread_, nullptr);
  }

  // Changes State; may be called from either the worker thread
  // or the master thread; however, not all state transitions are legal,
  // which is guarded by assertions.
  void ChangeState(State new_state) {
    ScopedProfilingLabel label("Worker::ChangeState");
    pthread_mutex_lock(&state_mutex_);
    assert(new_state != state_);
    switch (state_) {
      case State::ThreadStartup:
        // Only the worker thread itself makes this transition.
        assert(new_state == State::Ready);
        break;
      case State::Ready:
        // The master either gives us work or asks us to exit.
        assert(new_state == State::HasWork ||
               new_state == State::ExitAsSoonAsPossible);
        break;
      case State::HasWork:
        assert(new_state == State::Ready ||
               new_state == State::ExitAsSoonAsPossible);
        break;
      default:
        abort();
    }
    state_ = new_state;
    // Wake whoever is blocked in WaitForVariableChange on state_.
    pthread_cond_signal(&state_cond_);
    if (state_ == State::Ready) {
      // Entering Ready means a task just finished (or startup completed):
      // notify the master's BlockingCounter.
      counter_to_decrement_when_ready_->DecrementCount();
    }
    pthread_mutex_unlock(&state_mutex_);
  }

  // Thread entry point.
  void ThreadFunc() {
    ScopedProfilingLabel label("Worker::ThreadFunc");
    RegisterCurrentThreadForProfiling();

    ChangeState(State::Ready);

    // Thread main loop
    while (true) {
      // Get a state to act on
      // In the 'Ready' state, we have nothing to do but to wait until
      // we switch to another state.
      State state_to_act_upon = WaitForVariableChange(
          &state_, State::Ready, &state_cond_, &state_mutex_);

      // We now have a state to act on, so act.
      switch (state_to_act_upon) {
        case State::HasWork:
          // Got work to do! So do it, and then revert to 'Ready' state.
          assert(task_);
          task_->Run();
          task_ = nullptr;
          ChangeState(State::Ready);
          break;
        case State::ExitAsSoonAsPossible:
          return;
        default:
          abort();
      }
    }
  }

  // Static trampoline passed to pthread_create.
  static void* ThreadFunc(void* arg) {
    static_cast<Worker*>(arg)->ThreadFunc();
    return nullptr;
  }

  // Called by the master thread to give this worker work to do.
  // It is only legal to call this if the worker is currently in the
  // 'Ready' state, i.e. has no task in flight (asserted below).
  void StartWork(Task* task) {
    assert(!task_);
    task->local_allocator = &local_allocator_;
    task_ = task;
#ifdef GEMMLOWP_USE_BUSYWAIT
    // Publish task_ before the state change that the worker may observe
    // via busy-waiting (without the mutex).
    WriteBarrier();
#endif
    assert(state_ == State::Ready);
    ChangeState(State::HasWork);
  }

 private:
  // The underlying thread.
  pthread_t thread_;

  // The task to be worked on.
  Task* task_;

  // The condition variable and mutex guarding state changes.
  pthread_cond_t state_cond_;
  pthread_mutex_t state_mutex_;

  // The state enum tells if we're currently working, waiting for work, etc.
  State state_;

  // Each thread had a local allocator so they can allocate temporary
  // buffers without blocking each other.
  Allocator local_allocator_;

  // pointer to the master's thread BlockingCounter object, to notify the
  // master thread of when this worker switches to the 'Ready' state.
  BlockingCounter* const counter_to_decrement_when_ready_;
};
|
|
|
|
// A very simple pool of workers, that only allows the very
|
|
// specific parallelization pattern that we use here:
|
|
// a fixed number of workers can be given work, and one then
|
|
// waits for all of them to finish.
|
|
//
|
|
// See MultiThreadGemmContextBase for how other WorkersPool implementations can
|
|
// be used. Note that in those implementations, StartWorker can be free to
|
|
// ignore the <index> value; that is, the caller of WorkersPool does not rely on
|
|
// <index> to order tasks with equal <index>.
|
|
class WorkersPool {
|
|
public:
|
|
WorkersPool() {}
|
|
|
|
~WorkersPool() {
|
|
for (auto w : workers_) {
|
|
delete w;
|
|
}
|
|
}
|
|
|
|
void Execute(const std::vector<Task*>& tasks) {
|
|
assert(tasks.size() >= 1);
|
|
// One of the tasks will be run on the current thread.
|
|
int workers_count = tasks.size() - 1;
|
|
CreateWorkers(workers_count);
|
|
assert(workers_count <= workers_.size());
|
|
counter_to_decrement_when_ready_.Reset(workers_count);
|
|
int n = 0;
|
|
std::for_each(tasks.begin(), --tasks.end(), [this, &n](Task *task) {
|
|
workers_[n++]->StartWork(task);
|
|
});
|
|
// Execute the remaining workload immediately on the current thread.
|
|
Task* task = tasks.back();
|
|
task->local_allocator = &main_thread_task_allocator_;
|
|
task->Run();
|
|
// Wait for the workers submitted above to finish.
|
|
counter_to_decrement_when_ready_.Wait();
|
|
// Cleanup tasks (best to do this from the same thread that allocated
|
|
// the memory).
|
|
std::for_each(tasks.begin(), tasks.end(), [](Task *task) {
|
|
delete task;
|
|
});
|
|
}
|
|
|
|
private:
|
|
// Ensures that the pool has at least the given count of workers.
|
|
// If any new worker has to be created, this function waits for it to
|
|
// be ready.
|
|
void CreateWorkers(std::size_t workers_count) {
|
|
if (workers_.size() >= workers_count) {
|
|
return;
|
|
}
|
|
counter_to_decrement_when_ready_.Reset(workers_count - workers_.size());
|
|
while (workers_.size() < workers_count) {
|
|
workers_.push_back(new Worker(&counter_to_decrement_when_ready_));
|
|
}
|
|
counter_to_decrement_when_ready_.Wait();
|
|
}
|
|
|
|
// copy construction disallowed
|
|
WorkersPool(const WorkersPool&) = delete;
|
|
|
|
// The workers in this pool. They are owned by the pool:
|
|
// the pool creates workers and destroys them in its destructor.
|
|
std::vector<Worker*> workers_;
|
|
|
|
// The BlockingCounter used to wait for the workers.
|
|
BlockingCounter counter_to_decrement_when_ready_;
|
|
|
|
// For N-threaded operations, we will use only N-1 worker threads
|
|
// while the last task will be run directly on the main thread.
|
|
// It will then use this main_thread_task_allocator_; having a
|
|
// dedicated allocator for that (separate from the base allocator_)
|
|
// allows to use the same code for all tasks regardless of which
|
|
// thread they run on.
|
|
Allocator main_thread_task_allocator_;
|
|
};
|
|
|
|
// The task we use to implement a multi-threaded Gemm: a block of the
|
|
// RHS has been packed by the master thread; each worker thread
|
|
// then has to pack a block of the LHS and accumulate the Gemm of these
|
|
// packed LHS and RHS blocks.
|
|
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType, typename GemmContextType>
struct GemmWithPackedRhsTask : Task {
  typedef PackedSideBlock<typename KernelFormat::Lhs> PackedLhs;
  typedef PackedSideBlock<typename KernelFormat::Rhs> PackedRhs;
  // Note: several members below hold references to the constructor
  // arguments (_packed_rhs's storage, _lhs_offset, _rhs_offset,
  // _output_pipeline); the caller must keep those alive until Run()
  // has completed.
  GemmWithPackedRhsTask(GemmContextType* _context,
                        const KernelBase& _kernel,
                        const MatrixMap<const InputScalar, LhsOrder>& _lhs,
                        const PackedRhs& _packed_rhs,
                        MatrixMap<OutputScalar, ResultOrder>* _result,
                        const MatrixBlockBounds& _result_block,
                        const LhsOffset& _lhs_offset,
                        const RhsOffset& _rhs_offset,
                        const OutputPipelineType& _output_pipeline)
      : context(_context),
        kernel(_kernel),
        lhs(_lhs),
        packed_rhs(_packed_rhs),
        result(*_result),
        result_block(_result_block),
        lhs_offset(_lhs_offset),
        rhs_offset(_rhs_offset),
        output_pipeline(_output_pipeline) {}

  // Packs blocks of the LHS, multiplies them against the already-packed
  // RHS, and unpacks the results into the destination matrix block.
  void Run() override {
    ScopedProfilingLabel label("GemmWithPackedRhsTask");

    const int rows = result_block.rows;
    const int cols = result_block.cols;
    const int depth = lhs.cols();

    // Block sizes for a single thread (thread count 1 here: this task IS
    // the per-thread workload).
    BlockParams block_params;
    block_params.Init<KernelFormat>(rows, cols, depth, 1,
                                    context->l1_bytes_to_use(),
                                    context->l2_bytes_to_use(),
                                    context->l2_rhs_factor());

    PackedLhs packed_lhs(Side::Lhs, local_allocator, block_params);

    PackedResult packed_result(local_allocator, block_params);

    local_allocator->Commit();

    // Iterate over L2-sized blocks of the result.
    for (int c = 0; c < cols; c += block_params.l2_cols) {
      int cs = std::min(block_params.l2_cols, cols - c);

      for (int r = 0; r < rows; r += block_params.l2_rows) {
        int rs = std::min(block_params.l2_rows, rows - r);

        PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));

        Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs,
                depth);

        // Translate the local (r, c) coordinates into the global result
        // coordinates before unpacking.
        auto curr_result_block = MatrixBlockBounds(
            result_block.start_row + r, result_block.start_col + c, rs, cs);
        UnpackResult<KernelFormat>(
            &result, curr_result_block, packed_result, depth,
            packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(),
            lhs_offset.block(curr_result_block.start_row, rs),
            rhs_offset.block(curr_result_block.start_col, cs), output_pipeline);
      }
    }

    local_allocator->Decommit();
  }

  const GemmContextType* context;
  const KernelBase& kernel;
  // Map over the LHS rows assigned to this task (a copy of the map, not
  // of the matrix data).
  const MatrixMap<const InputScalar, LhsOrder> lhs;
  // Copy of the packed-RHS handle; the underlying packed data was
  // produced by the master thread.
  const PackedRhs packed_rhs;
  MatrixMap<OutputScalar, ResultOrder> result;
  // Bounds of the result sub-block this task is responsible for.
  const MatrixBlockBounds result_block;
  const LhsOffset& lhs_offset;
  const RhsOffset& rhs_offset;
  const OutputPipelineType& output_pipeline;
};
|
|
|
|
// This base class for multi-threading allows subclasses to implement their own
|
|
// workers_pool() method. See MultiThreadGemmContext below for an example;
|
|
// any other implementation of workers_pool() must return an object with the
|
|
// same public methods as WorkersPool.
|
|
class MultiThreadGemmContextBase : public SingleThreadGemmContext {
 public:
  // Sets the cap on the number of threads (master thread included) that
  // MultiThreadGemm may use.
  void set_max_num_threads(int n) { max_num_threads_ = n; }

  // Returns the current thread-count cap.
  int max_num_threads() const { return max_num_threads_; }

 protected:
  // Thread-count cap, master thread included.
  //
  // Defaults to 1 (single-threaded): gemmlowp primarily targets mobile
  // hardware, where thermal constraints often make using more than one
  // core unrealistic. The special value 0 requests auto-detection of the
  // hardware thread count — but that detection treats all cores as
  // equivalent, an assumption defeated on big.LITTLE ARM devices where
  // there is no API to query just the big cores. Hence the conservative
  // default: users who want multi-threading must opt in and pick the
  // thread count themselves.
  int max_num_threads_ = 1;
};
|
|
|
|
class MultiThreadGemmContext : public MultiThreadGemmContextBase {
|
|
public:
|
|
WorkersPool* workers_pool() { return &workers_pool_; }
|
|
|
|
private:
|
|
// The workers pool used by MultiThreadGemm. Making
|
|
// this part of the context allows it to be persistent,
|
|
// avoiding recreating threads on every Gemm.
|
|
WorkersPool workers_pool_;
|
|
};
|
|
|
|
// Needed by chrome native builds
|
|
#ifndef _SC_NPROCESSORS_CONF
|
|
#define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_ONLN
|
|
#endif
|
|
|
|
// Determines how many threads should be used for a given Gemm
|
|
// operation.
|
|
template <int KernelRows>
|
|
inline int HowManyThreads(int max_num_threads, int rows, int cols, int depth) {
|
|
// Early-exit in the default case where multi-threading is disabled.
|
|
if (max_num_threads == 1) {
|
|
return 1;
|
|
}
|
|
|
|
// Determine the maximum number of threads.
|
|
int max_count = max_num_threads;
|
|
// The special value 0 means try to determine the total number of cores.
|
|
if (max_count == 0) {
|
|
// No user-set maximum number of threads, so we need to
|
|
// do some hardware detection.
|
|
// This is expensive to query so we do it only once.
|
|
// Too bad for dynamicness. Also, we dont use the c++11 standard getter
|
|
// because Google's coding style currently bans #include <thread_>.
|
|
static const int hardware_threads_count =
|
|
static_cast<int>(sysconf(_SC_NPROCESSORS_CONF));
|
|
|
|
max_count = hardware_threads_count;
|
|
}
|
|
|
|
// Basic calculation: take into account max pool size, and
|
|
// how many rows we have to feed our kernel.
|
|
// The motivation for an absolute minimum number of rows per thread,
|
|
// potentially higher than KernelRows, is that very thin thread workload
|
|
// currently defeat assumptions of the AddMod generator, resulting
|
|
// in substantial bias in TestWithRealData on 24 threads.
|
|
// Ideally, the AddMod generator should be aware of global (r,c) coordinates
|
|
// so as to be independent of the number of threads.
|
|
static const int AbsoluteMinRowsPerThread = 16;
|
|
static const int MinRowsPerThread = KernelRows > AbsoluteMinRowsPerThread
|
|
? KernelRows
|
|
: AbsoluteMinRowsPerThread;
|
|
int thread_count = std::min(max_count, CeilQuotient(rows, MinRowsPerThread));
|
|
|
|
// At this point for small products we already have thread_count==1 so
|
|
// we can avoid doing more work; otherwise, we still want to check
|
|
// that the cubic size (rows*cols*depth) is big enough to keep
|
|
// workers_ busy.
|
|
if (thread_count > 1) {
|
|
// Empirically determined value.
|
|
static const std::uint64_t min_cubic_size_per_thread = 64 * 1024;
|
|
|
|
// We can only multiply two out of three sizes without risking overflow
|
|
const std::uint64_t cubic_size =
|
|
std::uint64_t(rows) * std::uint64_t(cols) * std::uint64_t(depth);
|
|
|
|
thread_count =
|
|
std::min(thread_count, int(cubic_size / min_cubic_size_per_thread));
|
|
|
|
if (thread_count < 1) {
|
|
thread_count = 1;
|
|
}
|
|
}
|
|
|
|
assert(thread_count > 0 && thread_count <= max_count);
|
|
return thread_count;
|
|
}
|
|
|
|
// The main multi-threaded Gemm function.
|
|
// To understand it, first read the code of SingleThreadGemm().
|
|
// The parallelization scheme used here is to have this master function
|
|
// pack a block of RHS and then start worker threads to pack a block of LHS
|
|
// each, and accumulate the corresponding products.
|
|
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType, typename GemmContextType>
void MultiThreadGemm(GemmContextType* context, const KernelBase& kernel,
                     const MatrixMap<const InputScalar, LhsOrder>& lhs,
                     const MatrixMap<const InputScalar, RhsOrder>& rhs,
                     MatrixMap<OutputScalar, ResultOrder>* result,
                     const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                     const OutputPipelineType& output_pipeline) {
  ScopedProfilingLabel label("gemmlowp::MultiThreadGemm");

  assert(lhs.cols() == rhs.rows());

  int rows = result->rows();
  int cols = result->cols();
  int depth = lhs.cols();

  // zero sizes should have been caught earlier and early-returned.
  assert(rows > 0);
  assert(cols > 0);
  assert(depth > 0);

  // The case of rows<cols should have been caught earlier and transposed.
  assert(rows >= cols);

  const int thread_count = HowManyThreads<KernelFormat::kRows>(
      context->max_num_threads(), rows, cols, depth);
  // When only one thread is warranted, delegate to the single-threaded
  // implementation rather than paying any multi-threading overhead.
  if (thread_count == 1) {
    return SingleThreadGemm<KernelFormat, InputScalar, OutputScalar,
                            BitDepthParams>(context, kernel, lhs, rhs, result,
                                            lhs_offset, rhs_offset,
                                            output_pipeline);
  }
  assert(thread_count > 1);

  // Simple 1:1 mapping of tasks to physical cores, which is very important
  // to getting good multithreaded performance, specially for not-very-large
  // GEMMs, and especially on Android.
  const int task_count = thread_count;

  Allocator* allocator = context->allocator();
  auto* workers_pool = context->workers_pool();

  BlockParams block_params;
  block_params.Init<KernelFormat>(rows, cols, depth, task_count,
                                  context->l1_bytes_to_use(),
                                  context->l2_bytes_to_use(),
                                  context->l2_rhs_factor());

  PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator,
                                                         block_params);
  allocator->Commit();

  // We loop over large blocks of the RHS.
  for (int c = 0; c < cols; c += block_params.l2_cols) {
    int cs = std::min(block_params.l2_cols, cols - c);

    // Pack a large block of the RHS. This is done once by the master
    // thread; the tasks then share the packed block read-only.
    PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));

    // Give work to each worker.
    std::vector<Task*> tasks;
    int next_start_row = 0;
    for (int n = 0; n < task_count; ++n) {
      int start_row = next_start_row;
      // Split rows evenly across tasks, rounding each boundary up to a
      // multiple of the kernel height.
      next_start_row = std::min(rows, RoundUp<KernelFormat::kRows>(
                                          rows * (n + 1) / task_count));

      int block_rows = next_start_row - start_row;
      auto lhs_block = lhs.block(start_row, 0, block_rows, depth);
      typedef GemmWithPackedRhsTask<
          KernelFormat, InputScalar, OutputScalar, BitDepthParams, LhsOrder,
          RhsOrder, ResultOrder, LhsOffset, RhsOffset, OutputPipelineType,
          GemmContextType>
          TaskType;
      tasks.push_back(new TaskType(context, kernel, lhs_block, packed_rhs, result,
                                   MatrixBlockBounds(start_row, c, block_rows, cs),
                                   lhs_offset, rhs_offset, output_pipeline));
    }
    // Execute the work on the workers (and partially on this thread).
    // Execute() blocks until all tasks are done and deletes them.
    workers_pool->Execute(tasks);
  }

  allocator->Decommit();
}
|
|
|
|
} // namespace gemmlowp
|
|
|
|
#endif // GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_
|