244 lines
6.9 KiB
C++
244 lines
6.9 KiB
C++
// Copyright 2015 Google Inc. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// instrumentation.h: contains the definitions needed to
|
|
// instrument code for profiling:
|
|
// ScopedProfilingLabel, RegisterCurrentThreadForProfiling.
|
|
//
|
|
// profiler.h is only needed to drive the profiler:
|
|
// StartProfiling, FinishProfiling.
|
|
//
|
|
// See the usage example in profiler.h.
|
|
|
|
#ifndef GEMMLOWP_PROFILING_INSTRUMENTATION_H_
|
|
#define GEMMLOWP_PROFILING_INSTRUMENTATION_H_
|
|
|
|
#include <pthread.h>
|
|
#include <cstdio>
|
|
|
|
#ifndef GEMMLOWP_USE_STLPORT
|
|
#include <cstdint>
|
|
#else
|
|
#include <stdint.h>
|
|
namespace std {
|
|
using ::uint8_t;
|
|
using ::uint16_t;
|
|
using ::uint32_t;
|
|
using ::int8_t;
|
|
using ::int16_t;
|
|
using ::int32_t;
|
|
using ::size_t;
|
|
using ::uintptr_t;
|
|
}
|
|
#endif
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cstdlib>
|
|
|
|
#ifdef GEMMLOWP_PROFILING
|
|
#include <cstring>
|
|
#include <set>
|
|
#endif
|
|
|
|
// Ideally we would always use the C++11 thread_local keyword. Apple
// toolchains did not fully support it when this was written, so on Apple we
// fall back to the older __thread specifier instead.
|
|
#ifdef __APPLE__
|
|
#define GEMMLOWP_THREAD_LOCAL static __thread
|
|
#define GEMMLOWP_USING_OLD_THREAD_LOCAL
|
|
#else
|
|
#define GEMMLOWP_THREAD_LOCAL thread_local
|
|
#endif
|
|
|
|
namespace gemmlowp {
|
|
|
|
// Checks `condition` unconditionally (there is no NDEBUG gating here).
// When the condition holds this is a no-op; otherwise `msg` is printed to
// stderr with a "gemmlowp error: " prefix and the process is aborted.
inline void ReleaseBuildAssertion(bool condition, const char* msg) {
  if (condition) {
    return;
  }
  fprintf(stderr, "gemmlowp error: %s\n", msg);
  abort();
}
|
|
|
|
// To be used as template parameter for GlobalLock.
|
|
// GlobalLock<ProfilerLockId> is the profiler global lock:
|
|
// registering threads, starting profiling, finishing profiling, and
|
|
// the profiler itself as it samples threads, all need to lock it.
|
|
struct ProfilerLockId;
|
|
|
|
// A bare-bones global mutex. LockId is used purely as a tag: every distinct
// LockId type instantiates its own function-local static mutex, so
// unrelated locks never contend with each other.
template <typename LockId>
class GlobalLock {
 public:
  // Blocks until the mutex associated with LockId has been acquired.
  static void Lock() { pthread_mutex_lock(Mutex()); }

  // Releases the mutex associated with LockId.
  static void Unlock() { pthread_mutex_unlock(Mutex()); }

 private:
  // Returns the address of the single, statically initialized mutex that
  // belongs to this instantiation.
  static pthread_mutex_t* Mutex() {
    static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
    return &mutex;
  }
};
|
|
|
|
// A very simple RAII helper to lock and unlock a GlobalLock
|
|
template <typename LockId>
|
|
struct AutoGlobalLock {
|
|
AutoGlobalLock() { GlobalLock<LockId>::Lock(); }
|
|
~AutoGlobalLock() { GlobalLock<LockId>::Unlock(); }
|
|
};
|
|
|
|
// MemoryBarrier acts at compile time only: the asm template is empty.
// It communicates two things to the compiler:
//   1) Code must not be reordered across it (that is what the 'volatile'
//      qualifier on the asm statement is for).
//   2) Any value previously read from memory must be assumed to have
//      changed (the "memory" clobber). This makes it an alternative to
//      declaring variables 'volatile'.
inline void MemoryBarrier() {
  __asm__ __volatile__("" : : : "memory");
}
|
|
|
|
// Profiling definitions. Two paths: when profiling is enabled,
|
|
// and when profiling is disabled.
|
|
#ifdef GEMMLOWP_PROFILING
|
|
// This code path is when profiling is enabled.
|
|
|
|
// A pseudo-call-stack. Contrary to a real call-stack, this only
|
|
// contains pointers to literal strings that were manually entered
|
|
// in the instrumented code (see ScopedProfilingLabel).
|
|
struct ProfilingStack {
|
|
static const std::size_t kMaxSize = 15;
|
|
typedef const char* LabelsArrayType[kMaxSize];
|
|
LabelsArrayType labels;
|
|
std::size_t size;
|
|
|
|
ProfilingStack() { memset(this, 0, sizeof(ProfilingStack)); }
|
|
|
|
void Push(const char* label) {
|
|
MemoryBarrier();
|
|
ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow");
|
|
labels[size] = label;
|
|
MemoryBarrier();
|
|
size++;
|
|
MemoryBarrier();
|
|
}
|
|
|
|
void Pop() {
|
|
MemoryBarrier();
|
|
ReleaseBuildAssertion(size > 0, "ProfilingStack underflow");
|
|
size--;
|
|
MemoryBarrier();
|
|
}
|
|
|
|
void UpdateTop(const char* new_label) {
|
|
MemoryBarrier();
|
|
assert(size);
|
|
labels[size - 1] = new_label;
|
|
MemoryBarrier();
|
|
}
|
|
|
|
ProfilingStack& operator=(const ProfilingStack& other) {
|
|
memcpy(this, &other, sizeof(ProfilingStack));
|
|
return *this;
|
|
}
|
|
|
|
bool operator==(const ProfilingStack& other) const {
|
|
return !memcmp(this, &other, sizeof(ProfilingStack));
|
|
}
|
|
};
|
|
|
|
static_assert(
|
|
!(sizeof(ProfilingStack) & (sizeof(ProfilingStack) - 1)),
|
|
"ProfilingStack should have power-of-two size to fit in cache lines");
|
|
|
|
struct ThreadInfo;

// Accessor for the global set of threads being profiled. The set is a
// function-local static, so every caller gets a reference to the same
// object. The code in this file that modifies it does so while holding the
// profiler global lock.
inline std::set<ThreadInfo*>& ThreadsUnderProfiling() {
  static std::set<ThreadInfo*> profiled_threads;
  return profiled_threads;
}
|
|
|
|
struct ThreadInfo {
|
|
pthread_key_t key; // used only to get a callback at thread exit.
|
|
ProfilingStack stack;
|
|
|
|
ThreadInfo() {
|
|
pthread_key_create(&key, ThreadExitCallback);
|
|
pthread_setspecific(key, this);
|
|
}
|
|
|
|
static void ThreadExitCallback(void* ptr) {
|
|
AutoGlobalLock<ProfilerLockId> lock;
|
|
ThreadInfo* self = static_cast<ThreadInfo*>(ptr);
|
|
ThreadsUnderProfiling().erase(self);
|
|
pthread_key_delete(self->key);
|
|
}
|
|
};
|
|
|
|
// Returns the calling thread's ThreadInfo, creating it on first use.
inline ThreadInfo& ThreadLocalThreadInfo() {
#ifndef GEMMLOWP_USING_OLD_THREAD_LOCAL
  GEMMLOWP_THREAD_LOCAL ThreadInfo info;
  return info;
#else
  // Apple doesn't support non-trivial constructors or destructors for their
  // __thread type modifier, so only a pointer is kept thread-local and the
  // ThreadInfo it points to is heap-allocated on first use. That ThreadInfo
  // is deliberately leaked.
  GEMMLOWP_THREAD_LOCAL ThreadInfo* info = nullptr;
  if (!info) {
    info = new ThreadInfo();
  }
  return *info;
#endif
}
|
|
|
|
// ScopedProfilingLabel is how one instruments code for profiling
|
|
// with this profiler. Construct local ScopedProfilingLabel variables,
|
|
// passing a literal string describing the local code. Profile
|
|
// samples will then be annotated with this label, while it is in scope
|
|
// (whence the name --- also known as RAII).
|
|
// See the example in profiler.h.
|
|
class ScopedProfilingLabel {
|
|
ProfilingStack* profiling_stack_;
|
|
|
|
public:
|
|
explicit ScopedProfilingLabel(const char* label)
|
|
: profiling_stack_(&ThreadLocalThreadInfo().stack) {
|
|
profiling_stack_->Push(label);
|
|
}
|
|
|
|
~ScopedProfilingLabel() { profiling_stack_->Pop(); }
|
|
|
|
void Update(const char* new_label) { profiling_stack_->UpdateTop(new_label); }
|
|
};
|
|
|
|
// To be called once on each thread to be profiled.
|
|
inline void RegisterCurrentThreadForProfiling() {
|
|
AutoGlobalLock<ProfilerLockId> lock;
|
|
ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo());
|
|
}
|
|
|
|
#else // not GEMMLOWP_PROFILING
|
|
// This code path is when profiling is disabled.
|
|
|
|
// Stand-in for ScopedProfilingLabel when profiling is disabled. It has no
// data members and both member functions have empty bodies, so instrumented
// code keeps compiling while the labels cost nothing at runtime.
struct ScopedProfilingLabel {
  explicit ScopedProfilingLabel(const char* /*label*/) {}

  void Update(const char* /*new_label*/) {}
};
|
|
|
|
// No-op counterpart used when profiling is disabled, so callers need no
// conditional compilation of their own.
inline void RegisterCurrentThreadForProfiling() {
}
|
|
|
|
#endif
|
|
|
|
} // end namespace gemmlowp
|
|
|
|
#endif // GEMMLOWP_PROFILING_INSTRUMENTATION_H_
|