105 lines
3.6 KiB
C++
105 lines
3.6 KiB
C++
#include <stdlib.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
|
|
|
|
using namespace std;
|
|
|
|
// Smallest buffer size, in bytes, visited by the size sweep in main().
const size_t size_start = 1u << 6;

// Largest buffer size, in bytes, visited by the sweep (16 * 2^20).
// main() also uses it as the allocation size of the source and destination buffers.
const size_t size_end = 16ull * 1024 * 1024;

// Number of steps the log10 range [size_start, size_end] is divided into.
const size_t samples = 1u << 11;

// Byte budget per measured size (64 * 2^20); main() divides it by the current
// buffer size to obtain the number of timed iterations.
size_t size_per_test = 64ull * 1024 * 1024;

// Running total of the values returned by sum() in the --sum benchmark.
// NOTE(review): presumably kept as a global so the summed result stays
// observable and the work cannot be optimised away -- confirm.
size_t tot_sum = 0;
|
|
|
|
// Benchmark kernels called from main(). Only the declarations live here; the
// definitions are not part of this section of the file.
// NOTE(review): noinline is presumably there so each timed iteration performs
// a real call instead of being folded into the loop -- confirm.

// Copies `size` bytes from `src` to `dst`.
__attribute__((noinline)) void memcpy_noinline(void *dst, void *src, size_t size);

// Fills `size` bytes at `dst` with `value`.
__attribute__((noinline)) void memset_noinline(void *dst, int value, size_t size);

// Reduces `size` bytes at `src` to a single 64-bit value.
__attribute__((noinline)) uint64_t sum(volatile void *src, size_t size);
|
|
|
|
// Selects which kernel main() times; chosen from the first command-line argument.
enum BenchType {
    MemcpyBench = 0,  // --memcpy, also the fallback for unrecognised options
    MemsetBench = 1,  // --memset
    SumBench = 2      // --sum
};
|
|
|
|
int main(int argc, char *argv[])
|
|
{
|
|
BenchType type;
|
|
if (argc <= 1) {
|
|
cerr << "memcpy_perf [--memcpy|--memset|--sum]" << endl;
|
|
return 0;
|
|
}
|
|
if (string(argv[1]) == string("--memcpy")) {
|
|
type = MemcpyBench;
|
|
} else if (string(argv[1]) == string("--memset")) {
|
|
type = MemsetBench;
|
|
} else if (string(argv[1]) == string("--sum")) {
|
|
type = SumBench;
|
|
} else {
|
|
type = MemcpyBench;
|
|
}
|
|
|
|
unique_ptr<uint8_t[]> src(new uint8_t[size_end]);
|
|
unique_ptr<uint8_t[]> dst(new uint8_t[size_end]);
|
|
memset(src.get(), 1, size_end);
|
|
|
|
double start_pow = log10(size_start);
|
|
double end_pow = log10(size_end);
|
|
double pow_inc = (end_pow - start_pow) / samples;
|
|
|
|
//cout << "src: " << (uintptr_t)src.get() << endl;
|
|
//cout << "dst: " << (uintptr_t)dst.get() << endl;
|
|
|
|
for (double cur_pow = start_pow; cur_pow <= end_pow; cur_pow += pow_inc) {
|
|
chrono::time_point<chrono::high_resolution_clock> copy_start, copy_end;
|
|
|
|
size_t cur_size = (size_t)pow(10.0, cur_pow);
|
|
size_t iter_per_size = size_per_test / cur_size;
|
|
|
|
// run benchmark
|
|
switch (type) {
|
|
case MemsetBench: {
|
|
memcpy_noinline(src.get(), dst.get(), cur_size);
|
|
memset_noinline(dst.get(), 0xdeadbeef, cur_size);
|
|
copy_start = chrono::high_resolution_clock::now();
|
|
for (int i = 0; i < iter_per_size; i++) {
|
|
memset_noinline(dst.get(), 0xdeadbeef, cur_size);
|
|
}
|
|
copy_end = chrono::high_resolution_clock::now();
|
|
break;
|
|
}
|
|
case MemcpyBench: {
|
|
memcpy_noinline(dst.get(), src.get(), cur_size);
|
|
memcpy_noinline(src.get(), dst.get(), cur_size);
|
|
copy_start = chrono::high_resolution_clock::now();
|
|
for (int i = 0; i < iter_per_size; i++) {
|
|
memcpy_noinline(dst.get(), src.get(), cur_size);
|
|
}
|
|
copy_end = chrono::high_resolution_clock::now();
|
|
break;
|
|
}
|
|
case SumBench: {
|
|
uint64_t s = 0;
|
|
s += sum(src.get(), cur_size);
|
|
copy_start = chrono::high_resolution_clock::now();
|
|
for (int i = 0; i < iter_per_size; i++) {
|
|
s += sum(src.get(), cur_size);
|
|
}
|
|
copy_end = chrono::high_resolution_clock::now();
|
|
tot_sum += s;
|
|
break;
|
|
}
|
|
}
|
|
|
|
double ns_per_copy = chrono::duration_cast<chrono::nanoseconds>(copy_end - copy_start).count() / double(iter_per_size);
|
|
double gb_per_sec = ((double)cur_size / (1ull<<30)) / (ns_per_copy / 1.0E9);
|
|
if (type == MemcpyBench)
|
|
gb_per_sec *= 2.0;
|
|
cout << "size: " << cur_size << ", perf: " << gb_per_sec << "GB/s, iter: " << iter_per_size << endl;
|
|
}
|
|
return 0;
|
|
}
|