diff options
Diffstat (limited to 'test/perf_counters_gtest.cc')
-rw-r--r-- | test/perf_counters_gtest.cc | 307 |
1 files changed, 307 insertions, 0 deletions
diff --git a/test/perf_counters_gtest.cc b/test/perf_counters_gtest.cc new file mode 100644 index 0000000..54c7863 --- /dev/null +++ b/test/perf_counters_gtest.cc @@ -0,0 +1,307 @@ +#include <random> +#include <thread> + +#include "../src/perf_counters.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +#ifndef GTEST_SKIP +struct MsgHandler { + void operator=(std::ostream&) {} +}; +#define GTEST_SKIP() return MsgHandler() = std::cout +#endif + +using benchmark::internal::PerfCounters; +using benchmark::internal::PerfCountersMeasurement; +using benchmark::internal::PerfCounterValues; +using ::testing::AllOf; +using ::testing::Gt; +using ::testing::Lt; + +namespace { +const char kGenericPerfEvent1[] = "CYCLES"; +const char kGenericPerfEvent2[] = "INSTRUCTIONS"; + +TEST(PerfCountersTest, Init) { + EXPECT_EQ(PerfCounters::Initialize(), PerfCounters::kSupported); +} + +TEST(PerfCountersTest, OneCounter) { + if (!PerfCounters::kSupported) { + GTEST_SKIP() << "Performance counters not supported.\n"; + } + EXPECT_TRUE(PerfCounters::Initialize()); + EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1}).num_counters(), 1); +} + +TEST(PerfCountersTest, NegativeTest) { + if (!PerfCounters::kSupported) { + EXPECT_FALSE(PerfCounters::Initialize()); + return; + } + EXPECT_TRUE(PerfCounters::Initialize()); + // Sanity checks + // Create() will always create a valid object, even if passed no or + // wrong arguments as the new behavior is to warn and drop unsupported + // counters + EXPECT_EQ(PerfCounters::Create({}).num_counters(), 0); + EXPECT_EQ(PerfCounters::Create({""}).num_counters(), 0); + EXPECT_EQ(PerfCounters::Create({"not a counter name"}).num_counters(), 0); + { + // Try sneaking in a bad egg to see if it is filtered out. The + // number of counters has to be two, not zero + auto counter = + PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1}); + EXPECT_EQ(counter.num_counters(), 2); + EXPECT_EQ(counter.names(), std::vector<std::string>( + {kGenericPerfEvent2, kGenericPerfEvent1})); + } + { + // Try sneaking in an outrageous counter, like a fat finger mistake + auto counter = PerfCounters::Create( + {kGenericPerfEvent2, "not a counter name", kGenericPerfEvent1}); + EXPECT_EQ(counter.num_counters(), 2); + EXPECT_EQ(counter.names(), std::vector<std::string>( + {kGenericPerfEvent2, kGenericPerfEvent1})); + } + { + // Finally try a golden input - it should like both of them + EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2}) + .num_counters(), + 2); + } + { + // Add a bad apple in the end of the chain to check the edges + auto counter = PerfCounters::Create( + {kGenericPerfEvent1, kGenericPerfEvent2, "bad event name"}); + EXPECT_EQ(counter.num_counters(), 2); + EXPECT_EQ(counter.names(), std::vector<std::string>( + {kGenericPerfEvent1, kGenericPerfEvent2})); + } +} + +TEST(PerfCountersTest, Read1Counter) { + if (!PerfCounters::kSupported) { + GTEST_SKIP() << "Test skipped because libpfm is not supported.\n"; + } + EXPECT_TRUE(PerfCounters::Initialize()); + auto counters = PerfCounters::Create({kGenericPerfEvent1}); + EXPECT_EQ(counters.num_counters(), 1); + PerfCounterValues values1(1); + EXPECT_TRUE(counters.Snapshot(&values1)); + EXPECT_GT(values1[0], 0); + PerfCounterValues values2(1); + EXPECT_TRUE(counters.Snapshot(&values2)); + EXPECT_GT(values2[0], 0); + EXPECT_GT(values2[0], values1[0]); +} + +TEST(PerfCountersTest, Read2Counters) { + if (!PerfCounters::kSupported) { + GTEST_SKIP() << "Test skipped because libpfm is not supported.\n"; + } + EXPECT_TRUE(PerfCounters::Initialize()); + auto counters = + PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2}); + EXPECT_EQ(counters.num_counters(), 2); + PerfCounterValues values1(2); + EXPECT_TRUE(counters.Snapshot(&values1)); + EXPECT_GT(values1[0], 0); + EXPECT_GT(values1[1], 0); + PerfCounterValues values2(2); + EXPECT_TRUE(counters.Snapshot(&values2)); + EXPECT_GT(values2[0], 0); + EXPECT_GT(values2[1], 0); +} + +TEST(PerfCountersTest, ReopenExistingCounters) { + // This test works in recent and old Intel hardware, Pixel 3, and Pixel 6. + // However we cannot make assumptions beyond 2 HW counters due to Pixel 6. + if (!PerfCounters::kSupported) { + GTEST_SKIP() << "Test skipped because libpfm is not supported.\n"; + } + EXPECT_TRUE(PerfCounters::Initialize()); + std::vector<std::string> kMetrics({kGenericPerfEvent1}); + std::vector<PerfCounters> counters(2); + for (auto& counter : counters) { + counter = PerfCounters::Create(kMetrics); + } + PerfCounterValues values(1); + EXPECT_TRUE(counters[0].Snapshot(&values)); + EXPECT_TRUE(counters[1].Snapshot(&values)); +} + +TEST(PerfCountersTest, CreateExistingMeasurements) { + // The test works (i.e. causes read to fail) for the assumptions + // about hardware capabilities (i.e. small number (2) hardware + // counters) at this date, + // the same as previous test ReopenExistingCounters. + if (!PerfCounters::kSupported) { + GTEST_SKIP() << "Test skipped because libpfm is not supported.\n"; + } + EXPECT_TRUE(PerfCounters::Initialize()); + + // This means we will try 10 counters but we can only guarantee + // for sure at this time that only 3 will work. Perhaps in the future + // we could use libpfm to query for the hardware limits on this + // particular platform. + const int kMaxCounters = 10; + const int kMinValidCounters = 2; + + // Let's use a ubiquitous counter that is guaranteed to work + // on all platforms + const std::vector<std::string> kMetrics{"cycles"}; + + // Cannot create a vector of actual objects because the + // copy constructor of PerfCounters is deleted - and so is + // implicitly deleted on PerfCountersMeasurement too + std::vector<std::unique_ptr<PerfCountersMeasurement>> + perf_counter_measurements; + + perf_counter_measurements.reserve(kMaxCounters); + for (int j = 0; j < kMaxCounters; ++j) { + perf_counter_measurements.emplace_back( + new PerfCountersMeasurement(kMetrics)); + } + + std::vector<std::pair<std::string, double>> measurements; + + // Start all counters together to see if they hold + size_t max_counters = kMaxCounters; + for (size_t i = 0; i < kMaxCounters; ++i) { + auto& counter(*perf_counter_measurements[i]); + EXPECT_EQ(counter.num_counters(), 1); + if (!counter.Start()) { + max_counters = i; + break; + }; + } + + ASSERT_GE(max_counters, kMinValidCounters); + + // Start all together + for (size_t i = 0; i < max_counters; ++i) { + auto& counter(*perf_counter_measurements[i]); + EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters)); + } + + // Start/stop individually + for (size_t i = 0; i < max_counters; ++i) { + auto& counter(*perf_counter_measurements[i]); + measurements.clear(); + counter.Start(); + EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters)); + } +} + +// We try to do some meaningful work here but the compiler +// insists in optimizing away our loop so we had to add a +// no-optimize macro. In case it fails, we added some entropy +// to this pool as well. + +BENCHMARK_DONT_OPTIMIZE size_t do_work() { + static std::mt19937 rd{std::random_device{}()}; + static std::uniform_int_distribution<size_t> mrand(0, 10); + const size_t kNumLoops = 1000000; + size_t sum = 0; + for (size_t j = 0; j < kNumLoops; ++j) { + sum += mrand(rd); + } + benchmark::DoNotOptimize(sum); + return sum; +} + +void measure(size_t threadcount, PerfCounterValues* before, + PerfCounterValues* after) { + BM_CHECK_NE(before, nullptr); + BM_CHECK_NE(after, nullptr); + std::vector<std::thread> threads(threadcount); + auto work = [&]() { BM_CHECK(do_work() > 1000); }; + + // We need to first set up the counters, then start the threads, so the + // threads would inherit the counters. But later, we need to first destroy + // the thread pool (so all the work finishes), then measure the counters. So + // the scopes overlap, and we need to explicitly control the scope of the + // threadpool. + auto counters = + PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2}); + for (auto& t : threads) t = std::thread(work); + counters.Snapshot(before); + for (auto& t : threads) t.join(); + counters.Snapshot(after); +} + +TEST(PerfCountersTest, MultiThreaded) { + if (!PerfCounters::kSupported) { + GTEST_SKIP() << "Test skipped because libpfm is not supported."; + } + EXPECT_TRUE(PerfCounters::Initialize()); + PerfCounterValues before(2); + PerfCounterValues after(2); + + // Notice that this test will work even if we taskset it to a single CPU + // In this case the threads will run sequentially + // Start two threads and measure the number of combined cycles and + // instructions + measure(2, &before, &after); + std::vector<double> Elapsed2Threads{ + static_cast<double>(after[0] - before[0]), + static_cast<double>(after[1] - before[1])}; + + // Start four threads and measure the number of combined cycles and + // instructions + measure(4, &before, &after); + std::vector<double> Elapsed4Threads{ + static_cast<double>(after[0] - before[0]), + static_cast<double>(after[1] - before[1])}; + + // The following expectations fail (at least on a beefy workstation with lots + // of cpus) - it seems that in some circumstances the runtime of 4 threads + // can even be better than with 2. + // So instead of expecting 4 threads to be slower, let's just make sure they + // do not differ too much in general (one is not more than 10x than the + // other). + EXPECT_THAT(Elapsed4Threads[0] / Elapsed2Threads[0], AllOf(Gt(0.1), Lt(10))); + EXPECT_THAT(Elapsed4Threads[1] / Elapsed2Threads[1], AllOf(Gt(0.1), Lt(10))); +} + +TEST(PerfCountersTest, HardwareLimits) { + // The test works (i.e. causes read to fail) for the assumptions + // about hardware capabilities (i.e. small number (3-4) hardware + // counters) at this date, + // the same as previous test ReopenExistingCounters. + if (!PerfCounters::kSupported) { + GTEST_SKIP() << "Test skipped because libpfm is not supported.\n"; + } + EXPECT_TRUE(PerfCounters::Initialize()); + + // Taken from `perf list`, but focusses only on those HW events that actually + // were reported when running `sudo perf stat -a sleep 10`, intersected over + // several platforms. All HW events listed in the first command not reported + // in the second seem to not work. This is sad as we don't really get to test + // the grouping here (groups can contain up to 6 members)... + std::vector<std::string> counter_names{ + "cycles", // leader + "instructions", // + "branch-misses", // + }; + + // In the off-chance that some of these values are not supported, + // we filter them out so the test will complete without failure + // albeit it might not actually test the grouping on that platform + std::vector<std::string> valid_names; + for (const std::string& name : counter_names) { + if (PerfCounters::IsCounterSupported(name)) { + valid_names.push_back(name); + } + } + PerfCountersMeasurement counter(valid_names); + + std::vector<std::pair<std::string, double>> measurements; + + counter.Start(); + EXPECT_TRUE(counter.Stop(measurements)); +} + +} // namespace |