author    Sadaf Ebrahimi <sadafebrahimi@google.com>  2023-09-27 17:54:23 +0000
committer Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>  2023-09-27 17:54:23 +0000
commit    63fc46edd10580e6c4b1de9055729b9902011879 (patch)
tree      cbcc19fcf98479dce5ad3e7954427bd82961a348
parent    db4553b1a39ef8ef84a097dfa2e795c0a4df60d8 (diff)
parent    83836ea18fd8db8d60e4d1a1dcf96109e1447611 (diff)
download  google-benchmark-63fc46edd10580e6c4b1de9055729b9902011879.tar.gz
Upgrade google-benchmark to v1.8.3 am: f45c56f9b5 am: 04b856c2d7 am: 83836ea18f
Original change: https://android-review.googlesource.com/c/platform/external/google-benchmark/+/2760806

Change-Id: I85fb057a1f043df81a954b68081b59de03e43412
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
-rw-r--r--  .clang-tidy  7
-rw-r--r--  .github/install_bazel.sh  13
-rwxr-xr-x  .github/libcxx-setup.sh  26
-rw-r--r--  .github/workflows/bazel.yml  36
-rw-r--r--  .github/workflows/build-and-test-min-cmake.yml  46
-rw-r--r--  .github/workflows/build-and-test-perfcounters.yml  51
-rw-r--r--  .github/workflows/build-and-test.yml  116
-rw-r--r--  .github/workflows/clang-format-lint.yml  17
-rw-r--r--  .github/workflows/clang-tidy.yml  38
-rw-r--r--  .github/workflows/doxygen.yml  28
-rw-r--r--  .github/workflows/pylint.yml  8
-rw-r--r--  .github/workflows/sanitizer.yml  96
-rw-r--r--  .github/workflows/test_bindings.yml  23
-rw-r--r--  .github/workflows/wheels.yml  79
-rw-r--r--  .gitignore  1
-rw-r--r--  .travis-libcxx-setup.sh  28
-rw-r--r--  .travis.yml  25
-rw-r--r--  AUTHORS  15
-rw-r--r--  BUILD.bazel  51
-rw-r--r--  CMakeLists.txt  135
-rw-r--r--  CONTRIBUTORS  16
-rw-r--r--  METADATA  12
-rw-r--r--  MODULE.bazel  24
-rw-r--r--  README.md  1150
-rw-r--r--  WORKSPACE  49
-rw-r--r--  WORKSPACE.bzlmod  2
-rw-r--r--  _config.yml  3
-rw-r--r--  bazel/benchmark_deps.bzl  65
-rw-r--r--  bindings/python/build_defs.bzl  4
-rw-r--r--  bindings/python/google_benchmark/BUILD  8
-rw-r--r--  bindings/python/google_benchmark/__init__.py  10
-rw-r--r--  bindings/python/google_benchmark/benchmark.cc  149
-rw-r--r--  bindings/python/google_benchmark/example.py  4
-rw-r--r--  bindings/python/nanobind.BUILD  17
-rw-r--r--  bindings/python/pybind11.BUILD  20
-rw-r--r--  bindings/python/requirements.txt  2
-rw-r--r--  cmake/AddCXXCompilerFlag.cmake  12
-rw-r--r--  cmake/CXXFeatureCheck.cmake  29
-rw-r--r--  cmake/Config.cmake.in  6
-rw-r--r--  cmake/GetGitVersion.cmake  22
-rw-r--r--  cmake/GoogleTest.cmake  25
-rw-r--r--  cmake/GoogleTest.cmake.in  5
-rw-r--r--  cmake/Modules/FindPFM.cmake  28
-rw-r--r--  cmake/benchmark.pc.in  4
-rw-r--r--  cmake/pthread_affinity.cpp  16
-rw-r--r--  conan/CMakeLists.txt  7
-rw-r--r--  conan/test_package/CMakeLists.txt  10
-rw-r--r--  conan/test_package/conanfile.py  19
-rw-r--r--  conan/test_package/test_package.cpp  18
-rw-r--r--  conanfile.py  79
-rw-r--r--  dependencies.md  18
-rw-r--r--  docs/AssemblyTests.md  2
-rw-r--r--  docs/_config.yml  4
-rw-r--r--  docs/assets/images/icon.png  bin 0 -> 11106 bytes
-rw-r--r--  docs/assets/images/icon.xcf  bin 0 -> 25934 bytes
-rw-r--r--  docs/assets/images/icon_black.png  bin 0 -> 11559 bytes
-rw-r--r--  docs/assets/images/icon_black.xcf  bin 0 -> 36322 bytes
-rw-r--r--  docs/dependencies.md  13
-rw-r--r--  docs/index.md  12
-rw-r--r--  docs/perf_counters.md  35
-rw-r--r--  docs/platform_specific_build_instructions.md  48
-rw-r--r--  docs/python_bindings.md  34
-rw-r--r--  docs/random_interleaving.md  13
-rw-r--r--  docs/reducing_variance.md  100
-rw-r--r--  docs/releasing.md  31
-rw-r--r--  docs/tools.md  140
-rw-r--r--  docs/user_guide.md  1266
-rw-r--r--  include/benchmark/benchmark.h  731
-rw-r--r--  include/benchmark/export.h  47
-rw-r--r--  pyproject.toml  50
-rw-r--r--  requirements.txt  2
-rw-r--r--  setup.py  177
-rw-r--r--  src/CMakeLists.txt  102
-rw-r--r--  src/benchmark.cc  528
-rw-r--r--  src/benchmark_api_internal.cc  117
-rw-r--r--  src/benchmark_api_internal.h  80
-rw-r--r--  src/benchmark_main.cc  1
-rw-r--r--  src/benchmark_name.cc  5
-rw-r--r--  src/benchmark_register.cc  235
-rw-r--r--  src/benchmark_register.h  27
-rw-r--r--  src/benchmark_runner.cc  559
-rw-r--r--  src/benchmark_runner.h  102
-rw-r--r--  src/check.cc  11
-rw-r--r--  src/check.h  70
-rw-r--r--  src/colorprint.cc  44
-rw-r--r--  src/commandlineflags.cc  76
-rw-r--r--  src/commandlineflags.h  90
-rw-r--r--  src/complexity.cc  24
-rw-r--r--  src/complexity.h  2
-rw-r--r--  src/console_reporter.cc  73
-rw-r--r--  src/csv_reporter.cc  27
-rw-r--r--  src/cycleclock.h  23
-rw-r--r--  src/internal_macros.h  17
-rw-r--r--  src/json_reporter.cc  131
-rw-r--r--  src/log.h  28
-rw-r--r--  src/mutex.h  44
-rw-r--r--  src/perf_counters.cc  282
-rw-r--r--  src/perf_counters.h  200
-rw-r--r--  src/re.h  4
-rw-r--r--  src/reporter.cc  23
-rw-r--r--  src/sleep.cc  67
-rw-r--r--  src/sleep.h  15
-rw-r--r--  src/statistics.cc  66
-rw-r--r--  src/statistics.h  13
-rw-r--r--  src/string_util.cc  101
-rw-r--r--  src/string_util.h  25
-rw-r--r--  src/sysinfo.cc  477
-rw-r--r--  src/thread_manager.h  5
-rw-r--r--  src/thread_timer.h  8
-rw-r--r--  src/timers.cc  69
-rw-r--r--  test/AssemblyTests.cmake  21
-rw-r--r--  test/BUILD  88
-rw-r--r--  test/CMakeLists.txt  110
-rw-r--r--  test/args_product_test.cc  12
-rw-r--r--  test/basic_test.cc  80
-rw-r--r--  test/benchmark_gtest.cc  43
-rw-r--r--  test/benchmark_min_time_flag_iters_test.cc  66
-rw-r--r--  test/benchmark_min_time_flag_time_test.cc  90
-rw-r--r--  test/benchmark_name_gtest.cc  8
-rw-r--r--  test/benchmark_random_interleaving_gtest.cc  126
-rw-r--r--  test/benchmark_setup_teardown_test.cc  157
-rw-r--r--  test/benchmark_test.cc  67
-rw-r--r--  test/clobber_memory_assembly_test.cc  2
-rw-r--r--  test/commandlineflags_gtest.cc  33
-rw-r--r--  test/complexity_test.cc  91
-rw-r--r--  test/cxx03_test.cc  7
-rw-r--r--  test/diagnostics_test.cc  21
-rw-r--r--  test/display_aggregates_only_test.cc  10
-rw-r--r--  test/donotoptimize_assembly_test.cc  46
-rw-r--r--  test/donotoptimize_test.cc  41
-rw-r--r--  test/filter_test.cc  43
-rw-r--r--  test/fixture_test.cc  18
-rw-r--r--  test/internal_threading_test.cc  1
-rw-r--r--  test/link_main_test.cc  3
-rw-r--r--  test/map_test.cc  14
-rw-r--r--  test/memory_manager_test.cc  15
-rw-r--r--  test/min_time_parse_gtest.cc  30
-rw-r--r--  test/multiple_ranges_test.cc  12
-rw-r--r--  test/options_test.cc  11
-rw-r--r--  test/output_test.h  34
-rw-r--r--  test/output_test_helper.cc  123
-rw-r--r--  test/perf_counters_gtest.cc  307
-rw-r--r--  test/perf_counters_test.cc  92
-rw-r--r--  test/register_benchmark_test.cc  34
-rw-r--r--  test/repetitions_test.cc  214
-rw-r--r--  test/report_aggregates_only_test.cc  10
-rw-r--r--  test/reporter_output_test.cc  332
-rw-r--r--  test/skip_with_error_test.cc  32
-rw-r--r--  test/spec_arg_test.cc  105
-rw-r--r--  test/spec_arg_verbosity_test.cc  43
-rw-r--r--  test/statistics_gtest.cc  7
-rw-r--r--  test/string_util_gtest.cc  202
-rw-r--r--  test/templated_fixture_test.cc  4
-rw-r--r--  test/time_unit_gtest.cc  37
-rw-r--r--  test/user_counters_tabular_test.cc  321
-rw-r--r--  test/user_counters_test.cc  78
-rw-r--r--  test/user_counters_thousands_test.cc  45
-rw-r--r--  tools/BUILD.bazel  4
-rwxr-xr-x  tools/compare.py  31
-rw-r--r--  tools/gbench/Inputs/test1_run1.json  8
-rw-r--r--  tools/gbench/Inputs/test1_run2.json  8
-rw-r--r--  tools/gbench/Inputs/test4_run.json  96
-rw-r--r--  tools/gbench/Inputs/test4_run0.json  21
-rw-r--r--  tools/gbench/Inputs/test4_run1.json  21
-rw-r--r--  tools/gbench/report.py  392
-rw-r--r--  tools/gbench/util.py  52
-rw-r--r--  tools/libpfm.BUILD.bazel  22
-rw-r--r--  tools/requirements.txt  3
-rwxr-xr-x  tools/strip_asm.py  2
169 files changed, 9567 insertions, 3527 deletions
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000..56938a5
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,7 @@
+---
+Checks: 'clang-analyzer-*,readability-redundant-*,performance-*'
+WarningsAsErrors: 'clang-analyzer-*,readability-redundant-*,performance-*'
+HeaderFilterRegex: '.*'
+AnalyzeTemporaryDtors: false
+FormatStyle: none
+User: user
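
Note (not part of the patch): these checks are the ones driven by the clang-tidy workflow added later in this change. A minimal local run might look like the sketch below; the `build` directory name is illustrative and assumes a CMake-generated compile_commands.json.

    # Generate a compilation database, then run clang-tidy against it.
    cmake -S . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
    run-clang-tidy -p build
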
diff --git a/.github/install_bazel.sh b/.github/install_bazel.sh
new file mode 100644
index 0000000..2b1f4e7
--- /dev/null
+++ b/.github/install_bazel.sh
@@ -0,0 +1,13 @@
+if ! bazel version; then
+ arch=$(uname -m)
+ if [ "$arch" == "aarch64" ]; then
+ arch="arm64"
+ fi
+ echo "Installing wget and downloading $arch Bazel binary from GitHub releases."
+ yum install -y wget
+ wget "https://github.com/bazelbuild/bazel/releases/download/6.3.0/bazel-6.3.0-linux-$arch" -O /usr/local/bin/bazel
+ chmod +x /usr/local/bin/bazel
+else
+ # bazel is installed for the correct architecture
+ exit 0
+fi
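
Note (not part of the patch): this script is invoked by the wheels workflow further down via CIBW_BEFORE_ALL_LINUX, so Bazel becomes available inside the manylinux build containers. Run by hand it is simply:

    bash .github/install_bazel.sh
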
diff --git a/.github/libcxx-setup.sh b/.github/libcxx-setup.sh
new file mode 100755
index 0000000..8773b9c
--- /dev/null
+++ b/.github/libcxx-setup.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+set -e
+
+# Checkout LLVM sources
+git clone --depth=1 https://github.com/llvm/llvm-project.git llvm-project
+
+## Setup libc++ options
+if [ -z "$BUILD_32_BITS" ]; then
+ export BUILD_32_BITS=OFF && echo disabling 32 bit build
+fi
+
+## Build and install libc++ (Use unstable ABI for better sanitizer coverage)
+mkdir llvm-build && cd llvm-build
+cmake -DCMAKE_C_COMPILER=${CC} \
+ -DCMAKE_CXX_COMPILER=${CXX} \
+ -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+ -DCMAKE_INSTALL_PREFIX=/usr \
+ -DLIBCXX_ABI_UNSTABLE=OFF \
+ -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
+ -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
+ -DLLVM_ENABLE_RUNTIMES='libcxx;libcxxabi;libunwind' \
+ -G "Unix Makefiles" \
+ ../llvm-project/runtimes/
+make -j cxx cxxabi unwind
+cd ..
diff --git a/.github/workflows/bazel.yml b/.github/workflows/bazel.yml
index d6bbe62..1cdc38c 100644
--- a/.github/workflows/bazel.yml
+++ b/.github/workflows/bazel.yml
@@ -5,29 +5,31 @@ on:
pull_request: {}
jobs:
- build-and-test:
- runs-on: ubuntu-latest
-
+ build_and_test_default:
+ name: bazel.${{ matrix.os }}.${{ matrix.bzlmod && 'bzlmod' || 'no_bzlmod' }}
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-latest, macos-latest, windows-latest]
+ bzlmod: [false, true]
steps:
- - uses: actions/checkout@v1
+ - uses: actions/checkout@v3
- name: mount bazel cache
- uses: actions/cache@v1
+ uses: actions/cache@v3
+ env:
+ cache-name: bazel-cache
with:
- path: "/home/runner/.cache/bazel"
- key: bazel
-
- - name: install bazelisk
- run: |
- curl -LO "https://github.com/bazelbuild/bazelisk/releases/download/v1.1.0/bazelisk-linux-amd64"
- mkdir -p "${GITHUB_WORKSPACE}/bin/"
- mv bazelisk-linux-amd64 "${GITHUB_WORKSPACE}/bin/bazel"
- chmod +x "${GITHUB_WORKSPACE}/bin/bazel"
+ path: "~/.cache/bazel"
+ key: ${{ env.cache-name }}-${{ matrix.os }}-${{ github.ref }}
+ restore-keys: |
+ ${{ env.cache-name }}-${{ matrix.os }}-main
- name: build
run: |
- "${GITHUB_WORKSPACE}/bin/bazel" build //...
-
+ bazel build ${{ matrix.bzlmod && '--enable_bzlmod' || '--noenable_bzlmod' }} //:benchmark //:benchmark_main //test/...
+
- name: test
run: |
- "${GITHUB_WORKSPACE}/bin/bazel" test //test/...
+ bazel test ${{ matrix.bzlmod && '--enable_bzlmod' || '--noenable_bzlmod' }} --test_output=all //test/...
diff --git a/.github/workflows/build-and-test-min-cmake.yml b/.github/workflows/build-and-test-min-cmake.yml
new file mode 100644
index 0000000..e3e3217
--- /dev/null
+++ b/.github/workflows/build-and-test-min-cmake.yml
@@ -0,0 +1,46 @@
+name: build-and-test-min-cmake
+
+on:
+ push:
+ branches: [ main ]
+ pull_request:
+ branches: [ main ]
+
+jobs:
+ job:
+ name: ${{ matrix.os }}.min-cmake
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-latest, macos-latest]
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - uses: lukka/get-cmake@latest
+ with:
+ cmakeVersion: 3.10.0
+
+ - name: create build environment
+ run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+ - name: setup cmake initial cache
+ run: touch compiler-cache.cmake
+
+ - name: configure cmake
+ env:
+ CXX: ${{ matrix.compiler }}
+ shell: bash
+ working-directory: ${{ runner.workspace }}/_build
+ run: >
+ cmake -C ${{ github.workspace }}/compiler-cache.cmake
+ $GITHUB_WORKSPACE
+ -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+ -DCMAKE_CXX_VISIBILITY_PRESET=hidden
+ -DCMAKE_VISIBILITY_INLINES_HIDDEN=ON
+
+ - name: build
+ shell: bash
+ working-directory: ${{ runner.workspace }}/_build
+ run: cmake --build .
diff --git a/.github/workflows/build-and-test-perfcounters.yml b/.github/workflows/build-and-test-perfcounters.yml
new file mode 100644
index 0000000..97e4d8e
--- /dev/null
+++ b/.github/workflows/build-and-test-perfcounters.yml
@@ -0,0 +1,51 @@
+name: build-and-test-perfcounters
+
+on:
+ push:
+ branches: [ main ]
+ pull_request:
+ branches: [ main ]
+
+jobs:
+ job:
+ # TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
+ name: ${{ matrix.os }}.${{ matrix.build_type }}
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-22.04, ubuntu-20.04]
+ build_type: ['Release', 'Debug']
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: install libpfm
+ run: |
+ sudo apt update
+ sudo apt -y install libpfm4-dev
+
+ - name: create build environment
+ run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+ - name: configure cmake
+ shell: bash
+ working-directory: ${{ runner.workspace }}/_build
+ run: >
+ cmake $GITHUB_WORKSPACE
+ -DBENCHMARK_ENABLE_LIBPFM=1
+ -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+ - name: build
+ shell: bash
+ working-directory: ${{ runner.workspace }}/_build
+ run: cmake --build . --config ${{ matrix.build_type }}
+
+ # Skip testing, for now. It seems perf_event_open does not succeed on the
+ # hosting machine, very likely a permissions issue.
+ # TODO(mtrofin): Enable test.
+ # - name: test
+ # shell: bash
+ # working-directory: ${{ runner.workspace }}/_build
+ # run: ctest -C ${{ matrix.build_type }} --rerun-failed --output-on-failure
+
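
Note (not part of the patch): configuring with -DBENCHMARK_ENABLE_LIBPFM=1 turns on the libpfm-backed perf-counter support described in docs/perf_counters.md, which this change also updates. A benchmark binary built this way can collect counters at runtime roughly as below; the binary name is illustrative and the host must permit perf_event_open.

    # Illustrative: request per-iteration hardware counters from a libpfm-enabled build.
    ./my_benchmark --benchmark_perf_counters=CYCLES,INSTRUCTIONS
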
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index f0f0626..b35200a 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -2,37 +2,113 @@ name: build-and-test
on:
push:
- branches: [ master ]
+ branches: [ main ]
pull_request:
- branches: [ master ]
+ branches: [ main ]
jobs:
+ # TODO: add 32-bit builds (g++ and clang++) for ubuntu
+ # (requires g++-multilib and libc6:i386)
+ # TODO: add coverage build (requires lcov)
+ # TODO: add clang + libc++ builds for ubuntu
job:
- # TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
- name: ${{ matrix.os }}.${{ matrix.build_type }}
+ name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.compiler }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
- os: [ubuntu-latest, ubuntu-16.04, ubuntu-20.04, macos-latest, windows-latest]
+ os: [ubuntu-22.04, ubuntu-20.04, macos-latest]
build_type: ['Release', 'Debug']
+ compiler: ['g++', 'clang++']
+ lib: ['shared', 'static']
+
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
+
+ - uses: lukka/get-cmake@latest
+
+ - name: create build environment
+ run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+ - name: setup cmake initial cache
+ run: touch compiler-cache.cmake
+
+ - name: configure cmake
+ env:
+ CXX: ${{ matrix.compiler }}
+ shell: bash
+ working-directory: ${{ runner.workspace }}/_build
+ run: >
+ cmake -C ${{ github.workspace }}/compiler-cache.cmake
+ $GITHUB_WORKSPACE
+ -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+ -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+ -DCMAKE_CXX_COMPILER=${{ env.CXX }}
+ -DCMAKE_CXX_VISIBILITY_PRESET=hidden
+ -DCMAKE_VISIBILITY_INLINES_HIDDEN=ON
+
+ - name: build
+ shell: bash
+ working-directory: ${{ runner.workspace }}/_build
+ run: cmake --build . --config ${{ matrix.build_type }}
+
+ - name: test
+ shell: bash
+ working-directory: ${{ runner.workspace }}/_build
+ run: ctest -C ${{ matrix.build_type }} -VV
+
+ msvc:
+ name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.msvc }}
+ runs-on: ${{ matrix.os }}
+ defaults:
+ run:
+ shell: powershell
+ strategy:
+ fail-fast: false
+ matrix:
+ msvc:
+ - VS-16-2019
+ - VS-17-2022
+ arch:
+ - x64
+ build_type:
+ - Debug
+ - Release
+ lib:
+ - shared
+ - static
+ include:
+ - msvc: VS-16-2019
+ os: windows-2019
+ generator: 'Visual Studio 16 2019'
+ - msvc: VS-17-2022
+ os: windows-2022
+ generator: 'Visual Studio 17 2022'
+
+ steps:
+ - uses: actions/checkout@v2
+
+ - uses: lukka/get-cmake@latest
+
+ - name: configure cmake
+ run: >
+ cmake -S . -B _build/
+ -A ${{ matrix.arch }}
+ -G "${{ matrix.generator }}"
+ -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+ -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+
+ - name: build
+ run: cmake --build _build/ --config ${{ matrix.build_type }}
- - name: create build environment
- run: cmake -E make_directory ${{ runner.workspace }}/_build
+ - name: setup test environment
+ # Make sure gmock and benchmark DLLs can be found
+ run: >
+ echo "$((Get-Item .).FullName)/_build/bin/${{ matrix.build_type }}" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append;
+ echo "$((Get-Item .).FullName)/_build/src/${{ matrix.build_type }}" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append;
- - name: configure cmake
- shell: bash
- working-directory: ${{ runner.workspace }}/_build
- run: cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+ - name: test
+ run: ctest --test-dir _build/ -C ${{ matrix.build_type }} -VV
- - name: build
- shell: bash
- working-directory: ${{ runner.workspace }}/_build
- run: cmake --build . --config ${{ matrix.build_type }}
- - name: test
- shell: bash
- working-directory: ${{ runner.workspace }}/_build
- run: ctest -C ${{ matrix.build_type }}
diff --git a/.github/workflows/clang-format-lint.yml b/.github/workflows/clang-format-lint.yml
new file mode 100644
index 0000000..77ce1f8
--- /dev/null
+++ b/.github/workflows/clang-format-lint.yml
@@ -0,0 +1,17 @@
+name: clang-format-lint
+on:
+ push: {}
+ pull_request: {}
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3
+ - uses: DoozyX/clang-format-lint-action@v0.13
+ with:
+ source: './include/benchmark ./src ./test'
+ extensions: 'h,cc'
+ clangFormatVersion: 12
+ style: Google
diff --git a/.github/workflows/clang-tidy.yml b/.github/workflows/clang-tidy.yml
new file mode 100644
index 0000000..2eaab9c
--- /dev/null
+++ b/.github/workflows/clang-tidy.yml
@@ -0,0 +1,38 @@
+name: clang-tidy
+
+on:
+ push: {}
+ pull_request: {}
+
+jobs:
+ job:
+ name: run-clang-tidy
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: install clang-tidy
+ run: sudo apt update && sudo apt -y install clang-tidy
+
+ - name: create build environment
+ run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+ - name: configure cmake
+ shell: bash
+ working-directory: ${{ runner.workspace }}/_build
+ run: >
+ cmake $GITHUB_WORKSPACE
+ -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
+ -DBENCHMARK_ENABLE_LIBPFM=OFF
+ -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+ -DCMAKE_C_COMPILER=clang
+ -DCMAKE_CXX_COMPILER=clang++
+ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+ -DGTEST_COMPILE_COMMANDS=OFF
+
+ - name: run
+ shell: bash
+ working-directory: ${{ runner.workspace }}/_build
+ run: run-clang-tidy
diff --git a/.github/workflows/doxygen.yml b/.github/workflows/doxygen.yml
new file mode 100644
index 0000000..da92c46
--- /dev/null
+++ b/.github/workflows/doxygen.yml
@@ -0,0 +1,28 @@
+name: doxygen
+
+on:
+ push:
+ branches: [main]
+ pull_request:
+ branches: [main]
+
+jobs:
+ build-and-deploy:
+ name: Build HTML documentation
+ runs-on: ubuntu-latest
+ steps:
+ - name: Fetching sources
+ uses: actions/checkout@v3
+
+ - name: Installing build dependencies
+ run: |
+ sudo apt update
+ sudo apt install doxygen gcc git
+
+ - name: Creating build directory
+ run: mkdir build
+
+ - name: Building HTML documentation with Doxygen
+ run: |
+ cmake -S . -B build -DBENCHMARK_ENABLE_TESTING:BOOL=OFF -DBENCHMARK_ENABLE_DOXYGEN:BOOL=ON -DBENCHMARK_INSTALL_DOCS:BOOL=ON
+ cmake --build build --target benchmark_doxygen
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
index c869674..c6939b5 100644
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@@ -2,9 +2,9 @@ name: pylint
on:
push:
- branches: [ master ]
+ branches: [ main ]
pull_request:
- branches: [ master ]
+ branches: [ main ]
jobs:
pylint:
@@ -12,15 +12,17 @@ jobs:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- name: Set up Python 3.8
uses: actions/setup-python@v1
with:
python-version: 3.8
+
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pylint pylint-exit conan
+
- name: Run pylint
run: |
pylint `find . -name '*.py'|xargs` || pylint-exit $?
diff --git a/.github/workflows/sanitizer.yml b/.github/workflows/sanitizer.yml
new file mode 100644
index 0000000..86cccf4
--- /dev/null
+++ b/.github/workflows/sanitizer.yml
@@ -0,0 +1,96 @@
+name: sanitizer
+
+on:
+ push: {}
+ pull_request: {}
+
+env:
+ UBSAN_OPTIONS: "print_stacktrace=1"
+
+jobs:
+ job:
+ name: ${{ matrix.sanitizer }}.${{ matrix.build_type }}
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ build_type: ['Debug', 'RelWithDebInfo']
+ sanitizer: ['asan', 'ubsan', 'tsan', 'msan']
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: configure msan env
+ if: matrix.sanitizer == 'msan'
+ run: |
+ echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" >> $GITHUB_ENV
+ echo "LIBCXX_SANITIZER=MemoryWithOrigins" >> $GITHUB_ENV
+
+ - name: configure ubsan env
+ if: matrix.sanitizer == 'ubsan'
+ run: |
+ echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all" >> $GITHUB_ENV
+ echo "LIBCXX_SANITIZER=Undefined" >> $GITHUB_ENV
+
+ - name: configure asan env
+ if: matrix.sanitizer == 'asan'
+ run: |
+ echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=address -fno-sanitize-recover=all" >> $GITHUB_ENV
+ echo "LIBCXX_SANITIZER=Address" >> $GITHUB_ENV
+
+ - name: configure tsan env
+ if: matrix.sanitizer == 'tsan'
+ run: |
+ echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" >> $GITHUB_ENV
+ echo "LIBCXX_SANITIZER=Thread" >> $GITHUB_ENV
+
+ - name: fine-tune asan options
+ # in asan we get an error from std::regex. ignore it.
+ if: matrix.sanitizer == 'asan'
+ run: |
+ echo "ASAN_OPTIONS=alloc_dealloc_mismatch=0" >> $GITHUB_ENV
+
+ - name: setup clang
+ uses: egor-tensin/setup-clang@v1
+ with:
+ version: latest
+ platform: x64
+
+ - name: configure clang
+ run: |
+ echo "CC=cc" >> $GITHUB_ENV
+ echo "CXX=c++" >> $GITHUB_ENV
+
+ - name: build libc++ (non-asan)
+ if: matrix.sanitizer != 'asan'
+ run: |
+ "${GITHUB_WORKSPACE}/.github/libcxx-setup.sh"
+ echo "EXTRA_CXX_FLAGS=-stdlib=libc++ -L ${GITHUB_WORKSPACE}/llvm-build/lib -lc++abi -Isystem${GITHUB_WORKSPACE}/llvm-build/include -Isystem${GITHUB_WORKSPACE}/llvm-build/include/c++/v1 -Wl,-rpath,${GITHUB_WORKSPACE}/llvm-build/lib" >> $GITHUB_ENV
+
+ - name: create build environment
+ run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+ - name: configure cmake
+ shell: bash
+ working-directory: ${{ runner.workspace }}/_build
+ run: >
+ VERBOSE=1
+ cmake $GITHUB_WORKSPACE
+ -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
+ -DBENCHMARK_ENABLE_LIBPFM=OFF
+ -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+ -DCMAKE_C_COMPILER=${{ env.CC }}
+ -DCMAKE_CXX_COMPILER=${{ env.CXX }}
+ -DCMAKE_C_FLAGS="${{ env.EXTRA_FLAGS }}"
+ -DCMAKE_CXX_FLAGS="${{ env.EXTRA_FLAGS }} ${{ env.EXTRA_CXX_FLAGS }}"
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+ - name: build
+ shell: bash
+ working-directory: ${{ runner.workspace }}/_build
+ run: cmake --build . --config ${{ matrix.build_type }}
+
+ - name: test
+ shell: bash
+ working-directory: ${{ runner.workspace }}/_build
+ run: ctest -C ${{ matrix.build_type }} -VV
diff --git a/.github/workflows/test_bindings.yml b/.github/workflows/test_bindings.yml
index 273d7f9..e01bb7b 100644
--- a/.github/workflows/test_bindings.yml
+++ b/.github/workflows/test_bindings.yml
@@ -2,23 +2,28 @@ name: test-bindings
on:
push:
- branches: [master]
+ branches: [main]
pull_request:
- branches: [master]
+ branches: [main]
jobs:
python_bindings:
- runs-on: ubuntu-latest
+ name: Test GBM Python bindings on ${{ matrix.os }}
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ ubuntu-latest, macos-latest, windows-2019 ]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- name: Set up Python
- uses: actions/setup-python@v1
+ uses: actions/setup-python@v4
with:
- python-version: 3.8
- - name: Install benchmark
+ python-version: 3.11
+ - name: Install GBM Python bindings on ${{ matrix.os}}
run:
- python setup.py install
- - name: Run example bindings
+ python -m pip install wheel .
+ - name: Run bindings example on ${{ matrix.os }}
run:
python bindings/python/google_benchmark/example.py
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
new file mode 100644
index 0000000..1f73bff
--- /dev/null
+++ b/.github/workflows/wheels.yml
@@ -0,0 +1,79 @@
+name: Build and upload Python wheels
+
+on:
+ workflow_dispatch:
+ release:
+ types:
+ - published
+
+jobs:
+ build_sdist:
+ name: Build source distribution
+ runs-on: ubuntu-latest
+ steps:
+ - name: Check out repo
+ uses: actions/checkout@v3
+
+ - name: Install Python 3.11
+ uses: actions/setup-python@v4
+ with:
+ python-version: 3.11
+
+ - name: Build and check sdist
+ run: |
+ python setup.py sdist
+ - name: Upload sdist
+ uses: actions/upload-artifact@v3
+ with:
+ name: dist
+ path: dist/*.tar.gz
+
+ build_wheels:
+ name: Build Google Benchmark wheels on ${{ matrix.os }}
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest, macos-latest, windows-2019]
+
+ steps:
+ - name: Check out Google Benchmark
+ uses: actions/checkout@v3
+
+ - name: Set up QEMU
+ if: runner.os == 'Linux'
+ uses: docker/setup-qemu-action@v2
+ with:
+ platforms: all
+
+ - name: Build wheels on ${{ matrix.os }} using cibuildwheel
+ uses: pypa/cibuildwheel@v2.14.1
+ env:
+ CIBW_BUILD: 'cp38-* cp39-* cp310-* cp311-*'
+ CIBW_SKIP: "*-musllinux_*"
+ CIBW_TEST_SKIP: "*-macosx_arm64"
+ CIBW_ARCHS_LINUX: x86_64 aarch64
+ CIBW_ARCHS_MACOS: x86_64 arm64
+ CIBW_ARCHS_WINDOWS: AMD64
+ CIBW_BEFORE_ALL_LINUX: bash .github/install_bazel.sh
+ CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
+
+ - name: Upload Google Benchmark ${{ matrix.os }} wheels
+ uses: actions/upload-artifact@v3
+ with:
+ name: dist
+ path: ./wheelhouse/*.whl
+
+ pypi_upload:
+ name: Publish google-benchmark wheels to PyPI
+ needs: [build_sdist, build_wheels]
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/download-artifact@v3
+ with:
+ name: dist
+ path: dist
+
+ - uses: pypa/gh-action-pypi-publish@v1.6.4
+ with:
+ user: __token__
+ password: ${{ secrets.PYPI_PASSWORD }}
diff --git a/.gitignore b/.gitignore
index be55d77..704f56c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@
*.swp
*.pyc
__pycache__
+.DS_Store
# lcov
*.lcov
diff --git a/.travis-libcxx-setup.sh b/.travis-libcxx-setup.sh
deleted file mode 100644
index a591743..0000000
--- a/.travis-libcxx-setup.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-
-# Install a newer CMake version
-curl -sSL https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh -o install-cmake.sh
-chmod +x install-cmake.sh
-sudo ./install-cmake.sh --prefix=/usr/local --skip-license
-
-# Checkout LLVM sources
-git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source
-git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx
-git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi
-
-# Setup libc++ options
-if [ -z "$BUILD_32_BITS" ]; then
- export BUILD_32_BITS=OFF && echo disabling 32 bit build
-fi
-
-# Build and install libc++ (Use unstable ABI for better sanitizer coverage)
-mkdir llvm-build && cd llvm-build
-cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \
- -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \
- -DLIBCXX_ABI_UNSTABLE=ON \
- -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
- -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
- ../llvm-source
-make cxx -j2
-sudo make install-cxxabi install-cxx
-cd ../
diff --git a/.travis.yml b/.travis.yml
index 36e343d..8cfed3d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,10 +11,6 @@ matrix:
- lcov
env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage
- compiler: gcc
- env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug
- - compiler: gcc
- env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release
- - compiler: gcc
addons:
apt:
packages:
@@ -44,10 +40,6 @@ matrix:
- COMPILER=g++-6 C_COMPILER=gcc-6 BUILD_TYPE=Debug
- ENABLE_SANITIZER=1
- EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
- - compiler: clang
- env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug
- - compiler: clang
- env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release
# Clang w/ libc++
- compiler: clang
dist: xenial
@@ -150,29 +142,14 @@ matrix:
osx_image: xcode8.3
compiler: clang
env:
- - COMPILER=clang++ BUILD_TYPE=Debug
- - os: osx
- osx_image: xcode8.3
- compiler: clang
- env:
- - COMPILER=clang++ BUILD_TYPE=Release
- - os: osx
- osx_image: xcode8.3
- compiler: clang
- env:
- COMPILER=clang++
- BUILD_TYPE=Release
- BUILD_32_BITS=ON
- EXTRA_FLAGS="-m32"
- - os: osx
- osx_image: xcode9.4
- compiler: gcc
- env:
- - COMPILER=g++-7 C_COMPILER=gcc-7 BUILD_TYPE=Debug
before_script:
- if [ -n "${LIBCXX_BUILD}" ]; then
- source .travis-libcxx-setup.sh;
+ source .libcxx-setup.sh;
fi
- if [ -n "${ENABLE_SANITIZER}" ]; then
export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF";
diff --git a/AUTHORS b/AUTHORS
index 3068b2e..d08c1fd 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -13,6 +13,7 @@ Alex Steele <steeleal123@gmail.com>
Andriy Berestovskyy <berestovskyy@gmail.com>
Arne Beer <arne@twobeer.de>
Carto
+Cezary Skrzyński <czars1988@gmail.com>
Christian Wassermann <christian_wassermann@web.de>
Christopher Seymour <chris.j.seymour@hotmail.com>
Colin Braley <braley.colin@gmail.com>
@@ -21,14 +22,18 @@ David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
Deniz Evrenci <denizevrenci@gmail.com>
Dirac Research
Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Dominik Korman <kormandominik@gmail.com>
+Donald Aingworth <donalds_junk_mail@yahoo.com>
Eric Backus <eric_backus@alum.mit.edu>
Eric Fiselier <eric@efcs.ca>
Eugene Zhuk <eugene.zhuk@gmail.com>
Evgeny Safronov <division494@gmail.com>
+Fabien Pichot <pichot.fabien@gmail.com>
Federico Ficarelli <federico.ficarelli@gmail.com>
Felix Homann <linuxaudio@showlabor.de>
Gergő Szitár <szitar.gergo@gmail.com>
Google Inc.
+Henrique Bucher <hbucher@gmail.com>
International Business Machines Corporation
Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
Jern-Kuan Leong <jernkuan@gmail.com>
@@ -39,20 +44,28 @@ Jussi Knuuttila <jussi.knuuttila@gmail.com>
Kaito Udagawa <umireon@gmail.com>
Kishan Kumar <kumar.kishan@outlook.com>
Lei Xu <eddyxu@gmail.com>
+Marcel Jacobse <mjacobse@uni-bremen.de>
Matt Clarkson <mattyclarkson@gmail.com>
Maxim Vafin <maxvafin@gmail.com>
+Mike Apodaca <gatorfax@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
MongoDB Inc.
Nick Hutchinson <nshutchinson@gmail.com>
+Norman Heino <norman.heino@gmail.com>
Oleksandr Sochka <sasha.sochka@gmail.com>
Ori Livneh <ori.livneh@gmail.com>
Paul Redmond <paul.redmond@gmail.com>
Radoslav Yovchev <radoslav.tm@gmail.com>
+Raghu Raja <raghu@enfabrica.net>
+Rainer Orth <ro@cebitec.uni-bielefeld.de>
Roman Lebedev <lebedev.ri@gmail.com>
Sayan Bhattacharjee <aero.sayan@gmail.com>
+Shapr3D <google-contributors@shapr3d.com>
Shuo Chen <chenshuo@chenshuo.com>
+Staffan Tjernstrom <staffantj@gmail.com>
Steinar H. Gunderson <sgunderson@bigfoot.com>
Stripe, Inc.
+Tobias Schmidt <tobias.schmidt@in.tum.de>
Yixuan Qiu <yixuanq@gmail.com>
Yusuke Suzuki <utatane.tea@gmail.com>
Zbigniew Skowron <zbychs@gmail.com>
-Min-Yih Hsu <yihshyng223@gmail.com>
diff --git a/BUILD.bazel b/BUILD.bazel
index eb35b62..60d31d2 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1,15 +1,37 @@
-load("@rules_cc//cc:defs.bzl", "cc_library")
-
licenses(["notice"])
config_setting(
+ name = "qnx",
+ constraint_values = ["@platforms//os:qnx"],
+ values = {
+ "cpu": "x64_qnx",
+ },
+ visibility = [":__subpackages__"],
+)
+
+config_setting(
name = "windows",
+ constraint_values = ["@platforms//os:windows"],
values = {
"cpu": "x64_windows",
},
visibility = [":__subpackages__"],
)
+config_setting(
+ name = "macos",
+ constraint_values = ["@platforms//os:macos"],
+ visibility = ["//visibility:public"],
+)
+
+config_setting(
+ name = "perfcounters",
+ define_values = {
+ "pfm": "1",
+ },
+ visibility = [":__subpackages__"],
+)
+
cc_library(
name = "benchmark",
srcs = glob(
@@ -19,19 +41,40 @@ cc_library(
],
exclude = ["src/benchmark_main.cc"],
),
- hdrs = ["include/benchmark/benchmark.h"],
+ hdrs = [
+ "include/benchmark/benchmark.h",
+ "include/benchmark/export.h",
+ ],
linkopts = select({
":windows": ["-DEFAULTLIB:shlwapi.lib"],
"//conditions:default": ["-pthread"],
}),
+ copts = select({
+ ":windows": [],
+ "//conditions:default": ["-Werror=old-style-cast"],
+ }),
strip_include_prefix = "include",
visibility = ["//visibility:public"],
+ # Only static linking is allowed; no .so will be produced.
+ # Using `defines` (i.e. not `local_defines`) means that no
+ # dependent rules need to bother about defining the macro.
+ linkstatic = True,
+ defines = [
+ "BENCHMARK_STATIC_DEFINE",
+ ] + select({
+ ":perfcounters": ["HAVE_LIBPFM"],
+ "//conditions:default": [],
+ }),
+ deps = select({
+ ":perfcounters": ["@libpfm//:libpfm"],
+ "//conditions:default": [],
+ }),
)
cc_library(
name = "benchmark_main",
srcs = ["src/benchmark_main.cc"],
- hdrs = ["include/benchmark/benchmark.h"],
+ hdrs = ["include/benchmark/benchmark.h", "include/benchmark/export.h"],
strip_include_prefix = "include",
visibility = ["//visibility:public"],
deps = [":benchmark"],
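
Note (not part of the patch): the new :perfcounters config_setting keys off a --define value, so the libpfm code path is opted into from the Bazel command line, roughly:

    # Illustrative: build the library with the HAVE_LIBPFM define and the @libpfm dependency.
    bazel build --define pfm=1 //:benchmark
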
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1007254..ffd7dee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,30 +1,34 @@
-cmake_minimum_required (VERSION 3.5.1)
-
-foreach(p
- CMP0048 # OK to clear PROJECT_VERSION on project()
- CMP0054 # CMake 3.1
- CMP0056 # export EXE_LINKER_FLAGS to try_run
- CMP0057 # Support no if() IN_LIST operator
- CMP0063 # Honor visibility properties for all targets
- CMP0077 # Allow option() overrides in importing projects
- )
- if(POLICY ${p})
- cmake_policy(SET ${p} NEW)
- endif()
-endforeach()
+# Require CMake 3.10. If available, use the policies up to CMake 3.22.
+cmake_minimum_required (VERSION 3.10...3.22)
-project (benchmark CXX)
+project (benchmark VERSION 1.8.3 LANGUAGES CXX)
option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
+option(BENCHMARK_ENABLE_WERROR "Build Release candidates with -Werror." ON)
+option(BENCHMARK_FORCE_WERROR "Build Release candidates with -Werror regardless of compiler issues." OFF)
+
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
+ # PGC++ maybe reporting false positives.
+ set(BENCHMARK_ENABLE_WERROR OFF)
+endif()
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "NVHPC")
+ set(BENCHMARK_ENABLE_WERROR OFF)
+endif()
+if(BENCHMARK_FORCE_WERROR)
+ set(BENCHMARK_ENABLE_WERROR ON)
+endif(BENCHMARK_FORCE_WERROR)
+
if(NOT MSVC)
option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
else()
set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
endif()
option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON)
+option(BENCHMARK_ENABLE_DOXYGEN "Build documentation with Doxygen." OFF)
+option(BENCHMARK_INSTALL_DOCS "Enable installation of documentation." ON)
# Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which
# may require downloading the source code.
@@ -33,8 +37,25 @@ option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree buildi
# This option can be used to disable building and running unit tests which depend on gtest
# in cases where it is not possible to build or find a valid version of gtest.
option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON)
+option(BENCHMARK_USE_BUNDLED_GTEST "Use bundled GoogleTest. If disabled, the find_package(GTest) will be used." ON)
+
+option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm" OFF)
+
+# Export only public symbols
+set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
+
+if(MSVC)
+ # As of CMake 3.18, CMAKE_SYSTEM_PROCESSOR is not set properly for MSVC and
+ # cross-compilation (e.g. Host=x86_64, target=aarch64) requires using the
+ # undocumented, but working variable.
+ # See https://gitlab.kitware.com/cmake/cmake/-/issues/15170
+ set(CMAKE_SYSTEM_PROCESSOR ${MSVC_CXX_ARCHITECTURE_ID})
+ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ARM")
+ set(CMAKE_CROSSCOMPILING TRUE)
+ endif()
+endif()
-set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
set(ENABLE_ASSEMBLY_TESTS_DEFAULT OFF)
function(should_enable_assembly_tests)
if(CMAKE_BUILD_TYPE)
@@ -81,24 +102,43 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
include(GetGitVersion)
get_git_version(GIT_VERSION)
+# If no git version can be determined, use the version
+# from the project() command
+if ("${GIT_VERSION}" STREQUAL "0.0.0")
+ set(VERSION "${benchmark_VERSION}")
+else()
+ set(VERSION "${GIT_VERSION}")
+endif()
# Tell the user what versions we are using
-string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION})
-message(STATUS "Version: ${VERSION}")
+message(STATUS "Google Benchmark version: ${VERSION}")
# The version of the libraries
set(GENERIC_LIB_VERSION ${VERSION})
string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION)
# Import our CMake modules
-include(CheckCXXCompilerFlag)
include(AddCXXCompilerFlag)
+include(CheckCXXCompilerFlag)
+include(CheckLibraryExists)
include(CXXFeatureCheck)
+check_library_exists(rt shm_open "" HAVE_LIB_RT)
+
if (BENCHMARK_BUILD_32_BITS)
add_required_cxx_compiler_flag(-m32)
endif()
if (MSVC)
+ set(BENCHMARK_CXX_STANDARD 14)
+else()
+ set(BENCHMARK_CXX_STANDARD 11)
+endif()
+
+set(CMAKE_CXX_STANDARD ${BENCHMARK_CXX_STANDARD})
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+if (MSVC)
# Turn compiler warnings up to 11
string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
@@ -130,44 +170,43 @@ if (MSVC)
set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG")
endif()
else()
- # Try and enable C++11. Don't use C++14 because it doesn't work in some
- # configurations.
- add_cxx_compiler_flag(-std=c++11)
- if (NOT HAVE_CXX_FLAG_STD_CXX11)
- add_cxx_compiler_flag(-std=c++0x)
- endif()
-
# Turn compiler warnings up to 11
add_cxx_compiler_flag(-Wall)
add_cxx_compiler_flag(-Wextra)
add_cxx_compiler_flag(-Wshadow)
- add_cxx_compiler_flag(-Werror RELEASE)
- add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
- add_cxx_compiler_flag(-Werror MINSIZEREL)
- # Disabled until googletest (gmock) stops emitting variadic macro warnings
- #add_cxx_compiler_flag(-pedantic)
- #add_cxx_compiler_flag(-pedantic-errors)
+ add_cxx_compiler_flag(-Wfloat-equal)
+ add_cxx_compiler_flag(-Wold-style-cast)
+ if(BENCHMARK_ENABLE_WERROR)
+ add_cxx_compiler_flag(-Werror)
+ endif()
+ if (NOT BENCHMARK_ENABLE_TESTING)
+ # Disable warning when compiling tests as gtest does not use 'override'.
+ add_cxx_compiler_flag(-Wsuggest-override)
+ endif()
+ add_cxx_compiler_flag(-pedantic)
+ add_cxx_compiler_flag(-pedantic-errors)
add_cxx_compiler_flag(-Wshorten-64-to-32)
add_cxx_compiler_flag(-fstrict-aliasing)
# Disable warnings regarding deprecated parts of the library while building
# and testing those parts of the library.
add_cxx_compiler_flag(-Wno-deprecated-declarations)
- if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
# Intel silently ignores '-Wno-deprecated-declarations',
# warning no. 1786 must be explicitly disabled.
# See #631 for rationale.
add_cxx_compiler_flag(-wd1786)
+ add_cxx_compiler_flag(-fno-finite-math-only)
endif()
# Disable deprecation warnings for release builds (when -Werror is enabled).
- add_cxx_compiler_flag(-Wno-deprecated RELEASE)
- add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
- add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL)
+ if(BENCHMARK_ENABLE_WERROR)
+ add_cxx_compiler_flag(-Wno-deprecated)
+ endif()
if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
add_cxx_compiler_flag(-fno-exceptions)
endif()
if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
- if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing
+ if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") #ICC17u2: Many false positives for Wstrict-aliasing
add_cxx_compiler_flag(-Wstrict-aliasing)
endif()
endif()
@@ -176,12 +215,12 @@ else()
add_cxx_compiler_flag(-wd654)
add_cxx_compiler_flag(-Wthread-safety)
if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
- cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
+ cxx_feature_check(THREAD_SAFETY_ATTRIBUTES "-DINCLUDE_DIRECTORIES=${PROJECT_SOURCE_DIR}/include")
endif()
# On most UNIX like platforms g++ and clang++ define _GNU_SOURCE as a
# predefined macro, which turns on all of the wonderful libc extensions.
- # However g++ doesn't do this in Cygwin so we have to define it ourselfs
+ # However g++ doesn't do this in Cygwin so we have to define it ourselves
# since we depend on GNU/POSIX/BSD extensions.
if (CYGWIN)
add_definitions(-D_GNU_SOURCE=1)
@@ -232,7 +271,8 @@ if (BENCHMARK_USE_LIBCXX)
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
add_cxx_compiler_flag(-stdlib=libc++)
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR
- "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
+ "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel" OR
+ "${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM")
add_cxx_compiler_flag(-nostdinc++)
message(WARNING "libc++ header path must be manually specified using CMAKE_CXX_FLAGS")
# Adding -nodefaultlibs directly to CMAKE_<TYPE>_LINKER_FLAGS will break
@@ -269,6 +309,11 @@ cxx_feature_check(STEADY_CLOCK)
# Ensure we have pthreads
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
+cxx_feature_check(PTHREAD_AFFINITY)
+
+if (BENCHMARK_ENABLE_LIBPFM)
+ find_package(PFM)
+endif()
# Set up directories
include_directories(${PROJECT_SOURCE_DIR}/include)
@@ -281,7 +326,15 @@ if (BENCHMARK_ENABLE_TESTING)
if (BENCHMARK_ENABLE_GTEST_TESTS AND
NOT (TARGET gtest AND TARGET gtest_main AND
TARGET gmock AND TARGET gmock_main))
- include(GoogleTest)
+ if (BENCHMARK_USE_BUNDLED_GTEST)
+ include(GoogleTest)
+ else()
+ find_package(GTest CONFIG REQUIRED)
+ add_library(gtest ALIAS GTest::gtest)
+ add_library(gtest_main ALIAS GTest::gtest_main)
+ add_library(gmock ALIAS GTest::gmock)
+ add_library(gmock_main ALIAS GTest::gmock_main)
+ endif()
endif()
add_subdirectory(test)
endif()
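
Note (not part of the patch): the new BENCHMARK_USE_BUNDLED_GTEST option lets the unit tests build against an externally installed GoogleTest instead of the bundled/downloaded copy. The configure line below is a sketch and assumes a GTest package discoverable by find_package(GTest CONFIG).

    # Illustrative: use a system-provided GoogleTest rather than the bundled one.
    cmake -S . -B build -DBENCHMARK_ENABLE_GTEST_TESTS=ON -DBENCHMARK_USE_BUNDLED_GTEST=OFF
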
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index b5e1aa4..95bcad0 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -27,7 +27,9 @@ Albert Pretorius <pretoalb@gmail.com>
Alex Steele <steelal123@gmail.com>
Andriy Berestovskyy <berestovskyy@gmail.com>
Arne Beer <arne@twobeer.de>
+Bátor Tallér <bator.taller@shapr3d.com>
Billy Robert O'Neal III <billy.oneal@gmail.com> <bion@microsoft.com>
+Cezary Skrzyński <czars1988@gmail.com>
Chris Kennelly <ckennelly@google.com> <ckennelly@ckennelly.com>
Christian Wassermann <christian_wassermann@web.de>
Christopher Seymour <chris.j.seymour@hotmail.com>
@@ -38,16 +40,20 @@ David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
Deniz Evrenci <denizevrenci@gmail.com>
Dominic Hamon <dma@stripysock.com> <dominic@google.com>
Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Dominik Korman <kormandominik@gmail.com>
+Donald Aingworth <donalds_junk_mail@yahoo.com>
Eric Backus <eric_backus@alum.mit.edu>
Eric Fiselier <eric@efcs.ca>
Eugene Zhuk <eugene.zhuk@gmail.com>
Evgeny Safronov <division494@gmail.com>
+Fabien Pichot <pichot.fabien@gmail.com>
Fanbo Meng <fanbo.meng@ibm.com>
Federico Ficarelli <federico.ficarelli@gmail.com>
Felix Homann <linuxaudio@showlabor.de>
Geoffrey Martin-Noble <gcmn@google.com> <gmngeoffrey@gmail.com>
Gergő Szitár <szitar.gergo@gmail.com>
Hannes Hauswedell <h2@fsfe.org>
+Henrique Bucher <hbucher@gmail.com>
Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
Jern-Kuan Leong <jernkuan@gmail.com>
JianXiong Zhou <zhoujianxiong2@gmail.com>
@@ -55,19 +61,25 @@ Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
John Millikin <jmillikin@stripe.com>
Jordan Williams <jwillikers@protonmail.com>
Jussi Knuuttila <jussi.knuuttila@gmail.com>
-Kai Wolf <kai.wolf@gmail.com>
Kaito Udagawa <umireon@gmail.com>
+Kai Wolf <kai.wolf@gmail.com>
Kishan Kumar <kumar.kishan@outlook.com>
Lei Xu <eddyxu@gmail.com>
+Marcel Jacobse <mjacobse@uni-bremen.de>
Matt Clarkson <mattyclarkson@gmail.com>
Maxim Vafin <maxvafin@gmail.com>
+Mike Apodaca <gatorfax@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
Nick Hutchinson <nshutchinson@gmail.com>
+Norman Heino <norman.heino@gmail.com>
Oleksandr Sochka <sasha.sochka@gmail.com>
Ori Livneh <ori.livneh@gmail.com>
Pascal Leroy <phl@google.com>
Paul Redmond <paul.redmond@gmail.com>
Pierre Phaneuf <pphaneuf@google.com>
Radoslav Yovchev <radoslav.tm@gmail.com>
+Raghu Raja <raghu@enfabrica.net>
+Rainer Orth <ro@cebitec.uni-bielefeld.de>
Raul Marin <rmrodriguez@cartodb.com>
Ray Glover <ray.glover@uk.ibm.com>
Robert Guo <robert.guo@mongodb.com>
@@ -75,9 +87,9 @@ Roman Lebedev <lebedev.ri@gmail.com>
Sayan Bhattacharjee <aero.sayan@gmail.com>
Shuo Chen <chenshuo@chenshuo.com>
Steven Wan <wan.yu@ibm.com>
+Tobias Schmidt <tobias.schmidt@in.tum.de>
Tobias Ulvgård <tobias.ulvgard@dirac.se>
Tom Madams <tom.ej.madams@gmail.com> <tmadams@google.com>
Yixuan Qiu <yixuanq@gmail.com>
Yusuke Suzuki <utatane.tea@gmail.com>
Zbigniew Skowron <zbychs@gmail.com>
-Min-Yih Hsu <yihshyng223@gmail.com>
diff --git a/METADATA b/METADATA
index 0584c04..5433077 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,7 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update google-benchmark
+# For more info, check https://cs.android.com/android/platform/superproject/+/main:tools/external_updater/README.md
+
name: "google-benchmark"
description: "A library to support the benchmarking of functions, similar to unit-tests."
third_party {
@@ -9,11 +13,11 @@ third_party {
type: GIT
value: "https://github.com/google/benchmark.git"
}
- version: "ea5a5bbff491fd625c6e3458f6edd680b8bd5452"
+ version: "v1.8.3"
license_type: NOTICE
last_upgrade_date {
- year: 2021
- month: 2
- day: 12
+ year: 2023
+ month: 9
+ day: 22
}
}
diff --git a/MODULE.bazel b/MODULE.bazel
new file mode 100644
index 0000000..37a5f5d
--- /dev/null
+++ b/MODULE.bazel
@@ -0,0 +1,24 @@
+module(name = "google_benchmark", version="1.8.3")
+
+bazel_dep(name = "bazel_skylib", version = "1.4.1")
+bazel_dep(name = "platforms", version = "0.0.6")
+bazel_dep(name = "rules_foreign_cc", version = "0.9.0")
+bazel_dep(name = "rules_cc", version = "0.0.6")
+bazel_dep(name = "rules_python", version = "0.24.0", dev_dependency = True)
+bazel_dep(name = "googletest", version = "1.12.1", repo_name = "com_google_googletest", dev_dependency = True)
+bazel_dep(name = "libpfm", version = "4.11.0")
+
+# Register a toolchain for Python 3.9 to be able to build numpy. Python
+# versions >=3.10 are problematic.
+# A second reason for this is to be able to build Python hermetically instead
+# of relying on the changing default version from rules_python.
+
+python = use_extension("@rules_python//python/extensions:python.bzl", "python", dev_dependency = True)
+python.toolchain(python_version = "3.9")
+
+pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip", dev_dependency = True)
+pip.parse(
+ hub_name="tools_pip_deps",
+ python_version = "3.9",
+ requirements_lock="//tools:requirements.txt")
+use_repo(pip, "tools_pip_deps")
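
Note (not part of the patch): with this module file in place, the bzlmod leg of the bazel workflow above can be reproduced locally, for example:

    # Illustrative: mirrors the bazel.yml job with bzlmod enabled.
    bazel test --enable_bzlmod --test_output=all //test/...
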
diff --git a/README.md b/README.md
index 6c09b9d..a5e5d39 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,12 @@
# Benchmark
[![build-and-test](https://github.com/google/benchmark/workflows/build-and-test/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Abuild-and-test)
+[![bazel](https://github.com/google/benchmark/actions/workflows/bazel.yml/badge.svg)](https://github.com/google/benchmark/actions/workflows/bazel.yml)
[![pylint](https://github.com/google/benchmark/workflows/pylint/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Apylint)
[![test-bindings](https://github.com/google/benchmark/workflows/test-bindings/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Atest-bindings)
-
-[![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark)
-[![Build status](https://ci.appveyor.com/api/projects/status/u0qsyp7t1tk7cpxs/branch/master?svg=true)](https://ci.appveyor.com/project/google/benchmark/branch/master)
[![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark)
+[![Discord](https://discordapp.com/api/guilds/1125694995928719494/widget.png?style=shield)](https://discord.gg/cz7UX7wKC2)
A library to benchmark code snippets, similar to unit tests. Example:
@@ -27,23 +26,28 @@ BENCHMARK(BM_SomeFunction);
BENCHMARK_MAIN();
```
+## Getting Started
+
To get started, see [Requirements](#requirements) and
[Installation](#installation). See [Usage](#usage) for a full example and the
-[User Guide](#user-guide) for a more comprehensive feature overview.
+[User Guide](docs/user_guide.md) for a more comprehensive feature overview.
-It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/master/googletest/docs/primer.md)
+It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/main/docs/primer.md)
as some of the structural aspects of the APIs are similar.
-### Resources
+## Resources
[Discussion group](https://groups.google.com/d/forum/benchmark-discuss)
-IRC channel: [freenode](https://freenode.net) #googlebenchmark
+IRC channels:
+* [libera](https://libera.chat) #benchmark
[Additional Tooling Documentation](docs/tools.md)
[Assembly Testing Documentation](docs/AssemblyTests.md)
+[Building and installing Python bindings](docs/python_bindings.md)
+
## Requirements
The library can be used with C++03. However, it requires C++11 to build,
@@ -56,27 +60,25 @@ The following minimum versions are required to build the library:
* Visual Studio 14 2015
* Intel 2015 Update 1
-See [Platform-Specific Build Instructions](#platform-specific-build-instructions).
+See [Platform-Specific Build Instructions](docs/platform_specific_build_instructions.md).
## Installation
This describes the installation process using cmake. As pre-requisites, you'll
need git and cmake installed.
-_See [dependencies.md](dependencies.md) for more details regarding supported
+_See [dependencies.md](docs/dependencies.md) for more details regarding supported
versions of build tools._
```bash
# Check out the library.
$ git clone https://github.com/google/benchmark.git
-# Benchmark requires Google Test as a dependency. Add the source tree as a subdirectory.
-$ git clone https://github.com/google/googletest.git benchmark/googletest
# Go to the library root directory
$ cd benchmark
# Make a build directory to place the build output.
$ cmake -E make_directory "build"
-# Generate build system files with cmake.
-$ cmake -E chdir "build" cmake -DCMAKE_BUILD_TYPE=Release ../
+# Generate build system files with cmake, and download any dependencies.
+$ cmake -E chdir "build" cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=on -DCMAKE_BUILD_TYPE=Release ../
# or, starting with CMake 3.13, use a simpler form:
# cmake -DCMAKE_BUILD_TYPE=Release -S . -B "build"
# Build the library.
@@ -110,10 +112,10 @@ sudo cmake --build "build" --config Release --target install
Note that Google Benchmark requires Google Test to build and run the tests. This
dependency can be provided two ways:
-* Checkout the Google Test sources into `benchmark/googletest` as above.
+* Checkout the Google Test sources into `benchmark/googletest`.
* Otherwise, if `-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON` is specified during
- configuration, the library will automatically download and build any required
- dependencies.
+ configuration as above, the library will automatically download and build
+ any required dependencies.
If you do not wish to build and run the tests, add `-DBENCHMARK_ENABLE_GTEST_TESTS=OFF`
to `CMAKE_ARGS`.
@@ -136,6 +138,12 @@ cache variables, if autodetection fails.
If you are using clang, you may need to set `LLVMAR_EXECUTABLE`,
`LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables.
+To enable sanitizer checks (eg., `asan` and `tsan`), add:
+```
+ -DCMAKE_C_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=address -fsanitize=thread -fno-sanitize-recover=all"
+ -DCMAKE_CXX_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=address -fsanitize=thread -fno-sanitize-recover=all "
+```
+
### Stable and Experimental Library Versions
The main branch contains the latest stable version of the benchmarking library;
@@ -192,7 +200,7 @@ Alternatively, link against the `benchmark_main` library and remove
`BENCHMARK_MAIN();` above to get the same behavior.
The compiled executable will run all benchmarks by default. Pass the `--help`
-flag for option information or see the guide below.
+flag for option information or see the [User Guide](docs/user_guide.md).
### Usage with CMake
@@ -213,1111 +221,3 @@ Either way, link to the library as follows.
```cmake
target_link_libraries(MyTarget benchmark::benchmark)
```
-
-## Platform Specific Build Instructions
-
-### Building with GCC
-
-When the library is built using GCC it is necessary to link with the pthread
-library due to how GCC implements `std::thread`. Failing to link to pthread will
-lead to runtime exceptions (unless you're using libc++), not linker errors. See
-[issue #67](https://github.com/google/benchmark/issues/67) for more details. You
-can link to pthread by adding `-pthread` to your linker command. Note, you can
-also use `-lpthread`, but there are potential issues with ordering of command
-line parameters if you use that.
-
-### Building with Visual Studio 2015 or 2017
-
-The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
-
-```
-// Alternatively, can add libraries using linker options.
-#ifdef _WIN32
-#pragma comment ( lib, "Shlwapi.lib" )
-#ifdef _DEBUG
-#pragma comment ( lib, "benchmarkd.lib" )
-#else
-#pragma comment ( lib, "benchmark.lib" )
-#endif
-#endif
-```
-
-Can also use the graphical version of CMake:
-* Open `CMake GUI`.
-* Under `Where to build the binaries`, same path as source plus `build`.
-* Under `CMAKE_INSTALL_PREFIX`, same path as source plus `install`.
-* Click `Configure`, `Generate`, `Open Project`.
-* If build fails, try deleting entire directory and starting again, or unticking options to build less.
-
-### Building with Intel 2015 Update 1 or Intel System Studio Update 4
-
-See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel.
-
-### Building on Solaris
-
-If you're running benchmarks on solaris, you'll want the kstat library linked in
-too (`-lkstat`).
-
-## User Guide
-
-### Command Line
-
-[Output Formats](#output-formats)
-
-[Output Files](#output-files)
-
-[Running Benchmarks](#running-benchmarks)
-
-[Running a Subset of Benchmarks](#running-a-subset-of-benchmarks)
-
-[Result Comparison](#result-comparison)
-
-### Library
-
-[Runtime and Reporting Considerations](#runtime-and-reporting-considerations)
-
-[Passing Arguments](#passing-arguments)
-
-[Calculating Asymptotic Complexity](#asymptotic-complexity)
-
-[Templated Benchmarks](#templated-benchmarks)
-
-[Fixtures](#fixtures)
-
-[Custom Counters](#custom-counters)
-
-[Multithreaded Benchmarks](#multithreaded-benchmarks)
-
-[CPU Timers](#cpu-timers)
-
-[Manual Timing](#manual-timing)
-
-[Setting the Time Unit](#setting-the-time-unit)
-
-[Preventing Optimization](#preventing-optimization)
-
-[Reporting Statistics](#reporting-statistics)
-
-[Custom Statistics](#custom-statistics)
-
-[Using RegisterBenchmark](#using-register-benchmark)
-
-[Exiting with an Error](#exiting-with-an-error)
-
-[A Faster KeepRunning Loop](#a-faster-keep-running-loop)
-
-[Disabling CPU Frequency Scaling](#disabling-cpu-frequency-scaling)
-
-
-<a name="output-formats" />
-
-### Output Formats
-
-The library supports multiple output formats. Use the
-`--benchmark_format=<console|json|csv>` flag (or set the
-`BENCHMARK_FORMAT=<console|json|csv>` environment variable) to set
-the format type. `console` is the default format.
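-
-For example, to select JSON output (using `mybench` as a placeholder binary name):
-
-```bash
-./mybench --benchmark_format=json
-```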
-
-The Console format is intended to be a human-readable format. By default
-it generates color output. Context is output on stderr and the
-tabular data on stdout. Example tabular output looks like:
-
-```
-Benchmark Time(ns) CPU(ns) Iterations
-----------------------------------------------------------------------
-BM_SetInsert/1024/1 28928 29349 23853 133.097kB/s 33.2742k items/s
-BM_SetInsert/1024/8 32065 32913 21375 949.487kB/s 237.372k items/s
-BM_SetInsert/1024/10 33157 33648 21431 1.13369MB/s 290.225k items/s
-```
-
-The JSON format outputs human-readable JSON split into two top-level attributes.
-The `context` attribute contains information about the run in general, including
-information about the CPU and the date.
-The `benchmarks` attribute contains a list of every benchmark run. Example json
-output looks like:
-
-```json
-{
- "context": {
- "date": "2015/03/17-18:40:25",
- "num_cpus": 40,
- "mhz_per_cpu": 2801,
- "cpu_scaling_enabled": false,
- "build_type": "debug"
- },
- "benchmarks": [
- {
- "name": "BM_SetInsert/1024/1",
- "iterations": 94877,
- "real_time": 29275,
- "cpu_time": 29836,
- "bytes_per_second": 134066,
- "items_per_second": 33516
- },
- {
- "name": "BM_SetInsert/1024/8",
- "iterations": 21609,
- "real_time": 32317,
- "cpu_time": 32429,
- "bytes_per_second": 986770,
- "items_per_second": 246693
- },
- {
- "name": "BM_SetInsert/1024/10",
- "iterations": 21393,
- "real_time": 32724,
- "cpu_time": 33355,
- "bytes_per_second": 1199226,
- "items_per_second": 299807
- }
- ]
-}
-```
-
-The CSV format outputs comma-separated values. The `context` is output on stderr
-and the CSV itself on stdout. Example CSV output looks like:
-
-```
-name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label
-"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942,
-"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115,
-"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
-```
-
-<a name="output-files" />
-
-### Output Files
-
-Write benchmark results to a file with the `--benchmark_out=<filename>` option
-(or set `BENCHMARK_OUT`). Specify the output format with
-`--benchmark_out_format={json|console|csv}` (or set
-`BENCHMARK_OUT_FORMAT={json|console|csv}`). Note that specifying
-`--benchmark_out` does not suppress the console output.
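-
-For example (using `mybench` as a placeholder binary name):
-
-```bash
-# Write JSON results to results.json while still printing the console output.
-./mybench --benchmark_out=results.json --benchmark_out_format=json
-```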
-
-<a name="running-benchmarks" />
-
-### Running Benchmarks
-
-Benchmarks are executed by running the produced binaries. Benchmark binaries,
-by default, accept options that may be specified either through their command
-line interface or by setting environment variables before execution. For every
-`--option_flag=<value>` CLI switch, a corresponding environment variable
-`OPTION_FLAG=<value>` exists and is used as the default if set (CLI switches
-always prevail). A complete list of CLI options is available by running a
-benchmark with the `--help` switch.
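-
-For example, the following two invocations are equivalent (again using
-`mybench` as a placeholder binary name):
-
-```bash
-./mybench --benchmark_format=csv
-BENCHMARK_FORMAT=csv ./mybench
-```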
-
-<a name="running-a-subset-of-benchmarks" />
-
-### Running a Subset of Benchmarks
-
-The `--benchmark_filter=<regex>` option (or `BENCHMARK_FILTER=<regex>`
-environment variable) can be used to only run the benchmarks that match
-the specified `<regex>`. For example:
-
-```bash
-$ ./run_benchmarks.x --benchmark_filter=BM_memcpy/32
-Run on (1 X 2300 MHz CPU )
-2016-06-25 19:34:24
-Benchmark Time CPU Iterations
-----------------------------------------------------
-BM_memcpy/32 11 ns 11 ns 79545455
-BM_memcpy/32k 2181 ns 2185 ns 324074
-BM_memcpy/32 12 ns 12 ns 54687500
-BM_memcpy/32k 1834 ns 1837 ns 357143
-```
-
-<a name="result-comparison" />
-
-### Result Comparison
-
-It is possible to compare benchmarking results.
-See the [Additional Tooling Documentation](docs/tools.md) for details.
-
-<a name="runtime-and-reporting-considerations" />
-
-### Runtime and Reporting Considerations
-
-When the benchmark binary is executed, each benchmark function is run serially.
-The number of iterations to run is determined dynamically by running the
-benchmark a few times, measuring the time taken, and ensuring that the
-ultimate result will be statistically stable. As such, faster benchmark
-functions will be run for more iterations than slower benchmark functions, and
-the number of iterations is thus reported.
-
-In all cases, the number of iterations for which the benchmark is run is
-governed by the amount of time the benchmark takes. Concretely, the number of
-iterations is at least one and not more than 1e9; iterations are added until the
-CPU time exceeds the minimum time, or the wallclock time reaches 5x the minimum
-time. The minimum time is set per benchmark by calling `MinTime` on the
-registered benchmark object.
-
-Average timings are then reported over the iterations run. If multiple
-repetitions are requested using the `--benchmark_repetitions` command-line
-option, or at registration time, the benchmark function will be run several
-times and statistical results across these repetitions will also be reported.
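-
-For example, the following run repeats every benchmark ten times and reports
-the mean, median and standard deviation alongside the individual runs
-(`mybench` is a placeholder binary name):
-
-```bash
-./mybench --benchmark_repetitions=10
-```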
-
-As well as the per-benchmark entries, a preamble in the report will include
-information about the machine on which the benchmarks are run.
-
-<a name="passing-arguments" />
-
-### Passing Arguments
-
-Sometimes a family of benchmarks can be implemented with just one routine that
-takes an extra argument to specify which one of the family of benchmarks to
-run. For example, the following code defines a family of benchmarks for
-measuring the speed of `memcpy()` calls of different lengths:
-
-```c++
-static void BM_memcpy(benchmark::State& state) {
- char* src = new char[state.range(0)];
- char* dst = new char[state.range(0)];
- memset(src, 'x', state.range(0));
- for (auto _ : state)
- memcpy(dst, src, state.range(0));
- state.SetBytesProcessed(int64_t(state.iterations()) *
- int64_t(state.range(0)));
- delete[] src;
- delete[] dst;
-}
-BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
-```
-
-The preceding code is quite repetitive, and can be replaced with the following
-short-hand. The following invocation will pick a few appropriate arguments in
-the specified range and will generate a benchmark for each such argument.
-
-```c++
-BENCHMARK(BM_memcpy)->Range(8, 8<<10);
-```
-
-By default the arguments in the range are generated in multiples of eight and
-the command above selects [ 8, 64, 512, 4k, 8k ]. In the following code the
-range multiplier is changed to multiples of two.
-
-```c++
-BENCHMARK(BM_memcpy)->RangeMultiplier(2)->Range(8, 8<<10);
-```
-
-Now arguments generated are [ 8, 16, 32, 64, 128, 256, 512, 1024, 2k, 4k, 8k ].
-
-The preceding code shows a method of defining a sparse range. The following
-example shows a method of defining a dense range. It is then used to benchmark
-the performance of `std::vector` initialization for uniformly increasing sizes.
-
-```c++
-static void BM_DenseRange(benchmark::State& state) {
- for(auto _ : state) {
- std::vector<int> v(state.range(0), state.range(0));
- benchmark::DoNotOptimize(v.data());
- benchmark::ClobberMemory();
- }
-}
-BENCHMARK(BM_DenseRange)->DenseRange(0, 1024, 128);
-```
-
-Now arguments generated are [ 0, 128, 256, 384, 512, 640, 768, 896, 1024 ].
-
-You might have a benchmark that depends on two or more inputs. For example, the
-following code defines a family of benchmarks for measuring the speed of set
-insertion.
-
-```c++
-static void BM_SetInsert(benchmark::State& state) {
- std::set<int> data;
- for (auto _ : state) {
- state.PauseTiming();
- data = ConstructRandomSet(state.range(0));
- state.ResumeTiming();
- for (int j = 0; j < state.range(1); ++j)
- data.insert(RandomNumber());
- }
-}
-BENCHMARK(BM_SetInsert)
- ->Args({1<<10, 128})
- ->Args({2<<10, 128})
- ->Args({4<<10, 128})
- ->Args({8<<10, 128})
- ->Args({1<<10, 512})
- ->Args({2<<10, 512})
- ->Args({4<<10, 512})
- ->Args({8<<10, 512});
-```
-
-The preceding code is quite repetitive, and can be replaced with the following
-short-hand. The following macro will pick a few appropriate arguments in the
-product of the two specified ranges and will generate a benchmark for each such
-pair.
-
-```c++
-BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}});
-```
-
-Some benchmarks may require specific argument values that cannot be expressed
-with `Ranges`. In this case, `ArgsProduct` offers the ability to generate a
-benchmark input for each combination in the product of the supplied vectors.
-
-```c++
-BENCHMARK(BM_SetInsert)
- ->ArgsProduct({{1<<10, 3<<10, 8<<10}, {20, 40, 60, 80}})
-// would generate the same benchmark arguments as
-BENCHMARK(BM_SetInsert)
- ->Args({1<<10, 20})
- ->Args({3<<10, 20})
- ->Args({8<<10, 20})
- ->Args({3<<10, 40})
- ->Args({8<<10, 40})
- ->Args({1<<10, 40})
- ->Args({1<<10, 60})
- ->Args({3<<10, 60})
- ->Args({8<<10, 60})
- ->Args({1<<10, 80})
- ->Args({3<<10, 80})
- ->Args({8<<10, 80});
-```
-
-For more complex patterns of inputs, passing a custom function to `Apply` allows
-programmatic specification of an arbitrary set of arguments on which to run the
-benchmark. The following example enumerates a dense range on one parameter,
-and a sparse range on the second.
-
-```c++
-static void CustomArguments(benchmark::internal::Benchmark* b) {
- for (int i = 0; i <= 10; ++i)
- for (int j = 32; j <= 1024*1024; j *= 8)
- b->Args({i, j});
-}
-BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
-```
-
-#### Passing Arbitrary Arguments to a Benchmark
-
-In C++11 it is possible to define a benchmark that takes an arbitrary number
-of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
-macro creates a benchmark that invokes `func` with the `benchmark::State` as
-the first argument followed by the specified `args...`.
-The `test_case_name` is appended to the name of the benchmark and
-should describe the values passed.
-
-```c++
-template <class ...ExtraArgs>
-void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
- [...]
-}
-// Registers a benchmark named "BM_takes_args/int_string_test" that passes
-// the specified values to `extra_args`.
-BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
-```
-
-Note that elements of `...args` may refer to global variables. Users should
-avoid modifying global state inside of a benchmark.
-
-<a name="asymptotic-complexity" />
-
-### Calculating Asymptotic Complexity (Big O)
-
-Asymptotic complexity might be calculated for a family of benchmarks. The
-following code will calculate the coefficient for the high-order term in the
-running time and the normalized root-mean square error of string comparison.
-
-```c++
-static void BM_StringCompare(benchmark::State& state) {
- std::string s1(state.range(0), '-');
- std::string s2(state.range(0), '-');
- for (auto _ : state) {
- benchmark::DoNotOptimize(s1.compare(s2));
- }
- state.SetComplexityN(state.range(0));
-}
-BENCHMARK(BM_StringCompare)
- ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(benchmark::oN);
-```
-
-As shown in the following invocation, asymptotic complexity might also be
-calculated automatically.
-
-```c++
-BENCHMARK(BM_StringCompare)
- ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity();
-```
-
-The following code specifies the asymptotic complexity with a lambda function
-that can be used to customize the high-order term calculation.
-
-```c++
-BENCHMARK(BM_StringCompare)->RangeMultiplier(2)
- ->Range(1<<10, 1<<18)->Complexity([](benchmark::IterationCount n)->double{return n; });
-```
-
-<a name="templated-benchmarks" />
-
-### Templated Benchmarks
-
-This example produces and consumes messages of size `sizeof(v)` `state.range(0)`
-times. It also outputs throughput in the absence of multiprogramming.
-
-```c++
-template <class Q> void BM_Sequential(benchmark::State& state) {
- Q q;
- typename Q::value_type v;
- for (auto _ : state) {
- for (int i = state.range(0); i--; )
- q.push(v);
- for (int e = state.range(0); e--; )
- q.Wait(&v);
- }
- // actually messages, not bytes:
- state.SetBytesProcessed(
- static_cast<int64_t>(state.iterations())*state.range(0));
-}
-BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
-```
-
-Three macros are provided for adding benchmark templates.
-
-```c++
-#ifdef BENCHMARK_HAS_CXX11
-#define BENCHMARK_TEMPLATE(func, ...) // Takes any number of parameters.
-#else // C++ < C++11
-#define BENCHMARK_TEMPLATE(func, arg1)
-#endif
-#define BENCHMARK_TEMPLATE1(func, arg1)
-#define BENCHMARK_TEMPLATE2(func, arg1, arg2)
-```
-
-<a name="fixtures" />
-
-### Fixtures
-
-Fixture tests are created by first defining a type that derives from
-`::benchmark::Fixture` and then creating/registering the tests using the
-following macros:
-
-* `BENCHMARK_F(ClassName, Method)`
-* `BENCHMARK_DEFINE_F(ClassName, Method)`
-* `BENCHMARK_REGISTER_F(ClassName, Method)`
-
-For example:
-
-```c++
-class MyFixture : public benchmark::Fixture {
-public:
- void SetUp(const ::benchmark::State& state) {
- }
-
- void TearDown(const ::benchmark::State& state) {
- }
-};
-
-BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
- for (auto _ : st) {
- ...
- }
-}
-
-BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
- for (auto _ : st) {
- ...
- }
-}
-/* BarTest is NOT registered */
-BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
-/* BarTest is now registered */
-```
-
-#### Templated Fixtures
-
-You can also create templated fixtures by using the following macros:
-
-* `BENCHMARK_TEMPLATE_F(ClassName, Method, ...)`
-* `BENCHMARK_TEMPLATE_DEFINE_F(ClassName, Method, ...)`
-
-For example:
-
-```c++
-template<typename T>
-class MyFixture : public benchmark::Fixture {};
-
-BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) {
- for (auto _ : st) {
- ...
- }
-}
-
-BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) {
- for (auto _ : st) {
- ...
- }
-}
-
-BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2);
-```
-
-<a name="custom-counters" />
-
-### Custom Counters
-
-You can add your own counters with user-defined names. The example below
-will add columns "Foo", "Bar" and "Baz" in its output:
-
-```c++
-static void UserCountersExample1(benchmark::State& state) {
- double numFoos = 0, numBars = 0, numBazs = 0;
- for (auto _ : state) {
- // ... count Foo,Bar,Baz events
- }
- state.counters["Foo"] = numFoos;
- state.counters["Bar"] = numBars;
- state.counters["Baz"] = numBazs;
-}
-```
-
-The `state.counters` object is a `std::map` with `std::string` keys
-and `Counter` values. The latter is a `double`-like class, via an implicit
-conversion to `double&`. Thus you can use all of the standard arithmetic
-assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter.
-
-In multithreaded benchmarks, each counter is set on the calling thread only.
-When the benchmark finishes, the counters from each thread will be summed;
-the resulting sum is the value which will be shown for the benchmark.
-
-The `Counter` constructor accepts three parameters: the value as a `double`;
-a bit flag which allows you to show counters as rates, and/or as per-thread
-iteration, and/or as per-thread averages, and/or iteration invariants,
-and/or finally inverting the result; and a flag specifying the 'unit' - i.e.
-is 1k a 1000 (default, `benchmark::Counter::OneK::kIs1000`), or 1024
-(`benchmark::Counter::OneK::kIs1024`)?
-
-```c++
- // sets a simple counter
- state.counters["Foo"] = numFoos;
-
- // Set the counter as a rate. It will be presented divided
- // by the duration of the benchmark.
- // Meaning: per one second, how many 'foo's are processed?
- state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate);
-
- // Set the counter as a rate. It will be presented divided
- // by the duration of the benchmark, and the result inverted.
- // Meaning: how many seconds it takes to process one 'foo'?
- state.counters["FooInvRate"] = Counter(numFoos, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
-
- // Set the counter as a thread-average quantity. It will
- // be presented divided by the number of threads.
- state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads);
-
- // There's also a combined flag:
- state.counters["FooAvgRate"] = Counter(numFoos,benchmark::Counter::kAvgThreadsRate);
-
- // This says that we process with the rate of state.range(0) bytes every iteration:
- state.counters["BytesProcessed"] = Counter(state.range(0), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
-```
-
-When you're compiling in C++11 mode or later, you can use `insert()` with
-`std::initializer_list`:
-
-```c++
- // With C++11, this can be done:
- state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
- // ... instead of:
- state.counters["Foo"] = numFoos;
- state.counters["Bar"] = numBars;
- state.counters["Baz"] = numBazs;
-```
-
-#### Counter Reporting
-
-When using the console reporter, by default, user counters are printed at
-the end after the table, the same way as ``bytes_processed`` and
-``items_processed``. This is best for cases in which there are few counters,
-or where there are only a couple of lines per benchmark. Here's an example of
-the default output:
-
-```
-------------------------------------------------------------------------------
-Benchmark Time CPU Iterations UserCounters...
-------------------------------------------------------------------------------
-BM_UserCounter/threads:8 2248 ns 10277 ns 68808 Bar=16 Bat=40 Baz=24 Foo=8
-BM_UserCounter/threads:1 9797 ns 9788 ns 71523 Bar=2 Bat=5 Baz=3 Foo=1024m
-BM_UserCounter/threads:2 4924 ns 9842 ns 71036 Bar=4 Bat=10 Baz=6 Foo=2
-BM_UserCounter/threads:4 2589 ns 10284 ns 68012 Bar=8 Bat=20 Baz=12 Foo=4
-BM_UserCounter/threads:8 2212 ns 10287 ns 68040 Bar=16 Bat=40 Baz=24 Foo=8
-BM_UserCounter/threads:16 1782 ns 10278 ns 68144 Bar=32 Bat=80 Baz=48 Foo=16
-BM_UserCounter/threads:32 1291 ns 10296 ns 68256 Bar=64 Bat=160 Baz=96 Foo=32
-BM_UserCounter/threads:4 2615 ns 10307 ns 68040 Bar=8 Bat=20 Baz=12 Foo=4
-BM_Factorial 26 ns 26 ns 26608979 40320
-BM_Factorial/real_time 26 ns 26 ns 26587936 40320
-BM_CalculatePiRange/1 16 ns 16 ns 45704255 0
-BM_CalculatePiRange/8 73 ns 73 ns 9520927 3.28374
-BM_CalculatePiRange/64 609 ns 609 ns 1140647 3.15746
-BM_CalculatePiRange/512 4900 ns 4901 ns 142696 3.14355
-```
-
-If this doesn't suit you, you can print each counter as a table column by
-passing the flag `--benchmark_counters_tabular=true` to the benchmark
-application. This is best for cases in which there are a lot of counters, or
-a lot of lines per individual benchmark. Note that this will trigger a
-reprinting of the table header any time the counter set changes between
-individual benchmarks. Here's an example of corresponding output when
-`--benchmark_counters_tabular=true` is passed:
-
-```
----------------------------------------------------------------------------------------
-Benchmark Time CPU Iterations Bar Bat Baz Foo
----------------------------------------------------------------------------------------
-BM_UserCounter/threads:8 2198 ns 9953 ns 70688 16 40 24 8
-BM_UserCounter/threads:1 9504 ns 9504 ns 73787 2 5 3 1
-BM_UserCounter/threads:2 4775 ns 9550 ns 72606 4 10 6 2
-BM_UserCounter/threads:4 2508 ns 9951 ns 70332 8 20 12 4
-BM_UserCounter/threads:8 2055 ns 9933 ns 70344 16 40 24 8
-BM_UserCounter/threads:16 1610 ns 9946 ns 70720 32 80 48 16
-BM_UserCounter/threads:32 1192 ns 9948 ns 70496 64 160 96 32
-BM_UserCounter/threads:4 2506 ns 9949 ns 70332 8 20 12 4
---------------------------------------------------------------
-Benchmark Time CPU Iterations
---------------------------------------------------------------
-BM_Factorial 26 ns 26 ns 26392245 40320
-BM_Factorial/real_time 26 ns 26 ns 26494107 40320
-BM_CalculatePiRange/1 15 ns 15 ns 45571597 0
-BM_CalculatePiRange/8 74 ns 74 ns 9450212 3.28374
-BM_CalculatePiRange/64 595 ns 595 ns 1173901 3.15746
-BM_CalculatePiRange/512 4752 ns 4752 ns 147380 3.14355
-BM_CalculatePiRange/4k 37970 ns 37972 ns 18453 3.14184
-BM_CalculatePiRange/32k 303733 ns 303744 ns 2305 3.14162
-BM_CalculatePiRange/256k 2434095 ns 2434186 ns 288 3.1416
-BM_CalculatePiRange/1024k 9721140 ns 9721413 ns 71 3.14159
-BM_CalculatePi/threads:8 2255 ns 9943 ns 70936
-```
-
-Note above the additional header printed when the benchmark changes from
-``BM_UserCounter`` to ``BM_Factorial``. This is because ``BM_Factorial`` does
-not have the same counter set as ``BM_UserCounter``.
-
-<a name="multithreaded-benchmarks"/>
-
-### Multithreaded Benchmarks
-
-In a multithreaded test (benchmark invoked by multiple threads simultaneously),
-it is guaranteed that none of the threads will start until all have reached
-the start of the benchmark loop, and all will have finished before any thread
-exits the benchmark loop. (This behavior is also provided by the `KeepRunning()`
-API.) As such, any global setup or teardown can be wrapped in a check against
-the thread index:
-
-```c++
-static void BM_MultiThreaded(benchmark::State& state) {
- if (state.thread_index == 0) {
- // Setup code here.
- }
- for (auto _ : state) {
- // Run the test as normal.
- }
- if (state.thread_index == 0) {
- // Teardown code here.
- }
-}
-BENCHMARK(BM_MultiThreaded)->Threads(2);
-```
-
-If the benchmarked code itself uses threads and you want to compare it to
-single-threaded code, you may want to use real-time ("wallclock") measurements
-for latency comparisons:
-
-```c++
-BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();
-```
-
-Without `UseRealTime`, CPU time is used by default.
-
-<a name="cpu-timers" />
-
-### CPU Timers
-
-By default, the CPU timer only measures the time spent by the main thread.
-If the benchmark itself uses threads internally, this measurement may not
-be what you are looking for. Instead, there is a way to measure the total
-CPU usage of the process, by all the threads.
-
-```c++
-void callee(int i);
-
-static void MyMain(int size) {
-#pragma omp parallel for
- for(int i = 0; i < size; i++)
- callee(i);
-}
-
-static void BM_OpenMP(benchmark::State& state) {
- for (auto _ : state)
- MyMain(state.range(0));
-}
-
-// Measure the time spent by the main thread, use it to decide for how long to
-// run the benchmark loop. Depending on the internal implementation details, this
-// may measure anywhere from near-zero (the overhead spent before/after work
-// handoff to worker thread[s]) to the whole single-thread time.
-BENCHMARK(BM_OpenMP)->Range(8, 8<<10);
-
-// Measure the user-visible time, the wall clock (literally, the time that
-// has passed on the clock on the wall), use it to decide for how long to
-// run the benchmark loop. This will always be meaningful, and will match the
-// time spent by the main thread in the single-threaded case, in general decreasing
-// with the number of internal threads doing the work.
-BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->UseRealTime();
-
-// Measure the total CPU consumption, use it to decide for how long to
-// run the benchmark loop. This will always measure to no less than the
-// time spent by the main thread in the single-threaded case.
-BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime();
-
-// A mixture of the last two. Measure the total CPU consumption, but use the
-// wall clock to decide for how long to run the benchmark loop.
-BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime()->UseRealTime();
-```
-
-#### Controlling Timers
-
-Normally, the entire duration of the work loop (`for (auto _ : state) {}`)
-is measured. But sometimes it is necessary to do some work inside that loop,
-every iteration, without counting that time toward the benchmark time.
-That is possible, although it is not recommended, since it has high overhead.
-
-```c++
-static void BM_SetInsert_With_Timer_Control(benchmark::State& state) {
- std::set<int> data;
- for (auto _ : state) {
- state.PauseTiming(); // Stop timers. They will not count until they are resumed.
- data = ConstructRandomSet(state.range(0)); // Do something that should not be measured
- state.ResumeTiming(); // And resume timers. They are now counting again.
- // The rest will be measured.
- for (int j = 0; j < state.range(1); ++j)
- data.insert(RandomNumber());
- }
-}
-BENCHMARK(BM_SetInsert_With_Timer_Control)->Ranges({{1<<10, 8<<10}, {128, 512}});
-```
-
-<a name="manual-timing" />
-
-### Manual Timing
-
-For benchmarking something for which neither CPU time nor real-time are
-correct or accurate enough, completely manual timing is supported using
-the `UseManualTime` function.
-
-When `UseManualTime` is used, the benchmarked code must call
-`SetIterationTime` once per iteration of the benchmark loop to
-report the manually measured time.
-
-An example use case for this is benchmarking GPU execution (e.g. OpenCL
-or CUDA kernels, OpenGL or Vulkan or Direct3D draw calls), which cannot
-be accurately measured using CPU time or real-time. Instead, they can be
-measured accurately using a dedicated API, and these measurement results
-can be reported back with `SetIterationTime`.
-
-```c++
-static void BM_ManualTiming(benchmark::State& state) {
- int microseconds = state.range(0);
- std::chrono::duration<double, std::micro> sleep_duration {
- static_cast<double>(microseconds)
- };
-
- for (auto _ : state) {
- auto start = std::chrono::high_resolution_clock::now();
- // Simulate some useful workload with a sleep
- std::this_thread::sleep_for(sleep_duration);
- auto end = std::chrono::high_resolution_clock::now();
-
- auto elapsed_seconds =
- std::chrono::duration_cast<std::chrono::duration<double>>(
- end - start);
-
- state.SetIterationTime(elapsed_seconds.count());
- }
-}
-BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime();
-```
-
-<a name="setting-the-time-unit" />
-
-### Setting the Time Unit
-
-If a benchmark runs for a few milliseconds, it may be hard to visually compare
-the measured times, since the output data is given in nanoseconds by default.
-To set the time unit manually, specify it on the benchmark:
-
-```c++
-BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
-```
-
-<a name="preventing-optimization" />
-
-### Preventing Optimization
-
-To prevent a value or expression from being optimized away by the compiler,
-the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()`
-functions can be used.
-
-```c++
-static void BM_test(benchmark::State& state) {
- for (auto _ : state) {
- int x = 0;
- for (int i=0; i < 64; ++i) {
- benchmark::DoNotOptimize(x += i);
- }
- }
-}
-```
-
-`DoNotOptimize(<expr>)` forces the *result* of `<expr>` to be stored in either
-memory or a register. For GNU based compilers it acts as a read/write barrier
-for global memory. More specifically, it forces the compiler to flush pending
-writes to memory and reload any other values as necessary.
-
-Note that `DoNotOptimize(<expr>)` does not prevent optimizations on `<expr>`
-in any way. `<expr>` may even be removed entirely when the result is already
-known. For example:
-
-```c++
- /* Example 1: `<expr>` is removed entirely. */
- int foo(int x) { return x + 42; }
- while (...) DoNotOptimize(foo(0)); // Optimized to DoNotOptimize(42);
-
- /* Example 2: Result of '<expr>' is only reused */
- int bar(int) __attribute__((const));
- while (...) DoNotOptimize(bar(0)); // Optimized to:
- // int __result__ = bar(0);
- // while (...) DoNotOptimize(__result__);
-```
-
-The second tool for preventing optimizations is `ClobberMemory()`. In essence
-`ClobberMemory()` forces the compiler to perform all pending writes to global
-memory. Memory managed by block scope objects must be "escaped" using
-`DoNotOptimize(...)` before it can be clobbered. In the below example
-`ClobberMemory()` prevents the call to `v.push_back(42)` from being optimized
-away.
-
-```c++
-static void BM_vector_push_back(benchmark::State& state) {
- for (auto _ : state) {
- std::vector<int> v;
- v.reserve(1);
- benchmark::DoNotOptimize(v.data()); // Allow v.data() to be clobbered.
- v.push_back(42);
- benchmark::ClobberMemory(); // Force 42 to be written to memory.
- }
-}
-```
-
-Note that `ClobberMemory()` is only available for GNU or MSVC based compilers.
-
-<a name="reporting-statistics" />
-
-### Statistics: Reporting the Mean, Median and Standard Deviation of Repeated Benchmarks
-
-By default each benchmark is run once and that single result is reported.
-However, benchmarks are often noisy and a single result may not be representative
-of the overall behavior. For this reason it's possible to repeatedly rerun the
-benchmark.
-
-The number of runs of each benchmark is specified globally by the
-`--benchmark_repetitions` flag or on a per-benchmark basis by calling
-`Repetitions` on the registered benchmark object. When a benchmark is run more
-than once the mean, median and standard deviation of the runs will be reported.
-
-Additionally the `--benchmark_report_aggregates_only={true|false}`,
-`--benchmark_display_aggregates_only={true|false}` flags or
-`ReportAggregatesOnly(bool)`, `DisplayAggregatesOnly(bool)` functions can be
-used to change how repeated tests are reported. By default the result of each
-repeated run is reported. When the `report aggregates only` option is `true`,
-only the aggregates (i.e. mean, median and standard deviation, plus complexity
-measurements if they were requested) of the runs are reported, to both
-reporters - standard output (console) and the file.
-However, when only the `display aggregates only` option is `true`,
-only the aggregates are displayed in the standard output, while the file
-output still contains everything.
-Calling `ReportAggregatesOnly(bool)` / `DisplayAggregatesOnly(bool)` on a
-registered benchmark object overrides the value of the appropriate flag for that
-benchmark.
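-
-For example, the following invocation repeats each benchmark ten times and
-reports only the aggregates, both on the console and in any requested output
-file (`mybench` is a placeholder binary name):
-
-```bash
-./mybench --benchmark_repetitions=10 --benchmark_report_aggregates_only=true
-```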
-
-<a name="custom-statistics" />
-
-### Custom Statistics
-
-While having mean, median and standard deviation is nice, this may not be
-enough for everyone. For example, you may want to know what the largest
-observation is, e.g. because you have some real-time constraints. This is easy.
-The following code will specify a custom statistic to be calculated, defined
-by a lambda function.
-
-```c++
-void BM_spin_empty(benchmark::State& state) {
- for (auto _ : state) {
- for (int x = 0; x < state.range(0); ++x) {
- benchmark::DoNotOptimize(x);
- }
- }
-}
-
-BENCHMARK(BM_spin_empty)
- ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
- return *(std::max_element(std::begin(v), std::end(v)));
- })
- ->Arg(512);
-```
-
-<a name="using-register-benchmark" />
-
-### Using RegisterBenchmark(name, fn, args...)
-
-The `RegisterBenchmark(name, func, args...)` function provides an alternative
-way to create and register benchmarks.
-`RegisterBenchmark(name, func, args...)` creates, registers, and returns a
-pointer to a new benchmark with the specified `name` that invokes
-`func(st, args...)` where `st` is a `benchmark::State` object.
-
-Unlike the `BENCHMARK` registration macros, which can only be used at global
-scope, `RegisterBenchmark` can be called anywhere. This allows
-benchmark tests to be registered programmatically.
-
-Additionally, `RegisterBenchmark` allows any callable object to be registered
-as a benchmark, including capturing lambdas and function objects.
-
-For example:
-```c++
-auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ };
-
-int main(int argc, char** argv) {
- for (auto& test_input : { /* ... */ })
- benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input);
- benchmark::Initialize(&argc, argv);
- benchmark::RunSpecifiedBenchmarks();
-}
-```
-
-<a name="exiting-with-an-error" />
-
-### Exiting with an Error
-
-When errors caused by external influences, such as file I/O and network
-communication, occur within a benchmark, the
-`State::SkipWithError(const char* msg)` function can be used to skip that run
-of the benchmark and report the error. Note that only future iterations of the
-`KeepRunning()` loop are skipped. For the ranged-for version of the benchmark
-loop, users must explicitly exit the loop, otherwise all iterations will be
-performed. Users may explicitly return to exit the benchmark immediately.
-
-The `SkipWithError(...)` function may be used at any point within the benchmark,
-including before and after the benchmark loop. Moreover, if `SkipWithError(...)`
-has been used, it is not required to reach the benchmark loop and one may return
-from the benchmark function early.
-
-For example:
-
-```c++
-static void BM_test(benchmark::State& state) {
- auto resource = GetResource();
- if (!resource.good()) {
- state.SkipWithError("Resource is not good!");
- // KeepRunning() loop will not be entered.
- }
- while (state.KeepRunning()) {
- auto data = resource.read_data();
- if (!resource.good()) {
- state.SkipWithError("Failed to read data!");
- break; // Needed to skip the rest of the iteration.
- }
- do_stuff(data);
- }
-}
-
-static void BM_test_ranged_fo(benchmark::State & state) {
- auto resource = GetResource();
- if (!resource.good()) {
- state.SkipWithError("Resource is not good!");
- return; // Early return is allowed when SkipWithError() has been used.
- }
- for (auto _ : state) {
- auto data = resource.read_data();
- if (!resource.good()) {
- state.SkipWithError("Failed to read data!");
- break; // REQUIRED to prevent all further iterations.
- }
- do_stuff(data);
- }
-}
-```
-<a name="a-faster-keep-running-loop" />
-
-### A Faster KeepRunning Loop
-
-In C++11 mode, a range-based for loop should be used in preference to
-the `KeepRunning` loop for running the benchmarks. For example:
-
-```c++
-static void BM_Fast(benchmark::State &state) {
- for (auto _ : state) {
- FastOperation();
- }
-}
-BENCHMARK(BM_Fast);
-```
-
-The reason the ranged-for loop is faster than using `KeepRunning` is
-that `KeepRunning` requires a memory load and store of the iteration count
-every iteration, whereas the ranged-for variant is able to keep the iteration
-count in a register.
-
-For example, an empty inner loop using the range-based for method looks like:
-
-```asm
-# Loop Init
- mov rbx, qword ptr [r14 + 104]
- call benchmark::State::StartKeepRunning()
- test rbx, rbx
- je .LoopEnd
-.LoopHeader: # =>This Inner Loop Header: Depth=1
- add rbx, -1
- jne .LoopHeader
-.LoopEnd:
-```
-
-Compared to an empty `KeepRunning` loop, which looks like:
-
-```asm
-.LoopHeader: # in Loop: Header=BB0_3 Depth=1
- cmp byte ptr [rbx], 1
- jne .LoopInit
-.LoopBody: # =>This Inner Loop Header: Depth=1
- mov rax, qword ptr [rbx + 8]
- lea rcx, [rax + 1]
- mov qword ptr [rbx + 8], rcx
- cmp rax, qword ptr [rbx + 104]
- jb .LoopHeader
- jmp .LoopEnd
-.LoopInit:
- mov rdi, rbx
- call benchmark::State::StartKeepRunning()
- jmp .LoopBody
-.LoopEnd:
-```
-
-Unless C++03 compatibility is required, the ranged-for variant of writing
-the benchmark loop should be preferred.
-
-<a name="disabling-cpu-frequency-scaling" />
-
-### Disabling CPU Frequency Scaling
-
-If you see this error:
-
-```
-***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
-```
-
-you might want to disable the CPU frequency scaling while running the benchmark:
-
-```bash
-sudo cpupower frequency-set --governor performance
-./mybench
-sudo cpupower frequency-set --governor powersave
-```
diff --git a/WORKSPACE b/WORKSPACE
index 631f3ba..833590f 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,51 +1,22 @@
workspace(name = "com_github_google_benchmark")
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("//:bazel/benchmark_deps.bzl", "benchmark_deps")
-http_archive(
- name = "rules_cc",
- strip_prefix = "rules_cc-a508235df92e71d537fcbae0c7c952ea6957a912",
- urls = ["https://github.com/bazelbuild/rules_cc/archive/a508235df92e71d537fcbae0c7c952ea6957a912.zip"],
- sha256 = "d7dc12c1d5bc1a87474de8e3d17b7731a4dcebcfb8aa3990fe8ac7734ef12f2f",
-)
+benchmark_deps()
-http_archive(
- name = "com_google_absl",
- sha256 = "f41868f7a938605c92936230081175d1eae87f6ea2c248f41077c8f88316f111",
- strip_prefix = "abseil-cpp-20200225.2",
- urls = ["https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz"],
-)
+load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies")
-http_archive(
- name = "com_google_googletest",
- strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e",
- urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"],
- sha256 = "8f827dd550db8b4fdf73904690df0be9fccc161017c9038a724bc9a0617a1bc8",
-)
+rules_foreign_cc_dependencies()
-http_archive(
- name = "pybind11",
- build_file = "@//bindings/python:pybind11.BUILD",
- sha256 = "1eed57bc6863190e35637290f97a20c81cfe4d9090ac0a24f3bbf08f265eb71d",
- strip_prefix = "pybind11-2.4.3",
- urls = ["https://github.com/pybind/pybind11/archive/v2.4.3.tar.gz"],
+load("@rules_python//python:pip.bzl", pip3_install="pip_install")
+
+pip3_install(
+ name = "tools_pip_deps",
+ requirements = "//tools:requirements.txt",
)
new_local_repository(
name = "python_headers",
build_file = "@//bindings/python:python_headers.BUILD",
- path = "/usr/include/python3.6", # May be overwritten by setup.py.
-)
-
-http_archive(
- name = "rules_python",
- url = "https://github.com/bazelbuild/rules_python/releases/download/0.1.0/rules_python-0.1.0.tar.gz",
- sha256 = "b6d46438523a3ec0f3cead544190ee13223a52f6a6765a29eae7b7cc24cc83a0",
-)
-
-load("@rules_python//python:pip.bzl", pip3_install="pip_install")
-
-pip3_install(
- name = "py_deps",
- requirements = "//:requirements.txt",
+ path = "<PYTHON_INCLUDE_PATH>", # May be overwritten by setup.py.
)
diff --git a/WORKSPACE.bzlmod b/WORKSPACE.bzlmod
new file mode 100644
index 0000000..9526376
--- /dev/null
+++ b/WORKSPACE.bzlmod
@@ -0,0 +1,2 @@
+# This file marks the root of the Bazel workspace.
+# See MODULE.bazel for dependencies and setup.
diff --git a/_config.yml b/_config.yml
index 1885487..1fa5ff8 100644
--- a/_config.yml
+++ b/_config.yml
@@ -1 +1,2 @@
-theme: jekyll-theme-midnight
\ No newline at end of file
+theme: jekyll-theme-midnight
+markdown: GFM
diff --git a/bazel/benchmark_deps.bzl b/bazel/benchmark_deps.bzl
new file mode 100644
index 0000000..667065f
--- /dev/null
+++ b/bazel/benchmark_deps.bzl
@@ -0,0 +1,65 @@
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "new_git_repository")
+
+def benchmark_deps():
+ """Loads dependencies required to build Google Benchmark."""
+
+ if "bazel_skylib" not in native.existing_rules():
+ http_archive(
+ name = "bazel_skylib",
+ sha256 = "f7be3474d42aae265405a592bb7da8e171919d74c16f082a5457840f06054728",
+ urls = [
+ "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
+ "https://github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
+ ],
+ )
+
+ if "rules_foreign_cc" not in native.existing_rules():
+ http_archive(
+ name = "rules_foreign_cc",
+ sha256 = "bcd0c5f46a49b85b384906daae41d277b3dc0ff27c7c752cc51e43048a58ec83",
+ strip_prefix = "rules_foreign_cc-0.7.1",
+ url = "https://github.com/bazelbuild/rules_foreign_cc/archive/0.7.1.tar.gz",
+ )
+
+ if "rules_python" not in native.existing_rules():
+ http_archive(
+ name = "rules_python",
+ url = "https://github.com/bazelbuild/rules_python/releases/download/0.1.0/rules_python-0.1.0.tar.gz",
+ sha256 = "b6d46438523a3ec0f3cead544190ee13223a52f6a6765a29eae7b7cc24cc83a0",
+ )
+
+ if "com_google_absl" not in native.existing_rules():
+ http_archive(
+ name = "com_google_absl",
+ sha256 = "f41868f7a938605c92936230081175d1eae87f6ea2c248f41077c8f88316f111",
+ strip_prefix = "abseil-cpp-20200225.2",
+ urls = ["https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz"],
+ )
+
+ if "com_google_googletest" not in native.existing_rules():
+ new_git_repository(
+ name = "com_google_googletest",
+ remote = "https://github.com/google/googletest.git",
+ tag = "release-1.11.0",
+ )
+
+ if "nanobind" not in native.existing_rules():
+ new_git_repository(
+ name = "nanobind",
+ remote = "https://github.com/wjakob/nanobind.git",
+ tag = "v1.4.0",
+ build_file = "@//bindings/python:nanobind.BUILD",
+ recursive_init_submodules = True,
+ )
+
+ if "libpfm" not in native.existing_rules():
+ # Downloaded from v4.9.0 tag at https://sourceforge.net/p/perfmon2/libpfm4/ref/master/tags/
+ http_archive(
+ name = "libpfm",
+ build_file = str(Label("//tools:libpfm.BUILD.bazel")),
+ sha256 = "5da5f8872bde14b3634c9688d980f68bda28b510268723cc12973eedbab9fecc",
+ type = "tar.gz",
+ strip_prefix = "libpfm-4.11.0",
+ urls = ["https://sourceforge.net/projects/perfmon2/files/libpfm4/libpfm-4.11.0.tar.gz/download"],
+ )
diff --git a/bindings/python/build_defs.bzl b/bindings/python/build_defs.bzl
index 45907aa..009820a 100644
--- a/bindings/python/build_defs.bzl
+++ b/bindings/python/build_defs.bzl
@@ -8,8 +8,8 @@ def py_extension(name, srcs, hdrs = [], copts = [], features = [], deps = []):
shared_lib_name = name + shared_lib_suffix
native.cc_binary(
name = shared_lib_name,
- linkshared = 1,
- linkstatic = 1,
+ linkshared = True,
+ linkstatic = True,
srcs = srcs + hdrs,
copts = copts,
features = features,
diff --git a/bindings/python/google_benchmark/BUILD b/bindings/python/google_benchmark/BUILD
index 3c1561f..89ec76e 100644
--- a/bindings/python/google_benchmark/BUILD
+++ b/bindings/python/google_benchmark/BUILD
@@ -6,7 +6,6 @@ py_library(
visibility = ["//visibility:public"],
deps = [
":_benchmark",
- # pip; absl:app
],
)
@@ -17,10 +16,13 @@ py_extension(
"-fexceptions",
"-fno-strict-aliasing",
],
- features = ["-use_header_modules"],
+ features = [
+ "-use_header_modules",
+ "-parse_headers",
+ ],
deps = [
"//:benchmark",
- "@pybind11",
+ "@nanobind",
"@python_headers",
],
)
diff --git a/bindings/python/google_benchmark/__init__.py b/bindings/python/google_benchmark/__init__.py
index f31285e..642d78a 100644
--- a/bindings/python/google_benchmark/__init__.py
+++ b/bindings/python/google_benchmark/__init__.py
@@ -26,6 +26,7 @@ Example usage:
if __name__ == '__main__':
benchmark.main()
"""
+import atexit
from absl import app
from google_benchmark import _benchmark
@@ -44,6 +45,7 @@ from google_benchmark._benchmark import (
oNLogN,
oAuto,
oLambda,
+ State,
)
@@ -64,9 +66,10 @@ __all__ = [
"oNLogN",
"oAuto",
"oLambda",
+ "State",
]
-__version__ = "0.2.0"
+__version__ = "1.8.3"
class __OptionMaker:
@@ -101,7 +104,7 @@ class __OptionMaker:
options = self.make(func_or_options)
options.builder_calls.append((builder_name, args, kwargs))
# The decorator returns Options so it is not technically a decorator
- # and needs a final call to @regiser
+ # and needs a final call to @register
return options
return __decorator
@@ -110,7 +113,7 @@ class __OptionMaker:
# Alias for nicer API.
-# We have to instanciate an object, even if stateless, to be able to use __getattr__
+# We have to instantiate an object, even if stateless, to be able to use __getattr__
# on option.range
option = __OptionMaker()
@@ -156,3 +159,4 @@ def main(argv=None):
# Methods for use with custom main function.
initialize = _benchmark.Initialize
run_benchmarks = _benchmark.RunSpecifiedBenchmarks
+atexit.register(_benchmark.ClearRegisteredBenchmarks)
diff --git a/bindings/python/google_benchmark/benchmark.cc b/bindings/python/google_benchmark/benchmark.cc
index d80816e..f444769 100644
--- a/bindings/python/google_benchmark/benchmark.cc
+++ b/bindings/python/google_benchmark/benchmark.cc
@@ -1,20 +1,17 @@
// Benchmark for Python.
-#include <map>
-#include <string>
-#include <vector>
-
-#include "pybind11/operators.h"
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-#include "pybind11/stl_bind.h"
-
#include "benchmark/benchmark.h"
-PYBIND11_MAKE_OPAQUE(benchmark::UserCounters);
+#include "nanobind/nanobind.h"
+#include "nanobind/operators.h"
+#include "nanobind/stl/bind_map.h"
+#include "nanobind/stl/string.h"
+#include "nanobind/stl/vector.h"
+
+NB_MAKE_OPAQUE(benchmark::UserCounters);
namespace {
-namespace py = ::pybind11;
+namespace nb = nanobind;
std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
// The `argv` pointers here become invalid when this function returns, but
@@ -37,15 +34,16 @@ std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
return remaining_argv;
}
-benchmark::internal::Benchmark* RegisterBenchmark(const char* name,
- py::function f) {
+benchmark::internal::Benchmark* RegisterBenchmark(const std::string& name,
+ nb::callable f) {
return benchmark::RegisterBenchmark(
name, [f](benchmark::State& state) { f(&state); });
}
-PYBIND11_MODULE(_benchmark, m) {
+NB_MODULE(_benchmark, m) {
+
using benchmark::TimeUnit;
- py::enum_<TimeUnit>(m, "TimeUnit")
+ nb::enum_<TimeUnit>(m, "TimeUnit")
.value("kNanosecond", TimeUnit::kNanosecond)
.value("kMicrosecond", TimeUnit::kMicrosecond)
.value("kMillisecond", TimeUnit::kMillisecond)
@@ -53,72 +51,74 @@ PYBIND11_MODULE(_benchmark, m) {
.export_values();
using benchmark::BigO;
- py::enum_<BigO>(m, "BigO")
+ nb::enum_<BigO>(m, "BigO")
.value("oNone", BigO::oNone)
.value("o1", BigO::o1)
.value("oN", BigO::oN)
.value("oNSquared", BigO::oNSquared)
.value("oNCubed", BigO::oNCubed)
.value("oLogN", BigO::oLogN)
- .value("oNLogN", BigO::oLogN)
+ .value("oNLogN", BigO::oNLogN)
.value("oAuto", BigO::oAuto)
.value("oLambda", BigO::oLambda)
.export_values();
using benchmark::internal::Benchmark;
- py::class_<Benchmark>(m, "Benchmark")
- // For methods returning a pointer tor the current object, reference
- // return policy is used to ask pybind not to take ownership oof the
+ nb::class_<Benchmark>(m, "Benchmark")
+ // For methods returning a pointer to the current object, reference
+ // return policy is used to ask nanobind not to take ownership of the
// returned object and avoid calling delete on it.
// https://pybind11.readthedocs.io/en/stable/advanced/functions.html#return-value-policies
//
// For methods taking a const std::vector<...>&, a copy is created
// because a it is bound to a Python list.
// https://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html
- .def("unit", &Benchmark::Unit, py::return_value_policy::reference)
- .def("arg", &Benchmark::Arg, py::return_value_policy::reference)
- .def("args", &Benchmark::Args, py::return_value_policy::reference)
- .def("range", &Benchmark::Range, py::return_value_policy::reference,
- py::arg("start"), py::arg("limit"))
+ .def("unit", &Benchmark::Unit, nb::rv_policy::reference)
+ .def("arg", &Benchmark::Arg, nb::rv_policy::reference)
+ .def("args", &Benchmark::Args, nb::rv_policy::reference)
+ .def("range", &Benchmark::Range, nb::rv_policy::reference,
+ nb::arg("start"), nb::arg("limit"))
.def("dense_range", &Benchmark::DenseRange,
- py::return_value_policy::reference, py::arg("start"),
- py::arg("limit"), py::arg("step") = 1)
- .def("ranges", &Benchmark::Ranges, py::return_value_policy::reference)
+ nb::rv_policy::reference, nb::arg("start"),
+ nb::arg("limit"), nb::arg("step") = 1)
+ .def("ranges", &Benchmark::Ranges, nb::rv_policy::reference)
.def("args_product", &Benchmark::ArgsProduct,
- py::return_value_policy::reference)
- .def("arg_name", &Benchmark::ArgName, py::return_value_policy::reference)
+ nb::rv_policy::reference)
+ .def("arg_name", &Benchmark::ArgName, nb::rv_policy::reference)
.def("arg_names", &Benchmark::ArgNames,
- py::return_value_policy::reference)
+ nb::rv_policy::reference)
.def("range_pair", &Benchmark::RangePair,
- py::return_value_policy::reference, py::arg("lo1"), py::arg("hi1"),
- py::arg("lo2"), py::arg("hi2"))
+ nb::rv_policy::reference, nb::arg("lo1"), nb::arg("hi1"),
+ nb::arg("lo2"), nb::arg("hi2"))
.def("range_multiplier", &Benchmark::RangeMultiplier,
- py::return_value_policy::reference)
- .def("min_time", &Benchmark::MinTime, py::return_value_policy::reference)
+ nb::rv_policy::reference)
+ .def("min_time", &Benchmark::MinTime, nb::rv_policy::reference)
+ .def("min_warmup_time", &Benchmark::MinWarmUpTime,
+ nb::rv_policy::reference)
.def("iterations", &Benchmark::Iterations,
- py::return_value_policy::reference)
+ nb::rv_policy::reference)
.def("repetitions", &Benchmark::Repetitions,
- py::return_value_policy::reference)
+ nb::rv_policy::reference)
.def("report_aggregates_only", &Benchmark::ReportAggregatesOnly,
- py::return_value_policy::reference, py::arg("value") = true)
+ nb::rv_policy::reference, nb::arg("value") = true)
.def("display_aggregates_only", &Benchmark::DisplayAggregatesOnly,
- py::return_value_policy::reference, py::arg("value") = true)
+ nb::rv_policy::reference, nb::arg("value") = true)
.def("measure_process_cpu_time", &Benchmark::MeasureProcessCPUTime,
- py::return_value_policy::reference)
+ nb::rv_policy::reference)
.def("use_real_time", &Benchmark::UseRealTime,
- py::return_value_policy::reference)
+ nb::rv_policy::reference)
.def("use_manual_time", &Benchmark::UseManualTime,
- py::return_value_policy::reference)
+ nb::rv_policy::reference)
.def(
"complexity",
(Benchmark * (Benchmark::*)(benchmark::BigO)) & Benchmark::Complexity,
- py::return_value_policy::reference,
- py::arg("complexity") = benchmark::oAuto);
+ nb::rv_policy::reference,
+ nb::arg("complexity") = benchmark::oAuto);
using benchmark::Counter;
- py::class_<Counter> py_counter(m, "Counter");
+ nb::class_<Counter> py_counter(m, "Counter");
- py::enum_<Counter::Flags>(py_counter, "Flags")
+ nb::enum_<Counter::Flags>(py_counter, "Flags")
.value("kDefaults", Counter::Flags::kDefaults)
.value("kIsRate", Counter::Flags::kIsRate)
.value("kAvgThreads", Counter::Flags::kAvgThreads)
@@ -130,52 +130,55 @@ PYBIND11_MODULE(_benchmark, m) {
.value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate)
.value("kInvert", Counter::Flags::kInvert)
.export_values()
- .def(py::self | py::self);
+ .def(nb::self | nb::self);
- py::enum_<Counter::OneK>(py_counter, "OneK")
+ nb::enum_<Counter::OneK>(py_counter, "OneK")
.value("kIs1000", Counter::OneK::kIs1000)
.value("kIs1024", Counter::OneK::kIs1024)
.export_values();
py_counter
- .def(py::init<double, Counter::Flags, Counter::OneK>(),
- py::arg("value") = 0., py::arg("flags") = Counter::kDefaults,
- py::arg("k") = Counter::kIs1000)
- .def(py::init([](double value) { return Counter(value); }))
- .def_readwrite("value", &Counter::value)
- .def_readwrite("flags", &Counter::flags)
- .def_readwrite("oneK", &Counter::oneK);
- py::implicitly_convertible<py::float_, Counter>();
- py::implicitly_convertible<py::int_, Counter>();
-
- py::bind_map<benchmark::UserCounters>(m, "UserCounters");
+ .def(nb::init<double, Counter::Flags, Counter::OneK>(),
+ nb::arg("value") = 0., nb::arg("flags") = Counter::kDefaults,
+ nb::arg("k") = Counter::kIs1000)
+ .def("__init__", ([](Counter *c, double value) { new (c) Counter(value); }))
+ .def_rw("value", &Counter::value)
+ .def_rw("flags", &Counter::flags)
+ .def_rw("oneK", &Counter::oneK)
+ .def(nb::init_implicit<double>());
+
+ nb::implicitly_convertible<nb::int_, Counter>();
+
+ nb::bind_map<benchmark::UserCounters>(m, "UserCounters");
using benchmark::State;
- py::class_<State>(m, "State")
+ nb::class_<State>(m, "State")
.def("__bool__", &State::KeepRunning)
- .def_property_readonly("keep_running", &State::KeepRunning)
+ .def_prop_ro("keep_running", &State::KeepRunning)
.def("pause_timing", &State::PauseTiming)
.def("resume_timing", &State::ResumeTiming)
.def("skip_with_error", &State::SkipWithError)
- .def_property_readonly("error_occured", &State::error_occurred)
+ .def_prop_ro("error_occurred", &State::error_occurred)
.def("set_iteration_time", &State::SetIterationTime)
- .def_property("bytes_processed", &State::bytes_processed,
+ .def_prop_rw("bytes_processed", &State::bytes_processed,
&State::SetBytesProcessed)
- .def_property("complexity_n", &State::complexity_length_n,
+ .def_prop_rw("complexity_n", &State::complexity_length_n,
&State::SetComplexityN)
- .def_property("items_processed", &State::items_processed,
- &State::SetItemsProcessed)
- .def("set_label", (void (State::*)(const char*)) & State::SetLabel)
- .def("range", &State::range, py::arg("pos") = 0)
- .def_property_readonly("iterations", &State::iterations)
- .def_readwrite("counters", &State::counters)
- .def_readonly("thread_index", &State::thread_index)
- .def_readonly("threads", &State::threads);
+ .def_prop_rw("items_processed", &State::items_processed,
+ &State::SetItemsProcessed)
+ .def("set_label", &State::SetLabel)
+ .def("range", &State::range, nb::arg("pos") = 0)
+ .def_prop_ro("iterations", &State::iterations)
+ .def_prop_ro("name", &State::name)
+ .def_rw("counters", &State::counters)
+ .def_prop_ro("thread_index", &State::thread_index)
+ .def_prop_ro("threads", &State::threads);
m.def("Initialize", Initialize);
m.def("RegisterBenchmark", RegisterBenchmark,
- py::return_value_policy::reference);
+ nb::rv_policy::reference);
m.def("RunSpecifiedBenchmarks",
[]() { benchmark::RunSpecifiedBenchmarks(); });
+ m.def("ClearRegisteredBenchmarks", benchmark::ClearRegisteredBenchmarks);
};
} // namespace
diff --git a/bindings/python/google_benchmark/example.py b/bindings/python/google_benchmark/example.py
index 9134e8c..d95a043 100644
--- a/bindings/python/google_benchmark/example.py
+++ b/bindings/python/google_benchmark/example.py
@@ -72,7 +72,7 @@ def manual_timing(state):
@benchmark.register
def custom_counters(state):
- """Collect cutom metric using benchmark.Counter."""
+ """Collect custom metric using benchmark.Counter."""
num_foo = 0.0
while state:
# Benchmark some code here
@@ -102,7 +102,7 @@ def with_options(state):
@benchmark.register(name="sum_million_microseconds")
@benchmark.option.unit(benchmark.kMicrosecond)
-def with_options(state):
+def with_options2(state):
while state:
sum(range(1_000_000))
diff --git a/bindings/python/nanobind.BUILD b/bindings/python/nanobind.BUILD
new file mode 100644
index 0000000..cd9faf9
--- /dev/null
+++ b/bindings/python/nanobind.BUILD
@@ -0,0 +1,17 @@
+cc_library(
+ name = "nanobind",
+ srcs = glob([
+ "src/*.cpp"
+ ]),
+ copts = ["-fexceptions"],
+ includes = ["include", "ext/robin_map/include"],
+ textual_hdrs = glob(
+ [
+ "include/**/*.h",
+ "src/*.h",
+ "ext/robin_map/include/tsl/*.h",
+ ],
+ ),
+ deps = ["@python_headers"],
+ visibility = ["//visibility:public"],
+)
diff --git a/bindings/python/pybind11.BUILD b/bindings/python/pybind11.BUILD
deleted file mode 100644
index bc83350..0000000
--- a/bindings/python/pybind11.BUILD
+++ /dev/null
@@ -1,20 +0,0 @@
-cc_library(
- name = "pybind11",
- hdrs = glob(
- include = [
- "include/pybind11/*.h",
- "include/pybind11/detail/*.h",
- ],
- exclude = [
- "include/pybind11/common.h",
- "include/pybind11/eigen.h",
- ],
- ),
- copts = [
- "-fexceptions",
- "-Wno-undefined-inline",
- "-Wno-pragma-once-outside-header",
- ],
- includes = ["include"],
- visibility = ["//visibility:public"],
-)
diff --git a/bindings/python/requirements.txt b/bindings/python/requirements.txt
deleted file mode 100644
index f5bbe7e..0000000
--- a/bindings/python/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-absl-py>=0.7.1
-
diff --git a/cmake/AddCXXCompilerFlag.cmake b/cmake/AddCXXCompilerFlag.cmake
index d0d2099..858589e 100644
--- a/cmake/AddCXXCompilerFlag.cmake
+++ b/cmake/AddCXXCompilerFlag.cmake
@@ -34,9 +34,11 @@ function(add_cxx_compiler_flag FLAG)
check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
if(${MANGLED_FLAG})
- set(VARIANT ${ARGV1})
- if(ARGV1)
+ if(ARGC GREATER 1)
+ set(VARIANT ${ARGV1})
string(TOUPPER "_${VARIANT}" VARIANT)
+ else()
+ set(VARIANT "")
endif()
set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${BENCHMARK_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
endif()
@@ -49,9 +51,11 @@ function(add_required_cxx_compiler_flag FLAG)
check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
if(${MANGLED_FLAG})
- set(VARIANT ${ARGV1})
- if(ARGV1)
+ if(ARGC GREATER 1)
+ set(VARIANT ${ARGV1})
string(TOUPPER "_${VARIANT}" VARIANT)
+ else()
+ set(VARIANT "")
endif()
set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
diff --git a/cmake/CXXFeatureCheck.cmake b/cmake/CXXFeatureCheck.cmake
index 62e6741..e514826 100644
--- a/cmake/CXXFeatureCheck.cmake
+++ b/cmake/CXXFeatureCheck.cmake
@@ -17,6 +17,8 @@ if(__cxx_feature_check)
endif()
set(__cxx_feature_check INCLUDED)
+option(CXXFEATURECHECK_DEBUG OFF)
+
function(cxx_feature_check FILE)
string(TOLOWER ${FILE} FILE)
string(TOUPPER ${FILE} VAR)
@@ -27,18 +29,22 @@ function(cxx_feature_check FILE)
return()
endif()
+ set(FEATURE_CHECK_CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
if (ARGC GREATER 1)
message(STATUS "Enabling additional flags: ${ARGV1}")
- list(APPEND BENCHMARK_CXX_LINKER_FLAGS ${ARGV1})
+ list(APPEND FEATURE_CHECK_CMAKE_FLAGS ${ARGV1})
endif()
if (NOT DEFINED COMPILE_${FEATURE})
- message(STATUS "Performing Test ${FEATURE}")
if(CMAKE_CROSSCOMPILING)
+ message(STATUS "Cross-compiling to test ${FEATURE}")
try_compile(COMPILE_${FEATURE}
${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
- CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
- LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+ CXX_STANDARD 11
+ CXX_STANDARD_REQUIRED ON
+ CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
+ LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
+ OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
if(COMPILE_${FEATURE})
message(WARNING
"If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0")
@@ -47,11 +53,14 @@ function(cxx_feature_check FILE)
set(RUN_${FEATURE} 1 CACHE INTERNAL "")
endif()
else()
- message(STATUS "Performing Test ${FEATURE}")
+ message(STATUS "Compiling and running to test ${FEATURE}")
try_run(RUN_${FEATURE} COMPILE_${FEATURE}
${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
- CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
- LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+ CXX_STANDARD 11
+ CXX_STANDARD_REQUIRED ON
+ CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
+ LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
+ COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
endif()
endif()
@@ -61,7 +70,11 @@ function(cxx_feature_check FILE)
add_definitions(-DHAVE_${VAR})
else()
if(NOT COMPILE_${FEATURE})
- message(STATUS "Performing Test ${FEATURE} -- failed to compile")
+ if(CXXFEATURECHECK_DEBUG)
+ message(STATUS "Performing Test ${FEATURE} -- failed to compile: ${COMPILE_OUTPUT_VAR}")
+ else()
+ message(STATUS "Performing Test ${FEATURE} -- failed to compile")
+ endif()
else()
message(STATUS "Performing Test ${FEATURE} -- compiled but failed to run")
endif()
diff --git a/cmake/Config.cmake.in b/cmake/Config.cmake.in
index 6e9256e..2e15f0c 100644
--- a/cmake/Config.cmake.in
+++ b/cmake/Config.cmake.in
@@ -1 +1,7 @@
+@PACKAGE_INIT@
+
+include (CMakeFindDependencyMacro)
+
+find_dependency (Threads)
+
include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
diff --git a/cmake/GetGitVersion.cmake b/cmake/GetGitVersion.cmake
index 4f10f22..04a1f9b 100644
--- a/cmake/GetGitVersion.cmake
+++ b/cmake/GetGitVersion.cmake
@@ -20,16 +20,20 @@ set(__get_git_version INCLUDED)
function(get_git_version var)
if(GIT_EXECUTABLE)
- execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
+ execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
RESULT_VARIABLE status
- OUTPUT_VARIABLE GIT_VERSION
+ OUTPUT_VARIABLE GIT_DESCRIBE_VERSION
ERROR_QUIET)
- if(${status})
- set(GIT_VERSION "v0.0.0")
+ if(status)
+ set(GIT_DESCRIBE_VERSION "v0.0.0")
+ endif()
+
+ string(STRIP ${GIT_DESCRIBE_VERSION} GIT_DESCRIBE_VERSION)
+ if(GIT_DESCRIBE_VERSION MATCHES v[^-]*-)
+ string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2" GIT_VERSION ${GIT_DESCRIBE_VERSION})
else()
- string(STRIP ${GIT_VERSION} GIT_VERSION)
- string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION})
+ string(REGEX REPLACE "v(.*)" "\\1" GIT_VERSION ${GIT_DESCRIBE_VERSION})
endif()
# Work out if the repository is dirty
@@ -43,12 +47,12 @@ function(get_git_version var)
ERROR_QUIET)
string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
if (${GIT_DIRTY})
- set(GIT_VERSION "${GIT_VERSION}-dirty")
+ set(GIT_DESCRIBE_VERSION "${GIT_DESCRIBE_VERSION}-dirty")
endif()
+ message(STATUS "git version: ${GIT_DESCRIBE_VERSION} normalized to ${GIT_VERSION}")
else()
- set(GIT_VERSION "v0.0.0")
+ set(GIT_VERSION "0.0.0")
endif()
- message(STATUS "git Version: ${GIT_VERSION}")
set(${var} ${GIT_VERSION} PARENT_SCOPE)
endfunction()
diff --git a/cmake/GoogleTest.cmake b/cmake/GoogleTest.cmake
index dd611fc..e66e9d1 100644
--- a/cmake/GoogleTest.cmake
+++ b/cmake/GoogleTest.cmake
@@ -35,7 +35,24 @@ add_subdirectory(${GOOGLETEST_SOURCE_DIR}
${GOOGLETEST_BINARY_DIR}
EXCLUDE_FROM_ALL)
-set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES>)
-set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES>)
-set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES>)
-set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES>)
+# googletest doesn't seem to want to stay build warning clean so let's not hurt ourselves.
+if (MSVC)
+ target_compile_options(gtest PRIVATE "/wd4244" "/wd4722")
+ target_compile_options(gtest_main PRIVATE "/wd4244" "/wd4722")
+ target_compile_options(gmock PRIVATE "/wd4244" "/wd4722")
+ target_compile_options(gmock_main PRIVATE "/wd4244" "/wd4722")
+else()
+ target_compile_options(gtest PRIVATE "-w")
+ target_compile_options(gtest_main PRIVATE "-w")
+ target_compile_options(gmock PRIVATE "-w")
+ target_compile_options(gmock_main PRIVATE "-w")
+endif()
+
+if(NOT DEFINED GTEST_COMPILE_COMMANDS)
+ set(GTEST_COMPILE_COMMANDS ON)
+endif()
+
+set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
diff --git a/cmake/GoogleTest.cmake.in b/cmake/GoogleTest.cmake.in
index fd957ff..ce653ac 100644
--- a/cmake/GoogleTest.cmake.in
+++ b/cmake/GoogleTest.cmake.in
@@ -31,13 +31,14 @@ if(EXISTS "${GOOGLETEST_PATH}" AND IS_DIRECTORY "${GOOGLETEST_PATH}"
)
else()
if(NOT ALLOW_DOWNLOADING_GOOGLETEST)
- message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
+ message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_USE_BUNDLED_GTEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
+ return()
else()
message(WARNING "Did not find Google Test sources! Fetching from web...")
ExternalProject_Add(
googletest
GIT_REPOSITORY https://github.com/google/googletest.git
- GIT_TAG master
+ GIT_TAG "release-1.11.0"
PREFIX "${CMAKE_BINARY_DIR}"
STAMP_DIR "${CMAKE_BINARY_DIR}/stamp"
DOWNLOAD_DIR "${CMAKE_BINARY_DIR}/download"
diff --git a/cmake/Modules/FindPFM.cmake b/cmake/Modules/FindPFM.cmake
new file mode 100644
index 0000000..4c1ce93
--- /dev/null
+++ b/cmake/Modules/FindPFM.cmake
@@ -0,0 +1,28 @@
+# If successful, the following variables will be defined:
+# PFM_FOUND.
+# PFM_LIBRARIES
+# PFM_INCLUDE_DIRS
+# the following target will be defined:
+# PFM::libpfm
+
+include(FeatureSummary)
+include(FindPackageHandleStandardArgs)
+
+set_package_properties(PFM PROPERTIES
+ URL http://perfmon2.sourceforge.net/
+ DESCRIPTION "A helper library to develop monitoring tools"
+ PURPOSE "Used to program specific performance monitoring events")
+
+find_library(PFM_LIBRARY NAMES pfm)
+find_path(PFM_INCLUDE_DIR NAMES perfmon/pfmlib.h)
+
+find_package_handle_standard_args(PFM REQUIRED_VARS PFM_LIBRARY PFM_INCLUDE_DIR)
+
+if (PFM_FOUND AND NOT TARGET PFM::libpfm)
+ add_library(PFM::libpfm UNKNOWN IMPORTED)
+ set_target_properties(PFM::libpfm PROPERTIES
+ IMPORTED_LOCATION "${PFM_LIBRARY}"
+ INTERFACE_INCLUDE_DIRECTORIES "${PFM_INCLUDE_DIR}")
+endif()
+
+mark_as_advanced(PFM_LIBRARY PFM_INCLUDE_DIR)
diff --git a/cmake/benchmark.pc.in b/cmake/benchmark.pc.in
index 34beb01..9dae881 100644
--- a/cmake/benchmark.pc.in
+++ b/cmake/benchmark.pc.in
@@ -1,7 +1,7 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
-libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
-includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
Name: @PROJECT_NAME@
Description: Google microbenchmark framework
diff --git a/cmake/pthread_affinity.cpp b/cmake/pthread_affinity.cpp
new file mode 100644
index 0000000..7b143bc
--- /dev/null
+++ b/cmake/pthread_affinity.cpp
@@ -0,0 +1,16 @@
+#include <pthread.h>
+int main() {
+ cpu_set_t set;
+ CPU_ZERO(&set);
+ for (int i = 0; i < CPU_SETSIZE; ++i) {
+ CPU_SET(i, &set);
+ CPU_CLR(i, &set);
+ }
+ pthread_t self = pthread_self();
+ int ret;
+ ret = pthread_getaffinity_np(self, sizeof(set), &set);
+ if (ret != 0) return ret;
+ ret = pthread_setaffinity_np(self, sizeof(set), &set);
+ if (ret != 0) return ret;
+ return 0;
+}
diff --git a/conan/CMakeLists.txt b/conan/CMakeLists.txt
deleted file mode 100644
index 15b92ca..0000000
--- a/conan/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-cmake_minimum_required(VERSION 2.8.11)
-project(cmake_wrapper)
-
-include(conanbuildinfo.cmake)
-conan_basic_setup()
-
-include(${CMAKE_SOURCE_DIR}/CMakeListsOriginal.txt)
diff --git a/conan/test_package/CMakeLists.txt b/conan/test_package/CMakeLists.txt
deleted file mode 100644
index 089a6c7..0000000
--- a/conan/test_package/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-cmake_minimum_required(VERSION 2.8.11)
-project(test_package)
-
-set(CMAKE_VERBOSE_MAKEFILE TRUE)
-
-include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake)
-conan_basic_setup()
-
-add_executable(${PROJECT_NAME} test_package.cpp)
-target_link_libraries(${PROJECT_NAME} ${CONAN_LIBS})
diff --git a/conan/test_package/conanfile.py b/conan/test_package/conanfile.py
deleted file mode 100644
index d63f408..0000000
--- a/conan/test_package/conanfile.py
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from conans import ConanFile, CMake
-import os
-
-
-class TestPackageConan(ConanFile):
- settings = "os", "compiler", "build_type", "arch"
- generators = "cmake"
-
- def build(self):
- cmake = CMake(self)
- cmake.configure()
- cmake.build()
-
- def test(self):
- bin_path = os.path.join("bin", "test_package")
- self.run(bin_path, run_environment=True)
diff --git a/conan/test_package/test_package.cpp b/conan/test_package/test_package.cpp
deleted file mode 100644
index 4fa7ec0..0000000
--- a/conan/test_package/test_package.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "benchmark/benchmark.h"
-
-void BM_StringCreation(benchmark::State& state) {
- while (state.KeepRunning())
- std::string empty_string;
-}
-
-BENCHMARK(BM_StringCreation);
-
-void BM_StringCopy(benchmark::State& state) {
- std::string x = "hello";
- while (state.KeepRunning())
- std::string copy(x);
-}
-
-BENCHMARK(BM_StringCopy);
-
-BENCHMARK_MAIN();
diff --git a/conanfile.py b/conanfile.py
deleted file mode 100644
index e31fc52..0000000
--- a/conanfile.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from conans import ConanFile, CMake, tools
-from conans.errors import ConanInvalidConfiguration
-import shutil
-import os
-
-
-class GoogleBenchmarkConan(ConanFile):
- name = "benchmark"
- description = "A microbenchmark support library."
- topics = ("conan", "benchmark", "google", "microbenchmark")
- url = "https://github.com/google/benchmark"
- homepage = "https://github.com/google/benchmark"
- author = "Google Inc."
- license = "Apache-2.0"
- exports_sources = ["*"]
- generators = "cmake"
-
- settings = "arch", "build_type", "compiler", "os"
- options = {
- "shared": [True, False],
- "fPIC": [True, False],
- "enable_lto": [True, False],
- "enable_exceptions": [True, False]
- }
- default_options = {"shared": False, "fPIC": True, "enable_lto": False, "enable_exceptions": True}
-
- _build_subfolder = "."
-
- def source(self):
- # Wrap the original CMake file to call conan_basic_setup
- shutil.move("CMakeLists.txt", "CMakeListsOriginal.txt")
- shutil.move(os.path.join("conan", "CMakeLists.txt"), "CMakeLists.txt")
-
- def config_options(self):
- if self.settings.os == "Windows":
- if self.settings.compiler == "Visual Studio" and float(self.settings.compiler.version.value) <= 12:
- raise ConanInvalidConfiguration("{} {} does not support Visual Studio <= 12".format(self.name, self.version))
- del self.options.fPIC
-
- def configure(self):
- if self.settings.os == "Windows" and self.options.shared:
- raise ConanInvalidConfiguration("Windows shared builds are not supported right now, see issue #639")
-
- def _configure_cmake(self):
- cmake = CMake(self)
-
- cmake.definitions["BENCHMARK_ENABLE_TESTING"] = "OFF"
- cmake.definitions["BENCHMARK_ENABLE_GTEST_TESTS"] = "OFF"
- cmake.definitions["BENCHMARK_ENABLE_LTO"] = "ON" if self.options.enable_lto else "OFF"
- cmake.definitions["BENCHMARK_ENABLE_EXCEPTIONS"] = "ON" if self.options.enable_exceptions else "OFF"
-
- # See https://github.com/google/benchmark/pull/638 for Windows 32 build explanation
- if self.settings.os != "Windows":
- cmake.definitions["BENCHMARK_BUILD_32_BITS"] = "ON" if "64" not in str(self.settings.arch) else "OFF"
- cmake.definitions["BENCHMARK_USE_LIBCXX"] = "ON" if (str(self.settings.compiler.libcxx) == "libc++") else "OFF"
- else:
- cmake.definitions["BENCHMARK_USE_LIBCXX"] = "OFF"
-
- cmake.configure(build_folder=self._build_subfolder)
- return cmake
-
- def build(self):
- cmake = self._configure_cmake()
- cmake.build()
-
- def package(self):
- cmake = self._configure_cmake()
- cmake.install()
-
- self.copy(pattern="LICENSE", dst="licenses")
-
- def package_info(self):
- self.cpp_info.libs = tools.collect_libs(self)
- if self.settings.os == "Linux":
- self.cpp_info.libs.extend(["pthread", "rt"])
- elif self.settings.os == "Windows":
- self.cpp_info.libs.append("shlwapi")
- elif self.settings.os == "SunOS":
- self.cpp_info.libs.append("kstat")
diff --git a/dependencies.md b/dependencies.md
deleted file mode 100644
index 6289b4e..0000000
--- a/dependencies.md
+++ /dev/null
@@ -1,18 +0,0 @@
-# Build tool dependency policy
-
-To ensure the broadest compatibility when building the benchmark library, but
-still allow forward progress, we require any build tooling to be available for:
-
-* Debian stable AND
-* The last two Ubuntu LTS releases AND
-
-Currently, this means using build tool versions that are available for Ubuntu
-16.04 (Xenial), Ubuntu 18.04 (Bionic), and Debian stretch.
-
-_Note, [travis](.travis.yml) runs under Ubuntu 14.04 (Trusty) for linux builds._
-
-## cmake
-The current supported version is cmake 3.5.1 as of 2018-06-06.
-
-_Note, this version is also available for Ubuntu 14.04, the previous Ubuntu LTS
-release, as `cmake3`._
diff --git a/docs/AssemblyTests.md b/docs/AssemblyTests.md
index 1fbdc26..89df7ca 100644
--- a/docs/AssemblyTests.md
+++ b/docs/AssemblyTests.md
@@ -111,6 +111,7 @@ between compilers or compiler versions. A common example of this
is matching stack frame addresses. In this case regular expressions
can be used to match the differing bits of output. For example:
+<!-- {% raw %} -->
```c++
int ExternInt;
struct Point { int x, y, z; };
@@ -127,6 +128,7 @@ extern "C" void test_store_point() {
// CHECK: ret
}
```
+<!-- {% endraw %} -->
## Current Requirements and Limitations
diff --git a/docs/_config.yml b/docs/_config.yml
index 1885487..32f9f2e 100644
--- a/docs/_config.yml
+++ b/docs/_config.yml
@@ -1 +1,3 @@
-theme: jekyll-theme-midnight \ No newline at end of file
+theme: jekyll-theme-minimal
+logo: /assets/images/icon_black.png
+show_downloads: true
diff --git a/docs/assets/images/icon.png b/docs/assets/images/icon.png
new file mode 100644
index 0000000..b982604
--- /dev/null
+++ b/docs/assets/images/icon.png
Binary files differ
diff --git a/docs/assets/images/icon.xcf b/docs/assets/images/icon.xcf
new file mode 100644
index 0000000..f2f0be4
--- /dev/null
+++ b/docs/assets/images/icon.xcf
Binary files differ
diff --git a/docs/assets/images/icon_black.png b/docs/assets/images/icon_black.png
new file mode 100644
index 0000000..656ae79
--- /dev/null
+++ b/docs/assets/images/icon_black.png
Binary files differ
diff --git a/docs/assets/images/icon_black.xcf b/docs/assets/images/icon_black.xcf
new file mode 100644
index 0000000..430e7ba
--- /dev/null
+++ b/docs/assets/images/icon_black.xcf
Binary files differ
diff --git a/docs/dependencies.md b/docs/dependencies.md
new file mode 100644
index 0000000..07760e1
--- /dev/null
+++ b/docs/dependencies.md
@@ -0,0 +1,13 @@
+# Build tool dependency policy
+
+We follow the [Foundational C++ support policy](https://opensource.google/documentation/policies/cplusplus-support) for our build tools. In
+particular, see the ["Build Systems" section](https://opensource.google/documentation/policies/cplusplus-support#build-systems).
+
+## CMake
+
+The current supported version is CMake 3.10 as of 2023-08-10. Most modern
+distributions include newer versions, for example:
+
+* Ubuntu 20.04 provides CMake 3.16.3
+* Debian 11.4 provides CMake 3.18.4
+* Ubuntu 22.04 provides CMake 3.22.1
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..9cada96
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,12 @@
+# Benchmark
+
+* [Assembly Tests](AssemblyTests.md)
+* [Dependencies](dependencies.md)
+* [Perf Counters](perf_counters.md)
+* [Platform Specific Build Instructions](platform_specific_build_instructions.md)
+* [Python Bindings](python_bindings.md)
+* [Random Interleaving](random_interleaving.md)
+* [Reducing Variance](reducing_variance.md)
+* [Releasing](releasing.md)
+* [Tools](tools.md)
+* [User Guide](user_guide.md)
diff --git a/docs/perf_counters.md b/docs/perf_counters.md
new file mode 100644
index 0000000..f342092
--- /dev/null
+++ b/docs/perf_counters.md
@@ -0,0 +1,35 @@
+<a name="perf-counters" />
+
+# User-Requested Performance Counters
+
+When running benchmarks, the user may choose to request collection of
+performance counters. This may be useful in investigation scenarios, such as
+narrowing down the cause of a regression or verifying that the underlying
+cause of a performance improvement matches expectations.
+
+This feature is available if:
+
+* The benchmark is run on an architecture featuring a Performance Monitoring
+ Unit (PMU),
+* The benchmark is compiled with support for collecting counters. Currently,
+ this requires [libpfm](http://perfmon2.sourceforge.net/), which is built as a
+ dependency via Bazel.
+
+The feature does not require modifying benchmark code. Counter collection is
+handled at the boundaries where timer collection is also handled.
+
+To opt-in:
+* If using a Bazel build, add `--define pfm=1` to your build flags
+* If using CMake:
+ * Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`.
+ * Enable the CMake flag `BENCHMARK_ENABLE_LIBPFM` in `CMakeLists.txt`.
+
+To use, pass a comma-separated list of counter names through the
+`--benchmark_perf_counters` flag. The names are decoded through libpfm, so
+they are platform specific, but some generic names (e.g. `CYCLES` or
+`INSTRUCTIONS`) are mapped by libpfm to their platform-specific counterparts;
+see the libpfm [documentation](http://perfmon2.sourceforge.net/docs.html) for
+more details.
+
+The counter values are reported back through the [User Counters](../README.md#custom-counters)
+mechanism, meaning they are available in all the output formats (e.g. JSON)
+supported by User Counters.
diff --git a/docs/platform_specific_build_instructions.md b/docs/platform_specific_build_instructions.md
new file mode 100644
index 0000000..2d5d6c4
--- /dev/null
+++ b/docs/platform_specific_build_instructions.md
@@ -0,0 +1,48 @@
+# Platform Specific Build Instructions
+
+## Building with GCC
+
+When the library is built using GCC it is necessary to link with the pthread
+library due to how GCC implements `std::thread`. Failing to link to pthread will
+lead to runtime exceptions (unless you're using libc++), not linker errors. See
+[issue #67](https://github.com/google/benchmark/issues/67) for more details. You
+can link to pthread by adding `-pthread` to your linker command. Note, you can
+also use `-lpthread`, but there are potential issues with ordering of command
+line parameters if you use that.
+
+On QNX, the pthread library is part of libc and usually included automatically
+(see
+[`pthread_create()`](https://www.qnx.com/developers/docs/7.1/index.html#com.qnx.doc.neutrino.lib_ref/topic/p/pthread_create.html)).
+There's no separate pthread library to link.
+
+## Building with Visual Studio 2015 or 2017
+
+The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
+
+```
+// Alternatively, can add libraries using linker options.
+#ifdef _WIN32
+#pragma comment ( lib, "Shlwapi.lib" )
+#ifdef _DEBUG
+#pragma comment ( lib, "benchmarkd.lib" )
+#else
+#pragma comment ( lib, "benchmark.lib" )
+#endif
+#endif
+```
+
+You can also use the graphical version of CMake:
+* Open `CMake GUI`.
+* Under `Where to build the binaries`, use the source path plus `build`.
+* Under `CMAKE_INSTALL_PREFIX`, use the source path plus `install`.
+* Click `Configure`, `Generate`, `Open Project`.
+* If the build fails, try deleting the entire build directory and starting again, or untick options to build fewer targets.
+
+## Building with Intel 2015 Update 1 or Intel System Studio Update 4
+
+See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel.
+
+## Building on Solaris
+
+If you're running benchmarks on Solaris, you'll want the kstat library linked in
+too (`-lkstat`). \ No newline at end of file
diff --git a/docs/python_bindings.md b/docs/python_bindings.md
new file mode 100644
index 0000000..6a7aab0
--- /dev/null
+++ b/docs/python_bindings.md
@@ -0,0 +1,34 @@
+# Building and installing Python bindings
+
+Python bindings are available as wheels on [PyPI](https://pypi.org/project/google-benchmark/) for importing and
+using Google Benchmark directly in Python.
+Currently, pre-built wheels exist for macOS (both ARM64 and Intel x86), Linux x86-64 and 64-bit Windows.
+Supported Python versions are Python 3.7 - 3.10.
+
+To install Google Benchmark's Python bindings, run:
+
+```bash
+python -m pip install --upgrade pip # for manylinux2014 support
+python -m pip install google-benchmark
+```
+
+In order to keep your system Python interpreter clean, it is advisable to run these commands in a virtual
+environment. See the [official Python documentation](https://docs.python.org/3/library/venv.html)
+on how to create virtual environments.
+
+To build a wheel directly from source, you can follow these steps:
+```bash
+git clone https://github.com/google/benchmark.git
+cd benchmark
+# create a virtual environment and activate it
+python3 -m venv venv --system-site-packages
+source venv/bin/activate # .\venv\Scripts\Activate.ps1 on Windows
+
+# upgrade Python's system-wide packages
+python -m pip install --upgrade pip setuptools wheel
+# builds the wheel and stores it in the directory "wheelhouse".
+python -m pip wheel . -w wheelhouse
+```
+
+NB: Building wheels from source requires Bazel. For platform-specific instructions on how to install Bazel,
+refer to the [Bazel installation docs](https://bazel.build/install).
diff --git a/docs/random_interleaving.md b/docs/random_interleaving.md
new file mode 100644
index 0000000..c083036
--- /dev/null
+++ b/docs/random_interleaving.md
@@ -0,0 +1,13 @@
+<a name="interleaving" />
+
+# Random Interleaving
+
+[Random Interleaving](https://github.com/google/benchmark/issues/1051) is a
+technique to lower run-to-run variance. It randomly interleaves repetitions of a
+microbenchmark with repetitions from other microbenchmarks in the same benchmark
+test. Data shows it is able to lower run-to-run variance by
+[40%](https://github.com/google/benchmark/issues/1051) on average.
+
+To use it, set `--benchmark_enable_random_interleaving=true`. Optionally, you
+can also specify a non-zero repetition count (`--benchmark_repetitions=9`) and
+decrease the per-repetition time (`--benchmark_min_time=0.1`).
diff --git a/docs/reducing_variance.md b/docs/reducing_variance.md
new file mode 100644
index 0000000..e566ab9
--- /dev/null
+++ b/docs/reducing_variance.md
@@ -0,0 +1,100 @@
+# Reducing Variance
+
+<a name="disabling-cpu-frequency-scaling" />
+
+## Disabling CPU Frequency Scaling
+
+If you see this error:
+
+```
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+```
+
+you might want to disable CPU frequency scaling while running the
+benchmark, as well as consider other ways to stabilize the performance of
+your system while benchmarking, as described below.
+
+Exactly how to do this depends on the Linux distribution,
+desktop environment, and installed programs. Specific details are a moving
+target, so we will not attempt to exhaustively document them here.
+
+One simple option is to use the `cpupower` program to change the
+performance governor to "performance". This tool is maintained along with
+the Linux kernel and provided by your distribution.
+
+It must be run as root, like this:
+
+```bash
+sudo cpupower frequency-set --governor performance
+```
+
+After this you can verify that all CPUs are using the performance governor
+by running this command:
+
+```bash
+cpupower frequency-info -o proc
+```
+
+The benchmarks you subsequently run will have less variance.
+
+<a name="reducing-variance" />
+
+## Reducing Variance in Benchmarks
+
+The Linux CPU frequency governor [discussed
+above](#disabling-cpu-frequency-scaling) is not the only source
+of noise in benchmarks. Some, but not all, of the sources of variance
+include:
+
+1. On multi-core machines not all CPUs/CPU cores/CPU threads run the same
+ speed, so running a benchmark one time and then again may give a
+ different result depending on which CPU it ran on.
+2. CPU scaling features that run on the CPU, like Intel's Turbo Boost and
+ AMD Turbo Core and Precision Boost, can temporarily change the CPU
+ frequency even when using the "performance" governor on Linux.
+3. Context switching between CPUs, or scheduling competition on the CPU the
+ benchmark is running on.
+4. Intel Hyperthreading or AMD SMT causing the same issue as above.
+5. Cache effects caused by code running on other CPUs.
+6. Non-uniform memory architectures (NUMA).
+
+These can cause variance in benchmarks results within a single run
+(`--benchmark_repetitions=N`) or across multiple runs of the benchmark
+program.
+
+Reducing sources of variance is OS and architecture dependent, which is one
+reason some companies maintain machines dedicated to performance testing.
+
+Some of the easier and more effective ways of reducing variance on a typical
+Linux workstation are:
+
+1. Use the performance governor as [discussed
+above](#disabling-cpu-frequency-scaling).
+1. Disable processor boosting by:
+ ```sh
+ echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost
+ ```
+ See the Linux kernel's
+ [boost.txt](https://www.kernel.org/doc/Documentation/cpu-freq/boost.txt)
+ for more information.
+2. Set the benchmark program's task affinity to a fixed cpu. For example:
+ ```sh
+ taskset -c 0 ./mybenchmark
+ ```
+3. Disable Hyperthreading/SMT. This can be done in the BIOS or using the
+ `/sys` file system (see the LLVM project's [Benchmarking
+ tips](https://llvm.org/docs/Benchmarking.html)).
+4. Close other programs that do non-trivial things based on timers, such as
+ your web browser, desktop environment, etc.
+5. Reduce the working set of your benchmark to fit within the L1 cache, but
+ do be aware that this may lead you to optimize for an unrealistic
+ situation.
+
+Further resources on this topic:
+
+1. The LLVM project's [Benchmarking
+ tips](https://llvm.org/docs/Benchmarking.html).
+1. The Arch Wiki [CPU frequency
+ scaling](https://wiki.archlinux.org/title/CPU_frequency_scaling) page.
diff --git a/docs/releasing.md b/docs/releasing.md
index f0cd701..cdf4159 100644
--- a/docs/releasing.md
+++ b/docs/releasing.md
@@ -1,16 +1,41 @@
# How to release
-* Make sure you're on master and synced to HEAD
-* Ensure the project builds and tests run (sanity check only, obviously)
+* Make sure you're on main and synced to HEAD
+* Ensure the project builds and tests run
* `parallel -j0 exec ::: test/*_test` can help ensure everything at least
passes
* Prepare release notes
* `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
commits between the last annotated tag and HEAD
* Pick the most interesting.
+* Create one last commit that updates the version saved in `CMakeLists.txt`, `MODULE.bazel`
+ and the `__version__` variable in `bindings/python/google_benchmark/__init__.py` to the
+ release version you're creating. (This version will be used if benchmark is installed
+ from the archive you'll be creating in the next step.)
+
+```
+project (benchmark VERSION 1.8.0 LANGUAGES CXX)
+```
+
+```
+module(name = "com_github_google_benchmark", version="1.8.0")
+```
+
+```python
+# bindings/python/google_benchmark/__init__.py
+
+# ...
+
+__version__ = "1.8.0" # <-- change this to the release version you are creating
+
+# ...
+```
+
+* Create a release through GitHub's interface
* Note this will create a lightweight tag.
* Update this to an annotated tag:
* `git pull --tags`
* `git tag -a -f <tag> <tag>`
- * `git push --force origin`
+ * `git push --force --tags origin`
+* Confirm that the "Build and upload Python wheels" action runs to completion
+ * run it manually if it hasn't run
diff --git a/docs/tools.md b/docs/tools.md
index f2d0c49..411f41d 100644
--- a/docs/tools.md
+++ b/docs/tools.md
@@ -186,6 +186,146 @@ Benchmark Time CPU Time Old
This is a mix of the previous two modes, two (potentially different) benchmark binaries are run, and a different filter is applied to each one.
As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+### Note: Interpreting the output
+
+Performance measurements are an art, and performance comparisons are doubly so.
+Results are often noisy, and the differences are not necessarily large in
+absolute terms, so by visual inspection alone it is not at all apparent whether
+two measurements actually show a performance change. It is even more
+confusing with multiple benchmark repetitions.
+
+Thankfully, we can use statistical tests on the results to determine whether
+the performance has changed in a statistically significant way. `compare.py`
+uses the [Mann–Whitney U
+test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test), with the null
+hypothesis being that there is no difference in performance.
+
+**The below output is a summary of a benchmark comparison with statistics
+provided for a multi-threaded process.**
+```
+Benchmark Time CPU Time Old Time New CPU Old CPU New
+-----------------------------------------------------------------------------------------------------------------------------
+benchmark/threads:1/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 27 vs 27
+benchmark/threads:1/process_time/real_time_mean -0.1442 -0.1442 90 77 90 77
+benchmark/threads:1/process_time/real_time_median -0.1444 -0.1444 90 77 90 77
+benchmark/threads:1/process_time/real_time_stddev +0.3974 +0.3933 0 0 0 0
+benchmark/threads:1/process_time/real_time_cv +0.6329 +0.6280 0 0 0 0
+OVERALL_GEOMEAN -0.1442 -0.1442 0 0 0 0
+```
+--------------------------------------------
+Here's a breakdown of each row:
+
+**benchmark/threads:1/process_time/real_time_pvalue**: This shows the _p-value_ for
+the statistical test comparing the performance of the process running with one
+thread. A value of 0.0000 suggests a statistically significant difference in
+performance. The comparison was conducted using the U Test (Mann-Whitney
+U Test) with 27 repetitions for each case.
+
+**benchmark/threads:1/process_time/real_time_mean**: This shows the relative
+difference in mean execution time between two different cases. The negative
+value (-0.1442) implies that the new process is faster by about 14.42%. The old
+time was 90 units, while the new time is 77 units.
+
+**benchmark/threads:1/process_time/real_time_median**: Similarly, this shows the
+relative difference in the median execution time. Again, the new process is
+faster by 14.44%.
+
+**benchmark/threads:1/process_time/real_time_stddev**: This is the relative
+difference in the standard deviation of the execution time, which is a measure
+of how much variation or dispersion there is from the mean. A positive value
+(+0.3974) implies there is more variance in the execution time in the new
+process.
+
+**benchmark/threads:1/process_time/real_time_cv**: CV stands for Coefficient of
+Variation. It is the ratio of the standard deviation to the mean. It provides a
+standardized measure of dispersion. An increase (+0.6329) indicates more
+relative variability in the new process.
+
+**OVERALL_GEOMEAN**: Geomean stands for geometric mean, a type of average that is
+less influenced by outliers. The negative value indicates a general improvement
+in the new process. However, given the values are all zero for the old and new
+times, this seems to be a mistake or placeholder in the output.
+
+-----------------------------------------
+
+
+
+Let's first try to see what the different columns represent in the above
+`compare.py` benchmarking output:
+
+ 1. **Benchmark:** The name of the function being benchmarked, along with the
+ size of the input (after the slash).
+
+ 2. **Time:** The average time per operation, across all iterations.
+
+ 3. **CPU:** The average CPU time per operation, across all iterations.
+
+ 4. **Iterations:** The number of iterations the benchmark was run to get a
+ stable estimate.
+
+ 5. **Time Old and Time New:** These represent the average time it takes for a
+ function to run in two different scenarios or versions. For example, you
+ might be comparing how fast a function runs before and after you make some
+ changes to it.
+
+ 6. **CPU Old and CPU New:** These show the average amount of CPU time that the
+ function uses in two different scenarios or versions. This is similar to
+ Time Old and Time New, but focuses on CPU usage instead of overall time.
+
+In the comparison section, the relative differences in both time and CPU time
+are displayed for each input size.
+
+
+A statistically-significant difference is determined by a **p-value**, which is
+a measure of the probability that the observed difference could have occurred
+just by random chance. A smaller p-value indicates stronger evidence against the
+null hypothesis.
+
+**Therefore:**
+ 1. If the p-value is less than the chosen significance level (alpha), we
+ reject the null hypothesis and conclude the benchmarks are significantly
+ different.
+ 2. If the p-value is greater than or equal to alpha, we fail to reject the
+ null hypothesis and treat the two benchmarks as similar.
+
+
+
+The result of the statistical test is additionally communicated through color coding:
+```diff
++ Green:
+```
+ The benchmarks are _**statistically different**_. This could mean the
+ performance has either **significantly improved** or **significantly
+ deteriorated**. You should look at the actual performance numbers to see which
+ is the case.
+```diff
+- Red:
+```
+ The benchmarks are _**statistically similar**_. This means the performance
+ **hasn't significantly changed**.
+
+In statistical terms, **'green'** means we reject the null hypothesis that
+there's no difference in performance, and **'red'** means we fail to reject the
+null hypothesis. This might seem counter-intuitive if you're expecting 'green'
+to mean 'improved performance' and 'red' to mean 'worsened performance'.
+But remember, in this context:
+
+* 'Success' means 'successfully finding a difference'.
+* 'Failure' means 'failing to find a difference'.
+
+
+Also, please note that **even if** we determine that there **is** a
+statistically-significant difference between the two measurements, it does not
+_necessarily_ mean that the actual benchmarks that were measured **are**
+different. Conversely, even if we determine that there is **no**
+statistically-significant difference between the two measurements, it does not
+necessarily mean that the actual benchmarks that were measured **are not**
+different.
+
+
+
### U test
If there is a sufficient repetition count of the benchmarks, the tool can do
diff --git a/docs/user_guide.md b/docs/user_guide.md
new file mode 100644
index 0000000..2ceb13e
--- /dev/null
+++ b/docs/user_guide.md
@@ -0,0 +1,1266 @@
+# User Guide
+
+## Command Line
+
+[Output Formats](#output-formats)
+
+[Output Files](#output-files)
+
+[Running Benchmarks](#running-benchmarks)
+
+[Running a Subset of Benchmarks](#running-a-subset-of-benchmarks)
+
+[Result Comparison](#result-comparison)
+
+[Extra Context](#extra-context)
+
+## Library
+
+[Runtime and Reporting Considerations](#runtime-and-reporting-considerations)
+
+[Setup/Teardown](#setupteardown)
+
+[Passing Arguments](#passing-arguments)
+
+[Custom Benchmark Name](#custom-benchmark-name)
+
+[Calculating Asymptotic Complexity](#asymptotic-complexity)
+
+[Templated Benchmarks](#templated-benchmarks)
+
+[Fixtures](#fixtures)
+
+[Custom Counters](#custom-counters)
+
+[Multithreaded Benchmarks](#multithreaded-benchmarks)
+
+[CPU Timers](#cpu-timers)
+
+[Manual Timing](#manual-timing)
+
+[Setting the Time Unit](#setting-the-time-unit)
+
+[Random Interleaving](random_interleaving.md)
+
+[User-Requested Performance Counters](perf_counters.md)
+
+[Preventing Optimization](#preventing-optimization)
+
+[Reporting Statistics](#reporting-statistics)
+
+[Custom Statistics](#custom-statistics)
+
+[Memory Usage](#memory-usage)
+
+[Using RegisterBenchmark](#using-register-benchmark)
+
+[Exiting with an Error](#exiting-with-an-error)
+
+[A Faster `KeepRunning` Loop](#a-faster-keep-running-loop)
+
+## Benchmarking Tips
+
+[Disabling CPU Frequency Scaling](#disabling-cpu-frequency-scaling)
+
+[Reducing Variance in Benchmarks](reducing_variance.md)
+
+<a name="output-formats" />
+
+## Output Formats
+
+The library supports multiple output formats. Use the
+`--benchmark_format=<console|json|csv>` flag (or set the
+`BENCHMARK_FORMAT=<console|json|csv>` environment variable) to set
+the format type. `console` is the default format.
+
+The Console format is intended to be a human readable format. By default
+the format generates color output. Context is output on stderr and the
+tabular data on stdout. Example tabular output looks like:
+
+```
+Benchmark Time(ns) CPU(ns) Iterations
+----------------------------------------------------------------------
+BM_SetInsert/1024/1 28928 29349 23853 133.097kB/s 33.2742k items/s
+BM_SetInsert/1024/8 32065 32913 21375 949.487kB/s 237.372k items/s
+BM_SetInsert/1024/10 33157 33648 21431 1.13369MB/s 290.225k items/s
+```
+
+The JSON format outputs human readable json split into two top level attributes.
+The `context` attribute contains information about the run in general, including
+information about the CPU and the date.
+The `benchmarks` attribute contains a list of every benchmark run. Example json
+output looks like:
+
+```json
+{
+ "context": {
+ "date": "2015/03/17-18:40:25",
+ "num_cpus": 40,
+ "mhz_per_cpu": 2801,
+ "cpu_scaling_enabled": false,
+ "build_type": "debug"
+ },
+ "benchmarks": [
+ {
+ "name": "BM_SetInsert/1024/1",
+ "iterations": 94877,
+ "real_time": 29275,
+ "cpu_time": 29836,
+ "bytes_per_second": 134066,
+ "items_per_second": 33516
+ },
+ {
+ "name": "BM_SetInsert/1024/8",
+ "iterations": 21609,
+ "real_time": 32317,
+ "cpu_time": 32429,
+ "bytes_per_second": 986770,
+ "items_per_second": 246693
+ },
+ {
+ "name": "BM_SetInsert/1024/10",
+ "iterations": 21393,
+ "real_time": 32724,
+ "cpu_time": 33355,
+ "bytes_per_second": 1199226,
+ "items_per_second": 299807
+ }
+ ]
+}
+```
+
+The CSV format outputs comma-separated values. The `context` is output on stderr
+and the CSV itself on stdout. Example CSV output looks like:
+
+```
+name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label
+"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942,
+"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115,
+"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
+```
+
+<a name="output-files" />
+
+## Output Files
+
+Write benchmark results to a file with the `--benchmark_out=<filename>` option
+(or set `BENCHMARK_OUT`). Specify the output format with
+`--benchmark_out_format={json|console|csv}` (or set
+`BENCHMARK_OUT_FORMAT={json|console|csv}`). Note that the 'csv' reporter is
+deprecated and the saved `.csv` file
+[is not parsable](https://github.com/google/benchmark/issues/794) by csv
+parsers.
+
+Specifying `--benchmark_out` does not suppress the console output.
+
+<a name="running-benchmarks" />
+
+## Running Benchmarks
+
+Benchmarks are executed by running the produced binaries. Benchmark binaries,
+by default, accept options that may be specified either through their command
+line interface or by setting environment variables before execution. For every
+`--option_flag=<value>` CLI switch, a corresponding environment variable
+`OPTION_FLAG=<value>` exists and is used as the default if set (CLI switches
+always prevail). A complete list of CLI options is available by running the
+benchmark binary with the `--help` switch.
+
+<a name="running-a-subset-of-benchmarks" />
+
+## Running a Subset of Benchmarks
+
+The `--benchmark_filter=<regex>` option (or `BENCHMARK_FILTER=<regex>`
+environment variable) can be used to only run the benchmarks that match
+the specified `<regex>`. For example:
+
+```bash
+$ ./run_benchmarks.x --benchmark_filter=BM_memcpy/32
+Run on (1 X 2300 MHz CPU )
+2016-06-25 19:34:24
+Benchmark Time CPU Iterations
+----------------------------------------------------
+BM_memcpy/32 11 ns 11 ns 79545455
+BM_memcpy/32k 2181 ns 2185 ns 324074
+BM_memcpy/32 12 ns 12 ns 54687500
+BM_memcpy/32k 1834 ns 1837 ns 357143
+```
+
+## Disabling Benchmarks
+
+It is possible to temporarily disable benchmarks by renaming the benchmark
+function to have the prefix "DISABLED_". This will cause the benchmark to
+be skipped at runtime.
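+
+For instance, a minimal sketch (the benchmark body here is hypothetical; only
+the `DISABLED_` prefix matters):
+
+```c++
+// Skipped at runtime because the registered name starts with "DISABLED_".
+static void DISABLED_BM_SlowExperiment(benchmark::State& state) {
+  for (auto _ : state) {
+    // ... code that should temporarily not be measured ...
+  }
+}
+BENCHMARK(DISABLED_BM_SlowExperiment);
+```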
+
+<a name="result-comparison" />
+
+## Result comparison
+
+It is possible to compare the benchmarking results.
+See [Additional Tooling Documentation](tools.md)
+
+<a name="extra-context" />
+
+## Extra Context
+
+Sometimes it's useful to add extra context to the content printed before the
+results. By default this section includes information about the CPU on which
+the benchmarks are running. If you do want to add more context, you can use
+the `--benchmark_context` command line flag:
+
+```bash
+$ ./run_benchmarks --benchmark_context=pwd=`pwd`
+Run on (1 x 2300 MHz CPU)
+pwd: /home/user/benchmark/
+Benchmark Time CPU Iterations
+----------------------------------------------------
+BM_memcpy/32 11 ns 11 ns 79545455
+BM_memcpy/32k 2181 ns 2185 ns 324074
+```
+
+You can get the same effect with the API:
+
+```c++
+ benchmark::AddCustomContext("foo", "bar");
+```
+
+Note that attempts to add a second value with the same key will fail with an
+error message.
+
+<a name="runtime-and-reporting-considerations" />
+
+## Runtime and Reporting Considerations
+
+When the benchmark binary is executed, each benchmark function is run serially.
+The number of iterations to run is determined dynamically by running the
+benchmark a few times and measuring the time taken and ensuring that the
+ultimate result will be statistically stable. As such, faster benchmark
+functions will be run for more iterations than slower benchmark functions, and
+the number of iterations is thus reported.
+
+In all cases, the number of iterations for which the benchmark is run is
+governed by the amount of time the benchmark takes. Concretely, the number of
+iterations is at least one, not more than 1e9, until CPU time is greater than
+the minimum time, or the wallclock time is 5x minimum time. The minimum time is
+set per benchmark by calling `MinTime` on the registered benchmark object.
+
+Furthermore, warming up a benchmark might be necessary in order to get
+stable results, e.g. because of caching effects in the code under benchmark.
+Warming up means running the benchmark for a given amount of time before
+results are actually taken into account. The amount of time for which
+the warmup should be run can be set per benchmark by calling
+`MinWarmUpTime` on the registered benchmark object, or for all benchmarks
+using the `--benchmark_min_warmup_time` command-line option. Note that
+`MinWarmUpTime` overrides the value of `--benchmark_min_warmup_time`
+for that single benchmark. How many iterations the warmup run of each
+benchmark takes is determined the same way as described in the paragraph
+above. By default the warmup phase is set to 0 seconds and is therefore
+disabled.
+
+Average timings are then reported over the iterations run. If multiple
+repetitions are requested using the `--benchmark_repetitions` command-line
+option, or at registration time, the benchmark function will be run several
+times and statistical results across these repetitions will also be reported.
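+
+For example, a minimal sketch (assuming a hypothetical benchmark `BM_Foo`) of
+setting these values at registration time instead of via command-line flags:
+
+```c++
+// At least 2 seconds of benchmarking per repetition, preceded by a 1 second
+// warmup, with the whole measurement repeated 10 times.
+BENCHMARK(BM_Foo)->MinTime(2.0)->MinWarmUpTime(1.0)->Repetitions(10);
+```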
+
+As well as the per-benchmark entries, a preamble in the report will include
+information about the machine on which the benchmarks are run.
+
+<a name="setup-teardown" />
+
+## Setup/Teardown
+
+Global setup/teardown specific to each benchmark can be done by
+passing a callback to Setup/Teardown:
+
+The setup/teardown callbacks will be invoked once for each benchmark. If the
+benchmark is multi-threaded (will run in k threads), they will be invoked
+exactly once before each run with k threads.
+
+If the benchmark uses different size groups of threads, the above will be true
+for each size group.
+
+For example:
+
+```c++
+static void DoSetup(const benchmark::State& state) {
+}
+
+static void DoTeardown(const benchmark::State& state) {
+}
+
+static void BM_func(benchmark::State& state) {...}
+
+BENCHMARK(BM_func)->Arg(1)->Arg(3)->Threads(16)->Threads(32)->Setup(DoSetup)->Teardown(DoTeardown);
+
+```
+
+In this example, `DoSetup` and `DoTeardown` will be invoked 4 times each,
+specifically, once for each member of this family:
+ - BM_func_Arg_1_Threads_16, BM_func_Arg_1_Threads_32
+ - BM_func_Arg_3_Threads_16, BM_func_Arg_3_Threads_32
+
+<a name="passing-arguments" />
+
+## Passing Arguments
+
+Sometimes a family of benchmarks can be implemented with just one routine that
+takes an extra argument to specify which one of the family of benchmarks to
+run. For example, the following code defines a family of benchmarks for
+measuring the speed of `memcpy()` calls of different lengths:
+
+```c++
+static void BM_memcpy(benchmark::State& state) {
+ char* src = new char[state.range(0)];
+ char* dst = new char[state.range(0)];
+ memset(src, 'x', state.range(0));
+ for (auto _ : state)
+ memcpy(dst, src, state.range(0));
+ state.SetBytesProcessed(int64_t(state.iterations()) *
+ int64_t(state.range(0)));
+ delete[] src;
+ delete[] dst;
+}
+BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(8<<10);
+```
+
+The preceding code is quite repetitive, and can be replaced with the following
+short-hand. The following invocation will pick a few appropriate arguments in
+the specified range and will generate a benchmark for each such argument.
+
+```c++
+BENCHMARK(BM_memcpy)->Range(8, 8<<10);
+```
+
+By default the arguments in the range are generated in multiples of eight and
+the command above selects [ 8, 64, 512, 4k, 8k ]. In the following code the
+range multiplier is changed to multiples of two.
+
+```c++
+BENCHMARK(BM_memcpy)->RangeMultiplier(2)->Range(8, 8<<10);
+```
+
+Now arguments generated are [ 8, 16, 32, 64, 128, 256, 512, 1024, 2k, 4k, 8k ].
+
+The preceding code shows a method of defining a sparse range. The following
+example shows a method of defining a dense range. It is then used to benchmark
+the performance of `std::vector` initialization for uniformly increasing sizes.
+
+```c++
+static void BM_DenseRange(benchmark::State& state) {
+ for(auto _ : state) {
+ std::vector<int> v(state.range(0), state.range(0));
+ auto data = v.data();
+ benchmark::DoNotOptimize(data);
+ benchmark::ClobberMemory();
+ }
+}
+BENCHMARK(BM_DenseRange)->DenseRange(0, 1024, 128);
+```
+
+Now arguments generated are [ 0, 128, 256, 384, 512, 640, 768, 896, 1024 ].
+
+You might have a benchmark that depends on two or more inputs. For example, the
+following code defines a family of benchmarks for measuring the speed of set
+insertion.
+
+```c++
+static void BM_SetInsert(benchmark::State& state) {
+ std::set<int> data;
+ for (auto _ : state) {
+ state.PauseTiming();
+ data = ConstructRandomSet(state.range(0));
+ state.ResumeTiming();
+ for (int j = 0; j < state.range(1); ++j)
+ data.insert(RandomNumber());
+ }
+}
+BENCHMARK(BM_SetInsert)
+ ->Args({1<<10, 128})
+ ->Args({2<<10, 128})
+ ->Args({4<<10, 128})
+ ->Args({8<<10, 128})
+ ->Args({1<<10, 512})
+ ->Args({2<<10, 512})
+ ->Args({4<<10, 512})
+ ->Args({8<<10, 512});
+```
+
+The preceding code is quite repetitive, and can be replaced with the following
+short-hand. The following macro will pick a few appropriate arguments in the
+product of the two specified ranges and will generate a benchmark for each such
+pair.
+
+<!-- {% raw %} -->
+```c++
+BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}});
+```
+<!-- {% endraw %} -->
+
+Some benchmarks may require specific argument values that cannot be expressed
+with `Ranges`. In this case, `ArgsProduct` offers the ability to generate a
+benchmark input for each combination in the product of the supplied vectors.
+
+<!-- {% raw %} -->
+```c++
+BENCHMARK(BM_SetInsert)
+ ->ArgsProduct({{1<<10, 3<<10, 8<<10}, {20, 40, 60, 80}})
+// would generate the same benchmark arguments as
+BENCHMARK(BM_SetInsert)
+ ->Args({1<<10, 20})
+ ->Args({3<<10, 20})
+ ->Args({8<<10, 20})
+ ->Args({3<<10, 40})
+ ->Args({8<<10, 40})
+ ->Args({1<<10, 40})
+ ->Args({1<<10, 60})
+ ->Args({3<<10, 60})
+ ->Args({8<<10, 60})
+ ->Args({1<<10, 80})
+ ->Args({3<<10, 80})
+ ->Args({8<<10, 80});
+```
+<!-- {% endraw %} -->
+
+For the most common scenarios, helper methods for creating a list of
+integers for a given sparse or dense range are provided.
+
+```c++
+BENCHMARK(BM_SetInsert)
+ ->ArgsProduct({
+ benchmark::CreateRange(8, 128, /*multi=*/2),
+ benchmark::CreateDenseRange(1, 4, /*step=*/1)
+ })
+// would generate the same benchmark arguments as
+BENCHMARK(BM_SetInsert)
+ ->ArgsProduct({
+ {8, 16, 32, 64, 128},
+ {1, 2, 3, 4}
+ });
+```
+
+For more complex patterns of inputs, passing a custom function to `Apply` allows
+programmatic specification of an arbitrary set of arguments on which to run the
+benchmark. The following example enumerates a dense range on one parameter,
+and a sparse range on the second.
+
+```c++
+static void CustomArguments(benchmark::internal::Benchmark* b) {
+ for (int i = 0; i <= 10; ++i)
+ for (int j = 32; j <= 1024*1024; j *= 8)
+ b->Args({i, j});
+}
+BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
+```
+
+### Passing Arbitrary Arguments to a Benchmark
+
+In C++11 it is possible to define a benchmark that takes an arbitrary number
+of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
+macro creates a benchmark that invokes `func` with the `benchmark::State` as
+the first argument followed by the specified `args...`.
+The `test_case_name` is appended to the name of the benchmark and
+should describe the values passed.
+
+```c++
+template <class ...Args>
+void BM_takes_args(benchmark::State& state, Args&&... args) {
+ auto args_tuple = std::make_tuple(std::move(args)...);
+ for (auto _ : state) {
+ std::cout << std::get<0>(args_tuple) << ": " << std::get<1>(args_tuple)
+ << '\n';
+ [...]
+ }
+}
+// Registers a benchmark named "BM_takes_args/int_string_test" that passes
+// the specified values to `args`.
+BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
+
+// Registers the same benchmark "BM_takes_args/int_test" that passes
+// the specified values to `args`.
+BENCHMARK_CAPTURE(BM_takes_args, int_test, 42, 43);
+```
+
+Note that elements of `...args` may refer to global variables. Users should
+avoid modifying global state inside of a benchmark.
+
+<a name="asymptotic-complexity" />
+
+## Calculating Asymptotic Complexity (Big O)
+
+Asymptotic complexity might be calculated for a family of benchmarks. The
+following code will calculate the coefficient for the high-order term in the
+running time and the normalized root-mean square error of string comparison.
+
+```c++
+static void BM_StringCompare(benchmark::State& state) {
+ std::string s1(state.range(0), '-');
+ std::string s2(state.range(0), '-');
+ for (auto _ : state) {
+ auto comparison_result = s1.compare(s2);
+ benchmark::DoNotOptimize(comparison_result);
+ }
+ state.SetComplexityN(state.range(0));
+}
+BENCHMARK(BM_StringCompare)
+ ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(benchmark::oN);
+```
+
+As shown in the following invocation, asymptotic complexity might also be
+calculated automatically.
+
+```c++
+BENCHMARK(BM_StringCompare)
+ ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity();
+```
+
+The following code will specify asymptotic complexity with a lambda function,
+that might be used to customize high-order term calculation.
+
+```c++
+BENCHMARK(BM_StringCompare)->RangeMultiplier(2)
+ ->Range(1<<10, 1<<18)->Complexity([](benchmark::IterationCount n)->double{return n; });
+```
+
+<a name="custom-benchmark-name" />
+
+## Custom Benchmark Name
+
+You can change the benchmark's name as follows:
+
+```c++
+BENCHMARK(BM_memcpy)->Name("memcpy")->RangeMultiplier(2)->Range(8, 8<<10);
+```
+
+The invocation will execute the benchmark as before using `BM_memcpy` but changes
+the prefix in the report to `memcpy`.
+
+<a name="templated-benchmarks" />
+
+## Templated Benchmarks
+
+This example produces and consumes messages of size `sizeof(v)`, `state.range(0)`
+times per iteration. It also outputs throughput in the absence of multiprogramming.
+
+```c++
+template <class Q> void BM_Sequential(benchmark::State& state) {
+ Q q;
+ typename Q::value_type v;
+ for (auto _ : state) {
+ for (int i = state.range(0); i--; )
+ q.push(v);
+ for (int e = state.range(0); e--; )
+ q.Wait(&v);
+ }
+ // actually messages, not bytes:
+ state.SetBytesProcessed(
+ static_cast<int64_t>(state.iterations())*state.range(0));
+}
+// C++03
+BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
+
+// C++11 or newer, you can use the BENCHMARK macro with template parameters:
+BENCHMARK(BM_Sequential<WaitQueue<int>>)->Range(1<<0, 1<<10);
+
+```
+
+Three macros are provided for adding benchmark templates.
+
+```c++
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK(func<...>) // Takes any number of parameters.
+#else // C++ < C++11
+#define BENCHMARK_TEMPLATE(func, arg1)
+#endif
+#define BENCHMARK_TEMPLATE1(func, arg1)
+#define BENCHMARK_TEMPLATE2(func, arg1, arg2)
+```
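+
+For instance, a sketch of how the two-argument form might be used (the
+`BM_Sequential2` benchmark here is hypothetical):
+
+```c++
+template <class Q, class V>
+void BM_Sequential2(benchmark::State& state) { /* ... */ }
+
+// Instantiates and registers BM_Sequential2<WaitQueue<int>, int>.
+BENCHMARK_TEMPLATE2(BM_Sequential2, WaitQueue<int>, int);
+```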
+
+<a name="fixtures" />
+
+## Fixtures
+
+Fixture tests are created by first defining a type that derives from
+`::benchmark::Fixture` and then creating/registering the tests using the
+following macros:
+
+* `BENCHMARK_F(ClassName, Method)`
+* `BENCHMARK_DEFINE_F(ClassName, Method)`
+* `BENCHMARK_REGISTER_F(ClassName, Method)`
+
+For Example:
+
+```c++
+class MyFixture : public benchmark::Fixture {
+public:
+ void SetUp(const ::benchmark::State& state) {
+ }
+
+ void TearDown(const ::benchmark::State& state) {
+ }
+};
+
+BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
+ for (auto _ : st) {
+ ...
+ }
+}
+
+BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
+ for (auto _ : st) {
+ ...
+ }
+}
+/* BarTest is NOT registered */
+BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
+/* BarTest is now registered */
+```
+
+### Templated Fixtures
+
+You can also create templated fixtures by using the following macros:
+
+* `BENCHMARK_TEMPLATE_F(ClassName, Method, ...)`
+* `BENCHMARK_TEMPLATE_DEFINE_F(ClassName, Method, ...)`
+
+For example:
+
+```c++
+template<typename T>
+class MyFixture : public benchmark::Fixture {};
+
+BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) {
+ for (auto _ : st) {
+ ...
+ }
+}
+
+BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) {
+ for (auto _ : st) {
+ ...
+ }
+}
+
+BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2);
+```
+
+<a name="custom-counters" />
+
+## Custom Counters
+
+You can add your own counters with user-defined names. The example below
+will add columns "Foo", "Bar" and "Baz" to its output:
+
+```c++
+static void UserCountersExample1(benchmark::State& state) {
+ double numFoos = 0, numBars = 0, numBazs = 0;
+ for (auto _ : state) {
+ // ... count Foo,Bar,Baz events
+ }
+ state.counters["Foo"] = numFoos;
+ state.counters["Bar"] = numBars;
+ state.counters["Baz"] = numBazs;
+}
+```
+
+The `state.counters` object is a `std::map` with `std::string` keys
+and `Counter` values. The latter is a `double`-like class, thanks to an
+implicit conversion to `double&`, so you can use all of the standard arithmetic
+assignment operators (`=`, `+=`, `-=`, `*=`, `/=`) to change the value of each
+counter.
+
+In multithreaded benchmarks, each counter is set on the calling thread only.
+When the benchmark finishes, the counters from each thread will be summed;
+the resulting sum is the value which will be shown for the benchmark.
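+
+As a small illustrative sketch (the loop body below is just a placeholder for
+real per-thread work), each thread records its own count and the reported
+`Items` value is the sum across all threads:
+
+```c++
+static void BM_CountItems(benchmark::State& state) {
+  double items = 0;
+  for (auto _ : state) {
+    // Placeholder for real work; each iteration "processes" one item.
+    benchmark::DoNotOptimize(items += 1);
+  }
+  // Set on the calling thread only; the report shows the per-thread sum.
+  state.counters["Items"] = items;
+}
+BENCHMARK(BM_CountItems)->Threads(4);
+```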
+
+The `Counter` constructor accepts three parameters: the value as a `double`;
+a set of bit flags which allow you to show counters as rates, as per-thread
+averages, as per-iteration averages, as iteration invariants, and/or to invert
+the final result; and a flag specifying the 'unit', i.e. whether 1k means 1000
+(the default, `benchmark::Counter::OneK::kIs1000`) or 1024
+(`benchmark::Counter::OneK::kIs1024`).
+
+```c++
+ // sets a simple counter
+ state.counters["Foo"] = numFoos;
+
+ // Set the counter as a rate. It will be presented divided
+ // by the duration of the benchmark.
+ // Meaning: per one second, how many 'foo's are processed?
+ state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate);
+
+ // Set the counter as a rate. It will be presented divided
+ // by the duration of the benchmark, and the result inverted.
+  // Meaning: how many seconds does it take to process one 'foo'?
+ state.counters["FooInvRate"] = Counter(numFoos, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
+
+ // Set the counter as a thread-average quantity. It will
+ // be presented divided by the number of threads.
+ state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads);
+
+ // There's also a combined flag:
+  state.counters["FooAvgRate"] = Counter(numFoos, benchmark::Counter::kAvgThreadsRate);
+
+ // This says that we process with the rate of state.range(0) bytes every iteration:
+ state.counters["BytesProcessed"] = Counter(state.range(0), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
+```
+
+When you're compiling in C++11 mode or later you can use `insert()` with
+`std::initializer_list`:
+
+<!-- {% raw %} -->
+```c++
+ // With C++11, this can be done:
+ state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
+ // ... instead of:
+ state.counters["Foo"] = numFoos;
+ state.counters["Bar"] = numBars;
+ state.counters["Baz"] = numBazs;
+```
+<!-- {% endraw %} -->
+
+### Counter Reporting
+
+When using the console reporter, by default, user counters are printed at
+the end after the table, the same way as ``bytes_processed`` and
+``items_processed``. This is best for cases in which there are few counters,
+or where there are only a couple of lines per benchmark. Here's an example of
+the default output:
+
+```
+------------------------------------------------------------------------------
+Benchmark Time CPU Iterations UserCounters...
+------------------------------------------------------------------------------
+BM_UserCounter/threads:8 2248 ns 10277 ns 68808 Bar=16 Bat=40 Baz=24 Foo=8
+BM_UserCounter/threads:1 9797 ns 9788 ns 71523 Bar=2 Bat=5 Baz=3 Foo=1024m
+BM_UserCounter/threads:2 4924 ns 9842 ns 71036 Bar=4 Bat=10 Baz=6 Foo=2
+BM_UserCounter/threads:4 2589 ns 10284 ns 68012 Bar=8 Bat=20 Baz=12 Foo=4
+BM_UserCounter/threads:8 2212 ns 10287 ns 68040 Bar=16 Bat=40 Baz=24 Foo=8
+BM_UserCounter/threads:16 1782 ns 10278 ns 68144 Bar=32 Bat=80 Baz=48 Foo=16
+BM_UserCounter/threads:32 1291 ns 10296 ns 68256 Bar=64 Bat=160 Baz=96 Foo=32
+BM_UserCounter/threads:4 2615 ns 10307 ns 68040 Bar=8 Bat=20 Baz=12 Foo=4
+BM_Factorial 26 ns 26 ns 26608979 40320
+BM_Factorial/real_time 26 ns 26 ns 26587936 40320
+BM_CalculatePiRange/1 16 ns 16 ns 45704255 0
+BM_CalculatePiRange/8 73 ns 73 ns 9520927 3.28374
+BM_CalculatePiRange/64 609 ns 609 ns 1140647 3.15746
+BM_CalculatePiRange/512 4900 ns 4901 ns 142696 3.14355
+```
+
+If this doesn't suit you, you can print each counter as a table column by
+passing the flag `--benchmark_counters_tabular=true` to the benchmark
+application. This is best for cases in which there are a lot of counters, or
+a lot of lines per individual benchmark. Note that this will trigger a
+reprinting of the table header any time the counter set changes between
+individual benchmarks. Here's an example of corresponding output when
+`--benchmark_counters_tabular=true` is passed:
+
+```
+---------------------------------------------------------------------------------------
+Benchmark Time CPU Iterations Bar Bat Baz Foo
+---------------------------------------------------------------------------------------
+BM_UserCounter/threads:8 2198 ns 9953 ns 70688 16 40 24 8
+BM_UserCounter/threads:1 9504 ns 9504 ns 73787 2 5 3 1
+BM_UserCounter/threads:2 4775 ns 9550 ns 72606 4 10 6 2
+BM_UserCounter/threads:4 2508 ns 9951 ns 70332 8 20 12 4
+BM_UserCounter/threads:8 2055 ns 9933 ns 70344 16 40 24 8
+BM_UserCounter/threads:16 1610 ns 9946 ns 70720 32 80 48 16
+BM_UserCounter/threads:32 1192 ns 9948 ns 70496 64 160 96 32
+BM_UserCounter/threads:4 2506 ns 9949 ns 70332 8 20 12 4
+--------------------------------------------------------------
+Benchmark Time CPU Iterations
+--------------------------------------------------------------
+BM_Factorial 26 ns 26 ns 26392245 40320
+BM_Factorial/real_time 26 ns 26 ns 26494107 40320
+BM_CalculatePiRange/1 15 ns 15 ns 45571597 0
+BM_CalculatePiRange/8 74 ns 74 ns 9450212 3.28374
+BM_CalculatePiRange/64 595 ns 595 ns 1173901 3.15746
+BM_CalculatePiRange/512 4752 ns 4752 ns 147380 3.14355
+BM_CalculatePiRange/4k 37970 ns 37972 ns 18453 3.14184
+BM_CalculatePiRange/32k 303733 ns 303744 ns 2305 3.14162
+BM_CalculatePiRange/256k 2434095 ns 2434186 ns 288 3.1416
+BM_CalculatePiRange/1024k 9721140 ns 9721413 ns 71 3.14159
+BM_CalculatePi/threads:8 2255 ns 9943 ns 70936
+```
+
+Note above the additional header printed when the benchmark changes from
+``BM_UserCounter`` to ``BM_Factorial``. This is because ``BM_Factorial`` does
+not have the same counter set as ``BM_UserCounter``.
+
+<a name="multithreaded-benchmarks"/>
+
+## Multithreaded Benchmarks
+
+In a multithreaded test (benchmark invoked by multiple threads simultaneously),
+it is guaranteed that none of the threads will start until all have reached
+the start of the benchmark loop, and all will have finished before any thread
+exits the benchmark loop. (This behavior is also provided by the `KeepRunning()`
+API.) As such, any global setup or teardown can be wrapped in a check against
+the thread index:
+
+```c++
+static void BM_MultiThreaded(benchmark::State& state) {
+ if (state.thread_index() == 0) {
+ // Setup code here.
+ }
+ for (auto _ : state) {
+ // Run the test as normal.
+ }
+ if (state.thread_index() == 0) {
+ // Teardown code here.
+ }
+}
+BENCHMARK(BM_MultiThreaded)->Threads(2);
+```
+
+To run the benchmark across a range of thread counts, instead of `Threads`, use
+`ThreadRange`. This takes two parameters (`min_threads` and `max_threads`) and
+runs the benchmark once for each power of two in the inclusive range. For example:
+
+```c++
+BENCHMARK(BM_MultiThreaded)->ThreadRange(1, 8);
+```
+
+will run `BM_MultiThreaded` with thread counts 1, 2, 4, and 8.
+
+If the benchmarked code itself uses threads and you want to compare it to
+single-threaded code, you may want to use real-time ("wallclock") measurements
+for latency comparisons:
+
+```c++
+BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();
+```
+
+Without `UseRealTime`, CPU time is used by default.
+
+<a name="cpu-timers" />
+
+## CPU Timers
+
+By default, the CPU timer only measures the time spent by the main thread.
+If the benchmark itself uses threads internally, this measurement may not
+be what you are looking for. Instead, there is a way to measure the total
+CPU usage of the process, by all the threads.
+
+```c++
+void callee(int i);
+
+static void MyMain(int size) {
+#pragma omp parallel for
+ for(int i = 0; i < size; i++)
+ callee(i);
+}
+
+static void BM_OpenMP(benchmark::State& state) {
+ for (auto _ : state)
+ MyMain(state.range(0));
+}
+
+// Measure the time spent by the main thread, and use it to decide for how long
+// to run the benchmark loop. Depending on internal implementation details, this
+// may measure anywhere from near-zero (just the overhead spent before/after the
+// work is handed off to the worker thread[s]) to the whole single-thread time.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10);
+
+// Measure the user-visible time, the wall clock (literally, the time that
+// has passed on the clock on the wall), and use it to decide for how long to
+// run the benchmark loop. This will always be meaningful, and will match the
+// time spent by the main thread in the single-threaded case, in general
+// decreasing with the number of internal threads doing the work.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->UseRealTime();
+
+// Measure the total CPU consumption, and use it to decide for how long to
+// run the benchmark loop. This will always measure at least as much as the
+// time spent by the main thread in the single-threaded case.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime();
+
+// A mixture of the last two. Measure the total CPU consumption, but use the
+// wall clock to decide for how long to run the benchmark loop.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime()->UseRealTime();
+```
+
+### Controlling Timers
+
+Normally, the entire duration of the work loop (`for (auto _ : state) {}`)
+is measured. But sometimes it is necessary to do some work inside that loop,
+every iteration, without counting that time toward the benchmark time.
+That is possible, although it is not recommended, since it has high overhead.
+
+<!-- {% raw %} -->
+```c++
+static void BM_SetInsert_With_Timer_Control(benchmark::State& state) {
+ std::set<int> data;
+ for (auto _ : state) {
+ state.PauseTiming(); // Stop timers. They will not count until they are resumed.
+ data = ConstructRandomSet(state.range(0)); // Do something that should not be measured
+ state.ResumeTiming(); // And resume timers. They are now counting again.
+ // The rest will be measured.
+ for (int j = 0; j < state.range(1); ++j)
+ data.insert(RandomNumber());
+ }
+}
+BENCHMARK(BM_SetInsert_With_Timer_Control)->Ranges({{1<<10, 8<<10}, {128, 512}});
+```
+<!-- {% endraw %} -->
+
+<a name="manual-timing" />
+
+## Manual Timing
+
+For benchmarking something for which neither CPU time nor real time is
+correct or accurate enough, completely manual timing is supported using
+the `UseManualTime` function.
+
+When `UseManualTime` is used, the benchmarked code must call
+`SetIterationTime` once per iteration of the benchmark loop to
+report the manually measured time.
+
+An example use case for this is benchmarking GPU execution (e.g. OpenCL
+or CUDA kernels, OpenGL or Vulkan or Direct3D draw calls), which cannot
+be accurately measured using CPU time or real-time. Instead, they can be
+measured accurately using a dedicated API, and these measurement results
+can be reported back with `SetIterationTime`.
+
+```c++
+static void BM_ManualTiming(benchmark::State& state) {
+ int microseconds = state.range(0);
+ std::chrono::duration<double, std::micro> sleep_duration {
+ static_cast<double>(microseconds)
+ };
+
+ for (auto _ : state) {
+ auto start = std::chrono::high_resolution_clock::now();
+ // Simulate some useful workload with a sleep
+ std::this_thread::sleep_for(sleep_duration);
+ auto end = std::chrono::high_resolution_clock::now();
+
+ auto elapsed_seconds =
+ std::chrono::duration_cast<std::chrono::duration<double>>(
+ end - start);
+
+ state.SetIterationTime(elapsed_seconds.count());
+ }
+}
+BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime();
+```
+
+<a name="setting-the-time-unit" />
+
+## Setting the Time Unit
+
+If a benchmark runs for a few milliseconds, it may be hard to visually compare
+the measured times, since the output data is given in nanoseconds by default.
+To set the time unit for a benchmark, specify it explicitly:
+
+```c++
+BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
+```
+
+Additionally the default time unit can be set globally with the
+`--benchmark_time_unit={ns|us|ms|s}` command line argument. The argument only
+affects benchmarks where the time unit is not set explicitly.
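+
+The default can also be changed programmatically; a minimal sketch, assuming the
+`benchmark::SetDefaultTimeUnit()` entry point declared in `benchmark.h` (it has
+to be called before the benchmarks run to take effect):
+
+```c++
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+  // Applies to every benchmark that does not set its own Unit().
+  benchmark::SetDefaultTimeUnit(benchmark::kMillisecond);
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  return 0;
+}
+```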
+
+<a name="preventing-optimization" />
+
+## Preventing Optimization
+
+To prevent a value or expression from being optimized away by the compiler
+the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()`
+functions can be used.
+
+```c++
+static void BM_test(benchmark::State& state) {
+ for (auto _ : state) {
+ int x = 0;
+ for (int i=0; i < 64; ++i) {
+ benchmark::DoNotOptimize(x += i);
+ }
+ }
+}
+```
+
+`DoNotOptimize(<expr>)` forces the *result* of `<expr>` to be stored in either
+memory or a register. For GNU-based compilers it acts as a read/write barrier
+for global memory. More specifically, it forces the compiler to flush pending
+writes to memory and reload any other values as necessary.
+
+Note that `DoNotOptimize(<expr>)` does not prevent optimizations on `<expr>`
+in any way. `<expr>` may even be removed entirely when the result is already
+known. For example:
+
+```c++
+ /* Example 1: `<expr>` is removed entirely. */
+ int foo(int x) { return x + 42; }
+ while (...) DoNotOptimize(foo(0)); // Optimized to DoNotOptimize(42);
+
+ /* Example 2: Result of '<expr>' is only reused */
+ int bar(int) __attribute__((const));
+ while (...) DoNotOptimize(bar(0)); // Optimized to:
+ // int __result__ = bar(0);
+ // while (...) DoNotOptimize(__result__);
+```
+
+The second tool for preventing optimizations is `ClobberMemory()`. In essence
+`ClobberMemory()` forces the compiler to perform all pending writes to global
+memory. Memory managed by block scope objects must be "escaped" using
+`DoNotOptimize(...)` before it can be clobbered. In the below example
+`ClobberMemory()` prevents the call to `v.push_back(42)` from being optimized
+away.
+
+```c++
+static void BM_vector_push_back(benchmark::State& state) {
+ for (auto _ : state) {
+ std::vector<int> v;
+ v.reserve(1);
+ auto data = v.data(); // Allow v.data() to be clobbered. Pass as non-const
+ benchmark::DoNotOptimize(data); // lvalue to avoid undesired compiler optimizations
+ v.push_back(42);
+ benchmark::ClobberMemory(); // Force 42 to be written to memory.
+ }
+}
+```
+
+Note that `ClobberMemory()` is only available for GNU- or MSVC-based compilers.
+
+<a name="reporting-statistics" />
+
+## Statistics: Reporting the Mean, Median and Standard Deviation / Coefficient of Variation of Repeated Benchmarks
+
+By default each benchmark is run once and that single result is reported.
+However, benchmarks are often noisy and a single result may not be
+representative of the overall behavior. For this reason it's possible to
+repeatedly rerun the benchmark.
+
+The number of runs of each benchmark is specified globally by the
+`--benchmark_repetitions` flag or on a per benchmark basis by calling
+`Repetitions` on the registered benchmark object. When a benchmark is run more
+than once the mean, median, standard deviation and coefficient of variation
+of the runs will be reported.
+
+Additionally the `--benchmark_report_aggregates_only={true|false}` and
+`--benchmark_display_aggregates_only={true|false}` flags, or the
+`ReportAggregatesOnly(bool)` and `DisplayAggregatesOnly(bool)` functions, can be
+used to change how repeated tests are reported. By default the result of each
+repeated run is reported. When the `report aggregates only` option is `true`,
+only the aggregates (i.e. mean, median, standard deviation and coefficient
+of variation, plus complexity measurements if they were requested) of the runs
+are reported, to both reporters: standard output (console) and the file.
+However, when only the `display aggregates only` option is `true`,
+only the aggregates are displayed on standard output, while the file
+output still contains everything.
+Calling `ReportAggregatesOnly(bool)` / `DisplayAggregatesOnly(bool)` on a
+registered benchmark object overrides the value of the appropriate flag for that
+benchmark.
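+
+For instance, a short sketch that repeats a benchmark ten times and reports only
+the aggregates (reusing the `BM_StringCompare` benchmark from the complexity
+example above) might look like:
+
+```c++
+BENCHMARK(BM_StringCompare)
+    ->RangeMultiplier(2)->Range(1<<10, 1<<18)
+    ->Repetitions(10)
+    ->ReportAggregatesOnly(true);
+```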
+
+<a name="custom-statistics" />
+
+## Custom Statistics
+
+While having these aggregates is nice, they may not be enough for everyone.
+For example, you may want to know what the largest observation is, e.g. because
+you have some real-time constraints. This is easy. The following code
+specifies a custom statistic to be calculated, defined by a lambda function.
+
+```c++
+void BM_spin_empty(benchmark::State& state) {
+ for (auto _ : state) {
+ for (int x = 0; x < state.range(0); ++x) {
+ benchmark::DoNotOptimize(x);
+ }
+ }
+}
+
+BENCHMARK(BM_spin_empty)
+ ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
+ return *(std::max_element(std::begin(v), std::end(v)));
+ })
+ ->Arg(512);
+```
+
+While usually the statistics produce values in time units,
+you can also produce percentages:
+
+```c++
+void BM_spin_empty(benchmark::State& state) {
+ for (auto _ : state) {
+ for (int x = 0; x < state.range(0); ++x) {
+ benchmark::DoNotOptimize(x);
+ }
+ }
+}
+
+BENCHMARK(BM_spin_empty)
+ ->ComputeStatistics("ratio", [](const std::vector<double>& v) -> double {
+    // e.g. report the ratio of the fastest to the slowest repetition
+    return *std::min_element(std::begin(v), std::end(v)) /
+           *std::max_element(std::begin(v), std::end(v));
+ }, benchmark::StatisticUnit::kPercentage)
+ ->Arg(512);
+```
+
+<a name="memory-usage" />
+
+## Memory Usage
+
+It's often useful to also track memory usage for benchmarks, alongside CPU
+performance. For this reason, the library offers the `RegisterMemoryManager`
+function, which allows a custom `MemoryManager` to be injected.
+
+If set, the `MemoryManager::Start` and `MemoryManager::Stop` methods will be
+called at the start and end of benchmark runs to allow user code to fill out
+a report on the number of allocations, bytes used, etc.
+
+This data will then be reported alongside other performance data, currently
+only when using JSON output.
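+
+A minimal sketch of such a hook, assuming the `MemoryManager` interface declared
+in `benchmark.h` (the metrics reported below are placeholders; a real
+implementation would collect them from your allocator):
+
+```c++
+class SimpleMemoryManager : public benchmark::MemoryManager {
+ public:
+  void Start() override {
+    // Begin recording allocation information here, e.g. reset allocator counters.
+  }
+  void Stop(Result& result) override {
+    // Fill out whichever metrics are available; the rest keep their defaults.
+    result.num_allocs = 0;
+    result.max_bytes_used = 0;
+  }
+};
+
+int main(int argc, char** argv) {
+  SimpleMemoryManager mm;
+  benchmark::RegisterMemoryManager(&mm);
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  return 0;
+}
+```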
+
+<a name="using-register-benchmark" />
+
+## Using RegisterBenchmark(name, fn, args...)
+
+The `RegisterBenchmark(name, func, args...)` function provides an alternative
+way to create and register benchmarks.
+`RegisterBenchmark(name, func, args...)` creates, registers, and returns a
+pointer to a new benchmark with the specified `name` that invokes
+`func(st, args...)` where `st` is a `benchmark::State` object.
+
+Unlike the `BENCHMARK` registration macros, which can only be used at global
+scope, `RegisterBenchmark` can be called anywhere. This allows benchmarks
+to be registered programmatically.
+
+Additionally, `RegisterBenchmark` allows any callable object to be registered
+as a benchmark, including capturing lambdas and function objects.
+
+For Example:
+```c++
+auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ };
+
+int main(int argc, char** argv) {
+ for (auto& test_input : { /* ... */ })
+ benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input);
+ benchmark::Initialize(&argc, argv);
+ benchmark::RunSpecifiedBenchmarks();
+ benchmark::Shutdown();
+}
+```
+
+<a name="exiting-with-an-error" />
+
+## Exiting with an Error
+
+When errors caused by external influences, such as file I/O and network
+communication, occur within a benchmark, the
+`State::SkipWithError(const std::string& msg)` function can be used to skip that
+run of the benchmark and report the error. Note that only future iterations of
+the `KeepRunning()` loop are skipped. For the ranged-for version of the benchmark
+loop, users must explicitly exit the loop, otherwise all iterations will be
+performed. Users may explicitly `return` to exit the benchmark immediately.
+
+The `SkipWithError(...)` function may be used at any point within the benchmark,
+including before and after the benchmark loop. Moreover, if `SkipWithError(...)`
+has been used, it is not required to reach the benchmark loop and one may return
+from the benchmark function early.
+
+For example:
+
+```c++
+static void BM_test(benchmark::State& state) {
+ auto resource = GetResource();
+ if (!resource.good()) {
+ state.SkipWithError("Resource is not good!");
+ // KeepRunning() loop will not be entered.
+ }
+ while (state.KeepRunning()) {
+ auto data = resource.read_data();
+ if (!resource.good()) {
+ state.SkipWithError("Failed to read data!");
+ break; // Needed to skip the rest of the iteration.
+ }
+ do_stuff(data);
+ }
+}
+
+static void BM_test_ranged_for(benchmark::State& state) {
+ auto resource = GetResource();
+ if (!resource.good()) {
+ state.SkipWithError("Resource is not good!");
+ return; // Early return is allowed when SkipWithError() has been used.
+ }
+ for (auto _ : state) {
+ auto data = resource.read_data();
+ if (!resource.good()) {
+ state.SkipWithError("Failed to read data!");
+ break; // REQUIRED to prevent all further iterations.
+ }
+ do_stuff(data);
+ }
+}
+```
+<a name="a-faster-keep-running-loop" />
+
+## A Faster KeepRunning Loop
+
+In C++11 mode, a range-based for loop should be used in preference to
+the `KeepRunning` loop for running the benchmarks. For example:
+
+```c++
+static void BM_Fast(benchmark::State &state) {
+ for (auto _ : state) {
+ FastOperation();
+ }
+}
+BENCHMARK(BM_Fast);
+```
+
+The reason the range-based for loop is faster than using `KeepRunning` is
+that `KeepRunning` requires a memory load and store of the iteration count
+every iteration, whereas the range-based variant is able to keep the iteration
+count in a register.
+
+For example, an empty inner loop using the range-based for method looks like:
+
+```asm
+# Loop Init
+ mov rbx, qword ptr [r14 + 104]
+ call benchmark::State::StartKeepRunning()
+ test rbx, rbx
+ je .LoopEnd
+.LoopHeader: # =>This Inner Loop Header: Depth=1
+ add rbx, -1
+ jne .LoopHeader
+.LoopEnd:
+```
+
+Compared to an empty `KeepRunning` loop, which looks like:
+
+```asm
+.LoopHeader: # in Loop: Header=BB0_3 Depth=1
+ cmp byte ptr [rbx], 1
+ jne .LoopInit
+.LoopBody: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rbx + 8]
+ lea rcx, [rax + 1]
+ mov qword ptr [rbx + 8], rcx
+ cmp rax, qword ptr [rbx + 104]
+ jb .LoopHeader
+ jmp .LoopEnd
+.LoopInit:
+ mov rdi, rbx
+ call benchmark::State::StartKeepRunning()
+ jmp .LoopBody
+.LoopEnd:
+```
+
+Unless C++03 compatibility is required, the range-based variant of writing
+the benchmark loop should be preferred.
+
+<a name="disabling-cpu-frequency-scaling" />
+
+## Disabling CPU Frequency Scaling
+
+If you see this error:
+
+```
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may
+be noisy and will incur extra overhead.
+```
+
+you might want to disable CPU frequency scaling while running the
+benchmark, as well as consider other ways to stabilize the performance of
+your system while benchmarking.
+
+See [Reducing Variance](reducing_variance.md) for more information.
diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h
index f57e3e7..e3857e7 100644
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@@ -34,7 +34,7 @@ static void BM_StringCopy(benchmark::State& state) {
BENCHMARK(BM_StringCopy);
// Augment the main() program to invoke benchmarks if specified
-// via the --benchmarks command line flag. E.g.,
+// via the --benchmark_filter command line flag. E.g.,
// my_unittest --benchmark_filter=all
// my_unittest --benchmark_filter=BM_StringCreation
// my_unittest --benchmark_filter=String
@@ -42,6 +42,7 @@ BENCHMARK(BM_StringCopy);
int main(int argc, char** argv) {
benchmark::Initialize(&argc, argv);
benchmark::RunSpecifiedBenchmarks();
+ benchmark::Shutdown();
return 0;
}
@@ -139,13 +140,13 @@ thread exits the loop body. As such, any global setup or teardown you want to
do can be wrapped in a check against the thread index:
static void BM_MultiThreaded(benchmark::State& state) {
- if (state.thread_index == 0) {
+ if (state.thread_index() == 0) {
// Setup code here.
}
for (auto _ : state) {
// Run the test as normal.
}
- if (state.thread_index == 0) {
+ if (state.thread_index() == 0) {
// Teardown code here.
}
}
@@ -167,19 +168,29 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
#define BENCHMARK_HAS_CXX11
#endif
+// This _MSC_VER check should detect VS 2017 v15.3 and newer.
+#if __cplusplus >= 201703L || \
+ (defined(_MSC_VER) && _MSC_VER >= 1911 && _MSVC_LANG >= 201703L)
+#define BENCHMARK_HAS_CXX17
+#endif
+
#include <stdint.h>
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iosfwd>
+#include <limits>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>
+#include "benchmark/export.h"
+
#if defined(BENCHMARK_HAS_CXX11)
+#include <atomic>
#include <initializer_list>
#include <type_traits>
#include <utility>
@@ -199,42 +210,63 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
TypeName& operator=(const TypeName&) = delete
#endif
-#if defined(__GNUC__)
+#ifdef BENCHMARK_HAS_CXX17
+#define BENCHMARK_UNUSED [[maybe_unused]]
+#elif defined(__GNUC__) || defined(__clang__)
#define BENCHMARK_UNUSED __attribute__((unused))
-#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
-#define BENCHMARK_NOEXCEPT noexcept
-#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
-#elif defined(_MSC_VER) && !defined(__clang__)
+#else
#define BENCHMARK_UNUSED
-#define BENCHMARK_ALWAYS_INLINE __forceinline
-#if _MSC_VER >= 1900
-#define BENCHMARK_NOEXCEPT noexcept
-#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#endif
+
+// Used to annotate functions, methods and classes so they
+// are not optimized by the compiler. Useful for tests
+// where you expect loops to stay in place churning cycles
+#if defined(__clang__)
+#define BENCHMARK_DONT_OPTIMIZE __attribute__((optnone))
+#elif defined(__GNUC__) || defined(__GNUG__)
+#define BENCHMARK_DONT_OPTIMIZE __attribute__((optimize(0)))
#else
-#define BENCHMARK_NOEXCEPT
-#define BENCHMARK_NOEXCEPT_OP(x)
+// MSVC & Intel do not have a no-optimize attribute, only line pragmas
+#define BENCHMARK_DONT_OPTIMIZE
#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
+#elif defined(_MSC_VER) && !defined(__clang__)
+#define BENCHMARK_ALWAYS_INLINE __forceinline
#define __func__ __FUNCTION__
#else
-#define BENCHMARK_UNUSED
#define BENCHMARK_ALWAYS_INLINE
-#define BENCHMARK_NOEXCEPT
-#define BENCHMARK_NOEXCEPT_OP(x)
#endif
#define BENCHMARK_INTERNAL_TOSTRING2(x) #x
#define BENCHMARK_INTERNAL_TOSTRING(x) BENCHMARK_INTERNAL_TOSTRING2(x)
-#if defined(__GNUC__) || defined(__clang__)
+// clang-format off
+#if (defined(__GNUC__) && !defined(__NVCC__) && !defined(__NVCOMPILER)) || defined(__clang__)
#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
+#define BENCHMARK_DISABLE_DEPRECATED_WARNING \
+ _Pragma("GCC diagnostic push") \
+ _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+#define BENCHMARK_RESTORE_DEPRECATED_WARNING _Pragma("GCC diagnostic pop")
+#elif defined(__NVCOMPILER)
+#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
+#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
+#define BENCHMARK_DISABLE_DEPRECATED_WARNING \
+ _Pragma("diagnostic push") \
+ _Pragma("diag_suppress deprecated_entity_with_custom_message")
+#define BENCHMARK_RESTORE_DEPRECATED_WARNING _Pragma("diagnostic pop")
#else
#define BENCHMARK_BUILTIN_EXPECT(x, y) x
#define BENCHMARK_DEPRECATED_MSG(msg)
#define BENCHMARK_WARNING_MSG(msg) \
__pragma(message(__FILE__ "(" BENCHMARK_INTERNAL_TOSTRING( \
__LINE__) ") : warning note: " msg))
+#define BENCHMARK_DISABLE_DEPRECATED_WARNING
+#define BENCHMARK_RESTORE_DEPRECATED_WARNING
#endif
+// clang-format on
#if defined(__GNUC__) && !defined(__clang__)
#define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
@@ -252,21 +284,60 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
#define BENCHMARK_UNREACHABLE() ((void)0)
#endif
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_OVERRIDE override
+#else
+#define BENCHMARK_OVERRIDE
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// C4251: <symbol> needs to have dll-interface to be used by clients of class
+#pragma warning(disable : 4251)
+#endif
+
namespace benchmark {
class BenchmarkReporter;
-class MemoryManager;
-void Initialize(int* argc, char** argv);
+// Default number of minimum benchmark running time in seconds.
+const char kDefaultMinTimeStr[] = "0.5s";
+
+BENCHMARK_EXPORT void PrintDefaultHelp();
+
+BENCHMARK_EXPORT void Initialize(int* argc, char** argv,
+ void (*HelperPrinterf)() = PrintDefaultHelp);
+BENCHMARK_EXPORT void Shutdown();
// Report to stdout all arguments in 'argv' as unrecognized except the first.
// Returns true there is at least on unrecognized argument (i.e. 'argc' > 1).
-bool ReportUnrecognizedArguments(int argc, char** argv);
+BENCHMARK_EXPORT bool ReportUnrecognizedArguments(int argc, char** argv);
+
+// Returns the current value of --benchmark_filter.
+BENCHMARK_EXPORT std::string GetBenchmarkFilter();
+
+// Sets a new value to --benchmark_filter. (This will override this flag's
+// current value).
+// Should be called after `benchmark::Initialize()`, as
+// `benchmark::Initialize()` will override the flag's value.
+BENCHMARK_EXPORT void SetBenchmarkFilter(std::string value);
+
+// Returns the current value of --v (command line value for verbosity).
+BENCHMARK_EXPORT int32_t GetBenchmarkVerbosity();
+
+// Creates a default display reporter. Used by the library when no display
+// reporter is provided, but also made available for external use in case a
+// custom reporter should respect the `--benchmark_format` flag as a fallback
+BENCHMARK_EXPORT BenchmarkReporter* CreateDefaultDisplayReporter();
// Generate a list of benchmarks matching the specified --benchmark_filter flag
// and if --benchmark_list_tests is specified return after printing the name
// of each matching benchmark. Otherwise run each matching benchmark and
// report the results.
//
+// spec : Specify the benchmarks to run. If users do not specify this arg,
+// then the value of FLAGS_benchmark_filter
+// will be used.
+//
// The second and third overload use the specified 'display_reporter' and
// 'file_reporter' respectively. 'file_reporter' will write to the file
// specified
@@ -274,28 +345,94 @@ bool ReportUnrecognizedArguments(int argc, char** argv);
// 'file_reporter' is ignored.
//
// RETURNS: The number of matching benchmarks.
-size_t RunSpecifiedBenchmarks();
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter);
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
- BenchmarkReporter* file_reporter);
+BENCHMARK_EXPORT size_t RunSpecifiedBenchmarks();
+BENCHMARK_EXPORT size_t RunSpecifiedBenchmarks(std::string spec);
+
+BENCHMARK_EXPORT size_t
+RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter);
+BENCHMARK_EXPORT size_t
+RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, std::string spec);
+
+BENCHMARK_EXPORT size_t RunSpecifiedBenchmarks(
+ BenchmarkReporter* display_reporter, BenchmarkReporter* file_reporter);
+BENCHMARK_EXPORT size_t
+RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+ BenchmarkReporter* file_reporter, std::string spec);
+
+// TimeUnit is passed to a benchmark in order to specify the order of magnitude
+// for the measured time.
+enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond, kSecond };
+
+BENCHMARK_EXPORT TimeUnit GetDefaultTimeUnit();
+
+// Sets the default time unit the benchmarks use
+// Has to be called before the benchmark loop to take effect
+BENCHMARK_EXPORT void SetDefaultTimeUnit(TimeUnit unit);
+
+// If a MemoryManager is registered (via RegisterMemoryManager()),
+// it can be used to collect and report allocation metrics for a run of the
+// benchmark.
+class MemoryManager {
+ public:
+ static const int64_t TombstoneValue;
+
+ struct Result {
+ Result()
+ : num_allocs(0),
+ max_bytes_used(0),
+ total_allocated_bytes(TombstoneValue),
+ net_heap_growth(TombstoneValue) {}
+
+ // The number of allocations made in total between Start and Stop.
+ int64_t num_allocs;
+
+ // The peak memory use between Start and Stop.
+ int64_t max_bytes_used;
+
+ // The total memory allocated, in bytes, between Start and Stop.
+ // Init'ed to TombstoneValue if metric not available.
+ int64_t total_allocated_bytes;
+
+ // The net changes in memory, in bytes, between Start and Stop.
+ // ie., total_allocated_bytes - total_deallocated_bytes.
+ // Init'ed to TombstoneValue if metric not available.
+ int64_t net_heap_growth;
+ };
+
+ virtual ~MemoryManager() {}
+
+ // Implement this to start recording allocation information.
+ virtual void Start() = 0;
+
+ // Implement this to stop recording and fill out the given Result structure.
+ virtual void Stop(Result& result) = 0;
+};
// Register a MemoryManager instance that will be used to collect and report
// allocation measurements for benchmark runs.
+BENCHMARK_EXPORT
void RegisterMemoryManager(MemoryManager* memory_manager);
+// Add a key-value pair to output as part of the context stanza in the report.
+BENCHMARK_EXPORT
+void AddCustomContext(const std::string& key, const std::string& value);
+
namespace internal {
class Benchmark;
class BenchmarkImp;
class BenchmarkFamilies;
+BENCHMARK_EXPORT std::map<std::string, std::string>*& GetGlobalContext();
+
+BENCHMARK_EXPORT
void UseCharPointer(char const volatile*);
// Take ownership of the pointer and register the benchmark. Return the
// registered benchmark.
-Benchmark* RegisterBenchmarkInternal(Benchmark*);
+BENCHMARK_EXPORT Benchmark* RegisterBenchmarkInternal(Benchmark*);
// Ensure that the standard streams are properly initialized in every TU.
-int InitializeStreams();
+BENCHMARK_EXPORT int InitializeStreams();
BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
} // namespace internal
@@ -305,12 +442,24 @@ BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
#define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
#endif
+// Force the compiler to flush pending writes to global memory. Acts as an
+// effective read/write barrier
+#ifdef BENCHMARK_HAS_CXX11
+inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
+ std::atomic_signal_fence(std::memory_order_acq_rel);
+}
+#endif
+
// The DoNotOptimize(...) function can be used to prevent a value or
// expression from being optimized away by the compiler. This function is
// intended to add little to no overhead.
// See: https://youtu.be/nXaxk27zwlk?t=2441
#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
+#if !defined(__GNUC__) || defined(__llvm__) || defined(__INTEL_COMPILER)
template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+ "The const-ref version of this method can permit "
+ "undesired compiler optimizations in benchmarks")
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
asm volatile("" : : "r,m"(value) : "memory");
}
@@ -324,25 +473,125 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
#endif
}
-// Force the compiler to flush pending writes to global memory. Acts as an
-// effective read/write barrier
+#ifdef BENCHMARK_HAS_CXX11
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) {
+#if defined(__clang__)
+ asm volatile("" : "+r,m"(value) : : "memory");
+#else
+ asm volatile("" : "+m,r"(value) : : "memory");
+#endif
+}
+#endif
+#elif defined(BENCHMARK_HAS_CXX11) && (__GNUC__ >= 5)
+// Workaround for a bug with full argument copy overhead with GCC.
+// See: #1340 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105519
+template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+ "The const-ref version of this method can permit "
+ "undesired compiler optimizations in benchmarks")
+inline BENCHMARK_ALWAYS_INLINE
+ typename std::enable_if<std::is_trivially_copyable<Tp>::value &&
+ (sizeof(Tp) <= sizeof(Tp*))>::type
+ DoNotOptimize(Tp const& value) {
+ asm volatile("" : : "r,m"(value) : "memory");
+}
+
+template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+ "The const-ref version of this method can permit "
+ "undesired compiler optimizations in benchmarks")
+inline BENCHMARK_ALWAYS_INLINE
+ typename std::enable_if<!std::is_trivially_copyable<Tp>::value ||
+ (sizeof(Tp) > sizeof(Tp*))>::type
+ DoNotOptimize(Tp const& value) {
+ asm volatile("" : : "m"(value) : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE
+ typename std::enable_if<std::is_trivially_copyable<Tp>::value &&
+ (sizeof(Tp) <= sizeof(Tp*))>::type
+ DoNotOptimize(Tp& value) {
+ asm volatile("" : "+m,r"(value) : : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE
+ typename std::enable_if<!std::is_trivially_copyable<Tp>::value ||
+ (sizeof(Tp) > sizeof(Tp*))>::type
+ DoNotOptimize(Tp& value) {
+ asm volatile("" : "+m"(value) : : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE
+ typename std::enable_if<std::is_trivially_copyable<Tp>::value &&
+ (sizeof(Tp) <= sizeof(Tp*))>::type
+ DoNotOptimize(Tp&& value) {
+ asm volatile("" : "+m,r"(value) : : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE
+ typename std::enable_if<!std::is_trivially_copyable<Tp>::value ||
+ (sizeof(Tp) > sizeof(Tp*))>::type
+ DoNotOptimize(Tp&& value) {
+ asm volatile("" : "+m"(value) : : "memory");
+}
+
+#else
+// Fallback for GCC < 5. Can add some overhead because the compiler is forced
+// to use memory operations instead of operations with registers.
+// TODO: Remove if GCC < 5 will be unsupported.
+template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+ "The const-ref version of this method can permit "
+ "undesired compiler optimizations in benchmarks")
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+ asm volatile("" : : "m"(value) : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
+ asm volatile("" : "+m"(value) : : "memory");
+}
+
+#ifdef BENCHMARK_HAS_CXX11
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) {
+ asm volatile("" : "+m"(value) : : "memory");
+}
+#endif
+#endif
+
+#ifndef BENCHMARK_HAS_CXX11
inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
asm volatile("" : : : "memory");
}
+#endif
#elif defined(_MSC_VER)
template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+ "The const-ref version of this method can permit "
+ "undesired compiler optimizations in benchmarks")
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
_ReadWriteBarrier();
}
+#ifndef BENCHMARK_HAS_CXX11
inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { _ReadWriteBarrier(); }
+#endif
#else
template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+ "The const-ref version of this method can permit "
+ "undesired compiler optimizations in benchmarks")
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
}
-// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
+// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers, before C++11.
#endif
// This class is used for user-defined counters.
@@ -352,27 +601,27 @@ class Counter {
kDefaults = 0,
// Mark the counter as a rate. It will be presented divided
// by the duration of the benchmark.
- kIsRate = 1U << 0U,
+ kIsRate = 1 << 0,
// Mark the counter as a thread-average quantity. It will be
// presented divided by the number of threads.
- kAvgThreads = 1U << 1U,
+ kAvgThreads = 1 << 1,
// Mark the counter as a thread-average rate. See above.
kAvgThreadsRate = kIsRate | kAvgThreads,
// Mark the counter as a constant value, valid/same for *every* iteration.
// When reporting, it will be *multiplied* by the iteration count.
- kIsIterationInvariant = 1U << 2U,
+ kIsIterationInvariant = 1 << 2,
// Mark the counter as a constant rate.
// When reporting, it will be *multiplied* by the iteration count
// and then divided by the duration of the benchmark.
kIsIterationInvariantRate = kIsRate | kIsIterationInvariant,
// Mark the counter as a iteration-average quantity.
// It will be presented divided by the number of iterations.
- kAvgIterations = 1U << 3U,
+ kAvgIterations = 1 << 3,
// Mark the counter as a iteration-average rate. See above.
kAvgIterationsRate = kIsRate | kAvgIterations,
// In the end, invert the result. This is always done last!
- kInvert = 1U << 31U
+ kInvert = 1 << 31
};
enum OneK {
@@ -390,7 +639,7 @@ class Counter {
Counter(double v = 0., Flags f = kDefaults, OneK k = kIs1000)
: value(v), flags(f), oneK(k) {}
- BENCHMARK_ALWAYS_INLINE operator double const&() const { return value; }
+ BENCHMARK_ALWAYS_INLINE operator double const &() const { return value; }
BENCHMARK_ALWAYS_INLINE operator double&() { return value; }
};
@@ -405,17 +654,15 @@ Counter::Flags inline operator|(const Counter::Flags& LHS,
// This is the container for the user-defined counters.
typedef std::map<std::string, Counter> UserCounters;
-// TimeUnit is passed to a benchmark in order to specify the order of magnitude
-// for the measured time.
-enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond, kSecond };
-
// BigO is passed to a benchmark in order to specify the asymptotic
// computational
// complexity for the benchmark. In case oAuto is selected, complexity will be
// calculated automatically to the best fit.
enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda };
-typedef uint64_t IterationCount;
+typedef int64_t IterationCount;
+
+enum StatisticUnit { kTime, kPercentage };
// BigOFunc is passed to a benchmark in order to specify the asymptotic
// computational complexity for the benchmark.
@@ -429,14 +676,17 @@ namespace internal {
struct Statistics {
std::string name_;
StatisticsFunc* compute_;
+ StatisticUnit unit_;
- Statistics(const std::string& name, StatisticsFunc* compute)
- : name_(name), compute_(compute) {}
+ Statistics(const std::string& name, StatisticsFunc* compute,
+ StatisticUnit unit = kTime)
+ : name_(name), compute_(compute), unit_(unit) {}
};
-struct BenchmarkInstance;
+class BenchmarkInstance;
class ThreadTimer;
class ThreadManager;
+class PerfCountersMeasurement;
enum AggregationReportMode
#if defined(BENCHMARK_HAS_CXX11)
@@ -458,11 +708,21 @@ enum AggregationReportMode
ARM_FileReportAggregatesOnly | ARM_DisplayReportAggregatesOnly
};
+enum Skipped
+#if defined(BENCHMARK_HAS_CXX11)
+ : unsigned
+#endif
+{
+ NotSkipped = 0,
+ SkippedWithMessage,
+ SkippedWithError
+};
+
} // namespace internal
// State is passed to a running Benchmark and contains state for the
// benchmark to use.
-class State {
+class BENCHMARK_EXPORT State {
public:
struct StateIterator;
friend struct StateIterator;
@@ -494,8 +754,8 @@ class State {
// }
bool KeepRunningBatch(IterationCount n);
- // REQUIRES: timer is running and 'SkipWithError(...)' has not been called
- // by the current thread.
+ // REQUIRES: timer is running and 'SkipWithMessage(...)' or
+ // 'SkipWithError(...)' has not been called by the current thread.
// Stop the benchmark timer. If not called, the timer will be
// automatically stopped after the last iteration of the benchmark loop.
//
@@ -510,8 +770,8 @@ class State {
// within each benchmark iteration, if possible.
void PauseTiming();
- // REQUIRES: timer is not running and 'SkipWithError(...)' has not been called
- // by the current thread.
+ // REQUIRES: timer is not running and 'SkipWithMessage(...)' or
+ // 'SkipWithError(...)' has not been called by the current thread.
// Start the benchmark timer. The timer is NOT running on entrance to the
// benchmark function. It begins running after control flow enters the
// benchmark loop.
@@ -521,8 +781,30 @@ class State {
// within each benchmark iteration, if possible.
void ResumeTiming();
- // REQUIRES: 'SkipWithError(...)' has not been called previously by the
- // current thread.
+ // REQUIRES: 'SkipWithMessage(...)' or 'SkipWithError(...)' has not been
+ // called previously by the current thread.
+ // Report the benchmark as resulting in being skipped with the specified
+ // 'msg'.
+ // After this call the user may explicitly 'return' from the benchmark.
+ //
+ // If the ranged-for style of benchmark loop is used, the user must explicitly
+ // break from the loop, otherwise all future iterations will be run.
+ // If the 'KeepRunning()' loop is used the current thread will automatically
+ // exit the loop at the end of the current iteration.
+ //
+ // For threaded benchmarks only the current thread stops executing and future
+ // calls to `KeepRunning()` will block until all threads have completed
+ // the `KeepRunning()` loop. If multiple threads report being skipped only the
+ // first skip message is used.
+ //
+ // NOTE: Calling 'SkipWithMessage(...)' does not cause the benchmark to exit
+ // the current scope immediately. If the function is called from within
+ // the 'KeepRunning()' loop the current iteration will finish. It is the users
+ // responsibility to exit the scope as needed.
+ void SkipWithMessage(const std::string& msg);
+
+ // REQUIRES: 'SkipWithMessage(...)' or 'SkipWithError(...)' has not been
+ // called previously by the current thread.
// Report the benchmark as resulting in an error with the specified 'msg'.
// After this call the user may explicitly 'return' from the benchmark.
//
@@ -540,10 +822,13 @@ class State {
// the current scope immediately. If the function is called from within
// the 'KeepRunning()' loop the current iteration will finish. It is the users
// responsibility to exit the scope as needed.
- void SkipWithError(const char* msg);
+ void SkipWithError(const std::string& msg);
+
+ // Returns true if 'SkipWithMessage(...)' or 'SkipWithError(...)' was called.
+ bool skipped() const { return internal::NotSkipped != skipped_; }
// Returns true if an error has been reported with 'SkipWithError(...)'.
- bool error_occurred() const { return error_occurred_; }
+ bool error_occurred() const { return internal::SkippedWithError == skipped_; }
// REQUIRES: called exactly once per iteration of the benchmarking loop.
// Set the manually measured time for this benchmark iteration, which
@@ -614,11 +899,7 @@ class State {
// BM_Compress 50 50 14115038 compress:27.3%
//
// REQUIRES: a benchmark has exited its benchmarking loop.
- void SetLabel(const char* label);
-
- void BENCHMARK_ALWAYS_INLINE SetLabel(const std::string& str) {
- this->SetLabel(str.c_str());
- }
+ void SetLabel(const std::string& label);
// Range arguments for this run. CHECKs if the argument has been set.
BENCHMARK_ALWAYS_INLINE
@@ -633,6 +914,14 @@ class State {
BENCHMARK_DEPRECATED_MSG("use 'range(1)' instead")
int64_t range_y() const { return range(1); }
+ // Number of threads concurrently executing the benchmark.
+ BENCHMARK_ALWAYS_INLINE
+ int threads() const { return threads_; }
+
+ // Index of the executing thread. Values from [0, threads).
+ BENCHMARK_ALWAYS_INLINE
+ int thread_index() const { return thread_index_; }
+
BENCHMARK_ALWAYS_INLINE
IterationCount iterations() const {
if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
@@ -641,8 +930,11 @@ class State {
return max_iterations - total_iterations_ + batch_leftover_;
}
- private
- : // items we expect on the first cache line (ie 64 bytes of the struct)
+ BENCHMARK_ALWAYS_INLINE
+ std::string name() const { return name_; }
+
+ private:
+ // items we expect on the first cache line (ie 64 bytes of the struct)
// When total_iterations_ is 0, KeepRunning() and friends will return false.
// May be larger than max_iterations.
IterationCount total_iterations_;
@@ -658,9 +950,9 @@ class State {
private:
bool started_;
bool finished_;
- bool error_occurred_;
+ internal::Skipped skipped_;
- private: // items we don't need on the first cache line
+ // items we don't need on the first cache line
std::vector<int64_t> range_;
int64_t complexity_n_;
@@ -668,25 +960,28 @@ class State {
public:
// Container for user-defined counters.
UserCounters counters;
- // Index of the executing thread. Values from [0, threads).
- const int thread_index;
- // Number of threads concurrently executing the benchmark.
- const int threads;
private:
- State(IterationCount max_iters, const std::vector<int64_t>& ranges,
- int thread_i, int n_threads, internal::ThreadTimer* timer,
- internal::ThreadManager* manager);
+ State(std::string name, IterationCount max_iters,
+ const std::vector<int64_t>& ranges, int thread_i, int n_threads,
+ internal::ThreadTimer* timer, internal::ThreadManager* manager,
+ internal::PerfCountersMeasurement* perf_counters_measurement);
void StartKeepRunning();
// Implementation of KeepRunning() and KeepRunningBatch().
// is_batch must be true unless n is 1.
bool KeepRunningInternal(IterationCount n, bool is_batch);
void FinishKeepRunning();
- internal::ThreadTimer* timer_;
- internal::ThreadManager* manager_;
- friend struct internal::BenchmarkInstance;
+ const std::string name_;
+ const int thread_index_;
+ const int threads_;
+
+ internal::ThreadTimer* const timer_;
+ internal::ThreadManager* const manager_;
+ internal::PerfCountersMeasurement* const perf_counters_measurement_;
+
+ friend class internal::BenchmarkInstance;
};
inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() {
@@ -710,7 +1005,7 @@ inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningInternal(IterationCount n,
}
if (!started_) {
StartKeepRunning();
- if (!error_occurred_ && total_iterations_ >= n) {
+ if (!skipped() && total_iterations_ >= n) {
total_iterations_ -= n;
return true;
}
@@ -740,7 +1035,7 @@ struct State::StateIterator {
BENCHMARK_ALWAYS_INLINE
explicit StateIterator(State* st)
- : cached_(st->error_occurred_ ? 0 : st->max_iterations), parent_(st) {}
+ : cached_(st->skipped() ? 0 : st->max_iterations), parent_(st) {}
public:
BENCHMARK_ALWAYS_INLINE
@@ -783,13 +1078,16 @@ typedef void(Function)(State&);
// be called on this object to change the properties of the benchmark.
// Each method returns "this" so that multiple method calls can
// chained into one expression.
-class Benchmark {
+class BENCHMARK_EXPORT Benchmark {
public:
virtual ~Benchmark();
// Note: the following methods all return "this" so that multiple
// method calls can be chained together in one expression.
+ // Specify the name of the benchmark
+ Benchmark* Name(const std::string& name);
+
// Run this benchmark once with "x" as the extra argument passed
// to the function.
// REQUIRES: The function passed to the constructor must accept an arg1.
@@ -850,6 +1148,23 @@ class Benchmark {
return Ranges(ranges);
}
+ // Have "setup" and/or "teardown" invoked once for every benchmark run.
+ // If the benchmark is multi-threaded (will run in k threads concurrently),
+ // the setup callback will be be invoked exactly once (not k times) before
+ // each run with k threads. Time allowing (e.g. for a short benchmark), there
+ // may be multiple such runs per benchmark, each run with its own
+ // "setup"/"teardown".
+ //
+ // If the benchmark uses different size groups of threads (e.g. via
+ // ThreadRange), the above will be true for each size group.
+ //
+ // The callback will be passed a State object, which includes the number
+ // of threads, thread-index, benchmark arguments, etc.
+ //
+ // The callback must not be NULL or self-deleting.
+ Benchmark* Setup(void (*setup)(const benchmark::State&));
+ Benchmark* Teardown(void (*teardown)(const benchmark::State&));
+
// Pass this benchmark object to *func, which can customize
// the benchmark by calling various methods like Arg, Args,
// Threads, etc.
@@ -864,12 +1179,19 @@ class Benchmark {
// REQUIRES: `t > 0` and `Iterations` has not been called on this benchmark.
Benchmark* MinTime(double t);
+ // Set the minimum amount of time to run the benchmark before taking runtimes
+ // of this benchmark into account. This
+ // option overrides the `benchmark_min_warmup_time` flag.
+ // REQUIRES: `t >= 0` and `Iterations` has not been called on this benchmark.
+ Benchmark* MinWarmUpTime(double t);
+
// Specify the amount of iterations that should be run by this benchmark.
+ // This option overrides the `benchmark_min_time` flag.
// REQUIRES: 'n > 0' and `MinTime` has not been called on this benchmark.
//
// NOTE: This function should only be used when *exact* iteration control is
// needed and never to control or limit how long a benchmark runs, where
- // `--benchmark_min_time=N` or `MinTime(...)` should be used instead.
+ // `--benchmark_min_time=<N>s` or `MinTime(...)` should be used instead.
Benchmark* Iterations(IterationCount n);
// Specify the amount of times to repeat this benchmark. This option overrides
@@ -889,7 +1211,7 @@ class Benchmark {
// By default, the CPU time is measured only for the main thread, which may
// be unrepresentative if the benchmark uses threads internally. If called,
// the total CPU time spent by all the threads will be measured instead.
- // By default, the only the main thread CPU time will be measured.
+ // By default, only the main thread CPU time will be measured.
Benchmark* MeasureProcessCPUTime();
// If a particular benchmark should use the Wall clock instead of the CPU time
@@ -918,7 +1240,9 @@ class Benchmark {
Benchmark* Complexity(BigOFunc* complexity);
// Add this statistics to be computed over all the values of benchmark run
- Benchmark* ComputeStatistics(std::string name, StatisticsFunc* statistics);
+ Benchmark* ComputeStatistics(const std::string& name,
+ StatisticsFunc* statistics,
+ StatisticUnit unit = kTime);
// Support for running multiple copies of the same benchmark concurrently
// in multiple threads. This may be useful when measuring the scaling
@@ -952,23 +1276,32 @@ class Benchmark {
virtual void Run(State& state) = 0;
+ TimeUnit GetTimeUnit() const;
+
protected:
- explicit Benchmark(const char* name);
- Benchmark(Benchmark const&);
- void SetName(const char* name);
+ explicit Benchmark(const std::string& name);
+ void SetName(const std::string& name);
+ public:
+ const char* GetName() const;
int ArgsCnt() const;
+ const char* GetArgName(int arg) const;
private:
friend class BenchmarkFamilies;
+ friend class BenchmarkInstance;
std::string name_;
AggregationReportMode aggregation_report_mode_;
std::vector<std::string> arg_names_; // Args for all benchmark runs
std::vector<std::vector<int64_t> > args_; // Args for all benchmark runs
+
TimeUnit time_unit_;
+ bool use_default_time_unit_;
+
int range_multiplier_;
double min_time_;
+ double min_warmup_time_;
IterationCount iterations_;
int repetitions_;
bool measure_process_cpu_time_;
@@ -979,7 +1312,21 @@ class Benchmark {
std::vector<Statistics> statistics_;
std::vector<int> thread_counts_;
- Benchmark& operator=(Benchmark const&);
+ typedef void (*callback_function)(const benchmark::State&);
+ callback_function setup_;
+ callback_function teardown_;
+
+ Benchmark(Benchmark const&)
+#if defined(BENCHMARK_HAS_CXX11)
+ = delete
+#endif
+ ;
+
+ Benchmark& operator=(Benchmark const&)
+#if defined(BENCHMARK_HAS_CXX11)
+ = delete
+#endif
+ ;
};
} // namespace internal
@@ -988,27 +1335,27 @@ class Benchmark {
// the specified functor 'fn'.
//
// RETURNS: A pointer to the registered benchmark.
-internal::Benchmark* RegisterBenchmark(const char* name,
+internal::Benchmark* RegisterBenchmark(const std::string& name,
internal::Function* fn);
#if defined(BENCHMARK_HAS_CXX11)
template <class Lambda>
-internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn);
+internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn);
#endif
// Remove all registered benchmarks. All pointers to previously registered
// benchmarks are invalidated.
-void ClearRegisteredBenchmarks();
+BENCHMARK_EXPORT void ClearRegisteredBenchmarks();
namespace internal {
// The class used to hold all Benchmarks created from static function.
// (ie those created using the BENCHMARK(...) macros.
-class FunctionBenchmark : public Benchmark {
+class BENCHMARK_EXPORT FunctionBenchmark : public Benchmark {
public:
- FunctionBenchmark(const char* name, Function* func)
+ FunctionBenchmark(const std::string& name, Function* func)
: Benchmark(name), func_(func) {}
- virtual void Run(State& st);
+ void Run(State& st) BENCHMARK_OVERRIDE;
private:
Function* func_;
@@ -1018,36 +1365,38 @@ class FunctionBenchmark : public Benchmark {
template <class Lambda>
class LambdaBenchmark : public Benchmark {
public:
- virtual void Run(State& st) { lambda_(st); }
+ void Run(State& st) BENCHMARK_OVERRIDE { lambda_(st); }
private:
template <class OLambda>
- LambdaBenchmark(const char* name, OLambda&& lam)
+ LambdaBenchmark(const std::string& name, OLambda&& lam)
: Benchmark(name), lambda_(std::forward<OLambda>(lam)) {}
LambdaBenchmark(LambdaBenchmark const&) = delete;
- private:
- template <class Lam>
- friend Benchmark* ::benchmark::RegisterBenchmark(const char*, Lam&&);
+ template <class Lam> // NOLINTNEXTLINE(readability-redundant-declaration)
+ friend Benchmark* ::benchmark::RegisterBenchmark(const std::string&, Lam&&);
Lambda lambda_;
};
#endif
-
} // namespace internal
-inline internal::Benchmark* RegisterBenchmark(const char* name,
+inline internal::Benchmark* RegisterBenchmark(const std::string& name,
internal::Function* fn) {
+ // FIXME: this should be a `std::make_unique<>()` but we don't have C++14.
+ // codechecker_intentional [cplusplus.NewDeleteLeaks]
return internal::RegisterBenchmarkInternal(
::new internal::FunctionBenchmark(name, fn));
}
#ifdef BENCHMARK_HAS_CXX11
template <class Lambda>
-internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn) {
+internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn) {
using BenchType =
internal::LambdaBenchmark<typename std::decay<Lambda>::type>;
+ // FIXME: this should be a `std::make_unique<>()` but we don't have C++14.
+ // codechecker_intentional [cplusplus.NewDeleteLeaks]
return internal::RegisterBenchmarkInternal(
::new BenchType(name, std::forward<Lambda>(fn)));
}
@@ -1056,7 +1405,7 @@ internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn) {
#if defined(BENCHMARK_HAS_CXX11) && \
(!defined(BENCHMARK_GCC_VERSION) || BENCHMARK_GCC_VERSION >= 409)
template <class Lambda, class... Args>
-internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn,
+internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn,
Args&&... args) {
return benchmark::RegisterBenchmark(
name, [=](benchmark::State& st) { fn(st, args...); });
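Because the registration functions now take std::string, benchmark names can be composed at runtime. A sketch of programmatic registration (sizes and the lambda body are illustrative):

    #include <string>
    #include <vector>
    #include <benchmark/benchmark.h>

    int main(int argc, char** argv) {
      for (int len : {8, 64, 512}) {
        benchmark::RegisterBenchmark(
            "BM_Fill/" + std::to_string(len), [len](benchmark::State& st) {
              std::vector<int> v(len);
              for (auto _ : st) benchmark::DoNotOptimize(v.data());
            });
      }
      benchmark::Initialize(&argc, argv);
      benchmark::RunSpecifiedBenchmarks();
      benchmark::Shutdown();
      return 0;
    }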
@@ -1070,7 +1419,7 @@ class Fixture : public internal::Benchmark {
public:
Fixture() : internal::Benchmark("") {}
- virtual void Run(State& st) {
+ void Run(State& st) BENCHMARK_OVERRIDE {
this->SetUp(st);
this->BenchmarkCase(st);
this->TearDown(st);
@@ -1086,7 +1435,6 @@ class Fixture : public internal::Benchmark {
protected:
virtual void BenchmarkCase(State&) = 0;
};
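A minimal fixture-based benchmark for reference, assuming the usual BENCHMARK_F registration macro defined elsewhere in this header (names are placeholders):

    class MyFixture : public benchmark::Fixture {};

    BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
      for (auto _ : st) {
        // code under test
      }
    }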
-
} // namespace benchmark
// ------------------------------------------------------
@@ -1102,22 +1450,37 @@ class Fixture : public internal::Benchmark {
#endif
// Helpers for generating unique variable names
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_PRIVATE_NAME(...) \
+ BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, \
+ __VA_ARGS__)
+#else
#define BENCHMARK_PRIVATE_NAME(n) \
- BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
+ BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
+#endif // BENCHMARK_HAS_CXX11
+
#define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
#define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
// Helper for concatenation with macro name expansion
#define BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method) \
- BaseClass##_##Method##_Benchmark
+ BaseClass##_##Method##_Benchmark
#define BENCHMARK_PRIVATE_DECLARE(n) \
static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \
BENCHMARK_UNUSED
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK(...) \
+ BENCHMARK_PRIVATE_DECLARE(_benchmark_) = \
+ (::benchmark::internal::RegisterBenchmarkInternal( \
+ new ::benchmark::internal::FunctionBenchmark(#__VA_ARGS__, \
+ __VA_ARGS__)))
+#else
#define BENCHMARK(n) \
BENCHMARK_PRIVATE_DECLARE(n) = \
(::benchmark::internal::RegisterBenchmarkInternal( \
new ::benchmark::internal::FunctionBenchmark(#n, n)))
+#endif // BENCHMARK_HAS_CXX11
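The variadic C++11 form exists mainly so that names containing commas, such as multi-parameter template instantiations, pass through the macro unharmed. A sketch:

    #include <array>
    #include <benchmark/benchmark.h>

    template <class T, int N>
    static void BM_FillArray(benchmark::State& state) {
      std::array<T, N> a{};
      for (auto _ : state) benchmark::DoNotOptimize(a.data());
    }
    BENCHMARK(BM_FillArray<int, 16>);  // the comma is absorbed by __VA_ARGS__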
// Old-style macros
#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
@@ -1178,49 +1541,49 @@ class Fixture : public internal::Benchmark {
#define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
#endif
-#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
- class BaseClass##_##Method##_Benchmark : public BaseClass { \
- public: \
- BaseClass##_##Method##_Benchmark() : BaseClass() { \
- this->SetName(#BaseClass "/" #Method); \
- } \
- \
- protected: \
- virtual void BenchmarkCase(::benchmark::State&); \
+#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
+ class BaseClass##_##Method##_Benchmark : public BaseClass { \
+ public: \
+ BaseClass##_##Method##_Benchmark() { \
+ this->SetName(#BaseClass "/" #Method); \
+ } \
+ \
+ protected: \
+ void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
};
#define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
class BaseClass##_##Method##_Benchmark : public BaseClass<a> { \
public: \
- BaseClass##_##Method##_Benchmark() : BaseClass<a>() { \
+ BaseClass##_##Method##_Benchmark() { \
this->SetName(#BaseClass "<" #a ">/" #Method); \
} \
\
protected: \
- virtual void BenchmarkCase(::benchmark::State&); \
+ void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
};
#define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
class BaseClass##_##Method##_Benchmark : public BaseClass<a, b> { \
public: \
- BaseClass##_##Method##_Benchmark() : BaseClass<a, b>() { \
+ BaseClass##_##Method##_Benchmark() { \
this->SetName(#BaseClass "<" #a "," #b ">/" #Method); \
} \
\
protected: \
- virtual void BenchmarkCase(::benchmark::State&); \
+ void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
};
#ifdef BENCHMARK_HAS_CXX11
#define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, ...) \
class BaseClass##_##Method##_Benchmark : public BaseClass<__VA_ARGS__> { \
public: \
- BaseClass##_##Method##_Benchmark() : BaseClass<__VA_ARGS__>() { \
+ BaseClass##_##Method##_Benchmark() { \
this->SetName(#BaseClass "<" #__VA_ARGS__ ">/" #Method); \
} \
\
protected: \
- virtual void BenchmarkCase(::benchmark::State&); \
+ void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
};
#else
#define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(n, a) \
@@ -1282,11 +1645,20 @@ class Fixture : public internal::Benchmark {
#endif
// Helper macro to create a main routine in a test that runs the benchmarks
+// Note the workaround for Hexagon simulator passing argc != 0, argv = NULL.
#define BENCHMARK_MAIN() \
int main(int argc, char** argv) { \
+ char arg0_default[] = "benchmark"; \
+ char* args_default = arg0_default; \
+ if (!argv) { \
+ argc = 1; \
+ argv = &args_default; \
+ } \
::benchmark::Initialize(&argc, argv); \
if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \
::benchmark::RunSpecifiedBenchmarks(); \
+ ::benchmark::Shutdown(); \
+ return 0; \
} \
int main(int, char**)
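Projects that need their own main() can mirror the macro body by hand; a sketch that omits the Hexagon argv workaround:

    #include <benchmark/benchmark.h>

    int main(int argc, char** argv) {
      benchmark::Initialize(&argc, argv);
      if (benchmark::ReportUnrecognizedArguments(argc, argv)) return 1;
      benchmark::RunSpecifiedBenchmarks();
      benchmark::Shutdown();  // releases the global context allocated by the library
      return 0;
    }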
@@ -1295,7 +1667,7 @@ class Fixture : public internal::Benchmark {
namespace benchmark {
-struct CPUInfo {
+struct BENCHMARK_EXPORT CPUInfo {
struct CacheInfo {
std::string type;
int level;
@@ -1303,16 +1675,12 @@ struct CPUInfo {
int num_sharing;
};
- enum Scaling {
- UNKNOWN,
- ENABLED,
- DISABLED
- };
+ enum Scaling { UNKNOWN, ENABLED, DISABLED };
int num_cpus;
+ Scaling scaling;
double cycles_per_second;
std::vector<CacheInfo> caches;
- Scaling scaling;
std::vector<double> load_avg;
static const CPUInfo& Get();
@@ -1323,7 +1691,7 @@ struct CPUInfo {
};
// Adding Struct for System Information
-struct SystemInfo {
+struct BENCHMARK_EXPORT SystemInfo {
std::string name;
static const SystemInfo& Get();
@@ -1335,10 +1703,11 @@ struct SystemInfo {
// BenchmarkName contains the components of the Benchmark's name
// which allows individual fields to be modified or cleared before
// building the final name using 'str()'.
-struct BenchmarkName {
+struct BENCHMARK_EXPORT BenchmarkName {
std::string function_name;
std::string args;
std::string min_time;
+ std::string min_warmup_time;
std::string iterations;
std::string repetitions;
std::string time_type;
@@ -1354,7 +1723,7 @@ struct BenchmarkName {
// can control the destination of the reports by calling
// RunSpecifiedBenchmarks and passing it a custom reporter object.
// The reporter object must implement the following interface.
-class BenchmarkReporter {
+class BENCHMARK_EXPORT BenchmarkReporter {
public:
struct Context {
CPUInfo const& cpu_info;
@@ -1365,16 +1734,17 @@ class BenchmarkReporter {
Context();
};
- struct Run {
+ struct BENCHMARK_EXPORT Run {
static const int64_t no_repetition_index = -1;
enum RunType { RT_Iteration, RT_Aggregate };
Run()
: run_type(RT_Iteration),
- error_occurred(false),
+ aggregate_unit(kTime),
+ skipped(internal::NotSkipped),
iterations(1),
threads(1),
- time_unit(kNanosecond),
+ time_unit(GetDefaultTimeUnit()),
real_accumulated_time(0),
cpu_accumulated_time(0),
max_heapbytes_used(0),
@@ -1383,18 +1753,19 @@ class BenchmarkReporter {
complexity_n(0),
report_big_o(false),
report_rms(false),
- counters(),
- has_memory_result(false),
- allocs_per_iter(0.0),
- max_bytes_used(0) {}
+ memory_result(NULL),
+ allocs_per_iter(0.0) {}
std::string benchmark_name() const;
BenchmarkName run_name;
+ int64_t family_index;
+ int64_t per_family_instance_index;
RunType run_type;
std::string aggregate_name;
+ StatisticUnit aggregate_unit;
std::string report_label; // Empty if not set by benchmark.
- bool error_occurred;
- std::string error_message;
+ internal::Skipped skipped;
+ std::string skip_message;
IterationCount iterations;
int64_t threads;
@@ -1434,9 +1805,21 @@ class BenchmarkReporter {
UserCounters counters;
// Memory metrics.
- bool has_memory_result;
+ const MemoryManager::Result* memory_result;
double allocs_per_iter;
- int64_t max_bytes_used;
+ };
+
+ struct PerFamilyRunReports {
+ PerFamilyRunReports() : num_runs_total(0), num_runs_done(0) {}
+
+ // How many runs will all instances of this benchmark perform?
+ int num_runs_total;
+
+ // How many runs have happened already?
+ int num_runs_done;
+
+  // The reports about (non-erroneous!) runs of this family.
+ std::vector<BenchmarkReporter::Run> Runs;
};
// Construct a BenchmarkReporter with the output stream set to 'std::cout'
@@ -1452,6 +1835,12 @@ class BenchmarkReporter {
virtual bool ReportContext(const Context& context) = 0;
// Called once for each group of benchmark runs, gives information about
+ // the configurations of the runs.
+ virtual void ReportRunsConfig(double /*min_time*/,
+ bool /*has_explicit_iters*/,
+ IterationCount /*iters*/) {}
+
+ // Called once for each group of benchmark runs, gives information about
// cpu-time and heap memory usage during the benchmark run. If the group
// of runs contained more than two entries then 'report' contains additional
// elements representing the mean and standard deviation of those runs.
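Only ReportContext() and ReportRuns() are pure virtual; ReportRunsConfig() and Finalize() have empty defaults. A minimal custom reporter sketch (the class name is a placeholder):

    class QuietReporter : public benchmark::BenchmarkReporter {
     public:
      bool ReportContext(const Context&) override { return true; }
      void ReportRuns(const std::vector<Run>&) override {}
    };
    // Passed as RunSpecifiedBenchmarks(&reporter) to suppress console output.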
@@ -1496,7 +1885,7 @@ class BenchmarkReporter {
// Simple reporter that outputs benchmark data to the console. This is the
// default reporter used by RunSpecifiedBenchmarks().
-class ConsoleReporter : public BenchmarkReporter {
+class BENCHMARK_EXPORT ConsoleReporter : public BenchmarkReporter {
public:
enum OutputOptions {
OO_None = 0,
@@ -1506,13 +1895,10 @@ class ConsoleReporter : public BenchmarkReporter {
OO_Defaults = OO_ColorTabular
};
explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults)
- : output_options_(opts_),
- name_field_width_(0),
- prev_counters_(),
- printed_header_(false) {}
+ : output_options_(opts_), name_field_width_(0), printed_header_(false) {}
- virtual bool ReportContext(const Context& context);
- virtual void ReportRuns(const std::vector<Run>& reports);
+ bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
+ void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
protected:
virtual void PrintRunData(const Run& report);
@@ -1524,12 +1910,12 @@ class ConsoleReporter : public BenchmarkReporter {
bool printed_header_;
};
-class JSONReporter : public BenchmarkReporter {
+class BENCHMARK_EXPORT JSONReporter : public BenchmarkReporter {
public:
JSONReporter() : first_report_(true) {}
- virtual bool ReportContext(const Context& context);
- virtual void ReportRuns(const std::vector<Run>& reports);
- virtual void Finalize();
+ bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
+ void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
+ void Finalize() BENCHMARK_OVERRIDE;
private:
void PrintRunData(const Run& report);
@@ -1537,13 +1923,13 @@ class JSONReporter : public BenchmarkReporter {
bool first_report_;
};
-class BENCHMARK_DEPRECATED_MSG(
+class BENCHMARK_EXPORT BENCHMARK_DEPRECATED_MSG(
"The CSV Reporter will be removed in a future release") CSVReporter
: public BenchmarkReporter {
public:
CSVReporter() : printed_header_(false) {}
- virtual bool ReportContext(const Context& context);
- virtual void ReportRuns(const std::vector<Run>& reports);
+ bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
+ void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
private:
void PrintRunData(const Run& report);
@@ -1552,29 +1938,6 @@ class BENCHMARK_DEPRECATED_MSG(
std::set<std::string> user_counter_names_;
};
-// If a MemoryManager is registered, it can be used to collect and report
-// allocation metrics for a run of the benchmark.
-class MemoryManager {
- public:
- struct Result {
- Result() : num_allocs(0), max_bytes_used(0) {}
-
- // The number of allocations made in total between Start and Stop.
- int64_t num_allocs;
-
- // The peak memory use between Start and Stop.
- int64_t max_bytes_used;
- };
-
- virtual ~MemoryManager() {}
-
- // Implement this to start recording allocation information.
- virtual void Start() = 0;
-
- // Implement this to stop recording and fill out the given Result structure.
- virtual void Stop(Result* result) = 0;
-};
-
inline const char* GetTimeUnitString(TimeUnit unit) {
switch (unit) {
case kSecond:
@@ -1603,6 +1966,26 @@ inline double GetTimeUnitMultiplier(TimeUnit unit) {
BENCHMARK_UNREACHABLE();
}
+// Creates a list of integer values for the given range and multiplier.
+// This can be used together with ArgsProduct() to allow multiple ranges
+// with different multipliers.
+// Example:
+// ArgsProduct({
+// CreateRange(0, 1024, /*multi=*/32),
+// CreateRange(0, 100, /*multi=*/4),
+// CreateDenseRange(0, 4, /*step=*/1),
+// });
+BENCHMARK_EXPORT
+std::vector<int64_t> CreateRange(int64_t lo, int64_t hi, int multi);
+
+// Creates a list of integer values for the given range and step.
+BENCHMARK_EXPORT
+std::vector<int64_t> CreateDenseRange(int64_t start, int64_t limit, int step);
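A hedged usage sketch combining these helpers with ArgsProduct(); BM_SetInsert is a placeholder:

    static void BM_SetInsert(benchmark::State& state) {  // placeholder benchmark
      for (auto _ : state)
        benchmark::DoNotOptimize(state.range(0) + state.range(1));
    }
    BENCHMARK(BM_SetInsert)
        ->ArgsProduct({benchmark::CreateRange(8, 1024, /*multi=*/2),
                       benchmark::CreateDenseRange(1, 4, /*step=*/1)});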
+
} // namespace benchmark
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
#endif // BENCHMARK_BENCHMARK_H_
diff --git a/include/benchmark/export.h b/include/benchmark/export.h
new file mode 100644
index 0000000..f96f859
--- /dev/null
+++ b/include/benchmark/export.h
@@ -0,0 +1,47 @@
+#ifndef BENCHMARK_EXPORT_H
+#define BENCHMARK_EXPORT_H
+
+#if defined(_WIN32)
+#define EXPORT_ATTR __declspec(dllexport)
+#define IMPORT_ATTR __declspec(dllimport)
+#define NO_EXPORT_ATTR
+#define DEPRECATE_ATTR __declspec(deprecated)
+#else // _WIN32
+#define EXPORT_ATTR __attribute__((visibility("default")))
+#define IMPORT_ATTR __attribute__((visibility("default")))
+#define NO_EXPORT_ATTR __attribute__((visibility("hidden")))
+#define DEPRECATE_ATTR __attribute__((__deprecated__))
+#endif // _WIN32
+
+#ifdef BENCHMARK_STATIC_DEFINE
+#define BENCHMARK_EXPORT
+#define BENCHMARK_NO_EXPORT
+#else // BENCHMARK_STATIC_DEFINE
+#ifndef BENCHMARK_EXPORT
+#ifdef benchmark_EXPORTS
+/* We are building this library */
+#define BENCHMARK_EXPORT EXPORT_ATTR
+#else // benchmark_EXPORTS
+/* We are using this library */
+#define BENCHMARK_EXPORT IMPORT_ATTR
+#endif // benchmark_EXPORTS
+#endif // !BENCHMARK_EXPORT
+
+#ifndef BENCHMARK_NO_EXPORT
+#define BENCHMARK_NO_EXPORT NO_EXPORT_ATTR
+#endif // !BENCHMARK_NO_EXPORT
+#endif // BENCHMARK_STATIC_DEFINE
+
+#ifndef BENCHMARK_DEPRECATED
+#define BENCHMARK_DEPRECATED DEPRECATE_ATTR
+#endif // BENCHMARK_DEPRECATED
+
+#ifndef BENCHMARK_DEPRECATED_EXPORT
+#define BENCHMARK_DEPRECATED_EXPORT BENCHMARK_EXPORT BENCHMARK_DEPRECATED
+#endif // BENCHMARK_DEPRECATED_EXPORT
+
+#ifndef BENCHMARK_DEPRECATED_NO_EXPORT
+#define BENCHMARK_DEPRECATED_NO_EXPORT BENCHMARK_NO_EXPORT BENCHMARK_DEPRECATED
+#endif // BENCHMARK_DEPRECATED_NO_EXPORT
+
+#endif /* BENCHMARK_EXPORT_H */
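Consumers that link the static library without going through the exported CMake target (which injects the definition) are expected to define BENCHMARK_STATIC_DEFINE themselves, otherwise BENCHMARK_EXPORT expands to __declspec(dllimport) on Windows. A sketch:

    // Only needed when the build system does not already define it.
    #define BENCHMARK_STATIC_DEFINE
    #include <benchmark/benchmark.h>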
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..fe8770b
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,50 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "google_benchmark"
+description = "A library to benchmark code snippets."
+requires-python = ">=3.8"
+license = {file = "LICENSE"}
+keywords = ["benchmark"]
+
+authors = [
+ {name = "Google", email = "benchmark-discuss@googlegroups.com"},
+]
+
+classifiers = [
+ "Development Status :: 4 - Beta",
+ "Intended Audience :: Developers",
+ "Intended Audience :: Science/Research",
+ "License :: OSI Approved :: Apache Software License",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Topic :: Software Development :: Testing",
+ "Topic :: System :: Benchmark",
+]
+
+dynamic = ["readme", "version"]
+
+dependencies = [
+ "absl-py>=0.7.1",
+]
+
+[project.urls]
+Homepage = "https://github.com/google/benchmark"
+Documentation = "https://github.com/google/benchmark/tree/main/docs"
+Repository = "https://github.com/google/benchmark.git"
+Discord = "https://discord.gg/cz7UX7wKC2"
+
+[tool.setuptools]
+package-dir = {"" = "bindings/python"}
+zip-safe = false
+
+[tool.setuptools.packages.find]
+where = ["bindings/python"]
+
+[tool.setuptools.dynamic]
+version = { attr = "google_benchmark.__version__" }
+readme = { file = "README.md", content-type = "text/markdown" }
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 85e8986..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-numpy == 1.19.4
-scipy == 1.5.4
diff --git a/setup.py b/setup.py
index 5cdab10..b02a6a7 100644
--- a/setup.py
+++ b/setup.py
@@ -1,55 +1,50 @@
+import contextlib
import os
-import posixpath
-import re
+import platform
import shutil
-import sys
+import sysconfig
+from pathlib import Path
-from distutils import sysconfig
import setuptools
from setuptools.command import build_ext
-HERE = os.path.dirname(os.path.abspath(__file__))
+PYTHON_INCLUDE_PATH_PLACEHOLDER = "<PYTHON_INCLUDE_PATH>"
+IS_WINDOWS = platform.system() == "Windows"
+IS_MAC = platform.system() == "Darwin"
-IS_WINDOWS = sys.platform.startswith("win")
-
-def _get_version():
- """Parse the version string from __init__.py."""
- with open(
- os.path.join(HERE, "bindings", "python", "google_benchmark", "__init__.py")
- ) as init_file:
+@contextlib.contextmanager
+def temp_fill_include_path(fp: str):
+ """Temporarily set the Python include path in a file."""
+ with open(fp, "r+") as f:
try:
- version_line = next(
- line for line in init_file if line.startswith("__version__")
+ content = f.read()
+ replaced = content.replace(
+ PYTHON_INCLUDE_PATH_PLACEHOLDER,
+ Path(sysconfig.get_paths()['include']).as_posix(),
)
- except StopIteration:
- raise ValueError("__version__ not defined in __init__.py")
- else:
- namespace = {}
- exec(version_line, namespace) # pylint: disable=exec-used
- return namespace["__version__"]
-
-
-def _parse_requirements(path):
- with open(os.path.join(HERE, path)) as requirements:
- return [
- line.rstrip()
- for line in requirements
- if not (line.isspace() or line.startswith("#"))
- ]
+ f.seek(0)
+ f.write(replaced)
+ f.truncate()
+ yield
+ finally:
+ # revert to the original content after exit
+ f.seek(0)
+ f.write(content)
+ f.truncate()
class BazelExtension(setuptools.Extension):
"""A C/C++ extension that is defined as a Bazel BUILD target."""
- def __init__(self, name, bazel_target):
+ def __init__(self, name: str, bazel_target: str):
+ super().__init__(name=name, sources=[])
+
self.bazel_target = bazel_target
- self.relpath, self.target_name = posixpath.relpath(bazel_target, "//").split(
- ":"
- )
- setuptools.Extension.__init__(self, name, sources=[])
+ stripped_target = bazel_target.split("//")[-1]
+ self.relpath, self.target_name = stripped_target.split(":")
class BuildBazelExtension(build_ext.build_ext):
@@ -60,81 +55,59 @@ class BuildBazelExtension(build_ext.build_ext):
self.bazel_build(ext)
build_ext.build_ext.run(self)
- def bazel_build(self, ext):
+ def bazel_build(self, ext: BazelExtension):
"""Runs the bazel build to create the package."""
- with open("WORKSPACE", "r") as workspace:
- workspace_contents = workspace.read()
-
- with open("WORKSPACE", "w") as workspace:
- workspace.write(
- re.sub(
- r'(?<=path = ").*(?=", # May be overwritten by setup\.py\.)',
- sysconfig.get_python_inc().replace(os.path.sep, posixpath.sep),
- workspace_contents,
- )
- )
-
- if not os.path.exists(self.build_temp):
- os.makedirs(self.build_temp)
-
- bazel_argv = [
- "bazel",
- "build",
- ext.bazel_target,
- "--symlink_prefix=" + os.path.join(self.build_temp, "bazel-"),
- "--compilation_mode=" + ("dbg" if self.debug else "opt"),
- ]
-
- if IS_WINDOWS:
- # Link with python*.lib.
- for library_dir in self.library_dirs:
- bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
-
- self.spawn(bazel_argv)
-
- shared_lib_suffix = '.dll' if IS_WINDOWS else '.so'
- ext_bazel_bin_path = os.path.join(
- self.build_temp, 'bazel-bin',
- ext.relpath, ext.target_name + shared_lib_suffix)
-
- ext_dest_path = self.get_ext_fullpath(ext.name)
- ext_dest_dir = os.path.dirname(ext_dest_path)
- if not os.path.exists(ext_dest_dir):
- os.makedirs(ext_dest_dir)
- shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
+ with temp_fill_include_path("WORKSPACE"):
+ temp_path = Path(self.build_temp)
+
+ bazel_argv = [
+ "bazel",
+ "build",
+ ext.bazel_target,
+ f"--symlink_prefix={temp_path / 'bazel-'}",
+ f"--compilation_mode={'dbg' if self.debug else 'opt'}",
+ # C++17 is required by nanobind
+ f"--cxxopt={'/std:c++17' if IS_WINDOWS else '-std=c++17'}",
+ ]
+
+ if IS_WINDOWS:
+ # Link with python*.lib.
+ for library_dir in self.library_dirs:
+ bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
+ elif IS_MAC:
+ if platform.machine() == "x86_64":
+ # C++17 needs macOS 10.14 at minimum
+ bazel_argv.append("--macos_minimum_os=10.14")
+
+ # cross-compilation for Mac ARM64 on GitHub Mac x86 runners.
+ # ARCHFLAGS is set by cibuildwheel before macOS wheel builds.
+ archflags = os.getenv("ARCHFLAGS", "")
+ if "arm64" in archflags:
+ bazel_argv.append("--cpu=darwin_arm64")
+ bazel_argv.append("--macos_cpus=arm64")
+
+ elif platform.machine() == "arm64":
+ bazel_argv.append("--macos_minimum_os=11.0")
+
+ self.spawn(bazel_argv)
+
+ shared_lib_suffix = '.dll' if IS_WINDOWS else '.so'
+ ext_name = ext.target_name + shared_lib_suffix
+ ext_bazel_bin_path = temp_path / 'bazel-bin' / ext.relpath / ext_name
+
+ ext_dest_path = Path(self.get_ext_fullpath(ext.name))
+ shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
+
+ # explicitly call `bazel shutdown` for graceful exit
+ self.spawn(["bazel", "shutdown"])
setuptools.setup(
- name="google_benchmark",
- version=_get_version(),
- url="https://github.com/google/benchmark",
- description="A library to benchmark code snippets.",
- author="Google",
- author_email="benchmark-py@google.com",
- # Contained modules and scripts.
- package_dir={"": "bindings/python"},
- packages=setuptools.find_packages("bindings/python"),
- install_requires=_parse_requirements("bindings/python/requirements.txt"),
cmdclass=dict(build_ext=BuildBazelExtension),
ext_modules=[
BazelExtension(
- "google_benchmark._benchmark",
- "//bindings/python/google_benchmark:_benchmark",
+ name="google_benchmark._benchmark",
+ bazel_target="//bindings/python/google_benchmark:_benchmark",
)
],
- zip_safe=False,
- # PyPI package information.
- classifiers=[
- "Development Status :: 4 - Beta",
- "Intended Audience :: Developers",
- "Intended Audience :: Science/Research",
- "License :: OSI Approved :: Apache Software License",
- "Programming Language :: Python :: 3.6",
- "Programming Language :: Python :: 3.7",
- "Programming Language :: Python :: 3.8",
- "Topic :: Software Development :: Testing",
- "Topic :: System :: Benchmark",
- ],
- license="Apache 2.0",
- keywords="benchmark",
)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 35d559e..daf82fb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -25,32 +25,42 @@ set_target_properties(benchmark PROPERTIES
SOVERSION ${GENERIC_LIB_SOVERSION}
)
target_include_directories(benchmark PUBLIC
- $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
- )
+ $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
+)
-# Link threads.
-target_link_libraries(benchmark ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-find_library(LIBRT rt)
-if(LIBRT)
- target_link_libraries(benchmark ${LIBRT})
+# libpfm, if available
+if (PFM_FOUND)
+ target_link_libraries(benchmark PRIVATE PFM::libpfm)
+ target_compile_definitions(benchmark PRIVATE -DHAVE_LIBPFM)
endif()
-if(CMAKE_BUILD_TYPE)
- string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPER)
-endif()
-if(NOT CMAKE_THREAD_LIBS_INIT AND "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}" MATCHES ".*-fsanitize=[^ ]*address.*")
- message(WARNING "CMake's FindThreads.cmake did not fail, but CMAKE_THREAD_LIBS_INIT ended up being empty. This was fixed in https://github.com/Kitware/CMake/commit/d53317130e84898c5328c237186dbd995aaf1c12 Let's guess that -pthread is sufficient.")
- target_link_libraries(benchmark -pthread)
+# pthread affinity, if available
+if(HAVE_PTHREAD_AFFINITY)
+ target_compile_definitions(benchmark PRIVATE -DBENCHMARK_HAS_PTHREAD_AFFINITY)
endif()
+# Link threads.
+target_link_libraries(benchmark PRIVATE Threads::Threads)
+
+target_link_libraries(benchmark PRIVATE ${BENCHMARK_CXX_LIBRARIES})
+
+if(HAVE_LIB_RT)
+ target_link_libraries(benchmark PRIVATE rt)
+endif(HAVE_LIB_RT)
+
+
# We need extra libraries on Windows
if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
- target_link_libraries(benchmark shlwapi)
+ target_link_libraries(benchmark PRIVATE shlwapi)
endif()
# We need extra libraries on Solaris
if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
- target_link_libraries(benchmark kstat)
+ target_link_libraries(benchmark PRIVATE kstat)
+endif()
+
+if (NOT BUILD_SHARED_LIBS)
+ target_compile_definitions(benchmark PUBLIC -DBENCHMARK_STATIC_DEFINE)
endif()
# Benchmark main library
@@ -60,34 +70,45 @@ set_target_properties(benchmark_main PROPERTIES
OUTPUT_NAME "benchmark_main"
VERSION ${GENERIC_LIB_VERSION}
SOVERSION ${GENERIC_LIB_SOVERSION}
+ DEFINE_SYMBOL benchmark_EXPORTS
)
-target_include_directories(benchmark PUBLIC
- $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
- )
-target_link_libraries(benchmark_main benchmark::benchmark)
-
+target_link_libraries(benchmark_main PUBLIC benchmark::benchmark)
-set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
+set(generated_dir "${PROJECT_BINARY_DIR}")
set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc")
+set(targets_to_export benchmark benchmark_main)
set(targets_export_name "${PROJECT_NAME}Targets")
set(namespace "${PROJECT_NAME}::")
include(CMakePackageConfigHelpers)
+
+configure_package_config_file (
+ ${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in
+ ${project_config}
+ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
+ NO_SET_AND_CHECK_MACRO
+ NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
write_basic_package_version_file(
"${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY SameMajorVersion
)
-configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY)
configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY)
+export (
+ TARGETS ${targets_to_export}
+ NAMESPACE "${namespace}"
+ FILE ${generated_dir}/${targets_export_name}.cmake
+)
+
if (BENCHMARK_ENABLE_INSTALL)
# Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
install(
- TARGETS benchmark benchmark_main
+ TARGETS ${targets_to_export}
EXPORT ${targets_export_name}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
@@ -96,6 +117,7 @@ if (BENCHMARK_ENABLE_INSTALL)
install(
DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
+ "${PROJECT_BINARY_DIR}/include/benchmark"
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
FILES_MATCHING PATTERN "*.*h")
@@ -112,3 +134,37 @@ if (BENCHMARK_ENABLE_INSTALL)
NAMESPACE "${namespace}"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
endif()
+
+if (BENCHMARK_ENABLE_DOXYGEN)
+ find_package(Doxygen REQUIRED)
+ set(DOXYGEN_QUIET YES)
+ set(DOXYGEN_RECURSIVE YES)
+ set(DOXYGEN_GENERATE_HTML YES)
+ set(DOXYGEN_GENERATE_MAN NO)
+ set(DOXYGEN_MARKDOWN_SUPPORT YES)
+ set(DOXYGEN_BUILTIN_STL_SUPPORT YES)
+ set(DOXYGEN_EXTRACT_PACKAGE YES)
+ set(DOXYGEN_EXTRACT_STATIC YES)
+ set(DOXYGEN_SHOW_INCLUDE_FILES YES)
+ set(DOXYGEN_BINARY_TOC YES)
+ set(DOXYGEN_TOC_EXPAND YES)
+ set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "index.md")
+ doxygen_add_docs(benchmark_doxygen
+ docs
+ include
+ src
+ ALL
+ WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+ COMMENT "Building documentation with Doxygen.")
+ if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS)
+ install(
+ DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/html/"
+ DESTINATION ${CMAKE_INSTALL_DOCDIR})
+ endif()
+else()
+ if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS)
+ install(
+ DIRECTORY "${PROJECT_SOURCE_DIR}/docs/"
+ DESTINATION ${CMAKE_INSTALL_DOCDIR})
+ endif()
+endif()
diff --git a/src/benchmark.cc b/src/benchmark.cc
index 1c049f2..6139e59 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -13,12 +13,13 @@
// limitations under the License.
#include "benchmark/benchmark.h"
+
#include "benchmark_api_internal.h"
#include "benchmark_runner.h"
#include "internal_macros.h"
#ifndef BENCHMARK_OS_WINDOWS
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
#include <sys/resource.h>
#endif
#include <sys/time.h>
@@ -32,7 +33,10 @@
#include <cstdlib>
#include <fstream>
#include <iostream>
+#include <limits>
+#include <map>
#include <memory>
+#include <random>
#include <string>
#include <thread>
#include <utility>
@@ -45,94 +49,146 @@
#include "internal_macros.h"
#include "log.h"
#include "mutex.h"
+#include "perf_counters.h"
#include "re.h"
#include "statistics.h"
#include "string_util.h"
#include "thread_manager.h"
#include "thread_timer.h"
+namespace benchmark {
// Print a list of benchmarks. This option overrides all other options.
-DEFINE_bool(benchmark_list_tests, false);
+BM_DEFINE_bool(benchmark_list_tests, false);
// A regular expression that specifies the set of benchmarks to execute. If
// this flag is empty, or if this flag is the string \"all\", all benchmarks
// linked into the binary are run.
-DEFINE_string(benchmark_filter, ".");
+BM_DEFINE_string(benchmark_filter, "");
-// Minimum number of seconds we should run benchmark before results are
-// considered significant. For cpu-time based tests, this is the lower bound
+// Specification of how long to run the benchmark.
+//
+// It can be either an exact number of iterations (specified as `<integer>x`),
+// or a minimum number of seconds (specified as `<float>s`). If the latter
+// format (i.e., min seconds) is used, the system may run the benchmark longer
+// until the results are considered significant.
+//
+// For backward compatibility, the `s` suffix may be omitted, in which case,
+// the specified number is interpreted as the number of seconds.
+//
+// For cpu-time based tests, this is the lower bound
// on the total cpu time used by all threads that make up the test. For
// real-time based tests, this is the lower bound on the elapsed time of the
// benchmark execution, regardless of number of threads.
-DEFINE_double(benchmark_min_time, 0.5);
+BM_DEFINE_string(benchmark_min_time, kDefaultMinTimeStr);
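For reference, the accepted forms look like this (values are illustrative):

    --benchmark_min_time=2.5s    # run until at least 2.5 seconds have been measured
    --benchmark_min_time=1000x   # run exactly 1000 iterations
    --benchmark_min_time=0.5     # legacy form, interpreted as 0.5 seconds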
+
+// Minimum number of seconds a benchmark should be run before results should be
+// taken into account. This can be necessary, e.g., for benchmarks of code
+// which needs to fill some form of cache before performance is of interest.
+// Note: results gathered within this period are discarded and not used for
+// the reported result.
+BM_DEFINE_double(benchmark_min_warmup_time, 0.0);
// The number of runs of each benchmark. If greater than 1, the mean and
// standard deviation of the runs will be reported.
-DEFINE_int32(benchmark_repetitions, 1);
+BM_DEFINE_int32(benchmark_repetitions, 1);
+
+// If set, enable random interleaving of repetitions of all benchmarks.
+// See http://github.com/google/benchmark/issues/1051 for details.
+BM_DEFINE_bool(benchmark_enable_random_interleaving, false);
// Report the result of each benchmark repetition. When 'true' is specified
// only the mean, standard deviation, and other statistics are reported for
// repeated benchmarks. Affects all reporters.
-DEFINE_bool(benchmark_report_aggregates_only, false);
+BM_DEFINE_bool(benchmark_report_aggregates_only, false);
// Display the result of each benchmark repetition. When 'true' is specified
// only the mean, standard deviation, and other statistics are displayed for
// repeated benchmarks. Unlike benchmark_report_aggregates_only, only affects
// the display reporter, but *NOT* file reporter, which will still contain
// all the output.
-DEFINE_bool(benchmark_display_aggregates_only, false);
+BM_DEFINE_bool(benchmark_display_aggregates_only, false);
// The format to use for console output.
// Valid values are 'console', 'json', or 'csv'.
-DEFINE_string(benchmark_format, "console");
+BM_DEFINE_string(benchmark_format, "console");
// The format to use for file output.
// Valid values are 'console', 'json', or 'csv'.
-DEFINE_string(benchmark_out_format, "json");
+BM_DEFINE_string(benchmark_out_format, "json");
// The file to write additional output to.
-DEFINE_string(benchmark_out, "");
+BM_DEFINE_string(benchmark_out, "");
// Whether to use colors in the output. Valid values:
// 'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use colors if
// the output is being sent to a terminal and the TERM environment variable is
// set to a terminal type that supports colors.
-DEFINE_string(benchmark_color, "auto");
+BM_DEFINE_string(benchmark_color, "auto");
// Whether to use tabular format when printing user counters to the console.
// Valid values: 'true'/'yes'/1, 'false'/'no'/0. Defaults to false.
-DEFINE_bool(benchmark_counters_tabular, false);
+BM_DEFINE_bool(benchmark_counters_tabular, false);
-// The level of verbose logging to output
-DEFINE_int32(v, 0);
+// List of additional perf counters to collect, in libpfm format. For more
+// information about libpfm: https://man7.org/linux/man-pages/man3/libpfm.3.html
+BM_DEFINE_string(benchmark_perf_counters, "");
-namespace benchmark {
+// Extra context to include in the output formatted as comma-separated key-value
+// pairs. Kept internal as it's only used for parsing from env/command line.
+BM_DEFINE_kvpairs(benchmark_context, {});
+
+// Set the default time unit to use for reports
+// Valid values are 'ns', 'us', 'ms' or 's'
+BM_DEFINE_string(benchmark_time_unit, "");
+
+// The level of verbose logging to output
+BM_DEFINE_int32(v, 0);
namespace internal {
+std::map<std::string, std::string>* global_context = nullptr;
+
+BENCHMARK_EXPORT std::map<std::string, std::string>*& GetGlobalContext() {
+ return global_context;
+}
+
// FIXME: wouldn't LTO mess this up?
void UseCharPointer(char const volatile*) {}
} // namespace internal
-State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
- int thread_i, int n_threads, internal::ThreadTimer* timer,
- internal::ThreadManager* manager)
+State::State(std::string name, IterationCount max_iters,
+ const std::vector<int64_t>& ranges, int thread_i, int n_threads,
+ internal::ThreadTimer* timer, internal::ThreadManager* manager,
+ internal::PerfCountersMeasurement* perf_counters_measurement)
: total_iterations_(0),
batch_leftover_(0),
max_iterations(max_iters),
started_(false),
finished_(false),
- error_occurred_(false),
+ skipped_(internal::NotSkipped),
range_(ranges),
complexity_n_(0),
- counters(),
- thread_index(thread_i),
- threads(n_threads),
+ name_(std::move(name)),
+ thread_index_(thread_i),
+ threads_(n_threads),
timer_(timer),
- manager_(manager) {
- CHECK(max_iterations != 0) << "At least one iteration must be run";
- CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
+ manager_(manager),
+ perf_counters_measurement_(perf_counters_measurement) {
+ BM_CHECK(max_iterations != 0) << "At least one iteration must be run";
+ BM_CHECK_LT(thread_index_, threads_)
+ << "thread_index must be less than threads";
+
+ // Add counters with correct flag now. If added with `counters[name]` in
+ // `PauseTiming`, a new `Counter` will be inserted the first time, which
+ // won't have the flag. Inserting them now also reduces the allocations
+ // during the benchmark.
+ if (perf_counters_measurement_) {
+ for (const std::string& counter_name :
+ perf_counters_measurement_->names()) {
+ counters[counter_name] = Counter(0.0, Counter::kAvgIterations);
+ }
+ }
// Note: The use of offsetof below is technically undefined until C++17
// because State is not a standard layout type. However, all compilers
@@ -147,37 +203,78 @@ State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winvalid-offsetof"
#endif
+#if defined(__NVCC__)
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 1427
+#endif
+#if defined(__NVCOMPILER)
+#pragma diagnostic push
+#pragma diag_suppress offset_in_non_POD_nonstandard
+#endif
// Offset tests to ensure commonly accessed data is on the first cache line.
const int cache_line_size = 64;
- static_assert(offsetof(State, error_occurred_) <=
- (cache_line_size - sizeof(error_occurred_)),
- "");
+ static_assert(
+ offsetof(State, skipped_) <= (cache_line_size - sizeof(skipped_)), "");
#if defined(__INTEL_COMPILER)
#pragma warning pop
#elif defined(__GNUC__)
#pragma GCC diagnostic pop
#endif
+#if defined(__NVCC__)
+#pragma nv_diagnostic pop
+#endif
+#if defined(__NVCOMPILER)
+#pragma diagnostic pop
+#endif
}
void State::PauseTiming() {
// Add in time accumulated so far
- CHECK(started_ && !finished_ && !error_occurred_);
+ BM_CHECK(started_ && !finished_ && !skipped());
timer_->StopTimer();
+ if (perf_counters_measurement_) {
+ std::vector<std::pair<std::string, double>> measurements;
+ if (!perf_counters_measurement_->Stop(measurements)) {
+ BM_CHECK(false) << "Failed to read the perf counter values.";
+ }
+ for (const auto& name_and_measurement : measurements) {
+ const std::string& name = name_and_measurement.first;
+ const double measurement = name_and_measurement.second;
+ // Counter was inserted with `kAvgIterations` flag by the constructor.
+ assert(counters.find(name) != counters.end());
+ counters[name].value += measurement;
+ }
+ }
}
void State::ResumeTiming() {
- CHECK(started_ && !finished_ && !error_occurred_);
+ BM_CHECK(started_ && !finished_ && !skipped());
timer_->StartTimer();
+ if (perf_counters_measurement_) {
+ perf_counters_measurement_->Start();
+ }
}
-void State::SkipWithError(const char* msg) {
- CHECK(msg);
- error_occurred_ = true;
+void State::SkipWithMessage(const std::string& msg) {
+ skipped_ = internal::SkippedWithMessage;
{
MutexLock l(manager_->GetBenchmarkMutex());
- if (manager_->results.has_error_ == false) {
- manager_->results.error_message_ = msg;
- manager_->results.has_error_ = true;
+ if (internal::NotSkipped == manager_->results.skipped_) {
+ manager_->results.skip_message_ = msg;
+ manager_->results.skipped_ = skipped_;
+ }
+ }
+ total_iterations_ = 0;
+ if (timer_->running()) timer_->StopTimer();
+}
+
+void State::SkipWithError(const std::string& msg) {
+ skipped_ = internal::SkippedWithError;
+ {
+ MutexLock l(manager_->GetBenchmarkMutex());
+ if (internal::NotSkipped == manager_->results.skipped_) {
+ manager_->results.skip_message_ = msg;
+ manager_->results.skipped_ = skipped_;
}
}
total_iterations_ = 0;
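Typical use from a benchmark body; ResourceAvailable() is a hypothetical predicate, and SkipWithMessage() works the same way but marks the run as skipped rather than errored:

    bool ResourceAvailable();  // hypothetical check, not part of the library

    static void BM_NeedsResource(benchmark::State& state) {
      if (!ResourceAvailable()) {
        state.SkipWithError("resource unavailable");
        return;  // early return; the measurement loop would not run anyway
      }
      for (auto _ : state) {
        // measured work
      }
    }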
@@ -188,22 +285,22 @@ void State::SetIterationTime(double seconds) {
timer_->SetIterationTime(seconds);
}
-void State::SetLabel(const char* label) {
+void State::SetLabel(const std::string& label) {
MutexLock l(manager_->GetBenchmarkMutex());
manager_->results.report_label_ = label;
}
void State::StartKeepRunning() {
- CHECK(!started_ && !finished_);
+ BM_CHECK(!started_ && !finished_);
started_ = true;
- total_iterations_ = error_occurred_ ? 0 : max_iterations;
+ total_iterations_ = skipped() ? 0 : max_iterations;
manager_->StartStopBarrier();
- if (!error_occurred_) ResumeTiming();
+ if (!skipped()) ResumeTiming();
}
void State::FinishKeepRunning() {
- CHECK(started_ && (!finished_ || error_occurred_));
- if (!error_occurred_) {
+ BM_CHECK(started_ && (!finished_ || skipped()));
+ if (!skipped()) {
PauseTiming();
}
// Total iterations has now wrapped around past 0. Fix this.
@@ -215,11 +312,42 @@ void State::FinishKeepRunning() {
namespace internal {
namespace {
+// Flushes streams after invoking reporter methods that write to them. This
+// ensures users get timely updates even when streams are not line-buffered.
+void FlushStreams(BenchmarkReporter* reporter) {
+ if (!reporter) return;
+ std::flush(reporter->GetOutputStream());
+ std::flush(reporter->GetErrorStream());
+}
+
+// Reports in both display and file reporters.
+void Report(BenchmarkReporter* display_reporter,
+ BenchmarkReporter* file_reporter, const RunResults& run_results) {
+ auto report_one = [](BenchmarkReporter* reporter, bool aggregates_only,
+ const RunResults& results) {
+ assert(reporter);
+ // If there are no aggregates, do output non-aggregates.
+ aggregates_only &= !results.aggregates_only.empty();
+ if (!aggregates_only) reporter->ReportRuns(results.non_aggregates);
+ if (!results.aggregates_only.empty())
+ reporter->ReportRuns(results.aggregates_only);
+ };
+
+ report_one(display_reporter, run_results.display_report_aggregates_only,
+ run_results);
+ if (file_reporter)
+ report_one(file_reporter, run_results.file_report_aggregates_only,
+ run_results);
+
+ FlushStreams(display_reporter);
+ FlushStreams(file_reporter);
+}
+
void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
BenchmarkReporter* display_reporter,
BenchmarkReporter* file_reporter) {
// Note the file_reporter can be null.
- CHECK(display_reporter != nullptr);
+ BM_CHECK(display_reporter != nullptr);
// Determine the width of the name field using a minimum width of 10.
bool might_have_aggregates = FLAGS_benchmark_repetitions > 1;
@@ -227,10 +355,10 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
size_t stat_field_width = 0;
for (const BenchmarkInstance& benchmark : benchmarks) {
name_field_width =
- std::max<size_t>(name_field_width, benchmark.name.str().size());
- might_have_aggregates |= benchmark.repetitions > 1;
+ std::max<size_t>(name_field_width, benchmark.name().str().size());
+ might_have_aggregates |= benchmark.repetitions() > 1;
- for (const auto& Stat : *benchmark.statistics)
+ for (const auto& Stat : benchmark.statistics())
stat_field_width = std::max<size_t>(stat_field_width, Stat.name_.size());
}
if (might_have_aggregates) name_field_width += 1 + stat_field_width;
@@ -239,75 +367,129 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
BenchmarkReporter::Context context;
context.name_field_width = name_field_width;
- // Keep track of running times of all instances of current benchmark
- std::vector<BenchmarkReporter::Run> complexity_reports;
-
- // We flush streams after invoking reporter methods that write to them. This
- // ensures users get timely updates even when streams are not line-buffered.
- auto flushStreams = [](BenchmarkReporter* reporter) {
- if (!reporter) return;
- std::flush(reporter->GetOutputStream());
- std::flush(reporter->GetErrorStream());
- };
+ // Keep track of running times of all instances of each benchmark family.
+ std::map<int /*family_index*/, BenchmarkReporter::PerFamilyRunReports>
+ per_family_reports;
if (display_reporter->ReportContext(context) &&
(!file_reporter || file_reporter->ReportContext(context))) {
- flushStreams(display_reporter);
- flushStreams(file_reporter);
-
- for (const auto& benchmark : benchmarks) {
- RunResults run_results = RunBenchmark(benchmark, &complexity_reports);
-
- auto report = [&run_results](BenchmarkReporter* reporter,
- bool report_aggregates_only) {
- assert(reporter);
- // If there are no aggregates, do output non-aggregates.
- report_aggregates_only &= !run_results.aggregates_only.empty();
- if (!report_aggregates_only)
- reporter->ReportRuns(run_results.non_aggregates);
- if (!run_results.aggregates_only.empty())
- reporter->ReportRuns(run_results.aggregates_only);
- };
-
- report(display_reporter, run_results.display_report_aggregates_only);
- if (file_reporter)
- report(file_reporter, run_results.file_report_aggregates_only);
+ FlushStreams(display_reporter);
+ FlushStreams(file_reporter);
+
+ size_t num_repetitions_total = 0;
+
+ // This perfcounters object needs to be created before the runners vector
+ // below so it outlasts their lifetime.
+ PerfCountersMeasurement perfcounters(
+ StrSplit(FLAGS_benchmark_perf_counters, ','));
+
+ // Vector of benchmarks to run
+ std::vector<internal::BenchmarkRunner> runners;
+ runners.reserve(benchmarks.size());
+
+ // Count the number of benchmarks with threads to warn the user in case
+ // performance counters are used.
+ int benchmarks_with_threads = 0;
+
+ // Loop through all benchmarks
+ for (const BenchmarkInstance& benchmark : benchmarks) {
+ BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
+ if (benchmark.complexity() != oNone)
+ reports_for_family = &per_family_reports[benchmark.family_index()];
+ benchmarks_with_threads += (benchmark.threads() > 1);
+ runners.emplace_back(benchmark, &perfcounters, reports_for_family);
+ int num_repeats_of_this_instance = runners.back().GetNumRepeats();
+ num_repetitions_total += num_repeats_of_this_instance;
+ if (reports_for_family)
+ reports_for_family->num_runs_total += num_repeats_of_this_instance;
+ }
+ assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
+
+ // The use of performance counters with threads would be unintuitive for
+ // the average user so we need to warn them about this case
+ if ((benchmarks_with_threads > 0) && (perfcounters.num_counters() > 0)) {
+ GetErrorLogInstance()
+ << "***WARNING*** There are " << benchmarks_with_threads
+ << " benchmarks with threads and " << perfcounters.num_counters()
+ << " performance counters were requested. Beware counters will "
+ "reflect the combined usage across all "
+ "threads.\n";
+ }
+
+ std::vector<size_t> repetition_indices;
+ repetition_indices.reserve(num_repetitions_total);
+ for (size_t runner_index = 0, num_runners = runners.size();
+ runner_index != num_runners; ++runner_index) {
+ const internal::BenchmarkRunner& runner = runners[runner_index];
+ std::fill_n(std::back_inserter(repetition_indices),
+ runner.GetNumRepeats(), runner_index);
+ }
+ assert(repetition_indices.size() == num_repetitions_total &&
+ "Unexpected number of repetition indexes.");
+
+ if (FLAGS_benchmark_enable_random_interleaving) {
+ std::random_device rd;
+ std::mt19937 g(rd());
+ std::shuffle(repetition_indices.begin(), repetition_indices.end(), g);
+ }
+
+ for (size_t repetition_index : repetition_indices) {
+ internal::BenchmarkRunner& runner = runners[repetition_index];
+ runner.DoOneRepetition();
+ if (runner.HasRepeatsRemaining()) continue;
+ // FIXME: report each repetition separately, not all of them in bulk.
- flushStreams(display_reporter);
- flushStreams(file_reporter);
+ display_reporter->ReportRunsConfig(
+ runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters());
+ if (file_reporter)
+ file_reporter->ReportRunsConfig(
+ runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters());
+
+ RunResults run_results = runner.GetResults();
+
+ // Maybe calculate complexity report
+ if (const auto* reports_for_family = runner.GetReportsForFamily()) {
+ if (reports_for_family->num_runs_done ==
+ reports_for_family->num_runs_total) {
+ auto additional_run_stats = ComputeBigO(reports_for_family->Runs);
+ run_results.aggregates_only.insert(run_results.aggregates_only.end(),
+ additional_run_stats.begin(),
+ additional_run_stats.end());
+ per_family_reports.erase(
+ static_cast<int>(reports_for_family->Runs.front().family_index));
+ }
+ }
+
+ Report(display_reporter, file_reporter, run_results);
}
}
display_reporter->Finalize();
if (file_reporter) file_reporter->Finalize();
- flushStreams(display_reporter);
- flushStreams(file_reporter);
+ FlushStreams(display_reporter);
+ FlushStreams(file_reporter);
}
// Disable deprecated warnings temporarily because we need to reference
// CSVReporter but don't want to trigger -Werror=-Wdeprecated-declarations
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
+BENCHMARK_DISABLE_DEPRECATED_WARNING
std::unique_ptr<BenchmarkReporter> CreateReporter(
std::string const& name, ConsoleReporter::OutputOptions output_opts) {
typedef std::unique_ptr<BenchmarkReporter> PtrType;
if (name == "console") {
return PtrType(new ConsoleReporter(output_opts));
- } else if (name == "json") {
- return PtrType(new JSONReporter);
- } else if (name == "csv") {
- return PtrType(new CSVReporter);
- } else {
- std::cerr << "Unexpected format: '" << name << "'\n";
- std::exit(1);
}
+ if (name == "json") {
+ return PtrType(new JSONReporter());
+ }
+ if (name == "csv") {
+ return PtrType(new CSVReporter());
+ }
+ std::cerr << "Unexpected format: '" << name << "'\n";
+ std::exit(1);
}
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
+BENCHMARK_RESTORE_DEPRECATED_WARNING
} // end namespace
@@ -341,17 +523,41 @@ ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) {
} // end namespace internal
+BenchmarkReporter* CreateDefaultDisplayReporter() {
+ static auto default_display_reporter =
+ internal::CreateReporter(FLAGS_benchmark_format,
+ internal::GetOutputOptions())
+ .release();
+ return default_display_reporter;
+}
+
size_t RunSpecifiedBenchmarks() {
- return RunSpecifiedBenchmarks(nullptr, nullptr);
+ return RunSpecifiedBenchmarks(nullptr, nullptr, FLAGS_benchmark_filter);
+}
+
+size_t RunSpecifiedBenchmarks(std::string spec) {
+ return RunSpecifiedBenchmarks(nullptr, nullptr, std::move(spec));
}
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter) {
- return RunSpecifiedBenchmarks(display_reporter, nullptr);
+ return RunSpecifiedBenchmarks(display_reporter, nullptr,
+ FLAGS_benchmark_filter);
+}
+
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+ std::string spec) {
+ return RunSpecifiedBenchmarks(display_reporter, nullptr, std::move(spec));
}
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
BenchmarkReporter* file_reporter) {
- std::string spec = FLAGS_benchmark_filter;
+ return RunSpecifiedBenchmarks(display_reporter, file_reporter,
+ FLAGS_benchmark_filter);
+}
+
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+ BenchmarkReporter* file_reporter,
+ std::string spec) {
if (spec.empty() || spec == "all")
spec = "."; // Regexp that matches all benchmarks
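With the new overloads the filter can be supplied programmatically instead of via --benchmark_filter; the regex below is just an example:

    benchmark::Initialize(&argc, argv);
    benchmark::RunSpecifiedBenchmarks("BM_Memcpy.*");  // default reporters, custom spec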
@@ -360,8 +566,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
std::unique_ptr<BenchmarkReporter> default_display_reporter;
std::unique_ptr<BenchmarkReporter> default_file_reporter;
if (!display_reporter) {
- default_display_reporter = internal::CreateReporter(
- FLAGS_benchmark_format, internal::GetOutputOptions());
+ default_display_reporter.reset(CreateDefaultDisplayReporter());
display_reporter = default_display_reporter.get();
}
auto& Out = display_reporter->GetOutputStream();
@@ -377,12 +582,14 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
if (!fname.empty()) {
output_file.open(fname);
if (!output_file.is_open()) {
- Err << "invalid file name: '" << fname << std::endl;
+ Err << "invalid file name: '" << fname << "'" << std::endl;
std::exit(1);
}
if (!file_reporter) {
default_file_reporter = internal::CreateReporter(
- FLAGS_benchmark_out_format, ConsoleReporter::OO_None);
+ FLAGS_benchmark_out_format, FLAGS_benchmark_counters_tabular
+ ? ConsoleReporter::OO_Tabular
+ : ConsoleReporter::OO_None);
file_reporter = default_file_reporter.get();
}
file_reporter->SetOutputStream(&output_file);
@@ -399,7 +606,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
if (FLAGS_benchmark_list_tests) {
for (auto const& benchmark : benchmarks)
- Out << benchmark.name.str() << "\n";
+ Out << benchmark.name().str() << "\n";
} else {
internal::RunBenchmarks(benchmarks, display_reporter, file_reporter);
}
@@ -407,30 +614,64 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
return benchmarks.size();
}
+namespace {
+// stores the time unit benchmarks use by default
+TimeUnit default_time_unit = kNanosecond;
+} // namespace
+
+TimeUnit GetDefaultTimeUnit() { return default_time_unit; }
+
+void SetDefaultTimeUnit(TimeUnit unit) { default_time_unit = unit; }
+
+std::string GetBenchmarkFilter() { return FLAGS_benchmark_filter; }
+
+void SetBenchmarkFilter(std::string value) {
+ FLAGS_benchmark_filter = std::move(value);
+}
+
+int32_t GetBenchmarkVerbosity() { return FLAGS_v; }
+
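These accessors are programmatic counterparts of the corresponding flags, e.g.:

    benchmark::SetDefaultTimeUnit(benchmark::kMillisecond);  // like --benchmark_time_unit=ms
    benchmark::SetBenchmarkFilter("BM_Sort.*");              // like --benchmark_filter=BM_Sort.*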
void RegisterMemoryManager(MemoryManager* manager) {
internal::memory_manager = manager;
}
+void AddCustomContext(const std::string& key, const std::string& value) {
+ if (internal::global_context == nullptr) {
+ internal::global_context = new std::map<std::string, std::string>();
+ }
+ if (!internal::global_context->emplace(key, value).second) {
+ std::cerr << "Failed to add custom context \"" << key << "\" as it already "
+ << "exists with value \"" << value << "\"\n";
+ }
+}
+
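Custom context entries surface in the reporters' context output (for example the JSON "context" block); the keys and values below are placeholders:

    benchmark::AddCustomContext("machine", "lab-node-17");
    benchmark::AddCustomContext("compiler", "clang-15");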
namespace internal {
+void (*HelperPrintf)();
+
void PrintUsageAndExit() {
- fprintf(stdout,
- "benchmark"
- " [--benchmark_list_tests={true|false}]\n"
- " [--benchmark_filter=<regex>]\n"
- " [--benchmark_min_time=<min_time>]\n"
- " [--benchmark_repetitions=<num_repetitions>]\n"
- " [--benchmark_report_aggregates_only={true|false}]\n"
- " [--benchmark_display_aggregates_only={true|false}]\n"
- " [--benchmark_format=<console|json|csv>]\n"
- " [--benchmark_out=<filename>]\n"
- " [--benchmark_out_format=<json|console|csv>]\n"
- " [--benchmark_color={auto|true|false}]\n"
- " [--benchmark_counters_tabular={true|false}]\n"
- " [--v=<verbosity>]\n");
+ HelperPrintf();
exit(0);
}
+void SetDefaultTimeUnitFromFlag(const std::string& time_unit_flag) {
+ if (time_unit_flag == "s") {
+ return SetDefaultTimeUnit(kSecond);
+ }
+ if (time_unit_flag == "ms") {
+ return SetDefaultTimeUnit(kMillisecond);
+ }
+ if (time_unit_flag == "us") {
+ return SetDefaultTimeUnit(kMicrosecond);
+ }
+ if (time_unit_flag == "ns") {
+ return SetDefaultTimeUnit(kNanosecond);
+ }
+ if (!time_unit_flag.empty()) {
+ PrintUsageAndExit();
+ }
+}
+
void ParseCommandLineFlags(int* argc, char** argv) {
using namespace benchmark;
BenchmarkReporter::Context::executable_name =
@@ -439,10 +680,14 @@ void ParseCommandLineFlags(int* argc, char** argv) {
if (ParseBoolFlag(argv[i], "benchmark_list_tests",
&FLAGS_benchmark_list_tests) ||
ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
- ParseDoubleFlag(argv[i], "benchmark_min_time",
+ ParseStringFlag(argv[i], "benchmark_min_time",
&FLAGS_benchmark_min_time) ||
+ ParseDoubleFlag(argv[i], "benchmark_min_warmup_time",
+ &FLAGS_benchmark_min_warmup_time) ||
ParseInt32Flag(argv[i], "benchmark_repetitions",
&FLAGS_benchmark_repetitions) ||
+ ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
+ &FLAGS_benchmark_enable_random_interleaving) ||
ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
&FLAGS_benchmark_report_aggregates_only) ||
ParseBoolFlag(argv[i], "benchmark_display_aggregates_only",
@@ -452,11 +697,14 @@ void ParseCommandLineFlags(int* argc, char** argv) {
ParseStringFlag(argv[i], "benchmark_out_format",
&FLAGS_benchmark_out_format) ||
ParseStringFlag(argv[i], "benchmark_color", &FLAGS_benchmark_color) ||
- // "color_print" is the deprecated name for "benchmark_color".
- // TODO: Remove this.
- ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) ||
ParseBoolFlag(argv[i], "benchmark_counters_tabular",
&FLAGS_benchmark_counters_tabular) ||
+ ParseStringFlag(argv[i], "benchmark_perf_counters",
+ &FLAGS_benchmark_perf_counters) ||
+ ParseKeyValueFlag(argv[i], "benchmark_context",
+ &FLAGS_benchmark_context) ||
+ ParseStringFlag(argv[i], "benchmark_time_unit",
+ &FLAGS_benchmark_time_unit) ||
ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1];
@@ -467,13 +715,18 @@ void ParseCommandLineFlags(int* argc, char** argv) {
}
}
for (auto const* flag :
- {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format})
+ {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format}) {
if (*flag != "console" && *flag != "json" && *flag != "csv") {
PrintUsageAndExit();
}
+ }
+ SetDefaultTimeUnitFromFlag(FLAGS_benchmark_time_unit);
if (FLAGS_benchmark_color.empty()) {
PrintUsageAndExit();
}
+ for (const auto& kv : FLAGS_benchmark_context) {
+ AddCustomContext(kv.first, kv.second);
+ }
}
int InitializeStreams() {
@@ -483,11 +736,38 @@ int InitializeStreams() {
} // end namespace internal
-void Initialize(int* argc, char** argv) {
+void PrintDefaultHelp() {
+ fprintf(stdout,
+ "benchmark"
+ " [--benchmark_list_tests={true|false}]\n"
+ " [--benchmark_filter=<regex>]\n"
+ " [--benchmark_min_time=`<integer>x` OR `<float>s` ]\n"
+ " [--benchmark_min_warmup_time=<min_warmup_time>]\n"
+ " [--benchmark_repetitions=<num_repetitions>]\n"
+ " [--benchmark_enable_random_interleaving={true|false}]\n"
+ " [--benchmark_report_aggregates_only={true|false}]\n"
+ " [--benchmark_display_aggregates_only={true|false}]\n"
+ " [--benchmark_format=<console|json|csv>]\n"
+ " [--benchmark_out=<filename>]\n"
+ " [--benchmark_out_format=<json|console|csv>]\n"
+ " [--benchmark_color={auto|true|false}]\n"
+ " [--benchmark_counters_tabular={true|false}]\n"
+#if defined HAVE_LIBPFM
+ " [--benchmark_perf_counters=<counter>,...]\n"
+#endif
+ " [--benchmark_context=<key>=<value>,...]\n"
+ " [--benchmark_time_unit={ns|us|ms|s}]\n"
+ " [--v=<verbosity>]\n");
+}
+
+void Initialize(int* argc, char** argv, void (*HelperPrintf)()) {
+ internal::HelperPrintf = HelperPrintf;
internal::ParseCommandLineFlags(argc, argv);
internal::LogLevel() = FLAGS_v;
}
+void Shutdown() { delete internal::global_context; }
+
bool ReportUnrecognizedArguments(int argc, char** argv) {
for (int i = 1; i < argc; ++i) {
fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0],
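
The hunk above replaces the hard-coded usage text with a pluggable help printer, adds a global custom-context map, and introduces an explicit Shutdown(). Below is a minimal sketch of how a custom main() might exercise these entry points; the helper name, the context key/value, and the assumption that the public header defaults the third Initialize() argument are illustrative, not taken from this diff.

#include <benchmark/benchmark.h>

#include <cstdio>

// Hypothetical replacement for the default help printer; any void() function works.
static void MyHelp() {
  std::printf(
      "mybench [--benchmark_filter=<regex>] "
      "[--benchmark_min_time=<float>s|<integer>x]\n");
}

static void BM_Noop(benchmark::State& state) {
  for (auto _ : state) {
    benchmark::DoNotOptimize(state.iterations());
  }
}
BENCHMARK(BM_Noop);

int main(int argc, char** argv) {
  // Key/value pairs land in the reporter context, next to --benchmark_context=... flags.
  benchmark::AddCustomContext("build", "release");
  benchmark::Initialize(&argc, argv, &MyHelp);
  if (benchmark::ReportUnrecognizedArguments(argc, argv)) return 1;
  benchmark::RunSpecifiedBenchmarks();
  benchmark::Shutdown();  // frees the global custom-context map created above
  return 0;
}
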
diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc
index d468a25..286f986 100644
--- a/src/benchmark_api_internal.cc
+++ b/src/benchmark_api_internal.cc
@@ -1,15 +1,118 @@
#include "benchmark_api_internal.h"
+#include <cinttypes>
+
+#include "string_util.h"
+
namespace benchmark {
namespace internal {
-State BenchmarkInstance::Run(IterationCount iters, int thread_id,
- internal::ThreadTimer* timer,
- internal::ThreadManager* manager) const {
- State st(iters, arg, thread_id, threads, timer, manager);
- benchmark->Run(st);
+BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
+ int per_family_instance_idx,
+ const std::vector<int64_t>& args,
+ int thread_count)
+ : benchmark_(*benchmark),
+ family_index_(family_idx),
+ per_family_instance_index_(per_family_instance_idx),
+ aggregation_report_mode_(benchmark_.aggregation_report_mode_),
+ args_(args),
+ time_unit_(benchmark_.GetTimeUnit()),
+ measure_process_cpu_time_(benchmark_.measure_process_cpu_time_),
+ use_real_time_(benchmark_.use_real_time_),
+ use_manual_time_(benchmark_.use_manual_time_),
+ complexity_(benchmark_.complexity_),
+ complexity_lambda_(benchmark_.complexity_lambda_),
+ statistics_(benchmark_.statistics_),
+ repetitions_(benchmark_.repetitions_),
+ min_time_(benchmark_.min_time_),
+ min_warmup_time_(benchmark_.min_warmup_time_),
+ iterations_(benchmark_.iterations_),
+ threads_(thread_count) {
+ name_.function_name = benchmark_.name_;
+
+ size_t arg_i = 0;
+ for (const auto& arg : args) {
+ if (!name_.args.empty()) {
+ name_.args += '/';
+ }
+
+ if (arg_i < benchmark->arg_names_.size()) {
+ const auto& arg_name = benchmark_.arg_names_[arg_i];
+ if (!arg_name.empty()) {
+ name_.args += StrFormat("%s:", arg_name.c_str());
+ }
+ }
+
+ name_.args += StrFormat("%" PRId64, arg);
+ ++arg_i;
+ }
+
+ if (!IsZero(benchmark->min_time_)) {
+ name_.min_time = StrFormat("min_time:%0.3f", benchmark_.min_time_);
+ }
+
+ if (!IsZero(benchmark->min_warmup_time_)) {
+ name_.min_warmup_time =
+ StrFormat("min_warmup_time:%0.3f", benchmark_.min_warmup_time_);
+ }
+
+ if (benchmark_.iterations_ != 0) {
+ name_.iterations = StrFormat(
+ "iterations:%lu", static_cast<unsigned long>(benchmark_.iterations_));
+ }
+
+ if (benchmark_.repetitions_ != 0) {
+ name_.repetitions = StrFormat("repeats:%d", benchmark_.repetitions_);
+ }
+
+ if (benchmark_.measure_process_cpu_time_) {
+ name_.time_type = "process_time";
+ }
+
+ if (benchmark_.use_manual_time_) {
+ if (!name_.time_type.empty()) {
+ name_.time_type += '/';
+ }
+ name_.time_type += "manual_time";
+ } else if (benchmark_.use_real_time_) {
+ if (!name_.time_type.empty()) {
+ name_.time_type += '/';
+ }
+ name_.time_type += "real_time";
+ }
+
+ if (!benchmark_.thread_counts_.empty()) {
+ name_.threads = StrFormat("threads:%d", threads_);
+ }
+
+ setup_ = benchmark_.setup_;
+ teardown_ = benchmark_.teardown_;
+}
+
+State BenchmarkInstance::Run(
+ IterationCount iters, int thread_id, internal::ThreadTimer* timer,
+ internal::ThreadManager* manager,
+ internal::PerfCountersMeasurement* perf_counters_measurement) const {
+ State st(name_.function_name, iters, args_, thread_id, threads_, timer,
+ manager, perf_counters_measurement);
+ benchmark_.Run(st);
return st;
}
-} // internal
-} // benchmark
+void BenchmarkInstance::Setup() const {
+ if (setup_) {
+ State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
+ nullptr, nullptr, nullptr);
+ setup_(st);
+ }
+}
+
+void BenchmarkInstance::Teardown() const {
+ if (teardown_) {
+ State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
+ nullptr, nullptr, nullptr);
+ teardown_(st);
+ }
+}
+} // namespace internal
+} // namespace benchmark
diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h
index 264eff9..94f5165 100644
--- a/src/benchmark_api_internal.h
+++ b/src/benchmark_api_internal.h
@@ -1,9 +1,6 @@
#ifndef BENCHMARK_API_INTERNAL_H
#define BENCHMARK_API_INTERNAL_H
-#include "benchmark/benchmark.h"
-#include "commandlineflags.h"
-
#include <cmath>
#include <iosfwd>
#include <limits>
@@ -11,32 +8,68 @@
#include <string>
#include <vector>
+#include "benchmark/benchmark.h"
+#include "commandlineflags.h"
+
namespace benchmark {
namespace internal {
// Information kept per benchmark we may want to run
-struct BenchmarkInstance {
- BenchmarkName name;
- Benchmark* benchmark;
- AggregationReportMode aggregation_report_mode;
- std::vector<int64_t> arg;
- TimeUnit time_unit;
- int range_multiplier;
- bool measure_process_cpu_time;
- bool use_real_time;
- bool use_manual_time;
- BigO complexity;
- BigOFunc* complexity_lambda;
- UserCounters counters;
- const std::vector<Statistics>* statistics;
- bool last_benchmark_instance;
- int repetitions;
- double min_time;
- IterationCount iterations;
- int threads; // Number of concurrent threads to us
+class BenchmarkInstance {
+ public:
+ BenchmarkInstance(Benchmark* benchmark, int family_index,
+ int per_family_instance_index,
+ const std::vector<int64_t>& args, int threads);
+
+ const BenchmarkName& name() const { return name_; }
+ int family_index() const { return family_index_; }
+ int per_family_instance_index() const { return per_family_instance_index_; }
+ AggregationReportMode aggregation_report_mode() const {
+ return aggregation_report_mode_;
+ }
+ TimeUnit time_unit() const { return time_unit_; }
+ bool measure_process_cpu_time() const { return measure_process_cpu_time_; }
+ bool use_real_time() const { return use_real_time_; }
+ bool use_manual_time() const { return use_manual_time_; }
+ BigO complexity() const { return complexity_; }
+ BigOFunc* complexity_lambda() const { return complexity_lambda_; }
+ const std::vector<Statistics>& statistics() const { return statistics_; }
+ int repetitions() const { return repetitions_; }
+ double min_time() const { return min_time_; }
+ double min_warmup_time() const { return min_warmup_time_; }
+ IterationCount iterations() const { return iterations_; }
+ int threads() const { return threads_; }
+ void Setup() const;
+ void Teardown() const;
State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
- internal::ThreadManager* manager) const;
+ internal::ThreadManager* manager,
+ internal::PerfCountersMeasurement* perf_counters_measurement) const;
+
+ private:
+ BenchmarkName name_;
+ Benchmark& benchmark_;
+ const int family_index_;
+ const int per_family_instance_index_;
+ AggregationReportMode aggregation_report_mode_;
+ const std::vector<int64_t>& args_;
+ TimeUnit time_unit_;
+ bool measure_process_cpu_time_;
+ bool use_real_time_;
+ bool use_manual_time_;
+ BigO complexity_;
+ BigOFunc* complexity_lambda_;
+ UserCounters counters_;
+ const std::vector<Statistics>& statistics_;
+ int repetitions_;
+ double min_time_;
+ double min_warmup_time_;
+ IterationCount iterations_;
+ int threads_; // Number of concurrent threads to use
+
+ typedef void (*callback_function)(const benchmark::State&);
+ callback_function setup_ = nullptr;
+ callback_function teardown_ = nullptr;
};
bool FindBenchmarksInternal(const std::string& re,
@@ -45,6 +78,7 @@ bool FindBenchmarksInternal(const std::string& re,
bool IsZero(double n);
+BENCHMARK_EXPORT
ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false);
} // end namespace internal
diff --git a/src/benchmark_main.cc b/src/benchmark_main.cc
index b3b2478..cd61cd2 100644
--- a/src/benchmark_main.cc
+++ b/src/benchmark_main.cc
@@ -14,4 +14,5 @@
#include "benchmark/benchmark.h"
+BENCHMARK_EXPORT int main(int, char**);
BENCHMARK_MAIN();
diff --git a/src/benchmark_name.cc b/src/benchmark_name.cc
index 2a17ebc..01676bb 100644
--- a/src/benchmark_name.cc
+++ b/src/benchmark_name.cc
@@ -51,8 +51,9 @@ std::string join(char delimiter, const Ts&... ts) {
}
} // namespace
+BENCHMARK_EXPORT
std::string BenchmarkName::str() const {
- return join('/', function_name, args, min_time, iterations, repetitions,
- time_type, threads);
+ return join('/', function_name, args, min_time, min_warmup_time, iterations,
+ repetitions, time_type, threads);
}
} // namespace benchmark
diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc
index 65d9944..e447c9a 100644
--- a/src/benchmark_register.cc
+++ b/src/benchmark_register.cc
@@ -15,7 +15,7 @@
#include "benchmark_register.h"
#ifndef BENCHMARK_OS_WINDOWS
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
#include <sys/resource.h>
#endif
#include <sys/time.h>
@@ -24,6 +24,7 @@
#include <algorithm>
#include <atomic>
+#include <cinttypes>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
@@ -35,11 +36,6 @@
#include <sstream>
#include <thread>
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS
-#endif
-#include <inttypes.h>
-
#include "benchmark/benchmark.h"
#include "benchmark_api_internal.h"
#include "check.h"
@@ -57,10 +53,13 @@ namespace benchmark {
namespace {
// For non-dense Range, intermediate values are powers of kRangeMultiplier.
-static const int kRangeMultiplier = 8;
+static constexpr int kRangeMultiplier = 8;
+
// The size of a benchmark family determines the number of inputs to repeat
// the benchmark on. If this is "large" then warn the user during configuration.
-static const size_t kMaxFamilySize = 100;
+static constexpr size_t kMaxFamilySize = 100;
+
+static constexpr char kDisabledPrefix[] = "DISABLED_";
} // end namespace
namespace internal {
@@ -115,15 +114,15 @@ void BenchmarkFamilies::ClearBenchmarks() {
bool BenchmarkFamilies::FindBenchmarks(
std::string spec, std::vector<BenchmarkInstance>* benchmarks,
std::ostream* ErrStream) {
- CHECK(ErrStream);
+ BM_CHECK(ErrStream);
auto& Err = *ErrStream;
// Make regular expression out of command-line flag
std::string error_msg;
Regex re;
- bool isNegativeFilter = false;
+ bool is_negative_filter = false;
if (spec[0] == '-') {
spec.replace(0, 1, "");
- isNegativeFilter = true;
+ is_negative_filter = true;
}
if (!re.Init(spec, &error_msg)) {
Err << "Could not compile benchmark re: " << error_msg << std::endl;
@@ -133,8 +132,13 @@ bool BenchmarkFamilies::FindBenchmarks(
// Special list of thread counts to use when none are specified
const std::vector<int> one_thread = {1};
+ int next_family_index = 0;
+
MutexLock l(mutex_);
for (std::unique_ptr<Benchmark>& family : families_) {
+ int family_index = next_family_index;
+ int per_family_instance_index = 0;
+
// Family was deleted or benchmark doesn't match
if (!family) continue;
@@ -153,85 +157,27 @@ bool BenchmarkFamilies::FindBenchmarks(
<< " will be repeated at least " << family_size << " times.\n";
}
// reserve in the special case the regex ".", since we know the final
- // family size.
- if (spec == ".") benchmarks->reserve(family_size);
+ // family size. This doesn't take into account any disabled benchmarks,
+ // so worst case we reserve more than we need.
+ if (spec == ".") benchmarks->reserve(benchmarks->size() + family_size);
for (auto const& args : family->args_) {
for (int num_threads : *thread_counts) {
- BenchmarkInstance instance;
- instance.name.function_name = family->name_;
- instance.benchmark = family.get();
- instance.aggregation_report_mode = family->aggregation_report_mode_;
- instance.arg = args;
- instance.time_unit = family->time_unit_;
- instance.range_multiplier = family->range_multiplier_;
- instance.min_time = family->min_time_;
- instance.iterations = family->iterations_;
- instance.repetitions = family->repetitions_;
- instance.measure_process_cpu_time = family->measure_process_cpu_time_;
- instance.use_real_time = family->use_real_time_;
- instance.use_manual_time = family->use_manual_time_;
- instance.complexity = family->complexity_;
- instance.complexity_lambda = family->complexity_lambda_;
- instance.statistics = &family->statistics_;
- instance.threads = num_threads;
-
- // Add arguments to instance name
- size_t arg_i = 0;
- for (auto const& arg : args) {
- if (!instance.name.args.empty()) {
- instance.name.args += '/';
- }
-
- if (arg_i < family->arg_names_.size()) {
- const auto& arg_name = family->arg_names_[arg_i];
- if (!arg_name.empty()) {
- instance.name.args += StrFormat("%s:", arg_name.c_str());
- }
- }
-
- instance.name.args += StrFormat("%" PRId64, arg);
- ++arg_i;
- }
-
- if (!IsZero(family->min_time_))
- instance.name.min_time =
- StrFormat("min_time:%0.3f", family->min_time_);
- if (family->iterations_ != 0) {
- instance.name.iterations =
- StrFormat("iterations:%lu",
- static_cast<unsigned long>(family->iterations_));
- }
- if (family->repetitions_ != 0)
- instance.name.repetitions =
- StrFormat("repeats:%d", family->repetitions_);
-
- if (family->measure_process_cpu_time_) {
- instance.name.time_type = "process_time";
- }
+ BenchmarkInstance instance(family.get(), family_index,
+ per_family_instance_index, args,
+ num_threads);
+
+ const auto full_name = instance.name().str();
+ if (full_name.rfind(kDisabledPrefix, 0) != 0 &&
+ ((re.Match(full_name) && !is_negative_filter) ||
+ (!re.Match(full_name) && is_negative_filter))) {
+ benchmarks->push_back(std::move(instance));
- if (family->use_manual_time_) {
- if (!instance.name.time_type.empty()) {
- instance.name.time_type += '/';
- }
- instance.name.time_type += "manual_time";
- } else if (family->use_real_time_) {
- if (!instance.name.time_type.empty()) {
- instance.name.time_type += '/';
- }
- instance.name.time_type += "real_time";
- }
+ ++per_family_instance_index;
- // Add the number of threads used to the name
- if (!family->thread_counts_.empty()) {
- instance.name.threads = StrFormat("threads:%d", instance.threads);
- }
-
- const auto full_name = instance.name.str();
- if ((re.Match(full_name) && !isNegativeFilter) ||
- (!re.Match(full_name) && isNegativeFilter)) {
- instance.last_benchmark_instance = (&args == &family->args_.back());
- benchmarks->push_back(std::move(instance));
+ // Only bump the next family index once we've established that
+ // at least one instance of this family will be run.
+ if (next_family_index == family_index) ++next_family_index;
}
}
}
@@ -258,39 +204,50 @@ bool FindBenchmarksInternal(const std::string& re,
// Benchmark
//=============================================================================//
-Benchmark::Benchmark(const char* name)
+Benchmark::Benchmark(const std::string& name)
: name_(name),
aggregation_report_mode_(ARM_Unspecified),
- time_unit_(kNanosecond),
+ time_unit_(GetDefaultTimeUnit()),
+ use_default_time_unit_(true),
range_multiplier_(kRangeMultiplier),
min_time_(0),
+ min_warmup_time_(0),
iterations_(0),
repetitions_(0),
measure_process_cpu_time_(false),
use_real_time_(false),
use_manual_time_(false),
complexity_(oNone),
- complexity_lambda_(nullptr) {
+ complexity_lambda_(nullptr),
+ setup_(nullptr),
+ teardown_(nullptr) {
ComputeStatistics("mean", StatisticsMean);
ComputeStatistics("median", StatisticsMedian);
ComputeStatistics("stddev", StatisticsStdDev);
+ ComputeStatistics("cv", StatisticsCV, kPercentage);
}
Benchmark::~Benchmark() {}
+Benchmark* Benchmark::Name(const std::string& name) {
+ SetName(name);
+ return this;
+}
+
Benchmark* Benchmark::Arg(int64_t x) {
- CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+ BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
args_.push_back({x});
return this;
}
Benchmark* Benchmark::Unit(TimeUnit unit) {
time_unit_ = unit;
+ use_default_time_unit_ = false;
return this;
}
Benchmark* Benchmark::Range(int64_t start, int64_t limit) {
- CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+ BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
std::vector<int64_t> arglist;
AddRange(&arglist, start, limit, range_multiplier_);
@@ -302,7 +259,7 @@ Benchmark* Benchmark::Range(int64_t start, int64_t limit) {
Benchmark* Benchmark::Ranges(
const std::vector<std::pair<int64_t, int64_t>>& ranges) {
- CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
+ BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
std::vector<std::vector<int64_t>> arglists(ranges.size());
for (std::size_t i = 0; i < ranges.size(); i++) {
AddRange(&arglists[i], ranges[i].first, ranges[i].second,
@@ -316,7 +273,7 @@ Benchmark* Benchmark::Ranges(
Benchmark* Benchmark::ArgsProduct(
const std::vector<std::vector<int64_t>>& arglists) {
- CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(arglists.size()));
+ BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(arglists.size()));
std::vector<std::size_t> indices(arglists.size());
const std::size_t total = std::accumulate(
@@ -343,20 +300,20 @@ Benchmark* Benchmark::ArgsProduct(
}
Benchmark* Benchmark::ArgName(const std::string& name) {
- CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+ BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
arg_names_ = {name};
return this;
}
Benchmark* Benchmark::ArgNames(const std::vector<std::string>& names) {
- CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
+ BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
arg_names_ = names;
return this;
}
Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
- CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
- CHECK_LE(start, limit);
+ BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+ BM_CHECK_LE(start, limit);
for (int64_t arg = start; arg <= limit; arg += step) {
args_.push_back({arg});
}
@@ -364,7 +321,7 @@ Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
}
Benchmark* Benchmark::Args(const std::vector<int64_t>& args) {
- CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
+ BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
args_.push_back(args);
return this;
}
@@ -374,28 +331,48 @@ Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
return this;
}
+Benchmark* Benchmark::Setup(void (*setup)(const benchmark::State&)) {
+ BM_CHECK(setup != nullptr);
+ setup_ = setup;
+ return this;
+}
+
+Benchmark* Benchmark::Teardown(void (*teardown)(const benchmark::State&)) {
+ BM_CHECK(teardown != nullptr);
+ teardown_ = teardown;
+ return this;
+}
+
Benchmark* Benchmark::RangeMultiplier(int multiplier) {
- CHECK(multiplier > 1);
+ BM_CHECK(multiplier > 1);
range_multiplier_ = multiplier;
return this;
}
Benchmark* Benchmark::MinTime(double t) {
- CHECK(t > 0.0);
- CHECK(iterations_ == 0);
+ BM_CHECK(t > 0.0);
+ BM_CHECK(iterations_ == 0);
min_time_ = t;
return this;
}
+Benchmark* Benchmark::MinWarmUpTime(double t) {
+ BM_CHECK(t >= 0.0);
+ BM_CHECK(iterations_ == 0);
+ min_warmup_time_ = t;
+ return this;
+}
+
Benchmark* Benchmark::Iterations(IterationCount n) {
- CHECK(n > 0);
- CHECK(IsZero(min_time_));
+ BM_CHECK(n > 0);
+ BM_CHECK(IsZero(min_time_));
+ BM_CHECK(IsZero(min_warmup_time_));
iterations_ = n;
return this;
}
Benchmark* Benchmark::Repetitions(int n) {
- CHECK(n > 0);
+ BM_CHECK(n > 0);
repetitions_ = n;
return this;
}
@@ -428,14 +405,14 @@ Benchmark* Benchmark::MeasureProcessCPUTime() {
}
Benchmark* Benchmark::UseRealTime() {
- CHECK(!use_manual_time_)
+ BM_CHECK(!use_manual_time_)
<< "Cannot set UseRealTime and UseManualTime simultaneously.";
use_real_time_ = true;
return this;
}
Benchmark* Benchmark::UseManualTime() {
- CHECK(!use_real_time_)
+ BM_CHECK(!use_real_time_)
<< "Cannot set UseRealTime and UseManualTime simultaneously.";
use_manual_time_ = true;
return this;
@@ -452,21 +429,22 @@ Benchmark* Benchmark::Complexity(BigOFunc* complexity) {
return this;
}
-Benchmark* Benchmark::ComputeStatistics(std::string name,
- StatisticsFunc* statistics) {
- statistics_.emplace_back(name, statistics);
+Benchmark* Benchmark::ComputeStatistics(const std::string& name,
+ StatisticsFunc* statistics,
+ StatisticUnit unit) {
+ statistics_.emplace_back(name, statistics, unit);
return this;
}
Benchmark* Benchmark::Threads(int t) {
- CHECK_GT(t, 0);
+ BM_CHECK_GT(t, 0);
thread_counts_.push_back(t);
return this;
}
Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
- CHECK_GT(min_threads, 0);
- CHECK_GE(max_threads, min_threads);
+ BM_CHECK_GT(min_threads, 0);
+ BM_CHECK_GE(max_threads, min_threads);
AddRange(&thread_counts_, min_threads, max_threads, 2);
return this;
@@ -474,9 +452,9 @@ Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
Benchmark* Benchmark::DenseThreadRange(int min_threads, int max_threads,
int stride) {
- CHECK_GT(min_threads, 0);
- CHECK_GE(max_threads, min_threads);
- CHECK_GE(stride, 1);
+ BM_CHECK_GT(min_threads, 0);
+ BM_CHECK_GE(max_threads, min_threads);
+ BM_CHECK_GE(stride, 1);
for (auto i = min_threads; i < max_threads; i += stride) {
thread_counts_.push_back(i);
@@ -490,7 +468,9 @@ Benchmark* Benchmark::ThreadPerCpu() {
return this;
}
-void Benchmark::SetName(const char* name) { name_ = name; }
+void Benchmark::SetName(const std::string& name) { name_ = name; }
+
+const char* Benchmark::GetName() const { return name_.c_str(); }
int Benchmark::ArgsCnt() const {
if (args_.empty()) {
@@ -500,6 +480,16 @@ int Benchmark::ArgsCnt() const {
return static_cast<int>(args_.front().size());
}
+const char* Benchmark::GetArgName(int arg) const {
+ BM_CHECK_GE(arg, 0);
+ BM_CHECK_LT(arg, static_cast<int>(arg_names_.size()));
+ return arg_names_[arg].c_str();
+}
+
+TimeUnit Benchmark::GetTimeUnit() const {
+ return use_default_time_unit_ ? GetDefaultTimeUnit() : time_unit_;
+}
+
//=============================================================================//
// FunctionBenchmark
//=============================================================================//
@@ -512,4 +502,19 @@ void ClearRegisteredBenchmarks() {
internal::BenchmarkFamilies::GetInstance()->ClearBenchmarks();
}
+std::vector<int64_t> CreateRange(int64_t lo, int64_t hi, int multi) {
+ std::vector<int64_t> args;
+ internal::AddRange(&args, lo, hi, multi);
+ return args;
+}
+
+std::vector<int64_t> CreateDenseRange(int64_t start, int64_t limit, int step) {
+ BM_CHECK_LE(start, limit);
+ std::vector<int64_t> args;
+ for (int64_t arg = start; arg <= limit; arg += step) {
+ args.push_back(arg);
+ }
+ return args;
+}
+
} // end namespace benchmark
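
benchmark_register.cc above wires several new registration features through to BenchmarkInstance: per-benchmark Setup/Teardown callbacks, MinWarmUpTime, a settable default time unit, the DISABLED_ name prefix, and the CreateRange/CreateDenseRange helpers. A rough sketch of how they combine at registration time follows; the benchmark names, sizes, and values are made up for illustration.

#include <benchmark/benchmark.h>

#include <vector>

static std::vector<int> g_data;

static void DoSetup(const benchmark::State& state) {
  // Invoked before the benchmark runs; per the runner changes, it may be called
  // again while searching for an iteration count and before the warmup phase.
  g_data.assign(static_cast<size_t>(state.range(0)), 42);
}

static void DoTeardown(const benchmark::State&) { g_data.clear(); }

static void BM_Sum(benchmark::State& state) {
  for (auto _ : state) {
    long sum = 0;
    for (int v : g_data) sum += v;
    benchmark::DoNotOptimize(sum);
  }
}

BENCHMARK(BM_Sum)
    ->Name("BM_SumOfVector")  // overrides the registered name
    ->Setup(DoSetup)
    ->Teardown(DoTeardown)
    ->MinWarmUpTime(0.5)      // seconds of warmup before measuring
    ->ArgsProduct({benchmark::CreateRange(8, 1024, /*multi=*/8),
                   benchmark::CreateDenseRange(1, 4, /*step=*/1)});

// Benchmarks whose generated name starts with "DISABLED_" are skipped entirely.
static void DISABLED_BM_Experimental(benchmark::State& state) {
  for (auto _ : state) {
  }
}
BENCHMARK(DISABLED_BM_Experimental);

BENCHMARK_MAIN();
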
diff --git a/src/benchmark_register.h b/src/benchmark_register.h
index c774e6f..53367c7 100644
--- a/src/benchmark_register.h
+++ b/src/benchmark_register.h
@@ -1,6 +1,7 @@
#ifndef BENCHMARK_REGISTER_H
#define BENCHMARK_REGISTER_H
+#include <algorithm>
#include <limits>
#include <vector>
@@ -12,18 +13,18 @@ namespace internal {
// Append the powers of 'mult' in the closed interval [lo, hi].
// Returns iterator to the start of the inserted range.
template <typename T>
-typename std::vector<T>::iterator
-AddPowers(std::vector<T>* dst, T lo, T hi, int mult) {
- CHECK_GE(lo, 0);
- CHECK_GE(hi, lo);
- CHECK_GE(mult, 2);
+typename std::vector<T>::iterator AddPowers(std::vector<T>* dst, T lo, T hi,
+ int mult) {
+ BM_CHECK_GE(lo, 0);
+ BM_CHECK_GE(hi, lo);
+ BM_CHECK_GE(mult, 2);
const size_t start_offset = dst->size();
static const T kmax = std::numeric_limits<T>::max();
// Space out the values in multiples of "mult"
- for (T i = 1; i <= hi; i *= mult) {
+ for (T i = static_cast<T>(1); i <= hi; i *= static_cast<T>(mult)) {
if (i >= lo) {
dst->push_back(i);
}
@@ -32,16 +33,16 @@ AddPowers(std::vector<T>* dst, T lo, T hi, int mult) {
if (i > kmax / mult) break;
}
- return dst->begin() + start_offset;
+ return dst->begin() + static_cast<int>(start_offset);
}
template <typename T>
void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {
// We negate lo and hi so we require that they cannot be equal to 'min'.
- CHECK_GT(lo, std::numeric_limits<T>::min());
- CHECK_GT(hi, std::numeric_limits<T>::min());
- CHECK_GE(hi, lo);
- CHECK_LE(hi, 0);
+ BM_CHECK_GT(lo, std::numeric_limits<T>::min());
+ BM_CHECK_GT(hi, std::numeric_limits<T>::min());
+ BM_CHECK_GE(hi, lo);
+ BM_CHECK_LE(hi, 0);
// Add positive powers, then negate and reverse.
// Casts necessary since small integers get promoted
@@ -60,8 +61,8 @@ void AddRange(std::vector<T>* dst, T lo, T hi, int mult) {
static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
"Args type must be a signed integer");
- CHECK_GE(hi, lo);
- CHECK_GE(mult, 2);
+ BM_CHECK_GE(hi, lo);
+ BM_CHECK_GE(mult, 2);
// Add "lo"
dst->push_back(lo);
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
index 7bc6b63..f7ae424 100644
--- a/src/benchmark_runner.cc
+++ b/src/benchmark_runner.cc
@@ -13,12 +13,13 @@
// limitations under the License.
#include "benchmark_runner.h"
+
#include "benchmark/benchmark.h"
#include "benchmark_api_internal.h"
#include "internal_macros.h"
#ifndef BENCHMARK_OS_WINDOWS
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
#include <sys/resource.h>
#endif
#include <sys/time.h>
@@ -27,11 +28,14 @@
#include <algorithm>
#include <atomic>
+#include <climits>
+#include <cmath>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
+#include <limits>
#include <memory>
#include <string>
#include <thread>
@@ -45,6 +49,7 @@
#include "internal_macros.h"
#include "log.h"
#include "mutex.h"
+#include "perf_counters.h"
#include "re.h"
#include "statistics.h"
#include "string_util.h"
@@ -60,64 +65,72 @@ MemoryManager* memory_manager = nullptr;
namespace {
static constexpr IterationCount kMaxIterations = 1000000000;
+const double kDefaultMinTime =
+ std::strtod(::benchmark::kDefaultMinTimeStr, /*p_end*/ nullptr);
BenchmarkReporter::Run CreateRunReport(
const benchmark::internal::BenchmarkInstance& b,
const internal::ThreadManager::Result& results,
IterationCount memory_iterations,
- const MemoryManager::Result& memory_result, double seconds,
- int64_t repetition_index) {
+ const MemoryManager::Result* memory_result, double seconds,
+ int64_t repetition_index, int64_t repeats) {
// Create report about this benchmark run.
BenchmarkReporter::Run report;
- report.run_name = b.name;
- report.error_occurred = results.has_error_;
- report.error_message = results.error_message_;
+ report.run_name = b.name();
+ report.family_index = b.family_index();
+ report.per_family_instance_index = b.per_family_instance_index();
+ report.skipped = results.skipped_;
+ report.skip_message = results.skip_message_;
report.report_label = results.report_label_;
// This is the total iterations across all threads.
report.iterations = results.iterations;
- report.time_unit = b.time_unit;
- report.threads = b.threads;
+ report.time_unit = b.time_unit();
+ report.threads = b.threads();
report.repetition_index = repetition_index;
- report.repetitions = b.repetitions;
+ report.repetitions = repeats;
- if (!report.error_occurred) {
- if (b.use_manual_time) {
+ if (!report.skipped) {
+ if (b.use_manual_time()) {
report.real_accumulated_time = results.manual_time_used;
} else {
report.real_accumulated_time = results.real_time_used;
}
report.cpu_accumulated_time = results.cpu_time_used;
report.complexity_n = results.complexity_n;
- report.complexity = b.complexity;
- report.complexity_lambda = b.complexity_lambda;
- report.statistics = b.statistics;
+ report.complexity = b.complexity();
+ report.complexity_lambda = b.complexity_lambda();
+ report.statistics = &b.statistics();
report.counters = results.counters;
if (memory_iterations > 0) {
- report.has_memory_result = true;
+ assert(memory_result != nullptr);
+ report.memory_result = memory_result;
report.allocs_per_iter =
- memory_iterations ? static_cast<double>(memory_result.num_allocs) /
+ memory_iterations ? static_cast<double>(memory_result->num_allocs) /
memory_iterations
: 0;
- report.max_bytes_used = memory_result.max_bytes_used;
}
- internal::Finish(&report.counters, results.iterations, seconds, b.threads);
+ internal::Finish(&report.counters, results.iterations, seconds,
+ b.threads());
}
return report;
}
// Execute one thread of benchmark b for the specified number of iterations.
-// Adds the stats collected for the thread into *total.
+// Adds the stats collected for the thread into manager->results.
void RunInThread(const BenchmarkInstance* b, IterationCount iters,
- int thread_id, ThreadManager* manager) {
+ int thread_id, ThreadManager* manager,
+ PerfCountersMeasurement* perf_counters_measurement) {
internal::ThreadTimer timer(
- b->measure_process_cpu_time
+ b->measure_process_cpu_time()
? internal::ThreadTimer::CreateProcessCpuTime()
: internal::ThreadTimer::Create());
- State st = b->Run(iters, thread_id, &timer, manager);
- CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
+
+ State st =
+ b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
+ BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations)
<< "Benchmark returned before State::KeepRunning() returned false!";
{
MutexLock l(manager->GetBenchmarkMutex());
@@ -132,229 +145,351 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,
manager->NotifyThreadComplete();
}
-class BenchmarkRunner {
- public:
- BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
- std::vector<BenchmarkReporter::Run>* complexity_reports_)
- : b(b_),
- complexity_reports(*complexity_reports_),
- min_time(!IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time),
- repeats(b.repetitions != 0 ? b.repetitions
+double ComputeMinTime(const benchmark::internal::BenchmarkInstance& b,
+ const BenchTimeType& iters_or_time) {
+ if (!IsZero(b.min_time())) return b.min_time();
+ // If the flag was used to specify number of iters, then return the default
+ // min_time.
+ if (iters_or_time.tag == BenchTimeType::ITERS) return kDefaultMinTime;
+
+ return iters_or_time.time;
+}
+
+IterationCount ComputeIters(const benchmark::internal::BenchmarkInstance& b,
+ const BenchTimeType& iters_or_time) {
+ if (b.iterations() != 0) return b.iterations();
+
+ // We've already concluded that this flag is currently used to pass
+ // iters but do a check here again anyway.
+ BM_CHECK(iters_or_time.tag == BenchTimeType::ITERS);
+ return iters_or_time.iters;
+}
+
+} // end namespace
+
+BenchTimeType ParseBenchMinTime(const std::string& value) {
+ BenchTimeType ret;
+
+ if (value.empty()) {
+ ret.tag = BenchTimeType::TIME;
+ ret.time = 0.0;
+ return ret;
+ }
+
+ if (value.back() == 'x') {
+ char* p_end;
+ // Reset errno before it's changed by strtol.
+ errno = 0;
+ IterationCount num_iters = std::strtol(value.c_str(), &p_end, 10);
+
+ // After a valid parse, p_end should have been set to
+ // point to the 'x' suffix.
+ BM_CHECK(errno == 0 && p_end != nullptr && *p_end == 'x')
+ << "Malformed iters value passed to --benchmark_min_time: `" << value
+ << "`. Expected --benchmark_min_time=<integer>x.";
+
+ ret.tag = BenchTimeType::ITERS;
+ ret.iters = num_iters;
+ return ret;
+ }
+
+ bool has_suffix = value.back() == 's';
+ if (!has_suffix) {
+ BM_VLOG(0) << "Value passed to --benchmark_min_time should have a suffix. "
+ "Eg., `30s` for 30-seconds.";
+ }
+
+ char* p_end;
+ // Reset errno before it's changed by strtod.
+ errno = 0;
+ double min_time = std::strtod(value.c_str(), &p_end);
+
+ // After a successful parse, p_end should point to the suffix 's',
+ // or the end of the string if the suffix was omitted.
+ BM_CHECK(errno == 0 && p_end != nullptr &&
+ ((has_suffix && *p_end == 's') || *p_end == '\0'))
+ << "Malformed seconds value passed to --benchmark_min_time: `" << value
+ << "`. Expected --benchmark_min_time=<float>x.";
+
+ ret.tag = BenchTimeType::TIME;
+ ret.time = min_time;
+
+ return ret;
+}
+
+BenchmarkRunner::BenchmarkRunner(
+ const benchmark::internal::BenchmarkInstance& b_,
+ PerfCountersMeasurement* pcm_,
+ BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
+ : b(b_),
+ reports_for_family(reports_for_family_),
+ parsed_benchtime_flag(ParseBenchMinTime(FLAGS_benchmark_min_time)),
+ min_time(ComputeMinTime(b_, parsed_benchtime_flag)),
+ min_warmup_time((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0)
+ ? b.min_warmup_time()
+ : FLAGS_benchmark_min_warmup_time),
+ warmup_done(!(min_warmup_time > 0.0)),
+ repeats(b.repetitions() != 0 ? b.repetitions()
: FLAGS_benchmark_repetitions),
- has_explicit_iteration_count(b.iterations != 0),
- pool(b.threads - 1),
- iters(has_explicit_iteration_count ? b.iterations : 1) {
+ has_explicit_iteration_count(b.iterations() != 0 ||
+ parsed_benchtime_flag.tag ==
+ BenchTimeType::ITERS),
+ pool(b.threads() - 1),
+ iters(has_explicit_iteration_count
+ ? ComputeIters(b_, parsed_benchtime_flag)
+ : 1),
+ perf_counters_measurement_ptr(pcm_) {
+ run_results.display_report_aggregates_only =
+ (FLAGS_benchmark_report_aggregates_only ||
+ FLAGS_benchmark_display_aggregates_only);
+ run_results.file_report_aggregates_only =
+ FLAGS_benchmark_report_aggregates_only;
+ if (b.aggregation_report_mode() != internal::ARM_Unspecified) {
run_results.display_report_aggregates_only =
- (FLAGS_benchmark_report_aggregates_only ||
- FLAGS_benchmark_display_aggregates_only);
+ (b.aggregation_report_mode() &
+ internal::ARM_DisplayReportAggregatesOnly);
run_results.file_report_aggregates_only =
- FLAGS_benchmark_report_aggregates_only;
- if (b.aggregation_report_mode != internal::ARM_Unspecified) {
- run_results.display_report_aggregates_only =
- (b.aggregation_report_mode &
- internal::ARM_DisplayReportAggregatesOnly);
- run_results.file_report_aggregates_only =
- (b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly);
- }
+ (b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
+ BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
+ (perf_counters_measurement_ptr->num_counters() == 0))
+ << "Perf counters were requested but could not be set up.";
+ }
+}
- for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
- DoOneRepetition(repetition_num);
- }
+BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
+ BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";
- // Calculate additional statistics
- run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
+ std::unique_ptr<internal::ThreadManager> manager;
+ manager.reset(new internal::ThreadManager(b.threads()));
- // Maybe calculate complexity report
- if ((b.complexity != oNone) && b.last_benchmark_instance) {
- auto additional_run_stats = ComputeBigO(complexity_reports);
- run_results.aggregates_only.insert(run_results.aggregates_only.end(),
- additional_run_stats.begin(),
- additional_run_stats.end());
- complexity_reports.clear();
- }
+ // Run all but one thread in separate threads
+ for (std::size_t ti = 0; ti < pool.size(); ++ti) {
+ pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
+ manager.get(), perf_counters_measurement_ptr);
}
+ // And run one thread here directly.
+ // (If we were asked to run just one thread, we don't create new threads.)
+ // Yes, we need to do this here *after* we start the separate threads.
+ RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr);
- RunResults&& get_results() { return std::move(run_results); }
+ // The main thread has finished. Now let's wait for the other threads.
+ manager->WaitForAllThreads();
+ for (std::thread& thread : pool) thread.join();
- private:
- RunResults run_results;
+ IterationResults i;
+ // Acquire the measurements/counters from the manager, UNDER THE LOCK!
+ {
+ MutexLock l(manager->GetBenchmarkMutex());
+ i.results = manager->results;
+ }
- const benchmark::internal::BenchmarkInstance& b;
- std::vector<BenchmarkReporter::Run>& complexity_reports;
+ // And get rid of the manager.
+ manager.reset();
- const double min_time;
- const int repeats;
- const bool has_explicit_iteration_count;
+ // Adjust real/manual time stats since they were reported per thread.
+ i.results.real_time_used /= b.threads();
+ i.results.manual_time_used /= b.threads();
+ // If we were measuring whole-process CPU usage, adjust the CPU time too.
+ if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();
- std::vector<std::thread> pool;
+ BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
+ << i.results.real_time_used << "\n";
- IterationCount iters; // preserved between repetitions!
- // So only the first repetition has to find/calculate it,
- // the other repetitions will just use that precomputed iteration count.
+ // By using KeepRunningBatch a benchmark can iterate more times than
+ // requested, so take the iteration count from i.results.
+ i.iters = i.results.iterations / b.threads();
- struct IterationResults {
- internal::ThreadManager::Result results;
- IterationCount iters;
- double seconds;
- };
- IterationResults DoNIterations() {
- VLOG(2) << "Running " << b.name.str() << " for " << iters << "\n";
+ // Base decisions off of real time if requested by this benchmark.
+ i.seconds = i.results.cpu_time_used;
+ if (b.use_manual_time()) {
+ i.seconds = i.results.manual_time_used;
+ } else if (b.use_real_time()) {
+ i.seconds = i.results.real_time_used;
+ }
- std::unique_ptr<internal::ThreadManager> manager;
- manager.reset(new internal::ThreadManager(b.threads));
+ return i;
+}
- // Run all but one thread in separate threads
- for (std::size_t ti = 0; ti < pool.size(); ++ti) {
- pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
- manager.get());
- }
- // And run one thread here directly.
- // (If we were asked to run just one thread, we don't create new threads.)
- // Yes, we need to do this here *after* we start the separate threads.
- RunInThread(&b, iters, 0, manager.get());
+IterationCount BenchmarkRunner::PredictNumItersNeeded(
+ const IterationResults& i) const {
+ // See by how much the iteration count should be increased.
+ // Note: Avoid division by zero with max(seconds, 1ns).
+ double multiplier = GetMinTimeToApply() * 1.4 / std::max(i.seconds, 1e-9);
+ // If our last run was at least 10% of the applicable minimum time then we
+ // use the multiplier directly.
+ // Otherwise we use at most 10 times expansion.
+ // NOTE: When the last run was at least 10% of the min time the max
+ // expansion should be 14x.
+ const bool is_significant = (i.seconds / GetMinTimeToApply()) > 0.1;
+ multiplier = is_significant ? multiplier : 10.0;
+
+ // So what seems to be the sufficiently-large iteration count? Round up.
+ const IterationCount max_next_iters = static_cast<IterationCount>(
+ std::lround(std::max(multiplier * static_cast<double>(i.iters),
+ static_cast<double>(i.iters) + 1.0)));
+ // But we do have *some* limits though..
+ const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
+
+ BM_VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
+ return next_iters; // round up before conversion to integer.
+}
- // The main thread has finished. Now let's wait for the other threads.
- manager->WaitForAllThreads();
- for (std::thread& thread : pool) thread.join();
+bool BenchmarkRunner::ShouldReportIterationResults(
+ const IterationResults& i) const {
+ // Determine if this run should be reported:
+ // either it has run for a sufficient amount of time,
+ // or the run was skipped (e.g. because an error was reported).
+ return i.results.skipped_ ||
+ i.iters >= kMaxIterations || // Too many iterations already.
+ i.seconds >=
+ GetMinTimeToApply() || // The elapsed time is large enough.
+ // CPU time is specified but the elapsed real time greatly exceeds
+ // the minimum time.
+ // Note that user-provided timers are exempt from this test.
+ ((i.results.real_time_used >= 5 * GetMinTimeToApply()) &&
+ !b.use_manual_time());
+}
- IterationResults i;
- // Acquire the measurements/counters from the manager, UNDER THE LOCK!
- {
- MutexLock l(manager->GetBenchmarkMutex());
- i.results = manager->results;
- }
+double BenchmarkRunner::GetMinTimeToApply() const {
+ // In order to reuse the functionality that runs and measures benchmarks for the
+ // warmup phase as well, we need a way of telling whether to apply min_time or
+ // min_warmup_time. This function figures out whether we are still in the warmup
+ // phase (and therefore min_warmup_time applies) or already in the benchmarking
+ // phase (and min_time applies).
+ return warmup_done ? min_time : min_warmup_time;
+}
- // And get rid of the manager.
- manager.reset();
+void BenchmarkRunner::FinishWarmUp(const IterationCount& i) {
+ warmup_done = true;
+ iters = i;
+}
- // Adjust real/manual time stats since they were reported per thread.
- i.results.real_time_used /= b.threads;
- i.results.manual_time_used /= b.threads;
- // If we were measuring whole-process CPU usage, adjust the CPU time too.
- if (b.measure_process_cpu_time) i.results.cpu_time_used /= b.threads;
-
- VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
- << i.results.real_time_used << "\n";
-
- // So for how long were we running?
- i.iters = iters;
- // Base decisions off of real time if requested by this benchmark.
- i.seconds = i.results.cpu_time_used;
- if (b.use_manual_time) {
- i.seconds = i.results.manual_time_used;
- } else if (b.use_real_time) {
- i.seconds = i.results.real_time_used;
+void BenchmarkRunner::RunWarmUp() {
+ // Use the same mechanisms for warming up the benchmark as used for actually
+ // running and measuring the benchmark.
+ IterationResults i_warmup;
+ // Don't use the iteration count determined in the warmup phase for the actual
+ // measured benchmark phase. While it may be a good starting point for the
+ // benchmark, and would avoid having to figure out how many iterations are
+ // needed again when min_time is set, it may also be a completely wrong guess,
+ // since the warmup loops might be considerably slower (e.g. because of caching
+ // effects).
+ const IterationCount i_backup = iters;
+
+ for (;;) {
+ b.Setup();
+ i_warmup = DoNIterations();
+ b.Teardown();
+
+ const bool finish = ShouldReportIterationResults(i_warmup);
+
+ if (finish) {
+ FinishWarmUp(i_backup);
+ break;
}
- return i;
+ // Although we are running "only" a warmup phase, where running enough
+ // iterations at once without measuring time isn't as important as it is for
+ // the benchmarking phase, we still do it the same way; otherwise it would be
+ // very confusing for the user to choose a proper value for min_warmup_time
+ // if a different approach to running it were used.
+ iters = PredictNumItersNeeded(i_warmup);
+ assert(iters > i_warmup.iters &&
+ "if we did more iterations than we want to do the next time, "
+ "then we should have accepted the current iteration run.");
}
+}
- IterationCount PredictNumItersNeeded(const IterationResults& i) const {
- // See how much iterations should be increased by.
- // Note: Avoid division by zero with max(seconds, 1ns).
- double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
- // If our last run was at least 10% of FLAGS_benchmark_min_time then we
- // use the multiplier directly.
- // Otherwise we use at most 10 times expansion.
- // NOTE: When the last run was at least 10% of the min time the max
- // expansion should be 14x.
- bool is_significant = (i.seconds / min_time) > 0.1;
- multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
- if (multiplier <= 1.0) multiplier = 2.0;
-
- // So what seems to be the sufficiently-large iteration count? Round up.
- const IterationCount max_next_iters = static_cast<IterationCount>(
- std::lround(std::max(multiplier * static_cast<double>(i.iters),
- static_cast<double>(i.iters) + 1.0)));
- // But we do have *some* sanity limits though..
- const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
-
- VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
- return next_iters; // round up before conversion to integer.
+void BenchmarkRunner::DoOneRepetition() {
+ assert(HasRepeatsRemaining() && "Already done all repetitions?");
+
+ const bool is_the_first_repetition = num_repetitions_done == 0;
+
+ // In case a warmup phase is requested by the benchmark, run it now.
+ // After running the warmup phase the BenchmarkRunner should be in a state as
+ // if this warmup had never happened, except that warmup_done is set. Every
+ // other manipulation of the BenchmarkRunner instance would be a bug! Please
+ // fix it.
+ if (!warmup_done) RunWarmUp();
+
+ IterationResults i;
+ // We *may* be gradually increasing the length (iteration count)
+ // of the benchmark until we decide the results are significant.
+ // And once we do, we report those last results and exit.
+ // Please do note that if there are repetitions, the iteration count
+ // is *only* calculated for the *first* repetition, and other repetitions
+ // simply use that precomputed iteration count.
+ for (;;) {
+ b.Setup();
+ i = DoNIterations();
+ b.Teardown();
+
+ // Do we consider the results to be significant?
+ // If we are doing repetitions, and the first repetition was already done,
+ // it has already calculated the correct iteration count, so we have run that
+ // very iteration count just now. No need to calculate anything. Just report.
+ // Else, the normal rules apply.
+ const bool results_are_significant = !is_the_first_repetition ||
+ has_explicit_iteration_count ||
+ ShouldReportIterationResults(i);
+
+ if (results_are_significant) break; // Good, let's report them!
+
+ // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
+ // iteration count, and run the benchmark again...
+
+ iters = PredictNumItersNeeded(i);
+ assert(iters > i.iters &&
+ "if we did more iterations than we want to do the next time, "
+ "then we should have accepted the current iteration run.");
}
- bool ShouldReportIterationResults(const IterationResults& i) const {
- // Determine if this run should be reported;
- // Either it has run for a sufficient amount of time
- // or because an error was reported.
- return i.results.has_error_ ||
- i.iters >= kMaxIterations || // Too many iterations already.
- i.seconds >= min_time || // The elapsed time is large enough.
- // CPU time is specified but the elapsed real time greatly exceeds
- // the minimum time.
- // Note that user provided timers are except from this sanity check.
- ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time);
+ // Oh, one last thing, we need to also produce the 'memory measurements'..
+ MemoryManager::Result* memory_result = nullptr;
+ IterationCount memory_iterations = 0;
+ if (memory_manager != nullptr) {
+ // TODO(vyng): Consider making BenchmarkReporter::Run::memory_result an
+ // optional so we don't have to own the Result here.
+ // Can't do it now due to cxx03.
+ memory_results.push_back(MemoryManager::Result());
+ memory_result = &memory_results.back();
+ // Only run a few iterations to reduce the impact of one-time
+ // allocations in benchmarks that are not properly managed.
+ memory_iterations = std::min<IterationCount>(16, iters);
+ memory_manager->Start();
+ std::unique_ptr<internal::ThreadManager> manager;
+ manager.reset(new internal::ThreadManager(1));
+ b.Setup();
+ RunInThread(&b, memory_iterations, 0, manager.get(),
+ perf_counters_measurement_ptr);
+ manager->WaitForAllThreads();
+ manager.reset();
+ b.Teardown();
+ memory_manager->Stop(*memory_result);
}
- void DoOneRepetition(int64_t repetition_index) {
- const bool is_the_first_repetition = repetition_index == 0;
- IterationResults i;
-
- // We *may* be gradually increasing the length (iteration count)
- // of the benchmark until we decide the results are significant.
- // And once we do, we report those last results and exit.
- // Please do note that the if there are repetitions, the iteration count
- // is *only* calculated for the *first* repetition, and other repetitions
- // simply use that precomputed iteration count.
- for (;;) {
- i = DoNIterations();
-
- // Do we consider the results to be significant?
- // If we are doing repetitions, and the first repetition was already done,
- // it has calculated the correct iteration time, so we have run that very
- // iteration count just now. No need to calculate anything. Just report.
- // Else, the normal rules apply.
- const bool results_are_significant = !is_the_first_repetition ||
- has_explicit_iteration_count ||
- ShouldReportIterationResults(i);
-
- if (results_are_significant) break; // Good, let's report them!
-
- // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
- // iteration count, and run the benchmark again...
-
- iters = PredictNumItersNeeded(i);
- assert(iters > i.iters &&
- "if we did more iterations than we want to do the next time, "
- "then we should have accepted the current iteration run.");
- }
+ // Ok, now actually report.
+ BenchmarkReporter::Run report =
+ CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
+ num_repetitions_done, repeats);
- // Oh, one last thing, we need to also produce the 'memory measurements'..
- MemoryManager::Result memory_result;
- IterationCount memory_iterations = 0;
- if (memory_manager != nullptr) {
- // Only run a few iterations to reduce the impact of one-time
- // allocations in benchmarks that are not properly managed.
- memory_iterations = std::min<IterationCount>(16, iters);
- memory_manager->Start();
- std::unique_ptr<internal::ThreadManager> manager;
- manager.reset(new internal::ThreadManager(1));
- RunInThread(&b, memory_iterations, 0, manager.get());
- manager->WaitForAllThreads();
- manager.reset();
-
- memory_manager->Stop(&memory_result);
- }
+ if (reports_for_family) {
+ ++reports_for_family->num_runs_done;
+ if (!report.skipped) reports_for_family->Runs.push_back(report);
+ }
- // Ok, now actualy report.
- BenchmarkReporter::Run report =
- CreateRunReport(b, i.results, memory_iterations, memory_result,
- i.seconds, repetition_index);
+ run_results.non_aggregates.push_back(report);
- if (!report.error_occurred && b.complexity != oNone)
- complexity_reports.push_back(report);
+ ++num_repetitions_done;
+}
- run_results.non_aggregates.push_back(report);
- }
-};
+RunResults&& BenchmarkRunner::GetResults() {
+ assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");
-} // end namespace
+ // Calculate additional statistics over the repetitions of this instance.
+ run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
-RunResults RunBenchmark(
- const benchmark::internal::BenchmarkInstance& b,
- std::vector<BenchmarkReporter::Run>* complexity_reports) {
- internal::BenchmarkRunner r(b, complexity_reports);
- return r.get_results();
+ return std::move(run_results);
}
} // end namespace internal
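
The runner changes above make --benchmark_min_time accept either a duration or an explicit per-repetition iteration count, and add an optional warmup phase driven by --benchmark_min_warmup_time / MinWarmUpTime. The following is a hedged sketch of what the flag forms mean to ParseBenchMinTime; the command lines in the comments are examples, not output of this build.

#include <benchmark/benchmark.h>

static void BM_Spin(benchmark::State& state) {
  for (auto _ : state) {
    benchmark::DoNotOptimize(state.iterations());
  }
}
BENCHMARK(BM_Spin);

int main(int argc, char** argv) {
  // Illustrative invocations:
  //   ./bench --benchmark_min_time=2.5s     -> TIME: run until ~2.5s of benchmark time
  //   ./bench --benchmark_min_time=100x     -> ITERS: run a fixed 100 iterations per repetition
  //   ./bench --benchmark_min_time=2.5      -> accepted, but a BM_VLOG warning asks for a suffix
  //   ./bench --benchmark_min_warmup_time=1 -> spend about 1 second warming up before measuring
  benchmark::Initialize(&argc, argv);
  benchmark::RunSpecifiedBenchmarks();
  benchmark::Shutdown();
  return 0;
}
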
diff --git a/src/benchmark_runner.h b/src/benchmark_runner.h
index 96e8282..db2fa04 100644
--- a/src/benchmark_runner.h
+++ b/src/benchmark_runner.h
@@ -15,19 +15,23 @@
#ifndef BENCHMARK_RUNNER_H_
#define BENCHMARK_RUNNER_H_
+#include <thread>
+#include <vector>
+
#include "benchmark_api_internal.h"
#include "internal_macros.h"
-
-DECLARE_double(benchmark_min_time);
-
-DECLARE_int32(benchmark_repetitions);
-
-DECLARE_bool(benchmark_report_aggregates_only);
-
-DECLARE_bool(benchmark_display_aggregates_only);
+#include "perf_counters.h"
+#include "thread_manager.h"
namespace benchmark {
+BM_DECLARE_string(benchmark_min_time);
+BM_DECLARE_double(benchmark_min_warmup_time);
+BM_DECLARE_int32(benchmark_repetitions);
+BM_DECLARE_bool(benchmark_report_aggregates_only);
+BM_DECLARE_bool(benchmark_display_aggregates_only);
+BM_DECLARE_string(benchmark_perf_counters);
+
namespace internal {
extern MemoryManager* memory_manager;
@@ -40,9 +44,85 @@ struct RunResults {
bool file_report_aggregates_only = false;
};
-RunResults RunBenchmark(
- const benchmark::internal::BenchmarkInstance& b,
- std::vector<BenchmarkReporter::Run>* complexity_reports);
+struct BENCHMARK_EXPORT BenchTimeType {
+ enum { ITERS, TIME } tag;
+ union {
+ IterationCount iters;
+ double time;
+ };
+};
+
+BENCHMARK_EXPORT
+BenchTimeType ParseBenchMinTime(const std::string& value);
+
+class BenchmarkRunner {
+ public:
+ BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
+ benchmark::internal::PerfCountersMeasurement* pmc_,
+ BenchmarkReporter::PerFamilyRunReports* reports_for_family);
+
+ int GetNumRepeats() const { return repeats; }
+
+ bool HasRepeatsRemaining() const {
+ return GetNumRepeats() != num_repetitions_done;
+ }
+
+ void DoOneRepetition();
+
+ RunResults&& GetResults();
+
+ BenchmarkReporter::PerFamilyRunReports* GetReportsForFamily() const {
+ return reports_for_family;
+ }
+
+ double GetMinTime() const { return min_time; }
+
+ bool HasExplicitIters() const { return has_explicit_iteration_count; }
+
+ IterationCount GetIters() const { return iters; }
+
+ private:
+ RunResults run_results;
+
+ const benchmark::internal::BenchmarkInstance& b;
+ BenchmarkReporter::PerFamilyRunReports* reports_for_family;
+
+ BenchTimeType parsed_benchtime_flag;
+ const double min_time;
+ const double min_warmup_time;
+ bool warmup_done;
+ const int repeats;
+ const bool has_explicit_iteration_count;
+
+ int num_repetitions_done = 0;
+
+ std::vector<std::thread> pool;
+
+ std::vector<MemoryManager::Result> memory_results;
+
+ IterationCount iters; // preserved between repetitions!
+ // So only the first repetition has to find/calculate it,
+ // the other repetitions will just use that precomputed iteration count.
+
+ PerfCountersMeasurement* const perf_counters_measurement_ptr = nullptr;
+
+ struct IterationResults {
+ internal::ThreadManager::Result results;
+ IterationCount iters;
+ double seconds;
+ };
+ IterationResults DoNIterations();
+
+ IterationCount PredictNumItersNeeded(const IterationResults& i) const;
+
+ bool ShouldReportIterationResults(const IterationResults& i) const;
+
+ double GetMinTimeToApply() const;
+
+ void FinishWarmUp(const IterationCount& i);
+
+ void RunWarmUp();
+};
} // namespace internal
diff --git a/src/check.cc b/src/check.cc
new file mode 100644
index 0000000..5f7526e
--- /dev/null
+++ b/src/check.cc
@@ -0,0 +1,11 @@
+#include "check.h"
+
+namespace benchmark {
+namespace internal {
+
+static AbortHandlerT* handler = &std::abort;
+
+BENCHMARK_EXPORT AbortHandlerT*& GetAbortHandler() { return handler; }
+
+} // namespace internal
+} // namespace benchmark
diff --git a/src/check.h b/src/check.h
index f5f8253..c1cd5e8 100644
--- a/src/check.h
+++ b/src/check.h
@@ -5,26 +5,43 @@
#include <cstdlib>
#include <ostream>
+#include "benchmark/export.h"
#include "internal_macros.h"
#include "log.h"
+#if defined(__GNUC__) || defined(__clang__)
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#elif defined(_MSC_VER) && !defined(__clang__)
+#if _MSC_VER >= 1900
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#else
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+#define __func__ __FUNCTION__
+#else
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+
namespace benchmark {
namespace internal {
typedef void(AbortHandlerT)();
-inline AbortHandlerT*& GetAbortHandler() {
- static AbortHandlerT* handler = &std::abort;
- return handler;
-}
+BENCHMARK_EXPORT
+AbortHandlerT*& GetAbortHandler();
BENCHMARK_NORETURN inline void CallAbortHandler() {
GetAbortHandler()();
std::abort(); // fallback to enforce noreturn
}
-// CheckHandler is the class constructed by failing CHECK macros. CheckHandler
-// will log information about the failures and abort when it is destructed.
+// CheckHandler is the class constructed by failing BM_CHECK macros.
+// CheckHandler will log information about the failures and abort when it is
+// destructed.
class CheckHandler {
public:
CheckHandler(const char* check, const char* file, const char* func, int line)
@@ -35,10 +52,17 @@ class CheckHandler {
LogType& GetLog() { return log_; }
+#if defined(COMPILER_MSVC)
+#pragma warning(push)
+#pragma warning(disable : 4722)
+#endif
BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
log_ << std::endl;
CallAbortHandler();
}
+#if defined(COMPILER_MSVC)
+#pragma warning(pop)
+#endif
CheckHandler& operator=(const CheckHandler&) = delete;
CheckHandler(const CheckHandler&) = delete;
@@ -51,32 +75,32 @@ class CheckHandler {
} // end namespace internal
} // end namespace benchmark
-// The CHECK macro returns a std::ostream object that can have extra information
-// written to it.
+// The BM_CHECK macro returns a std::ostream object that can have extra
+// information written to it.
#ifndef NDEBUG
-#define CHECK(b) \
+#define BM_CHECK(b) \
(b ? ::benchmark::internal::GetNullLogInstance() \
: ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \
.GetLog())
#else
-#define CHECK(b) ::benchmark::internal::GetNullLogInstance()
+#define BM_CHECK(b) ::benchmark::internal::GetNullLogInstance()
#endif
// clang-format off
// preserve whitespacing between operators for alignment
-#define CHECK_EQ(a, b) CHECK((a) == (b))
-#define CHECK_NE(a, b) CHECK((a) != (b))
-#define CHECK_GE(a, b) CHECK((a) >= (b))
-#define CHECK_LE(a, b) CHECK((a) <= (b))
-#define CHECK_GT(a, b) CHECK((a) > (b))
-#define CHECK_LT(a, b) CHECK((a) < (b))
-
-#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) < (eps))
-#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps))
-#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps))
-#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps))
-#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) > (eps))
-#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) > (eps))
+#define BM_CHECK_EQ(a, b) BM_CHECK((a) == (b))
+#define BM_CHECK_NE(a, b) BM_CHECK((a) != (b))
+#define BM_CHECK_GE(a, b) BM_CHECK((a) >= (b))
+#define BM_CHECK_LE(a, b) BM_CHECK((a) <= (b))
+#define BM_CHECK_GT(a, b) BM_CHECK((a) > (b))
+#define BM_CHECK_LT(a, b) BM_CHECK((a) < (b))
+
+#define BM_CHECK_FLOAT_EQ(a, b, eps) BM_CHECK(std::fabs((a) - (b)) < (eps))
+#define BM_CHECK_FLOAT_NE(a, b, eps) BM_CHECK(std::fabs((a) - (b)) >= (eps))
+#define BM_CHECK_FLOAT_GE(a, b, eps) BM_CHECK((a) - (b) > -(eps))
+#define BM_CHECK_FLOAT_LE(a, b, eps) BM_CHECK((b) - (a) > -(eps))
+#define BM_CHECK_FLOAT_GT(a, b, eps) BM_CHECK((a) - (b) > (eps))
+#define BM_CHECK_FLOAT_LT(a, b, eps) BM_CHECK((b) - (a) > (eps))
//clang-format on
#endif // CHECK_H_
diff --git a/src/colorprint.cc b/src/colorprint.cc
index fff6a98..0bfd670 100644
--- a/src/colorprint.cc
+++ b/src/colorprint.cc
@@ -25,8 +25,8 @@
#include "internal_macros.h"
#ifdef BENCHMARK_OS_WINDOWS
-#include <windows.h>
#include <io.h>
+#include <windows.h>
#else
#include <unistd.h>
#endif // BENCHMARK_OS_WINDOWS
@@ -94,20 +94,20 @@ std::string FormatString(const char* msg, va_list args) {
va_end(args_cp);
// currently there is no error handling for failure, so this is a hack.
- CHECK(ret >= 0);
+ BM_CHECK(ret >= 0);
- if (ret == 0) // handle empty expansion
+ if (ret == 0) { // handle empty expansion
return {};
- else if (static_cast<size_t>(ret) < size)
+ }
+ if (static_cast<size_t>(ret) < size) {
return local_buff;
- else {
- // we did not provide a long enough buffer on our first attempt.
- size = (size_t)ret + 1; // + 1 for the null byte
- std::unique_ptr<char[]> buff(new char[size]);
- ret = vsnprintf(buff.get(), size, msg, args);
- CHECK(ret > 0 && ((size_t)ret) < size);
- return buff.get();
}
+ // we did not provide a long enough buffer on our first attempt.
+ size = static_cast<size_t>(ret) + 1; // + 1 for the null byte
+ std::unique_ptr<char[]> buff(new char[size]);
+ ret = vsnprintf(buff.get(), size, msg, args);
+ BM_CHECK(ret > 0 && (static_cast<size_t>(ret)) < size);
+ return buff.get();
}
std::string FormatString(const char* msg, ...) {
@@ -163,12 +163,24 @@ bool IsColorTerminal() {
#else
// On non-Windows platforms, we rely on the TERM variable. This list of
// supported TERM values is copied from Google Test:
- // <https://github.com/google/googletest/blob/master/googletest/src/gtest.cc#L2925>.
+ // <https://github.com/google/googletest/blob/v1.13.0/googletest/src/gtest.cc#L3225-L3259>.
const char* const SUPPORTED_TERM_VALUES[] = {
- "xterm", "xterm-color", "xterm-256color",
- "screen", "screen-256color", "tmux",
- "tmux-256color", "rxvt-unicode", "rxvt-unicode-256color",
- "linux", "cygwin",
+ "xterm",
+ "xterm-color",
+ "xterm-256color",
+ "screen",
+ "screen-256color",
+ "tmux",
+ "tmux-256color",
+ "rxvt-unicode",
+ "rxvt-unicode-256color",
+ "linux",
+ "cygwin",
+ "xterm-kitty",
+ "alacritty",
+ "foot",
+ "foot-extra",
+ "wezterm",
};
const char* const term = getenv("TERM");
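
FormatString above follows the common two-pass vsnprintf pattern: format into a stack buffer first, and only allocate when the result does not fit. For reference, a self-contained version of that pattern, independent of the library's helpers:

    #include <cstdarg>
    #include <cstddef>
    #include <cstdio>
    #include <memory>
    #include <string>

    std::string FormatSketch(const char* fmt, ...) {
      char local[256];
      va_list args;
      va_start(args, fmt);
      va_list args_copy;
      va_copy(args_copy, args);  // vsnprintf consumes a va_list, so keep a copy
      // First pass: try the stack buffer and learn the required length.
      const int needed = std::vsnprintf(local, sizeof(local), fmt, args);
      va_end(args);
      std::string out;
      if (needed >= 0 && static_cast<std::size_t>(needed) < sizeof(local)) {
        out = local;
      } else if (needed >= 0) {
        // Second pass: allocate exactly needed + 1 bytes and format again.
        const std::size_t size = static_cast<std::size_t>(needed) + 1;
        std::unique_ptr<char[]> heap(new char[size]);
        std::vsnprintf(heap.get(), size, fmt, args_copy);
        out = heap.get();
      }  // a negative return is a formatting error; real code should report it
      va_end(args_copy);
      return out;
    }
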
diff --git a/src/commandlineflags.cc b/src/commandlineflags.cc
index 0648fe3..dcb4149 100644
--- a/src/commandlineflags.cc
+++ b/src/commandlineflags.cc
@@ -20,6 +20,10 @@
#include <cstring>
#include <iostream>
#include <limits>
+#include <map>
+#include <utility>
+
+#include "../src/string_util.h"
namespace benchmark {
namespace {
@@ -78,6 +82,30 @@ bool ParseDouble(const std::string& src_text, const char* str, double* value) {
return true;
}
+// Parses 'str' into KV pairs. If successful, writes the result to *value and
+// returns true; otherwise leaves *value unchanged and returns false.
+bool ParseKvPairs(const std::string& src_text, const char* str,
+ std::map<std::string, std::string>* value) {
+ std::map<std::string, std::string> kvs;
+ for (const auto& kvpair : StrSplit(str, ',')) {
+ const auto kv = StrSplit(kvpair, '=');
+ if (kv.size() != 2) {
+ std::cerr << src_text << " is expected to be a comma-separated list of "
+ << "<key>=<value> strings, but actually has value \"" << str
+ << "\".\n";
+ return false;
+ }
+ if (!kvs.emplace(kv[0], kv[1]).second) {
+ std::cerr << src_text << " is expected to contain unique keys but key \""
+ << kv[0] << "\" was repeated.\n";
+ return false;
+ }
+ }
+
+ *value = kvs;
+ return true;
+}
+
// Returns the name of the environment variable corresponding to the
// given flag. For example, FlagToEnvVar("foo") will return
// "BENCHMARK_FOO" in the open-source version.
@@ -93,12 +121,14 @@ static std::string FlagToEnvVar(const char* flag) {
} // namespace
+BENCHMARK_EXPORT
bool BoolFromEnv(const char* flag, bool default_val) {
const std::string env_var = FlagToEnvVar(flag);
const char* const value_str = getenv(env_var.c_str());
return value_str == nullptr ? default_val : IsTruthyFlagValue(value_str);
}
+BENCHMARK_EXPORT
int32_t Int32FromEnv(const char* flag, int32_t default_val) {
const std::string env_var = FlagToEnvVar(flag);
const char* const value_str = getenv(env_var.c_str());
@@ -111,6 +141,7 @@ int32_t Int32FromEnv(const char* flag, int32_t default_val) {
return value;
}
+BENCHMARK_EXPORT
double DoubleFromEnv(const char* flag, double default_val) {
const std::string env_var = FlagToEnvVar(flag);
const char* const value_str = getenv(env_var.c_str());
@@ -123,12 +154,28 @@ double DoubleFromEnv(const char* flag, double default_val) {
return value;
}
+BENCHMARK_EXPORT
const char* StringFromEnv(const char* flag, const char* default_val) {
const std::string env_var = FlagToEnvVar(flag);
const char* const value = getenv(env_var.c_str());
return value == nullptr ? default_val : value;
}
+BENCHMARK_EXPORT
+std::map<std::string, std::string> KvPairsFromEnv(
+ const char* flag, std::map<std::string, std::string> default_val) {
+ const std::string env_var = FlagToEnvVar(flag);
+ const char* const value_str = getenv(env_var.c_str());
+
+ if (value_str == nullptr) return default_val;
+
+ std::map<std::string, std::string> value;
+ if (!ParseKvPairs("Environment variable " + env_var, value_str, &value)) {
+ return default_val;
+ }
+ return value;
+}
+
// Parses a string as a command line flag. The string should have
// the format "--flag=value". When def_optional is true, the "=value"
// part can be omitted.
@@ -159,6 +206,7 @@ const char* ParseFlagValue(const char* str, const char* flag,
return flag_end + 1;
}
+BENCHMARK_EXPORT
bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
// Gets the value of the flag as a string.
const char* const value_str = ParseFlagValue(str, flag, true);
@@ -171,6 +219,7 @@ bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
return true;
}
+BENCHMARK_EXPORT
bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
// Gets the value of the flag as a string.
const char* const value_str = ParseFlagValue(str, flag, false);
@@ -183,6 +232,7 @@ bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
value);
}
+BENCHMARK_EXPORT
bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
// Gets the value of the flag as a string.
const char* const value_str = ParseFlagValue(str, flag, false);
@@ -195,6 +245,7 @@ bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
value);
}
+BENCHMARK_EXPORT
bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
// Gets the value of the flag as a string.
const char* const value_str = ParseFlagValue(str, flag, false);
@@ -206,23 +257,42 @@ bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
return true;
}
+BENCHMARK_EXPORT
+bool ParseKeyValueFlag(const char* str, const char* flag,
+ std::map<std::string, std::string>* value) {
+ const char* const value_str = ParseFlagValue(str, flag, false);
+
+ if (value_str == nullptr) return false;
+
+ for (const auto& kvpair : StrSplit(value_str, ',')) {
+ const auto kv = StrSplit(kvpair, '=');
+ if (kv.size() != 2) return false;
+ value->emplace(kv[0], kv[1]);
+ }
+
+ return true;
+}
+
+BENCHMARK_EXPORT
bool IsFlag(const char* str, const char* flag) {
return (ParseFlagValue(str, flag, true) != nullptr);
}
+BENCHMARK_EXPORT
bool IsTruthyFlagValue(const std::string& value) {
if (value.size() == 1) {
char v = value[0];
return isalnum(v) &&
!(v == '0' || v == 'f' || v == 'F' || v == 'n' || v == 'N');
- } else if (!value.empty()) {
+ }
+ if (!value.empty()) {
std::string value_lower(value);
std::transform(value_lower.begin(), value_lower.end(), value_lower.begin(),
[](char c) { return static_cast<char>(::tolower(c)); });
return !(value_lower == "false" || value_lower == "no" ||
value_lower == "off");
- } else
- return true;
+ }
+ return true;
}
} // end namespace benchmark
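
ParseKvPairs and ParseKeyValueFlag both split the raw flag value on ',' and then on '=', using StrSplit from src/string_util.h. The same splitting can be sketched with only the standard library (edge cases such as an empty input are handled slightly differently here than in the library):

    #include <map>
    #include <sstream>
    #include <string>

    // Split "key1=val1,key2=val2" into a map. Returns false on an entry that
    // does not contain exactly one '=' or that repeats a key, mirroring the
    // checks performed above; *out is left untouched on failure.
    bool ParseKvPairsSketch(const std::string& input,
                            std::map<std::string, std::string>* out) {
      std::map<std::string, std::string> kvs;
      std::stringstream pairs(input);
      std::string pair;
      while (std::getline(pairs, pair, ',')) {
        const std::string::size_type eq = pair.find('=');
        if (eq == std::string::npos ||
            pair.find('=', eq + 1) != std::string::npos) {
          return false;  // not exactly one '=' in this entry
        }
        if (!kvs.emplace(pair.substr(0, eq), pair.substr(eq + 1)).second) {
          return false;  // duplicate key
        }
      }
      *out = kvs;
      return true;
    }
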
diff --git a/src/commandlineflags.h b/src/commandlineflags.h
index 3a1f6a8..7882628 100644
--- a/src/commandlineflags.h
+++ b/src/commandlineflags.h
@@ -2,61 +2,80 @@
#define BENCHMARK_COMMANDLINEFLAGS_H_
#include <cstdint>
+#include <map>
#include <string>
+#include "benchmark/export.h"
+
// Macro for referencing flags.
#define FLAG(name) FLAGS_##name
// Macros for declaring flags.
-#define DECLARE_bool(name) extern bool FLAG(name)
-#define DECLARE_int32(name) extern int32_t FLAG(name)
-#define DECLARE_double(name) extern double FLAG(name)
-#define DECLARE_string(name) extern std::string FLAG(name)
+#define BM_DECLARE_bool(name) BENCHMARK_EXPORT extern bool FLAG(name)
+#define BM_DECLARE_int32(name) BENCHMARK_EXPORT extern int32_t FLAG(name)
+#define BM_DECLARE_double(name) BENCHMARK_EXPORT extern double FLAG(name)
+#define BM_DECLARE_string(name) BENCHMARK_EXPORT extern std::string FLAG(name)
+#define BM_DECLARE_kvpairs(name) \
+ BENCHMARK_EXPORT extern std::map<std::string, std::string> FLAG(name)
// Macros for defining flags.
-#define DEFINE_bool(name, default_val) \
- bool FLAG(name) = \
- benchmark::BoolFromEnv(#name, default_val)
-#define DEFINE_int32(name, default_val) \
- int32_t FLAG(name) = \
- benchmark::Int32FromEnv(#name, default_val)
-#define DEFINE_double(name, default_val) \
- double FLAG(name) = \
- benchmark::DoubleFromEnv(#name, default_val)
-#define DEFINE_string(name, default_val) \
- std::string FLAG(name) = \
- benchmark::StringFromEnv(#name, default_val)
+#define BM_DEFINE_bool(name, default_val) \
+ BENCHMARK_EXPORT bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val)
+#define BM_DEFINE_int32(name, default_val) \
+ BENCHMARK_EXPORT int32_t FLAG(name) = \
+ benchmark::Int32FromEnv(#name, default_val)
+#define BM_DEFINE_double(name, default_val) \
+ BENCHMARK_EXPORT double FLAG(name) = \
+ benchmark::DoubleFromEnv(#name, default_val)
+#define BM_DEFINE_string(name, default_val) \
+ BENCHMARK_EXPORT std::string FLAG(name) = \
+ benchmark::StringFromEnv(#name, default_val)
+#define BM_DEFINE_kvpairs(name, default_val) \
+ BENCHMARK_EXPORT std::map<std::string, std::string> FLAG(name) = \
+ benchmark::KvPairsFromEnv(#name, default_val)
namespace benchmark {
-// Parses a bool from the environment variable
-// corresponding to the given flag.
+// Parses a bool from the environment variable corresponding to the given flag.
//
// If the variable exists, returns IsTruthyFlagValue() value; if not,
// returns the given default value.
+BENCHMARK_EXPORT
bool BoolFromEnv(const char* flag, bool default_val);
-// Parses an Int32 from the environment variable
-// corresponding to the given flag.
+// Parses an Int32 from the environment variable corresponding to the given
+// flag.
//
// If the variable exists, returns ParseInt32() value; if not, returns
// the given default value.
+BENCHMARK_EXPORT
int32_t Int32FromEnv(const char* flag, int32_t default_val);
-// Parses an Double from the environment variable
-// corresponding to the given flag.
+// Parses a Double from the environment variable corresponding to the given
+// flag.
//
// If the variable exists, returns ParseDouble(); if not, returns
// the given default value.
+BENCHMARK_EXPORT
double DoubleFromEnv(const char* flag, double default_val);
-// Parses a string from the environment variable
-// corresponding to the given flag.
+// Parses a string from the environment variable corresponding to the given
+// flag.
//
// If variable exists, returns its value; if not, returns
// the given default value.
+BENCHMARK_EXPORT
const char* StringFromEnv(const char* flag, const char* default_val);
+// Parses a set of kvpairs from the environment variable corresponding to the
+// given flag.
+//
+// If variable exists, returns its value; if not, returns
+// the given default value.
+BENCHMARK_EXPORT
+std::map<std::string, std::string> KvPairsFromEnv(
+ const char* flag, std::map<std::string, std::string> default_val);
+
// Parses a string for a bool flag, in the form of either
// "--flag=value" or "--flag".
//
@@ -66,36 +85,47 @@ const char* StringFromEnv(const char* flag, const char* default_val);
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
+BENCHMARK_EXPORT
bool ParseBoolFlag(const char* str, const char* flag, bool* value);
-// Parses a string for an Int32 flag, in the form of
-// "--flag=value".
+// Parses a string for an Int32 flag, in the form of "--flag=value".
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
+BENCHMARK_EXPORT
bool ParseInt32Flag(const char* str, const char* flag, int32_t* value);
-// Parses a string for a Double flag, in the form of
-// "--flag=value".
+// Parses a string for a Double flag, in the form of "--flag=value".
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
+BENCHMARK_EXPORT
bool ParseDoubleFlag(const char* str, const char* flag, double* value);
-// Parses a string for a string flag, in the form of
-// "--flag=value".
+// Parses a string for a string flag, in the form of "--flag=value".
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
+BENCHMARK_EXPORT
bool ParseStringFlag(const char* str, const char* flag, std::string* value);
+// Parses a string for a kvpairs flag in the form "--flag=key=value,key=value"
+//
+// On success, stores the value of the flag in *value and returns true. On
+// failure returns false, though *value may have been mutated.
+BENCHMARK_EXPORT
+bool ParseKeyValueFlag(const char* str, const char* flag,
+ std::map<std::string, std::string>* value);
+
// Returns true if the string matches the flag.
+BENCHMARK_EXPORT
bool IsFlag(const char* str, const char* flag);
// Returns true unless value starts with one of: '0', 'f', 'F', 'n' or 'N', or
// some non-alphanumeric character. Also returns false if the value matches
// one of 'no', 'false', 'off' (case-insensitive). As a special case, also
// returns true if value is the empty string.
+BENCHMARK_EXPORT
bool IsTruthyFlagValue(const std::string& value);
} // end namespace benchmark
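
Each BM_DEFINE_* macro both defines the flag variable and seeds it from the matching environment variable (per FlagToEnvVar, a flag named "foo" reads BENCHMARK_FOO). As a rough sketch of what a single expansion amounts to, with BENCHMARK_EXPORT elided and "foo" used purely as a stand-in flag name:

    #include <map>
    #include <string>

    namespace benchmark {
    // Declaration as in commandlineflags.h; the definition lives in the library.
    std::map<std::string, std::string> KvPairsFromEnv(
        const char* flag, std::map<std::string, std::string> default_val);
    }  // namespace benchmark

    // BM_DEFINE_kvpairs(foo, {}) expands to approximately:
    std::map<std::string, std::string> FLAGS_foo =
        benchmark::KvPairsFromEnv("foo", {});
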
diff --git a/src/complexity.cc b/src/complexity.cc
index aeed67f..825c573 100644
--- a/src/complexity.cc
+++ b/src/complexity.cc
@@ -15,12 +15,13 @@
// Source project : https://github.com/ismaelJimenez/cpp.leastsq
// Adapted to be used with google benchmark
-#include "benchmark/benchmark.h"
+#include "complexity.h"
#include <algorithm>
#include <cmath>
+
+#include "benchmark/benchmark.h"
#include "check.h"
-#include "complexity.h"
namespace benchmark {
@@ -82,7 +83,6 @@ std::string GetBigOString(BigO complexity) {
LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
const std::vector<double>& time,
BigOFunc* fitting_curve) {
- double sigma_gn = 0.0;
double sigma_gn_squared = 0.0;
double sigma_time = 0.0;
double sigma_time_gn = 0.0;
@@ -90,7 +90,6 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
// Calculate least square fitting parameter
for (size_t i = 0; i < n.size(); ++i) {
double gn_i = fitting_curve(n[i]);
- sigma_gn += gn_i;
sigma_gn_squared += gn_i * gn_i;
sigma_time += time[i];
sigma_time_gn += time[i] * gn_i;
@@ -125,10 +124,10 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
// fitting curve.
LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
const std::vector<double>& time, const BigO complexity) {
- CHECK_EQ(n.size(), time.size());
- CHECK_GE(n.size(), 2); // Do not compute fitting curve is less than two
- // benchmark runs are given
- CHECK_NE(complexity, oNone);
+ BM_CHECK_EQ(n.size(), time.size());
+  BM_CHECK_GE(n.size(), 2);  // Do not compute fitting curve if fewer than two
+ // benchmark runs are given
+ BM_CHECK_NE(complexity, oNone);
LeastSq best_fit;
@@ -169,7 +168,8 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
// Populate the accumulators.
for (const Run& run : reports) {
- CHECK_GT(run.complexity_n, 0) << "Did you forget to call SetComplexityN?";
+ BM_CHECK_GT(run.complexity_n, 0)
+ << "Did you forget to call SetComplexityN?";
n.push_back(run.complexity_n);
real_time.push_back(run.real_accumulated_time / run.iterations);
cpu_time.push_back(run.cpu_accumulated_time / run.iterations);
@@ -193,11 +193,14 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
// Get the data from the accumulator to BenchmarkReporter::Run's.
Run big_o;
big_o.run_name = run_name;
+ big_o.family_index = reports[0].family_index;
+ big_o.per_family_instance_index = reports[0].per_family_instance_index;
big_o.run_type = BenchmarkReporter::Run::RT_Aggregate;
big_o.repetitions = reports[0].repetitions;
big_o.repetition_index = Run::no_repetition_index;
big_o.threads = reports[0].threads;
big_o.aggregate_name = "BigO";
+ big_o.aggregate_unit = StatisticUnit::kTime;
big_o.report_label = reports[0].report_label;
big_o.iterations = 0;
big_o.real_accumulated_time = result_real.coef;
@@ -215,8 +218,11 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
// Only add label to mean/stddev if it is same for all runs
Run rms;
rms.run_name = run_name;
+ rms.family_index = reports[0].family_index;
+ rms.per_family_instance_index = reports[0].per_family_instance_index;
rms.run_type = BenchmarkReporter::Run::RT_Aggregate;
rms.aggregate_name = "RMS";
+ rms.aggregate_unit = StatisticUnit::kPercentage;
rms.report_label = big_o.report_label;
rms.iterations = 0;
rms.repetition_index = Run::no_repetition_index;
diff --git a/src/complexity.h b/src/complexity.h
index df29b48..0a0679b 100644
--- a/src/complexity.h
+++ b/src/complexity.h
@@ -31,7 +31,7 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
const std::vector<BenchmarkReporter::Run>& reports);
// This data structure will contain the result returned by MinimalLeastSq
-// - coef : Estimated coeficient for the high-order term as
+// - coef : Estimated coefficient for the high-order term as
// interpolated from data.
// - rms : Normalized Root Mean Squared Error.
// - complexity : Scalability form (e.g. oN, oNLogN). In case a scalability
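
The fit performed by MinimalLeastSq is ordinary least squares with a single scale factor: coef = sum(time_i * g(n_i)) / sum(g(n_i)^2) for the chosen fitting curve g, which is why the unused sigma_gn accumulator could be dropped above. A standalone sketch of that computation (the library additionally normalizes the RMS by the mean time, omitted here):

    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Least-squares scale factor for time ~ coef * g(n), plus the plain root
    // mean squared residual of the fit.
    double FitCoefSketch(const std::vector<int64_t>& n,
                         const std::vector<double>& time,
                         double (*g)(int64_t), double* rms_out) {
      double sigma_gn_squared = 0.0;
      double sigma_time_gn = 0.0;
      for (std::size_t i = 0; i < n.size(); ++i) {
        const double gn = g(n[i]);
        sigma_gn_squared += gn * gn;
        sigma_time_gn += time[i] * gn;
      }
      const double coef = sigma_time_gn / sigma_gn_squared;
      double ss_res = 0.0;
      for (std::size_t i = 0; i < n.size(); ++i) {
        const double residual = time[i] - coef * g(n[i]);
        ss_res += residual * residual;
      }
      *rms_out = std::sqrt(ss_res / static_cast<double>(n.size()));
      return coef;
    }
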
diff --git a/src/console_reporter.cc b/src/console_reporter.cc
index 6fd7645..10e05e1 100644
--- a/src/console_reporter.cc
+++ b/src/console_reporter.cc
@@ -33,6 +33,7 @@
namespace benchmark {
+BENCHMARK_EXPORT
bool ConsoleReporter::ReportContext(const Context& context) {
name_field_width_ = context.name_field_width;
printed_header_ = false;
@@ -45,19 +46,21 @@ bool ConsoleReporter::ReportContext(const Context& context) {
GetErrorStream()
<< "Color printing is only supported for stdout on windows."
" Disabling color printing\n";
- output_options_ = static_cast< OutputOptions >(output_options_ & ~OO_Color);
+ output_options_ = static_cast<OutputOptions>(output_options_ & ~OO_Color);
}
#endif
return true;
}
+BENCHMARK_EXPORT
void ConsoleReporter::PrintHeader(const Run& run) {
- std::string str = FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
- "Benchmark", "Time", "CPU", "Iterations");
- if(!run.counters.empty()) {
- if(output_options_ & OO_Tabular) {
- for(auto const& c : run.counters) {
+ std::string str =
+ FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
+ "Benchmark", "Time", "CPU", "Iterations");
+ if (!run.counters.empty()) {
+ if (output_options_ & OO_Tabular) {
+ for (auto const& c : run.counters) {
str += FormatString(" %10s", c.first.c_str());
}
} else {
@@ -68,6 +71,7 @@ void ConsoleReporter::PrintHeader(const Run& run) {
GetOutputStream() << line << "\n" << str << "\n" << line << "\n";
}
+BENCHMARK_EXPORT
void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
for (const auto& run : reports) {
// print the header:
@@ -97,8 +101,10 @@ static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt,
va_end(args);
}
-
static std::string FormatTime(double time) {
+ // For the time columns of the console printer 13 digits are reserved. One of
+  // them is a space and at most two of them are the time unit (e.g. ns). That puts
+ // us at 10 digits usable for the number.
// Align decimal places...
if (time < 1.0) {
return FormatString("%10.3f", time);
@@ -109,22 +115,33 @@ static std::string FormatTime(double time) {
if (time < 100.0) {
return FormatString("%10.1f", time);
}
+ // Assuming the time is at max 9.9999e+99 and we have 10 digits for the
+ // number, we get 10-1(.)-1(e)-1(sign)-2(exponent) = 5 digits to print.
+ if (time > 9999999999 /*max 10 digit number*/) {
+ return FormatString("%1.4e", time);
+ }
return FormatString("%10.0f", time);
}
+BENCHMARK_EXPORT
void ConsoleReporter::PrintRunData(const Run& result) {
typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
auto& Out = GetOutputStream();
- PrinterFn* printer = (output_options_ & OO_Color) ?
- (PrinterFn*)ColorPrintf : IgnoreColorPrint;
+ PrinterFn* printer = (output_options_ & OO_Color)
+ ? static_cast<PrinterFn*>(ColorPrintf)
+ : IgnoreColorPrint;
auto name_color =
(result.report_big_o || result.report_rms) ? COLOR_BLUE : COLOR_GREEN;
printer(Out, name_color, "%-*s ", name_field_width_,
result.benchmark_name().c_str());
- if (result.error_occurred) {
+ if (internal::SkippedWithError == result.skipped) {
printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
- result.error_message.c_str());
+ result.skip_message.c_str());
+ printer(Out, COLOR_DEFAULT, "\n");
+ return;
+ } else if (internal::SkippedWithMessage == result.skipped) {
+ printer(Out, COLOR_WHITE, "SKIPPED: \'%s\'", result.skip_message.c_str());
printer(Out, COLOR_DEFAULT, "\n");
return;
}
@@ -134,18 +151,23 @@ void ConsoleReporter::PrintRunData(const Run& result) {
const std::string real_time_str = FormatTime(real_time);
const std::string cpu_time_str = FormatTime(cpu_time);
-
if (result.report_big_o) {
std::string big_o = GetBigOString(result.complexity);
- printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time, big_o.c_str(),
- cpu_time, big_o.c_str());
+ printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time,
+ big_o.c_str(), cpu_time, big_o.c_str());
} else if (result.report_rms) {
printer(Out, COLOR_YELLOW, "%10.0f %-4s %10.0f %-4s ", real_time * 100, "%",
cpu_time * 100, "%");
- } else {
+ } else if (result.run_type != Run::RT_Aggregate ||
+ result.aggregate_unit == StatisticUnit::kTime) {
const char* timeLabel = GetTimeUnitString(result.time_unit);
- printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(), timeLabel,
- cpu_time_str.c_str(), timeLabel);
+ printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(),
+ timeLabel, cpu_time_str.c_str(), timeLabel);
+ } else {
+ assert(result.aggregate_unit == StatisticUnit::kPercentage);
+ printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ",
+ (100. * result.real_accumulated_time), "%",
+ (100. * result.cpu_accumulated_time), "%");
}
if (!result.report_big_o && !result.report_rms) {
@@ -153,12 +175,19 @@ void ConsoleReporter::PrintRunData(const Run& result) {
}
for (auto& c : result.counters) {
- const std::size_t cNameLen = std::max(std::string::size_type(10),
- c.first.length());
- auto const& s = HumanReadableNumber(c.second.value, c.second.oneK);
+ const std::size_t cNameLen =
+ std::max(std::string::size_type(10), c.first.length());
+ std::string s;
const char* unit = "";
- if (c.second.flags & Counter::kIsRate)
- unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
+ if (result.run_type == Run::RT_Aggregate &&
+ result.aggregate_unit == StatisticUnit::kPercentage) {
+ s = StrFormat("%.2f", 100. * c.second.value);
+ unit = "%";
+ } else {
+ s = HumanReadableNumber(c.second.value, c.second.oneK);
+ if (c.second.flags & Counter::kIsRate)
+ unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
+ }
if (output_options_ & OO_Tabular) {
printer(Out, COLOR_DEFAULT, " %*s%s", cNameLen - strlen(unit), s.c_str(),
unit);
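
The new FormatTime branch keeps the time columns at a fixed width: values that would need more than 10 integer digits switch to scientific notation instead of widening the column. A compact sketch of that rule (the library also uses finer-grained decimal buckets for values below 100, elided here):

    #include <cstdio>
    #include <string>

    std::string FormatTimeSketch(double t) {
      char buf[32];
      if (t < 100.0) {
        std::snprintf(buf, sizeof(buf), "%10.1f", t);  // keep some decimals
      } else if (t > 9999999999.0) {  // would not fit in 10 integer digits
        std::snprintf(buf, sizeof(buf), "%1.4e", t);
      } else {
        std::snprintf(buf, sizeof(buf), "%10.0f", t);
      }
      return buf;
    }
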
diff --git a/src/csv_reporter.cc b/src/csv_reporter.cc
index af2c18f..7b56da1 100644
--- a/src/csv_reporter.cc
+++ b/src/csv_reporter.cc
@@ -12,9 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "benchmark/benchmark.h"
-#include "complexity.h"
-
#include <algorithm>
#include <cstdint>
#include <iostream>
@@ -22,7 +19,9 @@
#include <tuple>
#include <vector>
+#include "benchmark/benchmark.h"
#include "check.h"
+#include "complexity.h"
#include "string_util.h"
#include "timers.h"
@@ -37,23 +36,29 @@ std::vector<std::string> elements = {
"error_occurred", "error_message"};
} // namespace
-std::string CsvEscape(const std::string & s) {
+std::string CsvEscape(const std::string& s) {
std::string tmp;
tmp.reserve(s.size() + 2);
for (char c : s) {
switch (c) {
- case '"' : tmp += "\"\""; break;
- default : tmp += c; break;
+ case '"':
+ tmp += "\"\"";
+ break;
+ default:
+ tmp += c;
+ break;
}
}
return '"' + tmp + '"';
}
+BENCHMARK_EXPORT
bool CSVReporter::ReportContext(const Context& context) {
PrintBasicContext(&GetErrorStream(), context);
return true;
}
+BENCHMARK_EXPORT
void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
std::ostream& Out = GetOutputStream();
@@ -85,7 +90,8 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
for (const auto& cnt : run.counters) {
if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
continue;
- CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end())
+ BM_CHECK(user_counter_names_.find(cnt.first) !=
+ user_counter_names_.end())
<< "All counters must be present in each run. "
<< "Counter named \"" << cnt.first
<< "\" was not in a run after being added to the header";
@@ -99,13 +105,14 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
}
}
+BENCHMARK_EXPORT
void CSVReporter::PrintRunData(const Run& run) {
std::ostream& Out = GetOutputStream();
Out << CsvEscape(run.benchmark_name()) << ",";
- if (run.error_occurred) {
+ if (run.skipped) {
Out << std::string(elements.size() - 3, ',');
- Out << "true,";
- Out << CsvEscape(run.error_message) << "\n";
+ Out << std::boolalpha << (internal::SkippedWithError == run.skipped) << ",";
+ Out << CsvEscape(run.skip_message) << "\n";
return;
}
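
CsvEscape applies the usual CSV quoting rule (RFC 4180): double any embedded quote and wrap the whole field in quotes. A standalone copy of that rule with a usage example:

    #include <iostream>
    #include <string>

    std::string CsvEscapeSketch(const std::string& s) {
      std::string tmp;
      tmp.reserve(s.size() + 2);
      for (char c : s) {
        if (c == '"') {
          tmp += "\"\"";  // double embedded quotes
        } else {
          tmp += c;
        }
      }
      return '"' + tmp + '"';
    }

    int main() {
      // Prints: "say ""hi"", world"
      std::cout << CsvEscapeSketch("say \"hi\", world") << "\n";
    }
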
diff --git a/src/cycleclock.h b/src/cycleclock.h
index 6843b69..ae1ef2d 100644
--- a/src/cycleclock.h
+++ b/src/cycleclock.h
@@ -36,7 +36,8 @@
// declarations of some other intrinsics, breaking compilation.
// Therefore, we simply declare __rdtsc ourselves. See also
// http://connect.microsoft.com/VisualStudio/feedback/details/262047
-#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64)
+#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64) && \
+ !defined(_M_ARM64EC)
extern "C" uint64_t __rdtsc();
#pragma intrinsic(__rdtsc)
#endif
@@ -114,8 +115,8 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
// when I know it will work. Otherwise, I'll use __rdtsc and hope
// the code is being compiled with a non-ancient compiler.
_asm rdtsc
-#elif defined(COMPILER_MSVC) && defined(_M_ARM64)
- // See https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics?view=vs-2019
+#elif defined(COMPILER_MSVC) && (defined(_M_ARM64) || defined(_M_ARM64EC))
+  // See https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics
// and https://reviews.llvm.org/D53115
int64_t virtual_timer_value;
virtual_timer_value = _ReadStatusReg(ARM64_CNTVCT);
@@ -132,7 +133,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
// Native Client does not provide any API to access cycle counter.
// Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
- // because is provides nanosecond resolution (which is noticable at
+  // because it provides nanosecond resolution (which is noticeable at
// least for PNaCl modules running on x86 Mac & Linux).
// Initialize to always return 0 if clock_gettime fails.
struct timespec ts = {0, 0};
@@ -173,6 +174,10 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__loongarch__) || defined(__csky__)
+ struct timeval tv;
+ gettimeofday(&tv, nullptr);
+ return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
#elif defined(__s390__) // Covers both s390 and s390x.
// Return the CPU clock.
uint64_t tsc;
@@ -183,7 +188,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
asm("stck %0" : "=Q"(tsc) : : "cc");
#endif
return tsc;
-#elif defined(__riscv) // RISC-V
+#elif defined(__riscv) // RISC-V
// Use RDCYCLE (and RDCYCLEH on riscv32)
#if __riscv_xlen == 32
uint32_t cycles_lo, cycles_hi0, cycles_hi1;
@@ -204,6 +209,14 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
asm volatile("rdcycle %0" : "=r"(cycles));
return cycles;
#endif
+#elif defined(__e2k__) || defined(__elbrus__)
+ struct timeval tv;
+ gettimeofday(&tv, nullptr);
+ return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__hexagon__)
+ uint64_t pcycle;
+ asm volatile("%0 = C15:14" : "=r"(pcycle));
+  return static_cast<int64_t>(pcycle);
#else
// The soft failover to a generic implementation is automatic only for ARM.
// For other platforms the developer is expected to make an attempt to create
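
The newly covered targets (LoongArch, C-SKY, Elbrus) get the same fallback as several existing ones: no cycle counter is read, just wall time in microseconds from gettimeofday. An equivalent portable stand-in written with std::chrono (not the code the library uses, which calls gettimeofday directly on those platforms):

    #include <chrono>
    #include <cstdint>

    // Microsecond timestamps as a cycle-counter substitute; coarse, but good
    // enough for the "no better source available" case handled above.
    int64_t FallbackNowMicrosSketch() {
      const auto since_epoch =
          std::chrono::steady_clock::now().time_since_epoch();
      return std::chrono::duration_cast<std::chrono::microseconds>(since_epoch)
          .count();
    }
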
diff --git a/src/internal_macros.h b/src/internal_macros.h
index 91f367b..8dd7d0c 100644
--- a/src/internal_macros.h
+++ b/src/internal_macros.h
@@ -1,8 +1,6 @@
#ifndef BENCHMARK_INTERNAL_MACROS_H_
#define BENCHMARK_INTERNAL_MACROS_H_
-#include "benchmark/benchmark.h"
-
/* Needed to detect STL */
#include <cstdlib>
@@ -44,6 +42,19 @@
#define BENCHMARK_OS_CYGWIN 1
#elif defined(_WIN32)
#define BENCHMARK_OS_WINDOWS 1
+ // WINAPI_FAMILY_PARTITION is defined in winapifamily.h.
+ // We include windows.h which implicitly includes winapifamily.h for compatibility.
+ #ifndef NOMINMAX
+ #define NOMINMAX
+ #endif
+ #include <windows.h>
+ #if defined(WINAPI_FAMILY_PARTITION)
+ #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+ #define BENCHMARK_OS_WINDOWS_WIN32 1
+ #elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+ #define BENCHMARK_OS_WINDOWS_RT 1
+ #endif
+ #endif
#if defined(__MINGW32__)
#define BENCHMARK_OS_MINGW 1
#endif
@@ -80,6 +91,8 @@
#define BENCHMARK_OS_QNX 1
#elif defined(__MVS__)
#define BENCHMARK_OS_ZOS 1
+#elif defined(__hexagon__)
+#define BENCHMARK_OS_QURT 1
#endif
#if defined(__ANDROID__) && defined(__GLIBCXX__)
diff --git a/src/json_reporter.cc b/src/json_reporter.cc
index 959d245..6559dfd 100644
--- a/src/json_reporter.cc
+++ b/src/json_reporter.cc
@@ -12,9 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "benchmark/benchmark.h"
-#include "complexity.h"
-
#include <algorithm>
#include <cmath>
#include <cstdint>
@@ -25,41 +22,61 @@
#include <tuple>
#include <vector>
+#include "benchmark/benchmark.h"
+#include "complexity.h"
#include "string_util.h"
#include "timers.h"
namespace benchmark {
-
namespace {
-std::string StrEscape(const std::string & s) {
+std::string StrEscape(const std::string& s) {
std::string tmp;
tmp.reserve(s.size());
for (char c : s) {
switch (c) {
- case '\b': tmp += "\\b"; break;
- case '\f': tmp += "\\f"; break;
- case '\n': tmp += "\\n"; break;
- case '\r': tmp += "\\r"; break;
- case '\t': tmp += "\\t"; break;
- case '\\': tmp += "\\\\"; break;
- case '"' : tmp += "\\\""; break;
- default : tmp += c; break;
+ case '\b':
+ tmp += "\\b";
+ break;
+ case '\f':
+ tmp += "\\f";
+ break;
+ case '\n':
+ tmp += "\\n";
+ break;
+ case '\r':
+ tmp += "\\r";
+ break;
+ case '\t':
+ tmp += "\\t";
+ break;
+ case '\\':
+ tmp += "\\\\";
+ break;
+ case '"':
+ tmp += "\\\"";
+ break;
+ default:
+ tmp += c;
+ break;
}
}
return tmp;
}
std::string FormatKV(std::string const& key, std::string const& value) {
- return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
+ return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
+ StrEscape(value).c_str());
}
std::string FormatKV(std::string const& key, const char* value) {
- return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
+ return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
+ StrEscape(value).c_str());
}
std::string FormatKV(std::string const& key, bool value) {
- return StrFormat("\"%s\": %s", StrEscape(key).c_str(), value ? "true" : "false");
+ return StrFormat("\"%s\": %s", StrEscape(key).c_str(),
+ value ? "true" : "false");
}
std::string FormatKV(std::string const& key, int64_t value) {
@@ -68,12 +85,6 @@ std::string FormatKV(std::string const& key, int64_t value) {
return ss.str();
}
-std::string FormatKV(std::string const& key, IterationCount value) {
- std::stringstream ss;
- ss << '"' << StrEscape(key) << "\": " << value;
- return ss.str();
-}
-
std::string FormatKV(std::string const& key, double value) {
std::stringstream ss;
ss << '"' << StrEscape(key) << "\": ";
@@ -123,7 +134,9 @@ bool JSONReporter::ReportContext(const Context& context) {
RoundDouble(info.cycles_per_second / 1000000.0))
<< ",\n";
if (CPUInfo::Scaling::UNKNOWN != info.scaling) {
- out << indent << FormatKV("cpu_scaling_enabled", info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
+ out << indent
+ << FormatKV("cpu_scaling_enabled",
+ info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
<< ",\n";
}
@@ -136,8 +149,8 @@ bool JSONReporter::ReportContext(const Context& context) {
out << cache_indent << FormatKV("type", CI.type) << ",\n";
out << cache_indent << FormatKV("level", static_cast<int64_t>(CI.level))
<< ",\n";
- out << cache_indent
- << FormatKV("size", static_cast<int64_t>(CI.size)) << ",\n";
+ out << cache_indent << FormatKV("size", static_cast<int64_t>(CI.size))
+ << ",\n";
out << cache_indent
<< FormatKV("num_sharing", static_cast<int64_t>(CI.num_sharing))
<< "\n";
@@ -159,7 +172,19 @@ bool JSONReporter::ReportContext(const Context& context) {
#else
const char build_type[] = "debug";
#endif
- out << indent << FormatKV("library_build_type", build_type) << "\n";
+ out << indent << FormatKV("library_build_type", build_type);
+
+ std::map<std::string, std::string>* global_context =
+ internal::GetGlobalContext();
+
+ if (global_context != nullptr) {
+ for (const auto& kv : *global_context) {
+ out << ",\n";
+ out << indent << FormatKV(kv.first, kv.second);
+ }
+ }
+ out << "\n";
+
// Close context block and open the list of benchmarks.
out << inner_indent << "},\n";
out << inner_indent << "\"benchmarks\": [\n";
@@ -197,6 +222,10 @@ void JSONReporter::PrintRunData(Run const& run) {
std::string indent(6, ' ');
std::ostream& out = GetOutputStream();
out << indent << FormatKV("name", run.benchmark_name()) << ",\n";
+ out << indent << FormatKV("family_index", run.family_index) << ",\n";
+ out << indent
+ << FormatKV("per_family_instance_index", run.per_family_instance_index)
+ << ",\n";
out << indent << FormatKV("run_name", run.run_name.str()) << ",\n";
out << indent << FormatKV("run_type", [&run]() -> const char* {
switch (run.run_type) {
@@ -215,15 +244,36 @@ void JSONReporter::PrintRunData(Run const& run) {
out << indent << FormatKV("threads", run.threads) << ",\n";
if (run.run_type == BenchmarkReporter::Run::RT_Aggregate) {
out << indent << FormatKV("aggregate_name", run.aggregate_name) << ",\n";
+ out << indent << FormatKV("aggregate_unit", [&run]() -> const char* {
+ switch (run.aggregate_unit) {
+ case StatisticUnit::kTime:
+ return "time";
+ case StatisticUnit::kPercentage:
+ return "percentage";
+ }
+ BENCHMARK_UNREACHABLE();
+ }()) << ",\n";
}
- if (run.error_occurred) {
- out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n";
- out << indent << FormatKV("error_message", run.error_message) << ",\n";
+ if (internal::SkippedWithError == run.skipped) {
+ out << indent << FormatKV("error_occurred", true) << ",\n";
+ out << indent << FormatKV("error_message", run.skip_message) << ",\n";
+ } else if (internal::SkippedWithMessage == run.skipped) {
+ out << indent << FormatKV("skipped", true) << ",\n";
+ out << indent << FormatKV("skip_message", run.skip_message) << ",\n";
}
if (!run.report_big_o && !run.report_rms) {
out << indent << FormatKV("iterations", run.iterations) << ",\n";
- out << indent << FormatKV("real_time", run.GetAdjustedRealTime()) << ",\n";
- out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
+ if (run.run_type != Run::RT_Aggregate ||
+ run.aggregate_unit == StatisticUnit::kTime) {
+ out << indent << FormatKV("real_time", run.GetAdjustedRealTime())
+ << ",\n";
+ out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
+ } else {
+ assert(run.aggregate_unit == StatisticUnit::kPercentage);
+ out << indent << FormatKV("real_time", run.real_accumulated_time)
+ << ",\n";
+ out << indent << FormatKV("cpu_time", run.cpu_accumulated_time);
+ }
out << ",\n"
<< indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
} else if (run.report_big_o) {
@@ -241,9 +291,21 @@ void JSONReporter::PrintRunData(Run const& run) {
out << ",\n" << indent << FormatKV(c.first, c.second);
}
- if (run.has_memory_result) {
+ if (run.memory_result) {
+ const MemoryManager::Result memory_result = *run.memory_result;
out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter);
- out << ",\n" << indent << FormatKV("max_bytes_used", run.max_bytes_used);
+ out << ",\n"
+ << indent << FormatKV("max_bytes_used", memory_result.max_bytes_used);
+
+ auto report_if_present = [&out, &indent](const std::string& label,
+ int64_t val) {
+ if (val != MemoryManager::TombstoneValue)
+ out << ",\n" << indent << FormatKV(label, val);
+ };
+
+ report_if_present("total_allocated_bytes",
+ memory_result.total_allocated_bytes);
+ report_if_present("net_heap_growth", memory_result.net_heap_growth);
}
if (!run.report_label.empty()) {
@@ -252,4 +314,7 @@ void JSONReporter::PrintRunData(Run const& run) {
out << '\n';
}
+const int64_t MemoryManager::TombstoneValue =
+ std::numeric_limits<int64_t>::max();
+
} // end namespace benchmark
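
MemoryManager results now use a tombstone value (INT64_MAX) to mean "this metric was not collected", and the reporter only emits a key when the value is real. The same sentinel pattern in miniature:

    #include <cstdint>
    #include <iostream>
    #include <limits>

    // Sentinel meaning "not collected"; the library uses
    // std::numeric_limits<int64_t>::max() in the same role.
    constexpr int64_t kNotCollected = std::numeric_limits<int64_t>::max();

    void ReportIfPresentSketch(const char* label, int64_t value) {
      if (value != kNotCollected) {
        std::cout << "  \"" << label << "\": " << value << ",\n";
      }
    }

    int main() {
      ReportIfPresentSketch("total_allocated_bytes", 4096);
      ReportIfPresentSketch("net_heap_growth", kNotCollected);  // emits nothing
    }
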
diff --git a/src/log.h b/src/log.h
index 47d0c35..9a21400 100644
--- a/src/log.h
+++ b/src/log.h
@@ -4,7 +4,12 @@
#include <iostream>
#include <ostream>
-#include "benchmark/benchmark.h"
+// NOTE: this is also defined in benchmark.h but we're trying to avoid a
+// dependency.
+// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer.
+#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
+#define BENCHMARK_HAS_CXX11
+#endif
namespace benchmark {
namespace internal {
@@ -23,7 +28,16 @@ class LogType {
private:
LogType(std::ostream* out) : out_(out) {}
std::ostream* out_;
- BENCHMARK_DISALLOW_COPY_AND_ASSIGN(LogType);
+
+ // NOTE: we could use BENCHMARK_DISALLOW_COPY_AND_ASSIGN but we shouldn't have
+ // a dependency on benchmark.h from here.
+#ifndef BENCHMARK_HAS_CXX11
+ LogType(const LogType&);
+ LogType& operator=(const LogType&);
+#else
+ LogType(const LogType&) = delete;
+ LogType& operator=(const LogType&) = delete;
+#endif
};
template <class Tp>
@@ -47,13 +61,13 @@ inline int& LogLevel() {
}
inline LogType& GetNullLogInstance() {
- static LogType log(nullptr);
- return log;
+ static LogType null_log(static_cast<std::ostream*>(nullptr));
+ return null_log;
}
inline LogType& GetErrorLogInstance() {
- static LogType log(&std::clog);
- return log;
+ static LogType error_log(&std::clog);
+ return error_log;
}
inline LogType& GetLogInstanceForLevel(int level) {
@@ -67,7 +81,7 @@ inline LogType& GetLogInstanceForLevel(int level) {
} // end namespace benchmark
// clang-format off
-#define VLOG(x) \
+#define BM_VLOG(x) \
(::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \
" ")
// clang-format on
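
BM_VLOG(x) resolves to a real stream only when x is at or below the configured verbosity; otherwise it gets the null LogType, whose null stream pointer makes operator<< a no-op. The same dispatch can be sketched with plain std::ostream (a stream constructed with a null buffer simply discards writes):

    #include <iostream>
    #include <ostream>

    // Return std::clog for enabled levels, otherwise a buffer-less stream that
    // drops everything written to it.
    std::ostream& LogForLevelSketch(int level, int configured_verbosity) {
      static std::ostream null_stream(nullptr);
      return level <= configured_verbosity ? std::clog : null_stream;
    }

    int main() {
      LogForLevelSketch(1, 2) << "printed\n";
      LogForLevelSketch(3, 2) << "silently dropped\n";
    }
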
diff --git a/src/mutex.h b/src/mutex.h
index 3fac79a..bec78d9 100644
--- a/src/mutex.h
+++ b/src/mutex.h
@@ -9,60 +9,60 @@
// Enable thread safety attributes only with clang.
// The attributes can be safely erased when compiling with other compilers.
#if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
-#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
+#define THREAD_ANNOTATION_ATTRIBUTE_(x) __attribute__((x))
#else
-#define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op
+#define THREAD_ANNOTATION_ATTRIBUTE_(x) // no-op
#endif
-#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
+#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(capability(x))
-#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE_(scoped_lockable)
-#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
+#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(guarded_by(x))
-#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
+#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(pt_guarded_by(x))
#define ACQUIRED_BEFORE(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
+ THREAD_ANNOTATION_ATTRIBUTE_(acquired_before(__VA_ARGS__))
#define ACQUIRED_AFTER(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
+ THREAD_ANNOTATION_ATTRIBUTE_(acquired_after(__VA_ARGS__))
#define REQUIRES(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
+ THREAD_ANNOTATION_ATTRIBUTE_(requires_capability(__VA_ARGS__))
#define REQUIRES_SHARED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
+ THREAD_ANNOTATION_ATTRIBUTE_(requires_shared_capability(__VA_ARGS__))
#define ACQUIRE(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
+ THREAD_ANNOTATION_ATTRIBUTE_(acquire_capability(__VA_ARGS__))
#define ACQUIRE_SHARED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
+ THREAD_ANNOTATION_ATTRIBUTE_(acquire_shared_capability(__VA_ARGS__))
#define RELEASE(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
+ THREAD_ANNOTATION_ATTRIBUTE_(release_capability(__VA_ARGS__))
#define RELEASE_SHARED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
+ THREAD_ANNOTATION_ATTRIBUTE_(release_shared_capability(__VA_ARGS__))
#define TRY_ACQUIRE(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
+ THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_capability(__VA_ARGS__))
#define TRY_ACQUIRE_SHARED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
+ THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_shared_capability(__VA_ARGS__))
-#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
+#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE_(locks_excluded(__VA_ARGS__))
-#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
+#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(assert_capability(x))
#define ASSERT_SHARED_CAPABILITY(x) \
- THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))
+ THREAD_ANNOTATION_ATTRIBUTE_(assert_shared_capability(x))
-#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
+#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(lock_returned(x))
#define NO_THREAD_SAFETY_ANALYSIS \
- THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
+ THREAD_ANNOTATION_ATTRIBUTE_(no_thread_safety_analysis)
namespace benchmark {
@@ -130,7 +130,7 @@ class Barrier {
// entered the barrier. Returns iff this is the last thread to
// enter the barrier.
bool createBarrier(MutexLock& ml) REQUIRES(lock_) {
- CHECK_LT(entered_, running_threads_);
+ BM_CHECK_LT(entered_, running_threads_);
entered_++;
if (entered_ < running_threads_) {
// Wait for all threads to enter
diff --git a/src/perf_counters.cc b/src/perf_counters.cc
new file mode 100644
index 0000000..417acdb
--- /dev/null
+++ b/src/perf_counters.cc
@@ -0,0 +1,282 @@
+// Copyright 2021 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "perf_counters.h"
+
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#if defined HAVE_LIBPFM
+#include "perfmon/pfmlib.h"
+#include "perfmon/pfmlib_perf_event.h"
+#endif
+
+namespace benchmark {
+namespace internal {
+
+constexpr size_t PerfCounterValues::kMaxCounters;
+
+#if defined HAVE_LIBPFM
+
+size_t PerfCounterValues::Read(const std::vector<int>& leaders) {
+ // Create a pointer for multiple reads
+ const size_t bufsize = values_.size() * sizeof(values_[0]);
+ char* ptr = reinterpret_cast<char*>(values_.data());
+ size_t size = bufsize;
+ for (int lead : leaders) {
+ auto read_bytes = ::read(lead, ptr, size);
+ if (read_bytes >= ssize_t(sizeof(uint64_t))) {
+ // Actual data bytes are all bytes minus initial padding
+ std::size_t data_bytes = read_bytes - sizeof(uint64_t);
+ // This should be very cheap since it's in hot cache
+ std::memmove(ptr, ptr + sizeof(uint64_t), data_bytes);
+ // Increment our counters
+ ptr += data_bytes;
+ size -= data_bytes;
+ } else {
+ int err = errno;
+ GetErrorLogInstance() << "Error reading lead " << lead << " errno:" << err
+ << " " << ::strerror(err) << "\n";
+ return 0;
+ }
+ }
+ return (bufsize - size) / sizeof(uint64_t);
+}
+
+const bool PerfCounters::kSupported = true;
+
+// Initializes libpfm only on the first call. Returns whether that single
+// initialization was successful.
+bool PerfCounters::Initialize() {
+ // Function-scope static gets initialized only once on first call.
+ static const bool success = []() {
+ return pfm_initialize() == PFM_SUCCESS;
+ }();
+ return success;
+}
+
+bool PerfCounters::IsCounterSupported(const std::string& name) {
+ Initialize();
+ perf_event_attr_t attr;
+ std::memset(&attr, 0, sizeof(attr));
+ pfm_perf_encode_arg_t arg;
+ std::memset(&arg, 0, sizeof(arg));
+ arg.attr = &attr;
+ const int mode = PFM_PLM3; // user mode only
+ int ret = pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT_EXT,
+ &arg);
+ return (ret == PFM_SUCCESS);
+}
+
+PerfCounters PerfCounters::Create(
+ const std::vector<std::string>& counter_names) {
+ if (!counter_names.empty()) {
+ Initialize();
+ }
+
+ // Valid counters will populate these arrays but we start empty
+ std::vector<std::string> valid_names;
+ std::vector<int> counter_ids;
+ std::vector<int> leader_ids;
+
+ // Resize to the maximum possible
+ valid_names.reserve(counter_names.size());
+ counter_ids.reserve(counter_names.size());
+
+ const int kCounterMode = PFM_PLM3; // user mode only
+
+ // Group leads will be assigned on demand. The idea is that once we cannot
+ // create a counter descriptor, the reason is that this group has maxed out
+ // so we set the group_id again to -1 and retry - giving the algorithm a
+ // chance to create a new group leader to hold the next set of counters.
+ int group_id = -1;
+
+ // Loop through all performance counters
+ for (size_t i = 0; i < counter_names.size(); ++i) {
+ // we are about to push into the valid names vector
+ // check if we did not reach the maximum
+ if (valid_names.size() == PerfCounterValues::kMaxCounters) {
+ // Log a message if we maxed out and stop adding
+ GetErrorLogInstance()
+ << counter_names.size() << " counters were requested. The maximum is "
+ << PerfCounterValues::kMaxCounters << " and " << valid_names.size()
+ << " were already added. All remaining counters will be ignored\n";
+ // stop the loop and return what we have already
+ break;
+ }
+
+ // Check if this name is empty
+ const auto& name = counter_names[i];
+ if (name.empty()) {
+ GetErrorLogInstance()
+ << "A performance counter name was the empty string\n";
+ continue;
+ }
+
+ // Here first means first in group, ie the group leader
+ const bool is_first = (group_id < 0);
+
+ // This struct will be populated by libpfm from the counter string
+ // and then fed into the syscall perf_event_open
+ struct perf_event_attr attr {};
+ attr.size = sizeof(attr);
+
+ // This is the input struct to libpfm.
+ pfm_perf_encode_arg_t arg{};
+ arg.attr = &attr;
+ const int pfm_get = pfm_get_os_event_encoding(name.c_str(), kCounterMode,
+ PFM_OS_PERF_EVENT, &arg);
+ if (pfm_get != PFM_SUCCESS) {
+ GetErrorLogInstance()
+ << "Unknown performance counter name: " << name << "\n";
+ continue;
+ }
+
+ // We then proceed to populate the remaining fields in our attribute struct
+ // Note: the man page for perf_event_create suggests inherit = true and
+ // read_format = PERF_FORMAT_GROUP don't work together, but that's not the
+ // case.
+ attr.disabled = is_first;
+ attr.inherit = true;
+ attr.pinned = is_first;
+ attr.exclude_kernel = true;
+ attr.exclude_user = false;
+ attr.exclude_hv = true;
+
+ // Read all counters in a group in one read.
+ attr.read_format = PERF_FORMAT_GROUP;
+
+ int id = -1;
+ while (id < 0) {
+ static constexpr size_t kNrOfSyscallRetries = 5;
+ // Retry syscall as it was interrupted often (b/64774091).
+ for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
+ ++num_retries) {
+ id = perf_event_open(&attr, 0, -1, group_id, 0);
+ if (id >= 0 || errno != EINTR) {
+ break;
+ }
+ }
+ if (id < 0) {
+ // If the file descriptor is negative we might have reached a limit
+ // in the current group. Set the group_id to -1 and retry
+ if (group_id >= 0) {
+ // Create a new group
+ group_id = -1;
+ } else {
+ // At this point we have already retried to set a new group id and
+ // failed. We then give up.
+ break;
+ }
+ }
+ }
+
+ // We failed to get a new file descriptor. We might have reached a hard
+ // hardware limit that cannot be resolved even with group multiplexing
+ if (id < 0) {
+      GetErrorLogInstance() << "***WARNING*** Failed to get a file descriptor "
+ "for performance counter "
+ << name << ". Ignoring\n";
+
+ // We give up on this counter but try to keep going
+ // as the others would be fine
+ continue;
+ }
+ if (group_id < 0) {
+ // This is a leader, store and assign it to the current file descriptor
+ leader_ids.push_back(id);
+ group_id = id;
+ }
+ // This is a valid counter, add it to our descriptor's list
+ counter_ids.push_back(id);
+ valid_names.push_back(name);
+ }
+
+ // Loop through all group leaders activating them
+ // There is another option of starting ALL counters in a process but
+  // that would be too far-reaching an intrusion. If the user is using PMCs
+ // by themselves then this would have a side effect on them. It is
+ // friendlier to loop through all groups individually.
+ for (int lead : leader_ids) {
+ if (ioctl(lead, PERF_EVENT_IOC_ENABLE) != 0) {
+ // This should never happen but if it does, we give up on the
+ // entire batch as recovery would be a mess.
+ GetErrorLogInstance() << "***WARNING*** Failed to start counters. "
+                               "Clearing out all counters.\n";
+
+      // Close all performance counters
+ for (int id : counter_ids) {
+ ::close(id);
+ }
+
+ // Return an empty object so our internal state is still good and
+ // the process can continue normally without impact
+ return NoCounters();
+ }
+ }
+
+ return PerfCounters(std::move(valid_names), std::move(counter_ids),
+ std::move(leader_ids));
+}
+
+void PerfCounters::CloseCounters() const {
+ if (counter_ids_.empty()) {
+ return;
+ }
+ for (int lead : leader_ids_) {
+ ioctl(lead, PERF_EVENT_IOC_DISABLE);
+ }
+ for (int fd : counter_ids_) {
+ close(fd);
+ }
+}
+#else // defined HAVE_LIBPFM
+size_t PerfCounterValues::Read(const std::vector<int>&) { return 0; }
+
+const bool PerfCounters::kSupported = false;
+
+bool PerfCounters::Initialize() { return false; }
+
+bool PerfCounters::IsCounterSupported(const std::string&) { return false; }
+
+PerfCounters PerfCounters::Create(
+ const std::vector<std::string>& counter_names) {
+ if (!counter_names.empty()) {
+ GetErrorLogInstance() << "Performance counters not supported.";
+ }
+ return NoCounters();
+}
+
+void PerfCounters::CloseCounters() const {}
+#endif // defined HAVE_LIBPFM
+
+PerfCountersMeasurement::PerfCountersMeasurement(
+ const std::vector<std::string>& counter_names)
+ : start_values_(counter_names.size()), end_values_(counter_names.size()) {
+ counters_ = PerfCounters::Create(counter_names);
+}
+
+PerfCounters& PerfCounters::operator=(PerfCounters&& other) noexcept {
+ if (this != &other) {
+ CloseCounters();
+
+ counter_ids_ = std::move(other.counter_ids_);
+ leader_ids_ = std::move(other.leader_ids_);
+ counter_names_ = std::move(other.counter_names_);
+ }
+ return *this;
+}
+} // namespace internal
+} // namespace benchmark
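
PerfCounterValues::Read relies on the PERF_FORMAT_GROUP layout: one read() on a group leader returns a u64 event count followed by one u64 value per counter, and the memmove strips that leading word so user indexing stays 0-based. A hedged standalone sketch of the same layout handling, assuming a Linux perf_event_open group leader opened with read_format = PERF_FORMAT_GROUP and no other read_format flags:

    #include <unistd.h>

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Read every counter of one group with a single syscall and drop the
    // leading "number of events" word.
    bool ReadGroupSketch(int group_fd, std::vector<uint64_t>* out_values) {
      std::vector<uint64_t> buf(1 + out_values->size());  // [nr, v0, v1, ...]
      const ssize_t want = static_cast<ssize_t>(buf.size() * sizeof(uint64_t));
      const ssize_t got = ::read(group_fd, buf.data(), want);
      if (got < static_cast<ssize_t>(sizeof(uint64_t))) return false;
      const std::size_t nr = static_cast<std::size_t>(buf[0]);
      for (std::size_t i = 0; i < nr && i < out_values->size(); ++i) {
        (*out_values)[i] = buf[1 + i];
      }
      return true;
    }
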
diff --git a/src/perf_counters.h b/src/perf_counters.h
new file mode 100644
index 0000000..bf5eb6b
--- /dev/null
+++ b/src/perf_counters.h
@@ -0,0 +1,200 @@
+// Copyright 2021 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BENCHMARK_PERF_COUNTERS_H
+#define BENCHMARK_PERF_COUNTERS_H
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "check.h"
+#include "log.h"
+#include "mutex.h"
+
+#ifndef BENCHMARK_OS_WINDOWS
+#include <unistd.h>
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// C4251: <symbol> needs to have dll-interface to be used by clients of class
+#pragma warning(disable : 4251)
+#endif
+
+namespace benchmark {
+namespace internal {
+
+// Typically, we can only read a small number of counters. There is also a
+// padding preceding counter values, when reading multiple counters with one
+// syscall (which is desirable). PerfCounterValues abstracts these details.
+// The implementation ensures the storage is inlined, and allows 0-based
+// indexing into the counter values.
+// The object is used in conjunction with a PerfCounters object, by passing it
+// to Snapshot(). The Read() method relocates individual reads, discarding
+// the initial padding from each group leader in the values buffer such that
+// all user accesses through the [] operator are correct.
+class BENCHMARK_EXPORT PerfCounterValues {
+ public:
+ explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) {
+ BM_CHECK_LE(nr_counters_, kMaxCounters);
+ }
+
+ // We are reading correctly now so the values don't need to skip padding
+ uint64_t operator[](size_t pos) const { return values_[pos]; }
+
+  // The maximum was increased to 32 only because the buffer
+  // is std::array<> backed.
+ static constexpr size_t kMaxCounters = 32;
+
+ private:
+ friend class PerfCounters;
+ // Get the byte buffer in which perf counters can be captured.
+ // This is used by PerfCounters::Read
+ std::pair<char*, size_t> get_data_buffer() {
+ return {reinterpret_cast<char*>(values_.data()),
+ sizeof(uint64_t) * (kPadding + nr_counters_)};
+ }
+
+  // This reading is complex, and since the goal of this class is to
+  // abstract away the intricacies of the reading process, this is
+  // a better place for it.
+ size_t Read(const std::vector<int>& leaders);
+
+  // The padding is 2 due to the reading algorithm (the initial padding plus a
+  // per-read padding).
+ static constexpr size_t kPadding = 2;
+ std::array<uint64_t, kPadding + kMaxCounters> values_;
+ const size_t nr_counters_;
+};
+
+// Collect PMU counters. The object, once constructed, is ready to be used by
+// calling read(). PMU counter collection is enabled from the time create() is
+// called, to obtain the object, until the object's destructor is called.
+class BENCHMARK_EXPORT PerfCounters final {
+ public:
+ // True iff this platform supports performance counters.
+ static const bool kSupported;
+
+ // Returns an empty object
+ static PerfCounters NoCounters() { return PerfCounters(); }
+
+ ~PerfCounters() { CloseCounters(); }
+ PerfCounters() = default;
+ PerfCounters(PerfCounters&&) = default;
+ PerfCounters(const PerfCounters&) = delete;
+ PerfCounters& operator=(PerfCounters&&) noexcept;
+ PerfCounters& operator=(const PerfCounters&) = delete;
+
+ // Platform-specific implementations may choose to do some library
+ // initialization here.
+ static bool Initialize();
+
+  // Check if the given counter is supported, in case the app wants to
+  // check before passing it to Create().
+ static bool IsCounterSupported(const std::string& name);
+
+ // Return a PerfCounters object ready to read the counters with the names
+ // specified. The values are user-mode only. The counter name format is
+ // implementation and OS specific.
+ // In case of failure, this method will in the worst case return an
+ // empty object whose state will still be valid.
+ static PerfCounters Create(const std::vector<std::string>& counter_names);
+
+ // Take a snapshot of the current value of the counters into the provided
+ // valid PerfCounterValues storage. The values are populated such that:
+ // names()[i]'s value is (*values)[i]
+ BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) const {
+#ifndef BENCHMARK_OS_WINDOWS
+ assert(values != nullptr);
+ return values->Read(leader_ids_) == counter_ids_.size();
+#else
+ (void)values;
+ return false;
+#endif
+ }
+
+ const std::vector<std::string>& names() const { return counter_names_; }
+ size_t num_counters() const { return counter_names_.size(); }
+
+ private:
+ PerfCounters(const std::vector<std::string>& counter_names,
+ std::vector<int>&& counter_ids, std::vector<int>&& leader_ids)
+ : counter_ids_(std::move(counter_ids)),
+ leader_ids_(std::move(leader_ids)),
+ counter_names_(counter_names) {}
+
+ void CloseCounters() const;
+
+ std::vector<int> counter_ids_;
+ std::vector<int> leader_ids_;
+ std::vector<std::string> counter_names_;
+};
+
+// Typical usage of the above primitives.
+class BENCHMARK_EXPORT PerfCountersMeasurement final {
+ public:
+ PerfCountersMeasurement(const std::vector<std::string>& counter_names);
+
+ size_t num_counters() const { return counters_.num_counters(); }
+
+ std::vector<std::string> names() const { return counters_.names(); }
+
+ BENCHMARK_ALWAYS_INLINE bool Start() {
+ if (num_counters() == 0) return true;
+ // Tell the compiler to not move instructions above/below where we take
+ // the snapshot.
+ ClobberMemory();
+ valid_read_ &= counters_.Snapshot(&start_values_);
+ ClobberMemory();
+
+ return valid_read_;
+ }
+
+ BENCHMARK_ALWAYS_INLINE bool Stop(
+ std::vector<std::pair<std::string, double>>& measurements) {
+ if (num_counters() == 0) return true;
+ // Tell the compiler to not move instructions above/below where we take
+ // the snapshot.
+ ClobberMemory();
+ valid_read_ &= counters_.Snapshot(&end_values_);
+ ClobberMemory();
+
+ for (size_t i = 0; i < counters_.names().size(); ++i) {
+ double measurement = static_cast<double>(end_values_[i]) -
+ static_cast<double>(start_values_[i]);
+ measurements.push_back({counters_.names()[i], measurement});
+ }
+
+ return valid_read_;
+ }
+
+ private:
+ PerfCounters counters_;
+ bool valid_read_ = true;
+ PerfCounterValues start_values_;
+ PerfCounterValues end_values_;
+};
+
+} // namespace internal
+} // namespace benchmark
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+#endif // BENCHMARK_PERF_COUNTERS_H
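
To make the intended call sequence of the classes declared above concrete, here is a small editorial usage sketch; it is not part of the patch. The include path, the libpfm-style counter names "CYCLES" and "INSTRUCTIONS", and linking against the library's internal target are assumptions, and a real run also needs the library built with HAVE_LIBPFM on a host whose PMU exposes these events.

// Editorial sketch: driving PerfCountersMeasurement around a measured region.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#include "src/perf_counters.h"  // path is an assumption; adjust to your build

int main() {
  using benchmark::internal::PerfCounters;
  using benchmark::internal::PerfCountersMeasurement;

  PerfCounters::Initialize();
  PerfCountersMeasurement pcm({"CYCLES", "INSTRUCTIONS"});

  std::vector<std::pair<std::string, double>> deltas;
  pcm.Start();
  volatile long sink = 0;
  for (long i = 0; i < 1000000; ++i) sink += i;  // the measured region
  pcm.Stop(deltas);  // appends one (name, end - start) pair per counter

  for (const auto& kv : deltas)
    std::cout << kv.first << ": " << kv.second << "\n";
  return 0;
}

If the platform or build lacks counter support, Create() returns the empty NoCounters() object, num_counters() is zero, and Start()/Stop() are no-ops that still return true.
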
diff --git a/src/re.h b/src/re.h
index fbe2503..9afb869 100644
--- a/src/re.h
+++ b/src/re.h
@@ -33,7 +33,7 @@
// Prefer C regex libraries when compiling w/o exceptions so that we can
// correctly report errors.
#if defined(BENCHMARK_HAS_NO_EXCEPTIONS) && \
- defined(BENCHMARK_HAVE_STD_REGEX) && \
+ defined(HAVE_STD_REGEX) && \
(defined(HAVE_GNU_POSIX_REGEX) || defined(HAVE_POSIX_REGEX))
#undef HAVE_STD_REGEX
#endif
@@ -126,7 +126,7 @@ inline bool Regex::Init(const std::string& spec, std::string* error) {
// regerror returns the number of bytes necessary to null terminate
  // the string, so we drop that byte when assigning to error.
- CHECK_NE(needed, 0);
+ BM_CHECK_NE(needed, 0);
error->assign(errbuf, needed - 1);
delete[] errbuf;
diff --git a/src/reporter.cc b/src/reporter.cc
index 337575a..076bc31 100644
--- a/src/reporter.cc
+++ b/src/reporter.cc
@@ -12,17 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "benchmark/benchmark.h"
-#include "timers.h"
-
#include <cstdlib>
-
#include <iostream>
+#include <map>
+#include <string>
#include <tuple>
#include <vector>
+#include "benchmark/benchmark.h"
#include "check.h"
#include "string_util.h"
+#include "timers.h"
namespace benchmark {
@@ -33,10 +33,14 @@ BenchmarkReporter::~BenchmarkReporter() {}
void BenchmarkReporter::PrintBasicContext(std::ostream *out,
Context const &context) {
- CHECK(out) << "cannot be null";
+ BM_CHECK(out) << "cannot be null";
auto &Out = *out;
+#ifndef BENCHMARK_OS_QURT
+ // Date/time information is not available on QuRT.
+  // Attempting to get it via this call causes the binary to crash.
Out << LocalDateTimeString() << "\n";
+#endif
if (context.executable_name)
Out << "Running " << context.executable_name << "\n";
@@ -64,6 +68,15 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
Out << "\n";
}
+ std::map<std::string, std::string> *global_context =
+ internal::GetGlobalContext();
+
+ if (global_context != nullptr) {
+ for (const auto &kv : *global_context) {
+ Out << kv.first << ": " << kv.second << "\n";
+ }
+ }
+
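
The loop just added prints every key/value pair registered in the library's global context. In recent releases that map is populated with benchmark::AddCustomContext or the --benchmark_context=key=value command-line flag; the editorial sketch below shows that usage and is not part of the patch. It assumes the v1.8.x public API, so verify AddCustomContext against the headers you build with.

// Editorial sketch: populating the global context that PrintBasicContext()
// now reports. AddCustomContext is assumed to be the public entry point.
#include <benchmark/benchmark.h>

static void BM_Noop(benchmark::State& state) {
  for (auto _ : state) {
    benchmark::DoNotOptimize(state.iterations());
  }
}
BENCHMARK(BM_Noop);

int main(int argc, char** argv) {
  benchmark::AddCustomContext("build_type", "release");  // printed as "build_type: release"
  benchmark::Initialize(&argc, argv);
  benchmark::RunSpecifiedBenchmarks();
  benchmark::Shutdown();
  return 0;
}
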
if (CPUInfo::Scaling::ENABLED == info.scaling) {
Out << "***WARNING*** CPU scaling is enabled, the benchmark "
"real time measurements may be noisy and will incur extra "
diff --git a/src/sleep.cc b/src/sleep.cc
deleted file mode 100644
index 4609d54..0000000
--- a/src/sleep.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "sleep.h"
-
-#include <cerrno>
-#include <cstdlib>
-#include <ctime>
-
-#include "internal_macros.h"
-
-#ifdef BENCHMARK_OS_WINDOWS
-#include <windows.h>
-#endif
-
-#ifdef BENCHMARK_OS_ZOS
-#include <unistd.h>
-#endif
-
-namespace benchmark {
-#ifdef BENCHMARK_OS_WINDOWS
-// Window's Sleep takes milliseconds argument.
-void SleepForMilliseconds(int milliseconds) { Sleep(milliseconds); }
-void SleepForSeconds(double seconds) {
- SleepForMilliseconds(static_cast<int>(kNumMillisPerSecond * seconds));
-}
-#else // BENCHMARK_OS_WINDOWS
-void SleepForMicroseconds(int microseconds) {
-#ifdef BENCHMARK_OS_ZOS
- // z/OS does not support nanosleep. Instead call sleep() and then usleep() to
- // sleep for the remaining microseconds because usleep() will fail if its
- // argument is greater than 1000000.
- div_t sleepTime = div(microseconds, kNumMicrosPerSecond);
- int seconds = sleepTime.quot;
- while (seconds != 0)
- seconds = sleep(seconds);
- while (usleep(sleepTime.rem) == -1 && errno == EINTR)
- ;
-#else
- struct timespec sleep_time;
- sleep_time.tv_sec = microseconds / kNumMicrosPerSecond;
- sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro;
- while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR)
- ; // Ignore signals and wait for the full interval to elapse.
-#endif
-}
-
-void SleepForMilliseconds(int milliseconds) {
- SleepForMicroseconds(milliseconds * kNumMicrosPerMilli);
-}
-
-void SleepForSeconds(double seconds) {
- SleepForMicroseconds(static_cast<int>(seconds * kNumMicrosPerSecond));
-}
-#endif // BENCHMARK_OS_WINDOWS
-} // end namespace benchmark
diff --git a/src/sleep.h b/src/sleep.h
deleted file mode 100644
index f98551a..0000000
--- a/src/sleep.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef BENCHMARK_SLEEP_H_
-#define BENCHMARK_SLEEP_H_
-
-namespace benchmark {
-const int kNumMillisPerSecond = 1000;
-const int kNumMicrosPerMilli = 1000;
-const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000;
-const int kNumNanosPerMicro = 1000;
-const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;
-
-void SleepForMilliseconds(int milliseconds);
-void SleepForSeconds(double seconds);
-} // end namespace benchmark
-
-#endif // BENCHMARK_SLEEP_H_
diff --git a/src/statistics.cc b/src/statistics.cc
index bd5a3d6..844e926 100644
--- a/src/statistics.cc
+++ b/src/statistics.cc
@@ -13,15 +13,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "benchmark/benchmark.h"
+#include "statistics.h"
#include <algorithm>
#include <cmath>
#include <numeric>
#include <string>
#include <vector>
+
+#include "benchmark/benchmark.h"
#include "check.h"
-#include "statistics.h"
namespace benchmark {
@@ -41,13 +42,13 @@ double StatisticsMedian(const std::vector<double>& v) {
auto center = copy.begin() + v.size() / 2;
std::nth_element(copy.begin(), center, copy.end());
- // did we have an odd number of samples?
- // if yes, then center is the median
- // it no, then we are looking for the average between center and the value
- // before
+ // Did we have an odd number of samples? If yes, then center is the median.
+ // If not, then we are looking for the average between center and the value
+  // before. Instead of re-sorting, we just look for the max value before it,
+  // which is not necessarily the element immediately preceding `center`, since
+  // `copy` is only partially sorted by `nth_element`.
if (v.size() % 2 == 1) return *center;
- auto center2 = copy.begin() + v.size() / 2 - 1;
- std::nth_element(copy.begin(), center2, copy.end());
+ auto center2 = std::max_element(copy.begin(), center);
return (*center + *center2) / 2.0;
}
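
The rewrite above drops the second nth_element call: after the first partial sort, everything to the left of `center` is less than or equal to *center, so for an even-sized sample the lower middle value is simply the maximum of that left half. A self-contained editorial illustration of the same trick, not taken from the patch:

// Editorial sketch: median via one nth_element plus max_element.
#include <algorithm>
#include <cassert>
#include <vector>

double Median(std::vector<double> v) {
  assert(!v.empty());
  auto center = v.begin() + v.size() / 2;
  std::nth_element(v.begin(), center, v.end());
  if (v.size() % 2 == 1) return *center;
  // nth_element guarantees every element before `center` is <= *center, so
  // the other middle value is the max of that prefix; no second sort needed.
  auto center2 = std::max_element(v.begin(), center);
  return (*center + *center2) / 2.0;
}

// For example, Median({1.0, 4.0, 2.0, 3.0}) returns 2.5.
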
@@ -74,14 +75,22 @@ double StatisticsStdDev(const std::vector<double>& v) {
return Sqrt(v.size() / (v.size() - 1.0) * (avg_squares - Sqr(mean)));
}
+double StatisticsCV(const std::vector<double>& v) {
+ if (v.size() < 2) return 0.0;
+
+ const auto stddev = StatisticsStdDev(v);
+ const auto mean = StatisticsMean(v);
+
+ return stddev / mean;
+}
+
std::vector<BenchmarkReporter::Run> ComputeStats(
const std::vector<BenchmarkReporter::Run>& reports) {
typedef BenchmarkReporter::Run Run;
std::vector<Run> results;
- auto error_count =
- std::count_if(reports.begin(), reports.end(),
- [](Run const& run) { return run.error_occurred; });
+ auto error_count = std::count_if(reports.begin(), reports.end(),
+ [](Run const& run) { return run.skipped; });
if (reports.size() - error_count < 2) {
// We don't report aggregated data if there was a single run.
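
StatisticsCV added above is the coefficient of variation: the sample standard deviation expressed as a fraction of the mean (surfaced as the cv aggregate in recent releases; that reporting detail is background, not shown in this hunk). A small editorial check of the definition, using the same (n - 1) sample form as StatisticsStdDev:

// Editorial sketch: coefficient of variation = stddev / mean.
#include <cmath>
#include <vector>

double Mean(const std::vector<double>& v) {
  double sum = 0.0;
  for (double x : v) sum += x;
  return sum / static_cast<double>(v.size());
}

double StdDev(const std::vector<double>& v) {
  const double m = Mean(v);
  double sq = 0.0;
  for (double x : v) sq += (x - m) * (x - m);
  return std::sqrt(sq / (static_cast<double>(v.size()) - 1.0));
}

double CV(const std::vector<double>& v) {
  return v.size() < 2 ? 0.0 : StdDev(v) / Mean(v);
}

// For example, CV({9.0, 10.0, 11.0}) is 1.0 / 10.0 = 0.1.
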
@@ -108,26 +117,28 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
for (auto const& cnt : r.counters) {
auto it = counter_stats.find(cnt.first);
if (it == counter_stats.end()) {
- counter_stats.insert({cnt.first, {cnt.second, std::vector<double>{}}});
- it = counter_stats.find(cnt.first);
+ it = counter_stats
+ .emplace(cnt.first,
+ CounterStat{cnt.second, std::vector<double>{}})
+ .first;
it->second.s.reserve(reports.size());
} else {
- CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
+ BM_CHECK_EQ(it->second.c.flags, cnt.second.flags);
}
}
}
// Populate the accumulators.
for (Run const& run : reports) {
- CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
- CHECK_EQ(run_iterations, run.iterations);
- if (run.error_occurred) continue;
+ BM_CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
+ BM_CHECK_EQ(run_iterations, run.iterations);
+ if (run.skipped) continue;
real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
// user counters
for (auto const& cnt : run.counters) {
auto it = counter_stats.find(cnt.first);
- CHECK_NE(it, counter_stats.end());
+ BM_CHECK_NE(it, counter_stats.end());
it->second.s.emplace_back(cnt.second);
}
}
@@ -148,11 +159,14 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
// Get the data from the accumulator to BenchmarkReporter::Run's.
Run data;
data.run_name = reports[0].run_name;
+ data.family_index = reports[0].family_index;
+ data.per_family_instance_index = reports[0].per_family_instance_index;
data.run_type = BenchmarkReporter::Run::RT_Aggregate;
data.threads = reports[0].threads;
data.repetitions = reports[0].repetitions;
data.repetition_index = Run::no_repetition_index;
data.aggregate_name = Stat.name_;
+ data.aggregate_unit = Stat.unit_;
data.report_label = report_label;
// It is incorrect to say that an aggregate is computed over
@@ -165,13 +179,15 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
- // We will divide these times by data.iterations when reporting, but the
- // data.iterations is not nessesairly the scale of these measurements,
- // because in each repetition, these timers are sum over all the iterations.
- // And if we want to say that the stats are over N repetitions and not
- // M iterations, we need to multiply these by (N/M).
- data.real_accumulated_time *= iteration_rescale_factor;
- data.cpu_accumulated_time *= iteration_rescale_factor;
+ if (data.aggregate_unit == StatisticUnit::kTime) {
+ // We will divide these times by data.iterations when reporting, but the
+ // data.iterations is not necessarily the scale of these measurements,
+    // because in each repetition, these timers are summed over all the iters.
+ // And if we want to say that the stats are over N repetitions and not
+ // M iterations, we need to multiply these by (N/M).
+ data.real_accumulated_time *= iteration_rescale_factor;
+ data.cpu_accumulated_time *= iteration_rescale_factor;
+ }
data.time_unit = reports[0].time_unit;
diff --git a/src/statistics.h b/src/statistics.h
index 7eccc85..6e5560e 100644
--- a/src/statistics.h
+++ b/src/statistics.h
@@ -22,15 +22,22 @@
namespace benchmark {
-// Return a vector containing the mean, median and standard devation information
-// (and any user-specified info) for the specified list of reports. If 'reports'
-// contains less than two non-errored runs an empty vector is returned
+// Return a vector containing the mean, median and standard deviation
+// information (and any user-specified info) for the specified list of reports.
+// If 'reports' contains less than two non-errored runs an empty vector is
+// returned
+BENCHMARK_EXPORT
std::vector<BenchmarkReporter::Run> ComputeStats(
const std::vector<BenchmarkReporter::Run>& reports);
+BENCHMARK_EXPORT
double StatisticsMean(const std::vector<double>& v);
+BENCHMARK_EXPORT
double StatisticsMedian(const std::vector<double>& v);
+BENCHMARK_EXPORT
double StatisticsStdDev(const std::vector<double>& v);
+BENCHMARK_EXPORT
+double StatisticsCV(const std::vector<double>& v);
} // end namespace benchmark
diff --git a/src/string_util.cc b/src/string_util.cc
index ac60b55..c69e40a 100644
--- a/src/string_util.cc
+++ b/src/string_util.cc
@@ -11,16 +11,17 @@
#include <sstream>
#include "arraysize.h"
+#include "benchmark/benchmark.h"
namespace benchmark {
namespace {
-
// kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta.
-const char kBigSIUnits[] = "kMGTPEZY";
+const char* const kBigSIUnits[] = {"k", "M", "G", "T", "P", "E", "Z", "Y"};
// Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi.
-const char kBigIECUnits[] = "KMGTPEZY";
+const char* const kBigIECUnits[] = {"Ki", "Mi", "Gi", "Ti",
+ "Pi", "Ei", "Zi", "Yi"};
// milli, micro, nano, pico, femto, atto, zepto, yocto.
-const char kSmallSIUnits[] = "munpfazy";
+const char* const kSmallSIUnits[] = {"m", "u", "n", "p", "f", "a", "z", "y"};
// We require that all three arrays have the same size.
static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits),
@@ -30,9 +31,8 @@ static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits),
static const int64_t kUnitsSize = arraysize(kBigSIUnits);
-void ToExponentAndMantissa(double val, double thresh, int precision,
- double one_k, std::string* mantissa,
- int64_t* exponent) {
+void ToExponentAndMantissa(double val, int precision, double one_k,
+ std::string* mantissa, int64_t* exponent) {
std::stringstream mantissa_stream;
if (val < 0) {
@@ -43,8 +43,8 @@ void ToExponentAndMantissa(double val, double thresh, int precision,
// Adjust threshold so that it never excludes things which can't be rendered
// in 'precision' digits.
const double adjusted_threshold =
- std::max(thresh, 1.0 / std::pow(10.0, precision));
- const double big_threshold = adjusted_threshold * one_k;
+ std::max(1.0, 1.0 / std::pow(10.0, precision));
+ const double big_threshold = (adjusted_threshold * one_k) - 1;
const double small_threshold = adjusted_threshold;
// Values in ]simple_threshold,small_threshold[ will be printed as-is
const double simple_threshold = 0.01;
@@ -92,37 +92,20 @@ std::string ExponentToPrefix(int64_t exponent, bool iec) {
const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1);
if (index >= kUnitsSize) return "";
- const char* array =
+ const char* const* array =
(exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) : kSmallSIUnits);
- if (iec)
- return array[index] + std::string("i");
- else
- return std::string(1, array[index]);
+
+ return std::string(array[index]);
}
-std::string ToBinaryStringFullySpecified(double value, double threshold,
- int precision, double one_k = 1024.0) {
+std::string ToBinaryStringFullySpecified(double value, int precision,
+ Counter::OneK one_k) {
std::string mantissa;
int64_t exponent;
- ToExponentAndMantissa(value, threshold, precision, one_k, &mantissa,
+ ToExponentAndMantissa(value, precision,
+ one_k == Counter::kIs1024 ? 1024.0 : 1000.0, &mantissa,
&exponent);
- return mantissa + ExponentToPrefix(exponent, false);
-}
-
-} // end namespace
-
-void AppendHumanReadable(int n, std::string* str) {
- std::stringstream ss;
- // Round down to the nearest SI prefix.
- ss << ToBinaryStringFullySpecified(n, 1.0, 0);
- *str += ss.str();
-}
-
-std::string HumanReadableNumber(double n, double one_k) {
- // 1.1 means that figures up to 1.1k should be shown with the next unit down;
- // this softens edge effects.
- // 1 means that we should show one decimal place of precision.
- return ToBinaryStringFullySpecified(n, 1.1, 1, one_k);
+ return mantissa + ExponentToPrefix(exponent, one_k == Counter::kIs1024);
}
std::string StrFormatImp(const char* msg, va_list args) {
@@ -133,28 +116,34 @@ std::string StrFormatImp(const char* msg, va_list args) {
// TODO(ericwf): use std::array for first attempt to avoid one memory
// allocation guess what the size might be
std::array<char, 256> local_buff;
- std::size_t size = local_buff.size();
+
  // 2015-10-08: vsnprintf is used instead of std::vsnprintf due to a limitation
// in the android-ndk
- auto ret = vsnprintf(local_buff.data(), size, msg, args_cp);
+ auto ret = vsnprintf(local_buff.data(), local_buff.size(), msg, args_cp);
va_end(args_cp);
// handle empty expansion
if (ret == 0) return std::string{};
- if (static_cast<std::size_t>(ret) < size)
+ if (static_cast<std::size_t>(ret) < local_buff.size())
return std::string(local_buff.data());
// we did not provide a long enough buffer on our first attempt.
// add 1 to size to account for null-byte in size cast to prevent overflow
- size = static_cast<std::size_t>(ret) + 1;
+ std::size_t size = static_cast<std::size_t>(ret) + 1;
auto buff_ptr = std::unique_ptr<char[]>(new char[size]);
  // 2015-10-08: vsnprintf is used instead of std::vsnprintf due to a limitation
// in the android-ndk
- ret = vsnprintf(buff_ptr.get(), size, msg, args);
+ vsnprintf(buff_ptr.get(), size, msg, args);
return std::string(buff_ptr.get());
}
+} // end namespace
+
+std::string HumanReadableNumber(double n, Counter::OneK one_k) {
+ return ToBinaryStringFullySpecified(n, 1, one_k);
+}
+
std::string StrFormat(const char* format, ...) {
va_list args;
va_start(args, format);
@@ -163,6 +152,19 @@ std::string StrFormat(const char* format, ...) {
return tmp;
}
+std::vector<std::string> StrSplit(const std::string& str, char delim) {
+ if (str.empty()) return {};
+ std::vector<std::string> ret;
+ size_t first = 0;
+ size_t next = str.find(delim);
+ for (; next != std::string::npos;
+ first = next + 1, next = str.find(delim, first)) {
+ ret.push_back(str.substr(first, next - first));
+ }
+ ret.push_back(str.substr(first));
+ return ret;
+}
+
#ifdef BENCHMARK_STL_ANDROID_GNUSTL
/*
* GNU STL in Android NDK lacks support for some C++11 functions, including
@@ -185,11 +187,10 @@ unsigned long stoul(const std::string& str, size_t* pos, int base) {
/* Check for errors and return */
if (strtoulErrno == ERANGE) {
- throw std::out_of_range(
- "stoul failed: " + str + " is outside of range of unsigned long");
+ throw std::out_of_range("stoul failed: " + str +
+ " is outside of range of unsigned long");
} else if (strEnd == strStart || strtoulErrno != 0) {
- throw std::invalid_argument(
- "stoul failed: " + str + " is not an integer");
+ throw std::invalid_argument("stoul failed: " + str + " is not an integer");
}
if (pos != nullptr) {
*pos = static_cast<size_t>(strEnd - strStart);
@@ -212,11 +213,10 @@ int stoi(const std::string& str, size_t* pos, int base) {
/* Check for errors and return */
if (strtolErrno == ERANGE || long(int(result)) != result) {
- throw std::out_of_range(
- "stoul failed: " + str + " is outside of range of int");
+ throw std::out_of_range("stoul failed: " + str +
+ " is outside of range of int");
} else if (strEnd == strStart || strtolErrno != 0) {
- throw std::invalid_argument(
- "stoul failed: " + str + " is not an integer");
+ throw std::invalid_argument("stoul failed: " + str + " is not an integer");
}
if (pos != nullptr) {
*pos = static_cast<size_t>(strEnd - strStart);
@@ -239,11 +239,10 @@ double stod(const std::string& str, size_t* pos) {
/* Check for errors and return */
if (strtodErrno == ERANGE) {
- throw std::out_of_range(
- "stoul failed: " + str + " is outside of range of int");
+ throw std::out_of_range("stoul failed: " + str +
+ " is outside of range of int");
} else if (strEnd == strStart || strtodErrno != 0) {
- throw std::invalid_argument(
- "stoul failed: " + str + " is not an integer");
+ throw std::invalid_argument("stoul failed: " + str + " is not an integer");
}
if (pos != nullptr) {
*pos = static_cast<size_t>(strEnd - strStart);
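
StrSplit introduced above is a plain delimiter split: it preserves empty fields between consecutive delimiters, always emits a final (possibly empty) field, and returns an empty vector only for an empty input. An editorial, self-contained copy with usage checks (the function body mirrors the patch; the asserts are illustrative):

// Editorial sketch: behaviour of the StrSplit helper added above.
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

std::vector<std::string> StrSplit(const std::string& str, char delim) {
  if (str.empty()) return {};
  std::vector<std::string> ret;
  std::size_t first = 0;
  std::size_t next = str.find(delim);
  for (; next != std::string::npos;
       first = next + 1, next = str.find(delim, first)) {
    ret.push_back(str.substr(first, next - first));
  }
  ret.push_back(str.substr(first));
  return ret;
}

int main() {
  assert(StrSplit("L1,L2,L3", ',') ==
         (std::vector<std::string>{"L1", "L2", "L3"}));
  assert(StrSplit("a,,b", ',') == (std::vector<std::string>{"a", "", "b"}));
  assert(StrSplit("", ',').empty());
  return 0;
}
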
diff --git a/src/string_util.h b/src/string_util.h
index 09d7b4b..731aa2c 100644
--- a/src/string_util.h
+++ b/src/string_util.h
@@ -4,14 +4,19 @@
#include <sstream>
#include <string>
#include <utility>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "benchmark/export.h"
+#include "check.h"
#include "internal_macros.h"
namespace benchmark {
-void AppendHumanReadable(int n, std::string* str);
-
-std::string HumanReadableNumber(double n, double one_k = 1024.0);
+BENCHMARK_EXPORT
+std::string HumanReadableNumber(double n, Counter::OneK one_k);
+BENCHMARK_EXPORT
#if defined(__MINGW32__)
__attribute__((format(__MINGW_PRINTF_FORMAT, 1, 2)))
#elif defined(__GNUC__)
@@ -37,6 +42,11 @@ inline std::string StrCat(Args&&... args) {
return ss.str();
}
+BENCHMARK_EXPORT
+std::vector<std::string> StrSplit(const std::string& str, char delim);
+
+// Disable lint checking for this block since it re-implements C functions.
+// NOLINTBEGIN
#ifdef BENCHMARK_STL_ANDROID_GNUSTL
/*
* GNU STL in Android NDK lacks support for some C++11 functions, including
@@ -45,14 +55,15 @@ inline std::string StrCat(Args&&... args) {
* namespace, not std:: namespace.
*/
unsigned long stoul(const std::string& str, size_t* pos = nullptr,
- int base = 10);
+ int base = 10);
int stoi(const std::string& str, size_t* pos = nullptr, int base = 10);
double stod(const std::string& str, size_t* pos = nullptr);
#else
-using std::stoul;
-using std::stoi;
-using std::stod;
+using std::stod; // NOLINT(misc-unused-using-decls)
+using std::stoi; // NOLINT(misc-unused-using-decls)
+using std::stoul; // NOLINT(misc-unused-using-decls)
#endif
+// NOLINTEND
} // end namespace benchmark
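
With the change above, HumanReadableNumber takes a Counter::OneK selector instead of a raw divisor, so the same value renders with SI prefixes for kIs1000 and IEC prefixes (Ki, Mi, ...) for kIs1024. The editorial sketch below shows the prefix-selection idea only; it is not the library's implementation, which also handles small prefixes, negative values, and precision.

// Editorial sketch: SI vs IEC prefix selection, analogous to Counter::OneK.
#include <cstddef>
#include <string>

enum class OneK { kIs1000, kIs1024 };  // stand-in for Counter::OneK

std::string Humanize(double value, OneK one_k) {
  const char* const si[] = {"", "k", "M", "G", "T"};
  const char* const iec[] = {"", "Ki", "Mi", "Gi", "Ti"};
  const double base = one_k == OneK::kIs1024 ? 1024.0 : 1000.0;
  const char* const* prefixes = one_k == OneK::kIs1024 ? iec : si;
  std::size_t idx = 0;
  while (value >= base && idx + 1 < 5) {
    value /= base;
    ++idx;
  }
  return std::to_string(value) + prefixes[idx];
}

// For example, Humanize(2048.0, OneK::kIs1024) yields "2.000000Ki".
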
diff --git a/src/sysinfo.cc b/src/sysinfo.cc
index b30b4f8..922e83a 100644
--- a/src/sysinfo.cc
+++ b/src/sysinfo.cc
@@ -19,10 +19,11 @@
#undef StrCat // Don't let StrCat in string_util.h be renamed to lstrcatA
#include <versionhelpers.h>
#include <windows.h>
+
#include <codecvt>
#else
#include <fcntl.h>
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
#include <sys/resource.h>
#endif
#include <sys/time.h>
@@ -37,10 +38,17 @@
#endif
#if defined(BENCHMARK_OS_SOLARIS)
#include <kstat.h>
+#include <netdb.h>
#endif
#if defined(BENCHMARK_OS_QNX)
#include <sys/syspage.h>
#endif
+#if defined(BENCHMARK_OS_QURT)
+#include <qurt.h>
+#endif
+#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
+#include <pthread.h>
+#endif
#include <algorithm>
#include <array>
@@ -55,17 +63,19 @@
#include <iostream>
#include <iterator>
#include <limits>
+#include <locale>
#include <memory>
+#include <random>
#include <sstream>
-#include <locale>
#include <utility>
+#include "benchmark/benchmark.h"
#include "check.h"
#include "cycleclock.h"
#include "internal_macros.h"
#include "log.h"
-#include "sleep.h"
#include "string_util.h"
+#include "timers.h"
namespace benchmark {
namespace {
@@ -90,67 +100,59 @@ BENCHMARK_NORETURN void PrintErrorAndDie(Args&&... args) {
/// `sysctl` with the result type it's to be interpreted as.
struct ValueUnion {
union DataT {
- uint32_t uint32_value;
- uint64_t uint64_value;
+ int32_t int32_value;
+ int64_t int64_value;
// For correct aliasing of union members from bytes.
char bytes[8];
};
using DataPtr = std::unique_ptr<DataT, decltype(&std::free)>;
// The size of the data union member + its trailing array size.
- size_t Size;
- DataPtr Buff;
+ std::size_t size;
+ DataPtr buff;
public:
- ValueUnion() : Size(0), Buff(nullptr, &std::free) {}
+ ValueUnion() : size(0), buff(nullptr, &std::free) {}
- explicit ValueUnion(size_t BuffSize)
- : Size(sizeof(DataT) + BuffSize),
- Buff(::new (std::malloc(Size)) DataT(), &std::free) {}
+ explicit ValueUnion(std::size_t buff_size)
+ : size(sizeof(DataT) + buff_size),
+ buff(::new (std::malloc(size)) DataT(), &std::free) {}
ValueUnion(ValueUnion&& other) = default;
- explicit operator bool() const { return bool(Buff); }
+ explicit operator bool() const { return bool(buff); }
- char* data() const { return Buff->bytes; }
+ char* data() const { return buff->bytes; }
std::string GetAsString() const { return std::string(data()); }
int64_t GetAsInteger() const {
- if (Size == sizeof(Buff->uint32_value))
- return static_cast<int32_t>(Buff->uint32_value);
- else if (Size == sizeof(Buff->uint64_value))
- return static_cast<int64_t>(Buff->uint64_value);
- BENCHMARK_UNREACHABLE();
- }
-
- uint64_t GetAsUnsigned() const {
- if (Size == sizeof(Buff->uint32_value))
- return Buff->uint32_value;
- else if (Size == sizeof(Buff->uint64_value))
- return Buff->uint64_value;
+ if (size == sizeof(buff->int32_value))
+ return buff->int32_value;
+ else if (size == sizeof(buff->int64_value))
+ return buff->int64_value;
BENCHMARK_UNREACHABLE();
}
template <class T, int N>
std::array<T, N> GetAsArray() {
- const int ArrSize = sizeof(T) * N;
- CHECK_LE(ArrSize, Size);
- std::array<T, N> Arr;
- std::memcpy(Arr.data(), data(), ArrSize);
- return Arr;
+ const int arr_size = sizeof(T) * N;
+ BM_CHECK_LE(arr_size, size);
+ std::array<T, N> arr;
+ std::memcpy(arr.data(), data(), arr_size);
+ return arr;
}
};
-ValueUnion GetSysctlImp(std::string const& Name) {
+ValueUnion GetSysctlImp(std::string const& name) {
#if defined BENCHMARK_OS_OPENBSD
int mib[2];
mib[0] = CTL_HW;
- if ((Name == "hw.ncpu") || (Name == "hw.cpuspeed")){
+ if ((name == "hw.ncpu") || (name == "hw.cpuspeed")) {
ValueUnion buff(sizeof(int));
- if (Name == "hw.ncpu") {
+ if (name == "hw.ncpu") {
mib[1] = HW_NCPU;
} else {
mib[1] = HW_CPUSPEED;
@@ -163,41 +165,41 @@ ValueUnion GetSysctlImp(std::string const& Name) {
}
return ValueUnion();
#else
- size_t CurBuffSize = 0;
- if (sysctlbyname(Name.c_str(), nullptr, &CurBuffSize, nullptr, 0) == -1)
+ std::size_t cur_buff_size = 0;
+ if (sysctlbyname(name.c_str(), nullptr, &cur_buff_size, nullptr, 0) == -1)
return ValueUnion();
- ValueUnion buff(CurBuffSize);
- if (sysctlbyname(Name.c_str(), buff.data(), &buff.Size, nullptr, 0) == 0)
+ ValueUnion buff(cur_buff_size);
+ if (sysctlbyname(name.c_str(), buff.data(), &buff.size, nullptr, 0) == 0)
return buff;
return ValueUnion();
#endif
}
BENCHMARK_MAYBE_UNUSED
-bool GetSysctl(std::string const& Name, std::string* Out) {
- Out->clear();
- auto Buff = GetSysctlImp(Name);
- if (!Buff) return false;
- Out->assign(Buff.data());
+bool GetSysctl(std::string const& name, std::string* out) {
+ out->clear();
+ auto buff = GetSysctlImp(name);
+ if (!buff) return false;
+ out->assign(buff.data());
return true;
}
template <class Tp,
class = typename std::enable_if<std::is_integral<Tp>::value>::type>
-bool GetSysctl(std::string const& Name, Tp* Out) {
- *Out = 0;
- auto Buff = GetSysctlImp(Name);
- if (!Buff) return false;
- *Out = static_cast<Tp>(Buff.GetAsUnsigned());
+bool GetSysctl(std::string const& name, Tp* out) {
+ *out = 0;
+ auto buff = GetSysctlImp(name);
+ if (!buff) return false;
+ *out = static_cast<Tp>(buff.GetAsInteger());
return true;
}
template <class Tp, size_t N>
-bool GetSysctl(std::string const& Name, std::array<Tp, N>* Out) {
- auto Buff = GetSysctlImp(Name);
- if (!Buff) return false;
- *Out = Buff.GetAsArray<Tp, N>();
+bool GetSysctl(std::string const& name, std::array<Tp, N>* out) {
+ auto buff = GetSysctlImp(name);
+ if (!buff) return false;
+ *out = buff.GetAsArray<Tp, N>();
return true;
}
#endif
@@ -214,10 +216,9 @@ bool ReadFromFile(std::string const& fname, ArgT* arg) {
CPUInfo::Scaling CpuScaling(int num_cpus) {
// We don't have a valid CPU count, so don't even bother.
if (num_cpus <= 0) return CPUInfo::Scaling::UNKNOWN;
-#ifdef BENCHMARK_OS_QNX
+#if defined(BENCHMARK_OS_QNX)
return CPUInfo::Scaling::UNKNOWN;
-#endif
-#ifndef BENCHMARK_OS_WINDOWS
+#elif !defined(BENCHMARK_OS_WINDOWS)
// On Linux, the CPUfreq subsystem exposes CPU information as files on the
// local file system. If reading the exported files fails, then we may not be
// running on Linux, so we silently ignore all the read errors.
@@ -225,28 +226,30 @@ CPUInfo::Scaling CpuScaling(int num_cpus) {
for (int cpu = 0; cpu < num_cpus; ++cpu) {
std::string governor_file =
StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor");
- if (ReadFromFile(governor_file, &res) && res != "performance") return CPUInfo::Scaling::ENABLED;
+ if (ReadFromFile(governor_file, &res) && res != "performance")
+ return CPUInfo::Scaling::ENABLED;
}
return CPUInfo::Scaling::DISABLED;
-#endif
+#else
return CPUInfo::Scaling::UNKNOWN;
+#endif
}
-int CountSetBitsInCPUMap(std::string Val) {
- auto CountBits = [](std::string Part) {
+int CountSetBitsInCPUMap(std::string val) {
+ auto CountBits = [](std::string part) {
using CPUMask = std::bitset<sizeof(std::uintptr_t) * CHAR_BIT>;
- Part = "0x" + Part;
- CPUMask Mask(benchmark::stoul(Part, nullptr, 16));
- return static_cast<int>(Mask.count());
+ part = "0x" + part;
+ CPUMask mask(benchmark::stoul(part, nullptr, 16));
+ return static_cast<int>(mask.count());
};
- size_t Pos;
+ std::size_t pos;
int total = 0;
- while ((Pos = Val.find(',')) != std::string::npos) {
- total += CountBits(Val.substr(0, Pos));
- Val = Val.substr(Pos + 1);
+ while ((pos = val.find(',')) != std::string::npos) {
+ total += CountBits(val.substr(0, pos));
+ val = val.substr(pos + 1);
}
- if (!Val.empty()) {
- total += CountBits(Val);
+ if (!val.empty()) {
+ total += CountBits(val);
}
return total;
}
@@ -255,16 +258,16 @@ BENCHMARK_MAYBE_UNUSED
std::vector<CPUInfo::CacheInfo> GetCacheSizesFromKVFS() {
std::vector<CPUInfo::CacheInfo> res;
std::string dir = "/sys/devices/system/cpu/cpu0/cache/";
- int Idx = 0;
+ int idx = 0;
while (true) {
CPUInfo::CacheInfo info;
- std::string FPath = StrCat(dir, "index", Idx++, "/");
- std::ifstream f(StrCat(FPath, "size").c_str());
+ std::string fpath = StrCat(dir, "index", idx++, "/");
+ std::ifstream f(StrCat(fpath, "size").c_str());
if (!f.is_open()) break;
std::string suffix;
f >> info.size;
if (f.fail())
- PrintErrorAndDie("Failed while reading file '", FPath, "size'");
+ PrintErrorAndDie("Failed while reading file '", fpath, "size'");
if (f.good()) {
f >> suffix;
if (f.bad())
@@ -275,13 +278,13 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesFromKVFS() {
else if (suffix == "K")
info.size *= 1024;
}
- if (!ReadFromFile(StrCat(FPath, "type"), &info.type))
- PrintErrorAndDie("Failed to read from file ", FPath, "type");
- if (!ReadFromFile(StrCat(FPath, "level"), &info.level))
- PrintErrorAndDie("Failed to read from file ", FPath, "level");
+ if (!ReadFromFile(StrCat(fpath, "type"), &info.type))
+ PrintErrorAndDie("Failed to read from file ", fpath, "type");
+ if (!ReadFromFile(StrCat(fpath, "level"), &info.level))
+ PrintErrorAndDie("Failed to read from file ", fpath, "level");
std::string map_str;
- if (!ReadFromFile(StrCat(FPath, "shared_cpu_map"), &map_str))
- PrintErrorAndDie("Failed to read from file ", FPath, "shared_cpu_map");
+ if (!ReadFromFile(StrCat(fpath, "shared_cpu_map"), &map_str))
+ PrintErrorAndDie("Failed to read from file ", fpath, "shared_cpu_map");
info.num_sharing = CountSetBitsInCPUMap(map_str);
res.push_back(info);
}
@@ -292,26 +295,26 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesFromKVFS() {
#ifdef BENCHMARK_OS_MACOSX
std::vector<CPUInfo::CacheInfo> GetCacheSizesMacOSX() {
std::vector<CPUInfo::CacheInfo> res;
- std::array<uint64_t, 4> CacheCounts{{0, 0, 0, 0}};
- GetSysctl("hw.cacheconfig", &CacheCounts);
+ std::array<int, 4> cache_counts{{0, 0, 0, 0}};
+ GetSysctl("hw.cacheconfig", &cache_counts);
struct {
std::string name;
std::string type;
int level;
- uint64_t num_sharing;
- } Cases[] = {{"hw.l1dcachesize", "Data", 1, CacheCounts[1]},
- {"hw.l1icachesize", "Instruction", 1, CacheCounts[1]},
- {"hw.l2cachesize", "Unified", 2, CacheCounts[2]},
- {"hw.l3cachesize", "Unified", 3, CacheCounts[3]}};
- for (auto& C : Cases) {
+ int num_sharing;
+ } cases[] = {{"hw.l1dcachesize", "Data", 1, cache_counts[1]},
+ {"hw.l1icachesize", "Instruction", 1, cache_counts[1]},
+ {"hw.l2cachesize", "Unified", 2, cache_counts[2]},
+ {"hw.l3cachesize", "Unified", 3, cache_counts[3]}};
+ for (auto& c : cases) {
int val;
- if (!GetSysctl(C.name, &val)) continue;
+ if (!GetSysctl(c.name, &val)) continue;
CPUInfo::CacheInfo info;
- info.type = C.type;
- info.level = C.level;
+ info.type = c.type;
+ info.level = c.level;
info.size = val;
- info.num_sharing = static_cast<int>(C.num_sharing);
+ info.num_sharing = c.num_sharing;
res.push_back(std::move(info));
}
return res;
@@ -325,7 +328,7 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
using UPtr = std::unique_ptr<PInfo, decltype(&std::free)>;
GetLogicalProcessorInformation(nullptr, &buffer_size);
- UPtr buff((PInfo*)malloc(buffer_size), &std::free);
+ UPtr buff(static_cast<PInfo*>(std::malloc(buffer_size)), &std::free);
if (!GetLogicalProcessorInformation(buff.get(), &buffer_size))
PrintErrorAndDie("Failed during call to GetLogicalProcessorInformation: ",
GetLastError());
@@ -336,15 +339,16 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
for (; it != end; ++it) {
if (it->Relationship != RelationCache) continue;
using BitSet = std::bitset<sizeof(ULONG_PTR) * CHAR_BIT>;
- BitSet B(it->ProcessorMask);
+ BitSet b(it->ProcessorMask);
// To prevent duplicates, only consider caches where CPU 0 is specified
- if (!B.test(0)) continue;
- CInfo* Cache = &it->Cache;
+ if (!b.test(0)) continue;
+ const CInfo& cache = it->Cache;
CPUInfo::CacheInfo C;
- C.num_sharing = static_cast<int>(B.count());
- C.level = Cache->Level;
- C.size = Cache->Size;
- switch (Cache->Type) {
+ C.num_sharing = static_cast<int>(b.count());
+ C.level = cache.Level;
+ C.size = cache.Size;
+ C.type = "Unknown";
+ switch (cache.Type) {
case CacheUnified:
C.type = "Unified";
break;
@@ -357,9 +361,6 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
case CacheTrace:
C.type = "Trace";
break;
- default:
- C.type = "Unknown";
- break;
}
res.push_back(C);
}
@@ -368,29 +369,29 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
#elif BENCHMARK_OS_QNX
std::vector<CPUInfo::CacheInfo> GetCacheSizesQNX() {
std::vector<CPUInfo::CacheInfo> res;
- struct cacheattr_entry *cache = SYSPAGE_ENTRY(cacheattr);
+ struct cacheattr_entry* cache = SYSPAGE_ENTRY(cacheattr);
uint32_t const elsize = SYSPAGE_ELEMENT_SIZE(cacheattr);
- int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize ;
- for(int i = 0; i < num; ++i ) {
+ int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize;
+ for (int i = 0; i < num; ++i) {
CPUInfo::CacheInfo info;
- switch (cache->flags){
- case CACHE_FLAG_INSTR :
+ switch (cache->flags) {
+ case CACHE_FLAG_INSTR:
info.type = "Instruction";
info.level = 1;
break;
- case CACHE_FLAG_DATA :
+ case CACHE_FLAG_DATA:
info.type = "Data";
info.level = 1;
break;
- case CACHE_FLAG_UNIFIED :
+ case CACHE_FLAG_UNIFIED:
info.type = "Unified";
info.level = 2;
break;
- case CACHE_FLAG_SHARED :
+ case CACHE_FLAG_SHARED:
info.type = "Shared";
info.level = 3;
break;
- default :
+ default:
continue;
break;
}
@@ -410,6 +411,8 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizes() {
return GetCacheSizesWindows();
#elif defined(BENCHMARK_OS_QNX)
return GetCacheSizesQNX();
+#elif defined(BENCHMARK_OS_QURT)
+ return std::vector<CPUInfo::CacheInfo>();
#else
return GetCacheSizesFromKVFS();
#endif
@@ -418,24 +421,32 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizes() {
std::string GetSystemName() {
#if defined(BENCHMARK_OS_WINDOWS)
std::string str;
- const unsigned COUNT = MAX_COMPUTERNAME_LENGTH+1;
- TCHAR hostname[COUNT] = {'\0'};
+ static constexpr int COUNT = MAX_COMPUTERNAME_LENGTH + 1;
+ TCHAR hostname[COUNT] = {'\0'};
DWORD DWCOUNT = COUNT;
- if (!GetComputerName(hostname, &DWCOUNT))
- return std::string("");
+ if (!GetComputerName(hostname, &DWCOUNT)) return std::string("");
#ifndef UNICODE
str = std::string(hostname, DWCOUNT);
#else
- //Using wstring_convert, Is deprecated in C++17
- using convert_type = std::codecvt_utf8<wchar_t>;
- std::wstring_convert<convert_type, wchar_t> converter;
- std::wstring wStr(hostname, DWCOUNT);
- str = converter.to_bytes(wStr);
+ // `WideCharToMultiByte` returns `0` when conversion fails.
+ int len = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, hostname,
+ DWCOUNT, NULL, 0, NULL, NULL);
+ str.resize(len);
+ WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, hostname, DWCOUNT, &str[0],
+ str.size(), NULL, NULL);
#endif
return str;
-#else // defined(BENCHMARK_OS_WINDOWS)
+#elif defined(BENCHMARK_OS_QURT)
+ std::string str = "Hexagon DSP";
+ qurt_arch_version_t arch_version_struct;
+ if (qurt_sysenv_get_arch_version(&arch_version_struct) == QURT_EOK) {
+ str += " v";
+ str += std::to_string(arch_version_struct.arch_version);
+ }
+ return str;
+#else
#ifndef HOST_NAME_MAX
-#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac Doesnt have HOST_NAME_MAX defined
+#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac doesn't have HOST_NAME_MAX defined
#define HOST_NAME_MAX 64
#elif defined(BENCHMARK_OS_NACL)
#define HOST_NAME_MAX 64
@@ -443,22 +454,24 @@ std::string GetSystemName() {
#define HOST_NAME_MAX 154
#elif defined(BENCHMARK_OS_RTEMS)
#define HOST_NAME_MAX 256
+#elif defined(BENCHMARK_OS_SOLARIS)
+#define HOST_NAME_MAX MAXHOSTNAMELEN
#else
-#warning "HOST_NAME_MAX not defined. using 64"
+#pragma message("HOST_NAME_MAX not defined. using 64")
#define HOST_NAME_MAX 64
#endif
-#endif // def HOST_NAME_MAX
+#endif // def HOST_NAME_MAX
char hostname[HOST_NAME_MAX];
int retVal = gethostname(hostname, HOST_NAME_MAX);
if (retVal != 0) return std::string("");
return std::string(hostname);
-#endif // Catch-all POSIX block.
+#endif // Catch-all POSIX block.
}
int GetNumCPUs() {
#ifdef BENCHMARK_HAS_SYSCTL
- int NumCPU = -1;
- if (GetSysctl("hw.ncpu", &NumCPU)) return NumCPU;
+ int num_cpu = -1;
+ if (GetSysctl("hw.ncpu", &num_cpu)) return num_cpu;
fprintf(stderr, "Err: %s\n", strerror(errno));
std::exit(EXIT_FAILURE);
#elif defined(BENCHMARK_OS_WINDOWS)
@@ -472,18 +485,23 @@ int GetNumCPUs() {
// group
#elif defined(BENCHMARK_OS_SOLARIS)
// Returns -1 in case of a failure.
- int NumCPU = sysconf(_SC_NPROCESSORS_ONLN);
- if (NumCPU < 0) {
- fprintf(stderr,
- "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n",
+ long num_cpu = sysconf(_SC_NPROCESSORS_ONLN);
+ if (num_cpu < 0) {
+ fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n",
strerror(errno));
}
- return NumCPU;
+ return (int)num_cpu;
#elif defined(BENCHMARK_OS_QNX)
return static_cast<int>(_syspage_ptr->num_cpu);
+#elif defined(BENCHMARK_OS_QURT)
+ qurt_sysenv_max_hthreads_t hardware_threads;
+ if (qurt_sysenv_get_max_hw_threads(&hardware_threads) != QURT_EOK) {
+ hardware_threads.max_hthreads = 1;
+ }
+ return hardware_threads.max_hthreads;
#else
- int NumCPUs = 0;
- int MaxID = -1;
+ int num_cpus = 0;
+ int max_id = -1;
std::ifstream f("/proc/cpuinfo");
if (!f.is_open()) {
std::cerr << "failed to open /proc/cpuinfo\n";
@@ -493,20 +511,21 @@ int GetNumCPUs() {
std::string ln;
while (std::getline(f, ln)) {
if (ln.empty()) continue;
- size_t SplitIdx = ln.find(':');
+ std::size_t split_idx = ln.find(':');
std::string value;
#if defined(__s390__)
// s390 has another format in /proc/cpuinfo
// it needs to be parsed differently
- if (SplitIdx != std::string::npos) value = ln.substr(Key.size()+1,SplitIdx-Key.size()-1);
+ if (split_idx != std::string::npos)
+ value = ln.substr(Key.size() + 1, split_idx - Key.size() - 1);
#else
- if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1);
+ if (split_idx != std::string::npos) value = ln.substr(split_idx + 1);
#endif
if (ln.size() >= Key.size() && ln.compare(0, Key.size(), Key) == 0) {
- NumCPUs++;
+ num_cpus++;
if (!value.empty()) {
- int CurID = benchmark::stoi(value);
- MaxID = std::max(CurID, MaxID);
+ const int cur_id = benchmark::stoi(value);
+ max_id = std::max(cur_id, max_id);
}
}
}
@@ -520,17 +539,95 @@ int GetNumCPUs() {
}
f.close();
- if ((MaxID + 1) != NumCPUs) {
+ if ((max_id + 1) != num_cpus) {
fprintf(stderr,
"CPU ID assignments in /proc/cpuinfo seem messed up."
" This is usually caused by a bad BIOS.\n");
}
- return NumCPUs;
+ return num_cpus;
#endif
BENCHMARK_UNREACHABLE();
}
-double GetCPUCyclesPerSecond() {
+class ThreadAffinityGuard final {
+ public:
+ ThreadAffinityGuard() : reset_affinity(SetAffinity()) {
+ if (!reset_affinity)
+ std::cerr << "***WARNING*** Failed to set thread affinity. Estimated CPU "
+ "frequency may be incorrect."
+ << std::endl;
+ }
+
+ ~ThreadAffinityGuard() {
+ if (!reset_affinity) return;
+
+#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
+ int ret = pthread_setaffinity_np(self, sizeof(previous_affinity),
+ &previous_affinity);
+ if (ret == 0) return;
+#elif defined(BENCHMARK_OS_WINDOWS_WIN32)
+ DWORD_PTR ret = SetThreadAffinityMask(self, previous_affinity);
+ if (ret != 0) return;
+#endif // def BENCHMARK_HAS_PTHREAD_AFFINITY
+ PrintErrorAndDie("Failed to reset thread affinity");
+ }
+
+ ThreadAffinityGuard(ThreadAffinityGuard&&) = delete;
+ ThreadAffinityGuard(const ThreadAffinityGuard&) = delete;
+ ThreadAffinityGuard& operator=(ThreadAffinityGuard&&) = delete;
+ ThreadAffinityGuard& operator=(const ThreadAffinityGuard&) = delete;
+
+ private:
+ bool SetAffinity() {
+#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
+ int ret;
+ self = pthread_self();
+ ret = pthread_getaffinity_np(self, sizeof(previous_affinity),
+ &previous_affinity);
+ if (ret != 0) return false;
+
+ cpu_set_t affinity;
+ memcpy(&affinity, &previous_affinity, sizeof(affinity));
+
+ bool is_first_cpu = true;
+
+ for (int i = 0; i < CPU_SETSIZE; ++i)
+ if (CPU_ISSET(i, &affinity)) {
+ if (is_first_cpu)
+ is_first_cpu = false;
+ else
+ CPU_CLR(i, &affinity);
+ }
+
+ if (is_first_cpu) return false;
+
+ ret = pthread_setaffinity_np(self, sizeof(affinity), &affinity);
+ return ret == 0;
+#elif defined(BENCHMARK_OS_WINDOWS_WIN32)
+ self = GetCurrentThread();
+ DWORD_PTR mask = static_cast<DWORD_PTR>(1) << GetCurrentProcessorNumber();
+ previous_affinity = SetThreadAffinityMask(self, mask);
+ return previous_affinity != 0;
+#else
+ return false;
+#endif // def BENCHMARK_HAS_PTHREAD_AFFINITY
+ }
+
+#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
+ pthread_t self;
+ cpu_set_t previous_affinity;
+#elif defined(BENCHMARK_OS_WINDOWS_WIN32)
+ HANDLE self;
+ DWORD_PTR previous_affinity;
+#endif // def BENCHMARK_HAS_PTHREAD_AFFINITY
+ bool reset_affinity;
+};
+
+double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
+  // Currently, scaling is only used on the Linux path here;
+  // suppress diagnostics about it being unused on other paths.
+ (void)scaling;
+
#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
long freq;
@@ -541,8 +638,15 @@ double GetCPUCyclesPerSecond() {
// cannot always be relied upon. The same reasons apply to /proc/cpuinfo as
// well.
if (ReadFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)
- // If CPU scaling is in effect, we want to use the *maximum* frequency,
- // not whatever CPU speed some random processor happens to be using now.
+ // If CPU scaling is disabled, use the *current* frequency.
+ // Note that we specifically don't want to read cpuinfo_cur_freq,
+ // because it is only readable by root.
+ || (scaling == CPUInfo::Scaling::DISABLED &&
+ ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq",
+ &freq))
+ // Otherwise, if CPU scaling may be in effect, we want to use
+ // the *maximum* frequency, not whatever CPU speed some random processor
+ // happens to be using now.
|| ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
&freq)) {
// The value is in kHz (as the file name suggests). For example, on a
@@ -559,7 +663,7 @@ double GetCPUCyclesPerSecond() {
return error_value;
}
- auto startsWithKey = [](std::string const& Value, std::string const& Key) {
+ auto StartsWithKey = [](std::string const& Value, std::string const& Key) {
if (Key.size() > Value.size()) return false;
auto Cmp = [&](char X, char Y) {
return std::tolower(X) == std::tolower(Y);
@@ -570,18 +674,18 @@ double GetCPUCyclesPerSecond() {
std::string ln;
while (std::getline(f, ln)) {
if (ln.empty()) continue;
- size_t SplitIdx = ln.find(':');
+ std::size_t split_idx = ln.find(':');
std::string value;
- if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1);
+ if (split_idx != std::string::npos) value = ln.substr(split_idx + 1);
// When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only
// accept positive values. Some environments (virtual machines) report zero,
// which would cause infinite looping in WallTime_Init.
- if (startsWithKey(ln, "cpu MHz")) {
+ if (StartsWithKey(ln, "cpu MHz")) {
if (!value.empty()) {
double cycles_per_second = benchmark::stod(value) * 1000000.0;
if (cycles_per_second > 0) return cycles_per_second;
}
- } else if (startsWithKey(ln, "bogomips")) {
+ } else if (StartsWithKey(ln, "bogomips")) {
if (!value.empty()) {
bogo_clock = benchmark::stod(value) * 1000000.0;
if (bogo_clock < 0.0) bogo_clock = error_value;
@@ -603,7 +707,7 @@ double GetCPUCyclesPerSecond() {
if (bogo_clock >= 0.0) return bogo_clock;
#elif defined BENCHMARK_HAS_SYSCTL
- constexpr auto* FreqStr =
+ constexpr auto* freqStr =
#if defined(BENCHMARK_OS_FREEBSD) || defined(BENCHMARK_OS_NETBSD)
"machdep.tsc_freq";
#elif defined BENCHMARK_OS_OPENBSD
@@ -615,14 +719,17 @@ double GetCPUCyclesPerSecond() {
#endif
unsigned long long hz = 0;
#if defined BENCHMARK_OS_OPENBSD
- if (GetSysctl(FreqStr, &hz)) return hz * 1000000;
+ if (GetSysctl(freqStr, &hz)) return hz * 1000000;
#else
- if (GetSysctl(FreqStr, &hz)) return hz;
+ if (GetSysctl(freqStr, &hz)) return hz;
#endif
fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
- FreqStr, strerror(errno));
+ freqStr, strerror(errno));
+ fprintf(stderr,
+ "This does not affect benchmark measurements, only the "
+ "metadata output.\n");
-#elif defined BENCHMARK_OS_WINDOWS
+#elif defined BENCHMARK_OS_WINDOWS_WIN32
// In NT, read MHz from the registry. If we fail to do so or we're in win9x
// then make a crude estimate.
DWORD data, data_size = sizeof(data);
@@ -631,15 +738,16 @@ double GetCPUCyclesPerSecond() {
SHGetValueA(HKEY_LOCAL_MACHINE,
"HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
"~MHz", nullptr, &data, &data_size)))
- return static_cast<double>((int64_t)data *
- (int64_t)(1000 * 1000)); // was mhz
-#elif defined (BENCHMARK_OS_SOLARIS)
- kstat_ctl_t *kc = kstat_open();
+ return static_cast<double>(static_cast<int64_t>(data) *
+ static_cast<int64_t>(1000 * 1000)); // was mhz
+#elif defined(BENCHMARK_OS_SOLARIS)
+ kstat_ctl_t* kc = kstat_open();
if (!kc) {
std::cerr << "failed to open /dev/kstat\n";
return -1;
}
- kstat_t *ksp = kstat_lookup(kc, (char*)"cpu_info", -1, (char*)"cpu_info0");
+ kstat_t* ksp = kstat_lookup(kc, const_cast<char*>("cpu_info"), -1,
+ const_cast<char*>("cpu_info0"));
if (!ksp) {
std::cerr << "failed to lookup in /dev/kstat\n";
return -1;
@@ -648,8 +756,8 @@ double GetCPUCyclesPerSecond() {
std::cerr << "failed to read from /dev/kstat\n";
return -1;
}
- kstat_named_t *knp =
- (kstat_named_t*)kstat_data_lookup(ksp, (char*)"current_clock_Hz");
+ kstat_named_t* knp = (kstat_named_t*)kstat_data_lookup(
+ ksp, const_cast<char*>("current_clock_Hz"));
if (!knp) {
std::cerr << "failed to lookup data in /dev/kstat\n";
return -1;
@@ -662,23 +770,55 @@ double GetCPUCyclesPerSecond() {
double clock_hz = knp->value.ui64;
kstat_close(kc);
return clock_hz;
-#elif defined (BENCHMARK_OS_QNX)
+#elif defined(BENCHMARK_OS_QNX)
return static_cast<double>((int64_t)(SYSPAGE_ENTRY(cpuinfo)->speed) *
(int64_t)(1000 * 1000));
+#elif defined(BENCHMARK_OS_QURT)
+ // QuRT doesn't provide any API to query Hexagon frequency.
+ return 1000000000;
#endif
// If we've fallen through, attempt to roughly estimate the CPU clock rate.
- const int estimate_time_ms = 1000;
+
+ // Make sure to use the same cycle counter when starting and stopping the
+ // cycle timer. We just pin the current thread to a cpu in the previous
+ // affinity set.
+ ThreadAffinityGuard affinity_guard;
+
+ static constexpr double estimate_time_s = 1.0;
+ const double start_time = ChronoClockNow();
const auto start_ticks = cycleclock::Now();
- SleepForMilliseconds(estimate_time_ms);
- return static_cast<double>(cycleclock::Now() - start_ticks);
+
+ // Impose load instead of calling sleep() to make sure the cycle counter
+ // works.
+ using PRNG = std::minstd_rand;
+ using Result = PRNG::result_type;
+ PRNG rng(static_cast<Result>(start_ticks));
+
+ Result state = 0;
+
+ do {
+ static constexpr size_t batch_size = 10000;
+ rng.discard(batch_size);
+ state += rng();
+
+ } while (ChronoClockNow() - start_time < estimate_time_s);
+
+ DoNotOptimize(state);
+
+ const auto end_ticks = cycleclock::Now();
+ const double end_time = ChronoClockNow();
+
+ return static_cast<double>(end_ticks - start_ticks) / (end_time - start_time);
+  // Reset the affinity of the current thread when the lifetime of
+  // affinity_guard ends.
}
std::vector<double> GetLoadAvg() {
#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) || \
defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD || \
defined BENCHMARK_OS_OPENBSD || defined BENCHMARK_OS_DRAGONFLY) && \
- !defined(__ANDROID__)
- constexpr int kMaxSamples = 3;
+ !(defined(__ANDROID__) && __ANDROID_API__ < 29)
+ static constexpr int kMaxSamples = 3;
std::vector<double> res(kMaxSamples, 0.0);
const int nelem = getloadavg(res.data(), kMaxSamples);
if (nelem < 1) {
@@ -701,12 +841,11 @@ const CPUInfo& CPUInfo::Get() {
CPUInfo::CPUInfo()
: num_cpus(GetNumCPUs()),
- cycles_per_second(GetCPUCyclesPerSecond()),
- caches(GetCacheSizes()),
scaling(CpuScaling(num_cpus)),
+ cycles_per_second(GetCPUCyclesPerSecond(scaling)),
+ caches(GetCacheSizes()),
load_avg(GetLoadAvg()) {}
-
const SystemInfo& SystemInfo::Get() {
static const SystemInfo* info = new SystemInfo();
return *info;
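
The new fallback above estimates the clock rate by pinning the thread (ThreadAffinityGuard), spinning a PRNG for about one second of wall time, and dividing elapsed cycle-counter ticks by elapsed seconds. The editorial sketch below shows the same idea with standard facilities only; it is not the library's implementation. __rdtsc() is an x86 GCC/Clang-specific stand-in for cycleclock::Now(), std::chrono::steady_clock stands in for ChronoClockNow(), and thread pinning is omitted.

// Editorial sketch: estimate cycles/second by timing a busy loop (x86 only).
#include <chrono>
#include <cstdint>
#include <random>
#include <x86intrin.h>

double EstimateCyclesPerSecond() {
  const auto start_time = std::chrono::steady_clock::now();
  const std::uint64_t start_ticks = __rdtsc();

  // Impose load instead of sleeping, mirroring the code above, so the cycle
  // counter is exercised for the whole interval.
  std::minstd_rand rng(
      static_cast<std::minstd_rand::result_type>(start_ticks));
  std::minstd_rand::result_type state = 0;
  double elapsed_s = 0.0;
  do {
    rng.discard(10000);
    state += rng();
    elapsed_s = std::chrono::duration<double>(std::chrono::steady_clock::now() -
                                              start_time)
                    .count();
  } while (elapsed_s < 1.0);

  const std::uint64_t end_ticks = __rdtsc();
  volatile std::minstd_rand::result_type sink = state;  // keep the loop alive
  (void)sink;
  return static_cast<double>(end_ticks - start_ticks) / elapsed_s;
}
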
diff --git a/src/thread_manager.h b/src/thread_manager.h
index 28e2dd5..819b3c4 100644
--- a/src/thread_manager.h
+++ b/src/thread_manager.h
@@ -36,7 +36,6 @@ class ThreadManager {
[this]() { return alive_threads_ == 0; });
}
- public:
struct Result {
IterationCount iterations = 0;
double real_time_used = 0;
@@ -44,8 +43,8 @@ class ThreadManager {
double manual_time_used = 0;
int64_t complexity_n = 0;
std::string report_label_;
- std::string error_message_;
- bool has_error_ = false;
+ std::string skip_message_;
+ internal::Skipped skipped_ = internal::NotSkipped;
UserCounters counters;
};
GUARDED_BY(GetBenchmarkMutex()) Result results;
diff --git a/src/thread_timer.h b/src/thread_timer.h
index 1703ca0..eb23f59 100644
--- a/src/thread_timer.h
+++ b/src/thread_timer.h
@@ -28,7 +28,7 @@ class ThreadTimer {
// Called by each thread
void StopTimer() {
- CHECK(running_);
+ BM_CHECK(running_);
running_ = false;
real_time_used_ += ChronoClockNow() - start_real_time_;
// Floating point error can result in the subtraction producing a negative
@@ -44,19 +44,19 @@ class ThreadTimer {
// REQUIRES: timer is not running
double real_time_used() const {
- CHECK(!running_);
+ BM_CHECK(!running_);
return real_time_used_;
}
// REQUIRES: timer is not running
double cpu_time_used() const {
- CHECK(!running_);
+ BM_CHECK(!running_);
return cpu_time_used_;
}
// REQUIRES: timer is not running
double manual_time_used() const {
- CHECK(!running_);
+ BM_CHECK(!running_);
return manual_time_used_;
}
diff --git a/src/timers.cc b/src/timers.cc
index 1d3ab9a..b23feea 100644
--- a/src/timers.cc
+++ b/src/timers.cc
@@ -13,6 +13,7 @@
// limitations under the License.
#include "timers.h"
+
#include "internal_macros.h"
#ifdef BENCHMARK_OS_WINDOWS
@@ -22,7 +23,7 @@
#include <windows.h>
#else
#include <fcntl.h>
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
#include <sys/resource.h>
#endif
#include <sys/time.h>
@@ -37,6 +38,9 @@
#include <mach/mach_port.h>
#include <mach/thread_act.h>
#endif
+#if defined(BENCHMARK_OS_QURT)
+#include <qurt.h>
+#endif
#endif
#ifdef BENCHMARK_OS_EMSCRIPTEN
@@ -55,7 +59,6 @@
#include "check.h"
#include "log.h"
-#include "sleep.h"
#include "string_util.h"
namespace benchmark {
@@ -64,6 +67,9 @@ namespace benchmark {
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wunused-function"
#endif
+#if defined(__NVCOMPILER)
+#pragma diag_suppress declared_but_not_referenced
+#endif
namespace {
#if defined(BENCHMARK_OS_WINDOWS)
@@ -78,7 +84,7 @@ double MakeTime(FILETIME const& kernel_time, FILETIME const& user_time) {
static_cast<double>(user.QuadPart)) *
1e-7;
}
-#elif !defined(BENCHMARK_OS_FUCHSIA)
+#elif !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
double MakeTime(struct rusage const& ru) {
return (static_cast<double>(ru.ru_utime.tv_sec) +
static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
@@ -118,15 +124,19 @@ double ProcessCPUUsage() {
&user_time))
return MakeTime(kernel_time, user_time);
DiagnoseAndExit("GetProccessTimes() failed");
+#elif defined(BENCHMARK_OS_QURT)
+ return static_cast<double>(
+ qurt_timer_timetick_to_us(qurt_timer_get_ticks())) *
+ 1.0e-6;
#elif defined(BENCHMARK_OS_EMSCRIPTEN)
// clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) returns 0 on Emscripten.
// Use Emscripten-specific API. Reported CPU time would be exactly the
// same as total time, but this is ok because there aren't long-latency
- // syncronous system calls in Emscripten.
+ // synchronous system calls in Emscripten.
return emscripten_get_now() * 1e-3;
#elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
- // FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
- // https://github.com/google/benchmark/pull/292
+ // FIXME We want to use clock_gettime, but its not available in MacOS 10.11.
+ // See https://github.com/google/benchmark/pull/292
struct timespec spec;
if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
return MakeTime(spec);
@@ -148,14 +158,19 @@ double ThreadCPUUsage() {
GetThreadTimes(this_thread, &creation_time, &exit_time, &kernel_time,
&user_time);
return MakeTime(kernel_time, user_time);
+#elif defined(BENCHMARK_OS_QURT)
+ return static_cast<double>(
+ qurt_timer_timetick_to_us(qurt_timer_get_ticks())) *
+ 1.0e-6;
#elif defined(BENCHMARK_OS_MACOSX)
- // FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
- // https://github.com/google/benchmark/pull/292
+ // FIXME We want to use clock_gettime, but its not available in MacOS 10.11.
+ // See https://github.com/google/benchmark/pull/292
mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
thread_basic_info_data_t info;
mach_port_t thread = pthread_mach_thread_np(pthread_self());
- if (thread_info(thread, THREAD_BASIC_INFO, (thread_info_t)&info, &count) ==
- KERN_SUCCESS) {
+ if (thread_info(thread, THREAD_BASIC_INFO,
+ reinterpret_cast<thread_info_t>(&info),
+ &count) == KERN_SUCCESS) {
return MakeTime(info);
}
DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info");
@@ -190,15 +205,26 @@ std::string LocalDateTimeString() {
std::size_t timestamp_len;
long int offset_minutes;
char tz_offset_sign = '+';
- // Long enough buffers to avoid format-overflow warnings
- char tz_offset[128];
+ // tz_offset is set in one of three ways:
+ // * strftime with %z - This either returns empty or the ISO 8601 time. The
+ // maximum length an
+ // ISO 8601 string can be is 7 (e.g. -03:30, plus trailing zero).
+ // * snprintf with %c%02li:%02li - The maximum length is 41 (one for %c, up to
+ // 19 for %02li,
+ // one for :, up to 19 %02li, plus trailing zero).
+ // * A fixed string of "-00:00". The maximum length is 7 (-00:00, plus
+ // trailing zero).
+ //
+ // Thus, the maximum size this needs to be is 41.
+ char tz_offset[41];
+ // Long enough buffer to avoid format-overflow warnings
char storage[128];
#if defined(BENCHMARK_OS_WINDOWS)
- std::tm *timeinfo_p = ::localtime(&now);
+ std::tm* timeinfo_p = ::localtime(&now);
#else
std::tm timeinfo;
- std::tm *timeinfo_p = &timeinfo;
+ std::tm* timeinfo_p = &timeinfo;
::localtime_r(&now, &timeinfo);
#endif
@@ -215,10 +241,11 @@ std::string LocalDateTimeString() {
tz_offset_sign = '-';
}
- tz_len = ::snprintf(tz_offset, sizeof(tz_offset), "%c%02li:%02li",
- tz_offset_sign, offset_minutes / 100, offset_minutes % 100);
- CHECK(tz_len == kTzOffsetLen);
- ((void)tz_len); // Prevent unused variable warning in optimized build.
+ tz_len =
+ ::snprintf(tz_offset, sizeof(tz_offset), "%c%02li:%02li",
+ tz_offset_sign, offset_minutes / 100, offset_minutes % 100);
+ BM_CHECK(tz_len == kTzOffsetLen);
+ ((void)tz_len); // Prevent unused variable warning in optimized build.
} else {
// Unknown offset. RFC3339 specifies that unknown local offsets should be
// written as UTC time with -00:00 timezone.
@@ -232,9 +259,9 @@ std::string LocalDateTimeString() {
strncpy(tz_offset, "-00:00", kTzOffsetLen + 1);
}
- timestamp_len = std::strftime(storage, sizeof(storage), "%Y-%m-%dT%H:%M:%S",
- timeinfo_p);
- CHECK(timestamp_len == kTimestampLen);
+ timestamp_len =
+ std::strftime(storage, sizeof(storage), "%Y-%m-%dT%H:%M:%S", timeinfo_p);
+ BM_CHECK(timestamp_len == kTimestampLen);
// Prevent unused variable warning in optimized build.
((void)kTimestampLen);
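
A standalone sketch of the timestamp strategy above, not the library's exact code: format the local time, then re-emit strftime's "+hhmm" %z offset as "+hh:mm", falling back to "-00:00" when the offset is unknown, as RFC 3339 prescribes.

#include <cstdio>
#include <ctime>
#include <string>

std::string LocalRfc3339Now() {
  std::time_t now = std::time(nullptr);
  std::tm tm_buf{};
#if defined(_WIN32)
  localtime_s(&tm_buf, &now);
#else
  localtime_r(&now, &tm_buf);
#endif
  char stamp[32];
  std::strftime(stamp, sizeof(stamp), "%Y-%m-%dT%H:%M:%S", &tm_buf);
  char raw[8];                // strftime %z yields "+hhmm" or an empty string
  char offset[8] = "-00:00";  // RFC 3339 spelling for an unknown local offset
  if (std::strftime(raw, sizeof(raw), "%z", &tm_buf) == 5) {
    std::snprintf(offset, sizeof(offset), "%.3s:%.2s", raw, raw + 3);
  }
  return std::string(stamp) + offset;
}
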
diff --git a/test/AssemblyTests.cmake b/test/AssemblyTests.cmake
index 3d07858..c43c711 100644
--- a/test/AssemblyTests.cmake
+++ b/test/AssemblyTests.cmake
@@ -1,3 +1,23 @@
+set(CLANG_SUPPORTED_VERSION "5.0.0")
+set(GCC_SUPPORTED_VERSION "5.5.0")
+
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL ${CLANG_SUPPORTED_VERSION})
+ message (WARNING
+ "Unsupported Clang version " ${CMAKE_CXX_COMPILER_VERSION}
+ ". Expected is " ${CLANG_SUPPORTED_VERSION}
+ ". Assembly tests may be broken.")
+ endif()
+elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+ if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL ${GCC_SUPPORTED_VERSION})
+ message (WARNING
+ "Unsupported GCC version " ${CMAKE_CXX_COMPILER_VERSION}
+ ". Expected is " ${GCC_SUPPORTED_VERSION}
+ ". Assembly tests may be broken.")
+ endif()
+else()
+ message (WARNING "Unsupported compiler. Assembly tests may be broken.")
+endif()
include(split_list)
@@ -23,6 +43,7 @@ string(TOUPPER "${CMAKE_CXX_COMPILER_ID}" ASM_TEST_COMPILER)
macro(add_filecheck_test name)
cmake_parse_arguments(ARG "" "" "CHECK_PREFIXES" ${ARGV})
add_library(${name} OBJECT ${name}.cc)
+ target_link_libraries(${name} PRIVATE benchmark::benchmark)
set_target_properties(${name} PROPERTIES COMPILE_FLAGS "-S ${ASM_TEST_FLAGS}")
set(ASM_OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${name}.s")
add_custom_target(copy_${name} ALL
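
The compiler-version gate above exists because the CHECK patterns are written against the exact assembly one compiler emits. A hypothetical flavour of the source add_filecheck_test() consumes (test_add is illustrative, not one of the repository's assembly tests): the file is compiled with -S and the resulting .s is matched by FileCheck against the embedded CHECK lines.

extern "C" {
extern int ExternInt;
}

// CHECK-LABEL: test_add:
extern "C" int test_add() {
  // CHECK: ret
  return ExternInt + 1;
}
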
diff --git a/test/BUILD b/test/BUILD
index 9bb8cb0..ea34fd4 100644
--- a/test/BUILD
+++ b/test/BUILD
@@ -1,8 +1,18 @@
+load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
+
+platform(
+ name = "windows",
+ constraint_values = [
+ "@platforms//os:windows",
+ ],
+)
+
TEST_COPTS = [
"-pedantic",
"-pedantic-errors",
"-std=c++11",
"-Wall",
+ "-Wconversion",
"-Wextra",
"-Wshadow",
# "-Wshorten-64-to-32",
@@ -10,64 +20,108 @@ TEST_COPTS = [
"-fstrict-aliasing",
]
-PER_SRC_COPTS = ({
- "cxx03_test.cc": ["-std=c++03"],
- # Some of the issues with DoNotOptimize only occur when optimization is enabled
+# Some of the issues with DoNotOptimize only occur when optimization is enabled
+PER_SRC_COPTS = {
"donotoptimize_test.cc": ["-O3"],
-})
+}
-TEST_ARGS = ["--benchmark_min_time=0.01"]
+TEST_ARGS = ["--benchmark_min_time=0.01s"]
-PER_SRC_TEST_ARGS = ({
+PER_SRC_TEST_ARGS = {
"user_counters_tabular_test.cc": ["--benchmark_counters_tabular=true"],
-})
-
-load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
+ "repetitions_test.cc": [" --benchmark_repetitions=3"],
+ "spec_arg_test.cc": ["--benchmark_filter=BM_NotChosen"],
+ "spec_arg_verbosity_test.cc": ["--v=42"],
+}
cc_library(
name = "output_test_helper",
testonly = 1,
srcs = ["output_test_helper.cc"],
hdrs = ["output_test.h"],
- copts = TEST_COPTS,
+ copts = select({
+ "//:windows": [],
+ "//conditions:default": TEST_COPTS,
+ }),
deps = [
"//:benchmark",
"//:benchmark_internal_headers",
],
)
+# Tests that use gtest. These rely on `gtest_main`.
+[
+ cc_test(
+ name = test_src[:-len(".cc")],
+ size = "small",
+ srcs = [test_src],
+ copts = select({
+ "//:windows": [],
+ "//conditions:default": TEST_COPTS,
+ }) + PER_SRC_COPTS.get(test_src, []),
+ deps = [
+ "//:benchmark",
+ "//:benchmark_internal_headers",
+ "@com_google_googletest//:gtest",
+ "@com_google_googletest//:gtest_main",
+ ],
+ )
+ for test_src in glob(["*_gtest.cc"])
+]
+
+# Tests that do not use gtest. These have their own `main` defined.
[
cc_test(
name = test_src[:-len(".cc")],
size = "small",
srcs = [test_src],
args = TEST_ARGS + PER_SRC_TEST_ARGS.get(test_src, []),
- copts = TEST_COPTS + PER_SRC_COPTS.get(test_src, []),
+ copts = select({
+ "//:windows": [],
+ "//conditions:default": TEST_COPTS,
+ }) + PER_SRC_COPTS.get(test_src, []),
deps = [
":output_test_helper",
"//:benchmark",
"//:benchmark_internal_headers",
- "@com_google_googletest//:gtest",
- ] + (
- ["@com_google_googletest//:gtest_main"] if (test_src[-len("gtest.cc"):] == "gtest.cc") else []
- ),
+ ],
# FIXME: Add support for assembly tests to bazel.
# See Issue #556
# https://github.com/google/benchmark/issues/556
)
for test_src in glob(
- ["*test.cc"],
+ ["*_test.cc"],
exclude = [
"*_assembly_test.cc",
+ "cxx03_test.cc",
"link_main_test.cc",
],
)
]
cc_test(
+ name = "cxx03_test",
+ size = "small",
+ srcs = ["cxx03_test.cc"],
+ copts = TEST_COPTS + ["-std=c++03"],
+ target_compatible_with = select({
+ "//:windows": ["@platforms//:incompatible"],
+ "//conditions:default": [],
+ }),
+ deps = [
+ ":output_test_helper",
+ "//:benchmark",
+ "//:benchmark_internal_headers",
+ ],
+)
+
+cc_test(
name = "link_main_test",
size = "small",
srcs = ["link_main_test.cc"],
- copts = TEST_COPTS,
+ copts = select({
+ "//:windows": [],
+ "//conditions:default": TEST_COPTS,
+ }),
deps = ["//:benchmark_main"],
)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index c1a3a3f..fd88131 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,5 +1,7 @@
# Enable the tests
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+
find_package(Threads REQUIRED)
include(CheckCXXCompilerFlag)
@@ -22,6 +24,10 @@ if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
endforeach()
endif()
+if (NOT BUILD_SHARED_LIBS)
+ add_definitions(-DBENCHMARK_STATIC_DEFINE)
+endif()
+
check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG)
set(BENCHMARK_O3_FLAG "")
if (BENCHMARK_HAS_O3_FLAG)
@@ -35,10 +41,14 @@ if (DEFINED BENCHMARK_CXX_LINKER_FLAGS)
endif()
add_library(output_test_helper STATIC output_test_helper.cc output_test.h)
+target_link_libraries(output_test_helper PRIVATE benchmark::benchmark)
macro(compile_benchmark_test name)
add_executable(${name} "${name}.cc")
target_link_libraries(${name} benchmark::benchmark ${CMAKE_THREAD_LIBS_INIT})
+ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "NVHPC")
+ target_compile_options( ${name} PRIVATE --diag_suppress partial_override )
+ endif()
endmacro(compile_benchmark_test)
macro(compile_benchmark_test_with_main name)
@@ -48,20 +58,35 @@ endmacro(compile_benchmark_test_with_main)
macro(compile_output_test name)
add_executable(${name} "${name}.cc" output_test.h)
- target_link_libraries(${name} output_test_helper benchmark::benchmark
+ target_link_libraries(${name} output_test_helper benchmark::benchmark_main
${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
endmacro(compile_output_test)
# Demonstration executable
compile_benchmark_test(benchmark_test)
-add_test(NAME benchmark COMMAND benchmark_test --benchmark_min_time=0.01)
+add_test(NAME benchmark COMMAND benchmark_test --benchmark_min_time=0.01s)
+
+compile_benchmark_test(spec_arg_test)
+add_test(NAME spec_arg COMMAND spec_arg_test --benchmark_filter=BM_NotChosen)
+
+compile_benchmark_test(spec_arg_verbosity_test)
+add_test(NAME spec_arg_verbosity COMMAND spec_arg_verbosity_test --v=42)
+
+compile_benchmark_test(benchmark_setup_teardown_test)
+add_test(NAME benchmark_setup_teardown COMMAND benchmark_setup_teardown_test)
compile_benchmark_test(filter_test)
macro(add_filter_test name filter expect)
- add_test(NAME ${name} COMMAND filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect})
+ add_test(NAME ${name} COMMAND filter_test --benchmark_min_time=0.01s --benchmark_filter=${filter} ${expect})
add_test(NAME ${name}_list_only COMMAND filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect})
endmacro(add_filter_test)
+compile_benchmark_test(benchmark_min_time_flag_time_test)
+add_test(NAME min_time_flag_time COMMAND benchmark_min_time_flag_time_test)
+
+compile_benchmark_test(benchmark_min_time_flag_iters_test)
+add_test(NAME min_time_flag_iters COMMAND benchmark_min_time_flag_iters_test)
+
add_filter_test(filter_simple "Foo" 3)
add_filter_test(filter_simple_negative "-Foo" 2)
add_filter_test(filter_suffix "BM_.*" 4)
@@ -82,72 +107,83 @@ add_filter_test(filter_regex_end ".*Ba$" 1)
add_filter_test(filter_regex_end_negative "-.*Ba$" 4)
compile_benchmark_test(options_test)
-add_test(NAME options_benchmarks COMMAND options_test --benchmark_min_time=0.01)
+add_test(NAME options_benchmarks COMMAND options_test --benchmark_min_time=0.01s)
compile_benchmark_test(basic_test)
-add_test(NAME basic_benchmark COMMAND basic_test --benchmark_min_time=0.01)
+add_test(NAME basic_benchmark COMMAND basic_test --benchmark_min_time=0.01s)
+
+compile_output_test(repetitions_test)
+add_test(NAME repetitions_benchmark COMMAND repetitions_test --benchmark_min_time=0.01s --benchmark_repetitions=3)
compile_benchmark_test(diagnostics_test)
-add_test(NAME diagnostics_test COMMAND diagnostics_test --benchmark_min_time=0.01)
+add_test(NAME diagnostics_test COMMAND diagnostics_test --benchmark_min_time=0.01s)
compile_benchmark_test(skip_with_error_test)
-add_test(NAME skip_with_error_test COMMAND skip_with_error_test --benchmark_min_time=0.01)
+add_test(NAME skip_with_error_test COMMAND skip_with_error_test --benchmark_min_time=0.01s)
compile_benchmark_test(donotoptimize_test)
+# Enable errors for deprecated declarations (DoNotOptimize(Tp const& value)).
+check_cxx_compiler_flag(-Werror=deprecated-declarations BENCHMARK_HAS_DEPRECATED_DECLARATIONS_FLAG)
+if (BENCHMARK_HAS_DEPRECATED_DECLARATIONS_FLAG)
+ target_compile_options (donotoptimize_test PRIVATE "-Werror=deprecated-declarations")
+endif()
# Some of the issues with DoNotOptimize only occur when optimization is enabled
check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG)
if (BENCHMARK_HAS_O3_FLAG)
set_target_properties(donotoptimize_test PROPERTIES COMPILE_FLAGS "-O3")
endif()
-add_test(NAME donotoptimize_test COMMAND donotoptimize_test --benchmark_min_time=0.01)
+add_test(NAME donotoptimize_test COMMAND donotoptimize_test --benchmark_min_time=0.01s)
compile_benchmark_test(fixture_test)
-add_test(NAME fixture_test COMMAND fixture_test --benchmark_min_time=0.01)
+add_test(NAME fixture_test COMMAND fixture_test --benchmark_min_time=0.01s)
compile_benchmark_test(register_benchmark_test)
-add_test(NAME register_benchmark_test COMMAND register_benchmark_test --benchmark_min_time=0.01)
+add_test(NAME register_benchmark_test COMMAND register_benchmark_test --benchmark_min_time=0.01s)
compile_benchmark_test(map_test)
-add_test(NAME map_test COMMAND map_test --benchmark_min_time=0.01)
+add_test(NAME map_test COMMAND map_test --benchmark_min_time=0.01s)
compile_benchmark_test(multiple_ranges_test)
-add_test(NAME multiple_ranges_test COMMAND multiple_ranges_test --benchmark_min_time=0.01)
+add_test(NAME multiple_ranges_test COMMAND multiple_ranges_test --benchmark_min_time=0.01s)
compile_benchmark_test(args_product_test)
-add_test(NAME args_product_test COMMAND args_product_test --benchmark_min_time=0.01)
+add_test(NAME args_product_test COMMAND args_product_test --benchmark_min_time=0.01s)
compile_benchmark_test_with_main(link_main_test)
-add_test(NAME link_main_test COMMAND link_main_test --benchmark_min_time=0.01)
+add_test(NAME link_main_test COMMAND link_main_test --benchmark_min_time=0.01s)
compile_output_test(reporter_output_test)
-add_test(NAME reporter_output_test COMMAND reporter_output_test --benchmark_min_time=0.01)
+add_test(NAME reporter_output_test COMMAND reporter_output_test --benchmark_min_time=0.01s)
compile_output_test(templated_fixture_test)
-add_test(NAME templated_fixture_test COMMAND templated_fixture_test --benchmark_min_time=0.01)
+add_test(NAME templated_fixture_test COMMAND templated_fixture_test --benchmark_min_time=0.01s)
compile_output_test(user_counters_test)
-add_test(NAME user_counters_test COMMAND user_counters_test --benchmark_min_time=0.01)
+add_test(NAME user_counters_test COMMAND user_counters_test --benchmark_min_time=0.01s)
+
+compile_output_test(perf_counters_test)
+add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time=0.01s --benchmark_perf_counters=CYCLES,BRANCHES)
compile_output_test(internal_threading_test)
-add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01)
+add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01s)
compile_output_test(report_aggregates_only_test)
-add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01)
+add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01s)
compile_output_test(display_aggregates_only_test)
-add_test(NAME display_aggregates_only_test COMMAND display_aggregates_only_test --benchmark_min_time=0.01)
+add_test(NAME display_aggregates_only_test COMMAND display_aggregates_only_test --benchmark_min_time=0.01s)
compile_output_test(user_counters_tabular_test)
-add_test(NAME user_counters_tabular_test COMMAND user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01)
+add_test(NAME user_counters_tabular_test COMMAND user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01s)
compile_output_test(user_counters_thousands_test)
-add_test(NAME user_counters_thousands_test COMMAND user_counters_thousands_test --benchmark_min_time=0.01)
+add_test(NAME user_counters_thousands_test COMMAND user_counters_thousands_test --benchmark_min_time=0.01s)
compile_output_test(memory_manager_test)
-add_test(NAME memory_manager_test COMMAND memory_manager_test --benchmark_min_time=0.01)
+add_test(NAME memory_manager_test COMMAND memory_manager_test --benchmark_min_time=0.01s)
-check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG)
-if (BENCHMARK_HAS_CXX03_FLAG)
+# MSVC does not allow setting the language standard to C++98/03.
+if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
compile_benchmark_test(cxx03_test)
set_target_properties(cxx03_test
PROPERTIES
@@ -158,19 +194,25 @@ if (BENCHMARK_HAS_CXX03_FLAG)
# causing the test to fail to compile. To prevent this we explicitly disable
# the warning.
check_cxx_compiler_flag(-Wno-odr BENCHMARK_HAS_WNO_ODR)
- if (BENCHMARK_ENABLE_LTO AND BENCHMARK_HAS_WNO_ODR)
- set_target_properties(cxx03_test
- PROPERTIES
- LINK_FLAGS "-Wno-odr")
+ check_cxx_compiler_flag(-Wno-lto-type-mismatch BENCHMARK_HAS_WNO_LTO_TYPE_MISMATCH)
+ # Cannot set_target_properties multiple times here because the warnings will
+ # be overwritten on each call
+ set (DISABLE_LTO_WARNINGS "")
+ if (BENCHMARK_HAS_WNO_ODR)
+ set(DISABLE_LTO_WARNINGS "${DISABLE_LTO_WARNINGS} -Wno-odr")
+ endif()
+ if (BENCHMARK_HAS_WNO_LTO_TYPE_MISMATCH)
+ set(DISABLE_LTO_WARNINGS "${DISABLE_LTO_WARNINGS} -Wno-lto-type-mismatch")
endif()
- add_test(NAME cxx03 COMMAND cxx03_test --benchmark_min_time=0.01)
+ set_target_properties(cxx03_test PROPERTIES LINK_FLAGS "${DISABLE_LTO_WARNINGS}")
+ add_test(NAME cxx03 COMMAND cxx03_test --benchmark_min_time=0.01s)
endif()
# Attempt to work around flaky test failures when running on Appveyor servers.
if (DEFINED ENV{APPVEYOR})
- set(COMPLEXITY_MIN_TIME "0.5")
+ set(COMPLEXITY_MIN_TIME "0.5s")
else()
- set(COMPLEXITY_MIN_TIME "0.01")
+ set(COMPLEXITY_MIN_TIME "0.01s")
endif()
compile_output_test(complexity_test)
add_test(NAME complexity_benchmark COMMAND complexity_test --benchmark_min_time=${COMPLEXITY_MIN_TIME})
@@ -193,9 +235,13 @@ if (BENCHMARK_ENABLE_GTEST_TESTS)
add_gtest(benchmark_gtest)
add_gtest(benchmark_name_gtest)
+ add_gtest(benchmark_random_interleaving_gtest)
add_gtest(commandlineflags_gtest)
add_gtest(statistics_gtest)
add_gtest(string_util_gtest)
+ add_gtest(perf_counters_gtest)
+ add_gtest(time_unit_gtest)
+ add_gtest(min_time_parse_gtest)
endif(BENCHMARK_ENABLE_GTEST_TESTS)
###############################################################################
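
compile_output_test() and compile_benchmark_test_with_main() above now link benchmark::benchmark_main, whose job is to supply a main() that initializes and runs the registered benchmarks. A minimal sketch of a translation unit that leans on it (BM_Noop is illustrative):

#include <benchmark/benchmark.h>

static void BM_Noop(benchmark::State& state) {
  for (auto _ : state) {
  }
}
BENCHMARK(BM_Noop);
// No main() here: the one from the linked benchmark_main library is used.
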
diff --git a/test/args_product_test.cc b/test/args_product_test.cc
index 8a859f8..63b8b71 100644
--- a/test/args_product_test.cc
+++ b/test/args_product_test.cc
@@ -1,10 +1,10 @@
-#include "benchmark/benchmark.h"
-
#include <cassert>
#include <iostream>
#include <set>
#include <vector>
+#include "benchmark/benchmark.h"
+
class ArgsProductFixture : public ::benchmark::Fixture {
public:
ArgsProductFixture()
@@ -23,7 +23,7 @@ class ArgsProductFixture : public ::benchmark::Fixture {
{2, 15, 10, 9},
{4, 5, 6, 11}}) {}
- void SetUp(const ::benchmark::State& state) {
+ void SetUp(const ::benchmark::State& state) override {
std::vector<int64_t> ranges = {state.range(0), state.range(1),
state.range(2), state.range(3)};
@@ -34,10 +34,10 @@ class ArgsProductFixture : public ::benchmark::Fixture {
// NOTE: This is not TearDown as we want to check after _all_ runs are
// complete.
- virtual ~ArgsProductFixture() {
+ ~ArgsProductFixture() override {
if (actualValues != expectedValues) {
std::cout << "EXPECTED\n";
- for (auto v : expectedValues) {
+ for (const auto& v : expectedValues) {
std::cout << "{";
for (int64_t iv : v) {
std::cout << iv << ", ";
@@ -45,7 +45,7 @@ class ArgsProductFixture : public ::benchmark::Fixture {
std::cout << "}\n";
}
std::cout << "ACTUAL\n";
- for (auto v : actualValues) {
+ for (const auto& v : actualValues) {
std::cout << "{";
for (int64_t iv : v) {
std::cout << iv << ", ";
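
The fixture above verifies the argument tuples produced by ArgsProduct, which expands to the cross product of the per-argument value lists. A sketch of the registration side (BM_WithArgsProduct is illustrative):

#include <benchmark/benchmark.h>

static void BM_WithArgsProduct(benchmark::State& state) {
  for (auto _ : state) {
    auto sum = state.range(0) + state.range(1);
    benchmark::DoNotOptimize(sum);
  }
}
// Expands to the six (first, second) combinations of {0, 1, 2} x {10, 20}.
BENCHMARK(BM_WithArgsProduct)->ArgsProduct({{0, 1, 2}, {10, 20}});
BENCHMARK_MAIN();
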
diff --git a/test/basic_test.cc b/test/basic_test.cc
index 5f3dd1a..cba1b0f 100644
--- a/test/basic_test.cc
+++ b/test/basic_test.cc
@@ -5,7 +5,8 @@
void BM_empty(benchmark::State& state) {
for (auto _ : state) {
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
}
BENCHMARK(BM_empty);
@@ -13,7 +14,7 @@ BENCHMARK(BM_empty)->ThreadPerCpu();
void BM_spin_empty(benchmark::State& state) {
for (auto _ : state) {
- for (int x = 0; x < state.range(0); ++x) {
+ for (auto x = 0; x < state.range(0); ++x) {
benchmark::DoNotOptimize(x);
}
}
@@ -22,11 +23,11 @@ BASIC_BENCHMARK_TEST(BM_spin_empty);
BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu();
void BM_spin_pause_before(benchmark::State& state) {
- for (int i = 0; i < state.range(0); ++i) {
+ for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
for (auto _ : state) {
- for (int i = 0; i < state.range(0); ++i) {
+ for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
}
@@ -37,11 +38,11 @@ BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu();
void BM_spin_pause_during(benchmark::State& state) {
for (auto _ : state) {
state.PauseTiming();
- for (int i = 0; i < state.range(0); ++i) {
+ for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
state.ResumeTiming();
- for (int i = 0; i < state.range(0); ++i) {
+ for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
}
@@ -62,11 +63,11 @@ BENCHMARK(BM_pause_during)->UseRealTime()->ThreadPerCpu();
void BM_spin_pause_after(benchmark::State& state) {
for (auto _ : state) {
- for (int i = 0; i < state.range(0); ++i) {
+ for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
}
- for (int i = 0; i < state.range(0); ++i) {
+ for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
}
@@ -74,15 +75,15 @@ BASIC_BENCHMARK_TEST(BM_spin_pause_after);
BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu();
void BM_spin_pause_before_and_after(benchmark::State& state) {
- for (int i = 0; i < state.range(0); ++i) {
+ for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
for (auto _ : state) {
- for (int i = 0; i < state.range(0); ++i) {
+ for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
}
- for (int i = 0; i < state.range(0); ++i) {
+ for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
}
@@ -96,7 +97,6 @@ void BM_empty_stop_start(benchmark::State& state) {
BENCHMARK(BM_empty_stop_start);
BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
-
void BM_KeepRunning(benchmark::State& state) {
benchmark::IterationCount iter_count = 0;
assert(iter_count == state.iterations());
@@ -108,15 +108,30 @@ void BM_KeepRunning(benchmark::State& state) {
BENCHMARK(BM_KeepRunning);
void BM_KeepRunningBatch(benchmark::State& state) {
- // Choose a prime batch size to avoid evenly dividing max_iterations.
- const benchmark::IterationCount batch_size = 101;
+ // Choose a batch size >1000 to skip the typical runs with iteration
+ // targets of 10, 100 and 1000. If these are not actually skipped the
+ // bug would be detectable as consecutive runs with the same iteration
+ // count. Below we assert that this does not happen.
+ const benchmark::IterationCount batch_size = 1009;
+
+ static benchmark::IterationCount prior_iter_count = 0;
benchmark::IterationCount iter_count = 0;
while (state.KeepRunningBatch(batch_size)) {
iter_count += batch_size;
}
assert(state.iterations() == iter_count);
+
+ // Verify that the iteration count always increases across runs (see
+ // comment above).
+ assert(iter_count == batch_size // max_iterations == 1
+ || iter_count > prior_iter_count); // max_iterations > batch_size
+ prior_iter_count = iter_count;
}
-BENCHMARK(BM_KeepRunningBatch);
+// Register with a fixed repetition count to establish the invariant that
+// the iteration count should always change across runs. This overrides
+// the --benchmark_repetitions command line flag, which would otherwise
+// cause this test to fail if set > 1.
+BENCHMARK(BM_KeepRunningBatch)->Repetitions(1);
void BM_RangedFor(benchmark::State& state) {
benchmark::IterationCount iter_count = 0;
@@ -127,10 +142,39 @@ void BM_RangedFor(benchmark::State& state) {
}
BENCHMARK(BM_RangedFor);
+#ifdef BENCHMARK_HAS_CXX11
+template <typename T>
+void BM_OneTemplateFunc(benchmark::State& state) {
+ auto arg = state.range(0);
+ T sum = 0;
+ for (auto _ : state) {
+ sum += static_cast<T>(arg);
+ }
+}
+BENCHMARK(BM_OneTemplateFunc<int>)->Arg(1);
+BENCHMARK(BM_OneTemplateFunc<double>)->Arg(1);
+
+template <typename A, typename B>
+void BM_TwoTemplateFunc(benchmark::State& state) {
+ auto arg = state.range(0);
+ A sum = 0;
+ B prod = 1;
+ for (auto _ : state) {
+ sum += static_cast<A>(arg);
+ prod *= static_cast<B>(arg);
+ }
+}
+BENCHMARK(BM_TwoTemplateFunc<int, double>)->Arg(1);
+BENCHMARK(BM_TwoTemplateFunc<double, int>)->Arg(1);
+
+#endif // BENCHMARK_HAS_CXX11
+
// Ensure that StateIterator provides all the necessary typedefs required to
// instantiate std::iterator_traits.
-static_assert(std::is_same<
- typename std::iterator_traits<benchmark::State::StateIterator>::value_type,
- typename benchmark::State::StateIterator::value_type>::value, "");
+static_assert(
+ std::is_same<typename std::iterator_traits<
+ benchmark::State::StateIterator>::value_type,
+ typename benchmark::State::StateIterator::value_type>::value,
+ "");
BENCHMARK_MAIN();
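
BM_KeepRunningBatch above asserts on the framework's iteration accounting; from the user's side the point of KeepRunningBatch is to amortize the per-iteration check over a chunk of work. A sketch (BM_BatchedWork is illustrative):

#include <benchmark/benchmark.h>

#include <cstdint>

static void BM_BatchedWork(benchmark::State& state) {
  constexpr benchmark::IterationCount batch = 64;
  std::int64_t processed = 0;
  while (state.KeepRunningBatch(batch)) {
    // Account for `batch` items of work per call instead of one
    // KeepRunning() check per item.
    processed += batch;
  }
  benchmark::DoNotOptimize(processed);
}
BENCHMARK(BM_BatchedWork);
BENCHMARK_MAIN();
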
diff --git a/test/benchmark_gtest.cc b/test/benchmark_gtest.cc
index 6dbf7a5..2c9e555 100644
--- a/test/benchmark_gtest.cc
+++ b/test/benchmark_gtest.cc
@@ -1,11 +1,15 @@
+#include <map>
+#include <string>
#include <vector>
#include "../src/benchmark_register.h"
+#include "benchmark/benchmark.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace benchmark {
namespace internal {
+
namespace {
TEST(AddRangeTest, Simple) {
@@ -34,8 +38,9 @@ TEST(AddRangeTest, Advanced64) {
TEST(AddRangeTest, FullRange8) {
std::vector<int8_t> dst;
- AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), 8);
- EXPECT_THAT(dst, testing::ElementsAre(1, 8, 64, 127));
+ AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), int8_t{8});
+ EXPECT_THAT(
+ dst, testing::ElementsAre(int8_t{1}, int8_t{8}, int8_t{64}, int8_t{127}));
}
TEST(AddRangeTest, FullRange64) {
@@ -125,8 +130,38 @@ TEST(AddRangeTest, FullNegativeRange64) {
TEST(AddRangeTest, Simple8) {
std::vector<int8_t> dst;
- AddRange<int8_t>(&dst, 1, 8, 2);
- EXPECT_THAT(dst, testing::ElementsAre(1, 2, 4, 8));
+ AddRange<int8_t>(&dst, int8_t{1}, int8_t{8}, int8_t{2});
+ EXPECT_THAT(dst,
+ testing::ElementsAre(int8_t{1}, int8_t{2}, int8_t{4}, int8_t{8}));
+}
+
+TEST(AddCustomContext, Simple) {
+ std::map<std::string, std::string> *&global_context = GetGlobalContext();
+ EXPECT_THAT(global_context, nullptr);
+
+ AddCustomContext("foo", "bar");
+ AddCustomContext("baz", "qux");
+
+ EXPECT_THAT(*global_context,
+ testing::UnorderedElementsAre(testing::Pair("foo", "bar"),
+ testing::Pair("baz", "qux")));
+
+ delete global_context;
+ global_context = nullptr;
+}
+
+TEST(AddCustomContext, DuplicateKey) {
+ std::map<std::string, std::string> *&global_context = GetGlobalContext();
+ EXPECT_THAT(global_context, nullptr);
+
+ AddCustomContext("foo", "bar");
+ AddCustomContext("foo", "qux");
+
+ EXPECT_THAT(*global_context,
+ testing::UnorderedElementsAre(testing::Pair("foo", "bar")));
+
+ delete global_context;
+ global_context = nullptr;
}
} // namespace
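
The new AddCustomContext tests above exercise the map behind GetGlobalContext(); the public entry point is benchmark::AddCustomContext, and the pairs end up in the "context" block of the console/JSON output. A sketch with placeholder key/value strings:

#include <benchmark/benchmark.h>

static void BM_Trivial(benchmark::State& state) {
  for (auto _ : state) {
  }
}
BENCHMARK(BM_Trivial);

int main(int argc, char** argv) {
  // The values here are placeholders, not anything the library computes.
  benchmark::AddCustomContext("build_type", "release");
  benchmark::AddCustomContext("revision", "local");
  benchmark::Initialize(&argc, argv);
  benchmark::RunSpecifiedBenchmarks();
  benchmark::Shutdown();
  return 0;
}
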
diff --git a/test/benchmark_min_time_flag_iters_test.cc b/test/benchmark_min_time_flag_iters_test.cc
new file mode 100644
index 0000000..3de93a7
--- /dev/null
+++ b/test/benchmark_min_time_flag_iters_test.cc
@@ -0,0 +1,66 @@
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+// Tests that we can specify the number of iterations with
+// --benchmark_min_time=<NUM>x.
+namespace {
+
+class TestReporter : public benchmark::ConsoleReporter {
+ public:
+ virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
+ return ConsoleReporter::ReportContext(context);
+ };
+
+ virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
+ assert(report.size() == 1);
+ iter_nums_.push_back(report[0].iterations);
+ ConsoleReporter::ReportRuns(report);
+ };
+
+ TestReporter() {}
+
+ virtual ~TestReporter() {}
+
+ const std::vector<benchmark::IterationCount>& GetIters() const {
+ return iter_nums_;
+ }
+
+ private:
+ std::vector<benchmark::IterationCount> iter_nums_;
+};
+
+} // end namespace
+
+static void BM_MyBench(benchmark::State& state) {
+ for (auto s : state) {
+ }
+}
+BENCHMARK(BM_MyBench);
+
+int main(int argc, char** argv) {
+ // Make a fake argv and append the new --benchmark_min_time=<foo> to it.
+ int fake_argc = argc + 1;
+ const char** fake_argv = new const char*[static_cast<size_t>(fake_argc)];
+ for (int i = 0; i < argc; ++i) fake_argv[i] = argv[i];
+ fake_argv[argc] = "--benchmark_min_time=4x";
+
+ benchmark::Initialize(&fake_argc, const_cast<char**>(fake_argv));
+
+ TestReporter test_reporter;
+ const size_t returned_count =
+ benchmark::RunSpecifiedBenchmarks(&test_reporter, "BM_MyBench");
+ assert(returned_count == 1);
+
+ // Check the executed iters.
+ const std::vector<benchmark::IterationCount> iters = test_reporter.GetIters();
+ assert(!iters.empty() && iters[0] == 4);
+
+ delete[] fake_argv;
+ return 0;
+}
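
The fake --benchmark_min_time=4x argument above pins the iteration count from the command line; the in-code equivalent is the Iterations() builder. A sketch (BM_FixedIterations is illustrative):

#include <benchmark/benchmark.h>

static void BM_FixedIterations(benchmark::State& state) {
  for (auto _ : state) {
  }
}
// Runs exactly four iterations instead of letting the min-time logic decide.
BENCHMARK(BM_FixedIterations)->Iterations(4);
BENCHMARK_MAIN();
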
diff --git a/test/benchmark_min_time_flag_time_test.cc b/test/benchmark_min_time_flag_time_test.cc
new file mode 100644
index 0000000..04a82eb
--- /dev/null
+++ b/test/benchmark_min_time_flag_time_test.cc
@@ -0,0 +1,90 @@
+#include <cassert>
+#include <climits>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+// Tests that we can specify the min time with
+// --benchmark_min_time=<NUM> (no suffix needed) OR
+// --benchmark_min_time=<NUM>s
+namespace {
+
+// This is from benchmark.h
+typedef int64_t IterationCount;
+
+class TestReporter : public benchmark::ConsoleReporter {
+ public:
+ virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
+ return ConsoleReporter::ReportContext(context);
+ };
+
+ virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
+ assert(report.size() == 1);
+ ConsoleReporter::ReportRuns(report);
+ };
+
+ virtual void ReportRunsConfig(double min_time, bool /* has_explicit_iters */,
+ IterationCount /* iters */) BENCHMARK_OVERRIDE {
+ min_times_.push_back(min_time);
+ }
+
+ TestReporter() {}
+
+ virtual ~TestReporter() {}
+
+ const std::vector<double>& GetMinTimes() const { return min_times_; }
+
+ private:
+ std::vector<double> min_times_;
+};
+
+bool AlmostEqual(double a, double b) {
+ return std::fabs(a - b) < std::numeric_limits<double>::epsilon();
+}
+
+void DoTestHelper(int* argc, const char** argv, double expected) {
+ benchmark::Initialize(argc, const_cast<char**>(argv));
+
+ TestReporter test_reporter;
+ const size_t returned_count =
+ benchmark::RunSpecifiedBenchmarks(&test_reporter, "BM_MyBench");
+ assert(returned_count == 1);
+
+ // Check the min_time
+ const std::vector<double>& min_times = test_reporter.GetMinTimes();
+ assert(!min_times.empty() && AlmostEqual(min_times[0], expected));
+}
+
+} // end namespace
+
+static void BM_MyBench(benchmark::State& state) {
+ for (auto s : state) {
+ }
+}
+BENCHMARK(BM_MyBench);
+
+int main(int argc, char** argv) {
+ // Make a fake argv and append the new --benchmark_min_time=<foo> to it.
+ int fake_argc = argc + 1;
+ const char** fake_argv = new const char*[static_cast<size_t>(fake_argc)];
+
+ for (int i = 0; i < argc; ++i) fake_argv[i] = argv[i];
+
+ const char* no_suffix = "--benchmark_min_time=4";
+ const char* with_suffix = "--benchmark_min_time=4.0s";
+ double expected = 4.0;
+
+ fake_argv[argc] = no_suffix;
+ DoTestHelper(&fake_argc, fake_argv, expected);
+
+ fake_argv[argc] = with_suffix;
+ DoTestHelper(&fake_argc, fake_argv, expected);
+
+ delete[] fake_argv;
+ return 0;
+}
diff --git a/test/benchmark_name_gtest.cc b/test/benchmark_name_gtest.cc
index afb401c..0a6746d 100644
--- a/test/benchmark_name_gtest.cc
+++ b/test/benchmark_name_gtest.cc
@@ -32,6 +32,14 @@ TEST(BenchmarkNameTest, MinTime) {
EXPECT_EQ(name.str(), "function_name/some_args:3/4/min_time:3.4s");
}
+TEST(BenchmarkNameTest, MinWarmUpTime) {
+ auto name = BenchmarkName();
+ name.function_name = "function_name";
+ name.args = "some_args:3/4";
+ name.min_warmup_time = "min_warmup_time:3.5s";
+ EXPECT_EQ(name.str(), "function_name/some_args:3/4/min_warmup_time:3.5s");
+}
+
TEST(BenchmarkNameTest, Iterations) {
auto name = BenchmarkName();
name.function_name = "function_name";
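
The MinWarmUpTime name test above checks the "min_warmup_time:<N>s" segment; on the registration side that segment comes from the MinWarmUpTime() builder, which runs an untimed-in-the-report warm-up phase before the measured run. A sketch, assuming that builder behaves as described (BM_WithWarmup is illustrative):

#include <benchmark/benchmark.h>

static void BM_WithWarmup(benchmark::State& state) {
  for (auto _ : state) {
  }
}
// Warm up for 0.5s, then measure for at least 1s.
BENCHMARK(BM_WithWarmup)->MinWarmUpTime(0.5)->MinTime(1.0);
BENCHMARK_MAIN();
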
diff --git a/test/benchmark_random_interleaving_gtest.cc b/test/benchmark_random_interleaving_gtest.cc
new file mode 100644
index 0000000..7f20867
--- /dev/null
+++ b/test/benchmark_random_interleaving_gtest.cc
@@ -0,0 +1,126 @@
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "../src/commandlineflags.h"
+#include "../src/string_util.h"
+#include "benchmark/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace benchmark {
+
+BM_DECLARE_bool(benchmark_enable_random_interleaving);
+BM_DECLARE_string(benchmark_filter);
+BM_DECLARE_int32(benchmark_repetitions);
+
+namespace internal {
+namespace {
+
+class EventQueue : public std::queue<std::string> {
+ public:
+ void Put(const std::string& event) { push(event); }
+
+ void Clear() {
+ while (!empty()) {
+ pop();
+ }
+ }
+
+ std::string Get() {
+ std::string event = front();
+ pop();
+ return event;
+ }
+};
+
+EventQueue* queue = new EventQueue();
+
+class NullReporter : public BenchmarkReporter {
+ public:
+ bool ReportContext(const Context& /*context*/) override { return true; }
+ void ReportRuns(const std::vector<Run>& /* report */) override {}
+};
+
+class BenchmarkTest : public testing::Test {
+ public:
+ static void SetupHook(int /* num_threads */) { queue->push("Setup"); }
+
+ static void TeardownHook(int /* num_threads */) { queue->push("Teardown"); }
+
+ void Execute(const std::string& pattern) {
+ queue->Clear();
+
+ std::unique_ptr<BenchmarkReporter> reporter(new NullReporter());
+ FLAGS_benchmark_filter = pattern;
+ RunSpecifiedBenchmarks(reporter.get());
+
+ queue->Put("DONE"); // End marker
+ }
+};
+
+void BM_Match1(benchmark::State& state) {
+ const int64_t arg = state.range(0);
+
+ for (auto _ : state) {
+ }
+ queue->Put(StrFormat("BM_Match1/%d", static_cast<int>(arg)));
+}
+BENCHMARK(BM_Match1)
+ ->Iterations(100)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(3)
+ ->Range(10, 80)
+ ->Args({90})
+ ->Args({100});
+
+TEST_F(BenchmarkTest, Match1) {
+ Execute("BM_Match1");
+ ASSERT_EQ("BM_Match1/1", queue->Get());
+ ASSERT_EQ("BM_Match1/2", queue->Get());
+ ASSERT_EQ("BM_Match1/3", queue->Get());
+ ASSERT_EQ("BM_Match1/10", queue->Get());
+ ASSERT_EQ("BM_Match1/64", queue->Get());
+ ASSERT_EQ("BM_Match1/80", queue->Get());
+ ASSERT_EQ("BM_Match1/90", queue->Get());
+ ASSERT_EQ("BM_Match1/100", queue->Get());
+ ASSERT_EQ("DONE", queue->Get());
+}
+
+TEST_F(BenchmarkTest, Match1WithRepetition) {
+ FLAGS_benchmark_repetitions = 2;
+
+ Execute("BM_Match1/(64|80)");
+ ASSERT_EQ("BM_Match1/64", queue->Get());
+ ASSERT_EQ("BM_Match1/64", queue->Get());
+ ASSERT_EQ("BM_Match1/80", queue->Get());
+ ASSERT_EQ("BM_Match1/80", queue->Get());
+ ASSERT_EQ("DONE", queue->Get());
+}
+
+TEST_F(BenchmarkTest, Match1WithRandomInterleaving) {
+ FLAGS_benchmark_enable_random_interleaving = true;
+ FLAGS_benchmark_repetitions = 100;
+
+ std::map<std::string, int> element_count;
+ std::map<std::string, int> interleaving_count;
+ Execute("BM_Match1/(64|80)");
+ for (int i = 0; i < 100; ++i) {
+ std::vector<std::string> interleaving;
+ interleaving.push_back(queue->Get());
+ interleaving.push_back(queue->Get());
+ element_count[interleaving[0]]++;
+ element_count[interleaving[1]]++;
+ interleaving_count[StrFormat("%s,%s", interleaving[0].c_str(),
+ interleaving[1].c_str())]++;
+ }
+ EXPECT_EQ(element_count["BM_Match1/64"], 100) << "Unexpected repetitions.";
+ EXPECT_EQ(element_count["BM_Match1/80"], 100) << "Unexpected repetitions.";
+ EXPECT_GE(interleaving_count.size(), 2) << "Interleaving was not randomized.";
+ ASSERT_EQ("DONE", queue->Get());
+}
+
+} // namespace
+} // namespace internal
+} // namespace benchmark
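
From the outside, the interleaving behaviour tested above is driven purely by flags: with random interleaving enabled, repetitions of the selected benchmarks are shuffled rather than run back to back. A sketch of a binary it would apply to (names and the invocation line are illustrative):

#include <benchmark/benchmark.h>

static void BM_First(benchmark::State& state) {
  for (auto _ : state) {
  }
}
static void BM_Second(benchmark::State& state) {
  for (auto _ : state) {
  }
}
BENCHMARK(BM_First);
BENCHMARK(BM_Second);
BENCHMARK_MAIN();

// Invoked e.g. as:
//   ./mybench --benchmark_enable_random_interleaving=true --benchmark_repetitions=8
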
diff --git a/test/benchmark_setup_teardown_test.cc b/test/benchmark_setup_teardown_test.cc
new file mode 100644
index 0000000..6c3cc2e
--- /dev/null
+++ b/test/benchmark_setup_teardown_test.cc
@@ -0,0 +1,157 @@
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+// Test that Setup() and Teardown() are called exactly once
+// for each benchmark run (single-threaded).
+namespace singlethreaded {
+static int setup_call = 0;
+static int teardown_call = 0;
+} // namespace singlethreaded
+static void DoSetup1(const benchmark::State& state) {
+ ++singlethreaded::setup_call;
+
+ // Setup/Teardown should never be called with any thread_idx != 0.
+ assert(state.thread_index() == 0);
+}
+
+static void DoTeardown1(const benchmark::State& state) {
+ ++singlethreaded::teardown_call;
+ assert(state.thread_index() == 0);
+}
+
+static void BM_with_setup(benchmark::State& state) {
+ for (auto s : state) {
+ }
+}
+BENCHMARK(BM_with_setup)
+ ->Arg(1)
+ ->Arg(3)
+ ->Arg(5)
+ ->Arg(7)
+ ->Iterations(100)
+ ->Setup(DoSetup1)
+ ->Teardown(DoTeardown1);
+
+// Test that Setup() and Teardown() are called once for each group of threads.
+namespace concurrent {
+static std::atomic<int> setup_call(0);
+static std::atomic<int> teardown_call(0);
+static std::atomic<int> func_call(0);
+} // namespace concurrent
+
+static void DoSetup2(const benchmark::State& state) {
+ concurrent::setup_call.fetch_add(1, std::memory_order_acquire);
+ assert(state.thread_index() == 0);
+}
+
+static void DoTeardown2(const benchmark::State& state) {
+ concurrent::teardown_call.fetch_add(1, std::memory_order_acquire);
+ assert(state.thread_index() == 0);
+}
+
+static void BM_concurrent(benchmark::State& state) {
+ for (auto s : state) {
+ }
+ concurrent::func_call.fetch_add(1, std::memory_order_acquire);
+}
+
+BENCHMARK(BM_concurrent)
+ ->Setup(DoSetup2)
+ ->Teardown(DoTeardown2)
+ ->Iterations(100)
+ ->Threads(5)
+ ->Threads(10)
+ ->Threads(15);
+
+// Testing interaction with Fixture::Setup/Teardown
+namespace fixture_interaction {
+int setup = 0;
+int fixture_setup = 0;
+} // namespace fixture_interaction
+
+#define FIXTURE_BECHMARK_NAME MyFixture
+
+class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
+ public:
+ void SetUp(const ::benchmark::State&) override {
+ fixture_interaction::fixture_setup++;
+ }
+
+ ~FIXTURE_BECHMARK_NAME() override {}
+};
+
+BENCHMARK_F(FIXTURE_BECHMARK_NAME, BM_WithFixture)(benchmark::State& st) {
+ for (auto _ : st) {
+ }
+}
+
+static void DoSetupWithFixture(const benchmark::State&) {
+ fixture_interaction::setup++;
+}
+
+BENCHMARK_REGISTER_F(FIXTURE_BECHMARK_NAME, BM_WithFixture)
+ ->Arg(1)
+ ->Arg(3)
+ ->Arg(5)
+ ->Arg(7)
+ ->Setup(DoSetupWithFixture)
+ ->Repetitions(1)
+ ->Iterations(100);
+
+// Testing repetitions.
+namespace repetitions {
+int setup = 0;
+}
+
+static void DoSetupWithRepetitions(const benchmark::State&) {
+ repetitions::setup++;
+}
+static void BM_WithRep(benchmark::State& state) {
+ for (auto _ : state) {
+ }
+}
+
+BENCHMARK(BM_WithRep)
+ ->Arg(1)
+ ->Arg(3)
+ ->Arg(5)
+ ->Arg(7)
+ ->Setup(DoSetupWithRepetitions)
+ ->Iterations(100)
+ ->Repetitions(4);
+
+int main(int argc, char** argv) {
+ benchmark::Initialize(&argc, argv);
+
+ size_t ret = benchmark::RunSpecifiedBenchmarks(".");
+ assert(ret > 0);
+
+ // Setup/Teardown is called once for each arg group (1,3,5,7).
+ assert(singlethreaded::setup_call == 4);
+ assert(singlethreaded::teardown_call == 4);
+
+ // 3 groups of threads (5, 10, 15) call this function.
+ assert(concurrent::setup_call.load(std::memory_order_relaxed) == 3);
+ assert(concurrent::teardown_call.load(std::memory_order_relaxed) == 3);
+ assert((5 + 10 + 15) ==
+ concurrent::func_call.load(std::memory_order_relaxed));
+
+ // Setup is called 4 times, once for each arg group (1,3,5,7)
+ assert(fixture_interaction::setup == 4);
+ // Fixture::Setup is called every time the bm routine is run.
+ // The exact number is nondeterministic, so we just assert that
+ // it's more than setup.
+ assert(fixture_interaction::fixture_setup > fixture_interaction::setup);
+
+ // Setup is called once for each repetition * num_arg = 4 * 4 = 16.
+ assert(repetitions::setup == 16);
+
+ return 0;
+}
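
The asserts above pin down the contract of the new hooks: Setup()/Teardown() run once per benchmark run (per argument and per thread group), always on thread 0, whereas Fixture::SetUp runs more often. A sketch of typical use (GlobalSetup/GlobalTeardown and the data they manage are illustrative):

#include <benchmark/benchmark.h>

#include <vector>

static std::vector<int>* shared_data = nullptr;

static void GlobalSetup(const benchmark::State&) {
  shared_data = new std::vector<int>(1024, 1);
}
static void GlobalTeardown(const benchmark::State&) {
  delete shared_data;
  shared_data = nullptr;
}

static void BM_UsesSharedData(benchmark::State& state) {
  for (auto _ : state) {
    benchmark::DoNotOptimize(shared_data->front());
  }
}
BENCHMARK(BM_UsesSharedData)
    ->Setup(GlobalSetup)
    ->Teardown(GlobalTeardown)
    ->Arg(1)
    ->Arg(2);
BENCHMARK_MAIN();
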
diff --git a/test/benchmark_test.cc b/test/benchmark_test.cc
index 3cd4f55..94590d5 100644
--- a/test/benchmark_test.cc
+++ b/test/benchmark_test.cc
@@ -5,6 +5,7 @@
#include <stdint.h>
#include <chrono>
+#include <complex>
#include <cstdlib>
#include <iostream>
#include <limits>
@@ -26,7 +27,7 @@
namespace {
-int BENCHMARK_NOINLINE Factorial(uint32_t n) {
+int BENCHMARK_NOINLINE Factorial(int n) {
return (n == 1) ? 1 : n * Factorial(n - 1);
}
@@ -74,7 +75,8 @@ BENCHMARK_RANGE(BM_CalculatePiRange, 1, 1024 * 1024);
static void BM_CalculatePi(benchmark::State& state) {
static const int depth = 1024;
for (auto _ : state) {
- benchmark::DoNotOptimize(CalculatePi(static_cast<int>(depth)));
+ double pi = CalculatePi(static_cast<int>(depth));
+ benchmark::DoNotOptimize(pi);
}
}
BENCHMARK(BM_CalculatePi)->Threads(8);
@@ -90,11 +92,13 @@ static void BM_SetInsert(benchmark::State& state) {
for (int j = 0; j < state.range(1); ++j) data.insert(rand());
}
state.SetItemsProcessed(state.iterations() * state.range(1));
- state.SetBytesProcessed(state.iterations() * state.range(1) * sizeof(int));
+ state.SetBytesProcessed(state.iterations() * state.range(1) *
+ static_cast<int64_t>(sizeof(int)));
}
-// Test many inserts at once to reduce the total iterations needed. Otherwise, the slower,
-// non-timed part of each iteration will make the benchmark take forever.
+// Test many inserts at once to reduce the total iterations needed. Otherwise,
+// the slower, non-timed part of each iteration will make the benchmark take
+// forever.
BENCHMARK(BM_SetInsert)->Ranges({{1 << 10, 8 << 10}, {128, 512}});
template <typename Container,
@@ -107,7 +111,7 @@ static void BM_Sequential(benchmark::State& state) {
}
const int64_t items_processed = state.iterations() * state.range(0);
state.SetItemsProcessed(items_processed);
- state.SetBytesProcessed(items_processed * sizeof(v));
+ state.SetBytesProcessed(items_processed * static_cast<int64_t>(sizeof(v)));
}
BENCHMARK_TEMPLATE2(BM_Sequential, std::vector<int>, int)
->Range(1 << 0, 1 << 10);
@@ -121,12 +125,15 @@ static void BM_StringCompare(benchmark::State& state) {
size_t len = static_cast<size_t>(state.range(0));
std::string s1(len, '-');
std::string s2(len, '-');
- for (auto _ : state) benchmark::DoNotOptimize(s1.compare(s2));
+ for (auto _ : state) {
+ auto comp = s1.compare(s2);
+ benchmark::DoNotOptimize(comp);
+ }
}
BENCHMARK(BM_StringCompare)->Range(1, 1 << 20);
static void BM_SetupTeardown(benchmark::State& state) {
- if (state.thread_index == 0) {
+ if (state.thread_index() == 0) {
// No need to lock test_vector_mu here as this is running single-threaded.
test_vector = new std::vector<int>();
}
@@ -139,7 +146,7 @@ static void BM_SetupTeardown(benchmark::State& state) {
test_vector->pop_back();
++i;
}
- if (state.thread_index == 0) {
+ if (state.thread_index() == 0) {
delete test_vector;
}
}
@@ -156,11 +163,11 @@ BENCHMARK(BM_LongTest)->Range(1 << 16, 1 << 28);
static void BM_ParallelMemset(benchmark::State& state) {
int64_t size = state.range(0) / static_cast<int64_t>(sizeof(int));
- int thread_size = static_cast<int>(size) / state.threads;
- int from = thread_size * state.thread_index;
+ int thread_size = static_cast<int>(size) / state.threads();
+ int from = thread_size * state.thread_index();
int to = from + thread_size;
- if (state.thread_index == 0) {
+ if (state.thread_index() == 0) {
test_vector = new std::vector<int>(static_cast<size_t>(size));
}
@@ -168,11 +175,11 @@ static void BM_ParallelMemset(benchmark::State& state) {
for (int i = from; i < to; i++) {
// No need to lock test_vector_mu as ranges
// do not overlap between threads.
- benchmark::DoNotOptimize(test_vector->at(i) = 1);
+ benchmark::DoNotOptimize(test_vector->at(static_cast<size_t>(i)) = 1);
}
}
- if (state.thread_index == 0) {
+ if (state.thread_index() == 0) {
delete test_vector;
}
}
@@ -214,7 +221,8 @@ BENCHMARK_CAPTURE(BM_with_args, string_and_pair_test, std::string("abc"),
std::pair<int, double>(42, 3.8));
void BM_non_template_args(benchmark::State& state, int, double) {
- while(state.KeepRunning()) {}
+ while (state.KeepRunning()) {
+ }
}
BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
@@ -223,14 +231,14 @@ BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
static void BM_DenseThreadRanges(benchmark::State& st) {
switch (st.range(0)) {
case 1:
- assert(st.threads == 1 || st.threads == 2 || st.threads == 3);
+ assert(st.threads() == 1 || st.threads() == 2 || st.threads() == 3);
break;
case 2:
- assert(st.threads == 1 || st.threads == 3 || st.threads == 4);
+ assert(st.threads() == 1 || st.threads() == 3 || st.threads() == 4);
break;
case 3:
- assert(st.threads == 5 || st.threads == 8 || st.threads == 11 ||
- st.threads == 14);
+ assert(st.threads() == 5 || st.threads() == 8 || st.threads() == 11 ||
+ st.threads() == 14);
break;
default:
assert(false && "Invalid test case number");
@@ -242,4 +250,25 @@ BENCHMARK(BM_DenseThreadRanges)->Arg(1)->DenseThreadRange(1, 3);
BENCHMARK(BM_DenseThreadRanges)->Arg(2)->DenseThreadRange(1, 4, 2);
BENCHMARK(BM_DenseThreadRanges)->Arg(3)->DenseThreadRange(5, 14, 3);
+static void BM_BenchmarkName(benchmark::State& state) {
+ for (auto _ : state) {
+ }
+
+ // Check that the benchmark name is passed correctly to `state`.
+ assert("BM_BenchmarkName" == state.name());
+}
+BENCHMARK(BM_BenchmarkName);
+
+// regression test for #1446
+template <typename type>
+static void BM_templated_test(benchmark::State& state) {
+ for (auto _ : state) {
+ type created_string;
+ benchmark::DoNotOptimize(created_string);
+ }
+}
+
+static auto BM_templated_test_double = BM_templated_test<std::complex<double>>;
+BENCHMARK(BM_templated_test_double);
+
BENCHMARK_MAIN();
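
Many edits in this file track the API change from the thread_index/threads data members to the thread_index()/threads() accessors. A sketch of the per-thread setup pattern mirroring BM_SetupTeardown and BM_ParallelMemset above (BM_PerThread and the counter are illustrative):

#include <benchmark/benchmark.h>

#include <atomic>

static std::atomic<int> hits{0};

static void BM_PerThread(benchmark::State& state) {
  if (state.thread_index() == 0) {
    hits = 0;  // thread 0 prepares the shared state, as in the tests above
  }
  for (auto _ : state) {
    hits.fetch_add(1, std::memory_order_relaxed);
  }
  if (state.thread_index() == 0) {
    // thread 0 would tear the shared state down again here
  }
}
BENCHMARK(BM_PerThread)->Threads(4);
BENCHMARK_MAIN();
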
diff --git a/test/clobber_memory_assembly_test.cc b/test/clobber_memory_assembly_test.cc
index f41911a..54e26cc 100644
--- a/test/clobber_memory_assembly_test.cc
+++ b/test/clobber_memory_assembly_test.cc
@@ -3,13 +3,13 @@
#ifdef __clang__
#pragma clang diagnostic ignored "-Wreturn-type"
#endif
+BENCHMARK_DISABLE_DEPRECATED_WARNING
extern "C" {
extern int ExternInt;
extern int ExternInt2;
extern int ExternInt3;
-
}
// CHECK-LABEL: test_basic:
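
The BENCHMARK_DISABLE_DEPRECATED_WARNING above, and the many rewrites from DoNotOptimize(state.iterations()) to a named local throughout this patch, follow from the deprecation of the DoNotOptimize(Tp const&) overload. A sketch of the preferred lvalue form (BM_UseLvalue is illustrative):

#include <benchmark/benchmark.h>

static void BM_UseLvalue(benchmark::State& state) {
  for (auto _ : state) {
    // Store the result first, then pass the lvalue; passing a temporary would
    // hit the deprecated const-reference overload.
    auto iterations = state.iterations();
    benchmark::DoNotOptimize(iterations);
  }
}
BENCHMARK(BM_UseLvalue);
BENCHMARK_MAIN();
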
diff --git a/test/commandlineflags_gtest.cc b/test/commandlineflags_gtest.cc
index 656020f..8412008 100644
--- a/test/commandlineflags_gtest.cc
+++ b/test/commandlineflags_gtest.cc
@@ -2,6 +2,7 @@
#include "../src/commandlineflags.h"
#include "../src/internal_macros.h"
+#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace benchmark {
@@ -19,9 +20,7 @@ int setenv(const char* name, const char* value, int overwrite) {
return _putenv_s(name, value);
}
-int unsetenv(const char* name) {
- return _putenv_s(name, "");
-}
+int unsetenv(const char* name) { return _putenv_s(name, ""); }
#endif // BENCHMARK_OS_WINDOWS
@@ -197,5 +196,33 @@ TEST(StringFromEnv, Valid) {
unsetenv("IN_ENV");
}
+TEST(KvPairsFromEnv, Default) {
+ ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+ EXPECT_THAT(KvPairsFromEnv("not_in_env", {{"foo", "bar"}}),
+ testing::ElementsAre(testing::Pair("foo", "bar")));
+}
+
+TEST(KvPairsFromEnv, MalformedReturnsDefault) {
+ ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+ EXPECT_THAT(KvPairsFromEnv("in_env", {{"foo", "bar"}}),
+ testing::ElementsAre(testing::Pair("foo", "bar")));
+ unsetenv("IN_ENV");
+}
+
+TEST(KvPairsFromEnv, Single) {
+ ASSERT_EQ(setenv("IN_ENV", "foo=bar", 1), 0);
+ EXPECT_THAT(KvPairsFromEnv("in_env", {}),
+ testing::ElementsAre(testing::Pair("foo", "bar")));
+ unsetenv("IN_ENV");
+}
+
+TEST(KvPairsFromEnv, Multiple) {
+ ASSERT_EQ(setenv("IN_ENV", "foo=bar,baz=qux", 1), 0);
+ EXPECT_THAT(KvPairsFromEnv("in_env", {}),
+ testing::UnorderedElementsAre(testing::Pair("foo", "bar"),
+ testing::Pair("baz", "qux")));
+ unsetenv("IN_ENV");
+}
+
} // namespace
} // namespace benchmark
diff --git a/test/complexity_test.cc b/test/complexity_test.cc
index 5681fdc..76891e0 100644
--- a/test/complexity_test.cc
+++ b/test/complexity_test.cc
@@ -4,6 +4,7 @@
#include <cmath>
#include <cstdlib>
#include <vector>
+
#include "benchmark/benchmark.h"
#include "output_test.h"
@@ -12,8 +13,10 @@ namespace {
#define ADD_COMPLEXITY_CASES(...) \
int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
-int AddComplexityTest(std::string test_name, std::string big_o_test_name,
- std::string rms_test_name, std::string big_o) {
+int AddComplexityTest(const std::string &test_name,
+ const std::string &big_o_test_name,
+ const std::string &rms_test_name,
+ const std::string &big_o, int family_index) {
SetSubstitutions({{"%name", test_name},
{"%bigo_name", big_o_test_name},
{"%rms_name", rms_test_name},
@@ -25,25 +28,33 @@ int AddComplexityTest(std::string test_name, std::string big_o_test_name,
{{"^%bigo_name %bigo_str %bigo_str[ ]*$"},
{"^%bigo_name", MR_Not}, // Assert we we didn't only matched a name.
{"^%rms_name %rms %rms[ ]*$", MR_Next}});
- AddCases(TC_JSONOut, {{"\"name\": \"%bigo_name\",$"},
- {"\"run_name\": \"%name\",$", MR_Next},
- {"\"run_type\": \"aggregate\",$", MR_Next},
- {"\"repetitions\": %int,$", MR_Next},
- {"\"threads\": 1,$", MR_Next},
- {"\"aggregate_name\": \"BigO\",$", MR_Next},
- {"\"cpu_coefficient\": %float,$", MR_Next},
- {"\"real_coefficient\": %float,$", MR_Next},
- {"\"big_o\": \"%bigo\",$", MR_Next},
- {"\"time_unit\": \"ns\"$", MR_Next},
- {"}", MR_Next},
- {"\"name\": \"%rms_name\",$"},
- {"\"run_name\": \"%name\",$", MR_Next},
- {"\"run_type\": \"aggregate\",$", MR_Next},
- {"\"repetitions\": %int,$", MR_Next},
- {"\"threads\": 1,$", MR_Next},
- {"\"aggregate_name\": \"RMS\",$", MR_Next},
- {"\"rms\": %float$", MR_Next},
- {"}", MR_Next}});
+ AddCases(
+ TC_JSONOut,
+ {{"\"name\": \"%bigo_name\",$"},
+ {"\"family_index\": " + std::to_string(family_index) + ",$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"%name\",$", MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": %int,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"BigO\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"cpu_coefficient\": %float,$", MR_Next},
+ {"\"real_coefficient\": %float,$", MR_Next},
+ {"\"big_o\": \"%bigo\",$", MR_Next},
+ {"\"time_unit\": \"ns\"$", MR_Next},
+ {"}", MR_Next},
+ {"\"name\": \"%rms_name\",$"},
+ {"\"family_index\": " + std::to_string(family_index) + ",$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"%name\",$", MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": %int,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"RMS\",$", MR_Next},
+ {"\"aggregate_unit\": \"percentage\",$", MR_Next},
+ {"\"rms\": %float$", MR_Next},
+ {"}", MR_Next}});
AddCases(TC_CSVOut, {{"^\"%bigo_name\",,%float,%float,%bigo,,,,,$"},
{"^\"%bigo_name\"", MR_Not},
{"^\"%rms_name\",,%float,%float,,,,,,$", MR_Next}});
@@ -56,10 +67,10 @@ int AddComplexityTest(std::string test_name, std::string big_o_test_name,
// --------------------------- Testing BigO O(1) --------------------------- //
// ========================================================================= //
-void BM_Complexity_O1(benchmark::State& state) {
+void BM_Complexity_O1(benchmark::State &state) {
for (auto _ : state) {
for (int i = 0; i < 1024; ++i) {
- benchmark::DoNotOptimize(&i);
+ benchmark::DoNotOptimize(i);
}
}
state.SetComplexityN(state.range(0));
@@ -82,15 +93,15 @@ const char *lambda_big_o_1 = "f\\(N\\)";
// Add enum tests
ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
- enum_big_o_1);
+ enum_big_o_1, /*family_index=*/0);
// Add auto enum tests
ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
- auto_big_o_1);
+ auto_big_o_1, /*family_index=*/1);
// Add lambda tests
ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
- lambda_big_o_1);
+ lambda_big_o_1, /*family_index=*/2);
// ========================================================================= //
// --------------------------- Testing BigO O(N) --------------------------- //
@@ -98,19 +109,20 @@ ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
std::vector<int> ConstructRandomVector(int64_t size) {
std::vector<int> v;
- v.reserve(static_cast<int>(size));
+ v.reserve(static_cast<size_t>(size));
for (int i = 0; i < size; ++i) {
v.push_back(static_cast<int>(std::rand() % size));
}
return v;
}
-void BM_Complexity_O_N(benchmark::State& state) {
+void BM_Complexity_O_N(benchmark::State &state) {
auto v = ConstructRandomVector(state.range(0));
// Test worst case scenario (item not in vector)
const int64_t item_not_in_vector = state.range(0) * 2;
for (auto _ : state) {
- benchmark::DoNotOptimize(std::find(v.begin(), v.end(), item_not_in_vector));
+ auto it = std::find(v.begin(), v.end(), item_not_in_vector);
+ benchmark::DoNotOptimize(it);
}
state.SetComplexityN(state.range(0));
}
@@ -137,17 +149,17 @@ const char *lambda_big_o_n = "f\\(N\\)";
// Add enum tests
ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
- enum_auto_big_o_n);
+ enum_auto_big_o_n, /*family_index=*/3);
// Add lambda tests
ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
- lambda_big_o_n);
+ lambda_big_o_n, /*family_index=*/4);
// ========================================================================= //
// ------------------------- Testing BigO O(N*lgN) ------------------------- //
// ========================================================================= //
-static void BM_Complexity_O_N_log_N(benchmark::State& state) {
+static void BM_Complexity_O_N_log_N(benchmark::State &state) {
auto v = ConstructRandomVector(state.range(0));
for (auto _ : state) {
std::sort(v.begin(), v.end());
@@ -163,7 +175,7 @@ BENCHMARK(BM_Complexity_O_N_log_N)
->RangeMultiplier(2)
->Range(1 << 10, 1 << 16)
->Complexity([](benchmark::IterationCount n) {
- return kLog2E * n * log(static_cast<double>(n));
+ return kLog2E * static_cast<double>(n) * log(static_cast<double>(n));
});
BENCHMARK(BM_Complexity_O_N_log_N)
->RangeMultiplier(2)
@@ -178,20 +190,23 @@ const char *lambda_big_o_n_lg_n = "f\\(N\\)";
// Add enum tests
ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
- rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n);
+ rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n,
+ /*family_index=*/6);
// Add lambda tests
ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
- rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n);
+ rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n,
+ /*family_index=*/7);
// ========================================================================= //
// -------- Testing formatting of Complexity with captured args ------------ //
// ========================================================================= //
-void BM_ComplexityCaptureArgs(benchmark::State& state, int n) {
+void BM_ComplexityCaptureArgs(benchmark::State &state, int n) {
for (auto _ : state) {
// This test requires a non-zero CPU time to avoid divide-by-zero
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
state.SetComplexityN(n);
}
@@ -204,7 +219,7 @@ const std::string complexity_capture_name =
"BM_ComplexityCaptureArgs/capture_test";
ADD_COMPLEXITY_CASES(complexity_capture_name, complexity_capture_name + "_BigO",
- complexity_capture_name + "_RMS", "N");
+ complexity_capture_name + "_RMS", "N", /*family_index=*/9);
// ========================================================================= //
// --------------------------- TEST CASES END ------------------------------ //
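Across this file the patch consistently binds the interesting expression to a named local and passes that lvalue to benchmark::DoNotOptimize (instead of a temporary or an address), and it threads an explicit family_index argument through ADD_COMPLEXITY_CASES. A minimal, self-contained sketch of the lvalue pattern; the benchmark name and ranges here are illustrative, not part of the patch:

#include <algorithm>
#include <vector>

#include "benchmark/benchmark.h"

// Hypothetical benchmark showing the pattern used throughout the tests above:
// store the result in a local first, then hand that lvalue to DoNotOptimize.
static void BM_FindWorstCase(benchmark::State& state) {
  std::vector<int> v(static_cast<size_t>(state.range(0)), 1);
  const int needle = -1;  // never present, so std::find scans the whole vector
  for (auto _ : state) {
    auto it = std::find(v.begin(), v.end(), needle);
    benchmark::DoNotOptimize(it);  // lvalue, not a temporary expression
  }
  state.SetComplexityN(state.range(0));
}
BENCHMARK(BM_FindWorstCase)->Range(1 << 10, 1 << 16)->Complexity(benchmark::oN);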
diff --git a/test/cxx03_test.cc b/test/cxx03_test.cc
index c4c9a52..9711c1b 100644
--- a/test/cxx03_test.cc
+++ b/test/cxx03_test.cc
@@ -44,8 +44,7 @@ BENCHMARK_TEMPLATE(BM_template1, long);
BENCHMARK_TEMPLATE1(BM_template1, int);
template <class T>
-struct BM_Fixture : public ::benchmark::Fixture {
-};
+struct BM_Fixture : public ::benchmark::Fixture {};
BENCHMARK_TEMPLATE_F(BM_Fixture, BM_template1, long)(benchmark::State& state) {
BM_empty(state);
@@ -55,8 +54,8 @@ BENCHMARK_TEMPLATE1_F(BM_Fixture, BM_template2, int)(benchmark::State& state) {
}
void BM_counters(benchmark::State& state) {
- BM_empty(state);
- state.counters["Foo"] = 2;
+ BM_empty(state);
+ state.counters["Foo"] = 2;
}
BENCHMARK(BM_counters);
diff --git a/test/diagnostics_test.cc b/test/diagnostics_test.cc
index dd64a33..0cd3edb 100644
--- a/test/diagnostics_test.cc
+++ b/test/diagnostics_test.cc
@@ -26,7 +26,8 @@ void TestHandler() {
}
void try_invalid_pause_resume(benchmark::State& state) {
-#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && !defined(TEST_HAS_NO_EXCEPTIONS)
+#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && \
+ !defined(TEST_HAS_NO_EXCEPTIONS)
try {
state.PauseTiming();
std::abort();
@@ -48,7 +49,8 @@ void BM_diagnostic_test(benchmark::State& state) {
if (called_once == false) try_invalid_pause_resume(state);
for (auto _ : state) {
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
if (called_once == false) try_invalid_pause_resume(state);
@@ -57,14 +59,14 @@ void BM_diagnostic_test(benchmark::State& state) {
}
BENCHMARK(BM_diagnostic_test);
-
void BM_diagnostic_test_keep_running(benchmark::State& state) {
static bool called_once = false;
if (called_once == false) try_invalid_pause_resume(state);
- while(state.KeepRunning()) {
- benchmark::DoNotOptimize(state.iterations());
+ while (state.KeepRunning()) {
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
if (called_once == false) try_invalid_pause_resume(state);
@@ -74,7 +76,16 @@ void BM_diagnostic_test_keep_running(benchmark::State& state) {
BENCHMARK(BM_diagnostic_test_keep_running);
int main(int argc, char* argv[]) {
+#ifdef NDEBUG
+ // This test is exercising functionality for debug builds, which are not
+ // available in release builds. Skip the test if we are in that environment
+ // to avoid a test failure.
+ std::cout << "Diagnostic test disabled in release build" << std::endl;
+ (void)argc;
+ (void)argv;
+#else
benchmark::internal::GetAbortHandler() = &TestHandler;
benchmark::Initialize(&argc, argv);
benchmark::RunSpecifiedBenchmarks();
+#endif
}
diff --git a/test/display_aggregates_only_test.cc b/test/display_aggregates_only_test.cc
index 3c36d3f..6ad65e7 100644
--- a/test/display_aggregates_only_test.cc
+++ b/test/display_aggregates_only_test.cc
@@ -19,21 +19,23 @@ BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->DisplayAggregatesOnly();
int main(int argc, char* argv[]) {
const std::string output = GetFileReporterOutput(argc, argv);
- if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 6 ||
+ if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 7 ||
SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3\"") != 3 ||
SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 ||
SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") !=
1 ||
SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") !=
- 1) {
- std::cout << "Precondition mismatch. Expected to only find 6 "
+ 1 ||
+ SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_cv\"") != 1) {
+ std::cout << "Precondition mismatch. Expected to only find 8 "
"occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n"
"\"name\": \"BM_SummaryRepeat/repeats:3\", "
"\"name\": \"BM_SummaryRepeat/repeats:3\", "
"\"name\": \"BM_SummaryRepeat/repeats:3\", "
"\"name\": \"BM_SummaryRepeat/repeats:3_mean\", "
"\"name\": \"BM_SummaryRepeat/repeats:3_median\", "
- "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire "
+ "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\", "
+ "\"name\": \"BM_SummaryRepeat/repeats:3_cv\"\nThe entire "
"output:\n";
std::cout << output;
return 1;
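The check above now also counts the new "_cv" (coefficient of variation) aggregate that repeated runs emit next to _mean/_median/_stddev. Illustrative registration with a hypothetical benchmark name; DisplayAggregatesOnly() trims the console to the aggregates while the file reporter still receives every repetition plus all four aggregates, which is what the counts above verify:

#include "benchmark/benchmark.h"

static void BM_Repeated(benchmark::State& state) {  // name is illustrative
  for (auto _ : state) {
  }
}
// File/JSON output: three repetition entries plus _mean, _median, _stddev, _cv.
BENCHMARK(BM_Repeated)->Repetitions(3)->DisplayAggregatesOnly();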
diff --git a/test/donotoptimize_assembly_test.cc b/test/donotoptimize_assembly_test.cc
index d4b0bab..dc286f5 100644
--- a/test/donotoptimize_assembly_test.cc
+++ b/test/donotoptimize_assembly_test.cc
@@ -3,19 +3,23 @@
#ifdef __clang__
#pragma clang diagnostic ignored "-Wreturn-type"
#endif
+BENCHMARK_DISABLE_DEPRECATED_WARNING
extern "C" {
extern int ExternInt;
extern int ExternInt2;
extern int ExternInt3;
+extern int BigArray[2049];
+
+const int ConstBigArray[2049]{};
inline int Add42(int x) { return x + 42; }
struct NotTriviallyCopyable {
NotTriviallyCopyable();
explicit NotTriviallyCopyable(int x) : value(x) {}
- NotTriviallyCopyable(NotTriviallyCopyable const&);
+ NotTriviallyCopyable(NotTriviallyCopyable const &);
int value;
};
@@ -24,7 +28,14 @@ struct Large {
int data[2];
};
+struct ExtraLarge {
+ int arr[2049];
+};
}
+
+extern ExtraLarge ExtraLargeObj;
+const ExtraLarge ConstExtraLargeObj{};
+
// CHECK-LABEL: test_with_rvalue:
extern "C" void test_with_rvalue() {
benchmark::DoNotOptimize(Add42(0));
@@ -69,6 +80,22 @@ extern "C" void test_with_large_lvalue() {
// CHECK: ret
}
+// CHECK-LABEL: test_with_extra_large_lvalue_with_op:
+extern "C" void test_with_extra_large_lvalue_with_op() {
+ ExtraLargeObj.arr[16] = 42;
+ benchmark::DoNotOptimize(ExtraLargeObj);
+ // CHECK: movl $42, ExtraLargeObj+64(%rip)
+ // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_big_array_with_op
+extern "C" void test_with_big_array_with_op() {
+ BigArray[16] = 42;
+ benchmark::DoNotOptimize(BigArray);
+ // CHECK: movl $42, BigArray+64(%rip)
+ // CHECK: ret
+}
+
// CHECK-LABEL: test_with_non_trivial_lvalue:
extern "C" void test_with_non_trivial_lvalue() {
NotTriviallyCopyable NTC(ExternInt);
@@ -97,6 +124,18 @@ extern "C" void test_with_large_const_lvalue() {
// CHECK: ret
}
+// CHECK-LABEL: test_with_const_extra_large_obj:
+extern "C" void test_with_const_extra_large_obj() {
+ benchmark::DoNotOptimize(ConstExtraLargeObj);
+ // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_const_big_array
+extern "C" void test_with_const_big_array() {
+ benchmark::DoNotOptimize(ConstBigArray);
+ // CHECK: ret
+}
+
// CHECK-LABEL: test_with_non_trivial_const_lvalue:
extern "C" void test_with_non_trivial_const_lvalue() {
const NotTriviallyCopyable Obj(ExternInt);
@@ -118,8 +157,7 @@ extern "C" int test_div_by_two(int input) {
// CHECK-LABEL: test_inc_integer:
extern "C" int test_inc_integer() {
int x = 0;
- for (int i=0; i < 5; ++i)
- benchmark::DoNotOptimize(++x);
+ for (int i = 0; i < 5; ++i) benchmark::DoNotOptimize(++x);
// CHECK: movl $1, [[DEST:.*]]
// CHECK: {{(addl \$1,|incl)}} [[DEST]]
// CHECK: {{(addl \$1,|incl)}} [[DEST]]
@@ -147,7 +185,7 @@ extern "C" void test_pointer_const_lvalue() {
// CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z]+]])
// CHECK: ret
int x = 42;
- int * const xp = &x;
+ int *const xp = &x;
benchmark::DoNotOptimize(xp);
}
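The new assembly checks above cover objects and arrays too large to travel through registers (2049 ints). A hedged sketch of the same idea at the benchmark level, with an illustrative type name; the CHECK lines in the test are what actually pin down the generated code, this only shows the shape of the call:

#include "benchmark/benchmark.h"

struct Huge {  // illustrative stand-in for ExtraLarge / BigArray
  int arr[2049];
};

static void BM_TouchHuge(benchmark::State& state) {
  Huge h{};
  for (auto _ : state) {
    h.arr[16] = 42;               // the store the assembly test looks for
    benchmark::DoNotOptimize(h);  // keeps the store from being optimized away
  }
}
BENCHMARK(BM_TouchHuge);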
diff --git a/test/donotoptimize_test.cc b/test/donotoptimize_test.cc
index 2ce92d1..04ec938 100644
--- a/test/donotoptimize_test.cc
+++ b/test/donotoptimize_test.cc
@@ -1,33 +1,43 @@
-#include "benchmark/benchmark.h"
-
#include <cstdint>
+#include "benchmark/benchmark.h"
+
namespace {
#if defined(__GNUC__)
-std::uint64_t double_up(const std::uint64_t x) __attribute__((const));
+std::int64_t double_up(const std::int64_t x) __attribute__((const));
#endif
-std::uint64_t double_up(const std::uint64_t x) { return x * 2; }
-}
+std::int64_t double_up(const std::int64_t x) { return x * 2; }
+} // namespace
// Using DoNotOptimize on types like BitRef seem to cause a lot of problems
// with the inline assembly on both GCC and Clang.
struct BitRef {
int index;
- unsigned char &byte;
+ unsigned char& byte;
-public:
+ public:
static BitRef Make() {
static unsigned char arr[2] = {};
BitRef b(1, arr[0]);
return b;
}
-private:
+
+ private:
BitRef(int i, unsigned char& b) : index(i), byte(b) {}
};
int main(int, char*[]) {
// this test verifies compilation of DoNotOptimize() for some types
+ char buffer1[1] = "";
+ benchmark::DoNotOptimize(buffer1);
+
+ char buffer2[2] = "";
+ benchmark::DoNotOptimize(buffer2);
+
+ char buffer3[3] = "";
+ benchmark::DoNotOptimize(buffer3);
+
char buffer8[8] = "";
benchmark::DoNotOptimize(buffer8);
@@ -36,17 +46,24 @@ int main(int, char*[]) {
char buffer1024[1024] = "";
benchmark::DoNotOptimize(buffer1024);
- benchmark::DoNotOptimize(&buffer1024[0]);
+ char* bptr = &buffer1024[0];
+ benchmark::DoNotOptimize(bptr);
int x = 123;
benchmark::DoNotOptimize(x);
- benchmark::DoNotOptimize(&x);
+ int* xp = &x;
+ benchmark::DoNotOptimize(xp);
benchmark::DoNotOptimize(x += 42);
- benchmark::DoNotOptimize(double_up(x));
+ std::int64_t y = double_up(x);
+ benchmark::DoNotOptimize(y);
// These tests are to e
- benchmark::DoNotOptimize(BitRef::Make());
BitRef lval = BitRef::Make();
benchmark::DoNotOptimize(lval);
+
+#ifdef BENCHMARK_HAS_CXX11
+ // Check that accept rvalue.
+ benchmark::DoNotOptimize(BitRef::Make());
+#endif
}
diff --git a/test/filter_test.cc b/test/filter_test.cc
index 0e27065..4c8b8ea 100644
--- a/test/filter_test.cc
+++ b/test/filter_test.cc
@@ -1,36 +1,40 @@
-#include "benchmark/benchmark.h"
-
+#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdlib>
-
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
+#include "benchmark/benchmark.h"
+
namespace {
class TestReporter : public benchmark::ConsoleReporter {
public:
- virtual bool ReportContext(const Context& context) {
+ bool ReportContext(const Context& context) override {
return ConsoleReporter::ReportContext(context);
};
- virtual void ReportRuns(const std::vector<Run>& report) {
+ void ReportRuns(const std::vector<Run>& report) override {
++count_;
+ max_family_index_ = std::max(max_family_index_, report[0].family_index);
ConsoleReporter::ReportRuns(report);
};
- TestReporter() : count_(0) {}
+ TestReporter() : count_(0), max_family_index_(0) {}
- virtual ~TestReporter() {}
+ ~TestReporter() override {}
- size_t GetCount() const { return count_; }
+ int GetCount() const { return count_; }
+
+ int64_t GetMaxFamilyIndex() const { return max_family_index_; }
private:
- mutable size_t count_;
+ mutable int count_;
+ mutable int64_t max_family_index_;
};
} // end namespace
@@ -65,7 +69,7 @@ static void BM_FooBa(benchmark::State& state) {
}
BENCHMARK(BM_FooBa);
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
bool list_only = false;
for (int i = 0; i < argc; ++i)
list_only |= std::string(argv[i]).find("--benchmark_list_tests") !=
@@ -74,13 +78,13 @@ int main(int argc, char **argv) {
benchmark::Initialize(&argc, argv);
TestReporter test_reporter;
- const size_t returned_count =
- benchmark::RunSpecifiedBenchmarks(&test_reporter);
+ const int64_t returned_count =
+ static_cast<int64_t>(benchmark::RunSpecifiedBenchmarks(&test_reporter));
if (argc == 2) {
// Make sure we ran all of the tests
std::stringstream ss(argv[1]);
- size_t expected_return;
+ int64_t expected_return;
ss >> expected_return;
if (returned_count != expected_return) {
@@ -90,14 +94,23 @@ int main(int argc, char **argv) {
return -1;
}
- const size_t expected_reports = list_only ? 0 : expected_return;
- const size_t reports_count = test_reporter.GetCount();
+ const int64_t expected_reports = list_only ? 0 : expected_return;
+ const int64_t reports_count = test_reporter.GetCount();
if (reports_count != expected_reports) {
std::cerr << "ERROR: Expected " << expected_reports
<< " tests to be run but reported_count = " << reports_count
<< std::endl;
return -1;
}
+
+ const int64_t max_family_index = test_reporter.GetMaxFamilyIndex();
+ const int64_t num_families = reports_count == 0 ? 0 : 1 + max_family_index;
+ if (num_families != expected_reports) {
+ std::cerr << "ERROR: Expected " << expected_reports
+ << " test families to be run but num_families = "
+ << num_families << std::endl;
+ return -1;
+ }
}
return 0;
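The reporter in this test now also tracks the largest family_index it sees, so the test can cross-check the number of benchmark families against the number of reports. A minimal sketch of that reporter shape, with an illustrative class name:

#include <algorithm>
#include <cstdint>
#include <vector>

#include "benchmark/benchmark.h"

class FamilyIndexReporter : public benchmark::ConsoleReporter {
 public:
  void ReportRuns(const std::vector<Run>& report) override {
    max_family_index_ = std::max(max_family_index_, report[0].family_index);
    ConsoleReporter::ReportRuns(report);
  }
  int64_t max_family_index() const { return max_family_index_; }

 private:
  int64_t max_family_index_ = 0;
};
// Driven the same way as in the test: benchmark::RunSpecifiedBenchmarks(&rep).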
diff --git a/test/fixture_test.cc b/test/fixture_test.cc
index a331c7d..d1093eb 100644
--- a/test/fixture_test.cc
+++ b/test/fixture_test.cc
@@ -1,33 +1,33 @@
-#include "benchmark/benchmark.h"
-
#include <cassert>
#include <memory>
+#include "benchmark/benchmark.h"
+
#define FIXTURE_BECHMARK_NAME MyFixture
class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
public:
- void SetUp(const ::benchmark::State& state) {
- if (state.thread_index == 0) {
+ void SetUp(const ::benchmark::State& state) override {
+ if (state.thread_index() == 0) {
assert(data.get() == nullptr);
data.reset(new int(42));
}
}
- void TearDown(const ::benchmark::State& state) {
- if (state.thread_index == 0) {
+ void TearDown(const ::benchmark::State& state) override {
+ if (state.thread_index() == 0) {
assert(data.get() != nullptr);
data.reset();
}
}
- ~FIXTURE_BECHMARK_NAME() { assert(data == nullptr); }
+ ~FIXTURE_BECHMARK_NAME() override { assert(data == nullptr); }
std::unique_ptr<int> data;
};
-BENCHMARK_F(FIXTURE_BECHMARK_NAME, Foo)(benchmark::State &st) {
+BENCHMARK_F(FIXTURE_BECHMARK_NAME, Foo)(benchmark::State& st) {
assert(data.get() != nullptr);
assert(*data == 42);
for (auto _ : st) {
@@ -35,7 +35,7 @@ BENCHMARK_F(FIXTURE_BECHMARK_NAME, Foo)(benchmark::State &st) {
}
BENCHMARK_DEFINE_F(FIXTURE_BECHMARK_NAME, Bar)(benchmark::State& st) {
- if (st.thread_index == 0) {
+ if (st.thread_index() == 0) {
assert(data.get() != nullptr);
assert(*data == 42);
}
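state.thread_index is now an accessor, so the fixture calls state.thread_index() rather than reading a public data member. Minimal sketch of the updated idiom, with a hypothetical fixture name:

#include <memory>

#include "benchmark/benchmark.h"

class SharedDataFixture : public ::benchmark::Fixture {
 public:
  void SetUp(const ::benchmark::State& state) override {
    if (state.thread_index() == 0) {  // only the first thread allocates
      data.reset(new int(42));
    }
  }
  void TearDown(const ::benchmark::State& state) override {
    if (state.thread_index() == 0) data.reset();
  }
  std::unique_ptr<int> data;
};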
diff --git a/test/internal_threading_test.cc b/test/internal_threading_test.cc
index 039d7c1..62b5b95 100644
--- a/test/internal_threading_test.cc
+++ b/test/internal_threading_test.cc
@@ -3,6 +3,7 @@
#include <chrono>
#include <thread>
+
#include "../src/timers.h"
#include "benchmark/benchmark.h"
#include "output_test.h"
diff --git a/test/link_main_test.cc b/test/link_main_test.cc
index 241ad5c..e806500 100644
--- a/test/link_main_test.cc
+++ b/test/link_main_test.cc
@@ -2,7 +2,8 @@
void BM_empty(benchmark::State& state) {
for (auto _ : state) {
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
}
BENCHMARK(BM_empty);
diff --git a/test/map_test.cc b/test/map_test.cc
index dbf7982..0fdba7c 100644
--- a/test/map_test.cc
+++ b/test/map_test.cc
@@ -1,8 +1,8 @@
-#include "benchmark/benchmark.h"
-
#include <cstdlib>
#include <map>
+#include "benchmark/benchmark.h"
+
namespace {
std::map<int, int> ConstructRandomMap(int size) {
@@ -24,7 +24,8 @@ static void BM_MapLookup(benchmark::State& state) {
m = ConstructRandomMap(size);
state.ResumeTiming();
for (int i = 0; i < size; ++i) {
- benchmark::DoNotOptimize(m.find(std::rand() % size));
+ auto it = m.find(std::rand() % size);
+ benchmark::DoNotOptimize(it);
}
}
state.SetItemsProcessed(state.iterations() * size);
@@ -34,11 +35,11 @@ BENCHMARK(BM_MapLookup)->Range(1 << 3, 1 << 12);
// Using fixtures.
class MapFixture : public ::benchmark::Fixture {
public:
- void SetUp(const ::benchmark::State& st) {
+ void SetUp(const ::benchmark::State& st) override {
m = ConstructRandomMap(static_cast<int>(st.range(0)));
}
- void TearDown(const ::benchmark::State&) { m.clear(); }
+ void TearDown(const ::benchmark::State&) override { m.clear(); }
std::map<int, int> m;
};
@@ -47,7 +48,8 @@ BENCHMARK_DEFINE_F(MapFixture, Lookup)(benchmark::State& state) {
const int size = static_cast<int>(state.range(0));
for (auto _ : state) {
for (int i = 0; i < size; ++i) {
- benchmark::DoNotOptimize(m.find(std::rand() % size));
+ auto it = m.find(std::rand() % size);
+ benchmark::DoNotOptimize(it);
}
}
state.SetItemsProcessed(state.iterations() * size);
diff --git a/test/memory_manager_test.cc b/test/memory_manager_test.cc
index 90bed16..d94bd51 100644
--- a/test/memory_manager_test.cc
+++ b/test/memory_manager_test.cc
@@ -5,25 +5,28 @@
#include "output_test.h"
class TestMemoryManager : public benchmark::MemoryManager {
- void Start() {}
- void Stop(Result* result) {
- result->num_allocs = 42;
- result->max_bytes_used = 42000;
+ void Start() override {}
+ void Stop(Result& result) override {
+ result.num_allocs = 42;
+ result.max_bytes_used = 42000;
}
};
void BM_empty(benchmark::State& state) {
for (auto _ : state) {
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
}
BENCHMARK(BM_empty);
ADD_CASES(TC_ConsoleOut, {{"^BM_empty %console_report$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_empty\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_empty\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
diff --git a/test/min_time_parse_gtest.cc b/test/min_time_parse_gtest.cc
new file mode 100644
index 0000000..e2bdf67
--- /dev/null
+++ b/test/min_time_parse_gtest.cc
@@ -0,0 +1,30 @@
+#include "../src/benchmark_runner.h"
+#include "gtest/gtest.h"
+
+namespace {
+
+TEST(ParseMinTimeTest, InvalidInput) {
+#if GTEST_HAS_DEATH_TEST
+ // Tests only runnable in debug mode (when BM_CHECK is enabled).
+#ifndef NDEBUG
+#ifndef TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS
+ ASSERT_DEATH_IF_SUPPORTED(
+ { benchmark::internal::ParseBenchMinTime("abc"); },
+ "Malformed seconds value passed to --benchmark_min_time: `abc`");
+
+ ASSERT_DEATH_IF_SUPPORTED(
+ { benchmark::internal::ParseBenchMinTime("123ms"); },
+ "Malformed seconds value passed to --benchmark_min_time: `123ms`");
+
+ ASSERT_DEATH_IF_SUPPORTED(
+ { benchmark::internal::ParseBenchMinTime("1z"); },
+ "Malformed seconds value passed to --benchmark_min_time: `1z`");
+
+ ASSERT_DEATH_IF_SUPPORTED(
+ { benchmark::internal::ParseBenchMinTime("1hs"); },
+ "Malformed seconds value passed to --benchmark_min_time: `1hs`");
+#endif
+#endif
+#endif
+}
+} // namespace
diff --git a/test/multiple_ranges_test.cc b/test/multiple_ranges_test.cc
index b25f40e..5300a96 100644
--- a/test/multiple_ranges_test.cc
+++ b/test/multiple_ranges_test.cc
@@ -1,10 +1,10 @@
-#include "benchmark/benchmark.h"
-
#include <cassert>
#include <iostream>
#include <set>
#include <vector>
+#include "benchmark/benchmark.h"
+
class MultipleRangesFixture : public ::benchmark::Fixture {
public:
MultipleRangesFixture()
@@ -28,7 +28,7 @@ class MultipleRangesFixture : public ::benchmark::Fixture {
{2, 7, 15},
{7, 6, 3}}) {}
- void SetUp(const ::benchmark::State& state) {
+ void SetUp(const ::benchmark::State& state) override {
std::vector<int64_t> ranges = {state.range(0), state.range(1),
state.range(2)};
@@ -39,10 +39,10 @@ class MultipleRangesFixture : public ::benchmark::Fixture {
// NOTE: This is not TearDown as we want to check after _all_ runs are
// complete.
- virtual ~MultipleRangesFixture() {
+ ~MultipleRangesFixture() override {
if (actualValues != expectedValues) {
std::cout << "EXPECTED\n";
- for (auto v : expectedValues) {
+ for (const auto& v : expectedValues) {
std::cout << "{";
for (int64_t iv : v) {
std::cout << iv << ", ";
@@ -50,7 +50,7 @@ class MultipleRangesFixture : public ::benchmark::Fixture {
std::cout << "}\n";
}
std::cout << "ACTUAL\n";
- for (auto v : actualValues) {
+ for (const auto& v : actualValues) {
std::cout << "{";
for (int64_t iv : v) {
std::cout << iv << ", ";
diff --git a/test/options_test.cc b/test/options_test.cc
index 9f9a786..a1b209f 100644
--- a/test/options_test.cc
+++ b/test/options_test.cc
@@ -1,7 +1,8 @@
-#include "benchmark/benchmark.h"
#include <chrono>
#include <thread>
+#include "benchmark/benchmark.h"
+
#if defined(NDEBUG)
#undef NDEBUG
#endif
@@ -32,6 +33,8 @@ BENCHMARK(BM_basic)->DenseRange(10, 15);
BENCHMARK(BM_basic)->Args({42, 42});
BENCHMARK(BM_basic)->Ranges({{64, 512}, {64, 512}});
BENCHMARK(BM_basic)->MinTime(0.7);
+BENCHMARK(BM_basic)->MinWarmUpTime(0.8);
+BENCHMARK(BM_basic)->MinTime(0.1)->MinWarmUpTime(0.2);
BENCHMARK(BM_basic)->UseRealTime();
BENCHMARK(BM_basic)->ThreadRange(2, 4);
BENCHMARK(BM_basic)->ThreadPerCpu();
@@ -64,12 +67,10 @@ void BM_explicit_iteration_count(benchmark::State& state) {
// Test that the requested iteration count is respected.
assert(state.max_iterations == 42);
- size_t actual_iterations = 0;
- for (auto _ : state)
- ++actual_iterations;
+ for (auto _ : state) {
+ }
assert(state.iterations() == state.max_iterations);
assert(state.iterations() == 42);
-
}
BENCHMARK(BM_explicit_iteration_count)->Iterations(42);
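The options test now also exercises the new MinWarmUpTime() setting, which runs un-measured warm-up iterations for at least the given number of seconds before timing starts and can be combined with MinTime(). Illustrative registration; the benchmark name and the 0.2/1.0 durations are arbitrary:

#include "benchmark/benchmark.h"

static void BM_WarmedUp(benchmark::State& state) {
  for (auto _ : state) {
    benchmark::ClobberMemory();  // any cheap body will do here
  }
}
BENCHMARK(BM_WarmedUp)->MinWarmUpTime(0.2)->MinTime(1.0);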
diff --git a/test/output_test.h b/test/output_test.h
index 9385761..c08fe1d 100644
--- a/test/output_test.h
+++ b/test/output_test.h
@@ -85,7 +85,7 @@ std::string GetFileReporterOutput(int argc, char* argv[]);
struct Results;
typedef std::function<void(Results const&)> ResultsCheckFn;
-size_t AddChecker(const char* bm_name_pattern, ResultsCheckFn fn);
+size_t AddChecker(const std::string& bm_name_pattern, const ResultsCheckFn& fn);
// Class holding the results of a benchmark.
// It is passed in calls to checker functions.
@@ -113,13 +113,11 @@ struct Results {
return NumIterations() * GetTime(kRealTime);
}
// get the cpu_time duration of the benchmark in seconds
- double DurationCPUTime() const {
- return NumIterations() * GetTime(kCpuTime);
- }
+ double DurationCPUTime() const { return NumIterations() * GetTime(kCpuTime); }
// get the string for a result by name, or nullptr if the name
// is not found
- const std::string* Get(const char* entry_name) const {
+ const std::string* Get(const std::string& entry_name) const {
auto it = values.find(entry_name);
if (it == values.end()) return nullptr;
return &it->second;
@@ -128,12 +126,12 @@ struct Results {
// get a result by name, parsed as a specific type.
// NOTE: for counters, use GetCounterAs instead.
template <class T>
- T GetAs(const char* entry_name) const;
+ T GetAs(const std::string& entry_name) const;
// counters are written as doubles, so they have to be read first
// as a double, and only then converted to the asked type.
template <class T>
- T GetCounterAs(const char* entry_name) const {
+ T GetCounterAs(const std::string& entry_name) const {
double dval = GetAs<double>(entry_name);
T tval = static_cast<T>(dval);
return tval;
@@ -141,14 +139,14 @@ struct Results {
};
template <class T>
-T Results::GetAs(const char* entry_name) const {
+T Results::GetAs(const std::string& entry_name) const {
auto* sv = Get(entry_name);
- CHECK(sv != nullptr && !sv->empty());
+ BM_CHECK(sv != nullptr && !sv->empty());
std::stringstream ss;
ss << *sv;
T out;
ss >> out;
- CHECK(!ss.fail());
+ BM_CHECK(!ss.fail());
return out;
}
@@ -158,8 +156,8 @@ T Results::GetAs(const char* entry_name) const {
// clang-format off
-#define _CHECK_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value) \
- CONCAT(CHECK_, relationship) \
+#define CHECK_RESULT_VALUE_IMPL(entry, getfn, var_type, var_name, relationship, value) \
+ CONCAT(BM_CHECK_, relationship) \
(entry.getfn< var_type >(var_name), (value)) << "\n" \
<< __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n" \
<< __FILE__ << ":" << __LINE__ << ": " \
@@ -169,8 +167,8 @@ T Results::GetAs(const char* entry_name) const {
// check with tolerance. eps_factor is the tolerance window, which is
// interpreted relative to value (eg, 0.1 means 10% of value).
-#define _CHECK_FLOAT_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value, eps_factor) \
- CONCAT(CHECK_FLOAT_, relationship) \
+#define CHECK_FLOAT_RESULT_VALUE_IMPL(entry, getfn, var_type, var_name, relationship, value, eps_factor) \
+ CONCAT(BM_CHECK_FLOAT_, relationship) \
(entry.getfn< var_type >(var_name), (value), (eps_factor) * (value)) << "\n" \
<< __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n" \
<< __FILE__ << ":" << __LINE__ << ": " \
@@ -187,16 +185,16 @@ T Results::GetAs(const char* entry_name) const {
<< "%)"
#define CHECK_RESULT_VALUE(entry, var_type, var_name, relationship, value) \
- _CHECK_RESULT_VALUE(entry, GetAs, var_type, var_name, relationship, value)
+ CHECK_RESULT_VALUE_IMPL(entry, GetAs, var_type, var_name, relationship, value)
#define CHECK_COUNTER_VALUE(entry, var_type, var_name, relationship, value) \
- _CHECK_RESULT_VALUE(entry, GetCounterAs, var_type, var_name, relationship, value)
+ CHECK_RESULT_VALUE_IMPL(entry, GetCounterAs, var_type, var_name, relationship, value)
#define CHECK_FLOAT_RESULT_VALUE(entry, var_name, relationship, value, eps_factor) \
- _CHECK_FLOAT_RESULT_VALUE(entry, GetAs, double, var_name, relationship, value, eps_factor)
+ CHECK_FLOAT_RESULT_VALUE_IMPL(entry, GetAs, double, var_name, relationship, value, eps_factor)
#define CHECK_FLOAT_COUNTER_VALUE(entry, var_name, relationship, value, eps_factor) \
- _CHECK_FLOAT_RESULT_VALUE(entry, GetCounterAs, double, var_name, relationship, value, eps_factor)
+ CHECK_FLOAT_RESULT_VALUE_IMPL(entry, GetCounterAs, double, var_name, relationship, value, eps_factor)
// clang-format on
diff --git a/test/output_test_helper.cc b/test/output_test_helper.cc
index 1aebc55..2567370 100644
--- a/test/output_test_helper.cc
+++ b/test/output_test_helper.cc
@@ -10,6 +10,7 @@
#include "../src/benchmark_api_internal.h"
#include "../src/check.h" // NOTE: check.h is for internal use only!
+#include "../src/log.h" // NOTE: log.h is for internal use only
#include "../src/re.h" // NOTE: re.h is for internal use only
#include "output_test.h"
@@ -40,14 +41,17 @@ SubMap& GetSubstitutions() {
// clang-format off
static std::string safe_dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?";
static std::string time_re = "([0-9]+[.])?[0-9]+";
+ static std::string percentage_re = "[0-9]+[.][0-9]{2}";
static SubMap map = {
{"%float", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"},
// human-readable float
- {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kMGTPEZYmunpfazy]?"},
+ {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kKMGTPEZYmunpfazy]?i?"},
+ {"%percentage", percentage_re},
{"%int", "[ ]*[0-9]+"},
{" %s ", "[ ]+"},
{"%time", "[ ]*" + time_re + "[ ]+ns"},
{"%console_report", "[ ]*" + time_re + "[ ]+ns [ ]*" + time_re + "[ ]+ns [ ]*[0-9]+"},
+ {"%console_percentage_report", "[ ]*" + percentage_re + "[ ]+% [ ]*" + percentage_re + "[ ]+% [ ]*[0-9]+"},
{"%console_us_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us [ ]*[0-9]+"},
{"%console_ms_report", "[ ]*" + time_re + "[ ]+ms [ ]*" + time_re + "[ ]+ms [ ]*[0-9]+"},
{"%console_s_report", "[ ]*" + time_re + "[ ]+s [ ]*" + time_re + "[ ]+s [ ]*[0-9]+"},
@@ -94,27 +98,27 @@ void CheckCase(std::stringstream& remaining_output, TestCase const& TC,
bool on_first = true;
std::string line;
while (remaining_output.eof() == false) {
- CHECK(remaining_output.good());
+ BM_CHECK(remaining_output.good());
std::getline(remaining_output, line);
if (on_first) {
first_line = line;
on_first = false;
}
for (const auto& NC : not_checks) {
- CHECK(!NC.regex->Match(line))
+ BM_CHECK(!NC.regex->Match(line))
<< "Unexpected match for line \"" << line << "\" for MR_Not regex \""
<< NC.regex_str << "\""
<< "\n actual regex string \"" << TC.substituted_regex << "\""
<< "\n started matching near: " << first_line;
}
if (TC.regex->Match(line)) return;
- CHECK(TC.match_rule != MR_Next)
+ BM_CHECK(TC.match_rule != MR_Next)
<< "Expected line \"" << line << "\" to match regex \"" << TC.regex_str
<< "\""
<< "\n actual regex string \"" << TC.substituted_regex << "\""
<< "\n started matching near: " << first_line;
}
- CHECK(remaining_output.eof() == false)
+ BM_CHECK(remaining_output.eof() == false)
<< "End of output reached before match for regex \"" << TC.regex_str
<< "\" was found"
<< "\n actual regex string \"" << TC.substituted_regex << "\""
@@ -137,14 +141,14 @@ void CheckCases(TestCaseList const& checks, std::stringstream& output) {
class TestReporter : public benchmark::BenchmarkReporter {
public:
TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
- : reporters_(reps) {}
+ : reporters_(std::move(reps)) {}
- virtual bool ReportContext(const Context& context) {
+ bool ReportContext(const Context& context) override {
bool last_ret = false;
bool first = true;
for (auto rep : reporters_) {
bool new_ret = rep->ReportContext(context);
- CHECK(first || new_ret == last_ret)
+ BM_CHECK(first || new_ret == last_ret)
<< "Reports return different values for ReportContext";
first = false;
last_ret = new_ret;
@@ -153,10 +157,10 @@ class TestReporter : public benchmark::BenchmarkReporter {
return last_ret;
}
- void ReportRuns(const std::vector<Run>& report) {
+ void ReportRuns(const std::vector<Run>& report) override {
for (auto rep : reporters_) rep->ReportRuns(report);
}
- void Finalize() {
+ void Finalize() override {
for (auto rep : reporters_) rep->Finalize();
}
@@ -179,7 +183,7 @@ class ResultsChecker {
public:
struct PatternAndFn : public TestCase { // reusing TestCase for its regexes
PatternAndFn(const std::string& rx, ResultsCheckFn fn_)
- : TestCase(rx), fn(fn_) {}
+ : TestCase(rx), fn(std::move(fn_)) {}
ResultsCheckFn fn;
};
@@ -187,7 +191,7 @@ class ResultsChecker {
std::vector<Results> results;
std::vector<std::string> field_names;
- void Add(const std::string& entry_pattern, ResultsCheckFn fn);
+ void Add(const std::string& entry_pattern, const ResultsCheckFn& fn);
void CheckResults(std::stringstream& output);
@@ -206,7 +210,8 @@ ResultsChecker& GetResultsChecker() {
}
// add a results checker for a benchmark
-void ResultsChecker::Add(const std::string& entry_pattern, ResultsCheckFn fn) {
+void ResultsChecker::Add(const std::string& entry_pattern,
+ const ResultsCheckFn& fn) {
check_patterns.emplace_back(entry_pattern, fn);
}
@@ -226,7 +231,7 @@ void ResultsChecker::CheckResults(std::stringstream& output) {
std::string line;
bool on_first = true;
while (output.eof() == false) {
- CHECK(output.good());
+ BM_CHECK(output.good());
std::getline(output, line);
if (on_first) {
SetHeader_(line); // this is important
@@ -237,18 +242,17 @@ void ResultsChecker::CheckResults(std::stringstream& output) {
}
// finally we can call the subscribed check functions
for (const auto& p : check_patterns) {
- VLOG(2) << "--------------------------------\n";
- VLOG(2) << "checking for benchmarks matching " << p.regex_str << "...\n";
+ BM_VLOG(2) << "--------------------------------\n";
+ BM_VLOG(2) << "checking for benchmarks matching " << p.regex_str << "...\n";
for (const auto& r : results) {
if (!p.regex->Match(r.name)) {
- VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n";
+ BM_VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n";
continue;
- } else {
- VLOG(2) << p.regex_str << " is matched by " << r.name << "\n";
}
- VLOG(1) << "Checking results of " << r.name << ": ... \n";
+ BM_VLOG(2) << p.regex_str << " is matched by " << r.name << "\n";
+ BM_VLOG(1) << "Checking results of " << r.name << ": ... \n";
p.fn(r);
- VLOG(1) << "Checking results of " << r.name << ": OK.\n";
+ BM_VLOG(1) << "Checking results of " << r.name << ": OK.\n";
}
}
}
@@ -261,9 +265,9 @@ void ResultsChecker::SetHeader_(const std::string& csv_header) {
// set the values for a benchmark
void ResultsChecker::SetValues_(const std::string& entry_csv_line) {
if (entry_csv_line.empty()) return; // some lines are empty
- CHECK(!field_names.empty());
+ BM_CHECK(!field_names.empty());
auto vals = SplitCsv_(entry_csv_line);
- CHECK_EQ(vals.size(), field_names.size());
+ BM_CHECK_EQ(vals.size(), field_names.size());
results.emplace_back(vals[0]); // vals[0] is the benchmark name
auto& entry = results.back();
for (size_t i = 1, e = vals.size(); i < e; ++i) {
@@ -278,7 +282,7 @@ std::vector<std::string> ResultsChecker::SplitCsv_(const std::string& line) {
if (!field_names.empty()) out.reserve(field_names.size());
size_t prev = 0, pos = line.find_first_of(','), curr = pos;
while (pos != line.npos) {
- CHECK(curr > 0);
+ BM_CHECK(curr > 0);
if (line[prev] == '"') ++prev;
if (line[curr - 1] == '"') --curr;
out.push_back(line.substr(prev, curr - prev));
@@ -295,7 +299,7 @@ std::vector<std::string> ResultsChecker::SplitCsv_(const std::string& line) {
} // end namespace internal
-size_t AddChecker(const char* bm_name, ResultsCheckFn fn) {
+size_t AddChecker(const std::string& bm_name, const ResultsCheckFn& fn) {
auto& rc = internal::GetResultsChecker();
rc.Add(bm_name, fn);
return rc.results.size();
@@ -309,32 +313,32 @@ int Results::NumThreads() const {
ss << name.substr(pos + 9, end);
int num = 1;
ss >> num;
- CHECK(!ss.fail());
+ BM_CHECK(!ss.fail());
return num;
}
-double Results::NumIterations() const {
- return GetAs<double>("iterations");
-}
+double Results::NumIterations() const { return GetAs<double>("iterations"); }
double Results::GetTime(BenchmarkTime which) const {
- CHECK(which == kCpuTime || which == kRealTime);
+ BM_CHECK(which == kCpuTime || which == kRealTime);
const char* which_str = which == kCpuTime ? "cpu_time" : "real_time";
double val = GetAs<double>(which_str);
auto unit = Get("time_unit");
- CHECK(unit);
+ BM_CHECK(unit);
if (*unit == "ns") {
return val * 1.e-9;
- } else if (*unit == "us") {
+ }
+ if (*unit == "us") {
return val * 1.e-6;
- } else if (*unit == "ms") {
+ }
+ if (*unit == "ms") {
return val * 1.e-3;
- } else if (*unit == "s") {
+ }
+ if (*unit == "s") {
return val;
- } else {
- CHECK(1 == 0) << "unknown time unit: " << *unit;
- return 0;
}
+ BM_CHECK(1 == 0) << "unknown time unit: " << *unit;
+ return 0;
}
// ========================================================================= //
@@ -348,10 +352,10 @@ TestCase::TestCase(std::string re, int rule)
regex(std::make_shared<benchmark::Regex>()) {
std::string err_str;
regex->Init(substituted_regex, &err_str);
- CHECK(err_str.empty()) << "Could not construct regex \"" << substituted_regex
- << "\""
- << "\n originally \"" << regex_str << "\""
- << "\n got error: " << err_str;
+ BM_CHECK(err_str.empty())
+ << "Could not construct regex \"" << substituted_regex << "\""
+ << "\n originally \"" << regex_str << "\""
+ << "\n got error: " << err_str;
}
int AddCases(TestCaseID ID, std::initializer_list<TestCase> il) {
@@ -380,10 +384,8 @@ int SetSubstitutions(
// Disable deprecated warnings temporarily because we need to reference
// CSVReporter but don't want to trigger -Werror=-Wdeprecated-declarations
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
+BENCHMARK_DISABLE_DEPRECATED_WARNING
+
void RunOutputTests(int argc, char* argv[]) {
using internal::GetTestCaseList;
benchmark::Initialize(&argc, argv);
@@ -392,14 +394,14 @@ void RunOutputTests(int argc, char* argv[]) {
benchmark::JSONReporter JR;
benchmark::CSVReporter CSVR;
struct ReporterTest {
- const char* name;
+ std::string name;
std::vector<TestCase>& output_cases;
std::vector<TestCase>& error_cases;
benchmark::BenchmarkReporter& reporter;
std::stringstream out_stream;
std::stringstream err_stream;
- ReporterTest(const char* n, std::vector<TestCase>& out_tc,
+ ReporterTest(const std::string& n, std::vector<TestCase>& out_tc,
std::vector<TestCase>& err_tc,
benchmark::BenchmarkReporter& br)
: name(n), output_cases(out_tc), error_cases(err_tc), reporter(br) {
@@ -407,12 +409,12 @@ void RunOutputTests(int argc, char* argv[]) {
reporter.SetErrorStream(&err_stream);
}
} TestCases[] = {
- {"ConsoleReporter", GetTestCaseList(TC_ConsoleOut),
+ {std::string("ConsoleReporter"), GetTestCaseList(TC_ConsoleOut),
GetTestCaseList(TC_ConsoleErr), CR},
- {"JSONReporter", GetTestCaseList(TC_JSONOut), GetTestCaseList(TC_JSONErr),
- JR},
- {"CSVReporter", GetTestCaseList(TC_CSVOut), GetTestCaseList(TC_CSVErr),
- CSVR},
+ {std::string("JSONReporter"), GetTestCaseList(TC_JSONOut),
+ GetTestCaseList(TC_JSONErr), JR},
+ {std::string("CSVReporter"), GetTestCaseList(TC_CSVOut),
+ GetTestCaseList(TC_CSVErr), CSVR},
};
// Create the test reporter and run the benchmarks.
@@ -421,7 +423,8 @@ void RunOutputTests(int argc, char* argv[]) {
benchmark::RunSpecifiedBenchmarks(&test_rep);
for (auto& rep_test : TestCases) {
- std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n";
+ std::string msg =
+ std::string("\nTesting ") + rep_test.name + std::string(" Output\n");
std::string banner(msg.size() - 1, '-');
std::cout << banner << msg << banner << "\n";
@@ -438,13 +441,11 @@ void RunOutputTests(int argc, char* argv[]) {
// the checks to subscribees.
auto& csv = TestCases[2];
// would use == but gcc spits a warning
- CHECK(std::strcmp(csv.name, "CSVReporter") == 0);
+ BM_CHECK(csv.name == std::string("CSVReporter"));
internal::GetResultsChecker().CheckResults(csv.out_stream);
}
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
+BENCHMARK_RESTORE_DEPRECATED_WARNING
int SubstrCnt(const std::string& haystack, const std::string& pat) {
if (pat.length() == 0) return 0;
@@ -468,9 +469,8 @@ static char RandomHexChar() {
static std::string GetRandomFileName() {
std::string model = "test.%%%%%%";
- for (auto & ch : model) {
- if (ch == '%')
- ch = RandomHexChar();
+ for (auto& ch : model) {
+ if (ch == '%') ch = RandomHexChar();
}
return model;
}
@@ -487,8 +487,7 @@ static std::string GetTempFileName() {
int retries = 3;
while (--retries) {
std::string name = GetRandomFileName();
- if (!FileExists(name))
- return name;
+ if (!FileExists(name)) return name;
}
std::cerr << "Failed to create unique temporary file name" << std::endl;
std::abort();
diff --git a/test/perf_counters_gtest.cc b/test/perf_counters_gtest.cc
new file mode 100644
index 0000000..54c7863
--- /dev/null
+++ b/test/perf_counters_gtest.cc
@@ -0,0 +1,307 @@
+#include <random>
+#include <thread>
+
+#include "../src/perf_counters.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#ifndef GTEST_SKIP
+struct MsgHandler {
+ void operator=(std::ostream&) {}
+};
+#define GTEST_SKIP() return MsgHandler() = std::cout
+#endif
+
+using benchmark::internal::PerfCounters;
+using benchmark::internal::PerfCountersMeasurement;
+using benchmark::internal::PerfCounterValues;
+using ::testing::AllOf;
+using ::testing::Gt;
+using ::testing::Lt;
+
+namespace {
+const char kGenericPerfEvent1[] = "CYCLES";
+const char kGenericPerfEvent2[] = "INSTRUCTIONS";
+
+TEST(PerfCountersTest, Init) {
+ EXPECT_EQ(PerfCounters::Initialize(), PerfCounters::kSupported);
+}
+
+TEST(PerfCountersTest, OneCounter) {
+ if (!PerfCounters::kSupported) {
+ GTEST_SKIP() << "Performance counters not supported.\n";
+ }
+ EXPECT_TRUE(PerfCounters::Initialize());
+ EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1}).num_counters(), 1);
+}
+
+TEST(PerfCountersTest, NegativeTest) {
+ if (!PerfCounters::kSupported) {
+ EXPECT_FALSE(PerfCounters::Initialize());
+ return;
+ }
+ EXPECT_TRUE(PerfCounters::Initialize());
+ // Sanity checks
+ // Create() will always create a valid object, even if passed no or
+ // wrong arguments as the new behavior is to warn and drop unsupported
+ // counters
+ EXPECT_EQ(PerfCounters::Create({}).num_counters(), 0);
+ EXPECT_EQ(PerfCounters::Create({""}).num_counters(), 0);
+ EXPECT_EQ(PerfCounters::Create({"not a counter name"}).num_counters(), 0);
+ {
+ // Try sneaking in a bad egg to see if it is filtered out. The
+ // number of counters has to be two, not zero
+ auto counter =
+ PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1});
+ EXPECT_EQ(counter.num_counters(), 2);
+ EXPECT_EQ(counter.names(), std::vector<std::string>(
+ {kGenericPerfEvent2, kGenericPerfEvent1}));
+ }
+ {
+ // Try sneaking in an outrageous counter, like a fat finger mistake
+ auto counter = PerfCounters::Create(
+ {kGenericPerfEvent2, "not a counter name", kGenericPerfEvent1});
+ EXPECT_EQ(counter.num_counters(), 2);
+ EXPECT_EQ(counter.names(), std::vector<std::string>(
+ {kGenericPerfEvent2, kGenericPerfEvent1}));
+ }
+ {
+ // Finally try a golden input - it should like both of them
+ EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2})
+ .num_counters(),
+ 2);
+ }
+ {
+ // Add a bad apple in the end of the chain to check the edges
+ auto counter = PerfCounters::Create(
+ {kGenericPerfEvent1, kGenericPerfEvent2, "bad event name"});
+ EXPECT_EQ(counter.num_counters(), 2);
+ EXPECT_EQ(counter.names(), std::vector<std::string>(
+ {kGenericPerfEvent1, kGenericPerfEvent2}));
+ }
+}
+
+TEST(PerfCountersTest, Read1Counter) {
+ if (!PerfCounters::kSupported) {
+ GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+ }
+ EXPECT_TRUE(PerfCounters::Initialize());
+ auto counters = PerfCounters::Create({kGenericPerfEvent1});
+ EXPECT_EQ(counters.num_counters(), 1);
+ PerfCounterValues values1(1);
+ EXPECT_TRUE(counters.Snapshot(&values1));
+ EXPECT_GT(values1[0], 0);
+ PerfCounterValues values2(1);
+ EXPECT_TRUE(counters.Snapshot(&values2));
+ EXPECT_GT(values2[0], 0);
+ EXPECT_GT(values2[0], values1[0]);
+}
+
+TEST(PerfCountersTest, Read2Counters) {
+ if (!PerfCounters::kSupported) {
+ GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+ }
+ EXPECT_TRUE(PerfCounters::Initialize());
+ auto counters =
+ PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
+ EXPECT_EQ(counters.num_counters(), 2);
+ PerfCounterValues values1(2);
+ EXPECT_TRUE(counters.Snapshot(&values1));
+ EXPECT_GT(values1[0], 0);
+ EXPECT_GT(values1[1], 0);
+ PerfCounterValues values2(2);
+ EXPECT_TRUE(counters.Snapshot(&values2));
+ EXPECT_GT(values2[0], 0);
+ EXPECT_GT(values2[1], 0);
+}
+
+TEST(PerfCountersTest, ReopenExistingCounters) {
+ // This test works in recent and old Intel hardware, Pixel 3, and Pixel 6.
+ // However we cannot make assumptions beyond 2 HW counters due to Pixel 6.
+ if (!PerfCounters::kSupported) {
+ GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+ }
+ EXPECT_TRUE(PerfCounters::Initialize());
+ std::vector<std::string> kMetrics({kGenericPerfEvent1});
+ std::vector<PerfCounters> counters(2);
+ for (auto& counter : counters) {
+ counter = PerfCounters::Create(kMetrics);
+ }
+ PerfCounterValues values(1);
+ EXPECT_TRUE(counters[0].Snapshot(&values));
+ EXPECT_TRUE(counters[1].Snapshot(&values));
+}
+
+TEST(PerfCountersTest, CreateExistingMeasurements) {
+ // The test works (i.e. causes read to fail) for the assumptions
+ // about hardware capabilities (i.e. small number (2) hardware
+ // counters) at this date,
+ // the same as previous test ReopenExistingCounters.
+ if (!PerfCounters::kSupported) {
+ GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+ }
+ EXPECT_TRUE(PerfCounters::Initialize());
+
+ // This means we will try 10 counters but we can only guarantee
+ // for sure at this time that only 3 will work. Perhaps in the future
+ // we could use libpfm to query for the hardware limits on this
+ // particular platform.
+ const int kMaxCounters = 10;
+ const int kMinValidCounters = 2;
+
+ // Let's use a ubiquitous counter that is guaranteed to work
+ // on all platforms
+ const std::vector<std::string> kMetrics{"cycles"};
+
+ // Cannot create a vector of actual objects because the
+ // copy constructor of PerfCounters is deleted - and so is
+ // implicitly deleted on PerfCountersMeasurement too
+ std::vector<std::unique_ptr<PerfCountersMeasurement>>
+ perf_counter_measurements;
+
+ perf_counter_measurements.reserve(kMaxCounters);
+ for (int j = 0; j < kMaxCounters; ++j) {
+ perf_counter_measurements.emplace_back(
+ new PerfCountersMeasurement(kMetrics));
+ }
+
+ std::vector<std::pair<std::string, double>> measurements;
+
+ // Start all counters together to see if they hold
+ size_t max_counters = kMaxCounters;
+ for (size_t i = 0; i < kMaxCounters; ++i) {
+ auto& counter(*perf_counter_measurements[i]);
+ EXPECT_EQ(counter.num_counters(), 1);
+ if (!counter.Start()) {
+ max_counters = i;
+ break;
+ };
+ }
+
+ ASSERT_GE(max_counters, kMinValidCounters);
+
+ // Start all together
+ for (size_t i = 0; i < max_counters; ++i) {
+ auto& counter(*perf_counter_measurements[i]);
+ EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
+ }
+
+ // Start/stop individually
+ for (size_t i = 0; i < max_counters; ++i) {
+ auto& counter(*perf_counter_measurements[i]);
+ measurements.clear();
+ counter.Start();
+ EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
+ }
+}
+
+// We try to do some meaningful work here but the compiler
+// insists in optimizing away our loop so we had to add a
+// no-optimize macro. In case it fails, we added some entropy
+// to this pool as well.
+
+BENCHMARK_DONT_OPTIMIZE size_t do_work() {
+ static std::mt19937 rd{std::random_device{}()};
+ static std::uniform_int_distribution<size_t> mrand(0, 10);
+ const size_t kNumLoops = 1000000;
+ size_t sum = 0;
+ for (size_t j = 0; j < kNumLoops; ++j) {
+ sum += mrand(rd);
+ }
+ benchmark::DoNotOptimize(sum);
+ return sum;
+}
+
+void measure(size_t threadcount, PerfCounterValues* before,
+ PerfCounterValues* after) {
+ BM_CHECK_NE(before, nullptr);
+ BM_CHECK_NE(after, nullptr);
+ std::vector<std::thread> threads(threadcount);
+ auto work = [&]() { BM_CHECK(do_work() > 1000); };
+
+ // We need to first set up the counters, then start the threads, so the
+ // threads would inherit the counters. But later, we need to first destroy
+ // the thread pool (so all the work finishes), then measure the counters. So
+ // the scopes overlap, and we need to explicitly control the scope of the
+ // threadpool.
+ auto counters =
+ PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
+ for (auto& t : threads) t = std::thread(work);
+ counters.Snapshot(before);
+ for (auto& t : threads) t.join();
+ counters.Snapshot(after);
+}
+
+TEST(PerfCountersTest, MultiThreaded) {
+ if (!PerfCounters::kSupported) {
+ GTEST_SKIP() << "Test skipped because libpfm is not supported.";
+ }
+ EXPECT_TRUE(PerfCounters::Initialize());
+ PerfCounterValues before(2);
+ PerfCounterValues after(2);
+
+ // Notice that this test will work even if we taskset it to a single CPU
+ // In this case the threads will run sequentially
+ // Start two threads and measure the number of combined cycles and
+ // instructions
+ measure(2, &before, &after);
+ std::vector<double> Elapsed2Threads{
+ static_cast<double>(after[0] - before[0]),
+ static_cast<double>(after[1] - before[1])};
+
+ // Start four threads and measure the number of combined cycles and
+ // instructions
+ measure(4, &before, &after);
+ std::vector<double> Elapsed4Threads{
+ static_cast<double>(after[0] - before[0]),
+ static_cast<double>(after[1] - before[1])};
+
+ // The following expectations fail (at least on a beefy workstation with lots
+ // of cpus) - it seems that in some circumstances the runtime of 4 threads
+ // can even be better than with 2.
+ // So instead of expecting 4 threads to be slower, let's just make sure they
+ // do not differ too much in general (one is not more than 10x than the
+ // other).
+ EXPECT_THAT(Elapsed4Threads[0] / Elapsed2Threads[0], AllOf(Gt(0.1), Lt(10)));
+ EXPECT_THAT(Elapsed4Threads[1] / Elapsed2Threads[1], AllOf(Gt(0.1), Lt(10)));
+}
+
+TEST(PerfCountersTest, HardwareLimits) {
+ // The test works (i.e. causes read to fail) for the assumptions
+ // about hardware capabilities (i.e. small number (3-4) hardware
+ // counters) at this date,
+ // the same as previous test ReopenExistingCounters.
+ if (!PerfCounters::kSupported) {
+ GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+ }
+ EXPECT_TRUE(PerfCounters::Initialize());
+
+ // Taken from `perf list`, but focusses only on those HW events that actually
+ // were reported when running `sudo perf stat -a sleep 10`, intersected over
+ // several platforms. All HW events listed in the first command not reported
+ // in the second seem to not work. This is sad as we don't really get to test
+ // the grouping here (groups can contain up to 6 members)...
+ std::vector<std::string> counter_names{
+ "cycles", // leader
+ "instructions", //
+ "branch-misses", //
+ };
+
+ // In the off-chance that some of these values are not supported,
+ // we filter them out so the test will complete without failure
+ // albeit it might not actually test the grouping on that platform
+ std::vector<std::string> valid_names;
+ for (const std::string& name : counter_names) {
+ if (PerfCounters::IsCounterSupported(name)) {
+ valid_names.push_back(name);
+ }
+ }
+ PerfCountersMeasurement counter(valid_names);
+
+ std::vector<std::pair<std::string, double>> measurements;
+
+ counter.Start();
+ EXPECT_TRUE(counter.Stop(measurements));
+}
+
+} // namespace
diff --git a/test/perf_counters_test.cc b/test/perf_counters_test.cc
new file mode 100644
index 0000000..b0a3ab0
--- /dev/null
+++ b/test/perf_counters_test.cc
@@ -0,0 +1,92 @@
+#include <cstdarg>
+#undef NDEBUG
+
+#include "../src/commandlineflags.h"
+#include "../src/perf_counters.h"
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+namespace benchmark {
+
+BM_DECLARE_string(benchmark_perf_counters);
+
+} // namespace benchmark
+
+static void BM_Simple(benchmark::State& state) {
+ for (auto _ : state) {
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
+ }
+}
+BENCHMARK(BM_Simple);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Simple\",$"}});
+
+const int kIters = 1000000;
+
+void BM_WithoutPauseResume(benchmark::State& state) {
+ int n = 0;
+
+ for (auto _ : state) {
+ for (auto i = 0; i < kIters; ++i) {
+ n = 1 - n;
+ benchmark::DoNotOptimize(n);
+ }
+ }
+}
+
+BENCHMARK(BM_WithoutPauseResume);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_WithoutPauseResume\",$"}});
+
+void BM_WithPauseResume(benchmark::State& state) {
+ int m = 0, n = 0;
+
+ for (auto _ : state) {
+ for (auto i = 0; i < kIters; ++i) {
+ n = 1 - n;
+ benchmark::DoNotOptimize(n);
+ }
+
+ state.PauseTiming();
+ for (auto j = 0; j < kIters; ++j) {
+ m = 1 - m;
+ benchmark::DoNotOptimize(m);
+ }
+ state.ResumeTiming();
+ }
+}
+
+BENCHMARK(BM_WithPauseResume);
+
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_WithPauseResume\",$"}});
+
+static void CheckSimple(Results const& e) {
+ CHECK_COUNTER_VALUE(e, double, "CYCLES", GT, 0);
+}
+
+double withoutPauseResumeInstrCount = 0.0;
+double withPauseResumeInstrCount = 0.0;
+
+static void SaveInstrCountWithoutResume(Results const& e) {
+ withoutPauseResumeInstrCount = e.GetAs<double>("INSTRUCTIONS");
+}
+
+static void SaveInstrCountWithResume(Results const& e) {
+ withPauseResumeInstrCount = e.GetAs<double>("INSTRUCTIONS");
+}
+
+CHECK_BENCHMARK_RESULTS("BM_Simple", &CheckSimple);
+CHECK_BENCHMARK_RESULTS("BM_WithoutPauseResume", &SaveInstrCountWithoutResume);
+CHECK_BENCHMARK_RESULTS("BM_WithPauseResume", &SaveInstrCountWithResume);
+
+int main(int argc, char* argv[]) {
+ if (!benchmark::internal::PerfCounters::kSupported) {
+ return 0;
+ }
+ benchmark::FLAGS_benchmark_perf_counters = "CYCLES,INSTRUCTIONS";
+ benchmark::internal::PerfCounters::Initialize();
+ RunOutputTests(argc, argv);
+
+ BM_CHECK_GT(withPauseResumeInstrCount, kIters);
+ BM_CHECK_GT(withoutPauseResumeInstrCount, kIters);
+ BM_CHECK_LT(withPauseResumeInstrCount, 1.5 * withoutPauseResumeInstrCount);
+}
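This test selects the counters programmatically through the internal flag object, as shown above; in an ordinary benchmark binary the same selection would normally come from the command line (my reading of the flag name is --benchmark_perf_counters=CYCLES,INSTRUCTIONS, matching the FLAGS_ variable the test sets), and it requires a libpfm-enabled build. A trivial, hypothetical binary that such a flag could be pointed at:

#include "benchmark/benchmark.h"

// Run as, e.g.:  ./mybench --benchmark_perf_counters=CYCLES,INSTRUCTIONS
// (assumed command-line spelling; the test above sets the flag in code).
static void BM_Spin(benchmark::State& state) {
  int n = 0;
  for (auto _ : state) {
    n = 1 - n;
    benchmark::DoNotOptimize(n);
  }
}
BENCHMARK(BM_Spin);
BENCHMARK_MAIN();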
diff --git a/test/register_benchmark_test.cc b/test/register_benchmark_test.cc
index 3ac5b21..d69d144 100644
--- a/test/register_benchmark_test.cc
+++ b/test/register_benchmark_test.cc
@@ -10,7 +10,7 @@ namespace {
class TestReporter : public benchmark::ConsoleReporter {
public:
- virtual void ReportRuns(const std::vector<Run>& report) {
+ void ReportRuns(const std::vector<Run>& report) override {
all_runs_.insert(all_runs_.end(), begin(report), end(report));
ConsoleReporter::ReportRuns(report);
}
@@ -19,24 +19,24 @@ class TestReporter : public benchmark::ConsoleReporter {
};
struct TestCase {
- std::string name;
- const char* label;
+ const std::string name;
+ const std::string label;
// Note: not explicit as we rely on it being converted through ADD_CASES.
- TestCase(const char* xname) : TestCase(xname, nullptr) {}
- TestCase(const char* xname, const char* xlabel)
+ TestCase(const std::string& xname) : TestCase(xname, "") {}
+ TestCase(const std::string& xname, const std::string& xlabel)
: name(xname), label(xlabel) {}
typedef benchmark::BenchmarkReporter::Run Run;
void CheckRun(Run const& run) const {
// clang-format off
- CHECK(name == run.benchmark_name()) << "expected " << name << " got "
+ BM_CHECK(name == run.benchmark_name()) << "expected " << name << " got "
<< run.benchmark_name();
- if (label) {
- CHECK(run.report_label == label) << "expected " << label << " got "
+ if (!label.empty()) {
+ BM_CHECK(run.report_label == label) << "expected " << label << " got "
<< run.report_label;
} else {
- CHECK(run.report_label == "");
+ BM_CHECK(run.report_label.empty());
}
// clang-format on
}
@@ -45,7 +45,7 @@ struct TestCase {
std::vector<TestCase> ExpectedResults;
int AddCases(std::initializer_list<TestCase> const& v) {
- for (auto N : v) {
+ for (const auto& N : v) {
ExpectedResults.push_back(N);
}
return 0;
@@ -96,6 +96,18 @@ ADD_CASES({"test1", "One"}, {"test2", "Two"}, {"test3", "Three"});
#endif // BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
//----------------------------------------------------------------------------//
+// Test RegisterBenchmark with DISABLED_ benchmark
+//----------------------------------------------------------------------------//
+void DISABLED_BM_function(benchmark::State& state) {
+ for (auto _ : state) {
+ }
+}
+BENCHMARK(DISABLED_BM_function);
+ReturnVal dummy3 = benchmark::RegisterBenchmark("DISABLED_BM_function_manual",
+ DISABLED_BM_function);
+// No need to add cases because we don't expect them to run.
+
+//----------------------------------------------------------------------------//
// Test RegisterBenchmark with different callable types
//----------------------------------------------------------------------------//
@@ -111,7 +123,7 @@ void TestRegistrationAtRuntime() {
{
CustomFixture fx;
benchmark::RegisterBenchmark("custom_fixture", fx);
- AddCases({"custom_fixture"});
+ AddCases({std::string("custom_fixture")});
}
#endif
#ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
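A benchmark whose name starts with DISABLED_ is registered but never run, whether it comes from BENCHMARK() or RegisterBenchmark(), which is why the test adds no expected cases for it. Minimal sketch with an illustrative name:

#include "benchmark/benchmark.h"

static void DISABLED_BM_FlakyOnCI(benchmark::State& state) {
  for (auto _ : state) {
  }
}
BENCHMARK(DISABLED_BM_FlakyOnCI);  // compiled and registered, never executed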
diff --git a/test/repetitions_test.cc b/test/repetitions_test.cc
new file mode 100644
index 0000000..569777d
--- /dev/null
+++ b/test/repetitions_test.cc
@@ -0,0 +1,214 @@
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// ========================================================================= //
+// ------------------------ Testing Basic Output --------------------------- //
+// ========================================================================= //
+
+static void BM_ExplicitRepetitions(benchmark::State& state) {
+ for (auto _ : state) {
+ }
+}
+BENCHMARK(BM_ExplicitRepetitions)->Repetitions(2);
+
+ADD_CASES(TC_ConsoleOut,
+ {{"^BM_ExplicitRepetitions/repeats:2 %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+ {{"^BM_ExplicitRepetitions/repeats:2 %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+ {{"^BM_ExplicitRepetitions/repeats:2_mean %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+ {{"^BM_ExplicitRepetitions/repeats:2_median %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+ {{"^BM_ExplicitRepetitions/repeats:2_stddev %console_report$"}});
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_ExplicitRepetitions/repeats:2\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+ {"\"run_type\": \"iteration\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"repetition_index\": 0,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\"$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_ExplicitRepetitions/repeats:2\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+ {"\"run_type\": \"iteration\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"repetition_index\": 1,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\"$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_ExplicitRepetitions/repeats:2_mean\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"mean\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\"$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_ExplicitRepetitions/repeats:2_median\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"median\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\"$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_ExplicitRepetitions/repeats:2_stddev\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"stddev\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\"$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ExplicitRepetitions/repeats:2\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ExplicitRepetitions/repeats:2\",%csv_report$"}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_ExplicitRepetitions/repeats:2_mean\",%csv_report$"}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_ExplicitRepetitions/repeats:2_median\",%csv_report$"}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_ExplicitRepetitions/repeats:2_stddev\",%csv_report$"}});
+
+// ========================================================================= //
+// --------------------- Testing Implicit Repetitions ---------------------- //
+// ========================================================================= //
+
+static void BM_ImplicitRepetitions(benchmark::State& state) {
+ for (auto _ : state) {
+ }
+}
+BENCHMARK(BM_ImplicitRepetitions);
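+// No Repetitions() call here: the repetition count comes from outside the
+// benchmark definition (presumably a --benchmark_repetitions=3 flag passed by
+// the test harness), which is why three per-repetition runs are expected below.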
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions_mean %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions_median %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions_stddev %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions\",$"},
+ {"\"family_index\": 1,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+ {"\"run_type\": \"iteration\",$", MR_Next},
+ {"\"repetitions\": 3,$", MR_Next},
+ {"\"repetition_index\": 0,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\"$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions\",$"},
+ {"\"family_index\": 1,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+ {"\"run_type\": \"iteration\",$", MR_Next},
+ {"\"repetitions\": 3,$", MR_Next},
+ {"\"repetition_index\": 1,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\"$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions\",$"},
+ {"\"family_index\": 1,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+ {"\"run_type\": \"iteration\",$", MR_Next},
+ {"\"repetitions\": 3,$", MR_Next},
+ {"\"repetition_index\": 2,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\"$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions_mean\",$"},
+ {"\"family_index\": 1,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 3,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"mean\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\"$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions_median\",$"},
+ {"\"family_index\": 1,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 3,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"median\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\"$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions_stddev\",$"},
+ {"\"family_index\": 1,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 3,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"stddev\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\"$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_mean\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_median\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_stddev\",%csv_report$"}});
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/test/report_aggregates_only_test.cc b/test/report_aggregates_only_test.cc
index 9646b9b..47da503 100644
--- a/test/report_aggregates_only_test.cc
+++ b/test/report_aggregates_only_test.cc
@@ -19,17 +19,19 @@ BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly();
int main(int argc, char* argv[]) {
const std::string output = GetFileReporterOutput(argc, argv);
- if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 3 ||
+ if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 4 ||
SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 ||
SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") !=
1 ||
SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") !=
- 1) {
- std::cout << "Precondition mismatch. Expected to only find three "
+ 1 ||
+ SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_cv\"") != 1) {
+ std::cout << "Precondition mismatch. Expected to only find four "
"occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n"
"\"name\": \"BM_SummaryRepeat/repeats:3_mean\", "
"\"name\": \"BM_SummaryRepeat/repeats:3_median\", "
- "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire "
+ "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\", "
+ "\"name\": \"BM_SummaryRepeat/repeats:3_cv\"\nThe entire "
"output:\n";
std::cout << output;
return 1;
diff --git a/test/reporter_output_test.cc b/test/reporter_output_test.cc
index d24a57d..2eb545a 100644
--- a/test/reporter_output_test.cc
+++ b/test/reporter_output_test.cc
@@ -1,5 +1,6 @@
#undef NDEBUG
+#include <numeric>
#include <utility>
#include "benchmark/benchmark.h"
@@ -16,7 +17,7 @@ static int AddContextCases() {
AddCases(TC_ConsoleErr,
{
{"^%int-%int-%intT%int:%int:%int[-+]%int:%int$", MR_Default},
- {"Running .*/reporter_output_test(\\.exe)?$", MR_Next},
+ {"Running .*(/|\\\\)reporter_output_test(\\.exe)?$", MR_Next},
{"Run on \\(%int X %float MHz CPU s?\\)", MR_Next},
});
AddCases(TC_JSONOut,
@@ -71,9 +72,11 @@ BENCHMARK(BM_basic);
ADD_CASES(TC_ConsoleOut, {{"^BM_basic %console_report$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_basic\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_basic\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -90,7 +93,8 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_basic\",%csv_report$"}});
void BM_bytes_per_second(benchmark::State& state) {
for (auto _ : state) {
// This test requires a non-zero CPU time to avoid divide-by-zero
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
state.SetBytesProcessed(1);
}
@@ -99,9 +103,11 @@ BENCHMARK(BM_bytes_per_second);
ADD_CASES(TC_ConsoleOut, {{"^BM_bytes_per_second %console_report "
"bytes_per_second=%float[kM]{0,1}/s$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_bytes_per_second\",$"},
+ {"\"family_index\": 1,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_bytes_per_second\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -119,7 +125,8 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_bytes_per_second\",%csv_bytes_report$"}});
void BM_items_per_second(benchmark::State& state) {
for (auto _ : state) {
// This test requires a non-zero CPU time to avoid divide-by-zero
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
state.SetItemsProcessed(1);
}
@@ -128,9 +135,11 @@ BENCHMARK(BM_items_per_second);
ADD_CASES(TC_ConsoleOut, {{"^BM_items_per_second %console_report "
"items_per_second=%float[kM]{0,1}/s$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_items_per_second\",$"},
+ {"\"family_index\": 2,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_items_per_second\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -154,9 +163,11 @@ BENCHMARK(BM_label);
ADD_CASES(TC_ConsoleOut, {{"^BM_label %console_report some label$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_label\",$"},
+ {"\"family_index\": 3,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_label\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -181,9 +192,11 @@ BENCHMARK(BM_time_label_nanosecond)->Unit(benchmark::kNanosecond);
ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_nanosecond %console_report$"}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_time_label_nanosecond\",$"},
+ {"\"family_index\": 4,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_time_label_nanosecond\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -202,9 +215,11 @@ BENCHMARK(BM_time_label_microsecond)->Unit(benchmark::kMicrosecond);
ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_microsecond %console_us_report$"}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_time_label_microsecond\",$"},
+ {"\"family_index\": 5,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_time_label_microsecond\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -223,9 +238,11 @@ BENCHMARK(BM_time_label_millisecond)->Unit(benchmark::kMillisecond);
ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_millisecond %console_ms_report$"}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_time_label_millisecond\",$"},
+ {"\"family_index\": 6,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_time_label_millisecond\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -243,9 +260,11 @@ BENCHMARK(BM_time_label_second)->Unit(benchmark::kSecond);
ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_second %console_s_report$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_time_label_second\",$"},
+ {"\"family_index\": 7,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_time_label_second\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -267,9 +286,11 @@ void BM_error(benchmark::State& state) {
BENCHMARK(BM_error);
ADD_CASES(TC_ConsoleOut, {{"^BM_error[ ]+ERROR OCCURRED: 'message'$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_error\",$"},
+ {"\"family_index\": 8,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_error\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"error_occurred\": true,$", MR_Next},
@@ -289,15 +310,17 @@ void BM_no_arg_name(benchmark::State& state) {
BENCHMARK(BM_no_arg_name)->Arg(3);
ADD_CASES(TC_ConsoleOut, {{"^BM_no_arg_name/3 %console_report$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"},
+ {"\"family_index\": 9,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_no_arg_name/3\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next}});
ADD_CASES(TC_CSVOut, {{"^\"BM_no_arg_name/3\",%csv_report$"}});
// ========================================================================= //
-// ------------------------ Testing Arg Name Output ----------------------- //
+// ------------------------ Testing Arg Name Output ------------------------ //
// ========================================================================= //
void BM_arg_name(benchmark::State& state) {
@@ -307,9 +330,11 @@ void BM_arg_name(benchmark::State& state) {
BENCHMARK(BM_arg_name)->ArgName("first")->Arg(3);
ADD_CASES(TC_ConsoleOut, {{"^BM_arg_name/first:3 %console_report$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_name/first:3\",$"},
+ {"\"family_index\": 10,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_arg_name/first:3\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next}});
ADD_CASES(TC_CSVOut, {{"^\"BM_arg_name/first:3\",%csv_report$"}});
@@ -327,14 +352,42 @@ ADD_CASES(TC_ConsoleOut,
{{"^BM_arg_names/first:2/5/third:4 %console_report$"}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_arg_names/first:2/5/third:4\",$"},
+ {"\"family_index\": 11,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_arg_names/first:2/5/third:4\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next}});
ADD_CASES(TC_CSVOut, {{"^\"BM_arg_names/first:2/5/third:4\",%csv_report$"}});
// ========================================================================= //
+// ------------------------ Testing Name Output ---------------------------- //
+// ========================================================================= //
+
+void BM_name(benchmark::State& state) {
+ for (auto _ : state) {
+ }
+}
+BENCHMARK(BM_name)->Name("BM_custom_name");
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_custom_name %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_custom_name\",$"},
+ {"\"family_index\": 12,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_custom_name\",$", MR_Next},
+ {"\"run_type\": \"iteration\",$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
+ {"\"repetition_index\": 0,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\"$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_custom_name\",%csv_report$"}});
+
+// ========================================================================= //
// ------------------------ Testing Big Args Output ------------------------ //
// ========================================================================= //
@@ -353,7 +406,8 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_BigArgs/1073741824 %console_report$"},
void BM_Complexity_O1(benchmark::State& state) {
for (auto _ : state) {
// This test requires a non-zero CPU time to avoid divide-by-zero
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
state.SetComplexityN(state.range(0));
}
@@ -381,37 +435,50 @@ ADD_CASES(TC_ConsoleOut,
{"^BM_Repeat/repeats:2_median %console_time_only_report [ ]*2$"},
{"^BM_Repeat/repeats:2_stddev %console_time_only_report [ ]*2$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:2\",$"},
+ {"\"family_index\": 15,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:2\"", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
{"\"repetitions\": 2,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:2\",$"},
+ {"\"family_index\": 15,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
{"\"repetitions\": 2,$", MR_Next},
{"\"repetition_index\": 1,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:2_mean\",$"},
+ {"\"family_index\": 15,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 2,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"mean\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 2,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:2_median\",$"},
+ {"\"family_index\": 15,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 2,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"median\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 2,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:2_stddev\",$"},
+ {"\"family_index\": 15,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 2,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"stddev\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 2,$", MR_Next}});
ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:2\",%csv_report$"},
{"^\"BM_Repeat/repeats:2\",%csv_report$"},
@@ -428,43 +495,58 @@ ADD_CASES(TC_ConsoleOut,
{"^BM_Repeat/repeats:3_median %console_time_only_report [ ]*3$"},
{"^BM_Repeat/repeats:3_stddev %console_time_only_report [ ]*3$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:3\",$"},
+ {"\"family_index\": 16,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:3\",$"},
+ {"\"family_index\": 16,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"repetition_index\": 1,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:3\",$"},
+ {"\"family_index\": 16,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"repetition_index\": 2,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:3_mean\",$"},
+ {"\"family_index\": 16,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"mean\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 3,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:3_median\",$"},
+ {"\"family_index\": 16,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"median\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 3,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:3_stddev\",$"},
+ {"\"family_index\": 16,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"stddev\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 3,$", MR_Next}});
ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"},
{"^\"BM_Repeat/repeats:3\",%csv_report$"},
@@ -483,49 +565,66 @@ ADD_CASES(TC_ConsoleOut,
{"^BM_Repeat/repeats:4_median %console_time_only_report [ ]*4$"},
{"^BM_Repeat/repeats:4_stddev %console_time_only_report [ ]*4$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:4\",$"},
+ {"\"family_index\": 17,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
{"\"repetitions\": 4,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:4\",$"},
+ {"\"family_index\": 17,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
{"\"repetitions\": 4,$", MR_Next},
{"\"repetition_index\": 1,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:4\",$"},
+ {"\"family_index\": 17,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
{"\"repetitions\": 4,$", MR_Next},
{"\"repetition_index\": 2,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:4\",$"},
+ {"\"family_index\": 17,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
{"\"repetitions\": 4,$", MR_Next},
{"\"repetition_index\": 3,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:4_mean\",$"},
+ {"\"family_index\": 17,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 4,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"mean\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 4,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:4_median\",$"},
+ {"\"family_index\": 17,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 4,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"median\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 4,$", MR_Next},
{"\"name\": \"BM_Repeat/repeats:4_stddev\",$"},
+ {"\"family_index\": 17,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 4,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"stddev\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 4,$", MR_Next}});
ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:4\",%csv_report$"},
{"^\"BM_Repeat/repeats:4\",%csv_report$"},
@@ -544,6 +643,8 @@ void BM_RepeatOnce(benchmark::State& state) {
BENCHMARK(BM_RepeatOnce)->Repetitions(1)->ReportAggregatesOnly();
ADD_CASES(TC_ConsoleOut, {{"^BM_RepeatOnce/repeats:1 %console_report$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_RepeatOnce/repeats:1\",$"},
+ {"\"family_index\": 18,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_RepeatOnce/repeats:1\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
{"\"repetitions\": 1,$", MR_Next},
@@ -566,25 +667,34 @@ ADD_CASES(
ADD_CASES(TC_JSONOut,
{{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
{"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"},
+ {"\"family_index\": 19,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"mean\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 3,$", MR_Next},
{"\"name\": \"BM_SummaryRepeat/repeats:3_median\",$"},
+ {"\"family_index\": 19,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"median\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 3,$", MR_Next},
{"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"},
+ {"\"family_index\": 19,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"stddev\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 3,$", MR_Next}});
ADD_CASES(TC_CSVOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
{"^\"BM_SummaryRepeat/repeats:3_mean\",%csv_report$"},
@@ -608,25 +718,34 @@ ADD_CASES(
ADD_CASES(TC_JSONOut,
{{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
{"\"name\": \"BM_SummaryDisplay/repeats:2_mean\",$"},
+ {"\"family_index\": 20,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 2,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"mean\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 2,$", MR_Next},
{"\"name\": \"BM_SummaryDisplay/repeats:2_median\",$"},
+ {"\"family_index\": 20,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 2,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"median\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 2,$", MR_Next},
{"\"name\": \"BM_SummaryDisplay/repeats:2_stddev\",$"},
+ {"\"family_index\": 20,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 2,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"stddev\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 2,$", MR_Next}});
ADD_CASES(TC_CSVOut,
{{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
@@ -654,27 +773,36 @@ ADD_CASES(
ADD_CASES(TC_JSONOut,
{{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
{"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"},
+ {"\"family_index\": 21,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"mean\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 3,$", MR_Next},
{"\"time_unit\": \"us\",?$"},
{"\"name\": \"BM_RepeatTimeUnit/repeats:3_median\",$"},
+ {"\"family_index\": 21,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"median\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 3,$", MR_Next},
{"\"time_unit\": \"us\",?$"},
{"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"},
+ {"\"family_index\": 21,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"stddev\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 3,$", MR_Next},
{"\"time_unit\": \"us\",?$"}});
ADD_CASES(TC_CSVOut,
@@ -722,6 +850,8 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_UserStats/iterations:5/repeats:3/manual_time [ "
ADD_CASES(
TC_JSONOut,
{{"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+ {"\"family_index\": 22,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
@@ -731,6 +861,8 @@ ADD_CASES(
{"\"iterations\": 5,$", MR_Next},
{"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
{"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+ {"\"family_index\": 22,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
@@ -740,6 +872,8 @@ ADD_CASES(
{"\"iterations\": 5,$", MR_Next},
{"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
{"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+ {"\"family_index\": 22,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
@@ -749,39 +883,51 @@ ADD_CASES(
{"\"iterations\": 5,$", MR_Next},
{"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
{"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_mean\",$"},
+ {"\"family_index\": 22,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"mean\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 3,$", MR_Next},
{"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
{"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_median\",$"},
+ {"\"family_index\": 22,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"median\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 3,$", MR_Next},
{"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
{"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_stddev\",$"},
+ {"\"family_index\": 22,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"stddev\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 3,$", MR_Next},
{"\"real_time\": %float,$", MR_Next},
{"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_\",$"},
+ {"\"family_index\": 22,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 3,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 3,$", MR_Next},
{"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}});
ADD_CASES(
@@ -797,6 +943,154 @@ ADD_CASES(
{"^\"BM_UserStats/iterations:5/repeats:3/manual_time_\",%csv_report$"}});
// ========================================================================= //
+// ------------- Testing relative standard deviation statistics ------------ //
+// ========================================================================= //
+
+const auto UserPercentStatistics = [](const std::vector<double>&) {
+ return 1. / 100.;
+};
+void BM_UserPercentStats(benchmark::State& state) {
+ for (auto _ : state) {
+ state.SetIterationTime(150 / 10e8);
+ }
+}
+// clang-format off
+BENCHMARK(BM_UserPercentStats)
+ ->Repetitions(3)
+ ->Iterations(5)
+ ->UseManualTime()
+ ->Unit(benchmark::TimeUnit::kNanosecond)
+ ->ComputeStatistics("", UserPercentStatistics, benchmark::StatisticUnit::kPercentage);
+// clang-format on
+
+// Check that the UserPercentStatistics-provided stat is calculated, and that
+// it is reported after the default ones. The empty string as its name is
+// intentional: it would sort before anything else.
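+// (The lambda above always returns 1./100.; with StatisticUnit::kPercentage
+// this is expected to show up as "1.00 %" in the console expectations and as
+// 1e-2 in the JSON "real_time" field checked below.)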
+ADD_CASES(TC_ConsoleOut,
+ {{"^BM_UserPercentStats/iterations:5/repeats:3/manual_time [ "
+ "]* 150 ns %time [ ]*5$"},
+ {"^BM_UserPercentStats/iterations:5/repeats:3/manual_time [ "
+ "]* 150 ns %time [ ]*5$"},
+ {"^BM_UserPercentStats/iterations:5/repeats:3/manual_time [ "
+ "]* 150 ns %time [ ]*5$"},
+ {"^BM_UserPercentStats/iterations:5/repeats:3/"
+ "manual_time_mean [ ]* 150 ns %time [ ]*3$"},
+ {"^BM_UserPercentStats/iterations:5/repeats:3/"
+ "manual_time_median [ ]* 150 ns %time [ ]*3$"},
+ {"^BM_UserPercentStats/iterations:5/repeats:3/"
+ "manual_time_stddev [ ]* 0.000 ns %time [ ]*3$"},
+ {"^BM_UserPercentStats/iterations:5/repeats:3/manual_time_ "
+ "[ ]* 1.00 % [ ]* 1.00 %[ ]*3$"}});
+ADD_CASES(
+ TC_JSONOut,
+ {{"\"name\": \"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$"},
+ {"\"family_index\": 23,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": "
+ "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+ MR_Next},
+ {"\"run_type\": \"iteration\",$", MR_Next},
+ {"\"repetitions\": 3,$", MR_Next},
+ {"\"repetition_index\": 0,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"iterations\": 5,$", MR_Next},
+ {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+ {"\"name\": \"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$"},
+ {"\"family_index\": 23,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": "
+ "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+ MR_Next},
+ {"\"run_type\": \"iteration\",$", MR_Next},
+ {"\"repetitions\": 3,$", MR_Next},
+ {"\"repetition_index\": 1,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"iterations\": 5,$", MR_Next},
+ {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+ {"\"name\": \"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$"},
+ {"\"family_index\": 23,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": "
+ "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+ MR_Next},
+ {"\"run_type\": \"iteration\",$", MR_Next},
+ {"\"repetitions\": 3,$", MR_Next},
+ {"\"repetition_index\": 2,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"iterations\": 5,$", MR_Next},
+ {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+ {"\"name\": "
+ "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time_mean\",$"},
+ {"\"family_index\": 23,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": "
+ "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+ MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 3,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"mean\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"iterations\": 3,$", MR_Next},
+ {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+ {"\"name\": "
+ "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time_median\",$"},
+ {"\"family_index\": 23,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": "
+ "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+ MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 3,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"median\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"iterations\": 3,$", MR_Next},
+ {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+ {"\"name\": "
+ "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time_stddev\",$"},
+ {"\"family_index\": 23,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": "
+ "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+ MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 3,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"stddev\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"iterations\": 3,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"name\": "
+ "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time_\",$"},
+ {"\"family_index\": 23,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": "
+ "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+ MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 3,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"\",$", MR_Next},
+ {"\"aggregate_unit\": \"percentage\",$", MR_Next},
+ {"\"iterations\": 3,$", MR_Next},
+ {"\"real_time\": 1\\.(0)*e-(0)*2,$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+ "manual_time\",%csv_report$"},
+ {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+ "manual_time\",%csv_report$"},
+ {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+ "manual_time\",%csv_report$"},
+ {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+ "manual_time_mean\",%csv_report$"},
+ {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+ "manual_time_median\",%csv_report$"},
+ {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+ "manual_time_stddev\",%csv_report$"},
+ {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+ "manual_time_\",%csv_report$"}});
+
+// ========================================================================= //
// ------------------------- Testing StrEscape JSON ------------------------ //
// ========================================================================= //
#if 0 // enable when csv testing code correctly handles multi-line fields
@@ -807,9 +1101,11 @@ void BM_JSON_Format(benchmark::State& state) {
}
BENCHMARK(BM_JSON_Format);
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_JSON_Format\",$"},
+ {"\"family_index\": 23,$", MR_Next},
+{"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_JSON_Format\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"error_occurred\": true,$", MR_Next},
diff --git a/test/skip_with_error_test.cc b/test/skip_with_error_test.cc
index 97a2e3c..b4c5e15 100644
--- a/test/skip_with_error_test.cc
+++ b/test/skip_with_error_test.cc
@@ -10,17 +10,17 @@ namespace {
class TestReporter : public benchmark::ConsoleReporter {
public:
- virtual bool ReportContext(const Context& context) {
+ bool ReportContext(const Context& context) override {
return ConsoleReporter::ReportContext(context);
};
- virtual void ReportRuns(const std::vector<Run>& report) {
+ void ReportRuns(const std::vector<Run>& report) override {
all_runs_.insert(all_runs_.end(), begin(report), end(report));
ConsoleReporter::ReportRuns(report);
}
TestReporter() {}
- virtual ~TestReporter() {}
+ ~TestReporter() override {}
mutable std::vector<Run> all_runs_;
};
@@ -33,21 +33,23 @@ struct TestCase {
typedef benchmark::BenchmarkReporter::Run Run;
void CheckRun(Run const& run) const {
- CHECK(name == run.benchmark_name())
+ BM_CHECK(name == run.benchmark_name())
<< "expected " << name << " got " << run.benchmark_name();
- CHECK(error_occurred == run.error_occurred);
- CHECK(error_message == run.error_message);
+ BM_CHECK_EQ(error_occurred,
+ benchmark::internal::SkippedWithError == run.skipped);
+ BM_CHECK(error_message == run.skip_message);
if (error_occurred) {
- // CHECK(run.iterations == 0);
+ // BM_CHECK(run.iterations == 0);
} else {
- CHECK(run.iterations != 0);
+ BM_CHECK(run.iterations != 0);
}
}
};
std::vector<TestCase> ExpectedResults;
-int AddCases(const char* base_name, std::initializer_list<TestCase> const& v) {
+int AddCases(const std::string& base_name,
+ std::initializer_list<TestCase> const& v) {
for (auto TC : v) {
TC.name = base_name + TC.name;
ExpectedResults.push_back(std::move(TC));
@@ -97,7 +99,7 @@ ADD_CASES("BM_error_before_running_range_for", {{"", true, "error message"}});
void BM_error_during_running(benchmark::State& state) {
int first_iter = true;
while (state.KeepRunning()) {
- if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) {
+ if (state.range(0) == 1 && state.thread_index() <= (state.threads() / 2)) {
assert(first_iter);
first_iter = false;
state.SkipWithError("error message");
@@ -119,12 +121,13 @@ ADD_CASES("BM_error_during_running", {{"/1/threads:1", true, "error message"},
void BM_error_during_running_ranged_for(benchmark::State& state) {
assert(state.max_iterations > 3 && "test requires at least a few iterations");
- int first_iter = true;
+ bool first_iter = true;
// NOTE: Users should not write the for loop explicitly.
for (auto It = state.begin(), End = state.end(); It != End; ++It) {
if (state.range(0) == 1) {
assert(first_iter);
first_iter = false;
+ (void)first_iter;
state.SkipWithError("error message");
// Test the unfortunate but documented behavior that the ranged-for loop
// doesn't automatically terminate when SkipWithError is set.
@@ -140,9 +143,10 @@ ADD_CASES("BM_error_during_running_ranged_for",
void BM_error_after_running(benchmark::State& state) {
for (auto _ : state) {
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
- if (state.thread_index <= (state.threads / 2))
+ if (state.thread_index() <= (state.threads() / 2))
state.SkipWithError("error message");
}
BENCHMARK(BM_error_after_running)->ThreadRange(1, 8);
@@ -154,7 +158,7 @@ ADD_CASES("BM_error_after_running", {{"/threads:1", true, "error message"},
void BM_error_while_paused(benchmark::State& state) {
bool first_iter = true;
while (state.KeepRunning()) {
- if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) {
+ if (state.range(0) == 1 && state.thread_index() <= (state.threads() / 2)) {
assert(first_iter);
first_iter = false;
state.PauseTiming();
diff --git a/test/spec_arg_test.cc b/test/spec_arg_test.cc
new file mode 100644
index 0000000..06aafbe
--- /dev/null
+++ b/test/spec_arg_test.cc
@@ -0,0 +1,105 @@
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+// Tests that the benchmark spec from FLAGS_benchmark_filter can be overridden
+// by the argument passed to RunSpecifiedBenchmarks(...).
+
+namespace {
+
+class TestReporter : public benchmark::ConsoleReporter {
+ public:
+ bool ReportContext(const Context& context) override {
+ return ConsoleReporter::ReportContext(context);
+ };
+
+ void ReportRuns(const std::vector<Run>& report) override {
+ assert(report.size() == 1);
+ matched_functions.push_back(report[0].run_name.function_name);
+ ConsoleReporter::ReportRuns(report);
+ };
+
+ TestReporter() {}
+
+ ~TestReporter() override {}
+
+ const std::vector<std::string>& GetMatchedFunctions() const {
+ return matched_functions;
+ }
+
+ private:
+ std::vector<std::string> matched_functions;
+};
+
+} // end namespace
+
+static void BM_NotChosen(benchmark::State& state) {
+ assert(false && "SHOULD NOT BE CALLED");
+ for (auto _ : state) {
+ }
+}
+BENCHMARK(BM_NotChosen);
+
+static void BM_Chosen(benchmark::State& state) {
+ for (auto _ : state) {
+ }
+}
+BENCHMARK(BM_Chosen);
+
+int main(int argc, char** argv) {
+ const std::string flag = "BM_NotChosen";
+
+ // Verify that argv specify --benchmark_filter=BM_NotChosen.
+ bool found = false;
+ for (int i = 0; i < argc; ++i) {
+ if (strcmp("--benchmark_filter=BM_NotChosen", argv[i]) == 0) {
+ found = true;
+ break;
+ }
+ }
+ assert(found);
+
+ benchmark::Initialize(&argc, argv);
+
+ // Check that the current flag value is reported accurately via the
+ // GetBenchmarkFilter() function.
+ if (flag != benchmark::GetBenchmarkFilter()) {
+ std::cerr
+ << "Seeing different value for flags. GetBenchmarkFilter() returns ["
+ << benchmark::GetBenchmarkFilter() << "] expected flag=[" << flag
+ << "]\n";
+ return 1;
+ }
+ TestReporter test_reporter;
+ const char* const spec = "BM_Chosen";
+ const size_t returned_count =
+ benchmark::RunSpecifiedBenchmarks(&test_reporter, spec);
+ assert(returned_count == 1);
+ const std::vector<std::string> matched_functions =
+ test_reporter.GetMatchedFunctions();
+ assert(matched_functions.size() == 1);
+ if (strcmp(spec, matched_functions.front().c_str()) != 0) {
+ std::cerr << "Expected benchmark [" << spec << "] to run, but got ["
+ << matched_functions.front() << "]\n";
+ return 2;
+ }
+
+ // Test that SetBenchmarkFilter works.
+ const std::string golden_value = "golden_value";
+ benchmark::SetBenchmarkFilter(golden_value);
+ std::string current_value = benchmark::GetBenchmarkFilter();
+ if (golden_value != current_value) {
+ std::cerr << "Expected [" << golden_value
+ << "] for --benchmark_filter but got [" << current_value << "]\n";
+ return 3;
+ }
+ return 0;
+}
diff --git a/test/spec_arg_verbosity_test.cc b/test/spec_arg_verbosity_test.cc
new file mode 100644
index 0000000..8f8eb6d
--- /dev/null
+++ b/test/spec_arg_verbosity_test.cc
@@ -0,0 +1,43 @@
+#include <string.h>
+
+#include <iostream>
+
+#include "benchmark/benchmark.h"
+
+// Tests that the user-specified verbosity level can be retrieved.
+static void BM_Verbosity(benchmark::State& state) {
+ for (auto _ : state) {
+ }
+}
+BENCHMARK(BM_Verbosity);
+
+int main(int argc, char** argv) {
+ const int32_t flagv = 42;
+
+ // Verify that argv specify --v=42.
+ bool found = false;
+ for (int i = 0; i < argc; ++i) {
+ if (strcmp("--v=42", argv[i]) == 0) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ std::cerr << "This test requires '--v=42' to be passed as a command-line "
+ << "argument.\n";
+ return 1;
+ }
+
+ benchmark::Initialize(&argc, argv);
+
+ // Check that the current flag value is reported accurately via the
+ // GetBenchmarkVerbosity() function.
+ if (flagv != benchmark::GetBenchmarkVerbosity()) {
+ std::cerr
+ << "Seeing different value for flags. GetBenchmarkVerbosity() returns ["
+ << benchmark::GetBenchmarkVerbosity() << "] expected flag=[" << flagv
+ << "]\n";
+ return 1;
+ }
+ return 0;
+}
diff --git a/test/statistics_gtest.cc b/test/statistics_gtest.cc
index 3ddc72d..1de2d87 100644
--- a/test/statistics_gtest.cc
+++ b/test/statistics_gtest.cc
@@ -25,4 +25,11 @@ TEST(StatisticsTest, StdDev) {
1.151086443322134);
}
+TEST(StatisticsTest, CV) {
+ EXPECT_DOUBLE_EQ(benchmark::StatisticsCV({101, 101, 101, 101}), 0.0);
+ EXPECT_DOUBLE_EQ(benchmark::StatisticsCV({1, 2, 3}), 1. / 2.);
+ EXPECT_DOUBLE_EQ(benchmark::StatisticsCV({2.5, 2.4, 3.3, 4.2, 5.1}),
+ 0.32888184094918121);
+}
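+// Worked check for the {1, 2, 3} case above: the mean is 2 and the sample
+// (n-1) standard deviation is sqrt(((1-2)^2 + (2-2)^2 + (3-2)^2) / 2) = 1,
+// so the coefficient of variation is 1 / 2.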
+
} // end namespace
diff --git a/test/string_util_gtest.cc b/test/string_util_gtest.cc
index 01bf155..67b4bc0 100644
--- a/test/string_util_gtest.cc
+++ b/test/string_util_gtest.cc
@@ -1,9 +1,12 @@
//===---------------------------------------------------------------------===//
-// statistics_test - Unit tests for src/statistics.cc
+// string_util_test - Unit tests for src/string_util.cc
//===---------------------------------------------------------------------===//
-#include "../src/string_util.h"
+#include <tuple>
+
#include "../src/internal_macros.h"
+#include "../src/string_util.h"
+#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace {
@@ -32,7 +35,8 @@ TEST(StringUtilTest, stoul) {
#elif ULONG_MAX == 0xFFFFFFFFFFFFFFFFul
{
size_t pos = 0;
- EXPECT_EQ(0xFFFFFFFFFFFFFFFFul, benchmark::stoul("18446744073709551615", &pos));
+ EXPECT_EQ(0xFFFFFFFFFFFFFFFFul,
+ benchmark::stoul("18446744073709551615", &pos));
EXPECT_EQ(20ul, pos);
}
#endif
@@ -63,91 +67,133 @@ TEST(StringUtilTest, stoul) {
}
#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
{
- ASSERT_THROW(benchmark::stoul("this is a test"), std::invalid_argument);
+ ASSERT_THROW(std::ignore = benchmark::stoul("this is a test"),
+ std::invalid_argument);
}
#endif
}
-TEST(StringUtilTest, stoi) {
- {
- size_t pos = 0;
- EXPECT_EQ(0, benchmark::stoi("0", &pos));
- EXPECT_EQ(1ul, pos);
- }
- {
- size_t pos = 0;
- EXPECT_EQ(-17, benchmark::stoi("-17", &pos));
- EXPECT_EQ(3ul, pos);
- }
- {
- size_t pos = 0;
- EXPECT_EQ(1357, benchmark::stoi("1357", &pos));
- EXPECT_EQ(4ul, pos);
- }
- {
- size_t pos = 0;
- EXPECT_EQ(10, benchmark::stoi("1010", &pos, 2));
- EXPECT_EQ(4ul, pos);
- }
- {
- size_t pos = 0;
- EXPECT_EQ(520, benchmark::stoi("1010", &pos, 8));
- EXPECT_EQ(4ul, pos);
- }
- {
- size_t pos = 0;
- EXPECT_EQ(1010, benchmark::stoi("1010", &pos, 10));
- EXPECT_EQ(4ul, pos);
- }
- {
- size_t pos = 0;
- EXPECT_EQ(4112, benchmark::stoi("1010", &pos, 16));
- EXPECT_EQ(4ul, pos);
- }
- {
- size_t pos = 0;
- EXPECT_EQ(0xBEEF, benchmark::stoi("BEEF", &pos, 16));
- EXPECT_EQ(4ul, pos);
- }
+TEST(StringUtilTest, stoi) {
+{
+  size_t pos = 0;
+  EXPECT_EQ(0, benchmark::stoi("0", &pos));
+  EXPECT_EQ(1ul, pos);
+}
+{
+ size_t pos = 0;
+ EXPECT_EQ(-17, benchmark::stoi("-17", &pos));
+ EXPECT_EQ(3ul, pos);
+}
+{
+ size_t pos = 0;
+ EXPECT_EQ(1357, benchmark::stoi("1357", &pos));
+ EXPECT_EQ(4ul, pos);
+}
+{
+ size_t pos = 0;
+ EXPECT_EQ(10, benchmark::stoi("1010", &pos, 2));
+ EXPECT_EQ(4ul, pos);
+}
+{
+ size_t pos = 0;
+ EXPECT_EQ(520, benchmark::stoi("1010", &pos, 8));
+ EXPECT_EQ(4ul, pos);
+}
+{
+ size_t pos = 0;
+ EXPECT_EQ(1010, benchmark::stoi("1010", &pos, 10));
+ EXPECT_EQ(4ul, pos);
+}
+{
+ size_t pos = 0;
+ EXPECT_EQ(4112, benchmark::stoi("1010", &pos, 16));
+ EXPECT_EQ(4ul, pos);
+}
+{
+ size_t pos = 0;
+ EXPECT_EQ(0xBEEF, benchmark::stoi("BEEF", &pos, 16));
+ EXPECT_EQ(4ul, pos);
+}
#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
- {
- ASSERT_THROW(benchmark::stoi("this is a test"), std::invalid_argument);
- }
+{
+ ASSERT_THROW(std::ignore = benchmark::stoi("this is a test"),
+ std::invalid_argument);
+}
#endif
}
-TEST(StringUtilTest, stod) {
- {
- size_t pos = 0;
- EXPECT_EQ(0.0, benchmark::stod("0", &pos));
- EXPECT_EQ(1ul, pos);
- }
- {
- size_t pos = 0;
- EXPECT_EQ(-84.0, benchmark::stod("-84", &pos));
- EXPECT_EQ(3ul, pos);
- }
- {
- size_t pos = 0;
- EXPECT_EQ(1234.0, benchmark::stod("1234", &pos));
- EXPECT_EQ(4ul, pos);
- }
- {
- size_t pos = 0;
- EXPECT_EQ(1.5, benchmark::stod("1.5", &pos));
- EXPECT_EQ(3ul, pos);
- }
- {
- size_t pos = 0;
- /* Note: exactly representable as double */
- EXPECT_EQ(-1.25e+9, benchmark::stod("-1.25e+9", &pos));
- EXPECT_EQ(8ul, pos);
- }
+TEST(StringUtilTest, stod) {
+{
+  size_t pos = 0;
+  EXPECT_EQ(0.0, benchmark::stod("0", &pos));
+  EXPECT_EQ(1ul, pos);
+}
+{
+ size_t pos = 0;
+ EXPECT_EQ(-84.0, benchmark::stod("-84", &pos));
+ EXPECT_EQ(3ul, pos);
+}
+{
+ size_t pos = 0;
+ EXPECT_EQ(1234.0, benchmark::stod("1234", &pos));
+ EXPECT_EQ(4ul, pos);
+}
+{
+ size_t pos = 0;
+ EXPECT_EQ(1.5, benchmark::stod("1.5", &pos));
+ EXPECT_EQ(3ul, pos);
+}
+{
+ size_t pos = 0;
+ /* Note: exactly representable as double */
+ EXPECT_EQ(-1.25e+9, benchmark::stod("-1.25e+9", &pos));
+ EXPECT_EQ(8ul, pos);
+}
#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
- {
- ASSERT_THROW(benchmark::stod("this is a test"), std::invalid_argument);
- }
+{
+ ASSERT_THROW(std::ignore = benchmark::stod("this is a test"),
+ std::invalid_argument);
+}
#endif
}
+TEST(StringUtilTest, StrSplit) {
+ EXPECT_EQ(benchmark::StrSplit("", ','), std::vector<std::string>{});
+ EXPECT_EQ(benchmark::StrSplit("hello", ','),
+ std::vector<std::string>({"hello"}));
+ EXPECT_EQ(benchmark::StrSplit("hello,there,is,more", ','),
+ std::vector<std::string>({"hello", "there", "is", "more"}));
+}
+
+using HumanReadableFixture = ::testing::TestWithParam<
+ std::tuple<double, benchmark::Counter::OneK, std::string>>;
+
+INSTANTIATE_TEST_SUITE_P(
+ HumanReadableTests, HumanReadableFixture,
+ ::testing::Values(
+ std::make_tuple(0.0, benchmark::Counter::kIs1024, "0"),
+ std::make_tuple(999.0, benchmark::Counter::kIs1024, "999"),
+ std::make_tuple(1000.0, benchmark::Counter::kIs1024, "1000"),
+ std::make_tuple(1024.0, benchmark::Counter::kIs1024, "1Ki"),
+ std::make_tuple(1000 * 1000.0, benchmark::Counter::kIs1024,
+ "976\\.56.Ki"),
+ std::make_tuple(1024 * 1024.0, benchmark::Counter::kIs1024, "1Mi"),
+ std::make_tuple(1000 * 1000 * 1000.0, benchmark::Counter::kIs1024,
+ "953\\.674Mi"),
+ std::make_tuple(1024 * 1024 * 1024.0, benchmark::Counter::kIs1024,
+ "1Gi"),
+ std::make_tuple(0.0, benchmark::Counter::kIs1000, "0"),
+ std::make_tuple(999.0, benchmark::Counter::kIs1000, "999"),
+ std::make_tuple(1000.0, benchmark::Counter::kIs1000, "1k"),
+ std::make_tuple(1024.0, benchmark::Counter::kIs1000, "1.024k"),
+ std::make_tuple(1000 * 1000.0, benchmark::Counter::kIs1000, "1M"),
+ std::make_tuple(1024 * 1024.0, benchmark::Counter::kIs1000,
+ "1\\.04858M"),
+ std::make_tuple(1000 * 1000 * 1000.0, benchmark::Counter::kIs1000,
+ "1G"),
+ std::make_tuple(1024 * 1024 * 1024.0, benchmark::Counter::kIs1000,
+ "1\\.07374G")));
+
+TEST_P(HumanReadableFixture, HumanReadableNumber) {
+ std::string str = benchmark::HumanReadableNumber(std::get<0>(GetParam()),
+ std::get<1>(GetParam()));
+ ASSERT_THAT(str, ::testing::MatchesRegex(std::get<2>(GetParam())));
+}
+
} // end namespace
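The new HumanReadableFixture above feeds (value, base, expected) tuples through gtest's TEST_P/INSTANTIATE_TEST_SUITE_P machinery; the expected strings are regular expressions (hence the escaped dots) because the check uses ::testing::MatchesRegex. Below is a minimal, self-contained sketch of the same pattern; the fixture and test names are illustrative, and the "../src/string_util.h" include path for benchmark::HumanReadableNumber is an assumption about the test layout rather than something stated in this hunk.

    #include <string>
    #include <tuple>

    #include "../src/string_util.h"  // benchmark::HumanReadableNumber (assumed path)
    #include "benchmark/benchmark.h"
    #include "gmock/gmock.h"
    #include "gtest/gtest.h"

    using Param = std::tuple<double, benchmark::Counter::OneK, std::string>;
    class SuffixFixture : public ::testing::TestWithParam<Param> {};

    // (value, base, regex) triples; '.' is escaped because the expected
    // strings are matched as regular expressions, not plain text.
    INSTANTIATE_TEST_SUITE_P(
        Sketch, SuffixFixture,
        ::testing::Values(
            Param{1024.0, benchmark::Counter::kIs1024, "1Ki"},
            Param{1000.0 * 1000.0, benchmark::Counter::kIs1024, "976\\.56.Ki"},
            Param{1000.0, benchmark::Counter::kIs1000, "1k"}));

    TEST_P(SuffixFixture, FormatsWithExpectedSuffix) {
      const std::string s = benchmark::HumanReadableNumber(
          std::get<0>(GetParam()), std::get<1>(GetParam()));
      EXPECT_THAT(s, ::testing::MatchesRegex(std::get<2>(GetParam())));
    }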
diff --git a/test/templated_fixture_test.cc b/test/templated_fixture_test.cc
index fe9865c..af239c3 100644
--- a/test/templated_fixture_test.cc
+++ b/test/templated_fixture_test.cc
@@ -1,9 +1,9 @@
-#include "benchmark/benchmark.h"
-
#include <cassert>
#include <memory>
+#include "benchmark/benchmark.h"
+
template <typename T>
class MyFixture : public ::benchmark::Fixture {
public:
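A templated fixture like MyFixture is normally instantiated and registered through the BENCHMARK_TEMPLATE_F family of macros; a hedged sketch with illustrative names follows (only the macro, DoNotOptimize and ::benchmark::Fixture are library-provided, the fixture/method names are made up here).

    #include "benchmark/benchmark.h"

    template <typename T>
    class AccumulateFixture : public ::benchmark::Fixture {
     public:
      T total{};  // fixture members are visible inside the benchmark body
    };

    // Instantiates AccumulateFixture<int> and registers the benchmark.
    BENCHMARK_TEMPLATE_F(AccumulateFixture, Sum, int)(benchmark::State& st) {
      for (auto _ : st) {
        total += 1;
        benchmark::DoNotOptimize(total);
      }
    }

    BENCHMARK_MAIN();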
diff --git a/test/time_unit_gtest.cc b/test/time_unit_gtest.cc
new file mode 100644
index 0000000..484ecbc
--- /dev/null
+++ b/test/time_unit_gtest.cc
@@ -0,0 +1,37 @@
+#include "../include/benchmark/benchmark.h"
+#include "gtest/gtest.h"
+
+namespace benchmark {
+namespace internal {
+
+namespace {
+
+class DummyBenchmark : public Benchmark {
+ public:
+ DummyBenchmark() : Benchmark("dummy") {}
+ void Run(State&) override {}
+};
+
+TEST(DefaultTimeUnitTest, TimeUnitIsNotSet) {
+ DummyBenchmark benchmark;
+ EXPECT_EQ(benchmark.GetTimeUnit(), kNanosecond);
+}
+
+TEST(DefaultTimeUnitTest, DefaultIsSet) {
+ DummyBenchmark benchmark;
+ EXPECT_EQ(benchmark.GetTimeUnit(), kNanosecond);
+ SetDefaultTimeUnit(kMillisecond);
+ EXPECT_EQ(benchmark.GetTimeUnit(), kMillisecond);
+}
+
+TEST(DefaultTimeUnitTest, DefaultAndExplicitUnitIsSet) {
+ DummyBenchmark benchmark;
+ benchmark.Unit(kMillisecond);
+ SetDefaultTimeUnit(kMicrosecond);
+
+ EXPECT_EQ(benchmark.GetTimeUnit(), kMillisecond);
+}
+
+} // namespace
+} // namespace internal
+} // namespace benchmark
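This new test covers the interaction between the process-wide default time unit and a per-benchmark Unit() override: the explicit unit wins, otherwise the default applies. A hedged sketch of the user-facing side, assuming the SetDefaultTimeUnit() hook exercised here is exposed as benchmark::SetDefaultTimeUnit in benchmark.h:

    #include "benchmark/benchmark.h"

    static void BM_Spin(benchmark::State& state) {
      for (auto _ : state) {
        int x = 0;
        benchmark::DoNotOptimize(x);
      }
    }
    BENCHMARK(BM_Spin);  // no ->Unit(): inherits the default set in main()

    int main(int argc, char** argv) {
      // Report in milliseconds unless a benchmark sets ->Unit() explicitly.
      benchmark::SetDefaultTimeUnit(benchmark::kMillisecond);
      benchmark::Initialize(&argc, argv);
      benchmark::RunSpecifiedBenchmarks();
      benchmark::Shutdown();
      return 0;
    }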
diff --git a/test/user_counters_tabular_test.cc b/test/user_counters_tabular_test.cc
index 18373c0..c98b769 100644
--- a/test/user_counters_tabular_test.cc
+++ b/test/user_counters_tabular_test.cc
@@ -7,19 +7,25 @@
// @todo: <jpmag> this checks the full output at once; the rule for
// CounterSet1 was failing because it was not matching "^[-]+$".
// @todo: <jpmag> check that the counters are vertically aligned.
-ADD_CASES(
- TC_ConsoleOut,
- {
- // keeping these lines long improves readability, so:
- // clang-format off
+ADD_CASES(TC_ConsoleOut,
+ {
+ // keeping these lines long improves readability, so:
+ // clang-format off
{"^[-]+$", MR_Next},
{"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Bat %s Baz %s Foo %s Frob %s Lob$", MR_Next},
{"^[-]+$", MR_Next},
- {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
- {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
- {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
- {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
- {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+ {"^BM_Counters_Tabular/repeats:2/threads:1 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+ {"^BM_Counters_Tabular/repeats:2/threads:1 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+ {"^BM_Counters_Tabular/repeats:2/threads:1_mean %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+ {"^BM_Counters_Tabular/repeats:2/threads:1_median %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+ {"^BM_Counters_Tabular/repeats:2/threads:1_stddev %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+ {"^BM_Counters_Tabular/repeats:2/threads:1_cv %console_percentage_report [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*%$", MR_Next},
+ {"^BM_Counters_Tabular/repeats:2/threads:2 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+ {"^BM_Counters_Tabular/repeats:2/threads:2 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+ {"^BM_Counters_Tabular/repeats:2/threads:2_mean %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+ {"^BM_Counters_Tabular/repeats:2/threads:2_median %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+ {"^BM_Counters_Tabular/repeats:2/threads:2_stddev %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+ {"^BM_Counters_Tabular/repeats:2/threads:2_cv %console_percentage_report [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*%$", MR_Next},
{"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
{"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
{"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
@@ -46,8 +52,8 @@ ADD_CASES(
{"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
{"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
{"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$"},
- // clang-format on
- });
+ // clang-format on
+ });
ADD_CASES(TC_CSVOut, {{"%csv_header,"
"\"Bar\",\"Bat\",\"Baz\",\"Foo\",\"Frob\",\"Lob\""}});
@@ -68,12 +74,15 @@ void BM_Counters_Tabular(benchmark::State& state) {
{"Lob", {32, bm::Counter::kAvgThreads}},
});
}
-BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 16);
+BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 2)->Repetitions(2);
ADD_CASES(TC_JSONOut,
- {{"\"name\": \"BM_Counters_Tabular/threads:%int\",$"},
- {"\"run_name\": \"BM_Counters_Tabular/threads:%int\",$", MR_Next},
+ {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+ MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -87,8 +96,260 @@ ADD_CASES(TC_JSONOut,
{"\"Frob\": %float,$", MR_Next},
{"\"Lob\": %float$", MR_Next},
{"}", MR_Next}});
-ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Tabular/threads:%int\",%csv_report,"
- "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+ MR_Next},
+ {"\"run_type\": \"iteration\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"repetition_index\": 1,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\",$", MR_Next},
+ {"\"Bar\": %float,$", MR_Next},
+ {"\"Bat\": %float,$", MR_Next},
+ {"\"Baz\": %float,$", MR_Next},
+ {"\"Foo\": %float,$", MR_Next},
+ {"\"Frob\": %float,$", MR_Next},
+ {"\"Lob\": %float$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_mean\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+ MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"mean\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\",$", MR_Next},
+ {"\"Bar\": %float,$", MR_Next},
+ {"\"Bat\": %float,$", MR_Next},
+ {"\"Baz\": %float,$", MR_Next},
+ {"\"Foo\": %float,$", MR_Next},
+ {"\"Frob\": %float,$", MR_Next},
+ {"\"Lob\": %float$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_median\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+ MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"median\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\",$", MR_Next},
+ {"\"Bar\": %float,$", MR_Next},
+ {"\"Bat\": %float,$", MR_Next},
+ {"\"Baz\": %float,$", MR_Next},
+ {"\"Foo\": %float,$", MR_Next},
+ {"\"Frob\": %float,$", MR_Next},
+ {"\"Lob\": %float$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_stddev\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+ MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"stddev\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\",$", MR_Next},
+ {"\"Bar\": %float,$", MR_Next},
+ {"\"Bat\": %float,$", MR_Next},
+ {"\"Baz\": %float,$", MR_Next},
+ {"\"Foo\": %float,$", MR_Next},
+ {"\"Frob\": %float,$", MR_Next},
+ {"\"Lob\": %float$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_cv\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
+ {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+ MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"threads\": 1,$", MR_Next},
+ {"\"aggregate_name\": \"cv\",$", MR_Next},
+ {"\"aggregate_unit\": \"percentage\",$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\",$", MR_Next},
+ {"\"Bar\": %float,$", MR_Next},
+ {"\"Bat\": %float,$", MR_Next},
+ {"\"Baz\": %float,$", MR_Next},
+ {"\"Foo\": %float,$", MR_Next},
+ {"\"Frob\": %float,$", MR_Next},
+ {"\"Lob\": %float$", MR_Next},
+ {"}", MR_Next}});
+
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 1,$", MR_Next},
+ {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+ MR_Next},
+ {"\"run_type\": \"iteration\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"repetition_index\": 0,$", MR_Next},
+ {"\"threads\": 2,$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\",$", MR_Next},
+ {"\"Bar\": %float,$", MR_Next},
+ {"\"Bat\": %float,$", MR_Next},
+ {"\"Baz\": %float,$", MR_Next},
+ {"\"Foo\": %float,$", MR_Next},
+ {"\"Frob\": %float,$", MR_Next},
+ {"\"Lob\": %float$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 1,$", MR_Next},
+ {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+ MR_Next},
+ {"\"run_type\": \"iteration\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"repetition_index\": 1,$", MR_Next},
+ {"\"threads\": 2,$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\",$", MR_Next},
+ {"\"Bar\": %float,$", MR_Next},
+ {"\"Bat\": %float,$", MR_Next},
+ {"\"Baz\": %float,$", MR_Next},
+ {"\"Foo\": %float,$", MR_Next},
+ {"\"Frob\": %float,$", MR_Next},
+ {"\"Lob\": %float$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2_median\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 1,$", MR_Next},
+ {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+ MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"threads\": 2,$", MR_Next},
+ {"\"aggregate_name\": \"median\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\",$", MR_Next},
+ {"\"Bar\": %float,$", MR_Next},
+ {"\"Bat\": %float,$", MR_Next},
+ {"\"Baz\": %float,$", MR_Next},
+ {"\"Foo\": %float,$", MR_Next},
+ {"\"Frob\": %float,$", MR_Next},
+ {"\"Lob\": %float$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2_stddev\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 1,$", MR_Next},
+ {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+ MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"threads\": 2,$", MR_Next},
+ {"\"aggregate_name\": \"stddev\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\",$", MR_Next},
+ {"\"Bar\": %float,$", MR_Next},
+ {"\"Bat\": %float,$", MR_Next},
+ {"\"Baz\": %float,$", MR_Next},
+ {"\"Foo\": %float,$", MR_Next},
+ {"\"Frob\": %float,$", MR_Next},
+ {"\"Lob\": %float$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+ {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2_cv\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 1,$", MR_Next},
+ {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+ MR_Next},
+ {"\"run_type\": \"aggregate\",$", MR_Next},
+ {"\"repetitions\": 2,$", MR_Next},
+ {"\"threads\": 2,$", MR_Next},
+ {"\"aggregate_name\": \"cv\",$", MR_Next},
+ {"\"aggregate_unit\": \"percentage\",$", MR_Next},
+ {"\"iterations\": %int,$", MR_Next},
+ {"\"real_time\": %float,$", MR_Next},
+ {"\"cpu_time\": %float,$", MR_Next},
+ {"\"time_unit\": \"ns\",$", MR_Next},
+ {"\"Bar\": %float,$", MR_Next},
+ {"\"Bat\": %float,$", MR_Next},
+ {"\"Baz\": %float,$", MR_Next},
+ {"\"Foo\": %float,$", MR_Next},
+ {"\"Frob\": %float,$", MR_Next},
+ {"\"Lob\": %float$", MR_Next},
+ {"}", MR_Next}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_Counters_Tabular/repeats:2/threads:1\",%csv_report,"
+ "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_Counters_Tabular/repeats:2/threads:1\",%csv_report,"
+ "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_Counters_Tabular/repeats:2/threads:1_mean\",%csv_report,"
+ "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_Counters_Tabular/repeats:2/threads:1_median\",%csv_report,"
+ "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_Counters_Tabular/repeats:2/threads:1_stddev\",%csv_report,"
+ "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_Counters_Tabular/repeats:2/threads:1_cv\",%csv_report,"
+ "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_Counters_Tabular/repeats:2/threads:2\",%csv_report,"
+ "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_Counters_Tabular/repeats:2/threads:2\",%csv_report,"
+ "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_Counters_Tabular/repeats:2/threads:2_mean\",%csv_report,"
+ "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_Counters_Tabular/repeats:2/threads:2_median\",%csv_report,"
+ "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_Counters_Tabular/repeats:2/threads:2_stddev\",%csv_report,"
+ "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+ {{"^\"BM_Counters_Tabular/repeats:2/threads:2_cv\",%csv_report,"
+ "%float,%float,%float,%float,%float,%float$"}});
// VS2013 does not allow this function to be passed as a lambda argument
// to CHECK_BENCHMARK_RESULTS()
void CheckTabular(Results const& e) {
@@ -99,7 +360,10 @@ void CheckTabular(Results const& e) {
CHECK_COUNTER_VALUE(e, int, "Frob", EQ, 16);
CHECK_COUNTER_VALUE(e, int, "Lob", EQ, 32);
}
-CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/threads:%int", &CheckTabular);
+CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/repeats:2/threads:1$",
+ &CheckTabular);
+CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/repeats:2/threads:2$",
+ &CheckTabular);
// ========================================================================= //
// -------------------- Tabular+Rate Counters Output ----------------------- //
@@ -108,7 +372,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/threads:%int", &CheckTabular);
void BM_CounterRates_Tabular(benchmark::State& state) {
for (auto _ : state) {
// This test requires a non-zero CPU time to avoid divide-by-zero
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
namespace bm = benchmark;
state.counters.insert({
@@ -123,10 +388,12 @@ void BM_CounterRates_Tabular(benchmark::State& state) {
BENCHMARK(BM_CounterRates_Tabular)->ThreadRange(1, 16);
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"},
+ {"\"family_index\": 1,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_CounterRates_Tabular/threads:%int\",$",
MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -174,9 +441,11 @@ void BM_CounterSet0_Tabular(benchmark::State& state) {
BENCHMARK(BM_CounterSet0_Tabular)->ThreadRange(1, 16);
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"},
+ {"\"family_index\": 2,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_CounterSet0_Tabular/threads:%int\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -212,9 +481,11 @@ void BM_CounterSet1_Tabular(benchmark::State& state) {
BENCHMARK(BM_CounterSet1_Tabular)->ThreadRange(1, 16);
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"},
+ {"\"family_index\": 3,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_CounterSet1_Tabular/threads:%int\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -254,9 +525,11 @@ void BM_CounterSet2_Tabular(benchmark::State& state) {
BENCHMARK(BM_CounterSet2_Tabular)->ThreadRange(1, 16);
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"},
+ {"\"family_index\": 4,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_CounterSet2_Tabular/threads:%int\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
diff --git a/test/user_counters_test.cc b/test/user_counters_test.cc
index 5699f4f..4cd8ee3 100644
--- a/test/user_counters_test.cc
+++ b/test/user_counters_test.cc
@@ -26,15 +26,17 @@ void BM_Counters_Simple(benchmark::State& state) {
for (auto _ : state) {
}
state.counters["foo"] = 1;
- state.counters["bar"] = 2 * (double)state.iterations();
+ state.counters["bar"] = 2 * static_cast<double>(state.iterations());
}
BENCHMARK(BM_Counters_Simple);
ADD_CASES(TC_ConsoleOut,
{{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Simple\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_Simple\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -65,7 +67,8 @@ int num_calls1 = 0;
void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) {
for (auto _ : state) {
// This test requires a non-zero CPU time to avoid divide-by-zero
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
state.counters["foo"] = 1;
state.counters["bar"] = ++num_calls1;
@@ -78,9 +81,11 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_WithBytesAndItemsPSec %console_report "
"foo=%hrfloat items_per_second=%hrfloat/s$"}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"},
+ {"\"family_index\": 1,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_WithBytesAndItemsPSec\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -114,7 +119,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_WithBytesAndItemsPSec",
void BM_Counters_Rate(benchmark::State& state) {
for (auto _ : state) {
// This test requires a non-zero CPU time to avoid divide-by-zero
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
namespace bm = benchmark;
state.counters["foo"] = bm::Counter{1, bm::Counter::kIsRate};
@@ -125,9 +131,11 @@ ADD_CASES(
TC_ConsoleOut,
{{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Rate\",$"},
+ {"\"family_index\": 2,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_Rate\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -155,7 +163,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_Rate", &CheckRate);
void BM_Invert(benchmark::State& state) {
for (auto _ : state) {
// This test requires a non-zero CPU time to avoid divide-by-zero
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
namespace bm = benchmark;
state.counters["foo"] = bm::Counter{0.0001, bm::Counter::kInvert};
@@ -165,9 +174,11 @@ BENCHMARK(BM_Invert);
ADD_CASES(TC_ConsoleOut,
{{"^BM_Invert %console_report bar=%hrfloatu foo=%hrfloatk$"}});
ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Invert\",$"},
+ {"\"family_index\": 3,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Invert\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -187,14 +198,14 @@ void CheckInvert(Results const& e) {
CHECK_BENCHMARK_RESULTS("BM_Invert", &CheckInvert);
// ========================================================================= //
-// ------------------------- InvertedRate Counters Output
-// -------------------------- //
+// --------------------- InvertedRate Counters Output ---------------------- //
// ========================================================================= //
void BM_Counters_InvertedRate(benchmark::State& state) {
for (auto _ : state) {
// This test requires a non-zero CPU time to avoid divide-by-zero
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
namespace bm = benchmark;
state.counters["foo"] =
@@ -207,9 +218,11 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_InvertedRate %console_report "
"bar=%hrfloats foo=%hrfloats$"}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_Counters_InvertedRate\",$"},
+ {"\"family_index\": 4,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_InvertedRate\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -246,9 +259,11 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report "
"bar=%hrfloat foo=%hrfloat$"}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_Counters_Threads/threads:%int\",$"},
+ {"\"family_index\": 5,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_Threads/threads:%int\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -285,9 +300,11 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int "
"%console_report bar=%hrfloat foo=%hrfloat$"}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"},
+ {"\"family_index\": 6,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_AvgThreads/threads:%int\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -316,7 +333,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreads/threads:%int",
void BM_Counters_AvgThreadsRate(benchmark::State& state) {
for (auto _ : state) {
// This test requires a non-zero CPU time to avoid divide-by-zero
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
namespace bm = benchmark;
state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreadsRate};
@@ -327,10 +345,12 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int "
"%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"},
+ {"\"family_index\": 7,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$",
MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -367,9 +387,11 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_IterationInvariant %console_report "
"bar=%hrfloat foo=%hrfloat$"}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_Counters_IterationInvariant\",$"},
+ {"\"family_index\": 8,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_IterationInvariant\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -399,7 +421,8 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_IterationInvariant",
void BM_Counters_kIsIterationInvariantRate(benchmark::State& state) {
for (auto _ : state) {
// This test requires a non-zero CPU time to avoid divide-by-zero
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
namespace bm = benchmark;
state.counters["foo"] =
@@ -412,10 +435,12 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kIsIterationInvariantRate "
"%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_Counters_kIsIterationInvariantRate\",$"},
+ {"\"family_index\": 9,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_kIsIterationInvariantRate\",$",
MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -440,7 +465,7 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_kIsIterationInvariantRate",
&CheckIsIterationInvariantRate);
// ========================================================================= //
-// ------------------- AvgIterations Counters Output ------------------ //
+// --------------------- AvgIterations Counters Output --------------------- //
// ========================================================================= //
void BM_Counters_AvgIterations(benchmark::State& state) {
@@ -455,9 +480,11 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgIterations %console_report "
"bar=%hrfloat foo=%hrfloat$"}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_Counters_AvgIterations\",$"},
+ {"\"family_index\": 10,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_AvgIterations\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
@@ -480,13 +507,14 @@ void CheckAvgIterations(Results const& e) {
CHECK_BENCHMARK_RESULTS("BM_Counters_AvgIterations", &CheckAvgIterations);
// ========================================================================= //
-// ----------------- AvgIterationsRate Counters Output ---------------- //
+// ------------------- AvgIterationsRate Counters Output ------------------- //
// ========================================================================= //
void BM_Counters_kAvgIterationsRate(benchmark::State& state) {
for (auto _ : state) {
// This test requires a non-zero CPU time to avoid divide-by-zero
- benchmark::DoNotOptimize(state.iterations());
+ auto iterations = state.iterations();
+ benchmark::DoNotOptimize(iterations);
}
namespace bm = benchmark;
state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgIterationsRate};
@@ -498,9 +526,11 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kAvgIterationsRate "
"%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_Counters_kAvgIterationsRate\",$"},
+ {"\"family_index\": 11,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_kAvgIterationsRate\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
- {"\"repetitions\": 0,$", MR_Next},
+ {"\"repetitions\": 1,$", MR_Next},
{"\"repetition_index\": 0,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"iterations\": %int,$", MR_Next},
diff --git a/test/user_counters_thousands_test.cc b/test/user_counters_thousands_test.cc
index 21d8285..fc15383 100644
--- a/test/user_counters_thousands_test.cc
+++ b/test/user_counters_thousands_test.cc
@@ -16,13 +16,13 @@ void BM_Counters_Thousands(benchmark::State& state) {
{"t0_1000000DefaultBase",
bm::Counter(1000 * 1000, bm::Counter::kDefaults)},
{"t1_1000000Base1000", bm::Counter(1000 * 1000, bm::Counter::kDefaults,
- benchmark::Counter::OneK::kIs1000)},
+ bm::Counter::OneK::kIs1000)},
{"t2_1000000Base1024", bm::Counter(1000 * 1000, bm::Counter::kDefaults,
- benchmark::Counter::OneK::kIs1024)},
+ bm::Counter::OneK::kIs1024)},
{"t3_1048576Base1000", bm::Counter(1024 * 1024, bm::Counter::kDefaults,
- benchmark::Counter::OneK::kIs1000)},
+ bm::Counter::OneK::kIs1000)},
{"t4_1048576Base1024", bm::Counter(1024 * 1024, bm::Counter::kDefaults,
- benchmark::Counter::OneK::kIs1024)},
+ bm::Counter::OneK::kIs1024)},
});
}
BENCHMARK(BM_Counters_Thousands)->Repetitions(2);
@@ -30,27 +30,29 @@ ADD_CASES(
TC_ConsoleOut,
{
{"^BM_Counters_Thousands/repeats:2 %console_report "
- "t0_1000000DefaultBase=1000k "
- "t1_1000000Base1000=1000k t2_1000000Base1024=976.56[23]k "
- "t3_1048576Base1000=1048.58k t4_1048576Base1024=1024k$"},
+ "t0_1000000DefaultBase=1M "
+ "t1_1000000Base1000=1M t2_1000000Base1024=976.56[23]Ki "
+ "t3_1048576Base1000=1.04858M t4_1048576Base1024=1Mi$"},
{"^BM_Counters_Thousands/repeats:2 %console_report "
- "t0_1000000DefaultBase=1000k "
- "t1_1000000Base1000=1000k t2_1000000Base1024=976.56[23]k "
- "t3_1048576Base1000=1048.58k t4_1048576Base1024=1024k$"},
+ "t0_1000000DefaultBase=1M "
+ "t1_1000000Base1000=1M t2_1000000Base1024=976.56[23]Ki "
+ "t3_1048576Base1000=1.04858M t4_1048576Base1024=1Mi$"},
{"^BM_Counters_Thousands/repeats:2_mean %console_report "
- "t0_1000000DefaultBase=1000k t1_1000000Base1000=1000k "
- "t2_1000000Base1024=976.56[23]k t3_1048576Base1000=1048.58k "
- "t4_1048576Base1024=1024k$"},
+ "t0_1000000DefaultBase=1M t1_1000000Base1000=1M "
+ "t2_1000000Base1024=976.56[23]Ki t3_1048576Base1000=1.04858M "
+ "t4_1048576Base1024=1Mi$"},
{"^BM_Counters_Thousands/repeats:2_median %console_report "
- "t0_1000000DefaultBase=1000k t1_1000000Base1000=1000k "
- "t2_1000000Base1024=976.56[23]k t3_1048576Base1000=1048.58k "
- "t4_1048576Base1024=1024k$"},
+ "t0_1000000DefaultBase=1M t1_1000000Base1000=1M "
+ "t2_1000000Base1024=976.56[23]Ki t3_1048576Base1000=1.04858M "
+ "t4_1048576Base1024=1Mi$"},
{"^BM_Counters_Thousands/repeats:2_stddev %console_time_only_report [ "
"]*2 t0_1000000DefaultBase=0 t1_1000000Base1000=0 "
"t2_1000000Base1024=0 t3_1048576Base1000=0 t4_1048576Base1024=0$"},
});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
{"\"repetitions\": 2,$", MR_Next},
@@ -68,6 +70,8 @@ ADD_CASES(TC_JSONOut,
{"}", MR_Next}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
{"\"run_type\": \"iteration\",$", MR_Next},
{"\"repetitions\": 2,$", MR_Next},
@@ -85,11 +89,14 @@ ADD_CASES(TC_JSONOut,
{"}", MR_Next}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_Counters_Thousands/repeats:2_mean\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 2,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"mean\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 2,$", MR_Next},
{"\"real_time\": %float,$", MR_Next},
{"\"cpu_time\": %float,$", MR_Next},
@@ -102,11 +109,14 @@ ADD_CASES(TC_JSONOut,
{"}", MR_Next}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_Counters_Thousands/repeats:2_median\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 2,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"median\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 2,$", MR_Next},
{"\"real_time\": %float,$", MR_Next},
{"\"cpu_time\": %float,$", MR_Next},
@@ -119,11 +129,14 @@ ADD_CASES(TC_JSONOut,
{"}", MR_Next}});
ADD_CASES(TC_JSONOut,
{{"\"name\": \"BM_Counters_Thousands/repeats:2_stddev\",$"},
+ {"\"family_index\": 0,$", MR_Next},
+ {"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": 2,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"stddev\",$", MR_Next},
+ {"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"iterations\": 2,$", MR_Next},
{"\"real_time\": %float,$", MR_Next},
{"\"cpu_time\": %float,$", MR_Next},
diff --git a/tools/BUILD.bazel b/tools/BUILD.bazel
index 5895883..d25caa7 100644
--- a/tools/BUILD.bazel
+++ b/tools/BUILD.bazel
@@ -1,4 +1,4 @@
-load("@py_deps//:requirements.bzl", "requirement")
+load("@tools_pip_deps//:requirements.bzl", "requirement")
py_library(
name = "gbench",
@@ -12,7 +12,7 @@ py_library(
py_binary(
name = "compare",
srcs = ["compare.py"],
- python_version = "PY2",
+ python_version = "PY3",
deps = [
":gbench",
],
diff --git a/tools/compare.py b/tools/compare.py
index 66eed93..e5eeb24 100755
--- a/tools/compare.py
+++ b/tools/compare.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
import unittest
"""
@@ -9,25 +9,28 @@ import argparse
from argparse import ArgumentParser
import json
import sys
+import os
import gbench
from gbench import util, report
-from gbench.util import *
def check_inputs(in1, in2, flags):
"""
Perform checking on the user provided inputs and diagnose any abnormalities
"""
- in1_kind, in1_err = classify_input_file(in1)
- in2_kind, in2_err = classify_input_file(in2)
- output_file = find_benchmark_flag('--benchmark_out=', flags)
- output_type = find_benchmark_flag('--benchmark_out_format=', flags)
- if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file:
+ in1_kind, in1_err = util.classify_input_file(in1)
+ in2_kind, in2_err = util.classify_input_file(in2)
+ output_file = util.find_benchmark_flag('--benchmark_out=', flags)
+ output_type = util.find_benchmark_flag('--benchmark_out_format=', flags)
+ if in1_kind == util.IT_Executable and in2_kind == util.IT_Executable and output_file:
print(("WARNING: '--benchmark_out=%s' will be passed to both "
"benchmarks causing it to be overwritten") % output_file)
- if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0:
- print("WARNING: passing optional flags has no effect since both "
- "inputs are JSON")
+ if in1_kind == util.IT_JSON and in2_kind == util.IT_JSON:
+ # When both sides are JSON the only supported flag is
+ # --benchmark_filter=
+ for flag in util.remove_benchmark_flags('--benchmark_filter=', flags):
+ print("WARNING: passing %s has no effect since both "
+ "inputs are JSON" % flag)
if output_type is not None and output_type != 'json':
print(("ERROR: passing '--benchmark_out_format=%s' to 'compare.py`"
" is not supported.") % output_type)
@@ -238,10 +241,10 @@ def main():
options_contender = ['--benchmark_filter=%s' % filter_contender]
# Run the benchmarks and report the results
- json1 = json1_orig = gbench.util.run_or_load_benchmark(
- test_baseline, benchmark_options + options_baseline)
- json2 = json2_orig = gbench.util.run_or_load_benchmark(
- test_contender, benchmark_options + options_contender)
+ json1 = json1_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark(
+ test_baseline, benchmark_options + options_baseline))
+ json2 = json2_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark(
+ test_contender, benchmark_options + options_contender))
# Now, filter the benchmarks so that the difference report can work
if filter_baseline and filter_contender:
diff --git a/tools/gbench/Inputs/test1_run1.json b/tools/gbench/Inputs/test1_run1.json
index 601e327..9daed0b 100644
--- a/tools/gbench/Inputs/test1_run1.json
+++ b/tools/gbench/Inputs/test1_run1.json
@@ -114,6 +114,14 @@
"real_time": 1,
"cpu_time": 1,
"time_unit": "s"
+ },
+ {
+ "name": "BM_hasLabel",
+ "label": "a label",
+ "iterations": 1,
+ "real_time": 1,
+ "cpu_time": 1,
+ "time_unit": "s"
}
]
}
diff --git a/tools/gbench/Inputs/test1_run2.json b/tools/gbench/Inputs/test1_run2.json
index 3cbcf39..dc52970 100644
--- a/tools/gbench/Inputs/test1_run2.json
+++ b/tools/gbench/Inputs/test1_run2.json
@@ -114,6 +114,14 @@
"real_time": 1,
"cpu_time": 1,
"time_unit": "ns"
+ },
+ {
+ "name": "BM_hasLabel",
+ "label": "a label",
+ "iterations": 1,
+ "real_time": 1,
+ "cpu_time": 1,
+ "time_unit": "s"
}
]
}
diff --git a/tools/gbench/Inputs/test4_run.json b/tools/gbench/Inputs/test4_run.json
new file mode 100644
index 0000000..eaa005f
--- /dev/null
+++ b/tools/gbench/Inputs/test4_run.json
@@ -0,0 +1,96 @@
+{
+ "benchmarks": [
+ {
+ "name": "99 family 0 instance 0 repetition 0",
+ "run_type": "iteration",
+ "family_index": 0,
+ "per_family_instance_index": 0,
+ "repetition_index": 0
+ },
+ {
+ "name": "98 family 0 instance 0 repetition 1",
+ "run_type": "iteration",
+ "family_index": 0,
+ "per_family_instance_index": 0,
+ "repetition_index": 1
+ },
+ {
+ "name": "97 family 0 instance 0 aggregate",
+ "run_type": "aggregate",
+ "family_index": 0,
+ "per_family_instance_index": 0,
+ "aggregate_name": "9 aggregate"
+ },
+
+
+ {
+ "name": "96 family 0 instance 1 repetition 0",
+ "run_type": "iteration",
+ "family_index": 0,
+ "per_family_instance_index": 1,
+ "repetition_index": 0
+ },
+ {
+ "name": "95 family 0 instance 1 repetition 1",
+ "run_type": "iteration",
+ "family_index": 0,
+ "per_family_instance_index": 1,
+ "repetition_index": 1
+ },
+ {
+ "name": "94 family 0 instance 1 aggregate",
+ "run_type": "aggregate",
+ "family_index": 0,
+ "per_family_instance_index": 1,
+ "aggregate_name": "9 aggregate"
+ },
+
+
+
+
+ {
+ "name": "93 family 1 instance 0 repetition 0",
+ "run_type": "iteration",
+ "family_index": 1,
+ "per_family_instance_index": 0,
+ "repetition_index": 0
+ },
+ {
+ "name": "92 family 1 instance 0 repetition 1",
+ "run_type": "iteration",
+ "family_index": 1,
+ "per_family_instance_index": 0,
+ "repetition_index": 1
+ },
+ {
+ "name": "91 family 1 instance 0 aggregate",
+ "run_type": "aggregate",
+ "family_index": 1,
+ "per_family_instance_index": 0,
+ "aggregate_name": "9 aggregate"
+ },
+
+
+ {
+ "name": "90 family 1 instance 1 repetition 0",
+ "run_type": "iteration",
+ "family_index": 1,
+ "per_family_instance_index": 1,
+ "repetition_index": 0
+ },
+ {
+ "name": "89 family 1 instance 1 repetition 1",
+ "run_type": "iteration",
+ "family_index": 1,
+ "per_family_instance_index": 1,
+ "repetition_index": 1
+ },
+ {
+ "name": "88 family 1 instance 1 aggregate",
+ "run_type": "aggregate",
+ "family_index": 1,
+ "per_family_instance_index": 1,
+ "aggregate_name": "9 aggregate"
+ }
+ ]
+}
diff --git a/tools/gbench/Inputs/test4_run0.json b/tools/gbench/Inputs/test4_run0.json
new file mode 100644
index 0000000..54cf127
--- /dev/null
+++ b/tools/gbench/Inputs/test4_run0.json
@@ -0,0 +1,21 @@
+{
+ "context": {
+ "date": "2016-08-02 17:44:46",
+ "num_cpus": 4,
+ "mhz_per_cpu": 4228,
+ "cpu_scaling_enabled": false,
+ "library_build_type": "release"
+ },
+ "benchmarks": [
+ {
+ "name": "whocares",
+ "run_type": "aggregate",
+ "aggregate_name": "zz",
+ "aggregate_unit": "percentage",
+ "iterations": 1000,
+ "real_time": 0.01,
+ "cpu_time": 0.10,
+ "time_unit": "ns"
+ }
+ ]
+}
diff --git a/tools/gbench/Inputs/test4_run1.json b/tools/gbench/Inputs/test4_run1.json
new file mode 100644
index 0000000..25d5605
--- /dev/null
+++ b/tools/gbench/Inputs/test4_run1.json
@@ -0,0 +1,21 @@
+{
+ "context": {
+ "date": "2016-08-02 17:44:46",
+ "num_cpus": 4,
+ "mhz_per_cpu": 4228,
+ "cpu_scaling_enabled": false,
+ "library_build_type": "release"
+ },
+ "benchmarks": [
+ {
+ "name": "whocares",
+ "run_type": "aggregate",
+ "aggregate_name": "zz",
+ "aggregate_unit": "percentage",
+ "iterations": 1000,
+ "real_time": 0.005,
+ "cpu_time": 0.15,
+ "time_unit": "ns"
+ }
+ ]
+}
diff --git a/tools/gbench/report.py b/tools/gbench/report.py
index bf29492..b2bbfb9 100644
--- a/tools/gbench/report.py
+++ b/tools/gbench/report.py
@@ -1,11 +1,14 @@
-import unittest
"""report.py - Utilities for reporting statistics about benchmark results
"""
+
+import unittest
import os
import re
import copy
+import random
-from scipy.stats import mannwhitneyu
+from scipy.stats import mannwhitneyu, gmean
+from numpy import array
class BenchmarkColor(object):
@@ -39,6 +42,13 @@ UTEST_MIN_REPETITIONS = 2
UTEST_OPTIMAL_REPETITIONS = 9 # Lowest reasonable number, More is better.
UTEST_COL_NAME = "_pvalue"
+_TIME_UNIT_TO_SECONDS_MULTIPLIER = {
+ "s": 1.0,
+ "ms": 1e-3,
+ "us": 1e-6,
+ "ns": 1e-9,
+}
+
def color_format(use_color, fmt_str, *args, **kwargs):
"""
@@ -148,6 +158,30 @@ def partition_benchmarks(json1, json2):
return partitions
+def get_timedelta_field_as_seconds(benchmark, field_name):
+ """
+ Get value of field_name field of benchmark, which is time with time unit
+ time_unit, as time in seconds.
+ """
+ timedelta = benchmark[field_name]
+ time_unit = benchmark.get('time_unit', 's')
+ return timedelta * _TIME_UNIT_TO_SECONDS_MULTIPLIER.get(time_unit)
+
+
+def calculate_geomean(json):
+ """
+ Extract all real/cpu times from all the benchmarks as seconds,
+ and calculate their geomean.
+ """
+ times = []
+ for benchmark in json['benchmarks']:
+ if 'run_type' in benchmark and benchmark['run_type'] == 'aggregate':
+ continue
+ times.append([get_timedelta_field_as_seconds(benchmark, 'real_time'),
+ get_timedelta_field_as_seconds(benchmark, 'cpu_time')])
+ return gmean(times) if times else array([])
+
+
def extract_field(partition, field_name):
# The count of elements may be different. We want *all* of them.
lhs = [x[field_name] for x in partition[0]]
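The OVERALL_GEOMEAN row appended further down is built from these two helpers: every non-aggregate benchmark's real_time and cpu_time are converted to seconds via _TIME_UNIT_TO_SECONDS_MULTIPLIER and collected as rows of an n-by-2 array, and scipy's gmean (default axis=0) then reduces each column. For n benchmarks with per-benchmark times t_i in seconds:

    \[
    \mathrm{geomean}(t_1, \dots, t_n) = \Bigl(\prod_{i=1}^{n} t_i\Bigr)^{1/n}
    \]

so lhs_gmean[0] ends up being the geometric mean of the real times and lhs_gmean[1] that of the CPU times, which is how they are indexed when the OVERALL_GEOMEAN entry is assembled below.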
@@ -172,6 +206,7 @@ def calc_utest(timings_cpu, timings_time):
return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue
+
def print_utest(bc_name, utest, utest_alpha, first_col_width, use_color=True):
def get_utest_color(pval):
return BC_FAIL if pval >= utest_alpha else BC_OKGREEN
@@ -220,6 +255,7 @@ def get_difference_report(
partitions = partition_benchmarks(json1, json2)
for partition in partitions:
benchmark_name = partition[0][0]['name']
+ label = partition[0][0]['label'] if 'label' in partition[0][0] else ''
time_unit = partition[0][0]['time_unit']
measurements = []
utest_results = {}
@@ -240,7 +276,8 @@ def get_difference_report(
if utest:
timings_cpu = extract_field(partition, 'cpu_time')
timings_time = extract_field(partition, 'real_time')
- have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time)
+ have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(
+ timings_cpu, timings_time)
if cpu_pvalue and time_pvalue:
utest_results = {
'have_optimal_repetitions': have_optimal_repetitions,
@@ -259,6 +296,7 @@ def get_difference_report(
aggregate_name = partition[0][0]['aggregate_name'] if run_type == 'aggregate' and 'aggregate_name' in partition[0][0] else ''
diff_report.append({
'name': benchmark_name,
+ 'label': label,
'measurements': measurements,
'time_unit': time_unit,
'run_type': run_type,
@@ -266,6 +304,26 @@ def get_difference_report(
'utest': utest_results
})
+ lhs_gmean = calculate_geomean(json1)
+ rhs_gmean = calculate_geomean(json2)
+ if lhs_gmean.any() and rhs_gmean.any():
+ diff_report.append({
+ 'name': 'OVERALL_GEOMEAN',
+ 'label': '',
+ 'measurements': [{
+ 'real_time': lhs_gmean[0],
+ 'cpu_time': lhs_gmean[1],
+ 'real_time_other': rhs_gmean[0],
+ 'cpu_time_other': rhs_gmean[1],
+ 'time': calculate_change(lhs_gmean[0], rhs_gmean[0]),
+ 'cpu': calculate_change(lhs_gmean[1], rhs_gmean[1])
+ }],
+ 'time_unit': 's',
+ 'run_type': 'aggregate',
+ 'aggregate_name': 'geomean',
+ 'utest': {}
+ })
+
return diff_report
@@ -301,26 +359,23 @@ def print_difference_report(
fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
for benchmark in json_diff_report:
# *If* we were asked to only include aggregates,
- # and if it is non-aggregate, then skip it.
- if include_aggregates_only and 'run_type' in benchmark:
- if benchmark['run_type'] != 'aggregate':
- continue
-
- for measurement in benchmark['measurements']:
- output_strs += [color_format(use_color,
- fmt_str,
- BC_HEADER,
- benchmark['name'],
- first_col_width,
- get_color(measurement['time']),
- measurement['time'],
- get_color(measurement['cpu']),
- measurement['cpu'],
- measurement['real_time'],
- measurement['real_time_other'],
- measurement['cpu_time'],
- measurement['cpu_time_other'],
- endc=BC_ENDC)]
+ # and if it is non-aggregate, then don't print it.
+ if not include_aggregates_only or not 'run_type' in benchmark or benchmark['run_type'] == 'aggregate':
+ for measurement in benchmark['measurements']:
+ output_strs += [color_format(use_color,
+ fmt_str,
+ BC_HEADER,
+ benchmark['name'],
+ first_col_width,
+ get_color(measurement['time']),
+ measurement['time'],
+ get_color(measurement['cpu']),
+ measurement['cpu'],
+ measurement['real_time'],
+ measurement['real_time_other'],
+ measurement['cpu_time'],
+ measurement['cpu_time_other'],
+ endc=BC_ENDC)]
# After processing the measurements, if requested and
# if applicable (e.g. u-test exists for given benchmark),
@@ -404,6 +459,8 @@ class TestReportDifference(unittest.TestCase):
'-0.1000', '100', '110', '100', '90'],
['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
+ ['BM_hasLabel', '+0.0000', '+0.0000', '1', '1', '1', '1'],
+ ['OVERALL_GEOMEAN', '-0.8113', '-0.7779', '0', '0', '0', '0']
]
output_lines_with_header = print_difference_report(
self.json_diff_report, use_color=False)
@@ -420,81 +477,137 @@ class TestReportDifference(unittest.TestCase):
expected_output = [
{
'name': 'BM_SameTimes',
- 'measurements': [{'time': 0.0000, 'cpu': 0.0000, 'real_time': 10, 'real_time_other': 10, 'cpu_time': 10, 'cpu_time_other': 10}],
+ 'label': '',
+ 'measurements': [{'time': 0.0000, 'cpu': 0.0000,
+ 'real_time': 10, 'real_time_other': 10,
+ 'cpu_time': 10, 'cpu_time_other': 10}],
'time_unit': 'ns',
'utest': {}
},
{
'name': 'BM_2xFaster',
- 'measurements': [{'time': -0.5000, 'cpu': -0.5000, 'real_time': 50, 'real_time_other': 25, 'cpu_time': 50, 'cpu_time_other': 25}],
+ 'label': '',
+ 'measurements': [{'time': -0.5000, 'cpu': -0.5000,
+ 'real_time': 50, 'real_time_other': 25,
+ 'cpu_time': 50, 'cpu_time_other': 25}],
'time_unit': 'ns',
'utest': {}
},
{
'name': 'BM_2xSlower',
- 'measurements': [{'time': 1.0000, 'cpu': 1.0000, 'real_time': 50, 'real_time_other': 100, 'cpu_time': 50, 'cpu_time_other': 100}],
+ 'label': '',
+ 'measurements': [{'time': 1.0000, 'cpu': 1.0000,
+ 'real_time': 50, 'real_time_other': 100,
+ 'cpu_time': 50, 'cpu_time_other': 100}],
'time_unit': 'ns',
'utest': {}
},
{
'name': 'BM_1PercentFaster',
- 'measurements': [{'time': -0.0100, 'cpu': -0.0100, 'real_time': 100, 'real_time_other': 98.9999999, 'cpu_time': 100, 'cpu_time_other': 98.9999999}],
+ 'label': '',
+ 'measurements': [{'time': -0.0100, 'cpu': -0.0100,
+ 'real_time': 100, 'real_time_other': 98.9999999,
+ 'cpu_time': 100, 'cpu_time_other': 98.9999999}],
'time_unit': 'ns',
'utest': {}
},
{
'name': 'BM_1PercentSlower',
- 'measurements': [{'time': 0.0100, 'cpu': 0.0100, 'real_time': 100, 'real_time_other': 101, 'cpu_time': 100, 'cpu_time_other': 101}],
+ 'label': '',
+ 'measurements': [{'time': 0.0100, 'cpu': 0.0100,
+ 'real_time': 100, 'real_time_other': 101,
+ 'cpu_time': 100, 'cpu_time_other': 101}],
'time_unit': 'ns',
'utest': {}
},
{
'name': 'BM_10PercentFaster',
- 'measurements': [{'time': -0.1000, 'cpu': -0.1000, 'real_time': 100, 'real_time_other': 90, 'cpu_time': 100, 'cpu_time_other': 90}],
+ 'label': '',
+ 'measurements': [{'time': -0.1000, 'cpu': -0.1000,
+ 'real_time': 100, 'real_time_other': 90,
+ 'cpu_time': 100, 'cpu_time_other': 90}],
'time_unit': 'ns',
'utest': {}
},
{
'name': 'BM_10PercentSlower',
- 'measurements': [{'time': 0.1000, 'cpu': 0.1000, 'real_time': 100, 'real_time_other': 110, 'cpu_time': 100, 'cpu_time_other': 110}],
+ 'label': '',
+ 'measurements': [{'time': 0.1000, 'cpu': 0.1000,
+ 'real_time': 100, 'real_time_other': 110,
+ 'cpu_time': 100, 'cpu_time_other': 110}],
'time_unit': 'ns',
'utest': {}
},
{
'name': 'BM_100xSlower',
- 'measurements': [{'time': 99.0000, 'cpu': 99.0000, 'real_time': 100, 'real_time_other': 10000, 'cpu_time': 100, 'cpu_time_other': 10000}],
+ 'label': '',
+ 'measurements': [{'time': 99.0000, 'cpu': 99.0000,
+ 'real_time': 100, 'real_time_other': 10000,
+ 'cpu_time': 100, 'cpu_time_other': 10000}],
'time_unit': 'ns',
'utest': {}
},
{
'name': 'BM_100xFaster',
- 'measurements': [{'time': -0.9900, 'cpu': -0.9900, 'real_time': 10000, 'real_time_other': 100, 'cpu_time': 10000, 'cpu_time_other': 100}],
+ 'label': '',
+ 'measurements': [{'time': -0.9900, 'cpu': -0.9900,
+ 'real_time': 10000, 'real_time_other': 100,
+ 'cpu_time': 10000, 'cpu_time_other': 100}],
'time_unit': 'ns',
'utest': {}
},
{
'name': 'BM_10PercentCPUToTime',
- 'measurements': [{'time': 0.1000, 'cpu': -0.1000, 'real_time': 100, 'real_time_other': 110, 'cpu_time': 100, 'cpu_time_other': 90}],
+ 'label': '',
+ 'measurements': [{'time': 0.1000, 'cpu': -0.1000,
+ 'real_time': 100, 'real_time_other': 110,
+ 'cpu_time': 100, 'cpu_time_other': 90}],
'time_unit': 'ns',
'utest': {}
},
{
'name': 'BM_ThirdFaster',
- 'measurements': [{'time': -0.3333, 'cpu': -0.3334, 'real_time': 100, 'real_time_other': 67, 'cpu_time': 100, 'cpu_time_other': 67}],
+ 'label': '',
+ 'measurements': [{'time': -0.3333, 'cpu': -0.3334,
+ 'real_time': 100, 'real_time_other': 67,
+ 'cpu_time': 100, 'cpu_time_other': 67}],
'time_unit': 'ns',
'utest': {}
},
{
'name': 'BM_NotBadTimeUnit',
- 'measurements': [{'time': -0.9000, 'cpu': 0.2000, 'real_time': 0.4, 'real_time_other': 0.04, 'cpu_time': 0.5, 'cpu_time_other': 0.6}],
+ 'label': '',
+ 'measurements': [{'time': -0.9000, 'cpu': 0.2000,
+ 'real_time': 0.4, 'real_time_other': 0.04,
+ 'cpu_time': 0.5, 'cpu_time_other': 0.6}],
+ 'time_unit': 's',
+ 'utest': {}
+ },
+ {
+ 'name': 'BM_hasLabel',
+ 'label': 'a label',
+ 'measurements': [{'time': 0.0000, 'cpu': 0.0000,
+ 'real_time': 1, 'real_time_other': 1,
+ 'cpu_time': 1, 'cpu_time_other': 1}],
'time_unit': 's',
'utest': {}
},
+ {
+ 'name': 'OVERALL_GEOMEAN',
+ 'label': '',
+ 'measurements': [{'real_time': 3.1622776601683826e-06, 'cpu_time': 3.2130844755623912e-06,
+ 'real_time_other': 1.9768988699420897e-07, 'cpu_time_other': 2.397447755209533e-07,
+ 'time': -0.8112976497120911, 'cpu': -0.7778551721181174}],
+ 'time_unit': 's',
+ 'run_type': 'aggregate',
+ 'aggregate_name': 'geomean', 'utest': {}
+ },
]
self.assertEqual(len(self.json_diff_report), len(expected_output))
for out, expected in zip(
self.json_diff_report, expected_output):
self.assertEqual(out['name'], expected['name'])
+ self.assertEqual(out['label'], expected['label'])
self.assertEqual(out['time_unit'], expected['time_unit'])
assert_utest(self, out, expected)
assert_measurements(self, out, expected)
@@ -525,6 +638,7 @@ class TestReportDifferenceBetweenFamilies(unittest.TestCase):
['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
+ ['OVERALL_GEOMEAN', '-0.5000', '-0.5000', '0', '0', '0', '0']
]
output_lines_with_header = print_difference_report(
self.json_diff_report, use_color=False)
@@ -562,6 +676,16 @@ class TestReportDifferenceBetweenFamilies(unittest.TestCase):
'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 30, 'real_time_other': 15, 'cpu_time': 30, 'cpu_time_other': 15}],
'time_unit': 'ns',
'utest': {}
+ },
+ {
+ 'name': 'OVERALL_GEOMEAN',
+ 'measurements': [{'real_time': 2.213363839400641e-08, 'cpu_time': 2.213363839400641e-08,
+ 'real_time_other': 1.1066819197003185e-08, 'cpu_time_other': 1.1066819197003185e-08,
+ 'time': -0.5000000000000009, 'cpu': -0.5000000000000009}],
+ 'time_unit': 's',
+ 'run_type': 'aggregate',
+ 'aggregate_name': 'geomean',
+ 'utest': {}
}
]
self.assertEqual(len(self.json_diff_report), len(expected_output))
@@ -600,8 +724,8 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
['BM_Two_pvalue',
- '0.6985',
- '0.6985',
+ '1.0000',
+ '0.6667',
'U',
'Test,',
'Repetitions:',
@@ -618,7 +742,7 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
['short_pvalue',
'0.7671',
- '0.1489',
+ '0.2000',
'U',
'Test,',
'Repetitions:',
@@ -632,6 +756,7 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
'repetitions',
'recommended.'],
['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
+ ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
]
output_lines_with_header = print_difference_report(
self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False)
@@ -643,6 +768,53 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
parts = [x for x in output_lines[i].split(' ') if x]
self.assertEqual(expect_lines[i], parts)
+ def test_json_diff_report_pretty_printing_aggregates_only(self):
+ expect_lines = [
+ ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+ ['BM_Two_pvalue',
+ '1.0000',
+ '0.6667',
+ 'U',
+ 'Test,',
+ 'Repetitions:',
+ '2',
+ 'vs',
+ '2.',
+ 'WARNING:',
+ 'Results',
+ 'unreliable!',
+ '9+',
+ 'repetitions',
+ 'recommended.'],
+ ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+ ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+ ['short_pvalue',
+ '0.7671',
+ '0.2000',
+ 'U',
+ 'Test,',
+ 'Repetitions:',
+ '2',
+ 'vs',
+ '3.',
+ 'WARNING:',
+ 'Results',
+ 'unreliable!',
+ '9+',
+ 'repetitions',
+ 'recommended.'],
+ ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
+ ]
+ output_lines_with_header = print_difference_report(
+ self.json_diff_report, include_aggregates_only=True, utest=True, utest_alpha=0.05, use_color=False)
+ output_lines = output_lines_with_header[2:]
+ print("\n")
+ print("\n".join(output_lines_with_header))
+ self.assertEqual(len(output_lines), len(expect_lines))
+ for i in range(0, len(output_lines)):
+ parts = [x for x in output_lines[i].split(' ') if x]
+ self.assertEqual(expect_lines[i], parts)
+
def test_json_diff_report(self):
expected_output = [
{
@@ -672,7 +844,7 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
],
'time_unit': 'ns',
'utest': {
- 'have_optimal_repetitions': False, 'cpu_pvalue': 0.6985353583033387, 'time_pvalue': 0.6985353583033387
+ 'have_optimal_repetitions': False, 'cpu_pvalue': 0.6666666666666666, 'time_pvalue': 1.0
}
},
{
@@ -693,7 +865,7 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
],
'time_unit': 'ns',
'utest': {
- 'have_optimal_repetitions': False, 'cpu_pvalue': 0.14891467317876572, 'time_pvalue': 0.7670968684102772
+ 'have_optimal_repetitions': False, 'cpu_pvalue': 0.2, 'time_pvalue': 0.7670968684102772
}
},
{
@@ -708,6 +880,16 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
],
'time_unit': 'ns',
'utest': {}
+ },
+ {
+ 'name': 'OVERALL_GEOMEAN',
+ 'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08,
+ 'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08,
+ 'time': 1.6404861082353634, 'cpu': -0.6984640740519662}],
+ 'time_unit': 's',
+ 'run_type': 'aggregate',
+ 'aggregate_name': 'geomean',
+ 'utest': {}
}
]
self.assertEqual(len(self.json_diff_report), len(expected_output))
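The revised p-value expectations in the hunks above (time 1.0000 and CPU 0.6667 for BM_Two) appear to match an exact two-sided Mann-Whitney U test on the two-repetition samples, rather than the previous normal-approximation values. A minimal consistency check follows; it assumes scipy's mannwhitneyu exact method is what the tooling now relies on (scipy is pinned to 1.10.0 later in this patch), and the sample values are simply read off the BM_Two rows shown earlier.

# Consistency check for the new BM_Two expectations (sample values from the rows above).
from scipy.stats import mannwhitneyu

real_run1, real_run2 = [9, 8], [10, 7]    # real_time repetitions, run 1 vs run 2
cpu_run1, cpu_run2 = [90, 86], [89, 72]   # cpu_time repetitions, run 1 vs run 2

print(mannwhitneyu(real_run1, real_run2, alternative='two-sided', method='exact').pvalue)  # 1.0
print(mannwhitneyu(cpu_run1, cpu_run2, alternative='two-sided', method='exact').pvalue)    # ~0.6667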
@@ -747,8 +929,8 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
['BM_Two_pvalue',
- '0.6985',
- '0.6985',
+ '1.0000',
+ '0.6667',
'U',
'Test,',
'Repetitions:',
@@ -765,7 +947,7 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
['short_pvalue',
'0.7671',
- '0.1489',
+ '0.2000',
'U',
'Test,',
'Repetitions:',
@@ -778,7 +960,8 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
'9+',
'repetitions',
'recommended.'],
- ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53']
+ ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
+ ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
]
output_lines_with_header = print_difference_report(
self.json_diff_report,
@@ -820,7 +1003,7 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
],
'time_unit': 'ns',
'utest': {
- 'have_optimal_repetitions': False, 'cpu_pvalue': 0.6985353583033387, 'time_pvalue': 0.6985353583033387
+ 'have_optimal_repetitions': False, 'cpu_pvalue': 0.6666666666666666, 'time_pvalue': 1.0
}
},
{
@@ -841,7 +1024,7 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
],
'time_unit': 'ns',
'utest': {
- 'have_optimal_repetitions': False, 'cpu_pvalue': 0.14891467317876572, 'time_pvalue': 0.7670968684102772
+ 'have_optimal_repetitions': False, 'cpu_pvalue': 0.2, 'time_pvalue': 0.7670968684102772
}
},
{
@@ -853,11 +1036,83 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
'real_time': 8,
'cpu_time_other': 53,
'cpu': -0.3375
- }
+ }
],
'utest': {},
'time_unit': u'ns',
'aggregate_name': ''
+ },
+ {
+ 'name': 'OVERALL_GEOMEAN',
+ 'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08,
+ 'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08,
+ 'time': 1.6404861082353634, 'cpu': -0.6984640740519662}],
+ 'time_unit': 's',
+ 'run_type': 'aggregate',
+ 'aggregate_name': 'geomean',
+ 'utest': {}
+ }
+ ]
+ self.assertEqual(len(self.json_diff_report), len(expected_output))
+ for out, expected in zip(
+ self.json_diff_report, expected_output):
+ self.assertEqual(out['name'], expected['name'])
+ self.assertEqual(out['time_unit'], expected['time_unit'])
+ assert_utest(self, out, expected)
+ assert_measurements(self, out, expected)
+
+
+class TestReportDifferenceForPercentageAggregates(
+ unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ def load_results():
+ import json
+ testInputs = os.path.join(
+ os.path.dirname(
+ os.path.realpath(__file__)),
+ 'Inputs')
+ testOutput1 = os.path.join(testInputs, 'test4_run0.json')
+ testOutput2 = os.path.join(testInputs, 'test4_run1.json')
+ with open(testOutput1, 'r') as f:
+ json1 = json.load(f)
+ with open(testOutput2, 'r') as f:
+ json2 = json.load(f)
+ return json1, json2
+
+ json1, json2 = load_results()
+ cls.json_diff_report = get_difference_report(
+ json1, json2, utest=True)
+
+ def test_json_diff_report_pretty_printing(self):
+ expect_lines = [
+ ['whocares', '-0.5000', '+0.5000', '0', '0', '0', '0']
+ ]
+ output_lines_with_header = print_difference_report(
+ self.json_diff_report,
+ utest=True, utest_alpha=0.05, use_color=False)
+ output_lines = output_lines_with_header[2:]
+ print("\n")
+ print("\n".join(output_lines_with_header))
+ self.assertEqual(len(output_lines), len(expect_lines))
+ for i in range(0, len(output_lines)):
+ parts = [x for x in output_lines[i].split(' ') if x]
+ self.assertEqual(expect_lines[i], parts)
+
+ def test_json_diff_report(self):
+ expected_output = [
+ {
+ 'name': u'whocares',
+ 'measurements': [
+ {'time': -0.5,
+ 'cpu': 0.5,
+ 'real_time': 0.01,
+ 'real_time_other': 0.005,
+ 'cpu_time': 0.10,
+ 'cpu_time_other': 0.15}
+ ],
+ 'time_unit': 'ns',
+ 'utest': {}
}
]
self.assertEqual(len(self.json_diff_report), len(expected_output))
@@ -869,6 +1124,49 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
assert_measurements(self, out, expected)
+class TestReportSorting(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ def load_result():
+ import json
+ testInputs = os.path.join(
+ os.path.dirname(
+ os.path.realpath(__file__)),
+ 'Inputs')
+ testOutput = os.path.join(testInputs, 'test4_run.json')
+ with open(testOutput, 'r') as f:
+                loaded = json.load(f)
+            return loaded
+
+ cls.json = load_result()
+
+ def test_json_diff_report_pretty_printing(self):
+ import util
+
+ expected_names = [
+ "99 family 0 instance 0 repetition 0",
+ "98 family 0 instance 0 repetition 1",
+ "97 family 0 instance 0 aggregate",
+ "96 family 0 instance 1 repetition 0",
+ "95 family 0 instance 1 repetition 1",
+ "94 family 0 instance 1 aggregate",
+ "93 family 1 instance 0 repetition 0",
+ "92 family 1 instance 0 repetition 1",
+ "91 family 1 instance 0 aggregate",
+ "90 family 1 instance 1 repetition 0",
+ "89 family 1 instance 1 repetition 1",
+ "88 family 1 instance 1 aggregate"
+ ]
+
+ for n in range(len(self.json['benchmarks']) ** 2):
+ random.shuffle(self.json['benchmarks'])
+ sorted_benchmarks = util.sort_benchmark_results(self.json)[
+ 'benchmarks']
+ self.assertEqual(len(expected_names), len(sorted_benchmarks))
+ for out, expected in zip(sorted_benchmarks, expected_names):
+ self.assertEqual(out['name'], expected)
+
+
def assert_utest(unittest_instance, lhs, rhs):
if lhs['utest']:
unittest_instance.assertAlmostEqual(
diff --git a/tools/gbench/util.py b/tools/gbench/util.py
index 661c4ba..5e79da8 100644
--- a/tools/gbench/util.py
+++ b/tools/gbench/util.py
@@ -2,9 +2,11 @@
"""
import json
import os
-import tempfile
+import re
import subprocess
import sys
+import tempfile
+
# Input file type enumeration
IT_Invalid = 0
@@ -57,7 +59,7 @@ def classify_input_file(filename):
"""
Return a tuple (type, msg) where 'type' specifies the classified type
of 'filename'. If 'type' is 'IT_Invalid' then 'msg' is a human readable
- string represeting the error.
+ string representing the error.
"""
ftype = IT_Invalid
err_msg = None
@@ -110,13 +112,49 @@ def remove_benchmark_flags(prefix, benchmark_flags):
return [f for f in benchmark_flags if not f.startswith(prefix)]
-def load_benchmark_results(fname):
+def load_benchmark_results(fname, benchmark_filter):
"""
Read benchmark output from a file and return the JSON object.
+
+ Apply benchmark_filter, a regular expression, with nearly the same
+    semantics as the --benchmark_filter argument. May be None.
+ Note: the Python regular expression engine is used instead of the
+ one used by the C++ code, which may produce different results
+ in complex cases.
+
REQUIRES: 'fname' names a file containing JSON benchmark output.
"""
+ def benchmark_wanted(benchmark):
+ if benchmark_filter is None:
+ return True
+ name = benchmark.get('run_name', None) or benchmark['name']
+ if re.search(benchmark_filter, name):
+ return True
+ return False
+
with open(fname, 'r') as f:
- return json.load(f)
+ results = json.load(f)
+ if 'benchmarks' in results:
+ results['benchmarks'] = list(filter(benchmark_wanted,
+ results['benchmarks']))
+ return results
+
+
+def sort_benchmark_results(result):
+ benchmarks = result['benchmarks']
+
+    # Sort from the innermost key to the outermost; stable sorts preserve earlier orderings within ties.
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: benchmark.get('repetition_index', -1))
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: 1 if benchmark.get('run_type') == "aggregate" else 0)
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: benchmark.get('per_family_instance_index', -1))
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: benchmark.get('family_index', -1))
+
+ result['benchmarks'] = benchmarks
+ return result
def run_benchmark(exe_name, benchmark_flags):
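Because Python's sort is stable, the four passes in sort_benchmark_results above are equivalent to a single sort on a composite key, outermost key first. A minimal sketch of that equivalence, using the same field names and the same -1 defaults for missing fields:

# Equivalent single-pass ordering for sort_benchmark_results (sketch only, same defaults as above).
def composite_sort_key(benchmark):
    return (
        benchmark.get('family_index', -1),
        benchmark.get('per_family_instance_index', -1),
        1 if benchmark.get('run_type') == 'aggregate' else 0,  # aggregates after repetitions
        benchmark.get('repetition_index', -1),
    )

# result['benchmarks'] = sorted(result['benchmarks'], key=composite_sort_key)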
@@ -142,7 +180,7 @@ def run_benchmark(exe_name, benchmark_flags):
if exitCode != 0:
print('TEST FAILED...')
sys.exit(exitCode)
- json_res = load_benchmark_results(output_name)
+ json_res = load_benchmark_results(output_name, None)
if is_temp_output:
os.unlink(output_name)
return json_res
@@ -157,7 +195,9 @@ def run_or_load_benchmark(filename, benchmark_flags):
"""
ftype = check_input_file(filename)
if ftype == IT_JSON:
- return load_benchmark_results(filename)
+ benchmark_filter = find_benchmark_flag('--benchmark_filter=',
+ benchmark_flags)
+ return load_benchmark_results(filename, benchmark_filter)
if ftype == IT_Executable:
return run_benchmark(filename, benchmark_flags)
raise ValueError('Unknown file type %s' % ftype)
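A short usage sketch of the new filter parameter, assuming the functions defined in util.py above are in scope; the file name and pattern below are placeholders. The regex is matched against run_name when present, otherwise name, using Python's re module rather than the C++ engine.

# Hypothetical example: load a JSON result file, keep only benchmarks matching a
# pattern, and put repetitions/aggregates into a deterministic order.
results = load_benchmark_results('benchmark_output.json', r'^BM_Two')  # placeholder path/pattern
results = sort_benchmark_results(results)
for benchmark in results['benchmarks']:
    print(benchmark['name'])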
diff --git a/tools/libpfm.BUILD.bazel b/tools/libpfm.BUILD.bazel
new file mode 100644
index 0000000..6269534
--- /dev/null
+++ b/tools/libpfm.BUILD.bazel
@@ -0,0 +1,22 @@
+# Build rule for libpfm, which is required to collect performance counters for
+# BENCHMARK_ENABLE_LIBPFM builds.
+
+load("@rules_foreign_cc//foreign_cc:defs.bzl", "make")
+
+filegroup(
+ name = "pfm_srcs",
+ srcs = glob(["**"]),
+)
+
+make(
+ name = "libpfm",
+ lib_source = ":pfm_srcs",
+ lib_name = "libpfm",
+ copts = [
+ "-Wno-format-truncation",
+ "-Wno-use-after-free",
+ ],
+ visibility = [
+ "//visibility:public",
+ ],
+)
diff --git a/tools/requirements.txt b/tools/requirements.txt
index 3b3331b..f32f35b 100644
--- a/tools/requirements.txt
+++ b/tools/requirements.txt
@@ -1 +1,2 @@
-scipy>=1.5.0
\ No newline at end of file
+numpy == 1.25
+scipy == 1.10.0
diff --git a/tools/strip_asm.py b/tools/strip_asm.py
index 9030550..d131dc7 100755
--- a/tools/strip_asm.py
+++ b/tools/strip_asm.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
"""
strip_asm.py - Cleanup ASM output for the specified file